1 package OpenILS::Application::URLVerify;
2 use base qw/OpenILS::Application/;
3 use strict; use warnings;
4 use OpenSRF::Utils::Logger qw(:logger);
5 use OpenSRF::MultiSession;
6 use OpenILS::Utils::Fieldmapper;
7 use OpenILS::Utils::CStoreEditor q/:funcs/;
8 use OpenILS::Application::AppUtils;
11 my $U = 'OpenILS::Application::AppUtils';
14 __PACKAGE__->register_method(
15 method => 'validate_session',
16 api_name => 'open-ils.url_verify.session.validate',
20 Performs verification on all (or a subset of the) URLs within the requested session.
23 {desc => 'Authentication token', type => 'string'},
24 {desc => 'Session ID (url_verify.session.id)', type => 'number'},
25 {desc => 'URL ID list (optional). An empty list will result in no URLs being processed', type => 'array'},
29 report_all => bypass response throttling and return all URL sub-process
30 responses to the caller. Not recommended for remote (web, etc.) clients,
31 because it can be a lot of data.
32 resume_attempt => attempt_id. Resume verification after a failure.
33 resume_with_new_attempt => If true, resume from resume_attempt, but
34 create a new attempt to track the resumption.
40 Stream of objects containing the number of URLs to be processed (url_count),
41 the number processed thus far including redirects (total_processed),
42 and the current url_verification object (current_verification).
44 Note that total_processed may ultimately exceed url_count, since it
45 includes non-anticipate-able redirects.
47 The final response contains url_count, total_processed, and the
48 verification_attempt object (attempt).
54 sub validate_session {
55 my ($self, $client, $auth, $session_id, $url_ids, $options) = @_;
58 my $e = new_editor(authtoken => $auth, xact => 1);
59 return $e->die_event unless $e->checkauth;
60 return $e->die_event unless $e->allowed('VERIFY_URL');
62 my $session = $e->retrieve_url_verify_session($session_id)
63 or return $e->die_event;
65 my $attempt_id = $options->{resume_attempt};
69 # No URLs provided, load all URLs for the requested session
72 select => {uvu => ['id']},
75 cbrebi => { # bucket item
76 join => { cbreb => { # bucket
77 join => { uvs => { # session
78 filter => {id => $session_id}
88 # when resuming an existing attempt (that presumably failed
89 # mid-processing), we only want to process URLs that either
90 # have no linked url_verification or have an un-completed
93 $logger->info("url: resuming attempt $attempt_id");
95 $query->{from}->{uvu}->{uvuv} = {
97 filter => {attempt => $attempt_id}
103 {id => undef}, # no verification started
104 {res_code => undef} # verification started but did not complete
111 # this is a new attempt, so we only want to process URLs that
112 # originated from the source records and not from redirects.
115 '+uvu' => {redirect_from => undef}
119 my $ids = $e->json_query($query);
120 $url_ids = [ map {$_->{id}} @$ids ];
123 my $url_count = scalar(@$url_ids);
124 $logger->info("url: processing $url_count URLs");
127 if ($attempt_id and !$options->{resume_with_new_attempt}) {
129 $attempt = $e->retrieve_url_verification_attempt($attempt_id)
130 or return $e->die_event;
132 # no data was written
137 $attempt = Fieldmapper::url_verify::verification_attempt->new;
138 $attempt->session($session_id);
139 $attempt->usr($e->requestor->id);
140 $attempt->start_time('now');
142 $e->create_url_verify_verification_attempt($attempt)
143 or return $e->die_event;
150 # Now cycle through the URLs in batches.
152 my $batch_size = $U->ou_ancestor_setting_value(
153 $session->owning_lib,
154 'url_verify.verification_batch_size', $e) || 5;
156 my $num_processed = 0; # total number processed, including redirects
159 # before we start the real work, let the caller know
160 # the attempt (id) so recovery is possible.
163 url_count => $url_count,
164 total_processed => $num_processed,
168 my $multises = OpenSRF::MultiSession->new(
170 app => 'open-ils.url_verify', # hey, that's us!
173 success_handler => sub {
174 my ($self, $req) = @_;
176 # API call streams fleshed url_verification objects. We wrap
177 # those up with some extra info and pass them on to the caller.
179 for my $resp (@{$req->{response}}) {
180 my $content = $resp->content;
186 if ($options->{report_all} or ($num_processed % $resp_window == 0)) {
188 url_count => $url_count,
189 current_verification => $content,
190 total_processed => $num_processed
194 # start off responding quickly, then throttle
195 # back to only relaying every 256 messages.
196 $resp_window *= 2 unless $resp_window == 256;
201 failure_handler => sub {
202 my ($self, $req) = @_;
204 # {error} should be an Error w/ a toString
205 $logger->error("url: error processing URL: " . $req->{error});
209 sort_and_fire_domains($e, $auth, $attempt, $url_ids, $multises);
211 # Wait for all requests to be completed
212 $multises->session_wait(1);
214 # All done. Let's wrap up the attempt.
215 $attempt->finish_time('now');
218 $e->update_url_verify_verification_attempt($attempt) or return $e->die_event;
222 url_count => $url_count,
223 total_processed => $num_processed,
228 # retrieves the URL domains and sorts them into buckets
229 # Iterates over the buckets and fires the multi-session call
230 # the main drawback to this domain sorting approach is that
231 # any domain used a lot more than the others will be the
232 # only domain standing after the others are exhausted, which
233 # means it will take a beating at the end of the batch.
234 sub sort_and_fire_domains {
235 my ($e, $auth, $attempt, $url_ids, $multises) = @_;
237 # there is potential here for data sets to be too large
238 # for delivery, but it's not likely, since we're only
239 # fetching ID and domain.
240 my $urls = $e->json_query(
242 select => {uvu => ['id', 'domain']},
244 where => {id => $url_ids}
246 # {substream => 1} only if needed
249 # sort them into buckets based on domain name
251 for my $url (@$urls) {
252 $domains{$url->{domain}} = [] unless $domains{$url->{domain}};
253 push(@{$domains{$url->{domain}}}, $url->{id});
256 # loop through the domains and fire the verification call
257 while (keys %domains) {
258 for my $domain (keys %domains) {
260 my $url_id = pop(@{$domains{$domain}});
261 delete $domains{$domain} unless @{$domains{$domain}};
264 'open-ils.url_verify.verify_url',
265 $auth, $attempt->id, $url_id);
271 __PACKAGE__->register_method(
272 method => 'verify_url',
273 api_name => 'open-ils.url_verify.verify_url',
277 Performs verification of a single URL. When a redirect is detected,
278 a new URL is created to model the redirect and the redirected URL
279 is then tested, until max-redirects is reached or a loop is detected.
282 {desc => 'Authentication token', type => 'string'},
283 {desc => 'Verification attempt ID (url_verify.verification_attempt.id)', type => 'number'},
284 {desc => 'URL id (url_verify.url.id)', type => 'number'},
286 return => {desc => q/Stream of url_verification objects, one per URL tested/}
292 verification.res_code:
294 999 bad hostname, etc. (IO::Socket::INET errors)
295 998 in-flight errors (e.g. connection closed prematurely)
300 verification.res_text:
302 $@ or custom message "Redirect Loop"
307 my ($self, $client, $auth, $attempt_id, $url_id) = @_;
310 my $e = new_editor(authtoken => $auth);
311 return $e->event unless $e->checkauth;
313 my $url = $e->retrieve_url_verify_url($url_id) or return $e->event;
315 my ($attempt, $delay, $max_redirects, $timeout) =
316 collect_verify_attempt_and_settings($e, $attempt_id);
318 return $e->event unless $e->allowed(
319 'VERIFY_URL', $attempt->session->owning_lib);
322 my $loop_detected = 0;
325 while ($redir_count++ < $max_redirects) {
327 if ($seen_urls{$cur_url->full_url}) {
332 $seen_urls{$cur_url->full_url} = $cur_url;
334 my $url_resp = verify_one_url($e, $attempt, $cur_url, $timeout);
336 # something tragic happened
337 return $url_resp if $U->is_event($url_resp);
339 # flesh and respond to the caller
340 $url_resp->{verification}->url($cur_url);
341 $client->respond($url_resp->{verification});
343 $cur_url = $url_resp->{redirect_url} or last;
346 if ($loop_detected or $redir_count > $max_redirects) {
348 my $vcation = Fieldmapper::url_verify::url_verification->new;
349 $vcation->url($cur_url->id);
350 $vcation->attempt($attempt->id);
351 $vcation->req_time('now');
353 if ($loop_detected) {
354 $logger->info("url: redirect loop detected at " . $cur_url->full_url);
355 $vcation->res_code('996');
356 $vcation->res_text('Redirect Loop');
359 $logger->info("url: max redirects reached for source URL " . $url->full_url);
360 $vcation->res_code('995');
361 $vcation->res_text('Max Redirects');
365 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
369 # The calling code is likely not multi-threaded, so a
370 # per-URL (i.e. per-thread) delay would not be possible.
371 # Applying the delay here allows the caller to process
372 # batches of URLs without having to worry about the delay.
378 # temporarily cache some data to avoid a pile
379 # of data lookups on every URL processed.
381 sub collect_verify_attempt_and_settings {
382 my ($e, $attempt_id) = @_;
385 if (!(keys %cache) or $cache{age} > 20) { # configurable?
395 if ( !($attempt = $cache{attempt}{$attempt_id}) ) {
397 # attempt may have just been created, so
398 # we need to guarantee a write-DB read.
402 $e->retrieve_url_verify_verification_attempt([
405 flesh_fields => {uvva => ['session']}
407 ]) or return $e->die_event;
411 $cache{attempt}{$attempt_id} = $attempt;
414 my $org = $attempt->session->owning_lib;
416 if (!$cache{timeout}{$org}) {
418 $cache{delay}{$org} = $U->ou_ancestor_setting_value(
419 $org, 'url_verify.url_verification_delay', $e);
422 $cache{delay}{$org} = 2 unless defined $cache{delay}{$org};
424 $cache{redirects}{$org} = $U->ou_ancestor_setting_value(
425 $org, 'url_verify.url_verification_max_redirects', $e) || 20;
427 $cache{timeout}{$org} = $U->ou_ancestor_setting_value(
428 $org, 'url_verify.url_verification_max_wait', $e) || 5;
431 sprintf("url: loaded settings delay=%s; max_redirects=%s; timeout=%s",
432 $cache{delay}{$org}, $cache{redirects}{$org}, $cache{timeout}{$org}));
439 $cache{attempt}{$attempt_id},
441 $cache{redirects}{$org},
442 $cache{timeout}{$org}
447 # searches for a completed url_verification for any url processed
448 # within this verification attempt whose full_url matches the
449 # full_url of the provided URL.
450 sub find_matching_url_for_attempt {
451 my ($e, $attempt, $url) = @_;
453 my $match = $e->json_query({
454 select => {uvuv => ['id']},
458 filter => {id => $attempt->id}
465 id => {'!=' => $url->id},
466 full_url => $url->full_url
469 # There could be multiple verifications for matching URLs
470 # We only want a verification that completed.
471 # Note also that 2 identical URLs processed within the same
472 # sub-batch will have to each be fully processed in their own
473 # right, since neither knows how the other will ultimately fare.
475 res_time => {'!=' => undef}
480 return $e->retrieve_url_verify_url_verification($match->{id}) if $match;
487 1. create the verification object and commit.
489 3. update the verification object to capture the results of the test
490 4. Return redirect_url object if this is a redirect, otherwise undef.
495 my ($e, $attempt, $url, $timeout) = @_;
497 my $url_text = $url->full_url;
500 # first, create the verification object so we can a) indicate that
501 # we're working on this URL and b) get the DB to set the req_time.
503 my $vcation = Fieldmapper::url_verify::url_verification->new;
504 $vcation->url($url->id);
505 $vcation->attempt($attempt->id);
506 $vcation->req_time('now');
508 # begin phase-I DB communication
512 my $match_vcation = find_matching_url_for_attempt($e, $attempt, $url);
514 if ($match_vcation) {
515 $logger->info("url: found matching URL in verification attempt [$url_text]");
516 $vcation->res_code($match_vcation->res_code);
517 $vcation->res_text($match_vcation->res_text);
518 $vcation->redirect_to($match_vcation->redirect_to);
521 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
524 # found a matching URL, no need to re-process
525 return {verification => $vcation} if $match_vcation;
527 # End phase-I DB communication
528 # No active DB xact means no cstore timeout concerns.
532 $ENV{FTP_PASSIVE} = 1; # TODO: setting?
534 my $ua = LWP::UserAgent->new(ssl_opts => {verify_hostname => 0}); # TODO: verify_hostname setting?
535 $ua->timeout($timeout);
537 my $req = HTTP::Request->new(HEAD => $url->full_url);
539 # simple_request avoids LWP's auto-redirect magic
540 my $res = $ua->simple_request($req);
542 $logger->info(sprintf(
543 "url: received HTTP '%s' / '%s' [%s]",
549 $vcation->res_code($res->code);
550 $vcation->res_text($res->message);
552 # is this a redirect?
553 if ($res->code =~ /^3/) {
555 if (my $loc = $res->headers->{location}) {
556 $redir_url = Fieldmapper::url_verify::url->new;
557 $redir_url->redirect_from($url->id);
558 $redir_url->full_url($loc);
560 $logger->info("url: redirect found $url_text => $loc");
563 $logger->info("url: server returned 3XX but no 'Location' header for url $url_text");
567 # Begin phase-II DB communication
572 $redir_url = $e->create_url_verify_url($redir_url) or return $e->die_event;
573 $vcation->redirect_to($redir_url->id);
576 $vcation->res_time('now');
577 $e->update_url_verify_url_verification($vcation) or return $e->die_event;
581 verification => $vcation,
582 redirect_url => $redir_url