1 package OpenILS::Application::URLVerify;
3 # For code searchability, I'm telling you this is the "link checker."
5 use base qw/OpenILS::Application/;
6 use strict; use warnings;
7 use OpenSRF::Utils::Logger qw(:logger);
8 use OpenSRF::MultiSession;
9 use OpenILS::Utils::Fieldmapper;
10 use OpenILS::Utils::CStoreEditor q/:funcs/;
11 use OpenILS::Application::AppUtils;
16 $Data::Dumper::Indent = 0;
18 my $U = 'OpenILS::Application::AppUtils';
21 __PACKAGE__->register_method(
22 method => 'verify_session',
23 api_name => 'open-ils.url_verify.session.verify',
28 Performs verification on all (or a subset of the) URLs within the requested session.
31 {desc => 'Authentication token', type => 'string'},
32 {desc => 'Session ID (url_verify.session.id)', type => 'number'},
33 {desc => 'URL ID list (optional). An empty list will result in no URLs being processed, but null will result in all the URLs for the session being processed', type => 'array'},
37 report_all => bypass response throttling and return all URL sub-process
38 responses to the caller. Not recommended for remote (web, etc.) clients,
39 because it can be a lot of data.
40 resume_attempt => attempt_id. Resume verification after a failure.
41 resume_with_new_attempt => If true, resume from resume_attempt, but
42 create a new attempt to track the resumption.
48 Stream of objects containing the number of URLs to be processed (url_count),
49 the number processed thus far including redirects (total_processed),
50 and the current url_verification object (current_verification).
52 Note that total_processed may ultimately exceed url_count, since it
53 includes non-anticipate-able redirects.
55 The final response contains url_count, total_processed, and the
56 verification_attempt object (attempt).
62 # "verify_session" sounds like something to do with authentication, but it
63 # actually means for a given session, verify all the URLs associated with
66 my ($self, $client, $auth, $session_id, $url_ids, $options) = @_;
69 my $e = new_editor(authtoken => $auth, xact => 1);
70 return $e->die_event unless $e->checkauth;
71 return $e->die_event unless $e->allowed('URL_VERIFY');
73 my $session = $e->retrieve_url_verify_session($session_id)
74 or return $e->die_event;
76 my $attempt_id = $options->{resume_attempt};
80 # No URLs provided, load all URLs for the requested session
83 select => {uvu => ['id']},
87 filter => {id => $session_id}
95 # when resuming an existing attempt (that presumably failed
96 # mid-processing), we only want to process URLs that either
97 # have no linked url_verification or have an un-completed
100 $logger->info("url: resuming attempt $attempt_id");
102 $query->{from}->{uvu}->{uvuv} = {
104 filter => {attempt => $attempt_id}
110 {id => undef}, # no verification started
111 {res_code => undef} # verification started but did not complete
118 # this is a new attempt, so we only want to process URLs that
119 # originated from the source records and not from redirects.
122 '+uvu' => {redirect_from => undef}
126 my $ids = $e->json_query($query);
127 $url_ids = [ map {$_->{id}} @$ids ];
130 my $url_count = scalar(@$url_ids);
131 $logger->info("url: processing $url_count URLs");
134 if ($attempt_id and !$options->{resume_with_new_attempt}) {
136 $attempt = $e->retrieve_url_verification_attempt($attempt_id)
137 or return $e->die_event;
139 # no data was written
144 $attempt = Fieldmapper::url_verify::verification_attempt->new;
145 $attempt->session($session_id);
146 $attempt->usr($e->requestor->id);
147 $attempt->start_time('now');
149 $e->create_url_verify_verification_attempt($attempt)
150 or return $e->die_event;
158 # Now cycle through the URLs in batches.
160 my $batch_size = $U->ou_ancestor_setting_value(
161 $session->owning_lib,
162 'url_verify.verification_batch_size', $e) || 5;
164 my $total_excluding_redirects = 0;
165 my $total_processed = 0; # total number processed, including redirects
168 # before we start the real work, let the caller know
169 # the attempt (id) so recovery is possible.
172 url_count => $url_count,
173 total_processed => $total_processed,
174 total_excluding_redirects => $total_excluding_redirects,
178 my $multises = OpenSRF::MultiSession->new(
180 app => 'open-ils.url_verify', # hey, that's us!
183 success_handler => sub {
184 my ($self, $req) = @_;
186 # API call streams fleshed url_verification objects. We wrap
187 # those up with some extra info and pass them on to the caller.
189 for my $resp (@{$req->{response}}) {
190 my $content = $resp->content;
196 if ($options->{report_all} or ($total_processed % $resp_window == 0)) {
199 url_count => $url_count,
200 current_verification => $content,
201 total_excluding_redirects => $total_excluding_redirects,
202 total_processed => $total_processed
205 # start off responding quickly, then throttle
206 # back to only relaying every 256 messages.
207 $resp_window *= 2 unless $resp_window >= 256;
213 failure_handler => sub {
214 my ($self, $req) = @_;
216 # {error} should be an Error w/ a toString
217 $logger->error("url: error processing URL: " . $req->{error});
221 sort_and_fire_domains(
222 $e, $auth, $attempt, $url_ids, $multises, \$total_excluding_redirects
225 # Wait for all requests to be completed
226 $multises->session_wait(1);
228 # All done. Let's wrap up the attempt.
229 $attempt->finish_time('now');
232 $e->update_url_verify_verification_attempt($attempt) or
233 return $e->die_event;
237 # This way the caller gets an actual timestamp in the "finish_time" field
238 # instead of the string "now".
239 $attempt = $e->retrieve_url_verify_verification_attempt($e->data) or
240 return $e->die_event;
245 url_count => $url_count,
246 total_processed => $total_processed,
247 total_excluding_redirects => $total_excluding_redirects,
252 # retrieves the URL domains and sorts them into buckets*
253 # Iterates over the buckets and fires the multi-session call
254 # the main drawback to this domain sorting approach is that
255 # any domain used a lot more than the others will be the
256 # only domain standing after the others are exhausted, which
257 # means it will take a beating at the end of the batch.
259 # * local data structures, not container.* buckets
260 sub sort_and_fire_domains {
261 my ($e, $auth, $attempt, $url_ids, $multises, $count) = @_;
263 # there is potential here for data sets to be too large
264 # for delivery, but it's not likely, since we're only
265 # fetching ID and domain.
266 my $urls = $e->json_query(
268 select => {uvu => ['id', 'domain']},
270 where => {id => $url_ids}
272 # {substream => 1} only if needed
275 # sort them into buckets based on domain name
277 for my $url (@$urls) {
278 $domains{$url->{domain}} = [] unless $domains{$url->{domain}};
279 push(@{$domains{$url->{domain}}}, $url->{id});
282 # loop through the domains and fire the verification call
283 while (keys %domains) {
284 for my $domain (keys %domains) {
286 my $url_id = pop(@{$domains{$domain}});
287 delete $domains{$domain} unless @{$domains{$domain}};
290 'open-ils.url_verify.verify_url',
291 $auth, $attempt->id, $url_id);
293 $$count++; # sic, a reference to a scalar
299 # XXX I really want to move this method to open-ils.storage, so we don't have
300 # to authenticate a zillion times. LFW
302 __PACKAGE__->register_method(
303 method => 'verify_url',
304 api_name => 'open-ils.url_verify.verify_url',
308 Performs verification of a single URL. When a redirect is detected,
309 a new URL is created to model the redirect and the redirected URL
310 is then tested, up to max-redirects or a loop is detected.
313 {desc => 'Authentication token', type => 'string'},
314 {desc => 'Verification attempt ID (url_verify.verification_attempt.id)', type => 'number'},
315 {desc => 'URL id (url_verify.url.id)', type => 'number'},
317 return => {desc => q/Stream of url_verification objects, one per URL tested/}
323 verification.res_code:
325 999 bad hostname, etc. (IO::Socket::Inet errors)
326 998 in-flight errors (e.g connection closed prematurely)
331 verification.res_text:
333 $@ or custom message "Redirect Loop"
338 my ($self, $client, $auth, $attempt_id, $url_id) = @_;
341 my $e = new_editor(authtoken => $auth);
342 return $e->event unless $e->checkauth;
344 my $url = $e->retrieve_url_verify_url($url_id) or return $e->event;
346 my ($attempt, $delay, $max_redirects, $timeout) =
347 collect_verify_attempt_and_settings($e, $attempt_id);
349 return $e->event unless $e->allowed(
350 'URL_VERIFY', $attempt->session->owning_lib);
353 my $loop_detected = 0;
356 while ($redir_count++ < $max_redirects) {
358 if ($seen_urls{$cur_url->full_url}) {
363 $seen_urls{$cur_url->full_url} = $cur_url;
365 my $url_resp = verify_one_url($e, $attempt, $cur_url, $timeout);
367 # something tragic happened
368 return $url_resp if $U->is_event($url_resp);
370 # flesh and respond to the caller
371 $url_resp->{verification}->url($cur_url);
372 $client->respond($url_resp->{verification});
374 $cur_url = $url_resp->{redirect_url} or last;
377 if ($loop_detected or $redir_count > $max_redirects) {
379 my $vcation = Fieldmapper::url_verify::url_verification->new;
380 $vcation->url($cur_url->id);
381 $vcation->attempt($attempt->id);
382 $vcation->req_time('now');
384 if ($loop_detected) {
385 $logger->info("url: redirect loop detected at " . $cur_url->full_url);
386 $vcation->res_code('996');
387 $vcation->res_text('Redirect Loop');
390 $logger->info("url: max redirects reached for source URL " . $url->full_url);
391 $vcation->res_code('995');
392 $vcation->res_text('Max Redirects');
396 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
400 # The calling code is likely not multi-threaded, so a
401 # per-URL (i.e. per-thread) delay would not be possible.
402 # Applying the delay here allows the caller to process
403 # batches of URLs without having to worry about the delay.
409 # temporarily cache some data to avoid a pile
410 # of data lookups on every URL processed.
412 sub collect_verify_attempt_and_settings {
413 my ($e, $attempt_id) = @_;
416 if (!(keys %cache) or $cache{age} > 20) { # configurable?
426 if ( !($attempt = $cache{attempt}{$attempt_id}) ) {
428 # attempt may have just been created, so
429 # we need to guarantee a write-DB read.
433 $e->retrieve_url_verify_verification_attempt([
436 flesh_fields => {uvva => ['session']}
438 ]) or return $e->die_event;
442 $cache{attempt}{$attempt_id} = $attempt;
445 my $org = $attempt->session->owning_lib;
447 if (!$cache{timeout}{$org}) {
449 $cache{delay}{$org} = $U->ou_ancestor_setting_value(
450 $org, 'url_verify.url_verification_delay', $e);
453 $cache{delay}{$org} = 2 unless defined $cache{delay}{$org};
455 $cache{redirects}{$org} = $U->ou_ancestor_setting_value(
456 $org, 'url_verify.url_verification_max_redirects', $e) || 20;
458 $cache{timeout}{$org} = $U->ou_ancestor_setting_value(
459 $org, 'url_verify.url_verification_max_wait', $e) || 5;
462 sprintf("url: loaded settings delay=%s; max_redirects=%s; timeout=%s",
463 $cache{delay}{$org}, $cache{redirects}{$org}, $cache{timeout}{$org}));
470 $cache{attempt}{$attempt_id},
472 $cache{redirects}{$org},
473 $cache{timeout}{$org}
478 # searches for a completed url_verification for any url processed
479 # within this verification attempt whose full_url matches the
480 # full_url of the provided URL.
481 sub find_matching_url_for_attempt {
482 my ($e, $attempt, $url) = @_;
484 my $match = $e->json_query({
485 select => {uvuv => ['id']},
489 filter => {id => $attempt->id}
496 id => {'!=' => $url->id},
497 full_url => $url->full_url
500 # There could be multiple verifications for matching URLs
501 # We only want a verification that completed.
502 # Note also that 2 identical URLs processed within the same
503 # sub-batch will have to each be fully processed in their own
504 # right, since neither knows how the other will ultimately fare.
506 res_time => {'!=' => undef}
511 return $e->retrieve_url_verify_url_verification($match->{id}) if $match;
518 1. create the verification object and commit.
520 3. update the verification object to capture the results of the test
521 4. Return redirect_url object if this is a redirect, otherwise undef.
526 my ($e, $attempt, $url, $timeout) = @_;
528 my $url_text = $url->full_url;
531 # first, create the verification object so we can a) indicate that
532 # we're working on this URL and b) get the DB to set the req_time.
534 my $vcation = Fieldmapper::url_verify::url_verification->new;
535 $vcation->url($url->id);
536 $vcation->attempt($attempt->id);
537 $vcation->req_time('now');
539 # begin phase-I DB communication
543 my $match_vcation = find_matching_url_for_attempt($e, $attempt, $url);
545 if ($match_vcation) {
546 $logger->info("url: found matching URL in verification attempt [$url_text]");
547 $vcation->res_code($match_vcation->res_code);
548 $vcation->res_text($match_vcation->res_text);
549 $vcation->redirect_to($match_vcation->redirect_to);
552 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
555 # found a matching URL, no need to re-process
556 return {verification => $vcation} if $match_vcation;
558 # End phase-I DB communication
559 # No active DB xact means no cstore timeout concerns.
563 $ENV{FTP_PASSIVE} = 1; # TODO: setting?
565 my $ua = LWP::UserAgent->new(ssl_opts => {verify_hostname => 0}); # TODO: verify_hostname setting?
566 $ua->timeout($timeout);
568 my $req = HTTP::Request->new(HEAD => $url->full_url);
570 # simple_request avoids LWP's auto-redirect magic
571 my $res = $ua->simple_request($req);
573 $logger->info(sprintf(
574 "url: received HTTP '%s' / '%s' [%s]",
580 $vcation->res_code($res->code);
581 $vcation->res_text($res->message);
583 # is this a redirect?
584 if ($res->code =~ /^3/) {
586 if (my $loc = $res->headers->{location}) {
587 $redir_url = Fieldmapper::url_verify::url->new;
588 $redir_url->session($attempt->session);
589 $redir_url->redirect_from($url->id);
590 $redir_url->full_url($loc);
592 $logger->info("url: redirect found $url_text => $loc");
595 $logger->info("url: server returned 3XX but no 'Location' header for url $url_text");
599 # Begin phase-II DB communication
604 $redir_url = $e->create_url_verify_url($redir_url) or return $e->die_event;
605 $vcation->redirect_to($redir_url->id);
608 $vcation->res_time('now');
609 $e->update_url_verify_url_verification($vcation) or return $e->die_event;
613 verification => $vcation,
614 redirect_url => $redir_url
619 __PACKAGE__->register_method(
620 method => "create_session",
621 api_name => "open-ils.url_verify.session.create",
623 desc => q/Create a URL verify session. Also automatically create and
626 {desc => "Authentication token", type => "string"},
627 {desc => "session name", type => "string"},
628 {desc => "QueryParser search", type => "string"},
629 {desc => "owning_lib (defaults to ws_ou)", type => "number"},
631 return => {desc => "ID of new session or event on error", type => "number"}
636 my ($self, $client, $auth, $name, $search, $owning_lib) = @_;
638 my $e = new_editor(authtoken => $auth, xact => 1);
639 return $e->die_event unless $e->checkauth;
641 $owning_lib ||= $e->requestor->ws_ou;
642 return $e->die_event unless $e->allowed("URL_VERIFY", $owning_lib);
644 my $session = Fieldmapper::url_verify::session->new;
645 $session->name($name);
646 $session->owning_lib($owning_lib);
647 $session->creator($e->requestor->id);
648 $session->search($search);
650 my $container = Fieldmapper::container::biblio_record_entry_bucket->new;
651 $container->btype("url_verify");
652 $container->owner($e->requestor->id);
653 $container->name($name);
654 $container->description("Automatically generated");
656 $e->create_container_biblio_record_entry_bucket($container) or
657 return $e->die_event;
659 $session->container($e->data->id);
660 $e->create_url_verify_session($session) or
661 return $e->die_event;
663 $e->commit or return $e->die_event;
668 # _check_for_existing_bucket_items() is used later by session_search_and_extract()
669 sub _check_for_existing_bucket_items {
670 my ($e, $session) = @_;
672 my $items = $e->json_query(
674 select => {cbrebi => ['id']},
675 from => {cbrebi => {}},
676 where => {bucket => $session->container},
679 ) or return $e->die_event;
681 return new OpenILS::Event("URL_VERIFY_SESSION_ALREADY_SEARCHED") if @$items;
686 # _get_all_search_results() is used later by session_search_and_extract()
687 sub _get_all_search_results {
688 my ($client, $session) = @_;
692 # Don't loop if the user has specified their own offset.
693 if ($session->search =~ /offset\(\d+\)/) {
694 my $res = $U->simplereq(
696 "open-ils.search.biblio.multiclass.query.staff",
700 return new OpenILS::Event("UNKNOWN") unless $res;
701 return $res if $U->is_event($res);
703 @result_ids = map { shift @$_ } @{$res->{ids}}; # IDs nested in array
708 LOOP: { do { # Fun fact: you cannot "last" out of a do/while in Perl
709 # unless you wrap it in another loop structure.
710 my $search = $session->search . " offset(".scalar(@result_ids).")";
712 my $res = $U->simplereq(
714 "open-ils.search.biblio.multiclass.query.staff",
718 return new OpenILS::Event("UNKNOWN") unless $res;
719 return $res if $U->is_event($res);
721 # Search only returns the total count when offset is 0.
722 # We can't get more than one superpage this way, XXX TODO ?
723 $count = $res->{count} unless defined $count;
725 my @this_batch = map { shift @$_ } @{$res->{ids}}; # unnest IDs
726 push @result_ids, @this_batch;
728 # Send a keepalive in case search is slow, although it'll probably
729 # be the query for the first ten results that's slowest.
730 $client->status(new OpenSRF::DomainObject::oilsContinueStatus);
732 last unless @this_batch; # Protect against getting fewer results
733 # than count promised.
735 } while ($count - scalar(@result_ids) > 0); }
738 return (undef, @result_ids);
742 __PACKAGE__->register_method(
743 method => "session_search_and_extract",
744 api_name => "open-ils.url_verify.session.search_and_extract",
748 Perform the search contained in the session,
749 populating the linked bucket, and extracting URLs /,
751 {desc => "Authentication token", type => "string"},
752 {desc => "url_verify.session id", type => "number"},
755 desc => q/stream of numbers: first number of search results, then
756 numbers of extracted URLs for each record, grouped into arrays
763 sub session_search_and_extract {
764 my ($self, $client, $auth, $ses_id) = @_;
766 my $e = new_editor(authtoken => $auth);
767 return $e->die_event unless $e->checkauth;
769 my $session = $e->retrieve_url_verify_session(int($ses_id));
771 return $e->die_event unless
772 $session and $e->allowed("URL_VERIFY", $session->owning_lib);
774 if ($session->creator != $e->requestor->id) {
776 return new OpenILS::Event("URL_VERIFY_NOT_SESSION_CREATOR");
780 _check_for_existing_bucket_items($e, $session);
784 return $delete_error;
787 my ($search_error, @result_ids) =
788 _get_all_search_results($client, $session);
792 return $search_error;
797 # Make and save a bucket item for each search result.
802 # There's an opportunity below to parallelize the extraction of URLs if
805 foreach my $bre_id (@result_ids) {
807 Fieldmapper::container::biblio_record_entry_bucket_item->new;
809 $bucket_item->bucket($session->container);
810 $bucket_item->target_biblio_record_entry($bre_id);
811 $bucket_item->pos($pos++);
813 $e->create_container_biblio_record_entry_bucket_item($bucket_item) or
814 return $e->die_event;
816 push @item_ids, $e->data->id;
821 $client->respond($pos); # first response: the number of items created
822 # (number of search results)
824 # For each container item, extract URLs. Report counts of URLs extracted
825 # from each record in batches at every hundred records. XXX Arbitrary.
828 foreach my $item_id (@item_ids) {
829 my $res = $e->json_query({
830 from => ["url_verify.extract_urls", $ses_id, $item_id]
831 }) or return $e->die_event;
833 push @url_counts, $res->[0]{"url_verify.extract_urls"};
835 if (scalar(@url_counts) % 100 == 0) {
836 $client->respond([ @url_counts ]);
841 $client->respond([ @url_counts ]) if @url_counts;