1 package OpenILS::Application::URLVerify;
3 # For code searchability, I'm telling you this is the "link checker."
5 use base qw/OpenILS::Application/;
6 use strict; use warnings;
7 use OpenSRF::Utils::Logger qw(:logger);
8 use OpenSRF::MultiSession;
9 use OpenILS::Utils::Fieldmapper;
10 use OpenILS::Utils::CStoreEditor q/:funcs/;
11 use OpenILS::Application::AppUtils;
16 $Data::Dumper::Indent = 0;
18 my $U = 'OpenILS::Application::AppUtils';
21 __PACKAGE__->register_method(
22 method => 'verify_session',
23 api_name => 'open-ils.url_verify.session.verify',
28 Performs verification on all (or a subset of the) URLs within the requested session.
31 {desc => 'Authentication token', type => 'string'},
32 {desc => 'Session ID (url_verify.session.id)', type => 'number'},
33 {desc => 'URL ID list (optional). An empty list will result in no URLs being processed, but null will result in all the URLs for the session being processed', type => 'array'},
37 report_all => bypass response throttling and return all URL sub-process
38 responses to the caller. Not recommened for remote (web, etc.) clients,
39 because it can be a lot of data.
40 resume_attempt => atttempt_id. Resume verification after a failure.
41 resume_with_new_attempt => If true, resume from resume_attempt, but
42 create a new attempt to track the resumption.
48 Stream of objects containing the number of URLs to be processed (url_count),
49 the number processed thus far including redirects (total_processed),
50 and the current url_verification object (current_verification).
52 Note that total_processed may ultimately exceed url_count, since it
53 includes non-anticipate-able redirects.
55 The final response contains url_count, total_processed, and the
56 verification_attempt object (attempt).
62 # "verify_session" sounds like something to do with authentication, but it
63 # actually means for a given session, verify all the URLs associated with
66 my ($self, $client, $auth, $session_id, $url_ids, $options) = @_;
69 my $e = new_editor(authtoken => $auth, xact => 1);
70 return $e->die_event unless $e->checkauth;
71 return $e->die_event unless $e->allowed('URL_VERIFY');
73 my $session = $e->retrieve_url_verify_session($session_id)
74 or return $e->die_event;
76 my $attempt_id = $options->{resume_attempt};
80 # No URLs provided, load all URLs for the requested session
83 select => {uvu => ['id']},
86 cbrebi => { # bucket item
87 join => { cbreb => { # bucket
88 join => { uvs => { # session
89 filter => {id => $session_id}
99 # when resuming an existing attempt (that presumably failed
100 # mid-processing), we only want to process URLs that either
101 # have no linked url_verification or have an un-completed
104 $logger->info("url: resuming attempt $attempt_id");
106 $query->{from}->{uvu}->{uvuv} = {
108 filter => {attempt => $attempt_id}
114 {id => undef}, # no verification started
115 {res_code => undef} # verification started but did no complete
122 # this is a new attempt, so we only want to process URLs that
123 # originated from the source records and not from redirects.
126 '+uvu' => {redirect_from => undef}
130 my $ids = $e->json_query($query);
131 $url_ids = [ map {$_->{id}} @$ids ];
134 my $url_count = scalar(@$url_ids);
135 $logger->info("url: processing $url_count URLs");
138 if ($attempt_id and !$options->{resume_with_new_attempt}) {
140 $attempt = $e->retrieve_url_verification_attempt($attempt_id)
141 or return $e->die_event;
143 # no data was written
148 $attempt = Fieldmapper::url_verify::verification_attempt->new;
149 $attempt->session($session_id);
150 $attempt->usr($e->requestor->id);
151 $attempt->start_time('now');
153 $e->create_url_verify_verification_attempt($attempt)
154 or return $e->die_event;
162 # Now cycle through the URLs in batches.
164 my $batch_size = $U->ou_ancestor_setting_value(
165 $session->owning_lib,
166 'url_verify.verification_batch_size', $e) || 5;
168 my $total_excluding_redirects = 0;
169 my $total_processed = 0; # total number processed, including redirects
172 # before we start the real work, let the caller know
173 # the attempt (id) so recovery is possible.
176 url_count => $url_count,
177 total_processed => $total_processed,
178 total_excluding_redirects => $total_excluding_redirects,
182 my $multises = OpenSRF::MultiSession->new(
184 app => 'open-ils.url_verify', # hey, that's us!
187 success_handler => sub {
188 my ($self, $req) = @_;
190 # API call streams fleshed url_verification objects. We wrap
191 # those up with some extra info and pass them on to the caller.
193 for my $resp (@{$req->{response}}) {
194 my $content = $resp->content;
200 if ($options->{report_all} or ($total_processed % $resp_window == 0)) {
203 url_count => $url_count,
204 current_verification => $content,
205 total_excluding_redirects => $total_excluding_redirects,
206 total_processed => $total_processed
209 # start off responding quickly, then throttle
210 # back to only relaying every 256 messages.
211 $resp_window *= 2 unless $resp_window >= 256;
217 failure_handler => sub {
218 my ($self, $req) = @_;
220 # {error} should be an Error w/ a toString
221 $logger->error("url: error processing URL: " . $req->{error});
225 sort_and_fire_domains(
226 $e, $auth, $attempt, $url_ids, $multises, \$total_excluding_redirects
229 # Wait for all requests to be completed
230 $multises->session_wait(1);
232 # All done. Let's wrap up the attempt.
233 $attempt->finish_time('now');
236 $e->update_url_verify_verification_attempt($attempt) or
237 return $e->die_event;
241 # This way the caller gets an actual timestamp in the "finish_time" field
242 # instead of the string "now".
243 $attempt = $e->retrieve_url_verify_verification_attempt($e->data) or
244 return $e->die_event;
249 url_count => $url_count,
250 total_processed => $total_processed,
251 total_excluding_redirects => $total_excluding_redirects,
256 # retrieves the URL domains and sorts them into buckets*
257 # Iterates over the buckets and fires the multi-session call
258 # the main drawback to this domain sorting approach is that
259 # any domain used a lot more than the others will be the
260 # only domain standing after the others are exhausted, which
261 # means it will take a beating at the end of the batch.
263 # * local data structures, not container.* buckets
264 sub sort_and_fire_domains {
265 my ($e, $auth, $attempt, $url_ids, $multises, $count) = @_;
267 # there is potential here for data sets to be too large
268 # for delivery, but it's not likely, since we're only
269 # fetching ID and domain.
270 my $urls = $e->json_query(
272 select => {uvu => ['id', 'domain']},
274 where => {id => $url_ids}
276 # {substream => 1} only if needed
279 # sort them into buckets based on domain name
281 for my $url (@$urls) {
282 $domains{$url->{domain}} = [] unless $domains{$url->{domain}};
283 push(@{$domains{$url->{domain}}}, $url->{id});
286 # loop through the domains and fire the verification call
287 while (keys %domains) {
288 for my $domain (keys %domains) {
290 my $url_id = pop(@{$domains{$domain}});
291 delete $domains{$domain} unless @{$domains{$domain}};
294 'open-ils.url_verify.verify_url',
295 $auth, $attempt->id, $url_id);
297 $$count++; # sic, a reference to a scalar
303 # XXX I really want to move this method to open-ils.storage, so we don't have
304 # to authenticate a zillion times. LFW
306 __PACKAGE__->register_method(
307 method => 'verify_url',
308 api_name => 'open-ils.url_verify.verify_url',
312 Performs verification of a single URL. When a redirect is detected,
313 a new URL is created to model the redirect and the redirected URL
314 is then tested, up to max-redirects or a loop is detected.
317 {desc => 'Authentication token', type => 'string'},
318 {desc => 'Verification attempt ID (url_verify.verification_attempt.id)', type => 'number'},
319 {desc => 'URL id (url_verify.url.id)', type => 'number'},
321 return => {desc => q/Stream of url_verification objects, one per URL tested/}
327 verification.res_code:
329 999 bad hostname, etc. (IO::Socket::Inet errors)
330 998 in-flight errors (e.g connection closed prematurely)
335 verification.res_text:
337 $@ or custom message "Redirect Loop"
342 my ($self, $client, $auth, $attempt_id, $url_id) = @_;
345 my $e = new_editor(authtoken => $auth);
346 return $e->event unless $e->checkauth;
348 my $url = $e->retrieve_url_verify_url($url_id) or return $e->event;
350 my ($attempt, $delay, $max_redirects, $timeout) =
351 collect_verify_attempt_and_settings($e, $attempt_id);
353 return $e->event unless $e->allowed(
354 'URL_VERIFY', $attempt->session->owning_lib);
357 my $loop_detected = 0;
360 while ($redir_count++ < $max_redirects) {
362 if ($seen_urls{$cur_url->full_url}) {
367 $seen_urls{$cur_url->full_url} = $cur_url;
369 my $url_resp = verify_one_url($e, $attempt, $cur_url, $timeout);
371 # something tragic happened
372 return $url_resp if $U->is_event($url_resp);
374 # flesh and respond to the caller
375 $url_resp->{verification}->url($cur_url);
376 $client->respond($url_resp->{verification});
378 $cur_url = $url_resp->{redirect_url} or last;
381 if ($loop_detected or $redir_count > $max_redirects) {
383 my $vcation = Fieldmapper::url_verify::url_verification->new;
384 $vcation->url($cur_url->id);
385 $vcation->attempt($attempt->id);
386 $vcation->req_time('now');
388 if ($loop_detected) {
389 $logger->info("url: redirect loop detected at " . $cur_url->full_url);
390 $vcation->res_code('996');
391 $vcation->res_text('Redirect Loop');
394 $logger->info("url: max redirects reached for source URL " . $url->full_url);
395 $vcation->res_code('995');
396 $vcation->res_text('Max Redirects');
400 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
404 # The calling code is likely not multi-threaded, so a
405 # per-URL (i.e. per-thread) delay would not be possible.
406 # Applying the delay here allows the caller to process
407 # batches of URLs without having to worry about the delay.
413 # temporarily cache some data to avoid a pile
414 # of data lookups on every URL processed.
416 sub collect_verify_attempt_and_settings {
417 my ($e, $attempt_id) = @_;
420 if (!(keys %cache) or $cache{age} > 20) { # configurable?
430 if ( !($attempt = $cache{attempt}{$attempt_id}) ) {
432 # attempt may have just been created, so
433 # we need to guarantee a write-DB read.
437 $e->retrieve_url_verify_verification_attempt([
440 flesh_fields => {uvva => ['session']}
442 ]) or return $e->die_event;
446 $cache{attempt}{$attempt_id} = $attempt;
449 my $org = $attempt->session->owning_lib;
451 if (!$cache{timeout}{$org}) {
453 $cache{delay}{$org} = $U->ou_ancestor_setting_value(
454 $org, 'url_verify.url_verification_delay', $e);
457 $cache{delay}{$org} = 2 unless defined $cache{delay}{$org};
459 $cache{redirects}{$org} = $U->ou_ancestor_setting_value(
460 $org, 'url_verify.url_verification_max_redirects', $e) || 20;
462 $cache{timeout}{$org} = $U->ou_ancestor_setting_value(
463 $org, 'url_verify.url_verification_max_wait', $e) || 5;
466 sprintf("url: loaded settings delay=%s; max_redirects=%s; timeout=%s",
467 $cache{delay}{$org}, $cache{redirects}{$org}, $cache{timeout}{$org}));
474 $cache{attempt}{$attempt_id},
476 $cache{redirects}{$org},
477 $cache{timeout}{$org}
482 # searches for a completed url_verfication for any url processed
483 # within this verification attempt whose full_url matches the
484 # full_url of the provided URL.
485 sub find_matching_url_for_attempt {
486 my ($e, $attempt, $url) = @_;
488 my $match = $e->json_query({
489 select => {uvuv => ['id']},
493 filter => {id => $attempt->id}
500 id => {'!=' => $url->id},
501 full_url => $url->full_url
504 # There could be multiple verifications for matching URLs
505 # We only want a verification that completed.
506 # Note also that 2 identical URLs processed within the same
507 # sub-batch will have to each be fully processed in their own
508 # right, since neither knows how the other will ultimately fare.
510 res_time => {'!=' => undef}
515 return $e->retrieve_url_verify_url_verification($match->{id}) if $match;
522 1. create the verification object and commit.
524 3. update the verification object to capture the results of the test
525 4. Return redirect_url object if this is a redirect, otherwise undef.
530 my ($e, $attempt, $url, $timeout) = @_;
532 my $url_text = $url->full_url;
535 # first, create the verification object so we can a) indicate that
536 # we're working on this URL and b) get the DB to set the req_time.
538 my $vcation = Fieldmapper::url_verify::url_verification->new;
539 $vcation->url($url->id);
540 $vcation->attempt($attempt->id);
541 $vcation->req_time('now');
543 # begin phase-I DB communication
547 my $match_vcation = find_matching_url_for_attempt($e, $attempt, $url);
549 if ($match_vcation) {
550 $logger->info("url: found matching URL in verification attempt [$url_text]");
551 $vcation->res_code($match_vcation->res_code);
552 $vcation->res_text($match_vcation->res_text);
553 $vcation->redirect_to($match_vcation->redirect_to);
556 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
559 # found a matching URL, no need to re-process
560 return {verification => $vcation} if $match_vcation;
562 # End phase-I DB communication
563 # No active DB xact means no cstore timeout concerns.
567 $ENV{FTP_PASSIVE} = 1; # TODO: setting?
569 my $ua = LWP::UserAgent->new(ssl_opts => {verify_hostname => 0}); # TODO: verify_hostname setting?
570 $ua->timeout($timeout);
572 my $req = HTTP::Request->new(HEAD => $url->full_url);
574 # simple_request avoids LWP's auto-redirect magic
575 my $res = $ua->simple_request($req);
577 $logger->info(sprintf(
578 "url: received HTTP '%s' / '%s' [%s]",
584 $vcation->res_code($res->code);
585 $vcation->res_text($res->message);
587 # is this a redirect?
588 if ($res->code =~ /^3/) {
590 if (my $loc = $res->headers->{location}) {
591 $redir_url = Fieldmapper::url_verify::url->new;
592 $redir_url->redirect_from($url->id);
593 $redir_url->full_url($loc);
595 $logger->info("url: redirect found $url_text => $loc");
598 $logger->info("url: server returned 3XX but no 'Location' header for url $url_text");
602 # Begin phase-II DB communication
607 $redir_url = $e->create_url_verify_url($redir_url) or return $e->die_event;
608 $vcation->redirect_to($redir_url->id);
611 $vcation->res_time('now');
612 $e->update_url_verify_url_verification($vcation) or return $e->die_event;
616 verification => $vcation,
617 redirect_url => $redir_url
622 __PACKAGE__->register_method(
623 method => "create_session",
624 api_name => "open-ils.url_verify.session.create",
626 desc => q/Create a URL verify session. Also automatically create and
629 {desc => "Authentication token", type => "string"},
630 {desc => "session name", type => "string"},
631 {desc => "QueryParser search", type => "string"},
632 {desc => "owning_lib (defaults to ws_ou)", type => "number"},
634 return => {desc => "ID of new session or event on error", type => "number"}
639 my ($self, $client, $auth, $name, $search, $owning_lib) = @_;
641 my $e = new_editor(authtoken => $auth, xact => 1);
642 return $e->die_event unless $e->checkauth;
644 $owning_lib ||= $e->requestor->ws_ou;
645 return $e->die_event unless $e->allowed("URL_VERIFY", $owning_lib);
647 my $session = Fieldmapper::url_verify::session->new;
648 $session->name($name);
649 $session->owning_lib($owning_lib);
650 $session->creator($e->requestor->id);
651 $session->search($search);
653 my $container = Fieldmapper::container::biblio_record_entry_bucket->new;
654 $container->btype("url_verify");
655 $container->owner($e->requestor->id);
656 $container->name($name);
657 $container->description("Automatically generated");
659 $e->create_container_biblio_record_entry_bucket($container) or
660 return $e->die_event;
662 $session->container($e->data->id);
663 $e->create_url_verify_session($session) or
664 return $e->die_event;
666 $e->commit or return $e->die_event;
671 # _check_for_existing_bucket_items() is used later by session_search_and_extract()
672 sub _check_for_existing_bucket_items {
673 my ($e, $session) = @_;
675 my $items = $e->json_query(
677 select => {cbrebi => ['id']},
678 from => {cbrebi => {}},
679 where => {bucket => $session->container},
682 ) or return $e->die_event;
684 return new OpenILS::Event("URL_VERIFY_SESSION_ALREADY_SEARCHED") if @$items;
689 # _get_all_search_results() is used later by session_search_and_extract()
690 sub _get_all_search_results {
691 my ($client, $session) = @_;
695 # Don't loop if the user has specified their own offset.
696 if ($session->search =~ /offset\(\d+\)/) {
697 my $res = $U->simplereq(
699 "open-ils.search.biblio.multiclass.query.staff",
703 return new OpenILS::Event("UNKNOWN") unless $res;
704 return $res if $U->is_event($res);
706 @result_ids = map { shift @$_ } @{$res->{ids}}; # IDs nested in array
711 LOOP: { do { # Fun fact: you cannot "last" out of a do/while in Perl
712 # unless you wrap it another loop structure.
713 my $search = $session->search . " offset(".scalar(@result_ids).")";
715 my $res = $U->simplereq(
717 "open-ils.search.biblio.multiclass.query.staff",
721 return new OpenILS::Event("UNKNOWN") unless $res;
722 return $res if $U->is_event($res);
724 # Search only returns the total count when offset is 0.
725 # We can't get more than one superpage this way, XXX TODO ?
726 $count = $res->{count} unless defined $count;
728 my @this_batch = map { shift @$_ } @{$res->{ids}}; # unnest IDs
729 push @result_ids, @this_batch;
731 # Send a keepalive in case search is slow, although it'll probably
732 # be the query for the first ten results that's slowest.
733 $client->status(new OpenSRF::DomainObject::oilsContinueStatus);
735 last unless @this_batch; # Protect against getting fewer results
736 # than count promised.
738 } while ($count - scalar(@result_ids) > 0); }
741 return (undef, @result_ids);
745 __PACKAGE__->register_method(
746 method => "session_search_and_extract",
747 api_name => "open-ils.url_verify.session.search_and_extract",
751 Perform the search contained in the session,
752 populating the linked bucket, and extracting URLs /,
754 {desc => "Authentication token", type => "string"},
755 {desc => "url_verify.session id", type => "number"},
758 desc => q/stream of numbers: first number of search results, then
759 numbers of extracted URLs for each record, grouped into arrays
766 sub session_search_and_extract {
767 my ($self, $client, $auth, $ses_id) = @_;
769 my $e = new_editor(authtoken => $auth);
770 return $e->die_event unless $e->checkauth;
772 my $session = $e->retrieve_url_verify_session(int($ses_id));
774 return $e->die_event unless
775 $session and $e->allowed("URL_VERIFY", $session->owning_lib);
777 if ($session->creator != $e->requestor->id) {
779 return new OpenILS::Event("URL_VERIFY_NOT_SESSION_CREATOR");
783 _check_for_existing_bucket_items($e, $session);
787 return $delete_error;
790 my ($search_error, @result_ids) =
791 _get_all_search_results($client, $session);
795 return $search_error;
800 # Make and save a bucket item for each search result.
805 # There's an opportunity below to parallelize the extraction of URLs if
808 foreach my $bre_id (@result_ids) {
810 Fieldmapper::container::biblio_record_entry_bucket_item->new;
812 $bucket_item->bucket($session->container);
813 $bucket_item->target_biblio_record_entry($bre_id);
814 $bucket_item->pos($pos++);
816 $e->create_container_biblio_record_entry_bucket_item($bucket_item) or
817 return $e->die_event;
819 push @item_ids, $e->data->id;
824 $client->respond($pos); # first response: the number of items created
825 # (number of search results)
827 # For each contain item, extract URLs. Report counts of URLs extracted
828 # from each record in batches at every hundred records. XXX Arbitrary.
831 foreach my $item_id (@item_ids) {
832 my $res = $e->json_query({
833 from => ["url_verify.extract_urls", $ses_id, $item_id]
834 }) or return $e->die_event;
836 push @url_counts, $res->[0]{"url_verify.extract_urls"};
838 if (scalar(@url_counts) % 100 == 0) {
839 $client->respond([ @url_counts ]);
844 $client->respond([ @url_counts ]) if @url_counts;