1 package OpenILS::Application::URLVerify;
3 # For code searchability, I'm telling you this is the "link checker."
5 use base qw/OpenILS::Application/;
6 use strict; use warnings;
7 use OpenSRF::Utils::Logger qw(:logger);
8 use OpenSRF::MultiSession;
9 use OpenSRF::Utils::SettingsClient;
10 use OpenILS::Utils::Fieldmapper;
11 use OpenILS::Utils::CStoreEditor q/:funcs/;
12 use OpenILS::Application::AppUtils;
17 $Data::Dumper::Indent = 0;
# $U holds a class name as a string, so $U->method(...) is a class-method
# call on OpenILS::Application::AppUtils.
19 my $U = 'OpenILS::Application::AppUtils';
# User-Agent string; it is interpolated into the log line below and passed
# as the "agent" option when the LWP::UserAgent is built.
21 my $user_agent_string;
24 my $conf = new OpenSRF::Utils::SettingsClient;
# Settings path of the configured User-Agent value.
26 my @confpath = qw/apps open-ils.url_verify app_settings user_agent/;
# The configured value is used as a sprintf format string, with the ILS
# version as its single argument.
29 sprintf($conf->config_value(@confpath), __PACKAGE__->ils_version);
31 $logger->info("using '$user_agent_string' as User Agent string");
34 __PACKAGE__->register_method(
35 method => 'verify_session',
36 api_name => 'open-ils.url_verify.session.verify',
41 Performs verification on all (or a subset of the) URLs within the requested session.
44 {desc => 'Authentication token', type => 'string'},
45 {desc => 'Session ID (url_verify.session.id)', type => 'number'},
46 {desc => 'URL ID list (optional). An empty list will result in no URLs being processed, but null will result in all the URLs for the session being processed', type => 'array'},
50 report_all => bypass response throttling and return all URL sub-process
51 responses to the caller. Not recommened for remote (web, etc.) clients,
52 because it can be a lot of data.
53 resume_attempt => atttempt_id. Resume verification after a failure.
54 resume_with_new_attempt => If true, resume from resume_attempt, but
55 create a new attempt to track the resumption.
61 Stream of objects containing the number of URLs to be processed (url_count),
62 the number processed thus far including redirects (total_processed),
63 and the current url_verification object (current_verification).
65 Note that total_processed may ultimately exceed url_count, since it
66 includes non-anticipate-able redirects.
68 The final response contains url_count, total_processed, and the
69 verification_attempt object (attempt).
# Implementation of the open-ils.url_verify.session.verify API call.
# Arguments after ($self, $client): authtoken, session id, optional arrayref
# of URL ids, and an optional hashref of options (resume_attempt,
# resume_with_new_attempt and report_all are the keys read below).
75 # "verify_session" sounds like something to do with authentication, but it
76 # actually means for a given session, verify all the URLs associated with
79 my ($self, $client, $auth, $session_id, $url_ids, $options) = @_;
82 my $e = new_editor(authtoken => $auth, xact => 1);
83 return $e->die_event unless $e->checkauth;
84 return $e->die_event unless $e->allowed('URL_VERIFY');
86 my $session = $e->retrieve_url_verify_session($session_id)
87 or return $e->die_event;
# A defined resume_attempt id switches the URL query and the attempt
# handling below into "resume" mode.
89 my $attempt_id = $options->{resume_attempt};
93 # No URLs provided, load all URLs for the requested session
96 select => {uvu => ['id']},
100 filter => {id => $session_id}
108 # when resuming an existing attempt (that presumably failed
109 # mid-processing), we only want to process URLs that either
110 # have no linked url_verification or have an un-completed
113 $logger->info("url: resuming attempt $attempt_id");
115 $query->{from}->{uvu}->{uvuv} = {
117 filter => {attempt => $attempt_id}
123 {id => undef}, # no verification started
124 {res_code => undef} # verification started but did not complete
131 # this is a new attempt, so we only want to process URLs that
132 # originated from the source records and not from redirects.
135 '+uvu' => {redirect_from => undef}
# json_query rows are hashes keyed by column; reduce them to a plain id list.
139 my $ids = $e->json_query($query);
140 $url_ids = [ map {$_->{id}} @$ids ];
143 my $url_count = scalar(@$url_ids);
144 $logger->info("url: processing $url_count URLs");
# Reuse the existing attempt only when resuming without
# resume_with_new_attempt; a new verification_attempt is built further down.
147 if ($attempt_id and !$options->{resume_with_new_attempt}) {
# NOTE(review): the other attempt lookups in this file call
# retrieve_url_verify_verification_attempt; the name below has no "verify_"
# segment -- confirm it resolves to a real editor method.
149 $attempt = $e->retrieve_url_verification_attempt($attempt_id)
150 or return $e->die_event;
152 # no data was written
157 $attempt = Fieldmapper::url_verify::verification_attempt->new;
158 $attempt->session($session_id);
159 $attempt->usr($e->requestor->id);
160 $attempt->start_time('now');
162 $e->create_url_verify_verification_attempt($attempt)
163 or return $e->die_event;
171 # Now cycle through the URLs in batches.
# The batch size falls back to 5 when the org setting is unset or false.
173 my $batch_size = $U->ou_ancestor_setting_value(
174 $session->owning_lib,
175 'url_verify.verification_batch_size', $e) || 5;
177 my $total_excluding_redirects = 0;
178 my $total_processed = 0; # total number processed, including redirects
181 # before we start the real work, let the caller know
182 # the attempt (id) so recovery is possible.
185 url_count => $url_count,
186 total_processed => $total_processed,
187 total_excluding_redirects => $total_excluding_redirects,
191 my $multises = OpenSRF::MultiSession->new(
193 app => 'open-ils.url_verify', # hey, that's us!
# The handlers are closures: they read $options, $url_count and the running
# totals declared above.
196 success_handler => sub {
197 my ($self, $req) = @_;
199 # API call streams fleshed url_verification objects. We wrap
200 # those up with some extra info and pass them on to the caller.
202 for my $resp (@{$req->{response}}) {
203 my $content = $resp->content;
# Relay every response when report_all is set; otherwise only when the
# processed total is a multiple of the current response window.
209 if ($options->{report_all} or ($total_processed % $resp_window == 0)) {
212 url_count => $url_count,
213 current_verification => $content,
214 total_excluding_redirects => $total_excluding_redirects,
215 total_processed => $total_processed
218 # start off responding quickly, then throttle
219 # back to only relaying every 256 messages.
220 $resp_window *= 2 unless $resp_window >= 256;
226 failure_handler => sub {
227 my ($self, $req) = @_;
229 # {error} should be an Error w/ a toString
230 $logger->error("url: error processing URL: " . $req->{error});
# $total_excluding_redirects is passed by reference so the helper can
# increment it once per URL it fires.
234 sort_and_fire_domains(
235 $e, $auth, $attempt, $url_ids, $multises, \$total_excluding_redirects
238 # Wait for all requests to be completed
239 $multises->session_wait(1);
241 # All done. Let's wrap up the attempt.
242 $attempt->finish_time('now');
245 $e->update_url_verify_verification_attempt($attempt) or
246 return $e->die_event;
250 # This way the caller gets an actual timestamp in the "finish_time" field
251 # instead of the string "now".
252 $attempt = $e->retrieve_url_verify_verification_attempt($e->data) or
253 return $e->die_event;
258 url_count => $url_count,
259 total_processed => $total_processed,
260 total_excluding_redirects => $total_excluding_redirects,
265 # retrieves the URL domains and sorts them into buckets*
266 # Iterates over the buckets and fires the multi-session call
267 # the main drawback to this domain sorting approach is that
268 # any domain used a lot more than the others will be the
269 # only domain standing after the others are exhausted, which
270 # means it will take a beating at the end of the batch.
272 # * local data structures, not container.* buckets
#
# Parameters:
#   $e        - editor used to run the id/domain query
#   $auth     - authtoken forwarded to every verify_url call
#   $attempt  - verification attempt; its id is forwarded to every call
#   $url_ids  - arrayref of url ids to verify
#   $multises - multi-session object built by the caller (presumably the
#               object the calls are fired through -- confirm)
#   $count    - reference to a scalar; incremented once per URL fired
273 sub sort_and_fire_domains {
274 my ($e, $auth, $attempt, $url_ids, $multises, $count) = @_;
276 # there is potential here for data sets to be too large
277 # for delivery, but it's not likely, since we're only
278 # fetching ID and domain.
279 my $urls = $e->json_query(
281 select => {uvu => ['id', 'domain']},
283 where => {id => $url_ids}
285 # {substream => 1} only if needed
288 # sort them into buckets based on domain name
# %domains maps a domain name to an arrayref of url ids
290 for my $url (@$urls) {
291 $domains{$url->{domain}} = [] unless $domains{$url->{domain}};
292 push(@{$domains{$url->{domain}}}, $url->{id});
295 # loop through the domains and fire the verification call
# Each pass of the outer loop takes one url id from every remaining domain,
# so consecutive calls go to different domains while more than one is left.
296 while (keys %domains) {
297 for my $domain (keys %domains) {
# take the last id in the bucket; drop the bucket once it is empty so the
# outer while loop can terminate
299 my $url_id = pop(@{$domains{$domain}});
300 delete $domains{$domain} unless @{$domains{$domain}};
303 'open-ils.url_verify.verify_url',
304 $auth, $attempt->id, $url_id);
306 $$count++; # sic, a reference to a scalar
312 # XXX I really want to move this method to open-ils.storage, so we don't have
313 # to authenticate a zillion times. LFW
315 __PACKAGE__->register_method(
316 method => 'verify_url',
317 api_name => 'open-ils.url_verify.verify_url',
321 Performs verification of a single URL. When a redirect is detected,
322 a new URL is created to model the redirect and the redirected URL
323 is then tested, up to max-redirects or a loop is detected.
326 {desc => 'Authentication token', type => 'string'},
327 {desc => 'Verification attempt ID (url_verify.verification_attempt.id)', type => 'number'},
328 {desc => 'URL id (url_verify.url.id)', type => 'number'},
330 return => {desc => q/Stream of url_verification objects, one per URL tested/}
336 verification.res_code:
338 999 bad hostname, etc. (IO::Socket::Inet errors)
339 998 in-flight errors (e.g connection closed prematurely)
344 verification.res_text:
346 $@ or custom message "Redirect Loop"
# Implementation of the open-ils.url_verify.verify_url API call: tests one
# URL, follows its redirect chain, and streams one url_verification back to
# the caller per URL tested.
351 my ($self, $client, $auth, $attempt_id, $url_id) = @_;
354 my $e = new_editor(authtoken => $auth);
355 return $e->event unless $e->checkauth;
357 my $url = $e->retrieve_url_verify_url($url_id) or return $e->event;
# attempt (with its session fleshed) plus the org-level delay, redirect
# limit and timeout settings
359 my ($attempt, $delay, $max_redirects, $timeout) =
360 collect_verify_attempt_and_settings($e, $attempt_id);
# permission is checked at the library that owns the attempt's session
362 return $e->event unless $e->allowed(
363 'URL_VERIFY', $attempt->session->owning_lib);
366 my $loop_detected = 0;
# Test the current URL, then its redirect target, and so on, for at most
# $max_redirects iterations.
369 while ($redir_count++ < $max_redirects) {
# %seen_urls is keyed by full_url; a repeat means the chain has looped
371 if ($seen_urls{$cur_url->full_url}) {
376 $seen_urls{$cur_url->full_url} = $cur_url;
378 my $url_resp = verify_one_url($e, $attempt, $cur_url, $timeout);
380 # something tragic happened
381 return $url_resp if $U->is_event($url_resp);
383 # flesh and respond to the caller
384 $url_resp->{verification}->url($cur_url);
385 $client->respond($url_resp->{verification});
# continue with the redirect target, or stop when none was reported
387 $cur_url = $url_resp->{redirect_url} or last;
# The chain was abandoned rather than completed: record an extra
# verification row carrying a custom result code (996 = redirect loop,
# 995 = redirect limit exceeded).
390 if ($loop_detected or $redir_count > $max_redirects) {
392 my $vcation = Fieldmapper::url_verify::url_verification->new;
393 $vcation->url($cur_url->id);
394 $vcation->attempt($attempt->id);
395 $vcation->req_time('now');
397 if ($loop_detected) {
398 $logger->info("url: redirect loop detected at " . $cur_url->full_url);
399 $vcation->res_code('996');
400 $vcation->res_text('Redirect Loop');
403 $logger->info("url: max redirects reached for source URL " . $url->full_url);
404 $vcation->res_code('995');
405 $vcation->res_text('Max Redirects');
409 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
413 # The calling code is likely not multi-threaded, so a
414 # per-URL (i.e. per-thread) delay would not be possible.
415 # Applying the delay here allows the caller to process
416 # batches of URLs without having to worry about the delay.
422 # temporarily cache some data to avoid a pile
423 # of data lookups on every URL processed.
#
# Parameters: $e (editor), $attempt_id (verification attempt id).
# The caller unpacks the result as ($attempt, $delay, $max_redirects,
# $timeout); the attempt is fetched with its session fleshed.
425 sub collect_verify_attempt_and_settings {
426 my ($e, $attempt_id) = @_;
# presumably the cache is reset when it is empty or too old; the unit of
# $cache{age} is not evident from this condition -- TODO confirm
429 if (!(keys %cache) or $cache{age} > 20) { # configurable?
# attempts are cached per attempt id
439 if ( !($attempt = $cache{attempt}{$attempt_id}) ) {
441 # attempt may have just been created, so
442 # we need to guarantee a write-DB read.
446 $e->retrieve_url_verify_verification_attempt([
449 flesh_fields => {uvva => ['session']}
# NOTE(review): on failure this returns a single event, while the caller
# unpacks four values and then calls ->session on the first one -- confirm
# the failure path is handled.
451 ]) or return $e->die_event;
455 $cache{attempt}{$attempt_id} = $attempt;
# settings are cached per owning library of the attempt's session
458 my $org = $attempt->session->owning_lib;
# timeout is always stored as a true value (|| 5 below), so its presence
# marks this org's settings as already loaded
460 if (!$cache{timeout}{$org}) {
462 $cache{delay}{$org} = $U->ou_ancestor_setting_value(
463 $org, 'url_verify.url_verification_delay', $e);
# a delay of 0 is kept as is; only an undefined setting falls back to 2
466 $cache{delay}{$org} = 2 unless defined $cache{delay}{$org};
# max redirects and timeout fall back to 20 and 5 when unset or false
468 $cache{redirects}{$org} = $U->ou_ancestor_setting_value(
469 $org, 'url_verify.url_verification_max_redirects', $e) || 20;
471 $cache{timeout}{$org} = $U->ou_ancestor_setting_value(
472 $org, 'url_verify.url_verification_max_wait', $e) || 5;
475 sprintf("url: loaded settings delay=%s; max_redirects=%s; timeout=%s",
476 $cache{delay}{$org}, $cache{redirects}{$org}, $cache{timeout}{$org}));
483 $cache{attempt}{$attempt_id},
485 $cache{redirects}{$org},
486 $cache{timeout}{$org}
491 # searches for a completed url_verification for any url processed
492 # within this verification attempt whose full_url matches the
493 # full_url of the provided URL.
#
# Parameters: $e (editor), $attempt (verification attempt), $url (url object).
# When a match is found, the full url_verification object is retrieved by id
# and returned.
494 sub find_matching_url_for_attempt {
495 my ($e, $attempt, $url) = @_;
497 my $match = $e->json_query({
498 select => {uvuv => ['id']},
# restrict to verifications belonging to this attempt
502 filter => {id => $attempt->id}
# same full_url, but not the provided URL row itself
509 id => {'!=' => $url->id},
510 full_url => $url->full_url
513 # There could be multiple verifications for matching URLs
514 # We only want a verification that completed.
515 # Note also that 2 identical URLs processed within the same
516 # sub-batch will have to each be fully processed in their own
517 # right, since neither knows how the other will ultimately fare.
519 res_time => {'!=' => undef}
524 return $e->retrieve_url_verify_url_verification($match->{id}) if $match;
531 1. create the verification object and commit.
533 3. update the verification object to capture the results of the test
534 4. Return redirect_url object if this is a redirect, otherwise undef.
# Arguments: editor, verification attempt, the url object to test, and the
# timeout handed to LWP::UserAgent->timeout.
# Returns a hashref with a "verification" key and, when the HTTP request was
# made, a "redirect_url" key; DB failures return the editor's die_event.
539 my ($e, $attempt, $url, $timeout) = @_;
541 my $url_text = $url->full_url;
544 # first, create the verification object so we can a) indicate that
545 # we're working on this URL and b) get the DB to set the req_time.
547 my $vcation = Fieldmapper::url_verify::url_verification->new;
548 $vcation->url($url->id);
549 $vcation->attempt($attempt->id);
550 $vcation->req_time('now');
552 # begin phase-I DB communication
556 my $match_vcation = find_matching_url_for_attempt($e, $attempt, $url);
# An identical full_url already completed within this attempt: copy its
# result instead of issuing another HTTP request.
558 if ($match_vcation) {
559 $logger->info("url: found matching URL in verification attempt [$url_text]");
560 $vcation->res_code($match_vcation->res_code);
561 $vcation->res_text($match_vcation->res_text);
562 $vcation->redirect_to($match_vcation->redirect_to);
565 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
568 # found a matching URL, no need to re-process
569 return {verification => $vcation} if $match_vcation;
571 # End phase-I DB communication
572 # No active DB xact means no cstore timeout concerns.
576 $ENV{FTP_PASSIVE} = 1; # TODO: setting?
578 my $ua = LWP::UserAgent->new(
579 ssl_opts => {verify_hostname => 0}, # TODO: verify_hostname setting?
580 agent => $user_agent_string
583 $ua->timeout($timeout);
# a HEAD request is issued, not a GET
585 my $req = HTTP::Request->new(HEAD => $url->full_url);
587 # simple_request avoids LWP's auto-redirect magic
588 my $res = $ua->simple_request($req);
590 $logger->info(sprintf(
591 "url: received HTTP '%s' / '%s' [%s]",
597 $vcation->res_code($res->code);
598 $vcation->res_text($res->message);
600 # is this a redirect?
601 if ($res->code =~ /^3/) {
# NOTE(review): this reads the headers object as a plain hash; the
# documented accessor is $res->header('Location') -- confirm the hash key
# is reliable.
603 if (my $loc = $res->headers->{location}) {
604 $redir_url = Fieldmapper::url_verify::url->new;
# NOTE(review): the attempt is loaded with its session fleshed, so this
# passes a session object rather than an id -- confirm the create call
# accepts that.
605 $redir_url->session($attempt->session);
606 $redir_url->redirect_from($url->id);
# NOTE(review): $loc is stored exactly as received, so a relative Location
# value would not be resolved against the original URL -- confirm.
607 $redir_url->full_url($loc);
609 $logger->info("url: redirect found $url_text => $loc");
612 $logger->info("url: server returned 3XX but no 'Location' header for url $url_text");
616 # Begin phase-II DB communication
621 $redir_url = $e->create_url_verify_url($redir_url) or return $e->die_event;
622 $vcation->redirect_to($redir_url->id);
# setting res_time marks this verification as completed
625 $vcation->res_time('now');
626 $e->update_url_verify_url_verification($vcation) or return $e->die_event;
630 verification => $vcation,
631 redirect_url => $redir_url
636 __PACKAGE__->register_method(
637 method => "create_session",
638 api_name => "open-ils.url_verify.session.create",
640 desc => q/Create a URL verify session. Also automatically create and
643 {desc => "Authentication token", type => "string"},
644 {desc => "session name", type => "string"},
645 {desc => "QueryParser search", type => "string"},
646 {desc => "owning_lib (defaults to ws_ou)", type => "number"},
648 return => {desc => "ID of new session or event on error", type => "number"}
# Implementation of the open-ils.url_verify.session.create API call: creates
# a url_verify session together with a record bucket of the same name, in a
# single transaction.
653 my ($self, $client, $auth, $name, $search, $owning_lib) = @_;
655 my $e = new_editor(authtoken => $auth, xact => 1);
656 return $e->die_event unless $e->checkauth;
# default the owning library to the requestor's ws_ou
658 $owning_lib ||= $e->requestor->ws_ou;
659 return $e->die_event unless $e->allowed("URL_VERIFY", $owning_lib);
# session names must be unique: refuse if any session already uses $name
662 my $name_test = $e->search_url_verify_session({name => $name});
663 return $e->die_event unless $name_test; # db error
664 return $e->die_event(
665 new OpenILS::Event("OBJECT_UNIQUE_IDENTIFIER_USED", note => "name"),
666 ) if @$name_test; # already existing sessions with that name
668 my $session = Fieldmapper::url_verify::session->new;
669 $session->name($name);
670 $session->owning_lib($owning_lib);
671 $session->creator($e->requestor->id);
672 $session->search($search);
# the bucket is owned by the requestor and typed "url_verify"
674 my $container = Fieldmapper::container::biblio_record_entry_bucket->new;
675 $container->btype("url_verify");
676 $container->owner($e->requestor->id);
677 $container->name($name);
678 $container->description("Automatically generated");
680 $e->create_container_biblio_record_entry_bucket($container) or
681 return $e->die_event;
# link the session to the bucket through the id found on $e->data
# (presumably the bucket row just created -- confirm)
683 $session->container($e->data->id);
684 $e->create_url_verify_session($session) or
685 return $e->die_event;
687 $e->commit or return $e->die_event;
692 # _check_for_existing_bucket_items() is used later by session_search_and_extract()
#
# Parameters: $e (editor), $session (url_verify session).
# Returns the editor's die_event when the query fails, or a
# URL_VERIFY_SESSION_ALREADY_SEARCHED event when the session's bucket
# already holds at least one item.
693 sub _check_for_existing_bucket_items {
694 my ($e, $session) = @_;
696 my $items = $e->json_query(
698 select => {cbrebi => ['id']},
699 from => {cbrebi => {}},
# only items in this session's bucket
700 where => {bucket => $session->container},
703 ) or return $e->die_event;
705 return new OpenILS::Event("URL_VERIFY_SESSION_ALREADY_SEARCHED") if @$items;
710 # _get_all_search_results() is used later by session_search_and_extract()
#
# Parameters: $client (used to send keepalive status messages), $session
# (url_verify session whose stored search is run).
# Returns a list. On success it is (undef, @result_ids). On failure it is a
# single event: an "UNKNOWN" event when the search call returns nothing, or
# the event returned by the search call itself.
711 sub _get_all_search_results {
712 my ($client, $session) = @_;
716 # Don't loop if the user has specified their own offset.
717 if ($session->search =~ /offset\(\d+\)/) {
718 my $res = $U->simplereq(
720 "open-ils.search.biblio.multiclass.query.staff",
724 return new OpenILS::Event("UNKNOWN") unless $res;
725 return $res if $U->is_event($res);
# shift takes the first element of each nested array and removes it from
# the response structure
727 @result_ids = map { shift @$_ } @{$res->{ids}}; # IDs nested in array
732 LOOP: { do { # Fun fact: you cannot "last" out of a do/while in Perl
733 # unless you wrap it in another loop structure.
# page through the results: the offset is the number of ids collected so far
734 my $search = $session->search . " offset(".scalar(@result_ids).")";
736 my $res = $U->simplereq(
738 "open-ils.search.biblio.multiclass.query.staff",
742 return new OpenILS::Event("UNKNOWN") unless $res;
743 return $res if $U->is_event($res);
745 # Search only returns the total count when offset is 0.
746 # We can't get more than one superpage this way, XXX TODO ?
747 $count = $res->{count} unless defined $count;
749 my @this_batch = map { shift @$_ } @{$res->{ids}}; # unnest IDs
750 push @result_ids, @this_batch;
752 # Send a keepalive in case search is slow, although it'll probably
753 # be the query for the first ten results that's slowest.
754 $client->status(new OpenSRF::DomainObject::oilsContinueStatus);
756 last unless @this_batch; # Protect against getting fewer results
757 # than count promised.
759 } while ($count - scalar(@result_ids) > 0); }
762 return (undef, @result_ids);
766 __PACKAGE__->register_method(
767 method => "session_search_and_extract",
768 api_name => "open-ils.url_verify.session.search_and_extract",
772 Perform the search contained in the session,
773 populating the linked bucket, and extracting URLs /,
775 {desc => "Authentication token", type => "string"},
776 {desc => "url_verify.session id", type => "number"},
779 desc => q/stream of numbers: first number of search results, then
780 numbers of extracted URLs for each record, grouped into arrays
# API method body: runs the session's search, stores one bucket item per
# search result, then extracts URLs for each bucket item.
# Responses streamed to the caller: first the value of $pos after the items
# were created, then arrayrefs of per-record URL counts.
787 sub session_search_and_extract {
788 my ($self, $client, $auth, $ses_id) = @_;
790 my $e = new_editor(authtoken => $auth);
791 return $e->die_event unless $e->checkauth;
793 my $session = $e->retrieve_url_verify_session(int($ses_id));
# a missing session and a failed permission check both end in die_event
795 return $e->die_event unless
796 $session and $e->allowed("URL_VERIFY", $session->owning_lib);
# only the user who created the session may run its search
798 if ($session->creator != $e->requestor->id) {
800 return new OpenILS::Event("URL_VERIFY_NOT_SESSION_CREATOR");
804 _check_for_existing_bucket_items($e, $session);
808 return $delete_error;
# the helper returns (error, @ids); error is undef on success
811 my ($search_error, @result_ids) =
812 _get_all_search_results($client, $session);
816 return $search_error;
821 # Make and save a bucket item for each search result.
826 # There's an opportunity below to parallelize the extraction of URLs if
829 foreach my $bre_id (@result_ids) {
831 Fieldmapper::container::biblio_record_entry_bucket_item->new;
833 $bucket_item->bucket($session->container);
834 $bucket_item->target_biblio_record_entry($bre_id);
# items are numbered in search-result order
835 $bucket_item->pos($pos++);
837 $e->create_container_biblio_record_entry_bucket_item($bucket_item) or
838 return $e->die_event;
840 push @item_ids, $e->data->id;
845 $client->respond($pos); # first response: the number of items created
846 # (number of search results)
848 # For each bucket item, extract URLs. Report counts of URLs extracted
849 # from each record in batches at every hundred records. XXX Arbitrary.
852 foreach my $item_id (@item_ids) {
# $ses_id is passed exactly as received here; the int() coercion above was
# applied only to the session retrieve call
853 my $res = $e->json_query({
854 from => ["url_verify.extract_urls", $ses_id, $item_id]
855 }) or return $e->die_event;
857 push @url_counts, $res->[0]{"url_verify.extract_urls"};
859 if (scalar(@url_counts) % 100 == 0) {
860 $client->respond([ @url_counts ]);
# send whatever counts remain after the loop
865 $client->respond([ @url_counts ]) if @url_counts;