1 package OpenILS::Application::URLVerify;
3 # For code searchability, I'm telling you this is the "link checker."
5 use base qw/OpenILS::Application/;
6 use strict; use warnings;
7 use OpenSRF::Utils::Logger qw(:logger);
8 use OpenSRF::MultiSession;
9 use OpenILS::Utils::Fieldmapper;
10 use OpenILS::Utils::CStoreEditor q/:funcs/;
11 use OpenILS::Application::AppUtils;
16 $Data::Dumper::Indent = 0;
18 my $U = 'OpenILS::Application::AppUtils';
21 __PACKAGE__->register_method(
22 method => 'verify_session',
23 api_name => 'open-ils.url_verify.session.verify',
27 Performs verification on all (or a subset of the) URLs within the requested session.
30 {desc => 'Authentication token', type => 'string'},
31 {desc => 'Session ID (url_verify.session.id)', type => 'number'},
32 {desc => 'URL ID list (optional). An empty list will result in no URLs being processed, but null will result in all the URLs for the session being processed', type => 'array'},
36 report_all => bypass response throttling and return all URL sub-process
37 responses to the caller. Not recommened for remote (web, etc.) clients,
38 because it can be a lot of data.
39 resume_attempt => atttempt_id. Resume verification after a failure.
40 resume_with_new_attempt => If true, resume from resume_attempt, but
41 create a new attempt to track the resumption.
47 Stream of objects containing the number of URLs to be processed (url_count),
48 the number processed thus far including redirects (total_processed),
49 and the current url_verification object (current_verification).
51 Note that total_processed may ultimately exceed url_count, since it
52 includes non-anticipate-able redirects.
54 The final response contains url_count, total_processed, and the
55 verification_attempt object (attempt).
61 # "verify_session" sounds like something to do with authentication, but it
62 # actually means for a given session, verify all the URLs associated with
65 my ($self, $client, $auth, $session_id, $url_ids, $options) = @_;
68 my $e = new_editor(authtoken => $auth, xact => 1);
69 return $e->die_event unless $e->checkauth;
70 return $e->die_event unless $e->allowed('URL_VERIFY');
72 my $session = $e->retrieve_url_verify_session($session_id)
73 or return $e->die_event;
75 my $attempt_id = $options->{resume_attempt};
79 # No URLs provided, load all URLs for the requested session
82 select => {uvu => ['id']},
85 cbrebi => { # bucket item
86 join => { cbreb => { # bucket
87 join => { uvs => { # session
88 filter => {id => $session_id}
98 # when resuming an existing attempt (that presumably failed
99 # mid-processing), we only want to process URLs that either
100 # have no linked url_verification or have an un-completed
103 $logger->info("url: resuming attempt $attempt_id");
105 $query->{from}->{uvu}->{uvuv} = {
107 filter => {attempt => $attempt_id}
113 {id => undef}, # no verification started
114 {res_code => undef} # verification started but did no complete
121 # this is a new attempt, so we only want to process URLs that
122 # originated from the source records and not from redirects.
125 '+uvu' => {redirect_from => undef}
129 my $ids = $e->json_query($query);
130 $url_ids = [ map {$_->{id}} @$ids ];
133 my $url_count = scalar(@$url_ids);
134 $logger->info("url: processing $url_count URLs");
137 if ($attempt_id and !$options->{resume_with_new_attempt}) {
139 $attempt = $e->retrieve_url_verification_attempt($attempt_id)
140 or return $e->die_event;
142 # no data was written
147 $attempt = Fieldmapper::url_verify::verification_attempt->new;
148 $attempt->session($session_id);
149 $attempt->usr($e->requestor->id);
150 $attempt->start_time('now');
152 $e->create_url_verify_verification_attempt($attempt)
153 or return $e->die_event;
161 # Now cycle through the URLs in batches.
163 my $batch_size = $U->ou_ancestor_setting_value(
164 $session->owning_lib,
165 'url_verify.verification_batch_size', $e) || 5;
167 my $total_excluding_redirects = 0;
168 my $total_processed = 0; # total number processed, including redirects
171 # before we start the real work, let the caller know
172 # the attempt (id) so recovery is possible.
175 url_count => $url_count,
176 total_processed => $total_processed,
177 total_excluding_redirects => $total_excluding_redirects,
181 my $multises = OpenSRF::MultiSession->new(
183 app => 'open-ils.url_verify', # hey, that's us!
186 success_handler => sub {
187 my ($self, $req) = @_;
189 # API call streams fleshed url_verification objects. We wrap
190 # those up with some extra info and pass them on to the caller.
192 for my $resp (@{$req->{response}}) {
193 my $content = $resp->content;
199 if ($options->{report_all} or ($total_processed % $resp_window == 0)) {
201 url_count => $url_count,
202 current_verification => $content,
203 total_excluding_redirects => $total_excluding_redirects,
204 total_processed => $total_processed
208 # start off responding quickly, then throttle
209 # back to only relaying every 256 messages.
210 $resp_window *= 2 unless $resp_window >= 256;
215 failure_handler => sub {
216 my ($self, $req) = @_;
218 # {error} should be an Error w/ a toString
219 $logger->error("url: error processing URL: " . $req->{error});
223 sort_and_fire_domains(
224 $e, $auth, $attempt, $url_ids, $multises, \$total_excluding_redirects
227 # Wait for all requests to be completed
228 $multises->session_wait(1);
230 # All done. Let's wrap up the attempt.
231 $attempt->finish_time('now');
234 $e->update_url_verify_verification_attempt($attempt) or
235 return $e->die_event;
239 # This way the caller gets an actual timestamp in the "finish_time" field
240 # instead of the string "now".
241 $attempt = $e->retrieve_url_verify_verification_attempt($e->data) or
242 return $e->die_event;
247 url_count => $url_count,
248 total_processed => $total_processed,
249 total_excluding_redirects => $total_excluding_redirects,
254 # retrieves the URL domains and sorts them into buckets*
255 # Iterates over the buckets and fires the multi-session call
256 # the main drawback to this domain sorting approach is that
257 # any domain used a lot more than the others will be the
258 # only domain standing after the others are exhausted, which
259 # means it will take a beating at the end of the batch.
261 # * local data structures, not container.* buckets
262 sub sort_and_fire_domains {
263 my ($e, $auth, $attempt, $url_ids, $multises, $count) = @_;
265 # there is potential here for data sets to be too large
266 # for delivery, but it's not likely, since we're only
267 # fetching ID and domain.
268 my $urls = $e->json_query(
270 select => {uvu => ['id', 'domain']},
272 where => {id => $url_ids}
274 # {substream => 1} only if needed
277 # sort them into buckets based on domain name
279 for my $url (@$urls) {
280 $domains{$url->{domain}} = [] unless $domains{$url->{domain}};
281 push(@{$domains{$url->{domain}}}, $url->{id});
284 # loop through the domains and fire the verification call
285 while (keys %domains) {
286 for my $domain (keys %domains) {
288 my $url_id = pop(@{$domains{$domain}});
289 delete $domains{$domain} unless @{$domains{$domain}};
292 'open-ils.url_verify.verify_url',
293 $auth, $attempt->id, $url_id);
295 $$count++; # sic, a reference to a scalar
301 # XXX I really want to move this method to open-ils.storage, so we don't have
302 # to authenticate a zillion times. LFW
304 __PACKAGE__->register_method(
305 method => 'verify_url',
306 api_name => 'open-ils.url_verify.verify_url',
310 Performs verification of a single URL. When a redirect is detected,
311 a new URL is created to model the redirect and the redirected URL
312 is then tested, up to max-redirects or a loop is detected.
315 {desc => 'Authentication token', type => 'string'},
316 {desc => 'Verification attempt ID (url_verify.verification_attempt.id)', type => 'number'},
317 {desc => 'URL id (url_verify.url.id)', type => 'number'},
319 return => {desc => q/Stream of url_verification objects, one per URL tested/}
325 verification.res_code:
327 999 bad hostname, etc. (IO::Socket::Inet errors)
328 998 in-flight errors (e.g connection closed prematurely)
333 verification.res_text:
335 $@ or custom message "Redirect Loop"
340 my ($self, $client, $auth, $attempt_id, $url_id) = @_;
343 my $e = new_editor(authtoken => $auth);
344 return $e->event unless $e->checkauth;
346 my $url = $e->retrieve_url_verify_url($url_id) or return $e->event;
348 my ($attempt, $delay, $max_redirects, $timeout) =
349 collect_verify_attempt_and_settings($e, $attempt_id);
351 return $e->event unless $e->allowed(
352 'URL_VERIFY', $attempt->session->owning_lib);
355 my $loop_detected = 0;
358 while ($redir_count++ < $max_redirects) {
360 if ($seen_urls{$cur_url->full_url}) {
365 $seen_urls{$cur_url->full_url} = $cur_url;
367 my $url_resp = verify_one_url($e, $attempt, $cur_url, $timeout);
369 # something tragic happened
370 return $url_resp if $U->is_event($url_resp);
372 # flesh and respond to the caller
373 $url_resp->{verification}->url($cur_url);
374 $client->respond($url_resp->{verification});
376 $cur_url = $url_resp->{redirect_url} or last;
379 if ($loop_detected or $redir_count > $max_redirects) {
381 my $vcation = Fieldmapper::url_verify::url_verification->new;
382 $vcation->url($cur_url->id);
383 $vcation->attempt($attempt->id);
384 $vcation->req_time('now');
386 if ($loop_detected) {
387 $logger->info("url: redirect loop detected at " . $cur_url->full_url);
388 $vcation->res_code('996');
389 $vcation->res_text('Redirect Loop');
392 $logger->info("url: max redirects reached for source URL " . $url->full_url);
393 $vcation->res_code('995');
394 $vcation->res_text('Max Redirects');
398 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
402 # The calling code is likely not multi-threaded, so a
403 # per-URL (i.e. per-thread) delay would not be possible.
404 # Applying the delay here allows the caller to process
405 # batches of URLs without having to worry about the delay.
411 # temporarily cache some data to avoid a pile
412 # of data lookups on every URL processed.
414 sub collect_verify_attempt_and_settings {
415 my ($e, $attempt_id) = @_;
418 if (!(keys %cache) or $cache{age} > 20) { # configurable?
428 if ( !($attempt = $cache{attempt}{$attempt_id}) ) {
430 # attempt may have just been created, so
431 # we need to guarantee a write-DB read.
435 $e->retrieve_url_verify_verification_attempt([
438 flesh_fields => {uvva => ['session']}
440 ]) or return $e->die_event;
444 $cache{attempt}{$attempt_id} = $attempt;
447 my $org = $attempt->session->owning_lib;
449 if (!$cache{timeout}{$org}) {
451 $cache{delay}{$org} = $U->ou_ancestor_setting_value(
452 $org, 'url_verify.url_verification_delay', $e);
455 $cache{delay}{$org} = 2 unless defined $cache{delay}{$org};
457 $cache{redirects}{$org} = $U->ou_ancestor_setting_value(
458 $org, 'url_verify.url_verification_max_redirects', $e) || 20;
460 $cache{timeout}{$org} = $U->ou_ancestor_setting_value(
461 $org, 'url_verify.url_verification_max_wait', $e) || 5;
464 sprintf("url: loaded settings delay=%s; max_redirects=%s; timeout=%s",
465 $cache{delay}{$org}, $cache{redirects}{$org}, $cache{timeout}{$org}));
472 $cache{attempt}{$attempt_id},
474 $cache{redirects}{$org},
475 $cache{timeout}{$org}
480 # searches for a completed url_verfication for any url processed
481 # within this verification attempt whose full_url matches the
482 # full_url of the provided URL.
483 sub find_matching_url_for_attempt {
484 my ($e, $attempt, $url) = @_;
486 my $match = $e->json_query({
487 select => {uvuv => ['id']},
491 filter => {id => $attempt->id}
498 id => {'!=' => $url->id},
499 full_url => $url->full_url
502 # There could be multiple verifications for matching URLs
503 # We only want a verification that completed.
504 # Note also that 2 identical URLs processed within the same
505 # sub-batch will have to each be fully processed in their own
506 # right, since neither knows how the other will ultimately fare.
508 res_time => {'!=' => undef}
513 return $e->retrieve_url_verify_url_verification($match->{id}) if $match;
520 1. create the verification object and commit.
522 3. update the verification object to capture the results of the test
523 4. Return redirect_url object if this is a redirect, otherwise undef.
528 my ($e, $attempt, $url, $timeout) = @_;
530 my $url_text = $url->full_url;
533 # first, create the verification object so we can a) indicate that
534 # we're working on this URL and b) get the DB to set the req_time.
536 my $vcation = Fieldmapper::url_verify::url_verification->new;
537 $vcation->url($url->id);
538 $vcation->attempt($attempt->id);
539 $vcation->req_time('now');
541 # begin phase-I DB communication
545 my $match_vcation = find_matching_url_for_attempt($e, $attempt, $url);
547 if ($match_vcation) {
548 $logger->info("url: found matching URL in verification attempt [$url_text]");
549 $vcation->res_code($match_vcation->res_code);
550 $vcation->res_text($match_vcation->res_text);
551 $vcation->redirect_to($match_vcation->redirect_to);
554 $e->create_url_verify_url_verification($vcation) or return $e->die_event;
557 # found a matching URL, no need to re-process
558 return {verification => $vcation} if $match_vcation;
560 # End phase-I DB communication
561 # No active DB xact means no cstore timeout concerns.
565 $ENV{FTP_PASSIVE} = 1; # TODO: setting?
567 my $ua = LWP::UserAgent->new(ssl_opts => {verify_hostname => 0}); # TODO: verify_hostname setting?
568 $ua->timeout($timeout);
570 my $req = HTTP::Request->new(HEAD => $url->full_url);
572 # simple_request avoids LWP's auto-redirect magic
573 my $res = $ua->simple_request($req);
575 $logger->info(sprintf(
576 "url: received HTTP '%s' / '%s' [%s]",
582 $vcation->res_code($res->code);
583 $vcation->res_text($res->message);
585 # is this a redirect?
586 if ($res->code =~ /^3/) {
588 if (my $loc = $res->headers->{location}) {
589 $redir_url = Fieldmapper::url_verify::url->new;
590 $redir_url->redirect_from($url->id);
591 $redir_url->full_url($loc);
593 $logger->info("url: redirect found $url_text => $loc");
596 $logger->info("url: server returned 3XX but no 'Location' header for url $url_text");
600 # Begin phase-II DB communication
605 $redir_url = $e->create_url_verify_url($redir_url) or return $e->die_event;
606 $vcation->redirect_to($redir_url->id);
609 $vcation->res_time('now');
610 $e->update_url_verify_url_verification($vcation) or return $e->die_event;
614 verification => $vcation,
615 redirect_url => $redir_url
620 __PACKAGE__->register_method(
621 method => "create_session",
622 api_name => "open-ils.url_verify.session.create",
624 desc => q/Create a URL verify session. Also automatically create and
627 {desc => "Authentication token", type => "string"},
628 {desc => "session name", type => "string"},
629 {desc => "QueryParser search", type => "string"},
630 {desc => "owning_lib (defaults to ws_ou)", type => "number"},
632 return => {desc => "ID of new session or event on error", type => "number"}
637 my ($self, $client, $auth, $name, $search, $owning_lib) = @_;
639 my $e = new_editor(authtoken => $auth, xact => 1);
640 return $e->die_event unless $e->checkauth;
642 $owning_lib ||= $e->requestor->ws_ou;
643 return $e->die_event unless $e->allowed("URL_VERIFY", $owning_lib);
645 my $session = Fieldmapper::url_verify::session->new;
646 $session->name($name);
647 $session->owning_lib($owning_lib);
648 $session->creator($e->requestor->id);
649 $session->search($search);
651 my $container = Fieldmapper::container::biblio_record_entry_bucket->new;
652 $container->btype("url_verify");
653 $container->owner($e->requestor->id);
654 $container->name($name);
655 $container->description("Automatically generated");
657 $e->create_container_biblio_record_entry_bucket($container) or
658 return $e->die_event;
660 $session->container($e->data->id);
661 $e->create_url_verify_session($session) or
662 return $e->die_event;
664 $e->commit or return $e->die_event;
669 # _check_for_existing_bucket_items() is used later by session_search_and_extract()
670 sub _check_for_existing_bucket_items {
671 my ($e, $session) = @_;
673 my $items = $e->json_query(
675 select => {cbrebi => ['id']},
676 from => {cbrebi => {}},
677 where => {bucket => $session->container},
680 ) or return $e->die_event;
682 return new OpenILS::Event("URL_VERIFY_SESSION_ALREADY_SEARCHED") if @$items;
687 # _get_all_search_results() is used later by session_search_and_extract()
688 sub _get_all_search_results {
689 my ($client, $session) = @_;
693 # Don't loop if the user has specified their own offset.
694 if ($session->search =~ /offset\(\d+\)/) {
695 my $res = $U->simplereq(
697 "open-ils.search.biblio.multiclass.query.staff",
701 return new OpenILS::Event("UNKNOWN") unless $res;
702 return $res if $U->is_event($res);
704 @result_ids = map { shift @$_ } @{$res->{ids}}; # IDs nested in array
709 LOOP: { do { # Fun fact: you cannot "last" out of a do/while in Perl
710 # unless you wrap it another loop structure.
711 my $search = $session->search . " offset(".scalar(@result_ids).")";
713 my $res = $U->simplereq(
715 "open-ils.search.biblio.multiclass.query.staff",
719 return new OpenILS::Event("UNKNOWN") unless $res;
720 return $res if $U->is_event($res);
722 # Search only returns the total count when offset is 0.
723 # We can't get more than one superpage this way, XXX TODO ?
724 $count = $res->{count} unless defined $count;
726 my @this_batch = map { shift @$_ } @{$res->{ids}}; # unnest IDs
727 push @result_ids, @this_batch;
729 # Send a keepalive in case search is slow, although it'll probably
730 # be the query for the first ten results that's slowest.
731 $client->status(new OpenSRF::DomainObject::oilsContinueStatus);
733 last unless @this_batch; # Protect against getting fewer results
734 # than count promised.
736 } while ($count - scalar(@result_ids) > 0); }
739 return (undef, @result_ids);
743 __PACKAGE__->register_method(
744 method => "session_search_and_extract",
745 api_name => "open-ils.url_verify.session.search_and_extract",
749 Perform the search contained in the session,
750 populating the linked bucket, and extracting URLs /,
752 {desc => "Authentication token", type => "string"},
753 {desc => "url_verify.session id", type => "number"},
756 desc => q/stream of numbers: first number of search results, then
757 numbers of extracted URLs for each record, grouped into arrays
764 sub session_search_and_extract {
765 my ($self, $client, $auth, $ses_id) = @_;
767 my $e = new_editor(authtoken => $auth);
768 return $e->die_event unless $e->checkauth;
770 my $session = $e->retrieve_url_verify_session(int($ses_id));
772 return $e->die_event unless
773 $session and $e->allowed("URL_VERIFY", $session->owning_lib);
775 if ($session->creator != $e->requestor->id) {
777 return new OpenILS::Event("URL_VERIFY_NOT_SESSION_CREATOR");
781 _check_for_existing_bucket_items($e, $session);
785 return $delete_error;
788 my ($search_error, @result_ids) =
789 _get_all_search_results($client, $session);
793 return $search_error;
798 # Make and save a bucket item for each search result.
803 # There's an opportunity below to parallelize the extraction of URLs if
806 foreach my $bre_id (@result_ids) {
808 Fieldmapper::container::biblio_record_entry_bucket_item->new;
810 $bucket_item->bucket($session->container);
811 $bucket_item->target_biblio_record_entry($bre_id);
812 $bucket_item->pos($pos++);
814 $e->create_container_biblio_record_entry_bucket_item($bucket_item) or
815 return $e->die_event;
817 push @item_ids, $e->data->id;
822 $client->respond($pos); # first response: the number of items created
823 # (number of search results)
825 # For each contain item, extract URLs. Report counts of URLs extracted
826 # from each record in batches at every hundred records. XXX Arbitrary.
829 foreach my $item_id (@item_ids) {
830 my $res = $e->json_query({
831 from => ["url_verify.extract_urls", $ses_id, $item_id]
832 }) or return $e->die_event;
834 push @url_counts, $res->[0]{"url_verify.extract_urls"};
836 if (scalar(@url_counts) % 100 == 0) {
837 $client->respond([ @url_counts ]);
842 $client->respond([ @url_counts ]) if @url_counts;