From 073bb779d7c11c6e74dba9a1f0023536cccd9872 Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Tue, 7 Aug 2012 16:28:33 -0400 Subject: [PATCH] Link checker: middle layer work, actual parallelized machinery to check links Added is_event check to AppUtils URLVerify.pm getting started URLVerify.pm; firing batch url verify calls URLVerify.pm; docs / cleanup Constraints need unique names Liberalize res_code constraint URLVerify.pm; resume options; docs URLVerify.pm; initial testing tweaks Move Fieldmapper API call to Application.pm Move export of opensrf.open-ils.system.fieldmapper API call from Fieldmapper.pm into Application.pm with the rest of the shared API calls. This allows us to remove the OpenSRF::Application base from Fieldmapper, which was causing some method name collisions. For example, if a Fieldmapper object had a field called "session", which, incidentally, is also a method of OpenSRF::Application, the version from OpenSRF::Application would get called instead of the Fieldmapper version, since Fieldmapper methods are defined during AUTOLOAD. Hilarity was guaranteed to ensue. 
URLVerify.pm; more testing tweaks URLVerify.pm; redirects / error handling url_verify perm/org setting seed data url_verify perm/org setting seed data url_verify perm/org setting seed data url_verify seed data (trigger) url verify seed data / null constraint repairs URLVerify.pm; settings, tmp caching, cleanup / misc url verify seed data repairs url_verify seed data / sql manifest url_verify schema repair / res-code constraint Do the same thing I did to fix constrain names in upgr scripts to baseline URLVerify.pm; move to lwp to support ftp and simplify URLVerify.pm; apply timeout to lwp useragent; comments URLVerify.pm; avoid re-processing same url within the same attempt URLVerify.pm; avoid re-processing same url within the same attempt (thinko) URLVerify.pm; avoid re-processing same url repairs; honor delay=0 URLVerify.pm; docs; url shuffling URLVerify.pm; docs; url shuffling URLVerify.pm; docs; url domain looping; cleanup URLVerify.pm; docs; tested redirect max/loops and repairs Signed-off-by: Bill Erickson Signed-off-by: Mike Rylander --- .../src/perlmods/lib/OpenILS/Application.pm | 14 + .../lib/OpenILS/Application/AppUtils.pm | 14 +- .../lib/OpenILS/Application/URLVerify.pm | 587 ++++++++++++++++++ .../perlmods/lib/OpenILS/Utils/Fieldmapper.pm | 14 - Open-ILS/src/sql/Pg/075.schema.url_verify.sql | 16 +- Open-ILS/src/sql/Pg/950.data.seed-values.sql | 98 ++- Open-ILS/src/sql/Pg/sql_file_manifest | 2 + .../sql/Pg/upgrade/XXXX.schema.url_verify.sql | 16 +- .../Pg/upgrade/YYYY.functions.url_verify.sql | 27 + .../sql/Pg/upgrade/ZZZZ.data.url_verify.sql | 131 ++++ 10 files changed, 887 insertions(+), 32 deletions(-) create mode 100644 Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm create mode 100644 Open-ILS/src/sql/Pg/upgrade/ZZZZ.data.url_verify.sql diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application.pm index cd4dbbf9c0..2888c93918 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application.pm 
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application.pm @@ -2,6 +2,7 @@ package OpenILS::Application; use OpenSRF::Application; use UNIVERSAL::require; use base qw/OpenSRF::Application/; +use OpenILS::Utils::Fieldmapper; sub ils_version { # version format is "x-y-z", for example "2-0-0" for Evergreen 2.0.0 @@ -25,6 +26,19 @@ sub get_idl_file { return OpenSRF::Utils::SettingsClient->new->config_value('IDL'); } +sub publish_fieldmapper { + my ($self,$client,$class) = @_; + + return $Fieldmapper::fieldmap unless (defined $class); + return undef unless (exists($$Fieldmapper::fieldmap{$class})); + return {$class => $$Fieldmapper::fieldmap{$class}}; +} +__PACKAGE__->register_method( + api_name => 'opensrf.open-ils.system.fieldmapper', + api_level => 1, + method => 'publish_fieldmapper', +); + sub register_method { my $class = shift; my %args = @_; diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/AppUtils.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/AppUtils.pm index e4d1e7d36e..b8124f2ec5 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/AppUtils.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/AppUtils.pm @@ -140,10 +140,22 @@ sub rollback_db_session { # returns the event code otherwise sub event_code { my( $self, $evt ) = @_; - return $evt->{ilsevent} if( ref($evt) eq 'HASH' and defined($evt->{ilsevent})) ; + return $evt->{ilsevent} if $self->is_event($evt); return undef; } +# some events, in particular auto-generated events, don't have an +# ilsevent key. treat hashes with a 'textcode' key as events. +sub is_event { + my ($self, $evt) = @_; + return ( + ref($evt) eq 'HASH' and ( + defined $evt->{ilsevent} or + defined $evt->{textcode} + ) + ); +} + # --------------------------------------------------------------------------- # Checks to see if a user is logged in. Returns the user record on success, # throws an exception on error. 
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm
new file mode 100644
index 0000000000..4e86dcd35e
--- /dev/null
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm
@@ -0,0 +1,587 @@
+package OpenILS::Application::URLVerify;
+use base qw/OpenILS::Application/;
+use strict; use warnings;
+use OpenSRF::Utils::Logger qw(:logger);
+use OpenSRF::MultiSession;
+use OpenILS::Utils::Fieldmapper;
+use OpenILS::Utils::CStoreEditor q/:funcs/;
+use OpenILS::Application::AppUtils;
+use LWP::UserAgent;
+
+my $U = 'OpenILS::Application::AppUtils';
+
+
+__PACKAGE__->register_method(
+    method => 'validate_session',
+    api_name => 'open-ils.url_verify.session.validate',
+    stream => 1,
+    signature => {
+        desc => q/
+            Performs verification on all (or a subset of the) URLs within the requested session.
+        /,
+        params => [
+            {desc => 'Authentication token', type => 'string'},
+            {desc => 'Session ID (url_verify.session.id)', type => 'number'},
+            {desc => 'URL ID list (optional).  An empty list will result in no URLs being processed', type => 'array'},
+            {
+                desc => q/
+                    Options (optional).
+                    report_all => bypass response throttling and return all URL sub-process
+                        responses to the caller.  Not recommended for remote (web, etc.) clients,
+                        because it can be a lot of data.
+                    resume_attempt => attempt_id.  Resume verification after a failure.
+                    resume_with_new_attempt => If true, resume from resume_attempt, but
+                        create a new attempt to track the resumption.
+                /,
+                type => 'hash'
+            }
+        ],
+        return => {desc => q/
+            Stream of objects containing the number of URLs to be processed (url_count),
+            the number processed thus far including redirects (total_processed),
+            and the current url_verification object (current_verification).
+
+            Note that total_processed may ultimately exceed url_count, since it
+            includes non-anticipate-able redirects.
+
+            The final response contains url_count, total_processed, and the
+            verification_attempt object (attempt).
+        /
+        }
+    }
+);
+
+# Top-level entry point for a verification run: builds the URL work list,
+# creates (or resumes) a verification_attempt, then fans the URLs out to
+# parallel open-ils.url_verify.verify_url calls via OpenSRF::MultiSession.
+sub validate_session {
+    my ($self, $client, $auth, $session_id, $url_ids, $options) = @_;
+    $options ||= {};
+
+    my $e = new_editor(authtoken => $auth, xact => 1);
+    return $e->die_event unless $e->checkauth;
+    return $e->die_event unless $e->allowed('VERIFY_URL');
+
+    my $session = $e->retrieve_url_verify_session($session_id)
+        or return $e->die_event;
+
+    my $attempt_id = $options->{resume_attempt};
+
+    if (!$url_ids) {
+
+        # No URLs provided, load all URLs for the requested session
+
+        my $query = {
+            select => {uvu => ['id']},
+            from => {
+                uvu => { # url
+                    cbrebi => { # bucket item
+                        join => { cbreb => { # bucket
+                            join => { uvs => { # session
+                                filter => {id => $session_id}
+                            }}
+                        }}
+                    }
+                }
+            }
+        };
+
+        if ($attempt_id) {
+
+            # when resuming an existing attempt (that presumably failed
+            # mid-processing), we only want to process URLs that either
+            # have no linked url_verification or have an un-completed
+            # url_verification.
+
+            $logger->info("url: resuming attempt $attempt_id");
+
+            $query->{from}->{uvu}->{uvuv} = {
+                type => 'left',
+                filter => {attempt => $attempt_id}
+            };
+
+            $query->{where} = {
+                '+uvuv' => {
+                    '-or' => [
+                        {id => undef}, # no verification started
+                        {res_code => undef} # verification started but did not complete
+                    ]
+                }
+            };
+
+        } else {
+
+            # this is a new attempt, so we only want to process URLs that
+            # originated from the source records and not from redirects.
+
+            $query->{where} = {
+                '+uvu' => {redirect_from => undef}
+            };
+        }
+
+        my $ids = $e->json_query($query);
+        $url_ids = [ map {$_->{id}} @$ids ];
+    }
+
+    my $url_count = scalar(@$url_ids);
+    $logger->info("url: processing $url_count URLs");
+
+    my $attempt;
+    if ($attempt_id and !$options->{resume_with_new_attempt}) {
+
+        # resuming: re-use the caller-supplied attempt as-is
+        $attempt = $e->retrieve_url_verification_attempt($attempt_id)
+            or return $e->die_event;
+
+        # no data was written
+        $e->rollback;
+
+    } else {
+
+        $attempt = Fieldmapper::url_verify::verification_attempt->new;
+        $attempt->session($session_id);
+        $attempt->usr($e->requestor->id);
+        $attempt->start_time('now');
+
+        $e->create_url_verify_verification_attempt($attempt)
+            or return $e->die_event;
+
+        $e->commit;
+    }
+
+    # END DB TRANSACTION
+
+    # Now cycle through the URLs in batches.
+
+    my $batch_size = $U->ou_ancestor_setting_value(
+        $session->owning_lib,
+        'url_verify.verification_batch_size', $e) || 5;
+
+    my $num_processed = 0; # total number processed, including redirects
+    my $resp_window = 1;
+
+    # before we start the real work, let the caller know
+    # the attempt (id) so recovery is possible.
+
+    $client->respond({
+        url_count => $url_count,
+        total_processed => $num_processed,
+        attempt => $attempt
+    });
+
+    my $multises = OpenSRF::MultiSession->new(
+
+        app => 'open-ils.url_verify', # hey, that's us!
+        cap => $batch_size,
+
+        success_handler => sub {
+            my ($self, $req) = @_;
+
+            # API call streams fleshed url_verification objects.  We wrap
+            # those up with some extra info and pass them on to the caller.
+
+            for my $resp (@{$req->{response}}) {
+                my $content = $resp->content;
+
+                if ($content) {
+
+                    $num_processed++;
+
+                    if ($options->{report_all} or ($num_processed % $resp_window == 0)) {
+                        $client->respond({
+                            url_count => $url_count,
+                            current_verification => $content,
+                            total_processed => $num_processed
+                        });
+                    }
+
+                    # start off responding quickly, then throttle
+                    # back to only relaying every 256 messages.
+                    $resp_window *= 2 unless $resp_window == 256;
+                }
+            }
+        },
+
+        failure_handler => sub {
+            my ($self, $req) = @_;
+
+            # {error} should be an Error w/ a toString
+            $logger->error("url: error processing URL: " . $req->{error});
+        }
+    );
+
+    sort_and_fire_domains($e, $auth, $attempt, $url_ids, $multises);
+
+    # Wait for all requests to be completed
+    $multises->session_wait(1);
+
+    # All done.  Let's wrap up the attempt.
+    $attempt->finish_time('now');
+
+    $e->xact_begin;
+    $e->update_url_verify_verification_attempt($attempt) or return $e->die_event;
+    $e->xact_commit;
+
+    return {
+        url_count => $url_count,
+        total_processed => $num_processed,
+        attempt => $attempt
+    };
+}
+
+# retrieves the URL domains and sorts them into buckets
+# Iterates over the buckets and fires the multi-session call
+# the main drawback to this domain sorting approach is that
+# any domain used a lot more than the others will be the
+# only domain standing after the others are exhausted, which
+# means it will take a beating at the end of the batch.
+sub sort_and_fire_domains {
+    my ($e, $auth, $attempt, $url_ids, $multises) = @_;
+
+    # there is potential here for data sets to be too large
+    # for delivery, but it's not likely, since we're only
+    # fetching ID and domain.
+    my $urls = $e->json_query(
+        {
+            select => {uvu => ['id', 'domain']},
+            from => 'uvu',
+            where => {id => $url_ids}
+        },
+        # {substream => 1} only if needed
+    );
+
+    # sort them into buckets based on domain name
+    my %domains;
+    for my $url (@$urls) {
+        $domains{$url->{domain}} = [] unless $domains{$url->{domain}};
+        push(@{$domains{$url->{domain}}}, $url->{id});
+    }
+
+    # loop through the domains and fire the verification call
+    # (round-robin: one URL per domain per pass, so no single host
+    # receives a burst of simultaneous requests)
+    while (keys %domains) {
+        for my $domain (keys %domains) {
+
+            my $url_id = pop(@{$domains{$domain}});
+            delete $domains{$domain} unless @{$domains{$domain}};
+
+            $multises->request(
+                'open-ils.url_verify.verify_url',
+                $auth, $attempt->id, $url_id);
+        }
+    }
+}
+
+
+__PACKAGE__->register_method(
+    method => 'verify_url',
+    api_name => 'open-ils.url_verify.verify_url',
+    stream => 1,
+    signature => {
+        desc => q/
+            Performs verification of a single URL.  When a redirect is detected,
+            a new URL is created to model the redirect and the redirected URL
+            is then tested, up to max-redirects or a loop is detected.
+        /,
+        params => [
+            {desc => 'Authentication token', type => 'string'},
+            {desc => 'Verification attempt ID (url_verify.verification_attempt.id)', type => 'number'},
+            {desc => 'URL id (url_verify.url.id)', type => 'number'},
+        ],
+        return => {desc => q/Stream of url_verification objects, one per URL tested/}
+    }
+);
+
+=head comment
+
+verification.res_code:
+
+999 bad hostname, etc.
(IO::Socket::Inet errors)
+998 in-flight errors (e.g. connection closed prematurely)
+997 timeout
+996 redirect loop
+995 max redirects
+
+verification.res_text:
+
+$@ or custom message "Redirect Loop"
+
+=cut
+
+sub verify_url {
+    my ($self, $client, $auth, $attempt_id, $url_id) = @_;
+    my %seen_urls; # full_url => url object; used to detect redirect loops
+
+    my $e = new_editor(authtoken => $auth);
+    return $e->event unless $e->checkauth;
+
+    my $url = $e->retrieve_url_verify_url($url_id) or return $e->event;
+
+    my ($attempt, $delay, $max_redirects, $timeout) =
+        collect_verify_attempt_and_settings($e, $attempt_id);
+
+    return $e->event unless $e->allowed(
+        'VERIFY_URL', $attempt->session->owning_lib);
+
+    my $cur_url = $url;
+    my $loop_detected = 0;
+    my $redir_count = 0;
+
+    # follow the URL through any chain of redirects, bailing out on a
+    # loop or once $max_redirects hops have been taken
+    while ($redir_count++ < $max_redirects) {
+
+        if ($seen_urls{$cur_url->full_url}) {
+            $loop_detected = 1;
+            last;
+        }
+
+        $seen_urls{$cur_url->full_url} = $cur_url;
+
+        my $url_resp = verify_one_url($e, $attempt, $cur_url, $timeout);
+
+        # something tragic happened
+        return $url_resp if $U->is_event($url_resp);
+
+        # flesh and respond to the caller
+        $url_resp->{verification}->url($cur_url);
+        $client->respond($url_resp->{verification});
+
+        # no redirect_url means the chain ends here
+        $cur_url = $url_resp->{redirect_url} or last;
+    }
+
+    if ($loop_detected or $redir_count > $max_redirects) {
+
+        # record a synthetic verification (9XX res_code) against the URL
+        # where the chain was abandoned
+        my $vcation = Fieldmapper::url_verify::url_verification->new;
+        $vcation->url($cur_url->id);
+        $vcation->attempt($attempt->id);
+        $vcation->req_time('now');
+
+        if ($loop_detected) {
+            $logger->info("url: redirect loop detected at " . $cur_url->full_url);
+            $vcation->res_code('996');
+            $vcation->res_text('Redirect Loop');
+
+        } else {
+            $logger->info("url: max redirects reached for source URL " . $url->full_url);
+            $vcation->res_code('995');
+            $vcation->res_text('Max Redirects');
+        }
+
+        $e->xact_begin;
+        $e->create_url_verify_url_verification($vcation) or return $e->die_event;
+        $e->xact_commit;
+    }
+
+    # The calling code is likely not multi-threaded, so a
+    # per-URL (i.e. per-thread) delay would not be possible.
+    # Applying the delay here allows the caller to process
+    # batches of URLs without having to worry about the delay.
+    sleep $delay;
+
+    return undef;
+}
+
+# temporarily cache some data to avoid a pile
+# of data lookups on every URL processed.
+# NOTE(review): cache is per-process and is flushed wholesale after ~20
+# calls; assumes settings/attempts rarely change mid-run -- confirm.
+my %cache;
+sub collect_verify_attempt_and_settings {
+    my ($e, $attempt_id) = @_;
+    my $attempt;
+
+    if (!(keys %cache) or $cache{age} > 20) { # configurable?
+        %cache = (
+            age => 0,
+            attempt => {},
+            delay => {},
+            redirects => {},
+            timeout => {},
+        );
+    }
+
+    if ( !($attempt = $cache{attempt}{$attempt_id}) ) {
+
+        # attempt may have just been created, so
+        # we need to guarantee a write-DB read.
+        $e->xact_begin;
+
+        $attempt =
+            $e->retrieve_url_verify_verification_attempt([
+                $attempt_id, {
+                    flesh => 1,
+                    flesh_fields => {uvva => ['session']}
+                }
+            ]) or return $e->die_event;
+
+        $e->rollback;
+
+        $cache{attempt}{$attempt_id} = $attempt;
+    }
+
+    my $org = $attempt->session->owning_lib;
+
+    if (!$cache{timeout}{$org}) {
+
+        $cache{delay}{$org} = $U->ou_ancestor_setting_value(
+            $org, 'url_verify.url_verification_delay', $e);
+
+        # 0 is a valid delay
+        $cache{delay}{$org} = 2 unless defined $cache{delay}{$org};
+
+        $cache{redirects}{$org} = $U->ou_ancestor_setting_value(
+            $org, 'url_verify.url_verification_max_redirects', $e) || 20;
+
+        $cache{timeout}{$org} = $U->ou_ancestor_setting_value(
+            $org, 'url_verify.url_verification_max_wait', $e) || 5;
+
+        $logger->info(
+            sprintf("url: loaded settings delay=%s; max_redirects=%s; timeout=%s",
+                $cache{delay}{$org}, $cache{redirects}{$org}, $cache{timeout}{$org}));
+    }
+
+    $cache{age}++;
+
+
+    return (
+        $cache{attempt}{$attempt_id},
+        $cache{delay}{$org},
+        $cache{redirects}{$org},
+        $cache{timeout}{$org}
+    );
+}
+
+
+# searches for a completed url_verification for any url processed
+# within this verification attempt whose full_url matches the
+# full_url of the provided URL.
+sub find_matching_url_for_attempt { + my ($e, $attempt, $url) = @_; + + my $match = $e->json_query({ + select => {uvuv => ['id']}, + from => { + uvuv => { + uvva => { # attempt + filter => {id => $attempt->id} + }, + uvu => {} # url + } + }, + where => { + '+uvu' => { + id => {'!=' => $url->id}, + full_url => $url->full_url + }, + + # There could be multiple verifications for matching URLs + # We only want a verification that completed. + # Note also that 2 identical URLs processed within the same + # sub-batch will have to each be fully processed in their own + # right, since neither knows how the other will ultimately fare. + '+uvuv' => { + res_time => {'!=' => undef} + } + } + })->[0]; + + return $e->retrieve_url_verify_url_verification($match->{id}) if $match; + return undef; +} + + +=head comment + +1. create the verification object and commit. +2. test the URL +3. update the verification object to capture the results of the test +4. Return redirect_url object if this is a redirect, otherwise undef. + +=cut + +sub verify_one_url { + my ($e, $attempt, $url, $timeout) = @_; + + my $url_text = $url->full_url; + my $redir_url; + + # first, create the verification object so we can a) indicate that + # we're working on this URL and b) get the DB to set the req_time. 
+ + my $vcation = Fieldmapper::url_verify::url_verification->new; + $vcation->url($url->id); + $vcation->attempt($attempt->id); + $vcation->req_time('now'); + + # begin phase-I DB communication + + $e->xact_begin; + + my $match_vcation = find_matching_url_for_attempt($e, $attempt, $url); + + if ($match_vcation) { + $logger->info("url: found matching URL in verification attempt [$url_text]"); + $vcation->res_code($match_vcation->res_code); + $vcation->res_text($match_vcation->res_text); + $vcation->redirect_to($match_vcation->redirect_to); + } + + $e->create_url_verify_url_verification($vcation) or return $e->die_event; + $e->xact_commit; + + # found a matching URL, no need to re-process + return {verification => $vcation} if $match_vcation; + + # End phase-I DB communication + # No active DB xact means no cstore timeout concerns. + + # Now test the URL. + + $ENV{FTP_PASSIVE} = 1; # TODO: setting? + + my $ua = LWP::UserAgent->new(ssl_opts => {verify_hostname => 0}); # TODO: verify_hostname setting? + $ua->timeout($timeout); + + my $req = HTTP::Request->new(HEAD => $url->full_url); + + # simple_request avoids LWP's auto-redirect magic + my $res = $ua->simple_request($req); + + $logger->info(sprintf( + "url: received HTTP '%s' / '%s' [%s]", + $res->code, + $res->message, + $url_text + )); + + $vcation->res_code($res->code); + $vcation->res_text($res->message); + + # is this a redirect? 
+ if ($res->code =~ /^3/) { + + if (my $loc = $res->headers->{location}) { + $redir_url = Fieldmapper::url_verify::url->new; + $redir_url->redirect_from($url->id); + $redir_url->full_url($loc); + + $logger->info("url: redirect found $url_text => $loc"); + + } else { + $logger->info("url: server returned 3XX but no 'Location' header for url $url_text"); + } + } + + # Begin phase-II DB communication + + $e->xact_begin; + + if ($redir_url) { + $redir_url = $e->create_url_verify_url($redir_url) or return $e->die_event; + $vcation->redirect_to($redir_url->id); + } + + $vcation->res_time('now'); + $e->update_url_verify_url_verification($vcation) or return $e->die_event; + $e->commit; + + return { + verification => $vcation, + redirect_url => $redir_url + }; +} + + +1; diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Fieldmapper.pm b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Fieldmapper.pm index 33050fd622..709ad5caff 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Fieldmapper.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Fieldmapper.pm @@ -1,7 +1,6 @@ package Fieldmapper; use OpenSRF::Utils::JSON; use Data::Dumper; -use base 'OpenSRF::Application'; use OpenSRF::Utils::Logger; use OpenSRF::Utils::SettingsClient; use OpenSRF::System; @@ -12,19 +11,6 @@ my $log = 'OpenSRF::Utils::Logger'; use vars qw/$fieldmap $VERSION/; -sub publish_fieldmapper { - my ($self,$client,$class) = @_; - - return $fieldmap unless (defined $class); - return undef unless (exists($$fieldmap{$class})); - return {$class => $$fieldmap{$class}}; -} -__PACKAGE__->register_method( - api_name => 'opensrf.open-ils.system.fieldmapper', - api_level => 1, - method => 'publish_fieldmapper', -); - # # To dump the Javascript version of the fieldmapper struct use the command: # diff --git a/Open-ILS/src/sql/Pg/075.schema.url_verify.sql b/Open-ILS/src/sql/Pg/075.schema.url_verify.sql index 753c769c94..db42861d4a 100644 --- a/Open-ILS/src/sql/Pg/075.schema.url_verify.sql +++ 
b/Open-ILS/src/sql/Pg/075.schema.url_verify.sql @@ -28,7 +28,7 @@ CREATE TABLE url_verify.session ( container INT NOT NULL REFERENCES container.biblio_record_entry_bucket (id) DEFERRABLE INITIALLY DEFERRED, create_time TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), search TEXT NOT NULL, - CONSTRAINT name_once_per_lib UNIQUE (name, owning_lib) + CONSTRAINT uvs_name_once_per_lib UNIQUE (name, owning_lib) ); CREATE TABLE url_verify.url_selector ( @@ -41,11 +41,11 @@ CREATE TABLE url_verify.url_selector ( CREATE TABLE url_verify.url ( id SERIAL PRIMARY KEY, redirect_from INT REFERENCES url_verify.url(id) DEFERRABLE INITIALLY DEFERRED, - item INT NOT NULL REFERENCES container.biblio_record_entry_bucket_item (id) DEFERRABLE INITIALLY DEFERRED, - url_selector INT NOT NULL REFERENCES url_verify.url_selector (id) DEFERRABLE INITIALLY DEFERRED, - tag TEXT NOT NULL, - subfield TEXT NOT NULL, - ord INT NOT NULL, -- ordinal position of this url within the record as found by url_selector, for later update + item INT REFERENCES container.biblio_record_entry_bucket_item (id) DEFERRABLE INITIALLY DEFERRED, + url_selector INT REFERENCES url_verify.url_selector (id) DEFERRABLE INITIALLY DEFERRED, + tag TEXT, + subfield TEXT, + ord INT, -- ordinal position of this url within the record as found by url_selector, for later update full_url TEXT NOT NULL, scheme TEXT, username TEXT, @@ -83,7 +83,7 @@ CREATE TABLE url_verify.url_verification ( attempt INT NOT NULL REFERENCES url_verify.verification_attempt (id) DEFERRABLE INITIALLY DEFERRED, req_time TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), res_time TIMESTAMP WITH TIME ZONE, - res_code INT CHECK (res_code BETWEEN 100 AND 599), + res_code INT CHECK (res_code BETWEEN 100 AND 999), -- we know > 599 will never be valid HTTP code, but we use 9XX for other stuff res_text TEXT, redirect_to INT REFERENCES url_verify.url (id) DEFERRABLE INITIALLY DEFERRED -- if redirected ); @@ -95,7 +95,7 @@ CREATE TABLE url_verify.filter_set ( creator 
INT NOT NULL REFERENCES actor.usr (id) DEFERRABLE INITIALLY DEFERRED, create_time TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), filter TEXT NOT NULL, - CONSTRAINT name_once_per_lib UNIQUE (name, owning_lib) + CONSTRAINT uvfs_name_once_per_lib UNIQUE (name, owning_lib) ); COMMIT; diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql index 88051810eb..fe4952cff0 100644 --- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql +++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql @@ -1573,7 +1573,11 @@ INSERT INTO permission.perm_list ( id, code, description ) VALUES ( 541, 'ADMIN_TOOLBAR_FOR_WORKSTATION', oils_i18n_gettext( 541, 'Allows a user to create, edit, and delete custom toolbars for workstations', 'ppl', 'description')), ( 542, 'ADMIN_TOOLBAR_FOR_USER', oils_i18n_gettext( 542, - 'Allows a user to create, edit, and delete custom toolbars for users', 'ppl', 'description')) + 'Allows a user to create, edit, and delete custom toolbars for users', 'ppl', 'description')), + ( 543, 'URL_VERIFY', oils_i18n_gettext( 543, + 'Allows a user to process and verify ULSs', 'ppl', 'description')), + ( 544, 'URL_VERIFY_UPDATE_SETTINGS', oils_i18n_gettext( 544, + 'Allows a user to configure URL verification org unit settings', 'ppl', 'description')) ; @@ -12004,3 +12008,95 @@ INSERT INTO config.org_unit_setting_type ), 'integer' ); + +INSERT INTO config.settings_group (name, label) + VALUES ( + 'url_verify', + oils_i18n_gettext( + 'url_verify', + 'URL Verify', + 'csg', + 'label' + ) + ); + +INSERT INTO config.org_unit_setting_type + (name, grp, label, description, datatype, update_perm) + VALUES ( + 'url_verify.url_verification_delay', + 'url_verify', + oils_i18n_gettext( + 'url_verify.url_verification_delay', + 'Number of seconds to wait between URL test attempts.', + 'coust', + 'label' + ), + oils_i18n_gettext( + 'url_verify.url_verification_delay', + 'Throttling mechanism for batch URL verification runs. 
Each running process will wait this number of seconds after a URL test before performing the next.',
+        'coust',
+        'description'
+    ),
+    'integer',
+    544
+);
+
+INSERT INTO config.org_unit_setting_type
+    (name, grp, label, description, datatype, update_perm)
+    VALUES (
+        'url_verify.url_verification_max_redirects',
+        'url_verify',
+        oils_i18n_gettext(
+            'url_verify.url_verification_max_redirects',
+            'Maximum redirect lookups',
+            'coust',
+            'label'
+        ),
+        oils_i18n_gettext(
+            'url_verify.url_verification_max_redirects',
+            'For URLs returning 3XX redirects, this is the maximum number of redirects we will follow before giving up.',
+            'coust',
+            'description'
+        ),
+        'integer',
+        544
+    );
+
+INSERT INTO config.org_unit_setting_type
+    (name, grp, label, description, datatype, update_perm)
+    VALUES (
+        'url_verify.url_verification_max_wait',
+        'url_verify',
+        oils_i18n_gettext(
+            'url_verify.url_verification_max_wait',
+            'Maximum wait time (in seconds) for a URL to lookup',
+            'coust',
+            'label'
+        ),
+        oils_i18n_gettext(
+            'url_verify.url_verification_max_wait',
+            'If we exceed the wait time, the URL is marked as a "timeout" and the system moves on to the next URL',
+            'coust',
+            'description'
+        ),
+        'integer',
+        544
+    );
+
+
+INSERT INTO config.org_unit_setting_type
+    (name, grp, label, description, datatype, update_perm)
+    VALUES (
+        'url_verify.verification_batch_size',
+        'url_verify',
+        oils_i18n_gettext(
+            'url_verify.verification_batch_size',
+            'Number of URLs to test in parallel',
+            'coust',
+            'label'
+        ),
+        oils_i18n_gettext(
+            'url_verify.verification_batch_size',
+            'URLs are tested in batches.
This number defines the size of each batch and it directly relates to the number of back-end processes performing URL verification.', + 'coust', + 'description' + ), + 'integer', + 544 + ); diff --git a/Open-ILS/src/sql/Pg/sql_file_manifest b/Open-ILS/src/sql/Pg/sql_file_manifest index 70d7dcda96..459b604174 100644 --- a/Open-ILS/src/sql/Pg/sql_file_manifest +++ b/Open-ILS/src/sql/Pg/sql_file_manifest @@ -24,6 +24,8 @@ FTS_CONFIG_FILE 030.schema.metabib.sql 040.schema.asset.sql 070.schema.container.sql +075.schema.url_verify.sql +076.functions.url_verify.sql 080.schema.money.sql 090.schema.action.sql 095.schema.booking.sql diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.url_verify.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.url_verify.sql index 7850b495ec..e001f9f7e4 100644 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.url_verify.sql +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.url_verify.sql @@ -10,7 +10,7 @@ CREATE TABLE url_verify.session ( container INT NOT NULL REFERENCES container.biblio_record_entry_bucket (id) DEFERRABLE INITIALLY DEFERRED, create_time TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), search TEXT NOT NULL, - CONSTRAINT name_once_per_lib UNIQUE (name, owning_lib) + CONSTRAINT uvs_name_once_per_lib UNIQUE (name, owning_lib) ); CREATE TABLE url_verify.url_selector ( @@ -23,11 +23,11 @@ CREATE TABLE url_verify.url_selector ( CREATE TABLE url_verify.url ( id SERIAL PRIMARY KEY, redirect_from INT REFERENCES url_verify.url(id) DEFERRABLE INITIALLY DEFERRED, - item INT NOT NULL REFERENCES container.biblio_record_entry_bucket_item (id) DEFERRABLE INITIALLY DEFERRED, - url_selector INT NOT NULL REFERENCES url_verify.url_selector (id) DEFERRABLE INITIALLY DEFERRED, - tag TEXT NOT NULL, - subfield TEXT NOT NULL, - ord INT NOT NULL, -- ordinal position of this url within the record as found by url_selector, for later update + item INT REFERENCES container.biblio_record_entry_bucket_item (id) DEFERRABLE INITIALLY DEFERRED, + url_selector INT 
REFERENCES url_verify.url_selector (id) DEFERRABLE INITIALLY DEFERRED, + tag TEXT, + subfield TEXT, + ord INT, full_url TEXT NOT NULL, scheme TEXT, username TEXT, @@ -65,7 +65,7 @@ CREATE TABLE url_verify.url_verification ( attempt INT NOT NULL REFERENCES url_verify.verification_attempt (id) DEFERRABLE INITIALLY DEFERRED, req_time TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), res_time TIMESTAMP WITH TIME ZONE, - res_code INT CHECK (res_code BETWEEN 100 AND 599), + res_code INT CHECK (res_code BETWEEN 100 AND 999), -- we know > 599 will never be valid HTTP code, but we use 9XX for other stuff res_text TEXT, redirect_to INT REFERENCES url_verify.url (id) DEFERRABLE INITIALLY DEFERRED -- if redirected ); @@ -77,6 +77,6 @@ CREATE TABLE url_verify.filter_set ( creator INT NOT NULL REFERENCES actor.usr (id) DEFERRABLE INITIALLY DEFERRED, create_time TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), filter TEXT NOT NULL, - CONSTRAINT name_once_per_lib UNIQUE (name, owning_lib) + CONSTRAINT uvfs_name_once_per_lib UNIQUE (name, owning_lib) ); diff --git a/Open-ILS/src/sql/Pg/upgrade/YYYY.functions.url_verify.sql b/Open-ILS/src/sql/Pg/upgrade/YYYY.functions.url_verify.sql index 2087794714..c63c2d9756 100644 --- a/Open-ILS/src/sql/Pg/upgrade/YYYY.functions.url_verify.sql +++ b/Open-ILS/src/sql/Pg/upgrade/YYYY.functions.url_verify.sql @@ -18,5 +18,32 @@ return \%parts; $$ LANGUAGE PLPERLU; +CREATE OR REPLACE FUNCTION url_verify.ingest_url () RETURNS TRIGGER AS $$ +DECLARE + tmp_row url_verify.url%ROWTYPE; +BEGIN + SELECT * INTO tmp_row FROM url_verify.parse_url(NEW.full_url); + + NEW.scheme := tmp_row.scheme; + NEW.username := tmp_row.username; + NEW.password := tmp_row.password; + NEW.host := tmp_row.host; + NEW.domain := tmp_row.domain; + NEW.tld := tmp_row.tld; + NEW.port := tmp_row.port; + NEW.path := tmp_row.path; + NEW.page := tmp_row.page; + NEW.query := tmp_row.query; + NEW.fragment := tmp_row.fragment; + + RETURN NEW; +END; +$$ LANGUAGE PLPGSQL; + +CREATE TRIGGER 
ingest_url_tgr
+    BEFORE INSERT ON url_verify.url
+    FOR EACH ROW EXECUTE PROCEDURE url_verify.ingest_url();
+
+
 COMMIT;
diff --git a/Open-ILS/src/sql/Pg/upgrade/ZZZZ.data.url_verify.sql b/Open-ILS/src/sql/Pg/upgrade/ZZZZ.data.url_verify.sql
new file mode 100644
index 0000000000..f0f98469eb
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/ZZZZ.data.url_verify.sql
@@ -0,0 +1,131 @@
+
+-- NOTE: beware the use of bare perm IDs in the update_perm's below and in
+-- the 950 seed data file. Update before merge to match current perm IDs! XXX
+
+BEGIN;
+
+INSERT INTO permission.perm_list (id, code, description)
+    VALUES (
+        543,
+        'URL_VERIFY',
+        oils_i18n_gettext(
+            543,
+            'Allows a user to process and verify URLs',
+            'ppl',
+            'description'
+        )
+    );
+
+
+-- Fixed: the code column previously received the bare integer 544
+-- instead of the permission code string.
+INSERT INTO permission.perm_list (id, code, description)
+    VALUES (
+        544,
+        'URL_VERIFY_UPDATE_SETTINGS',
+        oils_i18n_gettext(
+            544,
+            'Allows a user to configure URL verification org unit settings',
+            'ppl',
+            'description'
+        )
+    );
+
+
+INSERT INTO config.settings_group (name, label)
+    VALUES (
+        'url_verify',
+        oils_i18n_gettext(
+            'url_verify',
+            'URL Verify',
+            'csg',
+            'label'
+        )
+    );
+
+INSERT INTO config.org_unit_setting_type
+    (name, grp, label, description, datatype, update_perm)
+    VALUES (
+        'url_verify.url_verification_delay',
+        'url_verify',
+        oils_i18n_gettext(
+            'url_verify.url_verification_delay',
+            'Number of seconds to wait between URL test attempts.',
+            'coust',
+            'label'
+        ),
+        oils_i18n_gettext(
+            'url_verify.url_verification_delay',
+            'Throttling mechanism for batch URL verification runs. Each running process will wait this number of seconds after a URL test before performing the next.',
+            'coust',
+            'description'
+        ),
+        'integer',
+        544
+    );
+
+INSERT INTO config.org_unit_setting_type
+    (name, grp, label, description, datatype, update_perm)
+    VALUES (
+        'url_verify.url_verification_max_redirects',
+        'url_verify',
+        oils_i18n_gettext(
+            'url_verify.url_verification_max_redirects',
+            'Maximum redirect lookups',
+            'coust',
+            'label'
+        ),
+        oils_i18n_gettext(
+            'url_verify.url_verification_max_redirects',
+            'For URLs returning 3XX redirects, this is the maximum number of redirects we will follow before giving up.',
+            'coust',
+            'description'
+        ),
+        'integer',
+        544
+    );
+
+INSERT INTO config.org_unit_setting_type
+    (name, grp, label, description, datatype, update_perm)
+    VALUES (
+        'url_verify.url_verification_max_wait',
+        'url_verify',
+        oils_i18n_gettext(
+            'url_verify.url_verification_max_wait',
+            'Maximum wait time (in seconds) for a URL to lookup',
+            'coust',
+            'label'
+        ),
+        oils_i18n_gettext(
+            'url_verify.url_verification_max_wait',
+            'If we exceed the wait time, the URL is marked as a "timeout" and the system moves on to the next URL',
+            'coust',
+            'description'
+        ),
+        'integer',
+        544
+    );
+
+
+INSERT INTO config.org_unit_setting_type
+    (name, grp, label, description, datatype, update_perm)
+    VALUES (
+        'url_verify.verification_batch_size',
+        'url_verify',
+        oils_i18n_gettext(
+            'url_verify.verification_batch_size',
+            'Number of URLs to test in parallel',
+            'coust',
+            'label'
+        ),
+        oils_i18n_gettext(
+            'url_verify.verification_batch_size',
+            'URLs are tested in batches. This number defines the size of each batch and it directly relates to the number of back-end processes performing URL verification.',
+            'coust',
+            'description'
+        ),
+        'integer',
+        544
+    );
+
+
+COMMIT;
--
2.43.2