From 28d87c24928344307fc68338aa01ca2c1d97173a Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Tue, 16 Feb 2016 12:59:55 -0500 Subject: [PATCH] LP 1768715: pingest supports max/min ID, duration, more ops From the new help text: --batch-size Number of records to process per batch --max-child Max number of worker processes --skip-browse --skip-attrs --skip-search --skip-facets Skip the selected reingest component --start-id Start processing at this record ID. --end-id Stop processing when this record ID is reached --max-duration Stop processing after this many total seconds have passed. --help Show this help text. Signed-off-by: Bill Erickson Signed-off-by: Jason Stephenson --- Open-ILS/src/support-scripts/pingest.pl | 147 +++++++++++++++++------- 1 file changed, 104 insertions(+), 43 deletions(-) diff --git a/Open-ILS/src/support-scripts/pingest.pl b/Open-ILS/src/support-scripts/pingest.pl index 28cb031b23..ed9f45e950 100755 --- a/Open-ILS/src/support-scripts/pingest.pl +++ b/Open-ILS/src/support-scripts/pingest.pl @@ -18,49 +18,89 @@ use strict; use warnings; use DBI; +use Getopt::Long; + +# Globals for the command line options: -- # You will want to adjust the next two based on your database size, # i.e. number of bib records as well as the number of cores on your # database server. Using roughly number of cores/2 doesn't seem to # have much impact in off peak times. -use constant { - BATCHSIZE => 10000, - MAXCHILD => 8 -}; - -# Globals for the command line options: -my $do_browse = 1; # Do the browse reingest. -my $do_attrs = 1; # Do the record attributes reingest. -my $do_search = 1; # Do the search reingest. -my $do_facets = 1; # Do the facets reingest. - -# Command line options to skip different reingests. In this case, we -# use the '-' to indicate a minus or a no, so to -# skip browse reingest: -browse or -b -# skip attribute reingest: -attributes or -a -# skip search reingest: -search or -s -# skip facet reingest: -facets or -f -foreach (@ARGV) { - if (/^-b(?:rowse)?$/) { - $do_browse = 0; - } elsif (/^-a(?:ttr(?:ibute)?s?)?$/) { - $do_attrs = 0; - } elsif (/^-s(?:earch)?$/) { - $do_search = 0; - } elsif (/^-f(?:acets?)?$/) { - $do_facets = 0; - } else { - # TODO: Add usage() function to report allowed options. - die ("Unrecognized option: $_"); - } +my $batch_size = 10000; # records processed per batch +my $max_child = 8; # max number of parallel worker processes + +my $skip_browse; # Skip the browse reingest. +my $skip_attrs; # Skip the record attributes reingest. +my $skip_search; # Skip the search reingest. +my $skip_facets; # Skip the facets reingest. +my $start_id; # start processing at this bib ID. +my $end_id; # stop processing when this bib ID is reached. +my $max_duration; # max processing duration in seconds +my $help; # show help text + +GetOptions( + 'batch-size=i' => \$batch_size, + 'max-child=i' => \$max_child, + 'skip-browse' => \$skip_browse, + 'skip-attrs' => \$skip_attrs, + 'skip-search' => \$skip_search, + 'skip-facets' => \$skip_facets, + 'start-id=i' => \$start_id, + 'end-id=i' => \$end_id, + 'max-duration=i' => \$max_duration, + 'help' => \$help +); + +sub help { + print < 0 +$where ORDER BY id ASC END_OF_Q @@ -72,6 +112,13 @@ my @lol = (); # To do the browse-only ingest: my @blist = (); +my $start_epoch = time; + +sub duration_expired { + return 1 if $max_duration && (time - $start_epoch) >= $max_duration; + return 0; +} + # All of the DBI->connect() calls in this file assume that you have # configured the PGHOST, PGPORT, PGDATABASE, PGUSER, and PGPASSWORD # variables in your execution environment. If you have not, you have @@ -88,7 +135,7 @@ foreach my $r (@$results) { my $record = $r->[0]; push(@blist, $record); # separate list of browse-only ingest push(@$records, $record); - if (++$count == BATCHSIZE) { + if (++$count == $batch_size) { $lol[$lists++] = $records; $count = 0; $records = []; @@ -106,17 +153,20 @@ $count = 0; my @running = (); # We start the browse-only ingest before starting the other ingests. -browse_ingest(@blist) if ($do_browse); +browse_ingest(@blist) unless ($skip_browse); -# We loop until we have processed all of the batches stored in @lol: +# We loop until we have processed all of the batches stored in @lol +# or the maximum processing duration has been reached. while ($count < $lists) { - if (scalar(@lol) && scalar(@running) < MAXCHILD) { + my $duration_expired = duration_expired(); + + if (scalar(@lol) && scalar(@running) < $max_child && !$duration_expired) { # Reuse $records for the lulz. $records = shift(@lol); - if ($do_search || $do_facets || $do_attrs) { - reingest($records); - } else { + if ($skip_search && $skip_facets && $skip_attrs) { $count++; + } else { + reingest($records); } } else { my $pid = wait(); @@ -126,6 +176,11 @@ while ($count < $lists) { print "$count of $lists processed\n"; } } + + if ($duration_expired && scalar(@running) == 0) { + warn "Exiting on max_duration ($max_duration)\n"; + exit(0); + } } # This subroutine forks a process to do the browse-only ingest on the @@ -151,6 +206,11 @@ sub browse_ingest { } else { warn ("Browse ingest failed for record $_"); } + if (duration_expired()) { + warn "browse_ingest() stopping on record $_ ". + "after max duration reached\n"; + last; + } } $dbh->disconnect(); exit(0); @@ -168,8 +228,9 @@ sub reingest { push(@running, $pid); } elsif ($pid == 0) { my $dbh = DBI->connect('DBI:Pg:'); - reingest_attributes($dbh, $list) if ($do_attrs); - reingest_field_entries($dbh, $list) if ($do_facets || $do_search); + reingest_attributes($dbh, $list) unless ($skip_attrs); + reingest_field_entries($dbh, $list) + unless ($skip_facets && $skip_search); $dbh->disconnect(); exit(0); } @@ -181,8 +242,8 @@ sub reingest_field_entries { my $list = shift; my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, ?, TRUE, ?)"); # Because reingest uses "skip" options we invert the logic of do variables. - $sth->bind_param(2, ($do_facets) ? 0 : 1); - $sth->bind_param(3, ($do_search) ? 0 : 1); + $sth->bind_param(2, ($skip_facets) ? 1 : 0); + $sth->bind_param(3, ($skip_search) ? 1 : 0); foreach (@$list) { $sth->bind_param(1, $_); if ($sth->execute()) { -- 2.43.2