Open-ILS/src/support-scripts/pingest.pl

   1 #!/usr/bin/perl
   2 # ---------------------------------------------------------------
   3 # Copyright © 2013,2014 Merrimack Valley Library Consortium
   4 # Jason Stephenson <jstephenson@mvlc.org>
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 # ---------------------------------------------------------------
  16 # TODO: Document with POD.
  17 # This guy parallelizes a reingest.
  18 use strict;
  19 use warnings;
  20 use DBI;
  21 use Getopt::Long;
  22
  23 # Globals for the command line options: --
  24
  25 # You will want to adjust the next two based on your database size,
  26 # i.e. number of bib records as well as the number of cores on your
  27 # database server.  Using roughly number of cores/2 doesn't seem to
  28 # have much impact in off peak times.
  29 my $batch_size = 10000; # records processed per batch
  30 my $max_child  = 8;     # max number of parallel worker processes
  31
  32 my $skip_browse;  # Skip the browse reingest.
  33 my $skip_attrs;   # Skip the record attributes reingest.
  34 my $skip_search;  # Skip the search reingest.
  35 my $skip_facets;  # Skip the facets reingest.
  36 my $start_id;     # start processing at this bib ID.
  37 my $end_id;       # stop processing when this bib ID is reached.
  38 my $max_duration; # max processing duration in seconds
  39 my $help;         # show help text
  40 my $opt_pipe;     # Read record ids from STDIN.
  41
  42 GetOptions(
  43     'batch-size=i'   => \$batch_size,
  44     'max-child=i'    => \$max_child,
  45     'skip-browse'    => \$skip_browse,
  46     'skip-attrs'     => \$skip_attrs,
  47     'skip-search'    => \$skip_search,
  48     'skip-facets'    => \$skip_facets,
  49     'start-id=i'     => \$start_id,
  50     'end-id=i'       => \$end_id,
  51     'pipe'           => \$opt_pipe,
  52     'max-duration=i' => \$max_duration,
  53     'help'           => \$help
  54 );
  55
  56 sub help {
  57     print <<HELP;
  58
  59     $0 --batch-size $batch_size --max-child $max_child \
  60         --start-id 1 --end-id 500000 --duration 14400
  61
  62     --batch-size
  63         Number of records to process per batch
  64
  65     --max-child
  66         Max number of worker processes
  67
  68     --skip-browse
  69     --skip-attrs
  70     --skip-search
  71     --skip-facets
  72         Skip the selected reingest component
  73
  74     --start-id
  75         Start processing at this record ID.
  76
  77     --end-id
  78         Stop processing when this record ID is reached
  79
  80     --pipe
  81         Read record IDs to reingest from standard input.
  82         This option conflicts with --start-id and/or --end-id.
  83
  84     --max-duration
  85         Stop processing after this many total seconds have passed.
  86
  87     --help
  88         Show this help text.
  89
  90 HELP
  91     exit;
  92 }
  93
  94 help() if $help;
  95
  96 # Check for mutually exclusive options:
  97 if ($opt_pipe && ($start_id || $end_id)) {
  98     warn('Mutually exclusive options');
  99     help();
 100 }
 101
 102 my $where = "WHERE deleted = 'f'";
 103 if ($start_id && $end_id) {
 104     $where .= " AND id BETWEEN $start_id AND $end_id";
 105 } elsif ($start_id) {
 106     $where .= " AND id >= $start_id";
 107 } elsif ($end_id) {
 108     $where .= " AND id <= $end_id";
 109 }
 110
 111 # "Gimme the keys!  I'll drive!"
 112 my $q = <<END_OF_Q;
 113 SELECT id
 114 FROM biblio.record_entry
 115 $where
 116 ORDER BY id ASC
 117 END_OF_Q
 118
 119 # Stuffs needed for looping, tracking how many lists of records we
 120 # have, storing the actual list of records, and the list of the lists
 121 # of records.
 122 my ($count, $lists, $records) = (0,0,[]);
 123 my @lol = ();
 124 # To do the browse-only ingest:
 125 my @blist = ();
 126
 127 my $start_epoch = time;
 128
 129 sub duration_expired {
 130     return 1 if $max_duration && (time - $start_epoch) >= $max_duration;
 131     return 0;
 132 }
 133
 134 # All of the DBI->connect() calls in this file assume that you have
 135 # configured the PGHOST, PGPORT, PGDATABASE, PGUSER, and PGPASSWORD
 136 # variables in your execution environment.  If you have not, you have
 137 # two options:
 138 #
 139 # 1) configure them
 140 #
 141 # 2) edit the DBI->connect() calls in this program so that it can
 142 # connect to your database.
 143
 144 # Get the input records from either standard input or the database.
 145 my @input;
 146 if ($opt_pipe) {
 147     while (<STDIN>) {
 148         # Want only numbers, one per line.
 149         if ($_ =~ /([0-9]+)/) {
 150             push(@input, $1);
 151         }
 152     }
 153 } else {
 154     my $dbh = DBI->connect('DBI:Pg:');
 155     @input = @{$dbh->selectcol_arrayref($q)};
 156     $dbh->disconnect();
 157 }
 158
 159 foreach my $record (@input) {
 160     push(@blist, $record); # separate list of browse-only ingest
 161     push(@$records, $record);
 162     if (++$count == $batch_size) {
 163         $lol[$lists++] = $records;
 164         $count = 0;
 165         $records = [];
 166     }
 167 }
 168 $lol[$lists++] = $records if ($count); # Last batch is likely to be
 169                                        # small.
 170
 171 # We're going to reuse $count to keep track of the total number of
 172 # batches processed.
 173 $count = 0;
 174
 175 # @running keeps track of the running child processes.
 176 my @running = ();
 177
 178 # We start the browse-only ingest before starting the other ingests.
 179 browse_ingest(@blist) unless ($skip_browse);
 180
 181 # We loop until we have processed all of the batches stored in @lol
 182 # or the maximum processing duration has been reached.
 183 while ($count < $lists) {
 184     my $duration_expired = duration_expired();
 185
 186     if (scalar(@lol) && scalar(@running) < $max_child && !$duration_expired) {
 187         # Reuse $records for the lulz.
 188         $records = shift(@lol);
 189         if ($skip_search && $skip_facets && $skip_attrs) {
 190             $count++;
 191         } else {
 192             reingest($records);
 193         }
 194     } else {
 195         my $pid = wait();
 196         if (grep {$_ == $pid} @running) {
 197             @running = grep {$_ != $pid} @running;
 198             $count++;
 199             print "$count of $lists processed\n";
 200         }
 201     }
 202
 203     if ($duration_expired && scalar(@running) == 0) {
 204         warn "Exiting on max_duration ($max_duration)\n";
 205         exit(0);
 206     }
 207 }
 208
 209 # This subroutine forks a process to do the browse-only ingest on the
 210 # @blist above.  It cannot be parallelized, but can run in parrallel
 211 # to the other ingests.
 212 sub browse_ingest {
 213     my @list = @_;
 214     my $pid = fork();
 215     if (!defined($pid)) {
 216         die "failed to spawn child";
 217     } elsif ($pid > 0) {
 218         # Add our browser to the list of running children.
 219         push(@running, $pid);
 220         # Increment the number of lists, because this list was not
 221         # previously counted.
 222         $lists++;
 223     } elsif ($pid == 0) {
 224         my $dbh = DBI->connect('DBI:Pg:');
 225         my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, TRUE, FALSE, TRUE)");
 226         foreach (@list) {
 227             if ($sth->execute($_)) {
 228                 my $crap = $sth->fetchall_arrayref();
 229             } else {
 230                 warn ("Browse ingest failed for record $_");
 231             }
 232             if (duration_expired()) {
 233                 warn "browse_ingest() stopping on record $_ ".
 234                     "after max duration reached\n";
 235                 last;
 236             }
 237         }
 238         $dbh->disconnect();
 239         exit(0);
 240     }
 241 }
 242
 243 # Fork a child to do the other reingests:
 244
 245 sub reingest {
 246     my $list = shift;
 247     my $pid = fork();
 248     if (!defined($pid)) {
 249         die "Failed to spawn a child";
 250     } elsif ($pid > 0) {
 251         push(@running, $pid);
 252     } elsif ($pid == 0) {
 253         my $dbh = DBI->connect('DBI:Pg:');
 254         reingest_attributes($dbh, $list) unless ($skip_attrs);
 255         reingest_field_entries($dbh, $list)
 256             unless ($skip_facets && $skip_search);
 257         $dbh->disconnect();
 258         exit(0);
 259     }
 260 }
 261
 262 # Reingest metabib field entries on a list of records.
 263 sub reingest_field_entries {
 264     my $dbh = shift;
 265     my $list = shift;
 266     my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, ?, TRUE, ?)");
 267     # Because reingest uses "skip" options we invert the logic of do variables.
 268     $sth->bind_param(2, ($skip_facets) ? 1 : 0);
 269     $sth->bind_param(3, ($skip_search) ? 1 : 0);
 270     foreach (@$list) {
 271         $sth->bind_param(1, $_);
 272         if ($sth->execute()) {
 273             my $crap = $sth->fetchall_arrayref();
 274         } else {
 275             warn ("metabib.reingest_metabib_field_entries failed for record $_");
 276         }
 277     }
 278 }
 279
 280 # Reingest record attributes on a list of records.
 281 sub reingest_attributes {
 282     my $dbh = shift;
 283     my $list = shift;
 284     my $sth = $dbh->prepare(<<END_OF_INGEST
 285 SELECT metabib.reingest_record_attributes(id, NULL::TEXT[], marc)
 286 FROM biblio.record_entry
 287 WHERE id = ?
 288 END_OF_INGEST
 289     );
 290     foreach (@$list) {
 291         $sth->bind_param(1, $_);
 292         if ($sth->execute()) {
 293             my $crap = $sth->fetchall_arrayref();
 294         } else {
 295             warn ("metabib.reingest_record_attributes failed for record $_");
 296         }
 297     }
 298 }