2 # ---------------------------------------------------------------
3 # Copyright © 2013,2014 Merrimack Valley Library Consortium
4 # Jason Stephenson <jstephenson@mvlc.org>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 # ---------------------------------------------------------------
16 # TODO: Document with POD.
17 # This guy parallelizes a reingest.
23 # Globals for the command line options: --
25 # You will want to adjust the next two based on your database size,
26 # i.e. number of bib records as well as the number of cores on your
27 # database server. Using roughly number of cores/2 doesn't seem to
28 # have much impact in off peak times.
29 my $batch_size = 10000; # records processed per batch
30 my $max_child = 8; # max number of parallel worker processes
32 my $skip_browse; # Skip the browse reingest.
33 my $skip_attrs; # Skip the record attributes reingest.
34 my $skip_search; # Skip the search reingest.
35 my $skip_facets; # Skip the facets reingest.
36 my $start_id; # start processing at this bib ID.
37 my $end_id; # stop processing when this bib ID is reached.
38 my $max_duration; # max processing duration in seconds
39 my $help; # show help text
42 'batch-size=i' => \$batch_size,
43 'max-child=i' => \$max_child,
44 'skip-browse' => \$skip_browse,
45 'skip-attrs' => \$skip_attrs,
46 'skip-search' => \$skip_search,
47 'skip-facets' => \$skip_facets,
48 'start-id=i' => \$start_id,
49 'end-id=i' => \$end_id,
50 'max-duration=i' => \$max_duration,
57 $0 --batch-size $batch_size --max-child $max_child \
58 --start-id 1 --end-id 500000 --max-duration 14400
61 Number of records to process per batch
64 Max number of worker processes
70 Skip the selected reingest component
73 Start processing at this record ID.
76 Stop processing when this record ID is reached
79 Stop processing after this many total seconds have passed.
90 my $where = "WHERE deleted = 'f'";
91 if ($start_id && $end_id) {
92 $where .= " AND id BETWEEN $start_id AND $end_id";
94 $where .= " AND id >= $start_id";
96 $where .= " AND id <= $end_id";
99 # "Gimme the keys! I'll drive!"
102 FROM biblio.record_entry
107 # Stuffs needed for looping, tracking how many lists of records we
108 # have, storing the actual list of records, and the list of the lists
110 my ($count, $lists, $records) = (0,0,[]);
112 # To do the browse-only ingest:
115 my $start_epoch = time;
# Return 1 when a --max-duration limit is configured and at least that
# many seconds have elapsed since $start_epoch (recorded at startup);
# return 0 otherwise.  Callers use this to stop dispatching new batches
# once the time budget is spent.
#
# Reads file-scoped globals: $max_duration (seconds, 0/undef = no limit)
# and $start_epoch (epoch seconds at script start).
sub duration_expired {
    return 1 if $max_duration && (time - $start_epoch) >= $max_duration;
    # Explicit false return: without it the sub would fall off the end
    # and return the incidental value of its last evaluated expression.
    return 0;
}
122 # All of the DBI->connect() calls in this file assume that you have
123 # configured the PGHOST, PGPORT, PGDATABASE, PGUSER, and PGPASSWORD
124 # variables in your execution environment. If you have not, you have
129 # 2) edit the DBI->connect() calls in this program so that it can
130 # connect to your database.
131 my $dbh = DBI->connect('DBI:Pg:');
133 my $results = $dbh->selectall_arrayref($q);
134 foreach my $r (@$results) {
135 my $record = $r->[0];
136 push(@blist, $record); # separate list of browse-only ingest
137 push(@$records, $record);
138 if (++$count == $batch_size) {
139 $lol[$lists++] = $records;
144 $lol[$lists++] = $records if ($count); # Last batch is likely to be
148 # We're going to reuse $count to keep track of the total number of
152 # @running keeps track of the running child processes.
155 # We start the browse-only ingest before starting the other ingests.
156 browse_ingest(@blist) unless ($skip_browse);
158 # We loop until we have processed all of the batches stored in @lol
159 # or the maximum processing duration has been reached.
160 while ($count < $lists) {
161 my $duration_expired = duration_expired();
163 if (scalar(@lol) && scalar(@running) < $max_child && !$duration_expired) {
164 # Reuse $records for the lulz.
165 $records = shift(@lol);
166 if ($skip_search && $skip_facets && $skip_attrs) {
173 if (grep {$_ == $pid} @running) {
174 @running = grep {$_ != $pid} @running;
176 print "$count of $lists processed\n";
180 if ($duration_expired && scalar(@running) == 0) {
181 warn "Exiting on max_duration ($max_duration)\n";
186 # This subroutine forks a process to do the browse-only ingest on the
187 # @blist above. It cannot be parallelized, but can run in parallel
188 # to the other ingests.
192 if (!defined($pid)) {
193 die "failed to spawn child";
195 # Add our browser to the list of running children.
196 push(@running, $pid);
197 # Increment the number of lists, because this list was not
198 # previously counted.
200 } elsif ($pid == 0) {
201 my $dbh = DBI->connect('DBI:Pg:');
202 my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, TRUE, FALSE, TRUE)");
204 if ($sth->execute($_)) {
205 my $crap = $sth->fetchall_arrayref();
207 warn ("Browse ingest failed for record $_");
209 if (duration_expired()) {
210 warn "browse_ingest() stopping on record $_ ".
211 "after max duration reached\n";
220 # Fork a child to do the other reingests:
225 if (!defined($pid)) {
226 die "Failed to spawn a child";
228 push(@running, $pid);
229 } elsif ($pid == 0) {
230 my $dbh = DBI->connect('DBI:Pg:');
231 reingest_attributes($dbh, $list) unless ($skip_attrs);
232 reingest_field_entries($dbh, $list)
233 unless ($skip_facets && $skip_search);
239 # Reingest metabib field entries on a list of records.
240 sub reingest_field_entries {
243 my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, ?, TRUE, ?)");
244 # Because reingest uses "skip" options we invert the logic of do variables.
245 $sth->bind_param(2, ($skip_facets) ? 1 : 0);
246 $sth->bind_param(3, ($skip_search) ? 1 : 0);
248 $sth->bind_param(1, $_);
249 if ($sth->execute()) {
250 my $crap = $sth->fetchall_arrayref();
252 warn ("metabib.reingest_metabib_field_entries failed for record $_");
257 # Reingest record attributes on a list of records.
258 sub reingest_attributes {
261 my $sth = $dbh->prepare(<<END_OF_INGEST
262 SELECT metabib.reingest_record_attributes(id, NULL::TEXT[], marc)
263 FROM biblio.record_entry
268 $sth->bind_param(1, $_);
269 if ($sth->execute()) {
270 my $crap = $sth->fetchall_arrayref();
272 warn ("metabib.reingest_record_attributes failed for record $_");