2 # ---------------------------------------------------------------
3 # Copyright © 2013,2014 Merrimack Valley Library Consortium
4 # Jason Stephenson <jstephenson@mvlc.org>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 # ---------------------------------------------------------------
16 # TODO: Document with POD.
17 # This script parallelizes a reingest.
22 # You will want to adjust the next two settings based on your database
23 # size, i.e., the number of bib records, as well as the number of cores
24 # on your database server. Using roughly number of cores/2 doesn't seem
25 # to have much impact in off-peak times.
31 # Globals for the command line options:
32 my $do_browse = 1; # Do the browse reingest.
33 my $do_attrs = 1; # Do the record attributes reingest.
34 my $do_search = 1; # Do the search reingest.
35 my $do_facets = 1; # Do the facets reingest.
37 # Command line options to skip different reingests. In this case, we
38 # use the '-' to indicate a minus or a no, so to
39 # skip browse reingest: -browse or -b
40 # skip attribute reingest: -attributes or -a
41 # skip search reingest: -search or -s
42 # skip facet reingest: -facets or -f
44 if (/^-b(?:rowse)?$/) {
46 } elsif (/^-a(?:ttr(?:ibute)?s?)?$/) {
48 } elsif (/^-s(?:earch)?$/) {
50 } elsif (/^-f(?:acets?)?$/) {
53 # TODO: Add usage() function to report allowed options.
54 die ("Unrecognized option: $_");
58 # "Gimme the keys! I'll drive!"
61 FROM biblio.record_entry
67 # Stuffs needed for looping, tracking how many lists of records we
68 # have, storing the actual list of records, and the list of the lists
70 my ($count, $lists, $records) = (0,0,[]);
72 # To do the browse-only ingest:
75 # All of the DBI->connect() calls in this file assume that you have
76 # configured the PGHOST, PGPORT, PGDATABASE, PGUSER, and PGPASSWORD
77 # variables in your execution environment. If you have not, you have
82 # 2) edit the DBI->connect() calls in this program so that it can
83 # connect to your database.
84 my $dbh = DBI->connect('DBI:Pg:');
86 my $results = $dbh->selectall_arrayref($q);
87 foreach my $r (@$results) {
89 push(@blist, $record); # separate list of browse-only ingest
90 push(@$records, $record);
91 if (++$count == BATCHSIZE) {
92 $lol[$lists++] = $records;
97 $lol[$lists++] = $records if ($count); # Last batch is likely to be
101 # We're going to reuse $count to keep track of the total number of
105 # @running keeps track of the running child processes.
108 # We start the browse-only ingest before starting the other ingests.
109 browse_ingest(@blist) if ($do_browse);
111 # We loop until we have processed all of the batches stored in @lol:
112 while ($count < $lists) {
113 if (scalar(@lol) && scalar(@running) < MAXCHILD) {
114 # Reuse $records to hold the next batch shifted off @lol.
115 $records = shift(@lol);
116 if ($do_search || $do_facets || $do_attrs) {
123 if (grep {$_ == $pid} @running) {
124 @running = grep {$_ != $pid} @running;
126 print "$count of $lists processed\n";
131 # This subroutine forks a process to do the browse-only ingest on the
132 # @blist above. It cannot be parallelized, but can run in parallel
133 # with the other ingests.
137 if (!defined($pid)) {
138 die "failed to spawn child";
140 # Add our browser to the list of running children.
141 push(@running, $pid);
142 # Increment the number of lists, because this list was not
143 # previously counted.
145 } elsif ($pid == 0) {
146 my $dbh = DBI->connect('DBI:Pg:');
147 my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, TRUE, FALSE, TRUE)");
149 if ($sth->execute($_)) {
150 my $crap = $sth->fetchall_arrayref();
152 warn ("Browse ingest failed for record $_");
160 # Fork a child to do the other reingests:
165 if (!defined($pid)) {
166 die "Failed to spawn a child";
168 push(@running, $pid);
169 } elsif ($pid == 0) {
170 my $dbh = DBI->connect('DBI:Pg:');
171 reingest_attributes($dbh, $list) if ($do_attrs);
172 reingest_field_entries($dbh, $list) if ($do_facets || $do_search);
178 # Reingest metabib field entries on a list of records.
179 sub reingest_field_entries {
182 my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, ?, TRUE, ?)");
183 # Because reingest uses "skip" options, we invert the logic of the do variables.
184 $sth->bind_param(2, ($do_facets) ? 0 : 1);
185 $sth->bind_param(3, ($do_search) ? 0 : 1);
187 $sth->bind_param(1, $_);
188 if ($sth->execute()) {
189 my $crap = $sth->fetchall_arrayref();
191 warn ("metabib.reingest_metabib_field_entries failed for record $_");
196 # Reingest record attributes on a list of records.
197 sub reingest_attributes {
200 my $sth = $dbh->prepare(<<END_OF_INGEST
201 SELECT metabib.reingest_record_attributes(id, NULL::TEXT[], marc)
202 FROM biblio.record_entry
207 $sth->bind_param(1, $_);
208 if ($sth->execute()) {
209 my $crap = $sth->fetchall_arrayref();
211 warn ("metabib.reingest_record_attributes failed for record $_");