From 7528f1623cb4931b66be434ff413c5bb992ff048 Mon Sep 17 00:00:00 2001 From: dbs Date: Thu, 7 Apr 2011 04:47:00 +0000 Subject: [PATCH] Enable marc2sre.pl to run reasonably fast with a large set of bibs Our previous iteration of marc2sre.pl used an ILIKE stanza beginning with a wildcard to match system control numbers without having to specify the institution's MARC code. This worked, but was painfully slow in large bib sets as the database needed to use a bitmap index scan to find matches. By adding a --prefix flag, the user can specify the institutional MARC code for the set of records and we can use an exact match against metabib.full_rec.value, which is immeasurably faster. This is, of course, a problem if there are multiple institutional MARC codes in use for a given set of bibliographic records. git-svn-id: svn://svn.open-ils.org/ILS/trunk@20012 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/extras/import/marc2sre.pl.in | 40 +++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/Open-ILS/src/extras/import/marc2sre.pl.in b/Open-ILS/src/extras/import/marc2sre.pl.in index a783d10c6b..58e7fa69f7 100755 --- a/Open-ILS/src/extras/import/marc2sre.pl.in +++ b/Open-ILS/src/extras/import/marc2sre.pl.in @@ -8,6 +8,7 @@ use OpenSRF::Utils::SettingsClient; use OpenILS::Application::AppUtils; use OpenILS::Event; use OpenILS::Utils::Fieldmapper; +use OpenILS::Utils::Normalize qw/naco_normalize/; use OpenSRF::Utils::JSON; use Unicode::Normalize; @@ -21,7 +22,7 @@ use Pod::Usage; MARC::Charset->ignore_errors(1); # Command line options, with applicable defaults -my ($idsubfield, $bibfield, $bibsubfield, @files, $libmap, $quiet, $help); +my ($idsubfield, $prefix, $bibfield, $bibsubfield, @files, $libmap, $quiet, $help); my $idfield = '004'; my $count = 1; my $user = 'admin'; @@ -31,6 +32,7 @@ my $marctype = 'USMARC'; my $parse_options = GetOptions( 'idfield=s' => \$idfield, 'idsubfield=s' => \$idsubfield, + 'prefix=s'=> \$prefix, 
'bibfield=s'=> \$bibfield, 'bibsubfield=s'=> \$bibsubfield, 'startid=i'=> \$count, @@ -192,16 +194,20 @@ sub get_user_id { return ($result, $evt); } -# Get the biblio.record_entry.id value for the given identifier; note that this -# approach uses a wildcard to match anything that precedes the identifier value +# Get the biblio.record_entry.id value for the given identifier sub map_id_to_bib { my $record = shift; my ($result, $evt); + $record = naco_normalize($record); + if ($prefix) { + $record = "$prefix $record"; + } + my %search = ( tag => $bibfield, - value => { ilike => '%' . $record } + value => naco_normalize($record) ); if ($bibsubfield) { @@ -256,6 +262,12 @@ Specifies the MFHD subfield, if any, where the identifier of the corresponding bibliographic record is found. This option is ignored unless it is accompanied by the B<--idfield> option. Defaults to null. +=item * B<-p> I<prefix> B<--prefix>=I<prefix> + +Specifies the MARC code for the organization that should be prefixed to the +bibliographic record identifier. This option is ignored unless it is accompanied +by the B<--bibfield> option. Defaults to null. + =item * B<--bibfield> I Specifies the field in the bibliographic record that holds the identifier @@ -301,12 +313,28 @@ Suppresses the record counter output. =head1 EXAMPLES - marc2sre.pl --idfield 004 --bibfield 035 --bibsubfield a --user cat1 serial_holding.xml + marc2sre.pl --user admin --marctype XML --libmap library.map --file serial_holding.xml + +Processes MFHD records in the B<serial_holding.xml> file as a MARC21XML file, +using the default 004 control field for the source of the bibliographic record +ID and converting the ID to a plain integer for matching directly against the +B<biblio.record_entry.id> column. The file B<library.map> contains the mappings +of library names to integers, and the "admin" user will own the processed MFHD +records. 
+ + marc2sre.pl --idfield 004 --prefix ocolc --bibfield 035 --bibsubfield a --user cat1 serial_holding.mrc + +B<WARNING>: The B<--bibfield> / B<--bibsubfield> options require one database +lookup per MFHD record and will greatly slow down your import. Avoid if at all +possible. Processes MFHD records in the B<serial_holding.xml> file. The script pulls the bibliographic record identifier from the 004 control field of the MFHD record and searches for a matching value in the bibliographic record in data field -035, subfield a. The "cat1" user will own the processed MFHD records. +035, subfield a. The prefix "ocolc" will be prepended to the bibliographic +record identifier to provide exact matchings against the +B<metabib.full_rec.value> column. The "cat1" user will own the processed MFHD +records. =head1 AUTHOR -- 2.43.2