From 7125758576667feeff054ceb4b79d7d1510bbc6f Mon Sep 17 00:00:00 2001 From: gmc Date: Wed, 6 Apr 2011 21:49:06 +0000 Subject: [PATCH] install command-line MARC import tools in @prefix@/bin No longer need to keep the source tree around to use marc2are.pl, marc2sre.pl, marc2bre.pl, and parallel_pg_loader.pl. Signed-off-by: Galen Charlton git-svn-id: svn://svn.open-ils.org/ILS/trunk@20008 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/Makefile.am | 2 +- Open-ILS/src/extras/import/marc2are.pl | 2 - Open-ILS/src/extras/import/marc2are.pl.in | 119 ++++++ Open-ILS/src/extras/import/marc2bre.pl | 2 - Open-ILS/src/extras/import/marc2bre.pl.in | 396 ++++++++++++++++++ Open-ILS/src/extras/import/marc2sre.pl.in | 323 ++++++++++++++ .../src/extras/import/parallel_pg_loader.pl | 2 - .../extras/import/parallel_pg_loader.pl.in | 136 ++++++ configure.ac | 9 +- 9 files changed, 983 insertions(+), 8 deletions(-) create mode 100755 Open-ILS/src/extras/import/marc2are.pl.in create mode 100755 Open-ILS/src/extras/import/marc2bre.pl.in create mode 100755 Open-ILS/src/extras/import/marc2sre.pl.in create mode 100755 Open-ILS/src/extras/import/parallel_pg_loader.pl.in diff --git a/Open-ILS/src/Makefile.am b/Open-ILS/src/Makefile.am index 7988b53eb0..e87bbd142b 100644 --- a/Open-ILS/src/Makefile.am +++ b/Open-ILS/src/Makefile.am @@ -138,7 +138,7 @@ if BUILDEGJAVA OILSJAVA_DIR = java endif -bin_SCRIPTS = $(core_scripts) $(reporter_scripts) $(installautojs) @srcdir@/extras/eg_config @srcdir@/extras/fast-extract +bin_SCRIPTS = $(core_scripts) $(reporter_scripts) $(installautojs) @srcdir@/extras/eg_config @srcdir@/extras/fast-extract @srcdir@/extras/import/marc2are.pl @srcdir@/extras/import/marc2bre.pl @srcdir@/extras/import/marc2sre.pl @srcdir@/extras/import/parallel_pg_loader.pl data_DATA = $(core_data) $(reporter_data) # Take care of which subdirectories to build, and which extra files to include in a distribution. 
diff --git a/Open-ILS/src/extras/import/marc2are.pl b/Open-ILS/src/extras/import/marc2are.pl index 1eb86d43cf..62734adaa1 100755 --- a/Open-ILS/src/extras/import/marc2are.pl +++ b/Open-ILS/src/extras/import/marc2are.pl @@ -2,8 +2,6 @@ use strict; use warnings; -use lib '/openils/lib/perl5/'; - use OpenSRF::System; use OpenSRF::Application; use OpenSRF::EX qw/:try/; diff --git a/Open-ILS/src/extras/import/marc2are.pl.in b/Open-ILS/src/extras/import/marc2are.pl.in new file mode 100755 index 0000000000..d6a4c12c00 --- /dev/null +++ b/Open-ILS/src/extras/import/marc2are.pl.in @@ -0,0 +1,119 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use OpenSRF::System; +use OpenSRF::Application; +use OpenSRF::EX qw/:try/; +use OpenSRF::AppSession; +use OpenSRF::MultiSession; +use OpenSRF::Utils::SettingsClient; +use OpenILS::Application::AppUtils; +use OpenILS::Utils::Fieldmapper; +use Digest::MD5 qw/md5_hex/; +use OpenSRF::Utils::JSON; +use Data::Dumper; +use Unicode::Normalize; + +use Time::HiRes qw/time/; +use Getopt::Long; +use MARC::Batch; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use MARC::Charset; + +MARC::Charset->ignore_errors(1); + +my ($count, $user, $password, $config, $marctype, $keyfile, @files, $quiet) = + (1, 'admin', 'open-ils', '@sysconfdir@/opensrf_core.xml', 'USMARC'); + +GetOptions( + 'startid=i' => \$count, + 'user=s' => \$user, + 'marctype=s' => \$marctype, + 'password=s' => \$password, + 'config=s' => \$config, + 'file=s' => \@files, + 'quiet' => \$quiet, +); + +@files = @ARGV if (!@files); + +my @ses; +my @req; +my %processing_cache; + +OpenSRF::System->bootstrap_client( config_file => $config ); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +$user = OpenILS::Application::AppUtils->check_user_session( login($user,$password) )->id; + +select STDERR; $| = 1; +select STDOUT; $| = 1; + +my $batch = new MARC::Batch ( $marctype, @files ); +$batch->strict_off(); +$batch->warnings_off(); + +my $starttime = 
time; +my $rec; +while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { + next if ($rec == -1); + my $id = $count; + + (my $xml = $rec->as_xml_record()) =~ s/\n//sog; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize($xml); + $xml =~ s/[\x00-\x1f]//go; + + my $bib = new Fieldmapper::authority::record_entry; + $bib->id($id); + $bib->active('t'); + $bib->deleted('f'); + $bib->marc($xml); + $bib->creator($user); + $bib->create_date('now'); + $bib->editor($user); + $bib->edit_date('now'); + $bib->last_xact_id('IMPORT-'.$starttime); + + print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; + + $count++; + + if (!$quiet && !($count % 20)) { + print STDERR "\r$count\t". $count / (time - $starttime); + } +} + +sub login { + my( $username, $password, $type ) = @_; + + $type |= "staff"; + + my $seed = OpenILS::Application::AppUtils->simplereq( + 'open-ils.auth', + 'open-ils.auth.authenticate.init', + $username + ); + + die("No auth seed. Couldn't talk to the auth server") unless $seed; + + my $response = OpenILS::Application::AppUtils->simplereq( + 'open-ils.auth', + 'open-ils.auth.authenticate.complete', + { username => $username, + password => md5_hex($seed . 
md5_hex($password)), + type => $type }); + + die("No auth response returned on login.") unless $response; + + my $authtime = $response->{payload}->{authtime}; + my $authtoken = $response->{payload}->{authtoken}; + + die("Login failed for user $username!") unless $authtoken; + + return $authtoken; +} + diff --git a/Open-ILS/src/extras/import/marc2bre.pl b/Open-ILS/src/extras/import/marc2bre.pl index e5c9604dcb..bddde4fb83 100755 --- a/Open-ILS/src/extras/import/marc2bre.pl +++ b/Open-ILS/src/extras/import/marc2bre.pl @@ -2,8 +2,6 @@ use strict; use warnings; -#use lib '/openils/lib/perl5/'; - use Error qw/:try/; use OpenILS::Utils::Fieldmapper; use Digest::MD5 qw/md5_hex/; diff --git a/Open-ILS/src/extras/import/marc2bre.pl.in b/Open-ILS/src/extras/import/marc2bre.pl.in new file mode 100755 index 0000000000..d9de5c3f44 --- /dev/null +++ b/Open-ILS/src/extras/import/marc2bre.pl.in @@ -0,0 +1,396 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use Error qw/:try/; +use OpenILS::Utils::Fieldmapper; +use Digest::MD5 qw/md5_hex/; +use OpenSRF::Utils::JSON; +use OpenILS::Application::AppUtils; +use Data::Dumper; +use Unicode::Normalize; +use Encode; + +use FileHandle; +use Time::HiRes qw/time/; +use Getopt::Long; +use MARC::Batch; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use MARC::Charset; +use DBI; + +#MARC::Charset->ignore_errors(1); + +my ($id_field, $id_subfield, $recid, $user, $config, $idlfile, $marctype, $tcn_offset, $tcn_mapfile, $tcn_dumpfile, $used_id_file, $used_tcn_file, $enc, @files, @trash_fields, @req_fields, $use901, $quiet, $tcn_field, $tcn_subfield) = + ('', 'a', 0, 1, '@sysconfdir@/opensrf_core.xml', '@sysconfdir@/fm_IDL.xml', 'USMARC', 0); + +my ($db_driver, $db_host, $db_port, $db_name, $db_user, $db_pw) = + ('Pg', 'localhost', 5432, 'evergreen', 'postgres', 'postgres'); + +GetOptions( + 'marctype=s' => \$marctype, # format of MARC files being processed defaults to USMARC, often set to XML + 'startid=i' => \$recid, # id number to start 
with when auto-assigning id numbers, defaults to highest id in database + 1 + 'idfield=s' => \$id_field, # field containing the record's desired internal id, NOT tcn + 'idsubfield=s' => \$id_subfield, # subfield of above record id field + 'tcnfield=s' => \$tcn_field, # field containing the record's desired tcn, NOT the internal id + 'tcnsubfield=s' => \$tcn_subfield, # subfield of above record tcn field + 'tcnoffset=i' => \$tcn_offset, # optionally skip characters at beginning of supplied tcn (e.g. to remove '(Sirsi)') + 'user=s' => \$user, # set creator/editor values for records in database + 'encoding=s' => \$enc, # set assumed MARC encoding for MARC::Charset + 'keyfile=s' => \$tcn_mapfile, # DEPRECATED, use tcn_mapfile instead + 'tcn_mapfile=s' => \$tcn_mapfile, # external file which allows for matching specific record tcns to specific record ids, format = one id_number|tcn_number combo per line + 'tcnfile=s' => \$tcn_dumpfile, # DEPRECATED, use tcn_dumpfile instead + 'tcn_dumpfile=s' => \$tcn_dumpfile, # allows specification of a dumpfile for all used tcn values + 'config=s' => \$config, # location of OpenSRF core config file, defaults to @sysconfdir@/opensrf_core.xml + 'file=s' => \@files, # files to process (or you can simple list the files as unnamed arguments, i.e. 
@ARGV) + 'required_fields=s' => \@req_fields, # skip any records missing these fields + 'trash=s' => \@trash_fields, # fields to remove from all processed records + 'xml_idl=s' => \$idlfile, # location of XML IDL file, defaults to @sysconfdir@/fm_IDL.xml + 'dontuse=s' => \$used_id_file, # DEPRECATED, use used_id_file instead + 'used_id_file=s' => \$used_id_file, # external file which prevents id collisions by specifying ids already in use in the database, format = one id number per line + 'used_tcn_file=s' => \$used_tcn_file, # external file which prevents tcn collisions by specifying tcns already in use in the database, format = one tcn number per line + "db_driver=s" => \$db_driver, # database driver type, usually 'Pg' + "db_host=s" => \$db_host, # database hostname + "db_port=i" => \$db_port, # database port + "db_name=s" => \$db_name, # database name + "db_user=s" => \$db_user, # database username + "db_pw=s" => \$db_pw, # database password + 'use901' => \$use901, # use values from previously created 901 fields and skip all other processing + 'quiet' => \$quiet # do not output progress count +); + +@trash_fields = split(/,/,join(',',@trash_fields)); +@req_fields = split(/,/,join(',',@req_fields)); + +if ($enc) { + MARC::Charset->ignore_errors(1); + MARC::Charset->assume_encoding($enc); +} + +if (uc($marctype) eq 'XML') { + 'open'->use(':utf8'); +} else { + bytes->use(); +} + +@files = @ARGV if (!@files); + +my @ses; +my @req; +my %processing_cache; + +my $dsn = "dbi:$db_driver:host=$db_host;port=$db_port;dbname=$db_name"; + +if (!$recid) { + my $table = 'biblio_record_entry'; + $table = 'biblio.record_entry' if ($db_driver eq 'Pg'); + + my $dbh = DBI->connect($dsn,$db_user,$db_pw); + my $sth = $dbh->prepare("SELECT MAX(id) + 1 FROM $table"); + + $sth->execute; + $sth->bind_col(1, \$recid); + $sth->fetch; + $sth->finish; + $dbh->disconnect; + + # In a clean Evergreen schema, the maximum ID will be -1; but sequences + # have to start at 1, so handle the clean 
Evergreen schema situation + if ($recid == 0) { + $recid = 1; + } +} + +my %tcn_source_map = ( + a => 'Sirsi_Auto', + o => 'OCLC', + i => 'ISxN', + l => 'LCCN', + s => 'System', + g => 'Gutenberg', + z => 'Unknown', +); + +Fieldmapper->import(IDL => $idlfile); + +my %tcn_map; +if ($tcn_mapfile) { + open F, $tcn_mapfile or die "Couldn't open key file $tcn_mapfile"; + while () { + if ( /^(\d+)\|(\S+)/o ) { + $tcn_map{$1} = $2; + } + } + close(F); +} + +my %used_recids; +if ($used_id_file) { + open F, $used_id_file or die "Couldn't open used-id file $used_id_file"; + while () { + chomp; + s/^\s*//; + s/\s*$//; + $used_recids{$_} = 1; + } + close(F); +} + +my %used_tcns; +if ($used_tcn_file) { + open F, $used_tcn_file or die "Couldn't open used-tcn file $used_tcn_file"; + while () { + chomp; + s/^\s*//; + s/\s*$//; + $used_tcns{$_} = 1; + } + close(F); +} + +select STDERR; $| = 1; +select STDOUT; $| = 1; + +my $batch = new MARC::Batch ( $marctype, @files ); +$batch->strict_off(); +$batch->warnings_off(); + +my $starttime = time; +my $rec; +my $count = 0; +PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { + next if ($rec == -1); + + $count++; + + # Skip records that don't contain a required field (like '245', for example) + foreach my $req_field (@req_fields) { + if (!$rec->field("$req_field")) { + warn "\n!!! Record $count missing required field $req_field, skipping record.\n"; + next PROCESS; + } + } + + my $id; + my $tcn_value = ''; + my $tcn_source = ''; + # If $use901 is set, use it for the id, the tcn, and the tcn source without ANY further processing (i.e. 
no error checking) + if ($use901) { + $rec->delete_field($_) for ($rec->field(@trash_fields)); + $tcn_value = $rec->subfield('901' => 'a'); + $tcn_source = $rec->subfield('901' => 'b'); + $id = $rec->subfield('901' => 'c'); + } else { + # This section of code deals with the record's 'id', which is a system-level, numeric, internal identifier + # It is often convenient but not necessary to carry over the internal ids from your previous ILS, so here is where that happens + if ($id_field) { + my $field = $rec->field($id_field); + if ($field) { + if ($field->is_control_field) { + $id = $field->data; + } else { + $id = $field->subfield($id_subfield); + } + # ensure internal record ids are numeric only + $id =~ s/\D+//gso if $id; + } + + # catch problem ids + if (!$id) { + warn "\n!!! Record $count has missing or invalid id field $id_field, assigning new id.\n"; + $id = ''; + } elsif (exists $used_recids{$id}) { + warn "\n!!! Record $count has a duplicate id in field $id_field, assigning new id.\n"; + $id = ''; + } else { + $used_recids{$id} = 1; + } + } + + # id field not specified or found to be invalid, assign auto id + if (!$id) { + while (exists $used_recids{$recid}) { + $recid++; + } + $used_recids{$recid} = 1; + $id = $recid; + $recid++; + } + + # This section of code deals with the record's 'tcn', or title control number, which is a record-level, possibly alpha-numeric, sometimes user-supplied value + if ($tcn_field) { + if ($tcn_mapfile) { + if (my $tcn = $tcn_map{$id}) { + $rec->delete_field( $_ ) for ($rec->field($tcn_field)); + $rec->append_fields( MARC::Field->new( $tcn_field, '', '', $tcn_subfield, $tcn ) ); + } else { + warn "\n!!! 
ID $id not found in tcn_mapfile, skipping record.\n"; + $count++; + next; + } + } + + my $field = $rec->field($tcn_field); + if ($field) { + if ($field->is_control_field) { + $tcn_value = $field->data; + } else { + $tcn_value = $field->subfield($tcn_subfield); + } + # $tcn_offset is another Sirsi influence, as it will allow you to remove '(Sirsi)' + # from exported tcns, but was added more generically to perhaps support other use cases + if ($tcn_value) { + $tcn_value = substr($tcn_value, $tcn_offset); + } else { + $tcn_value = ''; + } + } + } + + # turn our id and tcn into a 901 field, and also create a tcn and/or figure out the tcn source + ($tcn_value, $tcn_source) = preprocess($rec, $tcn_value, $id); + # delete the old identifier and trash fields + $rec->delete_field($_) for ($rec->field('901', $tcn_field, $id_field, @trash_fields)); + } + + (my $xml = $rec->as_xml_record()) =~ s/\n//sog; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize($xml); + $xml =~ s/[\x00-\x1f]//go; + + my $bib = new Fieldmapper::biblio::record_entry; + $bib->id($id); + $bib->active('t'); + $bib->deleted('f'); + $bib->marc($xml); + $bib->creator($user); + $bib->create_date('now'); + $bib->editor($user); + $bib->edit_date('now'); + $bib->tcn_source($tcn_source); + $bib->tcn_value($tcn_value); + $bib->last_xact_id('IMPORT-'.$starttime); + + print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; + $used_tcns{$tcn_value} = 1; + + if (!$quiet && !($count % 50)) { + print STDERR "\r$count\t". 
$count / (time - $starttime); + } +} + +if ($tcn_dumpfile) { + open TCN_DUMPFILE, '>', $tcn_dumpfile; + print TCN_DUMPFILE "$_\n" for (keys %used_tcns); +} + + +sub preprocess { + my $rec = shift; + my $tcn_value = shift; + my $id = shift; + + my $tcn_source = ''; + # in the following code, $tcn_number represents the portion of the tcn following the source code-letter + my $tcn_number = ''; + my $warn = 0; + my $passed_tcn = ''; + + # this preprocess subroutine is optimized for Sirsi-created tcns, that is, those with a single letter + # followed by some digits (and maybe 'x' in older systems). If using user supplied tcns, try to identify + # the source here, otherwise set to 'z' ('Unknown') + if ($tcn_value =~ /([a-z])([0-9xX]+)/) { + $tcn_source = $1; + $tcn_number = $2; + } else { + $tcn_source = 'z'; + } + + # save and warn if a passed in TCN is replaced + if ($tcn_value && exists $used_tcns{$tcn_value}) { + $passed_tcn = $tcn_value; + $tcn_value = ''; + $tcn_number = ''; + $tcn_source = ''; + $warn = 1; + } + + # we didn't have a user supplied tcn, or it was a duplicate, so let's derive one from commonly unique record fields + if (!$tcn_value) { + my $f = $rec->field('001'); + $tcn_value = despace($f->data) if ($f); + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + my $f = $rec->field('000'); + if ($f) { + $tcn_number = despace($f->data); + $tcn_source = 'g'; # only Project Gutenberg seems to use this + $tcn_value = $tcn_source.$tcn_number; + } + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + my $f = $rec->field('020'); + if ($f) { + $tcn_number = despace($f->subfield('a')); + $tcn_source = 'i'; + $tcn_value = $tcn_source.$tcn_number; + } + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + my $f = $rec->field('022'); + if ($f) { + $tcn_number = despace($f->subfield('a')); + $tcn_source = 'i'; + $tcn_value = $tcn_source.$tcn_number; + } + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + my $f = $rec->field('010'); + 
if ($f) { + $tcn_number = despace($f->subfield('a')); + $tcn_source = 'l'; + $tcn_value = $tcn_source.$tcn_number; + } + } + + # special case to catch possibly passed in full OCLC numbers and those derived from the 001 field + if ($tcn_value =~ /^oc(m|n)(\d+)$/o) { + $tcn_source = 'o'; + $tcn_number = $2; + $tcn_value = $tcn_source.$tcn_number; + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + $tcn_source = 's'; + $tcn_number = $id; + $tcn_value = $tcn_source.$tcn_number; + $warn = 1 + } + + + # expand $tcn_source from code letter to full name + $tcn_source = do { $tcn_source_map{$tcn_source} || 'Unknown' }; + + if ($warn) { + warn "\n!!! TCN $passed_tcn is already in use, using TCN ($tcn_value) derived from $tcn_source ID.\n"; + } + + return ($tcn_value, $tcn_source); +} + +sub despace { + my $value = shift; + + # remove all leading/trailing spaces and trucate at first internal space if present + $value =~ s/\s*$//o; + $value =~ s/^\s*//o; + $value =~ s/^(\S+).*$/$1/o; + + return $value; +} diff --git a/Open-ILS/src/extras/import/marc2sre.pl.in b/Open-ILS/src/extras/import/marc2sre.pl.in new file mode 100755 index 0000000000..377aaf7045 --- /dev/null +++ b/Open-ILS/src/extras/import/marc2sre.pl.in @@ -0,0 +1,323 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use OpenSRF::System; +use OpenSRF::EX qw/:try/; +use OpenSRF::Utils::SettingsClient; +use OpenILS::Application::AppUtils; +use OpenILS::Event; +use OpenILS::Utils::Fieldmapper; +use OpenSRF::Utils::JSON; +use Unicode::Normalize; + +use Time::HiRes qw/time/; +use Getopt::Long; +use MARC::Batch; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use MARC::Charset; +use Pod::Usage; + +MARC::Charset->ignore_errors(1); + +# Command line options, with applicable defaults +my ($idsubfield, $bibfield, $bibsubfield, @files, $libmap, $quiet, $help); +my $idfield = '004'; +my $count = 1; +my $user = 'admin'; +my $config = '@sysconfdir@/opensrf_core.xml'; +my $marctype = 'USMARC'; + +my $parse_options = 
GetOptions( + 'idfield=s' => \$idfield, + 'idsubfield=s' => \$idsubfield, + 'bibfield=s'=> \$bibfield, + 'bibsubfield=s'=> \$bibsubfield, + 'startid=i'=> \$count, + 'user=s' => \$user, + 'config=s' => \$config, + 'marctype=s' => \$marctype, + 'file=s' => \@files, + 'libmap=s' => \$libmap, + 'quiet' => \$quiet, + 'help' => \$help, +); + +if (!$parse_options or $help) { + pod2usage(0); +} + +@files = @ARGV if (!@files); + +my $U = 'OpenILS::Application::AppUtils'; +my @ses; +my @req; +my %processing_cache; +my $lib_id_map; +if ($libmap) { + $lib_id_map = map_libraries_to_ID($libmap); +} + +OpenSRF::System->bootstrap_client( config_file => $config ); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +my ($result, $evt) = get_user_id($user); +if ($evt || !$result->id) { + print("Could not retrieve user with user name '$user'\n"); + exit(0); +} + +$user = $result->id; + +select STDERR; $| = 1; +select STDOUT; $| = 1; + +my $batch = new MARC::Batch ( $marctype, @files ); +$batch->strict_off(); +$batch->warnings_off(); + +my $starttime = time; +my $rec; +while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { + next if ($rec == -1); + my $id = $count; + my $record_field; + if ($idsubfield) { + $record_field = $rec->field($idfield, $idsubfield); + } else { + $record_field = $rec->field($idfield); + } + my $record = $count; + + if ($record_field) { + $record = $record_field->data; + } + + # If we have been given bibfield / bibsubfield values, use those to find + # a matching bib record for $record and use _that_ as our record instead + if ($bibfield) { + my ($result, $evt) = map_id_to_bib($record); + if ($evt || !$result->record) { + print("Could not find matching bibliographic record for $record\n"); + } + $record = $result->record; + } else { + # Strip the identifier down to a usable integer + $record =~ s/^.*?(\d+).*?$/$1/o; + } + + (my $xml = $rec->as_xml_record()) =~ s/\n//sog; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ 
s/>\s+entityize($xml); + $xml =~ s/[\x00-\x1f]//go; + + my $bib = new Fieldmapper::serial::record_entry; + $bib->id($id); + $bib->record($record); + $bib->active('t'); + $bib->deleted('f'); + $bib->marc($xml); + $bib->creator($user); + $bib->create_date('now'); + $bib->editor($user); + $bib->edit_date('now'); + $bib->last_xact_id('IMPORT-'.$starttime); + + if ($libmap) { + my $lib_id = get_library_id($rec); + if ($lib_id) { + $bib->owning_lib($lib_id); + } + } + + print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; + + $count++; + + if (!$quiet && !($count % 20)) { + print STDERR "\r$count\t". $count / (time - $starttime); + } +} + +# Generate a hash of library names (as found in the 852b in the MFHD record) to +# integers representing actor.org_unit ID values +sub map_libraries_to_ID { + my $map_filename = shift; + + my %lib_id_map; + + open(MAP_FH, '<', $map_filename) or die "Could not load [$map_filename] $!"; + while () { + my ($lib, $id) = $_ =~ /^(.*?)\t(.*?)$/; + $lib_id_map{$lib} = $id; + } + + return \%lib_id_map; +} + +# Look up the actor.org_unit.id value for this library name +sub get_library_id { + my $record = shift; + + my $lib_name = $record->field('852')->subfield('b'); + my $lib_id = $lib_id_map->{$lib_name}; + + return $lib_id; +} + +# Get the actor.usr.id value for the given username +sub get_user_id { + my $username = shift; + + my ($result, $evt); + + $result = $U->cstorereq( + 'open-ils.cstore.direct.actor.user.search', + { usrname => $username, deleted => 'f' } + ); + $evt = OpenILS::Event->new('ACTOR_USR_NOT_FOUND') unless $result; + + return ($result, $evt); +} + +# Get the biblio.record_entry.id value for the given identifier; note that this +# approach uses a wildcard to match anything that precedes the identifier value +sub map_id_to_bib { + my $record = shift; + + my ($result, $evt); + + my %search = ( + tag => $bibfield, + value => { ilike => '%' . 
$record } + ); + + if ($bibsubfield) { + $search{'subfield'} = $bibsubfield; + } + + $result = $U->cstorereq( + 'open-ils.cstore.direct.metabib.full_rec.search', \%search + ); + $evt = OpenILS::Event->new('METABIB_FULL_REC_NOT_FOUND') unless $record; + + return ($result, $evt); +} + +__END__ + +=head1 NAME + +marc2sre.pl - Convert MARC Format for Holdings Data (MFHD) records to SRE +(serial.record_entry) JSON objects + +=head1 SYNOPSIS + +C<marc2sre.pl> [B<--config>=I<opensrf_core.xml>] +[B<--idfield>=I<field>[ B<--idsubfield>=I<subfield>]] [B<--startid>=I<integer>] +[B<--user>=I<username>] [B<--marctype>=I<type>] +[B<--file>=I<file>[, ...]] [B<--libmap>=I<mapfile>] [B<--quiet>] +[B<--bibfield>=I<field> [B<--bibsubfield>=I<subfield>]] + +=head1 DESCRIPTION + +For one or more files containing MFHD records, iterate through the records +and generate SRE (serial.record_entry) JSON objects. + +=head1 OPTIONS + +=over + +=item * B<-c> I<opensrf_core.xml>, B<--config>=I<opensrf_core.xml> + +Specifies the OpenSRF configuration file used to connect to the OpenSRF router. +Defaults to F<@sysconfdir@/opensrf_core.xml> + +=item * B<--idfield> I<field> + +Specifies the MFHD field where the identifier of the corresponding +bibliographic record is found. Defaults to '004'. + +=item * B<--idsubfield> I<subfield> + +Specifies the MFHD subfield, if any, where the identifier of the corresponding +bibliographic record is found. This option is ignored unless it is accompanied +by the B<--idfield> option. Defaults to null. + +=item * B<--bibfield> I<field> + +Specifies the field in the bibliographic record that holds the identifier +value. Defaults to null. + +=item * B<--bibsubfield> I<subfield> + +Specifies the subfield in the bibliographic record, if any, that holds the +identifier value. This option is ignored unless it is accompanied by the +B<--bibfield> option. Defaults to null. + +=item * B<-u> I<username>, B<--user>=I<username> + +Specifies the Evergreen user that will own these serial records. + +=item * B<-m> I<type>, B<--marctype>=I<type> + +Specifies whether the files containing the MFHD records are in MARC21 ('USMARC') +or MARC21XML ('XML') format. Defaults to 'USMARC'. 
+ +=item * B<-l> I<mapfile>, B<--libmap>=I<mapfile> + +Points to a file containing a mapping of library names to integers. +The integer represents the actor.org_unit.id value of the library. This enables +us to generate an ingest file that does not subsequently need to be manually +manipulated. + +The library name must correspond to the 'b' subfield of the 852 field. +Well, it does not have to, but you will have to modify this script +accordingly. + +The format of the map file should be the name of the library, followed +by a tab, followed by the desired numeric ID of the library. For example: + +BR1 4 +BR2 5 + +=item * B<-q>, B<--quiet> + +Suppresses the record counter output. + +=back + +=head1 EXAMPLES + + marc2sre.pl --idfield 004 --bibfield 035 --bibsubfield a --user cat1 serial_holding.xml + +Processes MFHD records in the B<serial_holding.xml> file. The script pulls the +bibliographic record identifier from the 004 control field of the MFHD record +and searches for a matching value in the bibliographic record in data field +035, subfield a. The "cat1" user will own the processed MFHD records. + +=head1 AUTHOR + +Dan Scott + +=head1 COPYRIGHT AND LICENSE + +Copyright 2010-2011 by Dan Scott + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +=cut diff --git a/Open-ILS/src/extras/import/parallel_pg_loader.pl b/Open-ILS/src/extras/import/parallel_pg_loader.pl index 43ddaa9ce5..211d7f60e5 100755 --- a/Open-ILS/src/extras/import/parallel_pg_loader.pl +++ b/Open-ILS/src/extras/import/parallel_pg_loader.pl @@ -2,8 +2,6 @@ use strict; use warnings; -use lib '/openils/lib/perl5/'; - use OpenSRF::System; use OpenSRF::EX qw/:try/; use OpenSRF::Utils::SettingsClient; diff --git a/Open-ILS/src/extras/import/parallel_pg_loader.pl.in b/Open-ILS/src/extras/import/parallel_pg_loader.pl.in new file mode 100755 index 0000000000..f276f0b8d9 --- /dev/null +++ b/Open-ILS/src/extras/import/parallel_pg_loader.pl.in @@ -0,0 +1,136 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use OpenSRF::System; +use OpenSRF::EX qw/:try/; +use OpenSRF::Utils::SettingsClient; +use OpenILS::Utils::Fieldmapper; +use OpenSRF::Utils::JSON; +use FileHandle; + +use Time::HiRes qw/time/; +use Getopt::Long; + +my @files; +my ($config, $output, @auto, @order, @wipe) = + ('@sysconfdir@/opensrf_core.xml', 'pg_loader-output'); +my $nocommit = 0; + +GetOptions( + 'config=s' => \$config, + 'output=s' => \$output, + 'wipe=s' => \@wipe, + 'autoprimary=s' => \@auto, + 'order=s' => \@order, + 'nocommit=i' => \$nocommit, +); + +my $pwd = `pwd`; +chop($pwd); + +my %lineset; +my %fieldcache; + +OpenSRF::System->bootstrap_client( config_file => $config ); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +my $main_out = FileHandle->new(">$output.sql") if ($output); + +binmode($main_out,'utf8'); + +$main_out->print("SET CLIENT_ENCODING TO 'UNICODE';\n\n"); +$main_out->print("BEGIN;\n\n"); + +my %out_files; +for my $h (@order) { + $out_files{$h} = FileHandle->new(">$output.$h.sql"); + binmode($out_files{$h},'utf8'); +} + +my $count = 0; +my $starttime = time; +my $after_commit = ''; +while ( my $rec = <> ) { + next unless ($rec); + + my $row; + try { + $row = OpenSRF::Utils::JSON->JSON2perl($rec); + } catch Error with { 
+ my $e = shift; + warn "\n\n !!! Error : $e \n\n at or around line $count\n"; + }; + next unless ($row); + + my $class = $row->class_name; + my $hint = $row->json_hint; + + next unless ( grep /$hint/, @order ); + + if (!$fieldcache{$hint}) { + my @cols = $row->real_fields; + if (grep { $_ eq $hint} @auto) { + @cols = grep { $_ ne $class->Identity } @cols; + } + + $fieldcache{$hint} = + { table => $class->Table, + sequence => $class->Sequence, + pkey => $class->Identity, + fields => \@cols, + }; + + #XXX it burnnnsssessss + $fieldcache{$hint}{table} =~ s/\.full_rec/.real_full_rec/o if ($hint eq 'mfr'); + + my $fields = join(',', @{ $fieldcache{$hint}{fields} }); + $main_out->print( "DELETE FROM $fieldcache{$hint}{table};\n" ) if (grep {$_ eq $hint } @wipe); + # Speed up loading of bib records + $main_out->print( "COPY $fieldcache{$hint}{table} ($fields) FROM '$pwd/$output.$hint.sql';\n" ); + + } + + my $line = [map { $row->$_ } @{ $fieldcache{$hint}{fields} }]; + my @data; + my $x = 0; + for my $d (@$line) { + if (!defined($d)) { + $d = '\N'; + } else { + $d =~ s/\f/\\f/gos; + $d =~ s/\n/\\n/gos; + $d =~ s/\r/\\r/gos; + $d =~ s/\t/\\t/gos; + $d =~ s/\\/\\\\/gos; + } + if ($hint eq 'bre' and $fieldcache{$hint}{fields}[$x] eq 'quality') { + $d = int($d) if ($d ne '\N'); + } + push @data, $d; + $x++; + } + $out_files{$hint}->print( join("\t", @data)."\n" ); + + if (!($count % 500)) { + print STDERR "\r$count\t". 
$count / (time - $starttime); + } + + $count++; +} + +for my $hint (@order) { + next if (grep { $_ eq $hint} @auto); + next unless ($fieldcache{$hint}{sequence}); + $after_commit .= "SELECT setval('$fieldcache{$hint}{sequence}'::TEXT, (SELECT MAX($fieldcache{$hint}{pkey}) FROM $fieldcache{$hint}{table}), TRUE);\n"; +} + +if (grep /^mfr$/, %out_files) { + $main_out->print("SELECT reporter.enable_materialized_simple_record_trigger();\n"); + $main_out->print("SELECT reporter.disable_materialized_simple_record_trigger();\n"); +} + +$main_out->print("COMMIT;\n\n") unless $nocommit; +$main_out->print($after_commit); +$main_out->close; + diff --git a/configure.ac b/configure.ac index 88a84dceeb..c1f5137f42 100644 --- a/configure.ac +++ b/configure.ac @@ -373,12 +373,19 @@ AC_CONFIG_FILES([Makefile Open-ILS/updates/Makefile Open-ILS/xul/staff_client/Makefile Open-ILS/src/extras/eg_config - Open-ILS/src/extras/fast-extract + Open-ILS/src/extras/import/marc2are.pl + Open-ILS/src/extras/import/marc2bre.pl + Open-ILS/src/extras/import/marc2sre.pl + Open-ILS/src/extras/import/parallel_pg_loader.pl Open-ILS/src/perlmods/Makefile Open-ILS/src/perlmods/lib/OpenILS/Utils/Cronscript.pm], [ if test -e "./Open-ILS/src/extras/eg_config"; then chmod 755 Open-ILS/src/extras/eg_config; fi; if test -e "./Open-ILS/src/extras/fast-extract"; then chmod 755 Open-ILS/src/extras/fast-extract; fi; + if test -e "./Open-ILS/src/extras/import/marc2are.pl"; then chmod 755 Open-ILS/src/extras/import/marc2are.pl; fi; + if test -e "./Open-ILS/src/extras/import/marc2bre.pl"; then chmod 755 Open-ILS/src/extras/import/marc2bre.pl; fi; + if test -e "./Open-ILS/src/extras/import/marc2sre.pl"; then chmod 755 Open-ILS/src/extras/import/marc2sre.pl; fi; + if test -e "./Open-ILS/src/extras/import/parallel_pg_loader.pl"; then chmod 755 Open-ILS/src/extras/import/parallel_pg_loader.pl; fi; ]) AC_OUTPUT -- 2.43.2