From b5ff44cd4afe3cb68d78a5017f03517b821a5120 Mon Sep 17 00:00:00 2001 From: miker Date: Fri, 14 Jul 2006 20:07:37 +0000 Subject: [PATCH] more import pipeline improvement git-svn-id: svn://svn.open-ils.org/ILS/trunk@5013 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/extras/import/direct_ingest.pl | 16 +++- Open-ILS/src/extras/import/direct_loader.pl | 5 +- Open-ILS/src/extras/import/importer.sh | 36 +++++++ Open-ILS/src/extras/import/marc2bre.pl | 71 ++++++++------ Open-ILS/src/extras/import/pg_loader.pl | 100 ++++++++++++++++++++ 5 files changed, 195 insertions(+), 33 deletions(-) create mode 100755 Open-ILS/src/extras/import/importer.sh create mode 100755 Open-ILS/src/extras/import/pg_loader.pl diff --git a/Open-ILS/src/extras/import/direct_ingest.pl b/Open-ILS/src/extras/import/direct_ingest.pl index b0773477d9..81fceca81b 100755 --- a/Open-ILS/src/extras/import/direct_ingest.pl +++ b/Open-ILS/src/extras/import/direct_ingest.pl @@ -50,7 +50,11 @@ for (1 .. $workers) { push @ses, $w; } else { $0 = "Local Ingest Worker $_"; - worker($r, $_); + if ($workers == 1) { + worker($r, -1); + } else { + worker($r, $_); + } exit; } } @@ -63,9 +67,17 @@ sub worker { OpenSRF::System->bootstrap_client( config_file => $config ); Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + sleep 1; + OpenILS::Application::Ingest->use; - my $f = new FileHandle(">${prefix}$file"); + my $fname = "${prefix}$file"; + if ($file == -1) { + $fname = '&STDOUT'; + } + + my $f = new FileHandle(">$fname"); + while (my $rec = <$pipe>) { my $bib = JSON->JSON2perl($rec); diff --git a/Open-ILS/src/extras/import/direct_loader.pl b/Open-ILS/src/extras/import/direct_loader.pl index 1ed48dacfd..c512585533 100755 --- a/Open-ILS/src/extras/import/direct_loader.pl +++ b/Open-ILS/src/extras/import/direct_loader.pl @@ -67,7 +67,10 @@ while ( my $rec = <> ) { } $count++; + + last if ($count > 10000); } -OpenSRF::Application->method_lookup( "$base.finish" )->run; + +#OpenSRF::Application->method_lookup( "$base.finish" )->run; diff --git a/Open-ILS/src/extras/import/importer.sh b/Open-ILS/src/extras/import/importer.sh new file mode 100755 index 0000000000..ac153259b0 --- /dev/null +++ b/Open-ILS/src/extras/import/importer.sh @@ -0,0 +1,36 @@ +#!/bin/sh + +CONF=$1 +FILE=$2 +OUT=$3 +KEYS=$4 + +if [ "_$OUT" == "_" ]; then + echo "Usage: $0 {Config File} {MARC file} {Output File} [{key file}]" + exit; +fi + +DIR=`dirname $0` + +$DIR/marc2bre.pl \ + -k $KEYS \ + -c $CONF $FILE 2>/dev/null | \ + $DIR/direct_ingest.pl \ + -c $CONF \ + -t 1 2>/dev/null | \ + $DIR/pg_loader.pl -c $CONF \ + -or bre \ + -or mrd \ + -or mfr \ + -or mtfe \ + -or mafe \ + -or msfe \ + -or mkfe \ + -or msefe \ + -a mrd \ + -a mfr \ + -a mtfe \ + -a mafe \ + -a msfe \ + -a mkfe \ + -a msefe diff --git a/Open-ILS/src/extras/import/marc2bre.pl b/Open-ILS/src/extras/import/marc2bre.pl index 19ce90d1ca..53b23233cb 100755 --- a/Open-ILS/src/extras/import/marc2bre.pl +++ b/Open-ILS/src/extras/import/marc2bre.pl @@ -4,26 +4,6 @@ use warnings; use lib '/openils/lib/perl5/'; -#use OpenSRF::System; -#use OpenILS::Utils::Fieldmapper; -#use OpenSRF::Utils::SettingsClient; -# -#OpenSRF::System->bootstrap_client(config_file => -#'/openils/conf/bootstrap.conf'); -#Fieldmapper->import(IDL => -# OpenSRF::Utils::SettingsClient->new->config_value("IDL")); -# -# # do this after bootstrapping/importing IDL -# require OpenILS::Application::Search; -# -# my $meth = OpenSRF::Application->method_lookup( -# 'open-ils.search.biblio.metarecord.mods_slim.retrieve'); -# my @data = $meth->run(1); -# my $mods = shift @data; -# print "Got mvr: " . $mods->title . "\n"; - - - use OpenSRF::System; use OpenSRF::Application; use OpenSRF::EX qw/:try/; @@ -35,6 +15,7 @@ use OpenILS::Utils::Fieldmapper; use Digest::MD5 qw/md5_hex/; use JSON; use Data::Dumper; +use Unicode::Normalize; use Time::HiRes qw/time/; use Getopt::Long; @@ -45,7 +26,7 @@ use UNIVERSAL::require; MARC::Charset->ignore_errors(1); -my ($id_field, $count, $user, $password, $config, $keyfile, @files) = +my ($id_field, $count, $user, $password, $config, $keyfile, @files, @trash_fields) = ('998', 1, 'admin', 'open-ils', '/openils/conf/bootstrap.conf'); GetOptions( @@ -56,6 +37,7 @@ GetOptions( 'keyfile=s' => \$keyfile, 'config=s' => \$config, 'file=s' => \@files, + 'trash=s' => \@trash_fields, ); @files = @ARGV if (!@files); @@ -98,33 +80,47 @@ $batch->warnings_off(); my $starttime = time; while ( my $rec = $batch->next ) { - my $id = $rec->subfield($id_field => 'a') || $count; + my $id; + my $field = $rec->field($id_field); + + if ($field) { + if ($field->is_control_field) { + $id = $field->data; + } else { + $id = $field->subfield('a'); + } + } else { + $id = $count; + } + if ($id =~ /(\d+)/o) { $id = $1; } if ($keyfile) { if (my $tcn = $keymap{$id}) { - $rec->delete_field( $_ ) for ($rec->field('035')); - $rec->append_fields( MARC::Field->new( '035', '', '', 'a', $tcn ) ); + $rec->delete_field( $_ ) for ($rec->field($id_field)); + $rec->append_fields( MARC::Field->new( $id_field, '', '', 'a', $tcn ) ); } else { $count++; next; } } - $rec = preprocess($rec, $id); + $rec = preprocess($rec); if (!$rec) { next; } - my $tcn_value = $rec->subfield('039' => 'a'); - my $tcn_source = $rec->subfield('039' => 'b'); + my $tcn_value = $rec->subfield($id_field => 'a'); + my $tcn_source = $rec->subfield($id_field => 'b'); (my $xml = $rec->as_xml_record()) =~ s/\n//sog; $xml =~ s/^<\?xml.+\?\s*>//go; $xml =~ s/>\s+id($id); @@ -183,7 +179,7 @@ sub preprocess { } if (!$id) { - my $f = $rec->field('035'); + my $f = $rec->field($id_field); $id = $f->subfield('a') if ($f); $source = 's'; } @@ -194,6 +190,8 @@ sub preprocess { return undef; } + $rec->delete_field($_) for ($rec->field($id_field, @trash_fields)); + $id =~ s/\s*$//o; $id =~ s/^\s*//o; $id =~ s/(\S+)$/$1/o; @@ -207,13 +205,12 @@ sub preprocess { } my $tcn = MARC::Field->new( - '039', + $id_field, '', '', 'a', $id, 'b', do { $source_map{$source} || 'System' }, ); - $rec->delete_field($_) for ($rec->field('035','948','998')); $rec->append_fields($tcn); return $rec; @@ -249,3 +246,17 @@ sub login { return $authtoken; } +sub entityize { + my $stuff = shift; + my $form = shift; + + if ($form and $form eq 'D') { + $stuff = NFD($stuff); + } else { + $stuff = NFC($stuff); + } + + $stuff =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; + return $stuff; +} + diff --git a/Open-ILS/src/extras/import/pg_loader.pl b/Open-ILS/src/extras/import/pg_loader.pl new file mode 100755 index 0000000000..86a9f1a2f1 --- /dev/null +++ b/Open-ILS/src/extras/import/pg_loader.pl @@ -0,0 +1,100 @@ +#!/usr/bin/perl +use strict; + +use lib '/openils/lib/perl5/'; + +use OpenSRF::System; +use OpenSRF::EX qw/:try/; +use OpenSRF::Utils::SettingsClient; +use OpenILS::Utils::Fieldmapper; +use JSON; +use FileHandle; + +use Time::HiRes qw/time/; +use Getopt::Long; + +my @files; +my ($config, $output, @auto, @order) = + ('/openils/conf/bootstrap.conf'); + +GetOptions( + 'config=s' => \$config, + 'output=s' => \$output, + 'autoprimary=s' => \@auto, + 'order=s' => \@order, +); + +my %lineset; +my %fieldcache; + +OpenSRF::System->bootstrap_client( config_file => $config ); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +my $count = 0; +my $starttime = time; +while ( my $rec = <> ) { + next unless ($rec); + + my $row; + try { + $row = JSON->JSON2perl($rec); + } catch Error with { + my $e = shift; + warn "\n\n !!! Error : $e \n\n at or around line $count\n"; + }; + die unless ($row); + + my $class = $row->class_name; + my $hint = $row->json_hint; + + if (!$lineset{$hint}) { + $lineset{$hint} = []; + my @cols = $row->real_fields; + if (grep { $_ eq $hint} @auto) { + @cols = grep { $_ ne $class->Identity } @cols; + } + + $fieldcache{$hint} = + { table => $class->Table, + fields => \@cols, + }; + } + + push @{ $lineset{$hint} }, [map { $row->$_ } @{ $fieldcache{$hint}{fields} }]; + + if (!($count % 500)) { + print STDERR "\r$count\t". $count / (time - $starttime); + } + + $count++; +} + +print STDERR "\nWriting file ...\n"; + +$output = '&STDOUT' unless ($output); +$output = FileHandle->new(">$output") if ($output); + +binmode($output,'utf8'); + +$output->print("SET CLIENT_ENCODING TO 'UNICODE';\n\n"); + +for my $h (@order) { + my $fields = join(',', @{ $fieldcache{$h}{fields} }); + $output->print( "COPY $fieldcache{$h}{table} ($fields) FROM STDIN;\n" ); + + for my $line (@{ $lineset{$h} }) { + my @data; + for my $d (@$line) { + if (!defined($d)) { + $d = '\N'; + } else { + $d =~ s/\t/\\t/go; + $d =~ s/\\/\\\\/go; + } + push @data, $d; + } + $output->print( join("\t", @data)."\n" ); + } + + $output->print('\.'."\n\n"); +} -- 2.43.2