5 use lib '/openils/lib/perl5/';
8 use OpenILS::Utils::Fieldmapper;
9 use Digest::MD5 qw/md5_hex/;
10 use OpenSRF::Utils::JSON;
12 use Unicode::Normalize;
16 use Time::HiRes qw/time/;
19 use MARC::File::XML ( BinaryEncoding => 'utf-8' );
23 #MARC::Charset->ignore_errors(1);
25 my ($id_field, $id_subfield, $recid, $user, $config, $idlfile, $marctype, $tcn_offset, $tcn_mapfile, $tcn_dumpfile, $used_id_file, $used_tcn_file, $enc, @files, @trash_fields, @req_fields, $use901, $quiet, $tcn_field, $tcn_subfield) =
26 ('', 'a', 0, 1, '/openils/conf/opensrf_core.xml', '/openils/conf/fm_IDL.xml', 'USMARC', 0);
28 my ($db_driver,$db_host,$db_name,$db_user,$db_pw) =
29 ('Pg','localhost','evergreen','postgres','postgres');
32 'marctype=s' => \$marctype, # format of MARC files being processed defaults to USMARC, often set to XML
33 'startid=i' => \$recid, # id number to start with when auto-assigning id numbers, defaults to highest id in database + 1
34 'idfield=s' => \$id_field, # field containing the record's desired internal id, NOT tcn
35 'idsubfield=s' => \$id_subfield, # subfield of above record id field
36 'tcnfield=s' => \$tcn_field, # field containing the record's desired tcn, NOT the internal id
37 'tcnsubfield=s' => \$tcn_subfield, # subfield of above record tcn field
38 'tcnoffset=i' => \$tcn_offset, # optionally skip characters at beginning of supplied tcn (e.g. to remove '(Sirsi)')
39 'user=s' => \$user, # set creator/editor values for records in database
40 'encoding=s' => \$enc, # set assumed MARC encoding for MARC::Charset
41 'keyfile=s' => \$tcn_mapfile, # DEPRECATED, use tcn_mapfile instead
42 'tcn_mapfile=s' => \$tcn_mapfile, # external file which allows for matching specific record tcns to specific record ids, format = one id_number|tcn_number combo per line
43 'tcnfile=s' => \$tcn_dumpfile, # DEPRECATED, use tcn_dumpfile instead
44 'tcn_dumpfile=s' => \$tcn_dumpfile, # allows specification of a dumpfile for all used tcn values
45 'config=s' => \$config, # location of OpenSRF core config file, defaults to /openils/conf/opensrf_core.xml
46 'file=s' => \@files, # files to process (or you can simple list the files as unnamed arguments, i.e. @ARGV)
47 'required_fields=s' => \@req_fields, # skip any records missing these fields
48 'trash=s' => \@trash_fields, # fields to remove from all processed records
49 'xml_idl=s' => \$idlfile, # location of XML IDL file, defaults to /openils/conf/fm_IDL.xml
50 'dontuse=s' => \$used_id_file, # DEPRECATED, use used_id_file instead
51 'used_id_file=s' => \$used_id_file, # external file which prevents id collisions by specifying ids already in use in the database, format = one id number per line
52 'used_tcn_file=s' => \$used_tcn_file, # external file which prevents tcn collisions by specifying tcns already in use in the database, format = one tcn number per line
53 "db_driver=s" => \$db_driver, # database driver type, usually 'Pg'
54 "db_host=s" => \$db_host, # database hostname
55 "db_name=s" => \$db_name, # database name
56 "db_user=s" => \$db_user, # database username
57 "db_pw=s" => \$db_pw, # database password
58 'use901' => \$use901, # use values from previously created 901 fields and skip all other processing
59 'quiet' => \$quiet # do not output progress count
62 @trash_fields = split(/,/,join(',',@trash_fields));
63 @req_fields = split(/,/,join(',',@req_fields));
66 MARC::Charset->ignore_errors(1);
67 MARC::Charset->assume_encoding($enc);
70 if (uc($marctype) eq 'XML') {
76 @files = @ARGV if (!@files);
82 my $dsn = "dbi:$db_driver:host=$db_host;dbname=$db_name";
85 my $table = 'biblio_record_entry';
86 $table = 'biblio.record_entry' if ($db_driver eq 'Pg');
88 my $dbh = DBI->connect($dsn,$db_user,$db_pw);
89 my $sth = $dbh->prepare("SELECT MAX(id) + 1 FROM $table");
92 $sth->bind_col(1, \$recid);
98 my %tcn_source_map = (
108 Fieldmapper->import(IDL => $idlfile);
112 open F, $tcn_mapfile or die "Couldn't open key file $tcn_mapfile";
114 if ( /^(\d+)\|(\S+)/o ) {
123 open F, $used_id_file or die "Couldn't open used-id file $used_id_file";
128 $used_recids{$_} = 1;
134 if ($used_tcn_file) {
135 open F, $used_tcn_file or die "Couldn't open used-tcn file $used_tcn_file";
145 select STDERR; $| = 1;
146 select STDOUT; $| = 1;
148 my $batch = new MARC::Batch ( $marctype, @files );
149 $batch->strict_off();
150 $batch->warnings_off();
152 my $starttime = time;
155 PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) {
156 next if ($rec == -1);
160 # Skip records that don't contain a required field (like '245', for example)
161 foreach my $req_field (@req_fields) {
162 if (!$rec->field("$req_field")) {
163 warn "\n!!! Record $count missing required field $req_field, skipping record.\n";
171 # If $use901 is set, use it for the id, the tcn, and the tcn source without ANY further processing (i.e. no error checking)
173 $rec->delete_field($_) for ($rec->field(@trash_fields));
174 $tcn_value = $rec->subfield('901' => 'a');
175 $tcn_source = $rec->subfield('901' => 'b');
176 $id = $rec->subfield('901' => 'c');
178 # This section of code deals with the record's 'id', which is a system-level, numeric, internal identifier
179 # It is often convenient but not necessary to carry over the internal ids from your previous ILS, so here is where that happens
181 my $field = $rec->field($id_field);
183 if ($field->is_control_field) {
186 $id = $field->subfield($id_subfield);
188 # ensure internal record ids are numeric only
189 $id =~ s/\D+//gso if $id;
194 warn "\n!!! Record $count has missing or invalid id field $id_field, assinging new id.\n";
196 } elsif (exists $used_recids{$id}) {
197 warn "\n!!! Record $count has a duplicate id in field $id_field, assinging new id.\n";
200 $used_recids{$id} = 1;
204 # id field not specified or found to be invalid, assign auto id
206 while (exists $used_recids{$recid}) {
209 $used_recids{$recid} = 1;
214 # This section of code deals with the record's 'tcn', or title control number, which is a record-level, possibly alpha-numeric, sometimes user-supplied value
217 if (my $tcn = $tcn_map{$id}) {
218 $rec->delete_field( $_ ) for ($rec->field($tcn_field));
219 $rec->append_fields( MARC::Field->new( $tcn_field, '', '', $tcn_subfield, $tcn ) );
221 warn "\n!!! ID $id not found in tcn_mapfile, skipping record.\n";
227 my $field = $rec->field($tcn_field);
229 if ($field->is_control_field) {
230 $tcn_value = $field->data;
232 $tcn_value = $field->subfield($tcn_subfield);
234 # $tcn_offset is another Sirsi influence, as it will allow you to remove '(Sirsi)'
235 # from exported tcns, but was added more generically to perhaps support other use cases
237 $tcn_value = substr($tcn_value, $tcn_offset);
244 # turn our id and tcn into a 901 field, and also create a tcn and/or figure out the tcn source
246 ($field901, $tcn_value, $tcn_source) = preprocess($rec, $tcn_value, $id);
247 # delete the old identifier and trash fields
248 $rec->delete_field($_) for ($rec->field('901', $tcn_field, $id_field, @trash_fields));
249 $rec->append_fields($field901);
252 (my $xml = $rec->as_xml_record()) =~ s/\n//sog;
253 $xml =~ s/^<\?xml.+\?\s*>//go;
254 $xml =~ s/>\s+</></go;
255 $xml =~ s/\p{Cc}//go;
256 $xml = entityize($xml,'D');
257 $xml =~ s/[\x00-\x1f]//go;
259 my $bib = new Fieldmapper::biblio::record_entry;
264 $bib->creator($user);
265 $bib->create_date('now');
267 $bib->edit_date('now');
268 $bib->tcn_source($tcn_source);
269 $bib->tcn_value($tcn_value);
270 $bib->last_xact_id('IMPORT-'.$starttime);
272 print OpenSRF::Utils::JSON->perl2JSON($bib)."\n";
273 $used_tcns{$tcn_value} = 1;
275 if (!$quiet && !($count % 50)) {
276 print STDERR "\r$count\t". $count / (time - $starttime);
281 open TCN_DUMPFILE, '>', $tcn_dumpfile;
282 print TCN_DUMPFILE "$_\n" for (keys %used_tcns);
288 my $tcn_value = shift;
292 # in the following code, $tcn_number represents the portion of the tcn following the source code-letter
297 # this preprocess subroutine is optimized for Sirsi-created tcns, that is, those with a single letter
298 # followed by some digits (and maybe 'x' in older systems). If using user supplied tcns, try to identify
299 # the source here, otherwise set to 'z' ('Unknown')
300 if ($tcn_value =~ /([a-z])([0-9xX]+)/) {
307 # save and warn if a passed in TCN is replaced
308 if ($tcn_value && exists $used_tcns{$tcn_value}) {
309 $passed_tcn = $tcn_value;
316 # we didn't have a user supplied tcn, or it was a duplicate, so let's derive one from commonly unique record fields
318 my $f = $rec->field('001');
319 $tcn_value = despace($f->data) if ($f);
322 if (!$tcn_value || exists $used_tcns{$tcn_value}) {
323 my $f = $rec->field('000');
325 $tcn_number = despace($f->data);
326 $tcn_source = 'g'; # only Project Gutenberg seems to use this
327 $tcn_value = $tcn_source.$tcn_number;
331 if (!$tcn_value || exists $used_tcns{$tcn_value}) {
332 my $f = $rec->field('020');
334 $tcn_number = despace($f->subfield('a'));
336 $tcn_value = $tcn_source.$tcn_number;
340 if (!$tcn_value || exists $used_tcns{$tcn_value}) {
341 my $f = $rec->field('022');
343 $tcn_number = despace($f->subfield('a'));
345 $tcn_value = $tcn_source.$tcn_number;
349 if (!$tcn_value || exists $used_tcns{$tcn_value}) {
350 my $f = $rec->field('010');
352 $tcn_number = despace($f->subfield('a'));
354 $tcn_value = $tcn_source.$tcn_number;
358 if (!$tcn_value || exists $used_tcns{$tcn_value}) {
361 $tcn_value = $tcn_source.$tcn_number;
364 # special case to catch possibly passed in full OCLC numbers and those derived from the 001 field
365 if ($tcn_value =~ /^oc(m|n)(\d+)$/o) {
368 $tcn_value = $tcn_source.$tcn_number;
371 # expand $tcn_source from code letter to full name
372 $tcn_source = do { $tcn_source_map{$tcn_source} || 'Unknown' };
375 warn "\n!!! TCN $passed_tcn is already in use, using TCN ($tcn_value) derived from $tcn_source ID.\n";
378 my $field901 = MARC::Field->new(
385 return ($field901, $tcn_value, $tcn_source);
392 if ($form and $form eq 'D') {
393 $stuff = NFD($stuff);
395 $stuff = NFC($stuff);
398 $stuff =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe;
405 # remove all leading/trailing spaces and trucate at first internal space if present
408 $value =~ s/^(\S+).*$/$1/o;