]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm
adding client-side stream push method; working on the ingestor
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Application / Ingest.pm
1 package OpenILS::Application::Ingest;
2 use base qw/OpenSRF::Application/;
3
4 use Unicode::Normalize;
5 use OpenSRF::EX qw/:try/;
6
7 use OpenSRF::Utils::SettingsClient;
8 use OpenSRF::Utils::Logger qw/:level/;
9
10 use OpenILS::Utils::FlatXML;
11 use OpenILS::Utils::Fieldmapper;
12 use JSON;
13
14 use OpenILS::Utils::Fieldmapper;
15
16 use XML::LibXML;
17 use XML::LibXSLT;
18 use Time::HiRes qw(time);
19
# Metadata formats known to the ingestor, keyed by format name.  Each entry
# holds the XML namespace URI ({ns}) that goes with the format; post_init()
# adds a compiled stylesheet ({xslt}) to the mods and mods3 entries.
our %supported_formats = (
    mods3   => { ns => 'http://www.loc.gov/mods/v3' },
    mods    => { ns => 'http://www.loc.gov/mods/' },
    marcxml => { ns => 'http://www.loc.gov/MARC21/slim' },
    srw_dc  => { ns => '' },
    oai_dc  => { ns => '' },
    rdf_dc  => { ns => '' },
);

# Logger class name; logging methods are called on it as class methods.
our $log = 'OpenSRF::Utils::Logger';

# XML parser and XSLT processor shared by every package in this file.
our $parser = XML::LibXML->new();
our $xslt   = XML::LibXSLT->new();

# Stylesheet slots; nothing in this part of the file assigns them.
our ($mods_sheet, $mads_sheet);

# Index definitions filled in by post_init():
#   $xpathset->{field_class}{name} = { xpath => ..., id => ..., format => ... }
our $xpathset = {};

# Application hooks that have nothing to do here.
sub initialize {}
sub child_init {}
40
# One-time, lazy setup of the data shared by every ingest method.
#
# Returns immediately once $xpathset holds at least one field class.
# Otherwise it
#   * compiles the MARC21slim2MODS and MARC21slim2MODS3 stylesheets from the
#     configured 'xsl' directory into $supported_formats{mods}{xslt} and
#     $supported_formats{mods3}{xslt} (each only if not loaded already), and
#   * asks open-ils.cstore for every config.metabib_field row and stores its
#     xpath, id and format under $xpathset->{field_class}{name}.
#
# Invoked as a class method without further arguments; callers in this file
# ignore the return value.
sub post_init {

    return if (keys %$xpathset);

    $log->debug("Running post_init", DEBUG);

    my $xsldir = OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl');

    # [ format key, label used in the log message, stylesheet file name ]
    my @stylesheets = (
        [ 'mods',  'MODS',    'MARC21slim2MODS.xsl'  ],
        [ 'mods3', 'MODS v3', 'MARC21slim2MODS3.xsl' ],
    );

    for my $sheet (@stylesheets) {
        my ($format, $label, $file) = @$sheet;
        next if ($supported_formats{$format}{xslt});

        $log->debug("Loading $label XSLT", DEBUG);
        my $xslt_doc = $parser->parse_file( $xsldir . "/" . $file );
        $supported_formats{$format}{xslt} = $xslt->parse_stylesheet( $xslt_doc );
    }

    # The request has to be gathered before its result can be iterated; the
    # record lookups elsewhere in this file use ->gather(1) the same way.
    my $fields = OpenSRF::AppSession
            ->create('open-ils.cstore')
            ->request(
                'open-ils.cstore.direct.config.metabib_field.search.atomic',
                { id => { '!=' => undef } }
            )->gather(1);

    unless (ref $fields and @$fields) {
        # Leave $xpathset empty so that the next call tries again.
        $log->error("post_init could not load any index definitions");
        return;
    }

    for my $f (@$fields) {
        $xpathset->{ $f->field_class }->{ $f->name }->{xpath} = $f->xpath;
        $xpathset->{ $f->field_class }->{ $f->name }->{id} = $f->id;
        $xpathset->{ $f->field_class }->{ $f->name }->{format} = $f->format;
        $log->debug("Loaded XPath from DB: ".$f->field_class." => ".$f->name." : ".$f->xpath, DEBUG);
    }

    return;
}
76
# Normalize a string and turn its non-ASCII characters into numeric entities.
#
#   first argument  - the text to convert
#   second argument - 'D' selects the decomposed form (NFD); any other value
#                     selects the composed form (NFC)
#
# Every character of the normalized text that lies in U+0080 .. U+FFFD is
# replaced by an upper-case hexadecimal character reference (&#xNN;).
# Returns the converted text.
sub entityize {
    my ($text, $form) = @_;

    my $normalized = ($form eq 'D') ? NFD($text) : NFC($text);

    $normalized =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/ge;

    return $normalized;
}
90
91 # --------------------------------------------------------------------------------
92 # MARC index extraction
93
94 package OpenILS::Application::Ingest::XPATH;
95 use base qw/OpenILS::Application::Ingest/;
96 use Unicode::Normalize;
97
# Collect the text selected by an XPath expression into a single string.
#
#   1. element to search (an XML documentElement)
#   2. XPath expression to evaluate against it
#   3. namespace URI  \ when both are true they are handed to
#   4. namespace prefix / setNamespace() on the element before searching
#   5. uniqueness flag: when true, a child whose text already occurs
#      anywhere in the text gathered so far is skipped
#
# For each matching node the text of every child node is appended, each
# followed by one space; a matching node without children contributes its
# own text instead.  The result is returned in NFD form.
sub xpath_to_string {
    my ($xml, $xpath, $ns_uri, $ns_prefix, $unique) = @_;

    if ($ns_uri && $ns_prefix) {
        $xml->setNamespace( $ns_uri, $ns_prefix, 1 );
    }

    my $buffer = "";

    for my $node ( $xml->findnodes( $xpath ) ) {

        my @kids = $node->childNodes();

        # a childless node supplies its own text
        unless (@kids) {
            $buffer .= $node->textContent . " ";
            next;
        }

        for my $kid (@kids) {
            my $text = $kid->textContent;

            # literal substring test against everything collected so far
            next if ($unique && $buffer =~ /\Q$text\E/);

            $buffer .= $text . " ";
        }
    }

    return NFD($buffer);
}
129
# Stream the index entries of one record for the requested field classes.
#
#   $self    - method object
#   $client  - client handle; every entry is sent through $client->respond
#   $xml     - the record, either an already parsed document or an XML string
#   @classes - field classes to extract (keys of $xpathset)
#
# For every definition stored under $xpathset->{class} the record is
# converted with the stylesheet registered for the definition's format (a
# format without a stylesheet uses the record as it is), the definition's
# XPath is evaluated, and the resulting text is cleaned up: marks (\pM) and
# control/other characters (\pC) are removed, a period following a word
# character is dropped, and the text is lower-cased.  Definitions that yield
# no text are skipped.  Each remaining value is sent as a
# Fieldmapper::metabib::<class>_field_entry with value and field set.
#
# Always returns undef; the results travel through $client.
sub class_index_string_xml {
    my $self = shift;
    my $client = shift;
    my $xml = shift;
    my @classes = @_;

    OpenILS::Application::Ingest->post_init();
    $xml = $parser->parse_string($xml) unless (ref $xml);

    # Transformed documents by format, so that each stylesheet runs at most
    # once per call instead of once per definition.
    my %transform_cache;

    for my $class (@classes) {
        my $class_constructor = "Fieldmapper::metabib::${class}_field_entry";
        for my $type ( keys %{ $xpathset->{$class} } ) {

            my $def = $xpathset->{$class}->{$type};
            my $format = $def->{format};
            my $sf = $supported_formats{$format} || {};

            # post_init() keeps the compiled stylesheets in
            # $supported_formats{...}{xslt}; $mods_sheet is never assigned.
            my $document = $xml;
            if ($sf->{xslt}) {
                $transform_cache{$format} ||= $sf->{xslt}->transform($xml);
                $document = $transform_cache{$format};
            }

            my $value = xpath_to_string(
                    $document->documentElement,
                    $def->{xpath},
                    $sf->{ns},
                    $format,
                    1
            );

            next unless $value;

            $value =~ s/\pM+//sgo;
            $value =~ s/\pC+//sgo;
            #$value =~ s/[\x{0080}-\x{fffd}]//sgoe;

            $value =~ s/(\w)\./$1/sgo;
            $value = lc($value);

            my $fm = $class_constructor->new;
            $fm->value( $value );
            $fm->field( $def->{id} );
            $client->respond($fm);
        }
    }
    return undef;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.field_entry.class.xml",
    method          => "class_index_string_xml",
    api_level       => 1,
    argc            => 2,
    stream          => 1,
);
176
# Stream the index entries of a stored bibliographic record.
#
#   $self    - method object
#   $client  - client handle; every entry is sent through $client->respond
#   $rec     - id of the biblio.record_entry to index
#   @classes - field classes to extract; all trailing arguments are used
#
# The record is fetched through open-ils.cstore and its MARC is run through
# open-ils.ingest.field_entry.class.xml.  Each entry coming back gets its
# source set to $rec before it is forwarded.  Nothing is sent when the record
# cannot be retrieved.  Always returns undef.
sub class_index_string_record {
    my $self = shift;
    my $client = shift;
    my $rec = shift;
    my @classes = @_;

    OpenILS::Application::Ingest->post_init();
    my $r = OpenSRF::AppSession
            ->create('open-ils.cstore')
            ->request(
                'open-ils.cstore.direct.biblio.record_entry.retrieve' => $rec
            )->gather(1);

    return undef unless ($r and @$r);

    for my $fm ($self->method_lookup("open-ils.ingest.field_entry.class.xml")->run($r->marc, @classes)) {
        $fm->source($rec);
        $client->respond($fm);
    }
    return undef;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.field_entry.class.record",
    method          => "class_index_string_record",
    api_level       => 1,
    argc            => 2,
    stream          => 1,
);
205
# Stream the index entries of one record for every known field class.
#
#   $self   - method object
#   $client - client handle; every entry is sent through $client->respond
#   $xml    - the record, handed on unchanged to
#             open-ils.ingest.field_entry.class.xml
#
# Always returns undef; the results travel through $client.
sub all_index_string_xml {
    my $self = shift;
    my $client = shift;
    my $xml = shift;

    # The class list is read from $xpathset right here, so the definitions
    # have to be loaded before the call below is put together.
    OpenILS::Application::Ingest->post_init();

    for my $fm ($self->method_lookup("open-ils.ingest.field_entry.class.xml")->run($xml, keys(%$xpathset))) {
        $client->respond($fm);
    }
    return undef;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.extract.field_entry.all.xml",
    method          => "all_index_string_xml",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
223
# Stream the index entries of a stored bibliographic record for every known
# field class.
#
#   $self   - method object
#   $client - client handle; every entry is sent through $client->respond
#   $rec    - id of the biblio.record_entry to index
#
# The record is fetched through open-ils.cstore and its MARC is run through
# open-ils.ingest.field_entry.class.xml with all keys of $xpathset as the
# class list.  Each entry gets its source set to $rec before it is
# forwarded.  Nothing is sent when the record cannot be retrieved.  Always
# returns undef.
sub all_index_string_record {
    my $self = shift;
    my $client = shift;
    my $rec = shift;

    OpenILS::Application::Ingest->post_init();
    my $r = OpenSRF::AppSession
            ->create('open-ils.cstore')
            ->request(
                'open-ils.cstore.direct.biblio.record_entry.retrieve' => $rec
            )
            ->gather(1);

    return undef unless ($r and @$r);

    for my $fm ($self->method_lookup("open-ils.ingest.field_entry.class.xml")->run($r->marc, keys(%$xpathset))) {
        $fm->source($rec);
        $client->respond($fm);
    }
    return undef;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.extract.field_entry.all.record",
    method          => "all_index_string_record",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
252
253 # --------------------------------------------------------------------------------
254 # Flat MARC
255
256 package OpenILS::Application::Ingest::FlatMARC;
257 use base qw/OpenILS::Application::Ingest/;
258 use Unicode::Normalize;
259
260
# Flatten a MARCXML document into full_rec rows.
#
#   1. parsed document that contains a "record" element (any namespace)
#   2. Fieldmapper namespace for the rows; 'metabib' when omitted or false
#
# Rows are built from the first record element found, in this order:
#   * every leader       -> tag 'LDR' and value
#   * every controlfield -> tag and value
#   * every subfield of every datafield -> tag, ind1, ind2, subfield code
#     and value (lower-cased)
# All values are NFD-normalized and stripped of marks (\pM).
#
# Returns the list of Fieldmapper::<namespace>::full_rec objects.  A document
# without a record element is reported through the logger and produces an
# empty list.
sub _marcxml_to_full_rows {

    my $marcxml = shift;
    my $xmltype = shift || 'metabib';

    my $type = "Fieldmapper::${xmltype}::full_rec";

    my @ns_list;

    my ($root) = $marcxml->findnodes('//*[local-name()="record"]');

    unless ($root) {
        $log->error("No record element found in $xmltype xml");
        return @ns_list;
    }

    # decompose the text of a node, then drop the marks
    my $clean_text = sub {
        my $val = NFD( $_[0]->textContent );
        $val =~ s/\pM+//g;
        return $val;
    };

    for my $tagline ( @{$root->getChildrenByTagName("leader")} ) {
        next unless $tagline;

        my $ns = $type->new;
        $ns->tag( 'LDR' );
        $ns->value( $clean_text->($tagline) );

        push @ns_list, $ns;
    }

    for my $tagline ( @{$root->getChildrenByTagName("controlfield")} ) {
        next unless $tagline;

        my $ns = $type->new;
        $ns->tag( $tagline->getAttribute( "tag" ) );
        $ns->value( $clean_text->($tagline) );

        push @ns_list, $ns;
    }

    for my $tagline ( @{$root->getChildrenByTagName("datafield")} ) {
        next unless $tagline;

        my $tag = $tagline->getAttribute( "tag" );
        my $ind1 = $tagline->getAttribute( "ind1" );
        my $ind2 = $tagline->getAttribute( "ind2" );

        for my $data ( @{$tagline->getChildrenByTagName('subfield')} ) {
            next unless $data;

            my $ns = $type->new;
            $ns->tag( $tag );
            $ns->ind1( $ind1 );
            $ns->ind2( $ind2 );
            $ns->subfield( $data->getAttribute( "code" ) );
            # only subfield values are folded to lower case
            $ns->value( lc( $clean_text->($data) ) );

            push @ns_list, $ns;
        }
    }

    $log->debug("Returning ".scalar(@ns_list)." Fieldmapper nodes from $xmltype xml", DEBUG);
    return @ns_list;
}
328
# Stream the flattened rows of a MARCXML record.
#
#   1. method object; an api_name containing "authority" selects authority
#      rows, any other name selects metabib rows
#   2. client handle; every row is sent through its respond method
#   3. the record, either an already parsed document or an XML string
#
# Always returns undef; the rows travel through the client handle.
sub flat_marc_xml {
    my ($self, $client, $xml) = @_;

    unless (ref $xml) {
        $xml = $parser->parse_string($xml);
    }

    my $type = ($self->api_name =~ /authority/) ? 'authority' : 'metabib';

    OpenILS::Application::Ingest->post_init();

    for my $row ( _marcxml_to_full_rows($xml, $type) ) {
        $client->respond($row);
    }

    return undef;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.flat_marc.authority.xml",
    method          => "flat_marc_xml",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.flat_marc.biblio.xml",
    method          => "flat_marc_xml",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
358
# Stream the flattened rows of a stored record.
#
#   $self   - method object; an api_name containing "authority" selects the
#             authority record tables, any other name selects biblio
#   $client - client handle; every row is sent through $client->respond
#   $rec    - id of the record_entry to flatten
#
# The record is fetched through open-ils.cstore and its MARC is handed to
# the matching open-ils.ingest.flat_marc.<type>.xml method.  Nothing is sent
# when the record cannot be retrieved or carries no MARC.  Always returns
# undef.
sub flat_marc_record {
    my $self = shift;
    my $client = shift;
    my $rec = shift;

    my $type = 'biblio';
    $type = 'authority' if ($self->api_name =~ /authority/o);

    OpenILS::Application::Ingest->post_init();

    # Without gather() the value is only the request handle, which has no
    # marc accessor.
    my $r = OpenSRF::AppSession
            ->create('open-ils.cstore')
            ->request( "open-ils.cstore.direct.${type}.record_entry.retrieve" => $rec )
            ->gather(1);

    return undef unless ($r and $r->marc);

    $client->respond($_) for ($self->method_lookup("open-ils.ingest.flat_marc.$type.xml")->run($r->marc));
    return undef;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.flat_marc.biblio.record_entry",
    method          => "flat_marc_record",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.flat_marc.authority.record_entry",
    method          => "flat_marc_record",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
389
390 # --------------------------------------------------------------------------------
391 # Fingerprinting
392
393 package OpenILS::Application::Ingest::Biblio::Fingerprint;
394 use base qw/OpenILS::Application::Ingest/;
395 use Unicode::Normalize;
396 use OpenSRF::EX qw/:try/;
397
# Compute the fingerprint of a stored bibliographic record.
#
#   $self   - method object
#   $client - client handle (not used)
#   $rec    - id of the biblio.record_entry
#
# The record is fetched through open-ils.cstore and its MARC is passed to
# the fingerprint method registered in this file,
# open-ils.ingest.fingerprint.xml.  Returns what that method produced, or
# undef when the record cannot be retrieved or carries no MARC.
sub biblio_fingerprint_record {
    my $self = shift;
    my $client = shift;
    my $rec = shift;

    OpenILS::Application::Ingest->post_init();

    # The session talks to cstore, so the method name has to be a cstore
    # one, and the request must be gathered before $r->marc can be called.
    my $r = OpenSRF::AppSession
            ->create('open-ils.cstore')
            ->request( 'open-ils.cstore.direct.biblio.record_entry.retrieve' => $rec )
            ->gather(1);

    return undef unless ($r and $r->marc);

    my ($fp) = $self->method_lookup('open-ils.ingest.fingerprint.xml')->run($r->marc);
    $log->debug("Returning [$fp] as fingerprint for record $rec", INFO);
    return $fp;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.worm.fingerprint.record",
    method          => "biblio_fingerprint_record",
    api_level       => 1,
    argc            => 1,
);
421
# Script runner for fingerprinting; created on first use and then reused.
our $fp_script;

# Compute the fingerprint of a MARC record by running the configured script.
#
#   $self   - method object (not used)
#   $client - client handle (not used)
#   $xml    - the MARC record; it is given to the script as environment.marc
#
# On the first call the script file and library path are read from the
# open-ils.storage application settings and the runner is created.  Returns
# the result of the script, or undef (after logging an error) when the
# script yields a false value.
sub biblio_fingerprint {
    my $self = shift;
    my $client = shift;
    my $xml = shift;

    $log->internal("Got MARC [$xml]");

    if(!$fp_script) {
        my @pfx = ( "apps", "open-ils.storage","app_settings" );
        my $conf = OpenSRF::Utils::SettingsClient->new;

        my $libs        = $conf->config_value(@pfx, 'script_path');
        my $script_file = $conf->config_value(@pfx, 'scripts', 'biblio_fingerprint');
        my $script_libs = (ref($libs)) ? $libs : [$libs];

        $log->debug("Loading script $script_file for biblio fingerprinting...");

        # The file never loads the runner class on its own.
        require OpenILS::Utils::ScriptRunner;

        $fp_script = OpenILS::Utils::ScriptRunner->new(
            file          => $script_file,
            paths         => $script_libs,
            reset_count   => 1000,
        );
    }

    $fp_script->insert('environment' => {marc => $xml} => 1);

    my $res = $fp_script->run;
    unless ($res) {
        # Bail out no matter what the logger call returns.
        $log->error( "Fingerprint script died!  $@" );
        return undef;
    }
    $log->debug("Script for biblio fingerprinting completed successfully...");

    return $res;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.ingest.fingerprint.xml",
    method          => "biblio_fingerprint",
    api_level       => 1,
    argc            => 1,
);
459
460
461 1;
462
463 __END__
464
# Ask the storage layer for the transaction that is currently open and
# return its answer.
sub in_transaction {
    OpenILS::Application::Ingest->post_init();

    my $current = __PACKAGE__->storage_req( 'open-ils.storage.transaction.current' );
    return $current;
}
469
# Open a storage transaction unless one is already in effect.
#
#   1. invocant
#   2. client handle, passed along to open-ils.storage.transaction.begin
#
# A failed BEGIN is rolled back and raised as a PANIC, which the handler
# below turns into a log entry.  Returns the current transaction as reported
# by storage after the attempt.
sub begin_transaction {
    my ($self, $client) = @_;

    OpenILS::Application::Ingest->post_init();
    my $outer_xact = __PACKAGE__->storage_req( 'open-ils.storage.transaction.current' );

    try {
        unless ($outer_xact) {
            $log->debug("Ingest isn't inside a transaction, starting one now.", INFO);
            #__PACKAGE__->st_sess->connect;
            my $began = __PACKAGE__->storage_req( 'open-ils.storage.transaction.begin', $client );
            if (!$began) {
                __PACKAGE__->storage_req( 'open-ils.storage.transaction.rollback' );
                #__PACKAGE__->st_sess->disconnect;
                OpenSRF::EX::PANIC->throw("Couldn't BEGIN transaction!");
            }
        }
    } otherwise {
        $log->debug("Ingest Couldn't BEGIN transaction!", ERROR);
    };

    return __PACKAGE__->storage_req( 'open-ils.storage.transaction.current' );
}
494
# Roll back the storage transaction that is in effect, if there is one.
#
#   1. invocant
#   2. client handle (not used)
#
# Without an open transaction the fact is only logged.  An error raised
# during the attempt is re-thrown as a PANIC.  Returns 1.
sub rollback_transaction {
    my ($self, $client) = @_;

    OpenILS::Application::Ingest->post_init();
    my $outer_xact = __PACKAGE__->storage_req( 'open-ils.storage.transaction.current' );

    try {
        if (!$outer_xact) {
            $log->debug("Ingest isn't inside a transaction.", INFO);
        } else {
            __PACKAGE__->storage_req( 'open-ils.storage.transaction.rollback' );
        }
    } catch Error with {
        OpenSRF::EX::PANIC->throw("Ingest Couldn't ROLLBACK transaction!");
    };

    return 1;
}
514
# Commit the storage transaction that is in effect, if there is one.
#
#   1. invocant
#   2. client handle (not used)
#
# A COMMIT that reports failure is followed by a rollback and a PANIC.
# Without an open transaction the fact is only logged.  An error raised
# during the attempt is re-thrown as a PANIC.  Returns 1.
sub commit_transaction {
    my ($self, $client) = @_;

    OpenILS::Application::Ingest->post_init();
    my $outer_xact = __PACKAGE__->storage_req( 'open-ils.storage.transaction.current' );

    try {
        #if (__PACKAGE__->st_sess->connected && $outer_xact) {
        if (!$outer_xact) {
            $log->debug("Ingest isn't inside a transaction.", INFO);
        } else {
            my $committed = __PACKAGE__->storage_req( 'open-ils.storage.transaction.commit' );
            if (!$committed) {
                __PACKAGE__->storage_req( 'open-ils.storage.transaction.rollback' );
                OpenSRF::EX::PANIC->throw("Couldn't COMMIT transaction!");
            }
            #__PACKAGE__->st_sess->disconnect;
        }
    } catch Error with {
        OpenSRF::EX::PANIC->throw("Ingest Couldn't COMMIT transaction!");
    };

    return 1;
}
540
# Run a locally registered method and return the first value it produces.
#
#   1. invocant (not used)
#   2. api name of the method to look up
#   3. remaining arguments, handed to the method unchanged
sub storage_req {
    my ($self, $method, @args) = @_;

    my ($first) = __PACKAGE__->method_lookup( $method )->run( @args );
    return $first;
}
547
# Delete the derived rows of an authority record.
#
#   1. method object
#   2. client handle, used when a transaction has to be opened
#   3. authority record id
#
# Works inside the caller's transaction when there is one; otherwise it
# opens its own and commits or rolls it back depending on the outcome.  The
# deletions run under the savepoint 'scrub_authority_record', which is rolled
# back on any error.  Returns 1 on success and 0 on failure.
sub scrub_authority_record {
    my ($self, $client, $rec) = @_;

    my $commit = 0;
    unless (OpenILS::Application::Ingest->in_transaction) {
        unless (OpenILS::Application::Ingest->begin_transaction($client)) {
            OpenSRF::EX::PANIC->throw("Couldn't BEGIN transaction!");
        }
        $commit = 1;
    }

    my $success = 1;
    try {
        OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.set', 'scrub_authority_record' );

        for my $table (qw/full_rec record_descriptor/) {
            OpenILS::Application::Ingest->storage_req(
                "open-ils.storage.direct.authority.$table.mass_delete",
                { record => $rec }
            );
        }

        OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.release', 'scrub_authority_record' );
    } otherwise {
        my $err = shift;
        $log->debug('Scrubbing failed : '.$err, ERROR);
        OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.rollback', 'scrub_authority_record' );
        $success = 0;
    };

    # only a transaction opened here is finished here
    if ($commit) {
        if ($success) {
            OpenILS::Application::Ingest->commit_transaction;
        } else {
            OpenILS::Application::Ingest->rollback_transaction;
        }
    }

    return $success;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.worm.scrub.authority",
    method          => "scrub_authority_record",
    api_level       => 1,
    argc            => 1,
);
583
584
# Delete the derived rows of a bibliographic record and repair the
# metarecords that pointed at it.
#
#   1. method object
#   2. client handle, used when a transaction has to be opened
#   3. record id, or a hash of search terms that is first resolved through
#      open-ils.storage.id_list.biblio.record_entry.search_where
#
# Works inside the caller's transaction when there is one; otherwise it
# opens its own and commits or rolls it back depending on the outcome.
# Everything runs under the savepoint 'scrub_metabib_record', which is rolled
# back on any error.  A metarecord whose master is the record gets the source
# of its first remaining map entry as the new master, or is deleted when no
# map entry remains.  Returns 1 on success and 0 on failure.
sub scrub_metabib_record {
    my ($self, $client, $rec) = @_;

    if (ref($rec) =~ /HASH/) {
        $rec = OpenILS::Application::Ingest->storage_req(
            'open-ils.storage.id_list.biblio.record_entry.search_where', $rec
        );
    }

    my $commit = 0;
    unless (OpenILS::Application::Ingest->in_transaction) {
        unless (OpenILS::Application::Ingest->begin_transaction($client)) {
            OpenSRF::EX::PANIC->throw("Couldn't BEGIN transaction!");
        }
        $commit = 1;
    }

    # [ metabib table, column that holds the record id ], in deletion order
    my @purge = (
        [ full_rec              => 'record' ],
        [ metarecord_source_map => 'source' ],
        [ record_descriptor     => 'record' ],
        [ title_field_entry     => 'source' ],
        [ author_field_entry    => 'source' ],
        [ subject_field_entry   => 'source' ],
        [ keyword_field_entry   => 'source' ],
        [ series_field_entry    => 'source' ],
    );

    my $success = 1;
    try {
        OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.set', 'scrub_metabib_record' );

        for my $target (@purge) {
            my ($table, $column) = @$target;
            OpenILS::Application::Ingest->storage_req(
                "open-ils.storage.direct.metabib.$table.mass_delete",
                { $column => $rec }
            );
        }

        $log->debug( "Looking for metarecords whose master is $rec", DEBUG);
        my $masters = OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.metabib.metarecord.search.master_record.atomic', $rec );

        for my $mr (@$masters) {
            $log->debug( "Found metarecord whose master is $rec", DEBUG);
            my $others = OpenILS::Application::Ingest->storage_req(
                'open-ils.storage.direct.metabib.metarecord_source_map.search.metarecord.atomic', $mr->id );

            # no other source left: the metarecord goes away
            unless (@$others) {
                warn "Removing metarecord whose master is $rec";
                $log->debug( "Removing metarecord whose master is $rec", DEBUG);
                OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.metabib.metarecord.delete', $mr->id );
                warn "Metarecord removed";
                $log->debug( "Metarecord removed", DEBUG);
                next;
            }

            # otherwise the first remaining source becomes the master
            my $new_master = $others->[0]->source;
            $log->debug("Metarecord ".$mr->id." had master of $rec, setting to ".$new_master, DEBUG);
            $mr->master_record($new_master);
            OpenILS::Application::Ingest->storage_req(
                'open-ils.storage.direct.metabib.metarecord.remote_update',
                { id => $mr->id },
                { master_record => $new_master, mods => undef }
            );
        }

        OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.release', 'scrub_metabib_record' );

    } otherwise {
        my $err = shift;
        $log->debug('Scrubbing failed : '.$err, ERROR);
        OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.rollback', 'scrub_metabib_record' );
        $success = 0;
    };

    # only a transaction opened here is finished here
    if ($commit) {
        if ($success) {
            OpenILS::Application::Ingest->commit_transaction;
        } else {
            OpenILS::Application::Ingest->rollback_transaction;
        }
    }

    return $success;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.worm.scrub.biblio",
    method          => "scrub_metabib_record",
    api_level       => 1,
    argc            => 1,
);
658
# Re-ingest every bibliographic record that is mapped to a metarecord.
#
#   $self   - method object, passed on to wormize_biblio_record
#   $client - client handle; one status hash per record is sent through it
#   $mrec   - metarecord id
#
# The source map entries of the metarecord are fetched and each source
# record is handed to wormize_biblio_record.  For every record a hash with
# the keys record, metarecord and success is sent; when the ingest raised an
# error the hash also carries it under the key error.  Always returns undef.
sub wormize_biblio_metarecord {
    my $self = shift;
    my $client = shift;
    my $mrec = shift;

    my $recs = OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.metabib.metarecord_source_map.search.metarecord.atomic' => $mrec );

    for my $r (@$recs) {
        my $success = 0;
        try {
            $success = wormize_biblio_record($self => $client => $r->source);
            # $r is the map entry in hand; no $rec exists in this sub
            $client->respond(
                { record  => $r->source,
                  metarecord => $r->metarecord,
                  success => $success,
                }
            );
        } catch Error with {
            my $e = shift;
            $client->respond(
                { record  => $r->source,
                  metarecord => $r->metarecord,
                  success => $success,
                  error   => $e,
                }
            );
        };
    }
    return undef;
}
__PACKAGE__->register_method(
    api_name        => "open-ils.worm.wormize.metarecord",
    method          => "wormize_biblio_metarecord",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
__PACKAGE__->register_method(
    api_name        => "open-ils.worm.wormize.metarecord.nomap",
    method          => "wormize_biblio_metarecord",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
__PACKAGE__->register_method(
    api_name        => "open-ils.worm.wormize.metarecord.noscrub",
    method          => "wormize_biblio_metarecord",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
__PACKAGE__->register_method(
    api_name        => "open-ils.worm.wormize.metarecord.nomap.noscrub",
    method          => "wormize_biblio_metarecord",
    api_level       => 1,
    argc            => 1,
    stream          => 1,
);
718
719
720 sub wormize_biblio_record {
721         my $self = shift;
722         my $client = shift;
723         my $rec = shift;
724
725         if ( ref($rec) && ref($rec) =~ /HASH/o ) {
726                 $rec = OpenILS::Application::Ingest->storage_req(
727                         'open-ils.storage.id_list.biblio.record_entry.search_where', $rec
728                 );
729         }
730
731
732         my $commit = 0;
733         if (!OpenILS::Application::Ingest->in_transaction) {
734                 OpenILS::Application::Ingest->begin_transaction($client) || throw OpenSRF::EX::PANIC ("Couldn't BEGIN transaction!");
735                 $commit = 1;
736         }
737
738         my $success = 1;
739         try {
740                 # clean up the cruft
741                 unless ($self->api_name =~ /noscrub/o) {
742                         $self->method_lookup( 'open-ils.worm.scrub.biblio' )->run( $rec ) || throw OpenSRF::EX::PANIC ("Couldn't scrub record $rec!");
743                 }
744
745                 # now redo 'em
746                 my $bibs = OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.biblio.record_entry.search.id.atomic', $rec );
747
748                 my @full_rec = ();
749                 my @rec_descriptor = ();
750                 my %field_entry = (
751                         title   => [],
752                         author  => [],
753                         subject => [],
754                         keyword => [],
755                         series  => [],
756                 );
757                 my %metarecord = ();
758                 my @source_map = ();
759                 for my $r (@$bibs) {
760                         try {
761                                 OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.set', 'extract_data'.$r->id );
762
763                                 my $xml = $parser->parse_string($r->marc);
764
765                                 #update the fingerprint
766                                 my ($fp) = $self->method_lookup( 'open-ils.worm.fingerprint.marc' )->run( $xml );
767                                 OpenILS::Application::Ingest->storage_req(
768                                         'open-ils.storage.direct.biblio.record_entry.remote_update',
769                                         { id => $r->id },
770                                         { fingerprint => $fp->{fingerprint},
771                                           quality     => int($fp->{quality}) }
772                                 ) if ($fp->{fingerprint} ne $r->fingerprint || int($fp->{quality}) ne $r->quality);
773
774                                 # the full_rec stuff
775                                 for my $fr ( $self->method_lookup( 'open-ils.worm.flat_marc.biblio.xml' )->run( $xml ) ) {
776                                         $fr->record( $r->id );
777                                         push @full_rec, $fr;
778                                 }
779
780                                 # the rec_descriptor stuff
781                                 my ($rd) = $self->method_lookup( 'open-ils.worm.biblio_leader.xml' )->run( $xml );
782                                 $rd->record( $r->id );
783                                 push @rec_descriptor, $rd;
784                         
785                                 # the indexing field entry stuff
786                                 for my $class ( qw/title author subject keyword series/ ) {
787                                         for my $fe ( $self->method_lookup( 'open-ils.worm.field_entry.class.xml' )->run( $xml, $class ) ) {
788                                                 $fe->source( $r->id );
789                                                 push @{$field_entry{$class}}, $fe;
790                                         }
791                                 }
792
793                                 unless ($self->api_name =~ /nomap/o) {
794                                         my $mr = OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.metabib.metarecord.search.fingerprint.atomic', $fp->{fingerprint}  )->[0];
795                                 
796                                         unless ($mr) {
797                                                 $mr = Fieldmapper::metabib::metarecord->new;
798                                                 $mr->fingerprint( $fp->{fingerprint} );
799                                                 $mr->master_record( $r->id );
800                                                 $mr->id( OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.metabib.metarecord.create', $mr) );
801                                         }
802
803                                         my $mr_map = Fieldmapper::metabib::metarecord_source_map->new;
804                                         $mr_map->metarecord( $mr->id );
805                                         $mr_map->source( $r->id );
806                                         push @source_map, $mr_map;
807
808                                         $metarecord{$mr->id} = $mr;
809                                 }
810                                 OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.release', 'extract_data'.$r->id );
811                         } otherwise {
812                                 $log->debug('Data extraction failed for record '.$r->id.': '.shift(), ERROR);
813                                 OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.rollback', 'extract_data'.$r->id );
814                         };
815                 }
816                 
817
818                 if (@rec_descriptor) {
819                         OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.set', 'wormize_record' );
820
821                         OpenILS::Application::Ingest->storage_req(
822                                 'open-ils.storage.direct.metabib.metarecord_source_map.batch.create',
823                                 @source_map
824                         ) if (@source_map);
825
826                         for my $mr ( values %metarecord ) {
827                                 my $sources = OpenILS::Application::Ingest->storage_req(
828                                         'open-ils.storage.direct.metabib.metarecord_source_map.search.metarecord.atomic',
829                                         $mr->id
830                                 );
831
832                                 my $bibs = OpenILS::Application::Ingest->storage_req(
833                                         'open-ils.storage.direct.biblio.record_entry.search.id.atomic',
834                                         [ map { $_->source } @$sources ]
835                                 );
836
837                                 my $master = ( sort { $b->quality <=> $a->quality } @$bibs )[0];
838
839                                 OpenILS::Application::Ingest->storage_req(
840                                         'open-ils.storage.direct.metabib.metarecord.remote_update',
841                                         { id => $mr->id },
842                                         { master_record => $master->id, mods => undef }
843                                 );
844                         }
845
846                         OpenILS::Application::Ingest->storage_req(
847                                 'open-ils.storage.direct.metabib.record_descriptor.batch.create',
848                                 @rec_descriptor
849                         ) if (@rec_descriptor);
850
851                         OpenILS::Application::Ingest->storage_req(
852                                 'open-ils.storage.direct.metabib.full_rec.batch.create',
853                                 @full_rec
854                         ) if (@full_rec);
855
856                         OpenILS::Application::Ingest->storage_req(
857                                 'open-ils.storage.direct.metabib.title_field_entry.batch.create',
858                                 @{ $field_entry{title} }
859                         ) if (@{ $field_entry{title} });
860
861                         OpenILS::Application::Ingest->storage_req(
862                                 'open-ils.storage.direct.metabib.author_field_entry.batch.create',
863                                 @{ $field_entry{author} }
864                         ) if (@{ $field_entry{author} });
865                         
866                         OpenILS::Application::Ingest->storage_req(
867                                 'open-ils.storage.direct.metabib.subject_field_entry.batch.create',
868                                 @{ $field_entry{subject} }
869                         ) if (@{ $field_entry{subject} });
870
871                         OpenILS::Application::Ingest->storage_req(
872                                 'open-ils.storage.direct.metabib.keyword_field_entry.batch.create',
873                                 @{ $field_entry{keyword} }
874                         ) if (@{ $field_entry{keyword} });
875
876                         OpenILS::Application::Ingest->storage_req(
877                                 'open-ils.storage.direct.metabib.series_field_entry.batch.create',
878                                 @{ $field_entry{series} }
879                         ) if (@{ $field_entry{series} });
880
881                         OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.release', 'wormize_record' );
882                 } else {
883                         $success = 0;
884                 }
885
886         } otherwise {
887                 $log->debug('Wormization failed : '.shift(), ERROR);
888                 OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.rollback', 'wormize_record' );
889                 $success = 0;
890         };
891
892         OpenILS::Application::Ingest->commit_transaction if ($commit && $success);
893         OpenILS::Application::Ingest->rollback_transaction if ($commit && !$success);
894         return $success;
895 }
# Publish wormize_biblio_record under each of its API names.  The code
# directly above tests api_name against /noscrub/ and /nomap/, so the
# suffix chosen by the caller selects which of those steps are skipped.
for my $api_name (
        'open-ils.worm.wormize.biblio',
        'open-ils.worm.wormize.biblio.nomap',
        'open-ils.worm.wormize.biblio.noscrub',
        'open-ils.worm.wormize.biblio.nomap.noscrub',
) {
        __PACKAGE__->register_method(
                api_name        => $api_name,
                method          => 'wormize_biblio_record',
                api_level       => 1,
                argc            => 1,
        );
}
920
# Rebuild the flattened MARC rows for an authority record.
#
# Parameters (after the usual $self and $client):
#   $rec - value handed to the authority record_entry id search
#
# Unless the method was invoked under a "noscrub" API name, the record is
# first scrubbed via open-ils.worm.scrub.authority.  The MARC of every
# record returned by the search is then flattened, and the resulting rows
# are batch-created inside the 'wormize_authority_record' savepoint.
#
# If no transaction is open one is begun here, and it is committed on
# success or rolled back on failure; otherwise the caller's transaction is
# left open.
#
# Returns 1 on success and 0 if anything inside the try block threw.
sub wormize_authority_record {
        my ($self, $client, $rec) = @_;

        my $commit = 0;
        if (!OpenILS::Application::Ingest->in_transaction) {
                OpenILS::Application::Ingest->begin_transaction($client)
                        || throw OpenSRF::EX::PANIC ("Couldn't BEGIN transaction!");
                $commit = 1;
        }

        my $success = 1;

        # Becomes true once the savepoint exists, so that the error handler
        # never asks storage to roll back to a savepoint that was never set
        # (e.g. when the scrub or the MARC parse is what failed).
        my $savepoint_set = 0;

        try {
                # clean up the cruft
                unless ($self->api_name =~ /noscrub/o) {
                        $self->method_lookup( 'open-ils.worm.scrub.authority' )->run( $rec )
                                || throw OpenSRF::EX::PANIC ("Couldn't scrub record $rec!");
                }

                # now redo 'em
                my $records = OpenILS::Application::Ingest->storage_req(
                        'open-ils.storage.direct.authority.record_entry.search.id.atomic',
                        $rec
                );

                my @full_rec = ();
                for my $r (@$records) {
                        my $xml = $parser->parse_string($r->marc);

                        # the full_rec stuff
                        for my $fr ( $self->method_lookup( 'open-ils.worm.flat_marc.authority.xml' )->run( $xml ) ) {
                                $fr->record( $r->id );
                                push @full_rec, $fr;
                        }

                        # XXX No record descriptor is built for authority records:
                        # it is an open question what the rec_descriptor stuff
                        # would mean here, so neither a leader extraction nor an
                        # authority record_descriptor batch create is performed.
                }

                OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.set', 'wormize_authority_record' );
                $savepoint_set = 1;

                OpenILS::Application::Ingest->storage_req(
                        'open-ils.storage.direct.authority.full_rec.batch.create',
                        @full_rec
                ) if (@full_rec);

                OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.release', 'wormize_authority_record' );

        } otherwise {
                $log->debug('Wormization failed : '.shift(), ERROR);
                OpenILS::Application::Ingest->storage_req( 'open-ils.storage.savepoint.rollback', 'wormize_authority_record' )
                        if ($savepoint_set);
                $success = 0;
        };

        OpenILS::Application::Ingest->commit_transaction if ($commit && $success);
        OpenILS::Application::Ingest->rollback_transaction if ($commit && !$success);
        return $success;
}

# The ".noscrub" name skips the initial scrub step (see the api_name test above).
for my $api_name (
        'open-ils.worm.wormize.authority',
        'open-ils.worm.wormize.authority.noscrub',
) {
        __PACKAGE__->register_method(
                api_name        => $api_name,
                method          => 'wormize_authority_record',
                api_level       => 1,
                argc            => 1,
        );
}
989
990
991 # --------------------------------------------------------------------------------
992 # MARC index extraction
993
994 package OpenILS::Application::Ingest::XPATH;
995 use base qw/OpenILS::Application::Ingest/;
996 use Unicode::Normalize;
997
# Collect the text matched by an XPath expression into a single NFD string.
#
#   $xml       - element to search (callers pass a documentElement)
#   $xpath     - XPath expression handed to findnodes()
#   $ns_uri    - optional namespace URI
#   $ns_prefix - optional namespace prefix; when both URI and prefix are
#                true they are applied to $xml with setNamespace() first
#   $unique    - when true, a child's text is skipped if it already occurs
#                (as a literal substring) in the text gathered so far
#
# The text of every child of every matched node is appended, each followed
# by one space; a matched node without children contributes its own text.
# Returns the accumulated string run through NFD().
sub _xpath_to_string {
        my ($xml, $xpath, $ns_uri, $ns_prefix, $unique) = @_;

        $xml->setNamespace( $ns_uri, $ns_prefix, 1 ) if ($ns_uri && $ns_prefix);

        my $string = "";

        # walk the set of matching nodes
        for my $node ( $xml->findnodes( $xpath ) ) {

                my @children = $node->childNodes();

                # a childless match supplies its own text
                unless (@children) {
                        $string .= $node->textContent . " ";
                        next;
                }

                for my $child (@children) {
                        my $text = $child->textContent;

                        # Uniquify with a literal substring test.  index() is used
                        # rather than a regex because an empty $text would turn
                        # /$text/ into the empty pattern, which Perl replaces with
                        # the last successfully matched regex.
                        next if ($unique && index($string, $text) >= 0);

                        # add the child's content to the growing buffer
                        $string .= $text . " ";
                }
        }

        return NFD($string);
}
1029
# Stream one field entry object per configured index type of a search class.
#
# Parameters (after the usual $self and $client):
#   $xml   - an already parsed document, or a string that is parsed here
#   $class - search class name; selects both the $xpathset entry and the
#            Fieldmapper::metabib::<class>_field_entry class
#
# For every index type in $xpathset->{$class} the XPath is evaluated against
# the MODS rendering of the record.  Non-empty results have marks (\pM) and
# "other" characters (\pC) removed, periods that follow a word character
# dropped, and are lower-cased before being sent back with respond().
sub class_all_index_string_xml {
        my ($self, $client, $xml, $class) = @_;

        OpenILS::Application::Ingest->post_init();
        $xml = $parser->parse_string($xml) unless (ref $xml);

        my $class_constructor = "Fieldmapper::metabib::${class}_field_entry";

        # The MODS transform does not depend on the index type, so it is run
        # at most once, and only if there is at least one type to extract.
        my $mods_doc;

        for my $type ( keys %{ $xpathset->{$class} } ) {
                $mods_doc = $mods_sheet->transform($xml) unless (defined $mods_doc);

                my $index_def = $xpathset->{$class}->{$type};

                my $value = _xpath_to_string(
                                $mods_doc->documentElement,
                                $index_def->{xpath},
                                "http://www.loc.gov/mods/",
                                "mods",
                                1
                );

                next unless $value;

                $value =~ s/\pM+//sgo;
                $value =~ s/\pC+//sgo;
                #$value =~ s/[\x{0080}-\x{fffd}]//sgoe;

                $value =~ s/(\w)\./$1/sgo;
                $value = lc($value);

                my $fm = $class_constructor->new;
                $fm->value( $value );
                $fm->field( $index_def->{id} );
                $client->respond($fm);
        }
        return undef;
}
__PACKAGE__->register_method(
        api_name        => 'open-ils.worm.field_entry.class.xml',
        method          => 'class_all_index_string_xml',
        api_level       => 1,
        argc            => 1,
        stream          => 1,
);
1072
# Record-id front end for open-ils.worm.field_entry.class.xml: retrieves
# the bib record $rec from storage, extracts the field entries for $class
# from its MARC, stamps each one with $rec as its source and streams it
# back through respond().
sub class_all_index_string_record {
        my ($self, $client, $rec, $class) = @_;

        OpenILS::Application::Ingest->post_init();

        my $bib = OpenILS::Application::Ingest->storage_req(
                "open-ils.storage.direct.biblio.record_entry.retrieve" => $rec
        );

        my $extractor = $self->method_lookup("open-ils.worm.field_entry.class.xml");
        my @entries = $extractor->run($bib->marc, $class);

        for my $entry (@entries) {
                $entry->source($rec);
                $client->respond($entry);
        }
        return undef;
}
__PACKAGE__->register_method(
        api_name        => 'open-ils.worm.field_entry.class.record',
        method          => 'class_all_index_string_record',
        api_level       => 1,
        argc            => 1,
        stream          => 1,
);
1095
1096
# Evaluate a single configured index ($class / $type) against the MODS
# rendering of a record and return the resulting string.  $xml may be an
# already parsed document or a string, which is parsed here.
sub class_index_string_xml {
        my ($self, $client, $xml, $class, $type) = @_;

        OpenILS::Application::Ingest->post_init();
        $xml = $parser->parse_string($xml) unless (ref $xml);

        my $mods_doc = $mods_sheet->transform($xml);
        my $xpath    = $xpathset->{$class}->{$type}->{xpath};

        return _xpath_to_string(
                $mods_doc->documentElement,
                $xpath,
                "http://www.loc.gov/mods/",
                "mods",
                1
        );
}
__PACKAGE__->register_method(
        api_name        => 'open-ils.worm.class.type.xml',
        method          => 'class_index_string_xml',
        api_level       => 1,
        argc            => 1,
);
1114
# Record-id front end for open-ils.worm.class.type.xml: retrieves the bib
# record $rec from storage, runs the $class / $type index over its MARC,
# logs the result at DEBUG level and returns it.
sub class_index_string_record {
        my ($self, $client, $rec, $class, $type) = @_;

        OpenILS::Application::Ingest->post_init();

        my $bib = OpenILS::Application::Ingest->storage_req(
                "open-ils.storage.direct.biblio.record_entry.retrieve" => $rec
        );

        my $extractor = $self->method_lookup("open-ils.worm.class.type.xml");
        my ($string) = $extractor->run($bib->marc, $class => $type);

        $log->debug("XPath $class->$type for bib rec $rec returns ($string)", DEBUG);
        return $string;
}
__PACKAGE__->register_method(
        api_name        => 'open-ils.worm.class.type.record',
        method          => 'class_index_string_record',
        api_level       => 1,
        argc            => 1,
);
1135
# Run an arbitrary XPath over a document and return the matched text.
#
#   $xml             - parsed document, or a string that is parsed here
#   $xpath           - XPath expression
#   $uri, $prefix    - optional namespace URI and prefix
#   $unique          - passed through as the uniquify flag
#
# The last four values are handed unchanged to _xpath_to_string together
# with the document's root element.
sub xml_xpath {
        my ($self, $client, $xml, $xpath, $uri, $prefix, $unique) = @_;

        OpenILS::Application::Ingest->post_init();

        my $doc = ref($xml) ? $xml : $parser->parse_string($xml);
        return _xpath_to_string( $doc->documentElement, $xpath, $uri, $prefix, $unique );
}
__PACKAGE__->register_method(
        api_name        => 'open-ils.worm.xpath.xml',
        method          => 'xml_xpath',
        api_level       => 1,
        argc            => 1,
);
1155
# Record-id front end for open-ils.worm.xpath.xml: retrieves the bib record
# $rec from storage, evaluates $xpath (with the optional namespace $uri /
# $prefix and the $unique flag) over its MARC, logs the result at DEBUG
# level and returns it.
sub record_xpath {
        my ($self, $client, $rec, $xpath, $uri, $prefix, $unique) = @_;

        OpenILS::Application::Ingest->post_init();

        my $bib = OpenILS::Application::Ingest->storage_req(
                "open-ils.storage.direct.biblio.record_entry.retrieve" => $rec
        );

        my $evaluator = $self->method_lookup("open-ils.worm.xpath.xml");
        my ($string) = $evaluator->run($bib->marc, $xpath, $uri, $prefix, $unique );

        $log->debug("XPath [$xpath] bib rec $rec returns ($string)", DEBUG);
        return $string;
}
__PACKAGE__->register_method(
        api_name        => 'open-ils.worm.xpath.record',
        method          => 'record_xpath',
        api_level       => 1,
        argc            => 1,
);
1178
1179
1180 # --------------------------------------------------------------------------------
1181 # MARC Descriptor
1182
1183 package OpenILS::Application::Ingest::Biblio::Leader;
1184 use base qw/OpenILS::Application::Ingest/;
1185 use Unicode::Normalize;
1186
# Record-type codes grouped under short material-type names.  Each value is
# a regex fragment (a one-character class); the descriptor code in this
# package tests character 6 of the MARC leader against these groups.
our %marc_type_groups = (
        BKS => q/[at]{1}/,
        SER => q/[a]{1}/,
        VIS => q/[gkro]{1}/,
        MIX => q/[p]{1}/,
        MAP => q/[ef]{1}/,
        SCO => q/[cd]{1}/,
        REC => q/[ij]{1}/,
        COM => q/[m]{1}/,
);

# Build a compiled regex that matches a string consisting of exactly one
# code from any of the named groups, e.g. _type_re(qw/MAP VIS/).
#
# The groups are fetched with a hash slice; a plain $hash{@_} lookup would
# use the number of arguments as the key.  Unknown group names are ignored,
# and the alternation is wrapped in (?:...) so that both anchors apply to
# every alternative.
sub _type_re {
        my @fragments = grep { defined } @marc_type_groups{@_};
        my $re = '^(?:'. join('|', @fragments) .')$';
        return qr/$re/;
}
1202
# Extractors for the record descriptor, keyed by descriptor field name.
# Every closure takes no arguments and reads the package variables $ldr and
# $oo8, which _extract_biblio_descriptors localizes (from the leader and
# the 008 control field) before invoking them.
our %biblio_descriptor_code = (

        # single characters taken straight from the leader
        pub_status      => sub { return substr($ldr, 5, 1) },
        item_type       => sub { return substr($ldr, 6, 1) },
        bib_level       => sub { return substr($ldr, 7, 1) },
        control_type    => sub { return substr($ldr, 8, 1) },
        char_encoding   => sub { return substr($ldr, 9, 1) },
        enc_level       => sub { return substr($ldr, 17, 1) },
        cat_form        => sub { return substr($ldr, 18, 1) },

        # values taken from the 008 regardless of record type
        audience        => sub { return substr($oo8, 22, 1) },
        item_lang       => sub { return substr($oo8, 35, 3) },

        # 008 values whose position (or presence) depends on leader character 6
        item_form       => sub {
                my $rec_type = substr($ldr, 6, 1);

                return substr($oo8, 29, 1)
                        if ($rec_type =~ _type_re( qw/MAP VIS/ ));

                return substr($oo8, 23, 1)
                        if ($rec_type =~ _type_re( qw/BKS SER MIX SCO REC/ ));

                return ' ';
        },
        lit_form        => sub {
                return undef unless (substr($ldr, 6, 1) =~ _type_re('BKS'));
                return substr($oo8, 33, 1);
        },
        type_mat        => sub {
                return undef unless (substr($ldr, 6, 1) =~ _type_re('VIS'));
                return substr($oo8, 33, 1);
        },
);
1225
# Build a metabib record_descriptor object from a parsed MARC XML document.
#
# The leader and the 007 / 008 control fields are pulled out into the
# localized package variables $ldr, $oo7 and $oo8; each extractor in
# %biblio_descriptor_code is then called and its result stored through the
# accessor of the same name on the descriptor object.
sub _extract_biblio_descriptors {
        my ($xml) = @_;

        local $ldr = $xml->findvalue('//*[local-name()="leader"]');
        local $oo8 = $xml->findvalue('//*[local-name()="controlfield" and @tag="008"]');
        local $oo7 = $xml->findvalue('//*[local-name()="controlfield" and @tag="007"]');

        my $descriptor = Fieldmapper::metabib::record_descriptor->new;

        for my $field ( keys %biblio_descriptor_code ) {
                my $extract = $biblio_descriptor_code{$field};
                $descriptor->$field( $extract->() );
        }

        return $descriptor;
}
1240
# API wrapper around _extract_biblio_descriptors.  $xml may be an already
# parsed document or a string, which is parsed here.  Returns the record
# descriptor object.
sub extract_biblio_desc_xml {
        my ($self, $client, $xml) = @_;

        my $doc = ref($xml) ? $xml : $parser->parse_string($xml);

        return _extract_biblio_descriptors( $doc );
}
__PACKAGE__->register_method(
        api_name        => 'open-ils.worm.biblio_leader.xml',
        method          => 'extract_biblio_desc_xml',
        api_level       => 1,
        argc            => 1,
);
1256
# Record-id front end for open-ils.worm.biblio_leader.xml: retrieves the
# bib record $rec from storage, builds the record descriptor from its MARC,
# logs it (JSON-encoded) at DEBUG level and returns it.
sub extract_biblio_desc_record {
        my ($self, $client, $rec) = @_;

        OpenILS::Application::Ingest->post_init();

        my $bib = OpenILS::Application::Ingest->storage_req(
                "open-ils.storage.direct.biblio.record_entry.retrieve" => $rec
        );

        my $extractor = $self->method_lookup("open-ils.worm.biblio_leader.xml");
        my ($descriptor) = $extractor->run($bib->marc);

        $log->debug("Record descriptor for bib rec $rec is ".JSON->perl2JSON($descriptor), DEBUG);
        return $descriptor;
}
__PACKAGE__->register_method(
        api_name        => 'open-ils.worm.biblio_leader.record',
        method          => 'extract_biblio_desc_record',
        api_level       => 1,
        argc            => 1,
);
1275
1276 # --------------------------------------------------------------------------------
1277 # Flat MARC
1278
1279 package OpenILS::Application::Ingest::FlatMARC;
1280 use base qw/OpenILS::Application::Ingest/;
1281 use Unicode::Normalize;
1282
1283
# Flatten a parsed MARC XML document into a list of full_rec objects.
#
#   $marcxml - parsed document containing a "record" element
#   $xmltype - Fieldmapper namespace of the row class; defaults to 'metabib'
#
# Rows are produced in this order: one per leader (tag 'LDR'), one per
# controlfield, then one per subfield of every datafield (carrying the
# datafield's tag and indicators plus the subfield code).  All values are
# NFD-normalized with marks (\pM) removed; subfield values are additionally
# lower-cased.  Returns the list of row objects.
sub _marcxml_to_full_rows {
        my ($marcxml, $xmltype) = @_;
        $xmltype ||= 'metabib';

        my $row_class = "Fieldmapper::${xmltype}::full_rec";

        # decompose, then drop the combining marks
        my $strip_marks = sub {
                my $text = NFD($_[0]);
                $text =~ s/(\pM+)//gso;
                return $text;
        };

        my @rows;

        my ($record) = $marcxml->findnodes('//*[local-name()="record"]');

        for my $leader ( @{ $record->getChildrenByTagName("leader") } ) {
                next unless $leader;

                my $row = $row_class->new;
                $row->tag( 'LDR' );
                $row->value( $strip_marks->( $leader->textContent ) );

                push @rows, $row;
        }

        for my $control ( @{ $record->getChildrenByTagName("controlfield") } ) {
                next unless $control;

                my $row = $row_class->new;
                $row->tag( $control->getAttribute( "tag" ) );
                $row->value( $strip_marks->( $control->textContent ) );

                push @rows, $row;
        }

        for my $datafield ( @{ $record->getChildrenByTagName("datafield") } ) {
                next unless $datafield;

                my ($tag, $ind1, $ind2) =
                        map { $datafield->getAttribute( $_ ) } ("tag", "ind1", "ind2");

                for my $subfield ( @{ $datafield->getChildrenByTagName('subfield') } ) {
                        next unless $subfield;

                        my $row = $row_class->new;
                        $row->tag( $tag );
                        $row->ind1( $ind1 );
                        $row->ind2( $ind2 );
                        $row->subfield( $subfield->getAttribute( "code" ) );
                        $row->value( lc( $strip_marks->( $subfield->textContent ) ) );

                        push @rows, $row;
                }
        }

        $log->debug("Returning ".scalar(@rows)." Fieldmapper nodes from $xmltype xml", DEBUG);
        return @rows;
}
1351
# Stream the flattened rows of a MARC XML record.  $xml may be an already
# parsed document or a string, which is parsed here.  Rows are built as
# 'authority' full_rec objects when the API name contains "authority" and
# as 'metabib' ones otherwise.
sub flat_marc_xml {
        my ($self, $client, $xml) = @_;

        $xml = $parser->parse_string($xml) unless (ref $xml);

        my $type = ($self->api_name =~ /authority/o) ? 'authority' : 'metabib';

        OpenILS::Application::Ingest->post_init();

        for my $row ( _marcxml_to_full_rows($xml, $type) ) {
                $client->respond($row);
        }
        return undef;
}
for my $api_name (
        'open-ils.worm.flat_marc.authority.xml',
        'open-ils.worm.flat_marc.biblio.xml',
) {
        __PACKAGE__->register_method(
                api_name        => $api_name,
                method          => 'flat_marc_xml',
                api_level       => 1,
                argc            => 1,
                stream          => 1,
        );
}
1381
# Record-id front end for the flat_marc ".xml" methods: retrieves record
# $rec from storage (the authority record_entry when the API name contains
# "authority", the biblio one otherwise), flattens its MARC with the
# matching open-ils.worm.flat_marc.<type>.xml method and streams the rows.
sub flat_marc_record {
        my ($self, $client, $rec) = @_;

        my $type = ($self->api_name =~ /authority/o) ? 'authority' : 'biblio';

        OpenILS::Application::Ingest->post_init();

        my $entry = OpenILS::Application::Ingest->storage_req(
                "open-ils.storage.direct.${type}.record_entry.retrieve" => $rec
        );

        my $flattener = $self->method_lookup("open-ils.worm.flat_marc.$type.xml");
        for my $row ( $flattener->run($entry->marc) ) {
                $client->respond($row);
        }
        return undef;
}
for my $api_name (
        'open-ils.worm.flat_marc.biblio.record_entry',
        'open-ils.worm.flat_marc.authority.record_entry',
) {
        __PACKAGE__->register_method(
                api_name        => $api_name,
                method          => 'flat_marc_record',
                api_level       => 1,
                argc            => 1,
                stream          => 1,
        );
}
1410
1411
1412 # --------------------------------------------------------------------------------
1413 # Fingerprinting
1414
1415 package OpenILS::Application::Ingest::Biblio::Fingerprint;
1416 use base qw/OpenILS::Application::Ingest/;
1417 use Unicode::Normalize;
1418 use OpenSRF::EX qw/:try/;
1419
1420 my @fp_mods_xpath = (
1421         '//mods:mods/mods:typeOfResource[text()="text"]' => [
1422                         title   => {
1423                                         xpath   => [
1424                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="uniform")]',
1425                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="translated")]',
1426                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="alternative")]',
1427                                                         '//mods:mods/mods:titleInfo[mods:title and not(@type)]',
1428                                         ],
1429                                         fixup   => sub {
1430                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1431                                                         $text = NFD($text);
1432                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1433                                                         $text =~ s/\pM+//gso;
1434                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1435                                                         $text = lc($text);
1436                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1437                                                         $text =~ s/\s+/ /sgo;
1438                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1439                                                         $text =~ s/^\s*(.+)\s*$/$1/sgo;
1440                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1441                                                         $text =~ s/\b(?:the|an?)\b//sgo;
1442                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1443                                                         $text =~ s/\[.[^\]]+\]//sgo;
1444                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1445                                                         $text =~ s/\s*[;\/\.]*$//sgo;
1446                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1447                                                 },
1448                         },
1449                         author  => {
1450                                         xpath   => [
1451                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
1452                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
1453                                         ],
1454                                         fixup   => sub {
1455                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1456                                                         $text = NFD($text);
1457                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1458                                                         $text =~ s/\pM+//gso;
1459                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1460                                                         $text = lc($text);
1461                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1462                                                         $text =~ s/\s+/ /sgo;
1463                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1464                                                         $text =~ s/^\s*(.+)\s*$/$1/sgo;
1465                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1466                                                         $text =~ s/,?\s+.*$//sgo;
1467                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1468                                                 },
1469                         },
1470         ],
1471
1472         '//mods:mods/mods:relatedItem[@type!="host" and @type!="series"]' => [
1473                         title   => {
1474                                         xpath   => [
1475                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="uniform")]',
1476                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="translated")]',
1477                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="alternative")]',
1478                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and not(@type)]',
1479                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="uniform")]',
1480                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="translated")]',
1481                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="alternative")]',
1482                                                         '//mods:mods/mods:titleInfo[mods:title and not(@type)]',
1483                                         ],
1484                                         fixup   => sub {
1485                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1486                                                         $text = NFD($text);
1487                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1488                                                         $text =~ s/\pM+//gso;
1489                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1490                                                         $text = lc($text);
1491                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1492                                                         $text =~ s/\s+/ /sgo;
1493                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1494                                                         $text =~ s/^\s*(.+)\s*$/$1/sgo;
1495                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1496                                                         $text =~ s/\b(?:the|an?)\b//sgo;
1497                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1498                                                         $text =~ s/\[.[^\]]+\]//sgo;
1499                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1500                                                         $text =~ s/\s*[;\/\.]*$//sgo;
1501                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1502                                                 },
1503                         },
1504                         author  => {
1505                                         xpath   => [
1506                                                         '//mods:mods/mods:relatedItem/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
1507                                                         '//mods:mods/mods:relatedItem/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
1508                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
1509                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
1510                                         ],
1511                                         fixup   => sub {
1512                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1513                                                         $text = NFD($text);
1514                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1515                                                         $text =~ s/\pM+//gso;
1516                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1517                                                         $text = lc($text);
1518                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1519                                                         $text =~ s/\s+/ /sgo;
1520                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1521                                                         $text =~ s/^\s*(.+)\s*$/$1/sgo;
1522                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1523                                                         $text =~ s/,?\s+.*$//sgo;
1524                                                         $log->debug("Fingerprint text /durring/ fixup : [$text]", INTERNAL);
1525                                                 },
1526                         },
1527         ],
1528
1529 );
1530
# _fp_mods() reads @fp_mods_xpath as alternating (match xpath, part block)
# entries.  Append one more pair: a bare titleInfo match that reuses the
# part block already stored at index 1.
push(
        @fp_mods_xpath,
        '//mods:mods/mods:titleInfo',
        $fp_mods_xpath[1],
);
1532
# Build a fingerprint string from a MODS element.
#
# @fp_mods_xpath is consumed as alternating (match xpath, part block)
# entries, and each part block as alternating (part name, part definition)
# entries.  For the first match xpath that finds nodes and yields any text,
# the text of every part (first xpath in the part's list returning a true
# value, run through the part's fixup callback) is concatenated, stripped
# of non-word characters and returned.  Returns undef when nothing produced
# a fingerprint.
#
# The fixup callbacks take no arguments; they operate on the package
# variable $text, which is localized here for each part.
sub _fp_mods {
        my ($mods) = @_;
        $mods->setNamespace( 'http://www.loc.gov/mods/', 'mods', 1 );

        my $fingerprint = '';

        # Stop at the first false entry, just like walking off the end.
        for ( my $pair = 0; $fp_mods_xpath[$pair]; $pair += 2 ) {

                # List-context call; we only care how many nodes came back.
                my $hits = () = $mods->findnodes( $fp_mods_xpath[$pair] );

                # Nothing was appended on this pass, so the fingerprint test
                # below could not succeed either; move on to the next pair.
                next unless ($hits);

                my $block = $fp_mods_xpath[ $pair + 1 ];

                for ( my $name_at = 0; $$block[ $name_at + 1 ]; $name_at += 2 ) {
                        my $part = $$block[ $name_at + 1 ];

                        local $text;
                        for my $candidate ( @{ $part->{xpath} } ) {
                                $text = $mods->findvalue( $candidate );
                                last if ($text);
                        }

                        $log->debug("Found fingerprint text using $$block[$name_at] : [$text]", DEBUG);

                        next unless ($text);

                        $part->{fixup}->();
                        $log->debug("Fingerprint text after fixup : [$text]", DEBUG);
                        $fingerprint .= $text;
                }

                if ($fingerprint) {
                        $fingerprint =~ s/\W+//gso;
                        $log->debug("Fingerprint is [$fingerprint]", INFO);
                        return $fingerprint;
                }
        }

        return undef;
}
1577
# Recompute the fingerprint and quality of bibliographic record entries.
#
# Arguments (after $self):
#   $client - request client; the id of every processed record is sent
#             back through $client->respond
#   $rec    - search value handed to
#             open-ils.storage.direct.biblio.record_entry.search.id.atomic
#
# For each record whose stored fingerprint or quality differs from the
# freshly computed one, the record entry is updated.  Unless the method was
# invoked under an API name containing "nomap", the record's existing
# metarecord source maps are removed, the old metarecord is deleted when no
# source map refers to it any more, and the record is mapped to the
# metarecord carrying the new fingerprint (created if none exists).
#
# If no storage transaction is active one is started here; it is committed
# when every record was processed and rolled back when anything inside the
# try block failed.  Always returns undef.
sub refingerprint_bibrec {
        my $self = shift;
        my $client = shift;
        my $rec = shift;

        my $commit = 0;
        if (!OpenILS::Application::Ingest->in_transaction) {
                OpenILS::Application::Ingest->begin_transaction($client) || throw OpenSRF::EX::PANIC ("Couldn't BEGIN transaction!");
                $commit = 1;
        }

        my $success = 1;
        try {
                my $bibs = OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.biblio.record_entry.search.id.atomic', $rec );
                for my $bib (@$bibs) {
                        my ($fp) = $self->method_lookup( 'open-ils.worm.fingerprint.marc' )->run( $bib->marc );

                        # The fingerprint method returns a plain 0 when its
                        # script fails.  Dereferencing that would store an
                        # empty fingerprint, so fail the whole run instead;
                        # the handler below logs it and triggers a rollback.
                        die "No fingerprint data returned for record ".$bib->id."\n"
                                unless (ref $fp);

                        if ($bib->fingerprint ne $fp->{fingerprint} || $bib->quality != $fp->{quality}) {

                                $log->debug("Updating ".$bib->id." with fingerprint [$fp->{fingerprint}], quality [$fp->{quality}]", INFO);

                                OpenILS::Application::Ingest->storage_req(
                                        'open-ils.storage.direct.biblio.record_entry.remote_update',
                                        { id => $bib->id },
                                        { fingerprint => $fp->{fingerprint},
                                          quality     => $fp->{quality} }
                                );

                                if ($self->api_name !~ /nomap/o) {
                                        my $old_source_map = OpenILS::Application::Ingest->storage_req(
                                                'open-ils.storage.direct.metabib.metarecord_source_map.search.source.atomic',
                                                $bib->id
                                        );

                                        # Drop every existing map for this record,
                                        # remembering the metarecord of the last one.
                                        my $old_mrid;
                                        if (ref($old_source_map) and @$old_source_map) {
                                                for my $m (@$old_source_map) {
                                                        $old_mrid = $m->metarecord;
                                                        OpenILS::Application::Ingest->storage_req(
                                                                'open-ils.storage.direct.metabib.metarecord_source_map.delete',
                                                                $m->id
                                                        );
                                                }
                                        }

                                        # Declared separately from the conditional
                                        # assignment: "my ... if COND" has undefined
                                        # behaviour and could leak the previous
                                        # record's result into this iteration.
                                        my $old_sm;
                                        $old_sm = OpenILS::Application::Ingest->storage_req(
                                                        'open-ils.storage.direct.metabib.metarecord_source_map.search.atomic',
                                                        { metarecord => $old_mrid }
                                        ) if ($old_mrid);

                                        # The old metarecord has no sources left.
                                        if (ref($old_sm) and @$old_sm == 0) {
                                                OpenILS::Application::Ingest->storage_req(
                                                        'open-ils.storage.direct.metabib.metarecord.delete',
                                                        $old_mrid
                                                );
                                        }

                                        my $mr = OpenILS::Application::Ingest->storage_req(
                                                        'open-ils.storage.direct.metabib.metarecord.search.fingerprint.atomic',
                                                        { fingerprint => $fp->{fingerprint} }
                                        )->[0];

                                        unless ($mr) {
                                                $mr = Fieldmapper::metabib::metarecord->new;
                                                $mr->fingerprint( $fp->{fingerprint} );
                                                $mr->master_record( $bib->id );
                                                $mr->id( OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.metabib.metarecord.create', $mr) );
                                        }

                                        my $mr_map = Fieldmapper::metabib::metarecord_source_map->new;
                                        $mr_map->metarecord( $mr->id );
                                        $mr_map->source( $bib->id );
                                        OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.metabib.metarecord_source_map.create', $mr_map );

                                }
                        }
                        $client->respond($bib->id);
                }

        } otherwise {
                $log->debug('Fingerprinting failed : '.shift(), ERROR);
                $success = 0;
        };

        # Only finish a transaction that was opened by this call.
        OpenILS::Application::Ingest->commit_transaction if ($commit && $success);
        OpenILS::Application::Ingest->rollback_transaction if ($commit && !$success);
        return undef;
}
__PACKAGE__->register_method(  
        api_name        => "open-ils.worm.fingerprint.record.update",
        method          => "refingerprint_bibrec",
        api_level       => 1,
        argc            => 1,
        stream          => 1,
);                      

# Same implementation; the "nomap" in the API name makes it skip the
# metarecord remapping step.
__PACKAGE__->register_method(  
        api_name        => "open-ils.worm.fingerprint.record.update.nomap",
        method          => "refingerprint_bibrec",
        api_level       => 1,
        argc            => 1,
);                      
1680
1681 =comment
1682
1683 sub fingerprint_bibrec {
1684         my $self = shift;
1685         my $client = shift;
1686         my $rec = shift;
1687
1688         OpenILS::Application::Ingest->post_init();
1689         my $r = OpenILS::Application::Ingest->storage_req( 'open-ils.storage.direct.biblio.record_entry.retrieve' => $rec );
1690
1691         my ($fp) = $self->method_lookup('open-ils.worm.fingerprint.marc')->run($r->marc);
1692         $log->debug("Returning [$fp] as fingerprint for record $rec", INFO);
1693         return $fp;
1694
1695 }
1696 __PACKAGE__->register_method(  
1697         api_name        => "open-ils.worm.fingerprint.record",
1698         method          => "fingerprint_bibrec",
1699         api_level       => 0,
1700         argc            => 1,
1701 );                      
1702
1703
1704 sub fingerprint_mods {
1705         my $self = shift;
1706         my $client = shift;
1707         my $xml = shift;
1708
1709         OpenILS::Application::Ingest->post_init();
1710         my $mods = $parser->parse_string($xml)->documentElement;
1711
1712         return _fp_mods( $mods );
1713 }
1714 __PACKAGE__->register_method(  
1715         api_name        => "open-ils.worm.fingerprint.mods",
1716         method          => "fingerprint_mods",
1717         api_level       => 1,
1718         argc            => 1,
1719 );                      
1720
1721 sub fingerprint_marc {
1722         my $self = shift;
1723         my $client = shift;
1724         my $xml = shift;
1725
1726         $xml = $parser->parse_string($xml) unless (ref $xml);
1727
1728         OpenILS::Application::Ingest->post_init();
1729         my $fp = _fp_mods( $mods_sheet->transform($xml)->documentElement );
1730         $log->debug("Returning [$fp] as fingerprint", INFO);
1731         return $fp;
1732 }
1733 __PACKAGE__->register_method(  
1734         api_name        => "open-ils.worm.fingerprint.marc",
1735         method          => "fingerprint_marc",
1736         api_level       => 1,
1737         argc            => 1,
1738 );                      
1739
1740
1741 =cut
1742
# Fingerprint a stored bibliographic record.
#
# Retrieves record entry $rec from storage, passes its MARC to the
# open-ils.worm.fingerprint.marc method and returns the first value that
# method produces.
sub biblio_fingerprint_record {
        my ($self, $client, $rec) = @_;

        OpenILS::Application::Ingest->post_init();

        my $entry = OpenILS::Application::Ingest->storage_req(
                'open-ils.storage.direct.biblio.record_entry.retrieve',
                $rec,
        );
        my $marc = $entry->marc;

        my $marc_fingerprinter = $self->method_lookup('open-ils.worm.fingerprint.marc');
        my ($fp) = $marc_fingerprinter->run($marc);

        $log->debug("Returning [$fp] as fingerprint for record $rec", INFO);
        return $fp;
}
__PACKAGE__->register_method(
        api_name  => 'open-ils.worm.fingerprint.record',
        method    => 'biblio_fingerprint_record',
        api_level => 1,
        argc      => 1,
);
1764
# Script runner for the fingerprinting script; built on first use by
# biblio_fingerprint() and reused by later calls.
our $fp_script;

# Fingerprint a MARC record by running the configured fingerprint script.
#
# Arguments (after $self and $client):
#   $marc - the MARC record, either as a string (parsed here with $parser)
#           or as an already parsed document object
#
# The record is transformed with $mods_sheet; both the MARC and the
# resulting MODS are serialized through entityize() and inserted into the
# script as environment.marc / environment.mods.  The script location and
# library paths come from the open-ils.storage app_settings
# ('scripts' => 'biblio_fingerprint' and 'script_path').
#
# Returns whatever the script run returns -- callers in this file read
# {fingerprint} and {quality} from it -- or 0 when the run yields a false
# value.
sub biblio_fingerprint {
        my $self = shift;
        my $client = shift;
        my $marc = shift;

        OpenILS::Application::Ingest->post_init();

        $marc = $parser->parse_string($marc) unless (ref $marc);

        # NOTE(review): the 'D' argument presumably selects an entityize
        # mode; its meaning is defined outside this block -- confirm there.
        my $mods = OpenILS::Application::Ingest::entityize(
                $mods_sheet
                        ->transform( $marc )
                        ->documentElement
                        ->toString,
                'D'
        );

        $marc = OpenILS::Application::Ingest::entityize( $marc->documentElement->toString, 'D' );

        $log->internal("Got MARC [$marc]");
        $log->internal("Created MODS [$mods]");

        if(!$fp_script) {
                my @pfx = ( "apps", "open-ils.storage","app_settings" );
                my $conf = OpenSRF::Utils::SettingsClient->new;

                my $libs        = $conf->config_value(@pfx, 'script_path');
                my $script_file = $conf->config_value(@pfx, 'scripts', 'biblio_fingerprint');
                # A single configured path comes back as a plain scalar.
                my $script_libs = (ref($libs)) ? $libs : [$libs];

                $log->debug("Loading script $script_file for biblio fingerprinting...");

                $fp_script = OpenILS::Utils::ScriptRunner->new(
                        file            => $script_file,
                        paths           => $script_libs,
                        reset_count     => 1000,
                );
        }

        $log->debug("Applying environment for biblio fingerprinting...");

        my $env = {marc => $marc, mods => $mods};
        #my $res = {fingerprint => '', quality => '0'};

        $fp_script->insert('environment' => $env);
        #$fp_script->insert('result' => $res);

        $log->debug("Running script for biblio fingerprinting...");

        # Report failure explicitly, so the 0 return does not depend on what
        # the logger call happens to return.
        my $res = $fp_script->run;
        unless ($res) {
                $log->error( "Fingerprint script died!  $@" );
                return 0;
        }

        $log->debug("Script for biblio fingerprinting completed successfully...");

        return $res;
}
__PACKAGE__->register_method(  
        api_name        => "open-ils.worm.fingerprint.marc",
        method          => "biblio_fingerprint",
        api_level       => 1,
        argc            => 1,
);                      
1827
1828 # --------------------------------------------------------------------------------
1829
1830 1;
1831
1832 __END__
1833 my $in_xact;
1834 my $begin;
1835 my $commit;
1836 my $rollback;
1837 my $lookup;
1838 my $update_entry;
1839 my $mr_lookup;
1840 my $mr_update;
1841 my $mr_create;
1842 my $create_source_map;
1843 my $sm_lookup;
1844 my $rm_old_rd;
1845 my $rm_old_sm;
1846 my $rm_old_fr;
1847 my $rm_old_tr;
1848 my $rm_old_ar;
1849 my $rm_old_sr;
1850 my $rm_old_kr;
1851 my $rm_old_ser;
1852
1853 my $fr_create;
1854 my $rd_create;
1855 my $create = {};
1856
1857 my %descriptor_code = (
1858         item_type => 'substr($ldr,6,1)',
1859         item_form => '(substr($ldr,6,1) =~ /^(?:f|g|i|m|o|p|r)$/) ? substr($oo8,29,1) : substr($oo8,23,1)',
1860         bib_level => 'substr($ldr,7,1)',
1861         control_type => 'substr($ldr,8,1)',
1862         char_encoding => 'substr($ldr,9,1)',
1863         enc_level => 'substr($ldr,17,1)',
1864         cat_form => 'substr($ldr,18,1)',
1865         pub_status => 'substr($ldr,5,1)',
1866         item_lang => 'substr($oo8,35,3)',
1867         #lit_form => '(substr($ldr,6,1) =~ /^(?:f|g|i|m|o|p|r)$/) ? substr($oo8,33,1) : "0"',
1868         audience => 'substr($oo8,22,1)',
1869 );
1870
1871 sub wormize {
1872
1873         my $self = shift;
1874         my $client = shift;
1875         my @docids = @_;
1876
1877         my $no_map = 0;
1878         if ($self->api_name =~ /no_map/o) {
1879                 $no_map = 1;
1880         }
1881
1882         $in_xact = $self->method_lookup( 'open-ils.storage.transaction.current')
1883                 unless ($in_xact);
1884         $begin = $self->method_lookup( 'open-ils.storage.transaction.begin')
1885                 unless ($begin);
1886         $commit = $self->method_lookup( 'open-ils.storage.transaction.commit')
1887                 unless ($commit);
1888         $rollback = $self->method_lookup( 'open-ils.storage.transaction.rollback')
1889                 unless ($rollback);
1890         $sm_lookup = $self->method_lookup('open-ils.storage.direct.metabib.metarecord_source_map.search.source')
1891                 unless ($sm_lookup);
1892         $mr_lookup = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.search.fingerprint')
1893                 unless ($mr_lookup);
1894         $mr_update = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.batch.update')
1895                 unless ($mr_update);
1896         $lookup = $self->method_lookup('open-ils.storage.direct.biblio.record_entry.batch.retrieve')
1897                 unless ($lookup);
1898         $update_entry = $self->method_lookup('open-ils.storage.direct.biblio.record_entry.batch.update')
1899                 unless ($update_entry);
1900         $rm_old_sm = $self->method_lookup( 'open-ils.storage.direct.metabib.metarecord_source_map.mass_delete')
1901                 unless ($rm_old_sm);
1902         $rm_old_rd = $self->method_lookup( 'open-ils.storage.direct.metabib.record_descriptor.mass_delete')
1903                 unless ($rm_old_rd);
1904         $rm_old_fr = $self->method_lookup( 'open-ils.storage.direct.metabib.full_rec.mass_delete')
1905                 unless ($rm_old_fr);
1906         $rm_old_tr = $self->method_lookup( 'open-ils.storage.direct.metabib.title_field_entry.mass_delete')
1907                 unless ($rm_old_tr);
1908         $rm_old_ar = $self->method_lookup( 'open-ils.storage.direct.metabib.author_field_entry.mass_delete')
1909                 unless ($rm_old_ar);
1910         $rm_old_sr = $self->method_lookup( 'open-ils.storage.direct.metabib.subject_field_entry.mass_delete')
1911                 unless ($rm_old_sr);
1912         $rm_old_kr = $self->method_lookup( 'open-ils.storage.direct.metabib.keyword_field_entry.mass_delete')
1913                 unless ($rm_old_kr);
1914         $rm_old_ser = $self->method_lookup( 'open-ils.storage.direct.metabib.series_field_entry.mass_delete')
1915                 unless ($rm_old_ser);
1916         $mr_create = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.create')
1917                 unless ($mr_create);
1918         $create_source_map = $self->method_lookup('open-ils.storage.direct.metabib.metarecord_source_map.batch.create')
1919                 unless ($create_source_map);
1920         $rd_create = $self->method_lookup( 'open-ils.storage.direct.metabib.record_descriptor.batch.create')
1921                 unless ($rd_create);
1922         $fr_create = $self->method_lookup( 'open-ils.storage.direct.metabib.full_rec.batch.create')
1923                 unless ($fr_create);
1924         $$create{title} = $self->method_lookup( 'open-ils.storage.direct.metabib.title_field_entry.batch.create')
1925                 unless ($$create{title});
1926         $$create{author} = $self->method_lookup( 'open-ils.storage.direct.metabib.author_field_entry.batch.create')
1927                 unless ($$create{author});
1928         $$create{subject} = $self->method_lookup( 'open-ils.storage.direct.metabib.subject_field_entry.batch.create')
1929                 unless ($$create{subject});
1930         $$create{keyword} = $self->method_lookup( 'open-ils.storage.direct.metabib.keyword_field_entry.batch.create')
1931                 unless ($$create{keyword});
1932         $$create{series} = $self->method_lookup( 'open-ils.storage.direct.metabib.series_field_entry.batch.create')
1933                 unless ($$create{series});
1934
1935
1936         my ($outer_xact) = $in_xact->run;
1937         try {
1938                 unless ($outer_xact) {
1939                         $log->debug("Ingest isn't inside a transaction, starting one now.", INFO);
1940                         my ($r) = $begin->run($client);
1941                         unless (defined $r and $r) {
1942                                 $rollback->run;
1943                                 throw OpenSRF::EX::PANIC ("Couldn't BEGIN transaction!")
1944                         }
1945                 }
1946         } catch Error with {
1947                 throw OpenSRF::EX::PANIC ("Ingest Couldn't BEGIN transaction!")
1948         };
1949
1950         my @source_maps;
1951         my @entry_list;
1952         my @mr_list;
1953         my @rd_list;
1954         my @ns_list;
1955         my @mods_data;
1956         my $ret = 0;
1957         for my $entry ( $lookup->run(@docids) ) {
1958                 # step -1: grab the doc from storage
1959                 next unless ($entry);
1960
1961                 if(!$mods_sheet) {
1962                         my $xslt_doc = $parser->parse_file(
1963                                 OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') .  "/MARC21slim2MODS.xsl");
1964                         $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
1965                 }
1966
1967                 my $xml = $entry->marc;
1968                 my $docid = $entry->id;
1969                 my $marcdoc = $parser->parse_string($xml);
1970                 my $modsdoc = $mods_sheet->transform($marcdoc);
1971
1972                 my $mods = $modsdoc->documentElement;
1973                 $mods->setNamespace( "http://www.loc.gov/mods/", "mods", 1 );
1974
1975                 $entry->fingerprint( fingerprint_mods( $mods ) );
1976                 push @entry_list, $entry;
1977
1978                 $log->debug("Fingerprint for Record Entry ".$docid." is [".$entry->fingerprint."]", INFO);
1979
1980                 unless ($no_map) {
1981                         my ($mr) = $mr_lookup->run( $entry->fingerprint );
1982                         if (!$mr || !@$mr) {
1983                                 $log->debug("No metarecord found for fingerprint [".$entry->fingerprint."]; Creating a new one", INFO);
1984                                 $mr = new Fieldmapper::metabib::metarecord;
1985                                 $mr->fingerprint( $entry->fingerprint );
1986                                 $mr->master_record( $entry->id );
1987                                 my ($new_mr) = $mr_create->run($mr);
1988                                 $mr->id($new_mr);
1989                                 unless (defined $mr) {
1990                                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord.create!")
1991                                 }
1992                         } else {
1993                                 $log->debug("Retrieved metarecord, id is ".$mr->id, INFO);
1994                                 $mr->mods('');
1995                                 push @mr_list, $mr;
1996                         }
1997
1998                         my $sm = new Fieldmapper::metabib::metarecord_source_map;
1999                         $sm->metarecord( $mr->id );
2000                         $sm->source( $entry->id );
2001                         push @source_maps, $sm;
2002                 }
2003
2004                 my $ldr = $marcdoc->documentElement->getChildrenByTagName('leader')->pop->textContent;
2005                 my $oo8 = $marcdoc->documentElement->findvalue('//*[local-name()="controlfield" and @tag="008"]');
2006
2007                 my $rd_obj = Fieldmapper::metabib::record_descriptor->new;
2008                 for my $rd_field ( keys %descriptor_code ) {
2009                         $rd_obj->$rd_field( eval "$descriptor_code{$rd_field};" );
2010                 }
2011                 $rd_obj->record( $docid );
2012                 push @rd_list, $rd_obj;
2013
2014                 push @mods_data, { $docid => $self->modsdoc_to_values( $mods ) };
2015
2016                 # step 2: build the KOHA rows
2017                 my @tmp_list = _marcxml_to_full_rows( $marcdoc );
2018                 $_->record( $docid ) for (@tmp_list);
2019                 push @ns_list, @tmp_list;
2020
2021                 $ret++;
2022
2023                 last unless ($self->api_name =~ /batch$/o);
2024         }
2025
2026         $rm_old_rd->run( { record => \@docids } );
2027         $rm_old_fr->run( { record => \@docids } );
2028         $rm_old_sm->run( { source => \@docids } ) unless ($no_map);
2029         $rm_old_tr->run( { source => \@docids } );
2030         $rm_old_ar->run( { source => \@docids } );
2031         $rm_old_sr->run( { source => \@docids } );
2032         $rm_old_kr->run( { source => \@docids } );
2033         $rm_old_ser->run( { source => \@docids } );
2034
2035         unless ($no_map) {
2036                 my ($sm) = $create_source_map->run(@source_maps);
2037                 unless (defined $sm) {
2038                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord_source_map.batch.create!")
2039                 }
2040                 my ($mr) = $mr_update->run(@mr_list);
2041                 unless (defined $mr) {
2042                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord.batch.update!")
2043                 }
2044         }
2045
2046         my ($re) = $update_entry->run(@entry_list);
2047         unless (defined $re) {
2048                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.biblio.record_entry.batch.update!")
2049         }
2050
2051         my ($rd) = $rd_create->run(@rd_list);
2052         unless (defined $rd) {
2053                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.record_descriptor.batch.create!")
2054         }
2055
2056         my ($fr) = $fr_create->run(@ns_list);
2057         unless (defined $fr) {
2058                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.full_rec.batch.create!")
2059         }
2060
2061         # step 5: insert the new metadata
2062         for my $class ( qw/title author subject keyword series/ ) {
2063                 my @md_list = ();
2064                 for my $doc ( @mods_data ) {
2065                         my ($did) = keys %$doc;
2066                         my ($data) = values %$doc;
2067
2068                         my $fm_constructor = "Fieldmapper::metabib::${class}_field_entry";
2069                         for my $row ( keys %{ $$data{$class} } ) {
2070                                 next unless (exists $$data{$class}{$row});
2071                                 next unless ($$data{$class}{$row}{value});
2072                                 my $fm_obj = $fm_constructor->new;
2073                                 $fm_obj->value( $$data{$class}{$row}{value} );
2074                                 $fm_obj->field( $$data{$class}{$row}{field_id} );
2075                                 $fm_obj->source( $did );
2076                                 $log->debug("$class entry: ".$fm_obj->source." => ".$fm_obj->field." : ".$fm_obj->value, DEBUG);
2077
2078                                 push @md_list, $fm_obj;
2079                         }
2080                 }
2081                         
2082                 my ($cr) = $$create{$class}->run(@md_list);
2083                 unless (defined $cr) {
2084                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.${class}_field_entry.batch.create!")
2085                 }
2086         }
2087
2088         unless ($outer_xact) {
2089                 $log->debug("Commiting transaction started by the Ingest.", INFO);
2090                 my ($c) = $commit->run;
2091                 unless (defined $c and $c) {
2092                         $rollback->run;
2093                         throw OpenSRF::EX::PANIC ("Couldn't COMMIT changes!")
2094                 }
2095         }
2096
2097         return $ret;
2098 }
# Publish wormize() under each of its API names.  The sub inspects its own
# api_name at run time: only names ending in "batch" let the ingest loop
# continue past the first record.  The "no_map" variants presumably set the
# $no_map flag that skips the metarecord source-map work -- confirm against
# the top of wormize().
for my $wormize_api (
	"open-ils.worm.wormize",
	"open-ils.worm.wormize.no_map",
	"open-ils.worm.wormize.batch",
	"open-ils.worm.wormize.no_map.batch",
) {
	__PACKAGE__->register_method(
		api_name	=> $wormize_api,
		method		=> "wormize",
		api_level	=> 1,
		argc		=> 1,
	);
}
2123
2124
# File-scoped slots for the authority ingest.  authority_wormize() fills in
# the storage method handles it needs the first time it runs; the remaining
# names are declared here but are not referenced in the code that follows.
my (
	# transaction control
	$ain_xact, $abegin, $acommit, $arollback,

	# record entry retrieval / update
	$alookup, $aupdate_entry,

	# metarecord and source-map handling
	$amr_lookup, $amr_update, $amr_create,
	$acreate_source_map, $asm_lookup,

	# removal of previously ingested rows
	$arm_old_rd, $arm_old_sm, $arm_old_fr, $arm_old_tr,
	$arm_old_ar, $arm_old_sr, $arm_old_kr, $arm_old_ser,

	# creation of freshly ingested rows
	$afr_create, $ard_create,
);
my $acreate = {};
2148
# authority_wormize( $self, $client, @docids )
#
# Ingests authority records.  For every authority record entry fetched for
# @docids it parses the MARCXML, builds a record_descriptor object and the
# full_rec rows, then deletes the stored descriptors / full_rec rows for
# @docids and creates the freshly built ones.
#
# If no storage transaction is current, one is begun here and committed at the
# end; otherwise the caller's transaction is left open.
#
# Returns the number of records processed.  Unless the API name ends in
# "batch", only the first record returned by the lookup is processed.
#
# Throws OpenSRF::EX::PANIC when the transaction cannot be begun or committed,
# or when either batch create call returns nothing.
#
# NOTE(review): the mass_delete calls are always given the full @docids list,
# even when only the first record was rebuilt (non-batch API name).
sub authority_wormize {

	my $self = shift;
	my $client = shift;
	my @docids = @_;

	# Storage method handles are looked up once and cached.  The transaction
	# handles live in the same variables the biblio ingest code above uses.
	$in_xact	||= $self->method_lookup( 'open-ils.storage.transaction.current');
	$begin		||= $self->method_lookup( 'open-ils.storage.transaction.begin');
	$commit		||= $self->method_lookup( 'open-ils.storage.transaction.commit');
	$rollback	||= $self->method_lookup( 'open-ils.storage.transaction.rollback');
	$alookup	||= $self->method_lookup('open-ils.storage.direct.authority.record_entry.batch.retrieve');
	$aupdate_entry	||= $self->method_lookup('open-ils.storage.direct.authority.record_entry.batch.update');
	$arm_old_rd	||= $self->method_lookup( 'open-ils.storage.direct.authority.record_descriptor.mass_delete');
	$arm_old_fr	||= $self->method_lookup( 'open-ils.storage.direct.authority.full_rec.mass_delete');
	$ard_create	||= $self->method_lookup( 'open-ils.storage.direct.authority.record_descriptor.batch.create');
	$afr_create	||= $self->method_lookup( 'open-ils.storage.direct.authority.full_rec.batch.create');


	my ($outer_xact) = $in_xact->run;
	try {
		unless ($outer_xact) {
			$log->debug("Ingest isn't inside a transaction, starting one now.", INFO);
			my ($r) = $begin->run($client);
			unless (defined $r and $r) {
				$rollback->run;
				throw OpenSRF::EX::PANIC ("Couldn't BEGIN transaction!")
			}
		}
	} catch Error with {
		throw OpenSRF::EX::PANIC ("Ingest Couldn't BEGIN transaction!")
	};

	my @rd_list;
	my @ns_list;
	my $ret = 0;

	# Fetch the *authority* record entries.  This must go through $alookup;
	# $lookup is the biblio.record_entry retrieval handle and is only set
	# once the biblio ingest has run.
	for my $entry ( $alookup->run(@docids) ) {
		# step -1: grab the doc from storage
		next unless ($entry);

		#if(!$mads_sheet) {
		#	my $xslt_doc = $parser->parse_file(
		#		OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') .  "/MARC21slim2MODS.xsl");
		#	$mads_sheet = $xslt->parse_stylesheet( $xslt_doc );
		#}

		my $xml = $entry->marc;
		my $docid = $entry->id;
		my $marcdoc = $parser->parse_string($xml);
		#my $madsdoc = $mads_sheet->transform($marcdoc);

		#my $mads = $madsdoc->documentElement;
		#$mads->setNamespace( "http://www.loc.gov/mads/", "mads", 1 );

		# $ldr and $oo8 are not read directly below, but they must stay in
		# scope under exactly these names: the %descriptor_code snippets
		# are string-eval'd here and presumably refer to them.
		my $ldr = $marcdoc->documentElement->getChildrenByTagName('leader')->pop->textContent;
		my $oo8 = $marcdoc->documentElement->findvalue('//*[local-name()="controlfield" and @tag="008"]');

		# step 1: build the record descriptor, one eval'd snippet per field
		my $rd_obj = Fieldmapper::authority::record_descriptor->new;
		for my $rd_field ( keys %descriptor_code ) {
			$rd_obj->$rd_field( eval "$descriptor_code{$rd_field};" );
		}
		$rd_obj->record( $docid );
		push @rd_list, $rd_obj;

		# step 2: build the KOHA rows
		my @tmp_list = _marcxml_to_full_rows( $marcdoc, 'Fieldmapper::authority::full_rec' );
		$_->record( $docid ) for (@tmp_list);
		push @ns_list, @tmp_list;

		$ret++;

		last unless ($self->api_name =~ /batch$/o);
	}

	# step 3: throw away what was stored for these records before
	$arm_old_rd->run( { record => \@docids } );
	$arm_old_fr->run( { record => \@docids } );

	# step 4: store the rebuilt descriptors and full_rec rows
	my ($rd) = $ard_create->run(@rd_list);
	unless (defined $rd) {
		throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.authority.record_descriptor.batch.create!")
	}

	# Use the authority full_rec creator ($afr_create), matching the method
	# named in the error message; $fr_create belongs to the biblio ingest.
	my ($fr) = $afr_create->run(@ns_list);
	unless (defined $fr) {
		throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.authority.full_rec.batch.create!")
	}

	unless ($outer_xact) {
		$log->debug("Commiting transaction started by Ingest.", INFO);
		my ($c) = $commit->run;
		unless (defined $c and $c) {
			$rollback->run;
			throw OpenSRF::EX::PANIC ("Couldn't COMMIT changes!")
		}
	}

	return $ret;
}
# Publish the authority ingest.  These registrations must dispatch to
# authority_wormize(); pointing them at "wormize" would run the bibliographic
# ingest against authority record ids.
__PACKAGE__->register_method( 
	api_name	=> "open-ils.worm.authority.wormize",
	method		=> "authority_wormize",
	api_level	=> 1,
	argc		=> 1,
);
# Legacy alias: this misspelled name ("authortiy") is the one originally
# published, so it stays registered for any existing callers.
__PACKAGE__->register_method( 
	api_name	=> "open-ils.worm.authortiy.wormize",
	method		=> "authority_wormize",
	api_level	=> 1,
	argc		=> 1,
);
__PACKAGE__->register_method( 
	api_name	=> "open-ils.worm.authority.wormize.batch",
	method		=> "authority_wormize",
	api_level	=> 1,
	argc		=> 1,
);
2279
2280
2281 # --------------------------------------------------------------------------------
2282
2283
# _marcxml_to_full_rows( $marcxml, $type )
#
# Flattens a parsed MARCXML document into a list of full_rec row objects.
#
#   $marcxml - parsed document; its documentElement is the record element
#   $type    - class used to build every row; defaults to
#              'Fieldmapper::metabib::full_rec'
#
# One row is produced per leader (tag 'LDR'), per controlfield and per child
# element of each datafield.  Every value is NFD-decomposed and stripped of
# combining marks; datafield values are additionally lowercased, leader and
# controlfield values keep their case.
#
# Returns the list of row objects; the caller sets the record id on each.
sub _marcxml_to_full_rows {

	my $marcxml = shift;
	my $type = shift || 'Fieldmapper::metabib::full_rec';

	my @ns_list;
	
	my $root = $marcxml->documentElement;

	# Decompose to NFD, then drop the combining marks (\pM) so that accented
	# and unaccented forms produce the same stored value.
	my $strip_marks = sub {
		my $val = NFD(shift);
		$val =~ s/(\pM+)//gso;
		return $val;
	};

	for my $tagline ( @{$root->getChildrenByTagName("leader")} ) {
		next unless $tagline;

		# Build the row from $type, not a hard-coded class, so authority
		# ingests get authority rows for the leader as well.
		my $ns = $type->new;

		$ns->tag( 'LDR' );
		$ns->value( $strip_marks->($tagline->textContent) );

		push @ns_list, $ns;
	}

	for my $tagline ( @{$root->getChildrenByTagName("controlfield")} ) {
		next unless $tagline;

		my $ns = $type->new;

		$ns->tag( $tagline->getAttribute( "tag" ) );
		$ns->value( $strip_marks->($tagline->textContent) );

		push @ns_list, $ns;
	}

	for my $tagline ( @{$root->getChildrenByTagName("datafield")} ) {
		next unless $tagline;

		my $tag = $tagline->getAttribute( "tag" );
		my $ind1 = $tagline->getAttribute( "ind1" );
		my $ind2 = $tagline->getAttribute( "ind2" );

		for my $data ( $tagline->childNodes ) {
			next unless $data;

			# childNodes also yields the whitespace text nodes (and any
			# comments) between subfields; those have no getAttribute
			# method, so only element children become rows.
			next unless $data->isa('XML::LibXML::Element');

			my $ns = $type->new;

			$ns->tag( $tag );
			$ns->ind1( $ind1 );
			$ns->ind2( $ind2 );
			$ns->subfield( $data->getAttribute( "code" ) );
			$ns->value( lc($strip_marks->($data->textContent)) );

			push @ns_list, $ns;
		}
	}
	return @ns_list;
}
2344
# _get_field_value( $root, $xpath )
#
# Collects the text found under every node matching $xpath (evaluated against
# $root) into one space-separated string.  For a matching node with children,
# each child's text is appended unless that text already occurs somewhere in
# the string built so far; for a node without children the node's own text is
# appended unconditionally.
#
# Returns the string NFD-decomposed, stripped of combining marks and
# lowercased.  Each appended piece is followed by a single space, so a
# non-empty result ends in a space.
sub _get_field_value {

	my( $root, $xpath ) = @_;

	my $string = "";

	# grab the set of matching nodes
	for my $value ( $root->findnodes( $xpath ) ) {

		# grab all children of the node
		my @children = $value->childNodes();
		for my $child (@children) {

			# add the childs content to the growing buffer
			my $content = $child->textContent;

			# uniquify the values.  This is a plain substring test, done
			# with index() rather than a regex match: an empty $content
			# would otherwise interpolate to the empty pattern, which
			# Perl treats as "reuse the last successful pattern".
			next if (index($string, $content) >= 0);
			$string .= $content . " ";
		}
		if( ! @children ) {
			$string .= $value->textContent . " ";
		}
	}

	# decompose, then drop combining marks so accents do not affect matching
	$string = NFD($string);
	$string =~ s/(\pM)//gso;
	return lc($string);
}
2372
2373
# modsdoc_to_values( $self, $mods )
#
# Builds a two-level hash keyed by class and then by type, mirroring the keys
# of the file-level $xpathset.  Each leaf is a hash holding one key,
# "field_id", copied from the matching $xpathset entry's "id".
#
# Returns a reference to that hash.
#
# NOTE(review): $mods is accepted but never read, and no "value" key is set on
# the leaves, while the ingest loop above skips rows that have no value.  This
# looks unfinished -- confirm what is meant to populate the values.
sub modsdoc_to_values {
	my( $self, $mods ) = @_;

	my %values_for;
	for my $class (keys %$xpathset) {
		$values_for{$class} = {
			map {
				( $_ => { field_id => $xpathset->{$class}->{$_}->{id} } )
			} keys %{$xpathset->{$class}}
		};
	}

	return \%values_for;
}
2386
2387
2388 1;
2389
2390