using Unicode::Normalize to get all incoming text into Unicode normal form D
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Application / Storage / WORM.pm
1 package OpenILS::Application::Storage::WORM;
2 use base qw/OpenILS::Application::Storage/;
3 use strict; use warnings;
4
5 use Unicode::Normalize;
6 use OpenSRF::EX qw/:try/;
7
8 use OpenSRF::Utils::SettingsClient;
9 use OpenSRF::Utils::Logger qw/:level/;
10 my $log = 'OpenSRF::Utils::Logger';
11
12 use OpenILS::Utils::FlatXML;
13 use OpenILS::Utils::Fieldmapper;
14 use JSON;
15
16 use XML::LibXML;
17 use XML::LibXSLT;
18 use Time::HiRes qw(time);
19
20 my $xml_util    = OpenILS::Utils::FlatXML->new();
21
22 my $parser              = XML::LibXML->new();
23 my $xslt                        = XML::LibXSLT->new();
24 my $mods_sheet;
25
26 use open qw/:utf8/;
27
28 my $xpathset = {};
29
30 sub child_init {
31         my $meth = __PACKAGE__->method_lookup('open-ils.storage.direct.config.metabib_field.retrieve.all');
32         for my $f ($meth->run) {
33                 $xpathset->{ $f->field_class }->{ $f->name }->{xpath} = $f->xpath;
34                 $xpathset->{ $f->field_class }->{ $f->name }->{id} = $f->id;
35                 $log->debug("Loaded XPath from DB: ".$f->field_class." => ".$f->name." : ".$f->xpath, DEBUG);
36         }
37 }
38
39 # --------------------------------------------------------------------------------
40 # Fingerprinting
41
42 my @fp_mods_xpath = (
43         '//mods:mods/mods:typeOfResource[text()="text"]' => [
44                         title   => {
45                                         xpath   => [
46                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="uniform")]',
47                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="translated")]',
48                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="alternative")]',
49                                                         '//mods:mods/mods:titleInfo[mods:title and not(@type)]',
50                                         ],
51                                         fixup   => '
52                                                         do {
53                                                                 $text = lc(NFD($text));
54                                                                 $text =~ s/\pM+//gso;
55                                                                 $text =~ s/\s+/ /sgo;
56                                                                 $text =~ s/^\s*(.+)\s*$/$1/sgo;
57                                                                 $text =~ s/\b(?:the|an?)\b//sgo;
58                                                                 $text =~ s/\[.[^\]]+\]//sgo;
59                                                                 $text =~ s/\s*[;\/\.]*$//sgo;
60                                                         };
61                                         ',
62                         },
63                         author  => {
64                                         xpath   => [
65                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
66                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
67                                         ],
68                                         fixup   => '
69                                                         do {
70                                                                 $text = lc(NFD($text));
71                                                                 $text =~ s/\pM+//gso;
72                                                                 $text =~ s/\s+/ /sgo;
73                                                                 $text =~ s/^\s*(.+)\s*$/$1/sgo;
74                                                                 $text =~ s/,?\s+.*$//sgo;
75                                                                 $text =~ s/\pM+//gso;
76                                                         };
77                                         ',
78                         },
79         ],
80
81         '//mods:mods/mods:relatedItem[@type!="host" and @type!="series"]' => [
82                         title   => {
83                                         xpath   => [
84                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="uniform")]',
85                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="translated")]',
86                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="alternative")]',
87                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and not(@type)]',
88                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="uniform")]',
89                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="translated")]',
90                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="alternative")]',
91                                                         '//mods:mods/mods:titleInfo[mods:title and not(@type)]',
92                                         ],
93                                         fixup   => '
94                                                         do {
95                                                                 $text = lc(NFD($text));
96                                                                 $text =~ s/\pM+//gso;
97                                                                 $text =~ s/\s+/ /sgo;
98                                                                 $text =~ s/^\s*(.+)\s*$/$1/sgo;
99                                                                 $text =~ s/\b(?:the|an?)\b//sgo;
100                                                                 $text =~ s/\[.[^\]]+\]//sgo;
101                                                                 $text =~ s/\s*[;\/\.]*$//sgo;
102                                                                 $text =~ s/\pM+//gso;
103                                                         };
104                                         ',
105                         },
106                         author  => {
107                                         xpath   => [
108                                                         '//mods:mods/mods:relatedItem/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
109                                                         '//mods:mods/mods:relatedItem/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
110                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
111                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
112                                         ],
113                                         fixup   => '
114                                                         do {
115                                                                 $text = lc(NFD($text));
116                                                                 $text =~ s/\pM+//gso;
117                                                                 $text =~ s/\s+/ /sgo;
118                                                                 $text =~ s/^\s*(.+)\s*$/$1/sgo;
119                                                                 $text =~ s/,?\s+.*$//sgo;
120                                                                 $text =~ s/\pM+//gso;
121                                                         };
122                                         ',
123                         },
124         ],
125
126 );
127
128 push @fp_mods_xpath, '//mods:mods/mods:titleInfo' => $fp_mods_xpath[1];
129
130 sub fingerprint_mods {
131         my $mods = shift;
132
133         my $fp_string = '';
134
135         my $match_index = 0;
136         my $block_index = 1;
137         while ( my $match_xpath = $fp_mods_xpath[$match_index] ) {
138                 if ( my @nodes = $mods->findnodes( $match_xpath ) ) {
139
140                         my $block_name_index = 0;
141                         my $block_value_index = 1;
142                         my $block = $fp_mods_xpath[$block_index];
143                         while ( my $part = $$block[$block_value_index] ) {
144                                 my $text;
145                                 for my $xpath ( @{ $part->{xpath} } ) {
146                                         $text = $mods->findvalue( $xpath );
147                                         last if ($text);
148                                 }
149
150                                 $log->debug("Found fingerprint text using $$block[$block_name_index] : [$text]", DEBUG);
151
152                                 if ($text) {
153                                         eval $$part{fixup};
154                                         $fp_string .= $text;
155                                 }
156
157                                 $block_name_index += 2;
158                                 $block_value_index += 2;
159                         }
160                 }
161                 if ($fp_string) {
162                         $fp_string =~ s/\W+//gso;
163                         $log->debug("Fingerprint is [$fp_string]", INFO);;
164                         return $fp_string;
165                 }
166
167                 $match_index += 2;
168                 $block_index += 2;
169         }
170         return undef;
171 }
172
173
174
175 # --------------------------------------------------------------------------------
176
177 my $in_xact;
178 my $begin;
179 my $commit;
180 my $rollback;
181 my $lookup;
182 my $update_entry;
183 my $mr_lookup;
184 my $mr_update;
185 my $mr_create;
186 my $create_source_map;
187 my $sm_lookup;
188 my $rm_old_rd;
189 my $rm_old_sm;
190 my $rm_old_fr;
191 my $rm_old_tr;
192 my $rm_old_ar;
193 my $rm_old_sr;
194 my $rm_old_kr;
195 my $rm_old_ser;
196
197 my $fr_create;
198 my $rd_create;
199 my $create = {};
200
201 my %descriptor_code = (
202         item_type => 'substr($ldr,6,1)',
203         item_form => '(substr($ldr,6,1) =~ /^(?:f|g|i|m|o|p|r)$/) ? substr($oo8,29,1) : substr($oo8,23,1)',
204         bib_level => 'substr($ldr,7,1)',
205         control_type => 'substr($ldr,8,1)',
206         char_encoding => 'substr($ldr,9,1)',
207         enc_level => 'substr($ldr,17,1)',
208         cat_form => 'substr($ldr,18,1)',
209         pub_status => 'substr($ldr,5,1)',
210         item_lang => 'substr($oo8,35,3)',
211         #lit_form => '(substr($ldr,6,1) =~ /^(?:f|g|i|m|o|p|r)$/) ? substr($oo8,33,1) : "0"',
212         audience => 'substr($oo8,22,1)',
213 );
214
215 sub wormize {
216
217         my $self = shift;
218         my $client = shift;
219         my @docids = @_;
220
221         my $no_map = 0;
222         if ($self->api_name =~ /no_map/o) {
223                 $no_map = 1;
224         }
225
226         $in_xact = $self->method_lookup( 'open-ils.storage.transaction.current')
227                 unless ($in_xact);
228         $begin = $self->method_lookup( 'open-ils.storage.transaction.begin')
229                 unless ($begin);
230         $commit = $self->method_lookup( 'open-ils.storage.transaction.commit')
231                 unless ($commit);
232         $rollback = $self->method_lookup( 'open-ils.storage.transaction.rollback')
233                 unless ($rollback);
234         $sm_lookup = $self->method_lookup('open-ils.storage.direct.metabib.metarecord_source_map.search.source')
235                 unless ($sm_lookup);
236         $mr_lookup = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.search.fingerprint')
237                 unless ($mr_lookup);
238         $mr_update = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.batch.update')
239                 unless ($mr_update);
240         $mr_create = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.create')
241                 unless ($mr_create);
242         $create_source_map = $self->method_lookup('open-ils.storage.direct.metabib.metarecord_source_map.batch.create')
243                 unless ($create_source_map);
244         $lookup = $self->method_lookup('open-ils.storage.direct.biblio.record_entry.batch.retrieve')
245                 unless ($lookup);
246         $update_entry = $self->method_lookup('open-ils.storage.direct.biblio.record_entry.batch.update')
247                 unless ($update_entry);
248         $rm_old_sm = $self->method_lookup( 'open-ils.storage.direct.metabib.metarecord_source_map.mass_delete')
249                 unless ($rm_old_sm);
250         $rm_old_rd = $self->method_lookup( 'open-ils.storage.direct.metabib.record_descriptor.mass_delete')
251                 unless ($rm_old_rd);
252         $rm_old_fr = $self->method_lookup( 'open-ils.storage.direct.metabib.full_rec.mass_delete')
253                 unless ($rm_old_fr);
254         $rm_old_tr = $self->method_lookup( 'open-ils.storage.direct.metabib.title_field_entry.mass_delete')
255                 unless ($rm_old_tr);
256         $rm_old_ar = $self->method_lookup( 'open-ils.storage.direct.metabib.author_field_entry.mass_delete')
257                 unless ($rm_old_ar);
258         $rm_old_sr = $self->method_lookup( 'open-ils.storage.direct.metabib.subject_field_entry.mass_delete')
259                 unless ($rm_old_sr);
260         $rm_old_kr = $self->method_lookup( 'open-ils.storage.direct.metabib.keyword_field_entry.mass_delete')
261                 unless ($rm_old_kr);
262         $rm_old_ser = $self->method_lookup( 'open-ils.storage.direct.metabib.series_field_entry.mass_delete')
263                 unless ($rm_old_ser);
264         $rd_create = $self->method_lookup( 'open-ils.storage.direct.metabib.record_descriptor.batch.create')
265                 unless ($rd_create);
266         $fr_create = $self->method_lookup( 'open-ils.storage.direct.metabib.full_rec.batch.create')
267                 unless ($fr_create);
268         $$create{title} = $self->method_lookup( 'open-ils.storage.direct.metabib.title_field_entry.batch.create')
269                 unless ($$create{title});
270         $$create{author} = $self->method_lookup( 'open-ils.storage.direct.metabib.author_field_entry.batch.create')
271                 unless ($$create{author});
272         $$create{subject} = $self->method_lookup( 'open-ils.storage.direct.metabib.subject_field_entry.batch.create')
273                 unless ($$create{subject});
274         $$create{keyword} = $self->method_lookup( 'open-ils.storage.direct.metabib.keyword_field_entry.batch.create')
275                 unless ($$create{keyword});
276         $$create{series} = $self->method_lookup( 'open-ils.storage.direct.metabib.series_field_entry.batch.create')
277                 unless ($$create{series});
278
279
280         my ($outer_xact) = $in_xact->run;
281         try {
282                 unless ($outer_xact) {
283                         $log->debug("WoRM isn't inside a transaction, starting one now.", INFO);
284                         my ($r) = $begin->run($client);
285                         unless (defined $r and $r) {
286                                 $rollback->run;
287                                 throw OpenSRF::EX::PANIC ("Couldn't BEGIN transaction!")
288                         }
289                 }
290         } catch Error with {
291                 throw OpenSRF::EX::PANIC ("WoRM Couldn't BEGIN transaction!")
292         };
293
294         my @source_maps;
295         my @entry_list;
296         my @mr_list;
297         my @rd_list;
298         my @ns_list;
299         my @mods_data;
300         my $ret = 0;
301         for my $entry ( $lookup->run(@docids) ) {
302                 # step -1: grab the doc from storage
303                 next unless ($entry);
304
305                 if(!$mods_sheet) {
306                         my $xslt_doc = $parser->parse_file(
307                                 OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') .  "/MARC21slim2MODS.xsl");
308                         $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
309                 }
310
311                 my $xml = $entry->marc;
312                 my $docid = $entry->id;
313                 my $marcdoc = $parser->parse_string($xml);
314                 my $modsdoc = $mods_sheet->transform($marcdoc);
315
316                 my $mods = $modsdoc->documentElement;
317                 $mods->setNamespace( "http://www.loc.gov/mods/", "mods", 1 );
318
319                 $entry->fingerprint( fingerprint_mods( $mods ) );
320                 push @entry_list, $entry;
321
322                 $log->debug("Fingerprint for Record Entry ".$docid." is [".$entry->fingerprint."]", INFO);
323
324                 unless ($no_map) {
325                         my ($mr) = $mr_lookup->run( $entry->fingerprint );
326                         if (!$mr || !@$mr) {
327                                 $log->debug("No metarecord found for fingerprint [".$entry->fingerprint."]; Creating a new one", INFO);
328                                 $mr = new Fieldmapper::metabib::metarecord;
329                                 $mr->fingerprint( $entry->fingerprint );
330                                 $mr->master_record( $entry->id );
331                                 my ($new_mr) = $mr_create->run($mr);
332                                 $mr->id($new_mr);
333                                 unless (defined $mr) {
334                                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord.create!")
335                                 }
336                         } else {
337                                 $log->debug("Retrieved metarecord, id is ".$mr->id, INFO);
338                                 $mr->mods('');
339                                 push @mr_list, $mr;
340                         }
341
342                         my $sm = new Fieldmapper::metabib::metarecord_source_map;
343                         $sm->metarecord( $mr->id );
344                         $sm->source( $entry->id );
345                         push @source_maps, $sm;
346                 }
347
348                 my $ldr = $marcdoc->documentElement->getChildrenByTagName('leader')->pop->textContent;
349                 my $oo8 = $marcdoc->documentElement->findvalue('//*[local-name()="controlfield" and @tag="008"]');
350
351                 my $rd_obj = Fieldmapper::metabib::record_descriptor->new;
352                 for my $rd_field ( keys %descriptor_code ) {
353                         $rd_obj->$rd_field( eval "$descriptor_code{$rd_field};" );
354                 }
355                 $rd_obj->record( $docid );
356                 push @rd_list, $rd_obj;
357
358                 push @mods_data, { $docid => $self->modsdoc_to_values( $mods ) };
359
360                 # step 2: build the KOHA rows
361                 my @tmp_list = _marcxml_to_full_rows( $marcdoc );
362                 $_->record( $docid ) for (@tmp_list);
363                 push @ns_list, @tmp_list;
364
365                 $ret++;
366
367                 last unless ($self->api_name =~ /batch$/o);
368         }
369
370         $rm_old_rd->run( { record => \@docids } );
371         $rm_old_fr->run( { record => \@docids } );
372         $rm_old_sm->run( { source => \@docids } ) unless ($no_map);
373         $rm_old_tr->run( { source => \@docids } );
374         $rm_old_ar->run( { source => \@docids } );
375         $rm_old_sr->run( { source => \@docids } );
376         $rm_old_kr->run( { source => \@docids } );
377         $rm_old_ser->run( { source => \@docids } );
378
379         unless ($no_map) {
380                 my ($sm) = $create_source_map->run(@source_maps);
381                 unless (defined $sm) {
382                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord_source_map.batch.create!")
383                 }
384                 my ($mr) = $mr_update->run(@mr_list);
385                 unless (defined $mr) {
386                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord.batch.update!")
387                 }
388         }
389
390         my ($re) = $update_entry->run(@entry_list);
391         unless (defined $re) {
392                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.biblio.record_entry.batch.update!")
393         }
394
395         my ($rd) = $rd_create->run(@rd_list);
396         unless (defined $rd) {
397                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.record_descriptor.batch.create!")
398         }
399
400         my ($fr) = $fr_create->run(@ns_list);
401         unless (defined $fr) {
402                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.full_rec.batch.create!")
403         }
404
405         # step 5: insert the new metadata
406         for my $class ( qw/title author subject keyword series/ ) {
407                 my @md_list = ();
408                 for my $doc ( @mods_data ) {
409                         my ($did) = keys %$doc;
410                         my ($data) = values %$doc;
411
412                         my $fm_constructor = "Fieldmapper::metabib::${class}_field_entry";
413                         for my $row ( keys %{ $$data{$class} } ) {
414                                 next unless (exists $$data{$class}{$row});
415                                 next unless ($$data{$class}{$row}{value});
416                                 my $fm_obj = $fm_constructor->new;
417                                 $fm_obj->value( $$data{$class}{$row}{value} );
418                                 $fm_obj->field( $$data{$class}{$row}{field_id} );
419                                 $fm_obj->source( $did );
420                                 $log->debug("$class entry: ".$fm_obj->source." => ".$fm_obj->field." : ".$fm_obj->value, DEBUG);
421
422                                 push @md_list, $fm_obj;
423                         }
424                 }
425                         
426                 my ($cr) = $$create{$class}->run(@md_list);
427                 unless (defined $cr) {
428                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.${class}_field_entry.batch.create!")
429                 }
430         }
431
432         unless ($outer_xact) {
433                 $log->debug("Commiting transaction started by the WoRM.", INFO);
434                 my ($c) = $commit->run;
435                 unless (defined $c and $c) {
436                         $rollback->run;
437                         throw OpenSRF::EX::PANIC ("Couldn't COMMIT changes!")
438                 }
439         }
440
441         return $ret;
442 }
443 __PACKAGE__->register_method( 
444         api_name        => "open-ils.worm.wormize",
445         method          => "wormize",
446         api_level       => 1,
447         argc            => 1,
448 );
449 __PACKAGE__->register_method( 
450         api_name        => "open-ils.worm.wormize.no_map",
451         method          => "wormize",
452         api_level       => 1,
453         argc            => 1,
454 );
455 __PACKAGE__->register_method( 
456         api_name        => "open-ils.worm.wormize.batch",
457         method          => "wormize",
458         api_level       => 1,
459         argc            => 1,
460 );
461 __PACKAGE__->register_method( 
462         api_name        => "open-ils.worm.wormize.no_map.batch",
463         method          => "wormize",
464         api_level       => 1,
465         argc            => 1,
466 );
467
468
469 # --------------------------------------------------------------------------------
470
471
472 sub _marcxml_to_full_rows {
473
474         my $marcxml = shift;
475
476         my @ns_list;
477         
478         my $root = $marcxml->documentElement;
479
480         for my $tagline ( @{$root->getChildrenByTagName("leader")} ) {
481                 next unless $tagline;
482
483                 my $ns = new Fieldmapper::metabib::full_rec;
484
485                 $ns->tag( 'LDR' );
486                 my $val = NFD($tagline->textContent);
487                 $val =~ s/(\pM+)//gso;
488                 $ns->value( $val );
489
490                 push @ns_list, $ns;
491         }
492
493         for my $tagline ( @{$root->getChildrenByTagName("controlfield")} ) {
494                 next unless $tagline;
495
496                 my $ns = new Fieldmapper::metabib::full_rec;
497
498                 $ns->tag( $tagline->getAttribute( "tag" ) );
499                 my $val = NFD($tagline->textContent);
500                 $val =~ s/(\pM+)//gso;
501                 $ns->value( $val );
502
503                 push @ns_list, $ns;
504         }
505
506         for my $tagline ( @{$root->getChildrenByTagName("datafield")} ) {
507                 next unless $tagline;
508
509                 my $tag = $tagline->getAttribute( "tag" );
510                 my $ind1 = $tagline->getAttribute( "ind1" );
511                 my $ind2 = $tagline->getAttribute( "ind2" );
512
513                 for my $data ( $tagline->childNodes ) {
514                         next unless $data;
515
516                         my $ns = new Fieldmapper::metabib::full_rec;
517
518                         $ns->tag( $tag );
519                         $ns->ind1( $ind1 );
520                         $ns->ind2( $ind2 );
521                         $ns->subfield( $data->getAttribute( "code" ) );
522                         my $val = NFD($data->textContent);
523                         $val =~ s/(\pM+)//gso;
524                         $ns->value( lc($val) );
525
526                         push @ns_list, $ns;
527                 }
528         }
529         return @ns_list;
530 }
531
532 sub _get_field_value {
533
534         my( $root, $xpath ) = @_;
535
536         my $string = "";
537
538         # grab the set of matching nodes
539         my @nodes = $root->findnodes( $xpath );
540         for my $value (@nodes) {
541
542                 # grab all children of the node
543                 my @children = $value->childNodes();
544                 for my $child (@children) {
545
546                         # add the childs content to the growing buffer
547                         my $content = quotemeta($child->textContent);
548                         next if ($string =~ /$content/);  # uniquify the values
549                         $string .= $child->textContent . " ";
550                 }
551                 if( ! @children ) {
552                         $string .= $value->textContent . " ";
553                 }
554         }
555         $string = NFD($string);
556         $string =~ s/(\pM)//gso;
557         return lc($string);
558 }
559
560
561 sub modsdoc_to_values {
562         my( $self, $mods ) = @_;
563         my $data = {};
564         for my $class (keys %$xpathset) {
565                 $data->{$class} = {};
566                 for my $type (keys %{$xpathset->{$class}}) {
567                         $data->{$class}->{$type} = {};
568                         $data->{$class}->{$type}->{value} = _get_field_value( $mods, $xpathset->{$class}->{$type}->{xpath} );
569                         $data->{$class}->{$type}->{field_id} = $xpathset->{$class}->{$type}->{id};
570                 }
571         }
572         return $data;
573 }
574
575
576 1;
577
578