]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Application/Storage/WORM.pm
more install goodness, fixed some bugs, etc
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Application / Storage / WORM.pm
1 package OpenILS::Application::Storage::WORM;
2 use base qw/OpenILS::Application::Storage/;
3 use strict; use warnings;
4
5 use OpenSRF::EX qw/:try/;
6
7 use OpenSRF::Utils::Logger qw/:level/;
8 my $log = 'OpenSRF::Utils::Logger';
9
10 use OpenILS::Utils::FlatXML;
11 use OpenILS::Utils::Fieldmapper;
12 use JSON;
13
14 use XML::LibXML;
15 use XML::LibXSLT;
16 use Time::HiRes qw(time);
17
18 my $xml_util    = OpenILS::Utils::FlatXML->new();
19
20 my $parser              = XML::LibXML->new();
21 my $xslt                        = XML::LibXSLT->new();
22 my $mods_sheet;
23
24 use open qw/:utf8/;
25
26 my $xpathset = {};
27
28 sub child_init {
29         my $meth = __PACKAGE__->method_lookup('open-ils.storage.direct.config.metabib_field.retrieve.all');
30         for my $f ($meth->run) {
31                 $xpathset->{ $f->field_class }->{ $f->name }->{xpath} = $f->xpath;
32                 $xpathset->{ $f->field_class }->{ $f->name }->{id} = $f->id;
33                 $log->debug("Loaded XPath from DB: ".$f->field_class." => ".$f->name." : ".$f->xpath, DEBUG);
34         }
35 }
36
37 # --------------------------------------------------------------------------------
38 # Fingerprinting
39
40 my @fp_mods_xpath = (
41         '//mods:mods/mods:typeOfResource[text()="text"]' => [
42                         title   => {
43                                         xpath   => [
44                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="uniform")]',
45                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="translated")]',
46                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="alternative")]',
47                                                         '//mods:mods/mods:titleInfo[mods:title and not(@type)]',
48                                         ],
49                                         fixup   => '
50                                                         do {
51                                                                 $text = lc($text);
52                                                                 $text =~ s/\s+/ /sgo;
53                                                                 $text =~ s/^\s*(.+)\s*$/$1/sgo;
54                                                                 $text =~ s/\b(?:the|an?)\b//sgo;
55                                                                 $text =~ s/\[.[^\]]+\]//sgo;
56                                                                 $text =~ s/\s*[;\/\.]*$//sgo;
57                                                                 $text =~ s/\pM+//gso;
58                                                         };
59                                         ',
60                         },
61                         author  => {
62                                         xpath   => [
63                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
64                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
65                                         ],
66                                         fixup   => '
67                                                         do {
68                                                                 $text = lc($text);
69                                                                 $text =~ s/\s+/ /sgo;
70                                                                 $text =~ s/^\s*(.+)\s*$/$1/sgo;
71                                                                 $text =~ s/,?\s+.*$//sgo;
72                                                                 $text =~ s/\pM+//gso;
73                                                         };
74                                         ',
75                         },
76         ],
77
78         '//mods:mods/mods:relatedItem[@type!="host" and @type!="series"]' => [
79                         title   => {
80                                         xpath   => [
81                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="uniform")]',
82                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="translated")]',
83                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and (@type="alternative")]',
84                                                         '//mods:mods/mods:relatedItem/mods:titleInfo[mods:title and not(@type)]',
85                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="uniform")]',
86                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="translated")]',
87                                                         '//mods:mods/mods:titleInfo[mods:title and (@type="alternative")]',
88                                                         '//mods:mods/mods:titleInfo[mods:title and not(@type)]',
89                                         ],
90                                         fixup   => '
91                                                         do {
92                                                                 $text = lc($text);
93                                                                 $text =~ s/\s+/ /sgo;
94                                                                 $text =~ s/^\s*(.+)\s*$/$1/sgo;
95                                                                 $text =~ s/\b(?:the|an?)\b//sgo;
96                                                                 $text =~ s/\[.[^\]]+\]//sgo;
97                                                                 $text =~ s/\s*[;\/\.]*$//sgo;
98                                                                 $text =~ s/\pM+//gso;
99                                                         };
100                                         ',
101                         },
102                         author  => {
103                                         xpath   => [
104                                                         '//mods:mods/mods:relatedItem/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
105                                                         '//mods:mods/mods:relatedItem/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
106                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator" and @type="personal"]/mods:namePart',
107                                                         '//mods:mods/mods:name[mods:role/mods:text/text()="creator"]/mods:namePart',
108                                         ],
109                                         fixup   => '
110                                                         do {
111                                                                 $text = lc($text);
112                                                                 $text =~ s/\s+/ /sgo;
113                                                                 $text =~ s/^\s*(.+)\s*$/$1/sgo;
114                                                                 $text =~ s/,?\s+.*$//sgo;
115                                                                 $text =~ s/\pM+//gso;
116                                                         };
117                                         ',
118                         },
119         ],
120
121 );
122
123 push @fp_mods_xpath, '//mods:mods/mods:titleInfo' => $fp_mods_xpath[1];
124
125 sub fingerprint_mods {
126         my $mods = shift;
127
128         my $fp_string = '';
129
130         my $match_index = 0;
131         my $block_index = 1;
132         while ( my $match_xpath = $fp_mods_xpath[$match_index] ) {
133                 if ( my @nodes = $mods->findnodes( $match_xpath ) ) {
134
135                         my $block_name_index = 0;
136                         my $block_value_index = 1;
137                         my $block = $fp_mods_xpath[$block_index];
138                         while ( my $part = $$block[$block_value_index] ) {
139                                 my $text;
140                                 for my $xpath ( @{ $part->{xpath} } ) {
141                                         $text = $mods->findvalue( $xpath );
142                                         last if ($text);
143                                 }
144
145                                 $log->debug("Found fingerprint text using $$block[$block_name_index] : [$text]", DEBUG);
146
147                                 if ($text) {
148                                         eval $$part{fixup};
149                                         $fp_string .= $text;
150                                 }
151
152                                 $block_name_index += 2;
153                                 $block_value_index += 2;
154                         }
155                 }
156                 if ($fp_string) {
157                         $fp_string =~ s/\W+//gso;
158                         $log->debug("Fingerprint is [$fp_string]", INFO);;
159                         return $fp_string;
160                 }
161
162                 $match_index += 2;
163                 $block_index += 2;
164         }
165         return undef;
166 }
167
168
169
170 # --------------------------------------------------------------------------------
171
172 my $in_xact;
173 my $begin;
174 my $commit;
175 my $rollback;
176 my $lookup;
177 my $update_entry;
178 my $mr_lookup;
179 my $mr_update;
180 my $mr_create;
181 my $create_source_map;
182 my $sm_lookup;
183 my $rm_old_rd;
184 my $rm_old_sm;
185 my $rm_old_fr;
186 my $rm_old_tr;
187 my $rm_old_ar;
188 my $rm_old_sr;
189 my $rm_old_kr;
190 my $rm_old_ser;
191
192 my $fr_create;
193 my $rd_create;
194 my $create = {};
195
196 my %descriptor_code = (
197         item_type => 'substr($ldr,6,1)',
198         item_form => '(substr($ldr,6,1) =~ /^(?:f|g|i|m|o|p|r)$/) ? substr($oo8,29,1) : substr($oo8,23,1)',
199         bib_level => 'substr($ldr,7,1)',
200         control_type => 'substr($ldr,8,1)',
201         char_encoding => 'substr($ldr,9,1)',
202         enc_level => 'substr($ldr,17,1)',
203         cat_form => 'substr($ldr,18,1)',
204         pub_status => 'substr($ldr,5,1)',
205         item_lang => 'substr($oo8,35,3)',
206         #lit_form => '(substr($ldr,6,1) =~ /^(?:f|g|i|m|o|p|r)$/) ? substr($oo8,33,1) : "0"',
207         audience => 'substr($oo8,22,1)',
208 );
209
210 sub wormize {
211
212         my $self = shift;
213         my $client = shift;
214         my @docids = @_;
215
216         my $no_map = 0;
217         if ($self->api_name =~ /no_map/o) {
218                 $no_map = 1;
219         }
220
221         $in_xact = $self->method_lookup( 'open-ils.storage.transaction.current')
222                 unless ($in_xact);
223         $begin = $self->method_lookup( 'open-ils.storage.transaction.begin')
224                 unless ($begin);
225         $commit = $self->method_lookup( 'open-ils.storage.transaction.commit')
226                 unless ($commit);
227         $rollback = $self->method_lookup( 'open-ils.storage.transaction.rollback')
228                 unless ($rollback);
229         $sm_lookup = $self->method_lookup('open-ils.storage.direct.metabib.metarecord_source_map.search.source')
230                 unless ($sm_lookup);
231         $mr_lookup = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.search.fingerprint')
232                 unless ($mr_lookup);
233         $mr_update = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.batch.update')
234                 unless ($mr_update);
235         $mr_create = $self->method_lookup('open-ils.storage.direct.metabib.metarecord.create')
236                 unless ($mr_create);
237         $create_source_map = $self->method_lookup('open-ils.storage.direct.metabib.metarecord_source_map.batch.create')
238                 unless ($create_source_map);
239         $lookup = $self->method_lookup('open-ils.storage.direct.biblio.record_entry.batch.retrieve')
240                 unless ($lookup);
241         $update_entry = $self->method_lookup('open-ils.storage.direct.biblio.record_entry.batch.update')
242                 unless ($update_entry);
243         $rm_old_sm = $self->method_lookup( 'open-ils.storage.direct.metabib.metarecord_source_map.mass_delete')
244                 unless ($rm_old_sm);
245         $rm_old_rd = $self->method_lookup( 'open-ils.storage.direct.metabib.record_descriptor.mass_delete')
246                 unless ($rm_old_rd);
247         $rm_old_fr = $self->method_lookup( 'open-ils.storage.direct.metabib.full_rec.mass_delete')
248                 unless ($rm_old_fr);
249         $rm_old_tr = $self->method_lookup( 'open-ils.storage.direct.metabib.title_field_entry.mass_delete')
250                 unless ($rm_old_tr);
251         $rm_old_ar = $self->method_lookup( 'open-ils.storage.direct.metabib.author_field_entry.mass_delete')
252                 unless ($rm_old_ar);
253         $rm_old_sr = $self->method_lookup( 'open-ils.storage.direct.metabib.subject_field_entry.mass_delete')
254                 unless ($rm_old_sr);
255         $rm_old_kr = $self->method_lookup( 'open-ils.storage.direct.metabib.keyword_field_entry.mass_delete')
256                 unless ($rm_old_kr);
257         $rm_old_ser = $self->method_lookup( 'open-ils.storage.direct.metabib.series_field_entry.mass_delete')
258                 unless ($rm_old_ser);
259         $rd_create = $self->method_lookup( 'open-ils.storage.direct.metabib.record_descriptor.batch.create')
260                 unless ($rd_create);
261         $fr_create = $self->method_lookup( 'open-ils.storage.direct.metabib.full_rec.batch.create')
262                 unless ($fr_create);
263         $$create{title} = $self->method_lookup( 'open-ils.storage.direct.metabib.title_field_entry.batch.create')
264                 unless ($$create{title});
265         $$create{author} = $self->method_lookup( 'open-ils.storage.direct.metabib.author_field_entry.batch.create')
266                 unless ($$create{author});
267         $$create{subject} = $self->method_lookup( 'open-ils.storage.direct.metabib.subject_field_entry.batch.create')
268                 unless ($$create{subject});
269         $$create{keyword} = $self->method_lookup( 'open-ils.storage.direct.metabib.keyword_field_entry.batch.create')
270                 unless ($$create{keyword});
271         $$create{series} = $self->method_lookup( 'open-ils.storage.direct.metabib.series_field_entry.batch.create')
272                 unless ($$create{series});
273
274
275         my ($outer_xact) = $in_xact->run;
276         try {
277                 unless ($outer_xact) {
278                         $log->debug("WoRM isn't inside a transaction, starting one now.", INFO);
279                         my ($r) = $begin->run($client);
280                         unless (defined $r and $r) {
281                                 $rollback->run;
282                                 throw OpenSRF::EX::PANIC ("Couldn't BEGIN transaction!")
283                         }
284                 }
285         } catch Error with {
286                 throw OpenSRF::EX::PANIC ("WoRM Couldn't BEGIN transaction!")
287         };
288
289         my @source_maps;
290         my @entry_list;
291         my @mr_list;
292         my @rd_list;
293         my @ns_list;
294         my @mods_data;
295         my $ret = 0;
296         for my $entry ( $lookup->run(@docids) ) {
297                 # step -1: grab the doc from storage
298                 next unless ($entry);
299
300                 if(!$mods_sheet) {
301                         my $xslt_doc = $parser->parse_file(
302                                 OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') .  "/MARC21slim2MODS.xsl");
303                         $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
304                 }
305
306                 my $xml = $entry->marc;
307                 my $docid = $entry->id;
308                 my $marcdoc = $parser->parse_string($xml);
309                 my $modsdoc = $mods_sheet->transform($marcdoc);
310
311                 my $mods = $modsdoc->documentElement;
312                 $mods->setNamespace( "http://www.loc.gov/mods/", "mods", 1 );
313
314                 $entry->fingerprint( fingerprint_mods( $mods ) );
315                 push @entry_list, $entry;
316
317                 $log->debug("Fingerprint for Record Entry ".$docid." is [".$entry->fingerprint."]", INFO);
318
319                 unless ($no_map) {
320                         my ($mr) = $mr_lookup->run( $entry->fingerprint );
321                         if (!$mr || !@$mr) {
322                                 $log->debug("No metarecord found for fingerprint [".$entry->fingerprint."]; Creating a new one", INFO);
323                                 $mr = new Fieldmapper::metabib::metarecord;
324                                 $mr->fingerprint( $entry->fingerprint );
325                                 $mr->master_record( $entry->id );
326                                 my ($new_mr) = $mr_create->run($mr);
327                                 $mr->id($new_mr);
328                                 unless (defined $mr) {
329                                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord.create!")
330                                 }
331                         } else {
332                                 $log->debug("Retrieved metarecord, id is ".$mr->id, INFO);
333                                 $mr->mods('');
334                                 push @mr_list, $mr;
335                         }
336
337                         my $sm = new Fieldmapper::metabib::metarecord_source_map;
338                         $sm->metarecord( $mr->id );
339                         $sm->source( $entry->id );
340                         push @source_maps, $sm;
341                 }
342
343                 my $ldr = $marcdoc->documentElement->getChildrenByTagName('leader')->pop->textContent;
344                 my $oo8 = $marcdoc->documentElement->findvalue('//*[local-name()="controlfield" and @tag="008"]');
345
346                 my $rd_obj = Fieldmapper::metabib::record_descriptor->new;
347                 for my $rd_field ( keys %descriptor_code ) {
348                         $rd_obj->$rd_field( eval "$descriptor_code{$rd_field};" );
349                 }
350                 $rd_obj->record( $docid );
351                 push @rd_list, $rd_obj;
352
353                 push @mods_data, { $docid => $self->modsdoc_to_values( $mods ) };
354
355                 # step 2: build the KOHA rows
356                 my @tmp_list = _marcxml_to_full_rows( $marcdoc );
357                 $_->record( $docid ) for (@tmp_list);
358                 push @ns_list, @tmp_list;
359
360                 $ret++;
361
362                 last unless ($self->api_name =~ /batch$/o);
363         }
364
365         $rm_old_rd->run( { record => \@docids } );
366         $rm_old_fr->run( { record => \@docids } );
367         $rm_old_sm->run( { source => \@docids } ) unless ($no_map);
368         $rm_old_tr->run( { source => \@docids } );
369         $rm_old_ar->run( { source => \@docids } );
370         $rm_old_sr->run( { source => \@docids } );
371         $rm_old_kr->run( { source => \@docids } );
372         $rm_old_ser->run( { source => \@docids } );
373
374         unless ($no_map) {
375                 my ($sm) = $create_source_map->run(@source_maps);
376                 unless (defined $sm) {
377                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord_source_map.batch.create!")
378                 }
379                 my ($mr) = $mr_update->run(@mr_list);
380                 unless (defined $mr) {
381                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.metarecord.batch.update!")
382                 }
383         }
384
385         my ($re) = $update_entry->run(@entry_list);
386         unless (defined $re) {
387                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.biblio.record_entry.batch.update!")
388         }
389
390         my ($rd) = $rd_create->run(@rd_list);
391         unless (defined $rd) {
392                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.record_descriptor.batch.create!")
393         }
394
395         my ($fr) = $fr_create->run(@ns_list);
396         unless (defined $fr) {
397                 throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.full_rec.batch.create!")
398         }
399
400         # step 5: insert the new metadata
401         for my $class ( qw/title author subject keyword series/ ) {
402                 my @md_list = ();
403                 for my $doc ( @mods_data ) {
404                         my ($did) = keys %$doc;
405                         my ($data) = values %$doc;
406
407                         my $fm_constructor = "Fieldmapper::metabib::${class}_field_entry";
408                         for my $row ( keys %{ $$data{$class} } ) {
409                                 next unless (exists $$data{$class}{$row});
410                                 next unless ($$data{$class}{$row}{value});
411                                 my $fm_obj = $fm_constructor->new;
412                                 $fm_obj->value( $$data{$class}{$row}{value} );
413                                 $fm_obj->field( $$data{$class}{$row}{field_id} );
414                                 $fm_obj->source( $did );
415                                 $log->debug("$class entry: ".$fm_obj->source." => ".$fm_obj->field." : ".$fm_obj->value, DEBUG);
416
417                                 push @md_list, $fm_obj;
418                         }
419                 }
420                         
421                 my ($cr) = $$create{$class}->run(@md_list);
422                 unless (defined $cr) {
423                         throw OpenSRF::EX::PANIC ("Couldn't run open-ils.storage.direct.metabib.${class}_field_entry.batch.create!")
424                 }
425         }
426
427         unless ($outer_xact) {
428                 $log->debug("Commiting transaction started by the WoRM.", INFO);
429                 my ($c) = $commit->run;
430                 unless (defined $c and $c) {
431                         $rollback->run;
432                         throw OpenSRF::EX::PANIC ("Couldn't COMMIT changes!")
433                 }
434         }
435
436         return $ret;
437 }
438 __PACKAGE__->register_method( 
439         api_name        => "open-ils.worm.wormize",
440         method          => "wormize",
441         api_level       => 1,
442         argc            => 1,
443 );
444 __PACKAGE__->register_method( 
445         api_name        => "open-ils.worm.wormize.no_map",
446         method          => "wormize",
447         api_level       => 1,
448         argc            => 1,
449 );
450 __PACKAGE__->register_method( 
451         api_name        => "open-ils.worm.wormize.batch",
452         method          => "wormize",
453         api_level       => 1,
454         argc            => 1,
455 );
456 __PACKAGE__->register_method( 
457         api_name        => "open-ils.worm.wormize.no_map.batch",
458         method          => "wormize",
459         api_level       => 1,
460         argc            => 1,
461 );
462
463
464 # --------------------------------------------------------------------------------
465
466
467 sub _marcxml_to_full_rows {
468
469         my $marcxml = shift;
470
471         my @ns_list;
472         
473         my $root = $marcxml->documentElement;
474
475         for my $tagline ( @{$root->getChildrenByTagName("leader")} ) {
476                 next unless $tagline;
477
478                 my $ns = new Fieldmapper::metabib::full_rec;
479
480                 $ns->tag( 'LDR' );
481                 my $val = $tagline->textContent;
482                 $val =~ s/(\pM)//gso;
483                 $ns->value( $val );
484
485                 push @ns_list, $ns;
486         }
487
488         for my $tagline ( @{$root->getChildrenByTagName("controlfield")} ) {
489                 next unless $tagline;
490
491                 my $ns = new Fieldmapper::metabib::full_rec;
492
493                 $ns->tag( $tagline->getAttribute( "tag" ) );
494                 my $val = $tagline->textContent;
495                 $val =~ s/(\pM)//gso;
496                 $ns->value( $val );
497
498                 push @ns_list, $ns;
499         }
500
501         for my $tagline ( @{$root->getChildrenByTagName("datafield")} ) {
502                 next unless $tagline;
503
504                 my $tag = $tagline->getAttribute( "tag" );
505                 my $ind1 = $tagline->getAttribute( "ind1" );
506                 my $ind2 = $tagline->getAttribute( "ind2" );
507
508                 for my $data ( $tagline->childNodes ) {
509                         next unless $data;
510
511                         my $ns = new Fieldmapper::metabib::full_rec;
512
513                         $ns->tag( $tag );
514                         $ns->ind1( $ind1 );
515                         $ns->ind2( $ind2 );
516                         $ns->subfield( $data->getAttribute( "code" ) );
517                         my $val = $data->textContent;
518                         $val =~ s/(\pM)//gso;
519                         $ns->value( lc($val) );
520
521                         push @ns_list, $ns;
522                 }
523         }
524         return @ns_list;
525 }
526
527 sub _get_field_value {
528
529         my( $root, $xpath ) = @_;
530
531         my $string = "";
532
533         # grab the set of matching nodes
534         my @nodes = $root->findnodes( $xpath );
535         for my $value (@nodes) {
536
537                 # grab all children of the node
538                 my @children = $value->childNodes();
539                 for my $child (@children) {
540
541                         # add the childs content to the growing buffer
542                         my $content = quotemeta($child->textContent);
543                         next if ($string =~ /$content/);  # uniquify the values
544                         $string .= $child->textContent . " ";
545                 }
546                 if( ! @children ) {
547                         $string .= $value->textContent . " ";
548                 }
549         }
550         $string =~ s/(\pM)//gso;
551         return lc($string);
552 }
553
554
555 sub modsdoc_to_values {
556         my( $self, $mods ) = @_;
557         my $data = {};
558         for my $class (keys %$xpathset) {
559                 $data->{$class} = {};
560                 for my $type (keys %{$xpathset->{$class}}) {
561                         $data->{$class}->{$type} = {};
562                         $data->{$class}->{$type}->{value} = _get_field_value( $mods, $xpathset->{$class}->{$type}->{xpath} );
563                         $data->{$class}->{$type}->{field_id} = $xpathset->{$class}->{$type}->{id};
564                 }
565         }
566         return $data;
567 }
568
569
570 1;
571
572