]> git.evergreen-ils.org Git - working/Evergreen.git/blob - Open-ILS/src/perlmods/lib/OpenILS/Utils/ModsParser.pm
Merge branch 'master' of git.evergreen-ils.org:Evergreen into template-toolkit-opac
[working/Evergreen.git] / Open-ILS / src / perlmods / lib / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use OpenSRF::Utils::Logger qw/$logger/;
11 use Data::Dumper;
12
13 my $parser              = XML::LibXML->new();
14 my $xslt                        = XML::LibXSLT->new();
15 my $mods_sheet;
16
17 # ----------------------------------------------------------------------------------------
18 # XPATH for extracting info from a MODS doc
19 my $isbn_xpath                  = "//mods:mods/mods:identifier[\@type='isbn' and not(\@invalid)]";
20 my $resource_xpath      = "//mods:mods/mods:typeOfResource";
21 my $pub_xpath                   = "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']|" . 
22                                                                 "//mods:mods/mods:originInfo//mods:dateIssued[1]";
23 my $tcn_xpath                   = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
24 my $publisher_xpath     = "//mods:mods/mods:originInfo//mods:publisher[1]";
25 my $edition_xpath               = "//mods:mods/mods:originInfo//mods:edition[1]";
26 my $abstract_xpath      = "//mods:mods/mods:abstract";
27 my $related_xpath               = "";
28 my $online_loc_xpath = "//mods:location/mods:url";
29 my $physical_desc               = "(//mods:mods/mods:physicalDescription/mods:form|//mods:mods/mods:physicalDescription/mods:extent|".
30         "//mods:mods/mods:physicalDescription/mods:reformattingQuality|//mods:mods/mods:physicalDescription/mods:internetMediaType|".
31         "//mods:mods/mods:physicalDescription/mods:digitalOrigin)";
32 my $toc_xpath                   = "//mods:tableOfContents";
33
34 my $xpathset = {
35
36         title => {
37                 abbreviated => 
38                         "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
39                 translated =>
40                         "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
41                 uniform =>
42                         "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
43                 proper =>
44                         "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
45                 any =>
46                         "//mods:mods/mods:titleInfo",
47         },
48
49         author => {
50                 corporate => 
51                         "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
52                                 "[../mods:role/mods:text[text()='creator']".
53                                 " or ../mods:role/mods:roleTerm[".
54                                 "        \@type='text'".
55                                 "        and \@authority='marcrelator'".
56                                 "        and text()='creator']".
57                                 "][1]",
58                 personal => 
59                         "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
60                                 "[../mods:role/mods:text[text()='creator']".
61                                 " or ../mods:role/mods:roleTerm[".
62                                 "        \@type='text'".
63                                 "        and \@authority='marcrelator'".
64                                 "        and text()='creator']".
65                                 "][1]",
66                 conference => 
67                         "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
68                                 "[../mods:role/mods:text[text()='creator']".
69                                 " or ../mods:role/mods:roleTerm[".
70                                 "        \@type='text'".
71                                 "        and \@authority='marcrelator'".
72                                 "        and text()='creator']".
73                                 "][1]",
74                 other => 
75                         "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
76                 any => 
77                         "//mods:mods/mods:name/*[local-name()='namePart'][1]",
78         },
79
80         subject => {
81
82                 topic => 
83                         "//mods:mods/mods:subject/*[".
84                         "   local-name()='geographic'".
85                         "   or local-name()='name'".
86                         "   or local-name()='temporal'".
87                         "   or local-name()='topic'".
88                         "]/parent::mods:subject",
89
90 #               geographic => 
91 #                       "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
92 #               name => 
93 #                       "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
94 #               temporal => 
95 #                       "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
96 #               topic => 
97 #                       "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
98         },
99         #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },
100
101         series => {
102                 series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
103         }
104 };
105 # ----------------------------------------------------------------------------------------
106
107
108
109 sub new { return bless( {}, shift() ); }
110
111 sub get_field_value {
112
113         my( $self, $mods, $xpath, $type) = @_;
114
115         my @string;
116
117         my $root = $mods->documentElement;
118         $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );
119
120         try {
121                 # grab the set of matching nodes
122                 my @nodes = $root->findnodes( $xpath );
123                 for my $value (@nodes) {
124
125                         # grab all children of the node
126                         my @children = $value->childNodes();
127                         my @child_text;
128                         for my $child (@children) {
129                                 # MODS strips the punctuation from 245abc, which often
130                                 # results in "title subtitle" rather than "title : subtitle";
131                                 # this hack gets it back for us
132                                 if ($type && $type eq 'title' && $child->nodeName =~ m/subTitle/) {
133                                         push(@child_text, " : "); 
134                                 }
135                                 next unless( $child->nodeType != 3 );
136
137                                 if($child->childNodes) {
138                                         my @a;
139                                         for my $c (@{$child->childNodes}){
140                                                 push @a, $c->textContent;
141                                         }
142                                         push(@child_text, join(' ', @a));
143
144                                 } else {
145                                         push(@child_text, $child->textContent); 
146                                 }
147
148                         }
149                         if(@child_text) {
150                                 push(@string, \@child_text);
151                         }
152
153                         if( !@child_text  ) {
154                                 push(@string, $value->textContent );
155                         }
156                 }
157         } otherwise {
158                 $logger->info("MODS-izing failure: ".shift());
159                 $logger->info("Failed MODS xml: ".$root->toString);
160                 $logger->info("Failed MODS xpath: $xpath");
161         };
162         return @string;
163 }
164
165 =head
166 sub _modsdoc_to_values {
167         my( $self, $mods ) = @_;
168         my $data = {};
169         for my $class (keys %$xpathset) {
170                 $data->{$class} = {};
171                 for my $type(keys %{$xpathset->{$class}}) {
172                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
173                         if( $class eq "subject" ) {
174                                 push( @{$data->{$class}->{$type}},  @value );
175                         } else {
176                                 $data->{$class}->{$type} = $value[0];
177                         }
178                 }
179         }
180         return $data;
181 }
182 =cut
183
184 sub modsdoc_to_values {
185         my( $self, $mods ) = @_;
186         my $data = {};
187
188         {
189                 my $class = "subject";
190                 $data->{$class} = {};
191                 for my $type(keys %{$xpathset->{$class}}) {
192                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
193                         for my $arr (@value) {
194                                 push( @{$data->{$class}->{$type}},  $arr);
195                         }
196                 }
197         }
198
199         {
200                 my $class = "title";
201                 $data->{$class} = {};
202                 for my $type(keys %{$xpathset->{$class}}) {
203                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type}, "title" );
204                         for my $arr (@value) {
205                                 if( ref($arr) ) {
206                                         $data->{$class}->{$type} = shift @$arr;
207
208                                         my $t = lc($data->{$class}->{$type});
209                                         if($t and $t =~ /^l[eoa]s|l[ae]|el|the|un[ae]?|an?\s?$/o ) {
210                                                 my $val = shift @$arr || "";
211                                                 $data->{$class}->{$type} .= " $val" if $data->{$class}->{$type};
212                                                 $data->{$class}->{$type} = " $val" unless $data->{$class}->{$type};
213                                         }
214
215                                         for my $t (@$arr) {
216                                                 $data->{$class}->{$type} .= " $t";
217                                         }
218                                 } else {
219                                         $data->{$class}->{$type} = $arr;
220                                 }
221                         }
222                         $data->{$class}->{$type} =~ s/\s+/ /go if ($data->{$class}->{$type});
223                 }
224         }
225
226         {
227                 my $class = "author";
228                 $data->{$class} = {};
229                 for my $type(keys %{$xpathset->{$class}}) {
230                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
231                         $data->{$class}->{$type} = $value[0];
232                 }
233         }
234
235         {
236                 my $class = "series";
237                 $data->{$class} = {};
238                 for my $type(keys %{$xpathset->{$class}}) {
239                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
240                         for my $arr (@value) {
241                                 if( ref($arr) ) {
242                                         push(@{$data->{$class}->{$type}}, join(" ", @$arr));
243                                 } else {
244                                         push( @{$data->{$class}->{$type}}, $arr );
245                                 }
246                         }
247                 }
248
249         }
250
251         return $data;
252 }
253
254
255
256
257 # ---------------------------------------------------------------------------
258 # Grabs the data 'we want' from the MODS doc and returns it in hash form
259 # ---------------------------------------------------------------------------
260 sub mods_values_to_mods_slim {
261         my( $self, $modsperl ) = @_;
262
263         my $title = "";
264         my $author = "";
265         my $subject = [];
266         my $series      = [];
267
268         my $tmp = $modsperl->{title};
269
270
271         if(!$tmp) { $title = ""; }
272         else {
273                 ($title = $tmp->{proper}) ||
274                 ($title = $tmp->{translated}) ||
275                 ($title = $tmp->{abbreviated}) ||
276                 ($title = $tmp->{uniform}) ||
277                 ($title = $tmp->{any});
278         }
279
280         $tmp = $modsperl->{author};
281         if(!$tmp) { $author = ""; }
282         else {
283                 ($author = $tmp->{personal}) ||
284                 ($author = $tmp->{corporate}) ||
285                 ($author = $tmp->{conference}) ||
286                 ($author = $tmp->{other}) ||
287                 ($author = $tmp->{any}); 
288         }
289
290         $tmp = $modsperl->{subject};
291         if(!$tmp) { $subject = {}; } 
292         else {
293                 for my $key( keys %{$tmp}) {
294                         push(@$subject, @{$tmp->{$key}}) if ($tmp->{$key});
295                 }
296                 my $subh = {};
297                 for my $s (@$subject) {
298                         if(defined($subh->{$s})) { $subh->{$s->[0]}++ } else { $subh->{$s->[0]} = 1;}
299                 }
300                 $subject = $subh
301         }
302
303         $tmp = $modsperl->{'series'};
304         if(!$tmp) { $series = []; }
305         else { $series = $tmp->{'series'}; }
306
307
308         return { series => $series, title => $title, 
309                         author => $author, subject => $subject };
310 }
311
312
313
314 # ---------------------------------------------------------------------------
315 # Initializes a MARC -> Unified MODS batch process
316 # ---------------------------------------------------------------------------
317
318 sub start_mods_batch {
319
320         my( $self, $master_doc ) = @_;
321
322         if(!$master_doc) {
323                 $self->{master_doc} = undef;
324                 return;
325         }
326
327         if(!$mods_sheet) {
328                  my $xslt_doc = $parser->parse_file(
329                         OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') .  "/MARC21slim2MODS32.xsl");
330                 $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
331         }
332
333
334         my $xmldoc = $parser->parse_string($master_doc);
335         my $mods = $mods_sheet->transform($xmldoc);
336
337         $self->{master_doc} = $self->modsdoc_to_values( $mods );
338         $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );
339
340         ($self->{master_doc}->{isbn}) = 
341                 $self->get_field_value( $mods, $isbn_xpath );
342
343         $self->{master_doc}->{type_of_resource} = 
344                 [ $self->get_field_value( $mods, $resource_xpath ) ];
345
346         ($self->{master_doc}->{tcn}) = 
347                 $self->get_field_value( $mods, $tcn_xpath );
348
349         ($self->{master_doc}->{pubdate}) = 
350                 $self->get_field_value( $mods, $pub_xpath );
351
352         ($self->{master_doc}->{publisher}) = 
353                 $self->get_field_value( $mods, $publisher_xpath );
354
355         ($self->{master_doc}->{edition}) =
356                 $self->get_field_value( $mods, $edition_xpath );
357
358
359
360 # ------------------------------
361         # holds an array of [ link, title, link, title, ... ]
362         $self->{master_doc}->{online_loc} = [];
363         for my $url ($mods->findnodes($online_loc_xpath)) {
364                 push(@{$self->{master_doc}->{online_loc}}, $url->textContent);
365                 push(@{$self->{master_doc}->{online_loc}}, $url->getAttribute('displayLabel') || '');
366                 push(@{$self->{master_doc}->{online_loc}}, $url->getAttribute('note') || '');
367         }
368
369         ($self->{master_doc}->{synopsis}) = 
370                 $self->get_field_value( $mods, $abstract_xpath );
371
372         $self->{master_doc}->{physical_description} = [];
373         push(@{$self->{master_doc}->{physical_description}},
374                 $self->get_field_value( $mods, $physical_desc ) );
375         $self->{master_doc}->{physical_description} = 
376                 join( ' ', @{$self->{master_doc}->{physical_description}});
377
378         ($self->{master_doc}->{toc}) = $self->get_field_value($mods, $toc_xpath);
379
380 }
381
382
383
384 # ---------------------------------------------------------------------------
385 # Takes a MARCXML string and adds it to the growing MODS doc
386 # ---------------------------------------------------------------------------
387 sub push_mods_batch {
388         my( $self, $marcxml ) = @_;
389
390         my $xmldoc = $parser->parse_string($marcxml);
391         my $mods = $mods_sheet->transform($xmldoc);
392
393         my $xmlperl = $self->modsdoc_to_values( $mods );
394         $xmlperl = $self->mods_values_to_mods_slim( $xmlperl );
395
396         # for backwards compatibility, remove the array part when all is decided
397         if(ref($xmlperl->{subject}) eq 'ARRAY' ) {
398                 for my $subject( @{$xmlperl->{subject}} ) {
399                         push @{$self->{master_doc}->{subject}}, $subject;
400                 }
401         } else {
402                 for my $subject ( keys %{$xmlperl->{subject}} ) {
403                         my $s = $self->{master_doc}->{subject};
404                         if(defined($s->{$subject})) { $s->{$subject}++; } else { $s->{$subject} = 1; }
405                 }
406         }
407
408         push( @{$self->{master_doc}->{type_of_resource}}, 
409                 $self->get_field_value( $mods, $resource_xpath ));
410
411         if(!($self->{master_doc}->{isbn}) ) {
412                 ($self->{master_doc}->{isbn}) = 
413                         $self->get_field_value( $mods, $isbn_xpath );
414         }
415 }
416
417
418 # ---------------------------------------------------------------------------
419 # Completes a MARC -> Unified MODS batch process and returns the perl hash
420 # ---------------------------------------------------------------------------
421 sub init_virtual_record {
422         my $record = Fieldmapper::metabib::virtual_record->new;
423         $record->subject([]);
424         $record->types_of_resource([]);
425         $record->call_numbers([]);
426         return $record;
427 }
428
429 sub finish_mods_batch {
430         my $self = shift;
431
432         return undef unless $self->{master_doc};
433
434         my $perl = $self->{master_doc};
435         my $record = init_virtual_record();
436
437         # turn the hash into a fieldmapper object
438         #(my $title = $perl->{title}) =~ s/\[.*?\]//og;
439         #(my $author = $perl->{author}) =~ s/\(.*?\)//og;
440         my $title = $perl->{title};
441         my $author = $perl->{author};
442
443         my @series;
444         for my $s (@{$perl->{series}}) {
445                 push @series, (split( /\s*;/, $s ))[0];
446         }
447
448         # uniquify the types of resource
449         my $rtypes = $perl->{type_of_resource};
450         my %hash = map { ($_ => 1) } @$rtypes;
451         $rtypes = [ keys %hash ];
452
453         $record->title($title);
454         $record->author($author);
455
456         $record->doc_id($perl->{doc_id});
457         $record->isbn($perl->{isbn});
458         $record->pubdate($perl->{pubdate});
459         $record->publisher($perl->{publisher});
460         $record->tcn($perl->{tcn});
461
462         $record->edition($perl->{edition});
463
464         $record->subject($perl->{subject});
465         $record->types_of_resource($rtypes);
466         $record->series(\@series);
467
468         $record->online_loc($perl->{online_loc});
469         $record->synopsis($perl->{synopsis});
470         $record->physical_description($perl->{physical_description});
471         $record->toc($perl->{toc});
472
473         $self->{master_doc} = undef;
474         return $record;
475 }
476
477
478