Typo: MODS doesn't define "abreviated"
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use OpenSRF::Utils::Logger qw/$logger/;
11 use Data::Dumper;
12
# XML tooling shared by every method in this module; both objects are built
# once, when the module is loaded.
my $parser = XML::LibXML->new;
my $xslt   = XML::LibXSLT->new;

# Compiled MARC -> MODS stylesheet.  Left undefined here; start_mods_batch()
# fills it in the first time it is handed a document.
my $mods_sheet;
16
# ----------------------------------------------------------------------------------------
# XPath expressions for pulling single fields out of a MODS document.
# Every expression uses the "mods" prefix, which get_field_value() binds to
# the MODS v3 namespace before evaluating anything.
my $isbn_xpath     = q{//mods:mods/mods:identifier[@type='isbn']};
my $resource_xpath = q{//mods:mods/mods:typeOfResource};

# Union of the MARC-encoded dateIssued elements and the dateIssued[1] elements
# found under originInfo.
my $pub_xpath = join '|',
    q{//mods:mods/mods:originInfo//mods:dateIssued[@encoding='marc']},
    q{//mods:mods/mods:originInfo//mods:dateIssued[1]};

my $tcn_xpath       = q{//mods:mods/mods:recordInfo/mods:recordIdentifier};
my $publisher_xpath = q{//mods:mods/mods:originInfo//mods:publisher[1]};
my $edition_xpath   = q{//mods:mods/mods:originInfo//mods:edition[1]};
my $abstract_xpath  = q{//mods:mods/mods:abstract};

# Declared but empty; nothing visible in this file evaluates it.
my $related_xpath = q{};

# URL elements together with their displayLabel attributes.
my $online_loc_xpath =
    q{(//mods:location/mods:url|//mods:location/mods:url/@displayLabel)};

# Any of the listed physicalDescription children, as one parenthesised union.
my $physical_desc = '(' . join( '|',
    q{//mods:physicalDescription/mods:form},
    q{//mods:physicalDescription/mods:extent},
    q{//mods:physicalDescription/mods:reformattingQuality},
    q{//mods:physicalDescription/mods:internetMediaType},
    q{//mods:physicalDescription/mods:digitalOrigin},
) . ')';

my $toc_xpath = q{//mods:tableOfContents};
33
# XPath lookup table: first key is the field class (title, author, subject,
# series), second key is the kind of value wanted within that class.
my $xpathset = do {

    # titleInfo elements that contain a title and carry the given type.
    my $typed_title = sub {
        my ($kind) = @_;
        return "//mods:mods/mods:titleInfo[mods:title and (\@type='$kind')]";
    };

    # namePart children of name elements of the given type.
    my $name_part = sub {
        my ($name_type) = @_;
        return "//mods:mods/mods:name[\@type='$name_type']/*[local-name()='namePart']";
    };

    # Predicate shared by the corporate/personal/conference lookups: the
    # enclosing name must have a role of 'creator', given either as a role
    # text element or as a marcrelator text roleTerm; then take position 1.
    my $creator_only = join '',
        q{[../mods:role/mods:text[text()='creator']},
        q{ or ../mods:role/mods:roleTerm[},
        q{        @type='text'},
        q{        and @authority='marcrelator'},
        q{        and text()='creator']},
        q{][1]};

    +{
        title => {
            abbreviated => $typed_title->('abbreviated'),
            translated  => $typed_title->('translated'),
            uniform     => $typed_title->('uniform'),
            proper      => "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
            any         => "//mods:mods/mods:titleInfo",
        },

        author => {
            corporate  => $name_part->('corporate')  . $creator_only,
            personal   => $name_part->('personal')   . $creator_only,
            conference => $name_part->('conference') . $creator_only,
            other      => $name_part->('personal'),
            any        => "//mods:mods/mods:name/*[local-name()='namePart'][1]",
        },

        subject => {

            # Selects the subject element itself whenever it has at least one
            # geographic, name, temporal or topic child.
            topic => join( '',
                q{//mods:mods/mods:subject/*[},
                q{   local-name()='geographic'},
                q{   or local-name()='name'},
                q{   or local-name()='temporal'},
                q{   or local-name()='topic'},
                q{]/parent::mods:subject},
            ),

            # Per-facet lookups, currently disabled:
            #   geographic => "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
            #   name       => "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
            #   temporal   => "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
            #   topic      => "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
        },

        # Disabled keyword class:
        #   keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

        series => {
            series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo",
        },
    };
};
105 # ----------------------------------------------------------------------------------------
106
107
108
# Constructor.  Returns an empty hash blessed into the invoking class; any
# further arguments are ignored.
sub new {
    my $class = shift;
    return bless {}, $class;
}
110
# Evaluates an XPath expression against a MODS document and collects the text
# of whatever it matches.
#
# Arguments:
#   $mods  - document object; its documentElement is the search root
#   $xpath - XPath expression written with the "mods" prefix
#
# Returns a list with one entry per matched node:
#   * an array ref holding the text of each non-text child, when the node has
#     at least one such child;
#   * otherwise the node's own text content.
#
# Any exception raised while matching is logged and swallowed, and whatever
# was gathered up to that point is returned.
sub get_field_value {
    my( $self, $mods, $xpath ) = @_;

    my @found;

    # Bind and activate the "mods" prefix on the root so the XPath can use it.
    my $root = $mods->documentElement;
    $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

    try {
        for my $node ( $root->findnodes($xpath) ) {

            my @parts;
            for my $kid ( $node->childNodes() ) {

                # nodeType 3 is a text node; only non-text children count here
                next if $kid->nodeType == 3;

                if ( $kid->childNodes ) {
                    # one space-joined string built from the child's own children
                    push @parts,
                        join( ' ', map { $_->textContent } @{ $kid->childNodes } );
                }
                else {
                    push @parts, $kid->textContent;
                }
            }

            push @found, ( @parts ? \@parts : $node->textContent );
        }
    } otherwise {
        my $err = shift;
        $logger->info("MODS-izing failure: ".$err);
        $logger->info("Failed MODS xml: ".$root->toString);
        $logger->info("Failed MODS xpath: $xpath");
    };

    return @found;
}
158
159 =head
160 sub _modsdoc_to_values {
161         my( $self, $mods ) = @_;
162         my $data = {};
163         for my $class (keys %$xpathset) {
164                 $data->{$class} = {};
165                 for my $type(keys %{$xpathset->{$class}}) {
166                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
167                         if( $class eq "subject" ) {
168                                 push( @{$data->{$class}->{$type}},  @value );
169                         } else {
170                                 $data->{$class}->{$type} = $value[0];
171                         }
172                 }
173         }
174         return $data;
175 }
176 =cut
177
# Runs every XPath in $xpathset against a MODS document and gathers the
# results by class and type.
#
# Arguments:
#   $mods - MODS document object, passed straight to get_field_value()
#
# Returns a hash ref with four keys:
#   subject - { type => [ match, ... ] }   every match, as returned
#   title   - { type => "string" }         parts of the last match, assembled
#   author  - { type => first match }
#   series  - { type => [ "string", ... ] } parts of each match joined by ' '
# A type with no matches is left out of subject and series, and is undef in
# author.
sub modsdoc_to_values {
    my( $self, $mods ) = @_;
    my $data = {};

    {
        my $class = "subject";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            # guard keeps the key absent (rather than []) when nothing matched
            push( @{$data->{$class}->{$type}}, @value ) if @value;
        }
    }

    {
        my $class = "title";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {
                if( ref($arr) ) {
                    my $title = shift @$arr;

                    # When the first part is nothing but a leading article,
                    # glue the following part on with a space instead of the
                    # ' : ' separator.  The alternation is grouped so that the
                    # anchors cover every branch; ungrouped, any title that
                    # merely contained "el", "la", "the" or "un" would match.
                    my $lead = lc($title);
                    if( $lead and $lead =~ /^(?:l[eoa]s|l[ae]|el|the|un[ae]?|an?)\s?$/ ) {
                        my $val = (shift @$arr) || "";
                        $title .= " $val";
                    }

                    # Remaining parts are appended after ' : ' whenever the
                    # text so far ends in a word character.
                    for my $part (@$arr) {
                        $title .= ' : ' if ($title =~ /\w\s*$/);
                        $title .= " $part";
                    }

                    $data->{$class}->{$type} = $title;
                } else {
                    $data->{$class}->{$type} = $arr;
                }
            }
            # collapse every run of whitespace to a single space
            $data->{$class}->{$type} =~ s/\s+/ /g if ($data->{$class}->{$type});
        }
    }

    {
        my $class = "author";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            $data->{$class}->{$type} = $value[0];
        }
    }

    {
        my $class = "series";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {
                push( @{$data->{$class}->{$type}},
                    ref($arr) ? join(" ", @$arr) : $arr );
            }
        }
    }

    return $data;
}
248
249
250
251
252 # ---------------------------------------------------------------------------
253 # Grabs the data 'we want' from the MODS doc and returns it in hash form
254 # ---------------------------------------------------------------------------
# Boils the per-type structure produced by modsdoc_to_values() down to one
# value per field.
#
# Arguments:
#   $modsperl - hash ref with optional title, author, subject and series keys
#
# Returns a hash ref:
#   title   - first true value among proper, translated, abbreviated,
#             uniform, any ("" when there is no title class at all)
#   author  - first true value among personal, corporate, conference,
#             other, any ("" when there is no author class at all)
#   subject - hash ref mapping each subject's first term to the number of
#             times that term was seen
#   series  - array ref of series strings (empty when there are none)
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $title = "";
    if( my $titles = $modsperl->{title} ) {
        for my $kind (qw/proper translated abbreviated uniform any/) {
            $title = $titles->{$kind};
            last if $title;
        }
    }

    my $author = "";
    if( my $authors = $modsperl->{author} ) {
        for my $kind (qw/personal corporate conference other any/) {
            $author = $authors->{$kind};
            last if $author;
        }
    }

    # Count subjects by their first term.  The lookup and the increment must
    # use the same key; testing the array ref itself never finds an existing
    # entry and pins every count at 1.
    my $subject = {};
    if( my $subjects = $modsperl->{subject} ) {
        for my $key (keys %$subjects) {
            next unless $subjects->{$key};
            for my $entry (@{ $subjects->{$key} }) {
                # entries are normally array refs of terms; accept a bare string too
                my $term = ref($entry) ? $entry->[0] : $entry;
                next unless defined $term;
                $subject->{$term}++;
            }
        }
    }

    # Fall back to an empty list so callers can always dereference the result.
    my $series = [];
    if( my $all_series = $modsperl->{'series'} ) {
        $series = $all_series->{'series'} || [];
    }

    return { series => $series, title => $title,
            author => $author, subject => $subject };
}
306
307
308
309 # ---------------------------------------------------------------------------
310 # Initializes a MARC -> Unified MODS batch process
311 # ---------------------------------------------------------------------------
312
# Begins a batch: converts the master MARCXML record to MODS and stores the
# values extracted from it in $self->{master_doc}.
#
# Arguments:
#   $master_doc - MARCXML string; a false value just clears
#                 $self->{master_doc} and returns
sub start_mods_batch {
    my( $self, $master_doc ) = @_;

    unless( $master_doc ) {
        $self->{master_doc} = undef;
        return;
    }

    # The stylesheet is parsed once and kept in the file-level $mods_sheet.
    unless( $mods_sheet ) {
        my $xsl_dir  = OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl');
        my $xslt_doc = $parser->parse_file( $xsl_dir .  "/MARC21slim2MODS3.xsl" );
        $mods_sheet  = $xslt->parse_stylesheet( $xslt_doc );
    }

    my $mods = $mods_sheet->transform( $parser->parse_string($master_doc) );

    $self->{master_doc} = $self->modsdoc_to_values( $mods );
    $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );
    my $doc = $self->{master_doc};

    # Single-valued fields keep only the first match.
    ($doc->{isbn}) = $self->get_field_value( $mods, $isbn_xpath );

    $doc->{type_of_resource} = [ $self->get_field_value( $mods, $resource_xpath ) ];

    my @first_match_fields = (
        [ tcn       => $tcn_xpath ],
        [ pubdate   => $pub_xpath ],
        [ publisher => $publisher_xpath ],
        [ edition   => $edition_xpath ],
    );
    for my $pair (@first_match_fields) {
        my( $field, $xpath ) = @$pair;
        ($doc->{$field}) = $self->get_field_value( $mods, $xpath );
    }

    # holds an array of [ link, title, link, title, ... ]
    $doc->{online_loc} = [ $self->get_field_value( $mods, $online_loc_xpath ) ];

    ($doc->{synopsis}) = $self->get_field_value( $mods, $abstract_xpath );

    # every physical-description match, flattened into one space-joined string
    $doc->{physical_description} =
        join( ' ', $self->get_field_value( $mods, $physical_desc ) );

    ($doc->{toc}) = $self->get_field_value( $mods, $toc_xpath );
}
373
374
375
376 # ---------------------------------------------------------------------------
377 # Takes a MARCXML string and adds it to the growing MODS doc
378 # ---------------------------------------------------------------------------
# Folds one more MARCXML record into the batch held in $self->{master_doc}:
# its subjects and types of resource are added, and its ISBN is adopted when
# the batch does not have one yet.
#
# Arguments:
#   $marcxml - MARCXML string
#
# Uses the file-level $mods_sheet, which start_mods_batch() sets up.
sub push_mods_batch {
    my( $self, $marcxml ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($marcxml) );

    my $slim     = $self->mods_values_to_mods_slim( $self->modsdoc_to_values($mods) );
    my $incoming = $slim->{subject};

    # for backwards compatibility, remove the array part when all is decided
    if( ref($incoming) eq 'ARRAY' ) {
        push @{$self->{master_doc}->{subject}}, @$incoming if @$incoming;
    } else {
        # each subject present in this record bumps the batch tally by one
        for my $term ( keys %$incoming ) {
            my $tally = $self->{master_doc}->{subject};
            $tally->{$term}++;
        }
    }

    push( @{$self->{master_doc}->{type_of_resource}},
        $self->get_field_value( $mods, $resource_xpath ) );

    if( !($self->{master_doc}->{isbn}) ) {
        ($self->{master_doc}->{isbn}) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
408
409
410 # ---------------------------------------------------------------------------
411 # Completes a MARC -> Unified MODS batch process and returns the perl hash
412 # ---------------------------------------------------------------------------
# Creates a Fieldmapper::metabib::virtual_record whose subject,
# types_of_resource and call_numbers fields each start as an empty array ref.
sub init_virtual_record {
    my $record = Fieldmapper::metabib::virtual_record->new;
    for my $list_field (qw/subject types_of_resource call_numbers/) {
        $record->$list_field([]);
    }
    return $record;
}
420
# Ends the batch: copies the values gathered in $self->{master_doc} onto a
# fresh virtual record, clears the batch state and returns the record.
# Returns undef when no batch is in progress.
sub finish_mods_batch {
    my $self = shift;

    return undef unless $self->{master_doc};

    my $perl   = $self->{master_doc};
    my $record = init_virtual_record();

    # keep only the text in front of the first ';' of each series entry
    my @series;
    for my $entry (@{$perl->{series}}) {
        push @series, (split( /\s*;/, $entry ))[0];
    }

    # uniquify the types of resource
    my %unique;
    @unique{ @{ $perl->{type_of_resource} } } = ();
    my $rtypes = [ keys %unique ];

    # turn the hash into a fieldmapper object; these fields are copied as-is
    # through the accessor that shares the hash key's name
    my @copied = qw/
        title author doc_id isbn pubdate publisher tcn edition subject
        online_loc synopsis physical_description toc
    /;
    for my $field (@copied) {
        $record->$field( $perl->{$field} );
    }

    $record->types_of_resource($rtypes);
    $record->series(\@series);

    $self->{master_doc} = undef;
    return $record;
}
468
469
470