]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Utils/ModsParser.pm
non-exploding modsparser
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use OpenSRF::Utils::Logger qw/$logger/;
11 use Data::Dumper;
12
# Shared, file-scoped XML machinery.
my $parser = XML::LibXML->new;   # parses the MARCXML input and the XSL file
my $xslt   = XML::LibXSLT->new;  # compiles the stylesheet
my $mods_sheet;                  # compiled stylesheet, filled in by start_mods_batch
16
# ----------------------------------------------------------------------------------------
# XPath expressions used to pull individual fields out of a MODS document.
# The "mods" prefix is bound to the MODS v3 namespace in get_field_value.
my $isbn_xpath      = q{//mods:mods/mods:identifier[@type='isbn']};
my $resource_xpath  = q{//mods:mods/mods:typeOfResource};
my $pub_xpath       = join( '|',
	q{//mods:mods/mods:originInfo//mods:dateIssued[@encoding='marc']},
	q{//mods:mods/mods:originInfo//mods:dateIssued[1]}
);
my $tcn_xpath       = q{//mods:mods/mods:recordInfo/mods:recordIdentifier};
my $publisher_xpath = q{//mods:mods/mods:originInfo//mods:publisher[1]};
my $edition_xpath   = q{//mods:mods/mods:originInfo//mods:edition[1]};
my $abstract_xpath  = q{//mods:mods/mods:abstract};
my $related_xpath   = q{};
my $online_loc_xpath = '(' . join( '|',
	q{//mods:location/mods:url},
	q{//mods:location/mods:url/@displayLabel}
) . ')';
# union of the physicalDescription children we report on
my $physical_desc   = '(' . join( '|',
	map { "//mods:physicalDescription/mods:" . $_ }
		qw/form extent reformattingQuality internetMediaType digitalOrigin/
) . ')';
my $toc_xpath       = q{//mods:tableOfContents};
33
# XPath expressions grouped by search class (title, author, subject, series)
# and, within each class, by field type.  modsdoc_to_values walks this table.
my $xpathset = {

	title => {
		# MODS spells this attribute value "abbreviated"; the former
		# single-b spelling could never match a titleInfo element.
		abbreviated => 
			"//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
		translated =>
			"//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
		uniform =>
			"//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
		# a titleInfo with no type attribute at all
		proper =>
			"//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
		any =>
			"//mods:mods/mods:titleInfo",
	},

	author => {
		# first namePart of a name whose role text is 'creator', per name type
		corporate => 
			"//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
				"[../mods:role/mods:text[text()='creator']][1]",
		personal => 
			"//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
				"[../mods:role/mods:text[text()='creator']][1]",
		conference => 
			"//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
				"[../mods:role/mods:text[text()='creator']][1]",
		# any personal namePart, regardless of role
		other => 
			"//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
	},

	subject => {

		# every subject element that has a geographic, name, temporal or topic child
		topic => 
			"//mods:mods/mods:subject/*[local-name()='geographic' or local-name()='name' or local-name()='temporal' or local-name()='topic']/parent::mods:subject",

#		geographic => 
#			"//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
#		name => 
#			"//mods:mods/*[local-name()='subject']/*[local-name()='name']",
#		temporal => 
#			"//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
#		topic => 
#			"//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
	},
	#keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

	series => {
		series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
	}
};
83 # ----------------------------------------------------------------------------------------
84
85
86
# Constructor: returns an empty hash blessed into the invoking class.
sub new {
	my $class = shift();
	my $self  = {};
	return bless( $self, $class );
}
88
# ---------------------------------------------------------------------------
# Evaluates $xpath against the MODS document $mods and returns a list with
# one entry per matching node:
#   - an array ref of strings, one per non-text child of the node, when the
#     node has any such children (a child that itself has children
#     contributes the text of those children joined by single spaces);
#   - otherwise the text content of the node itself.
# A missing document/root element yields an empty list.  Errors raised while
# searching are logged and whatever was collected so far is returned.
# ---------------------------------------------------------------------------
sub get_field_value {

	my( $self, $mods, $xpath ) = @_;

	my @string;

	# A failed or empty transform can leave us without a root element;
	# report it instead of dying on an undefined method call.
	my $root = ref($mods) ? $mods->documentElement : undef;
	unless( $root ) {
		$logger->info("MODS-izing failure: no document element to search");
		$logger->info("Failed MODS xpath: " . (defined($xpath) ? $xpath : ''));
		return @string;
	}

	try {
		# bind the "mods" prefix so the XPath expressions can use it
		$root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

		# walk the set of matching nodes
		for my $value ( $root->findnodes( $xpath ) ) {

			my @child_text;
			for my $child ( $value->childNodes() ) {
				next if $child->nodeType == 3; # 3 == text node; only sub-elements are split out

				my @grandchildren = $child->childNodes();
				if( @grandchildren ) {
					push( @child_text, join( ' ', map { $_->textContent } @grandchildren ) );
				} else {
					push( @child_text, $child->textContent );
				}
			}

			if( @child_text ) {
				push( @string, \@child_text );
			} else {
				# nothing but text inside: report the node's own text
				push( @string, $value->textContent );
			}
		}
	} otherwise {
		$logger->info("MODS-izing failure: ".shift());
		$logger->info("Failed MODS xml: ".$root->toString);
		$logger->info("Failed MODS xpath: $xpath");
	};
	return @string;
}
136
137 =head
138 sub _modsdoc_to_values {
139         my( $self, $mods ) = @_;
140         my $data = {};
141         for my $class (keys %$xpathset) {
142                 $data->{$class} = {};
143                 for my $type(keys %{$xpathset->{$class}}) {
144                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
145                         if( $class eq "subject" ) {
146                                 push( @{$data->{$class}->{$type}},  @value );
147                         } else {
148                                 $data->{$class}->{$type} = $value[0];
149                         }
150                 }
151         }
152         return $data;
153 }
154 =cut
155
# ---------------------------------------------------------------------------
# Runs every XPath in $xpathset against the MODS document $mods and returns
# a hash ref of the form { class => { type => value } } where
#   subject : value is an array ref holding every match as returned by
#             get_field_value
#   title   : value is a single string assembled from the matched
#             titleInfo's parts (the last matching node wins)
#   author  : value is the first match as returned by get_field_value
#   series  : value is an array ref of strings, one per match
# A type with no matches is left unset (subject, title, series) or undef
# (author).
# ---------------------------------------------------------------------------
sub modsdoc_to_values {
	my( $self, $mods ) = @_;
	my $data = {};

	{
		my $class = "subject";
		$data->{$class} = {};
		for my $type (keys %{$xpathset->{$class}}) {
			my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
			for my $arr (@value) {
				push( @{$data->{$class}->{$type}}, $arr );
			}
		}
	}

	{
		my $class = "title";
		$data->{$class} = {};
		for my $type (keys %{$xpathset->{$class}}) {
			my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
			for my $arr (@value) {
				if( ref($arr) ) {
					my $title = shift @$arr;
					$title = "" unless defined $title;

					# If the first part is nothing but a leading article,
					# glue the next part straight onto it.  The alternation
					# is grouped and anchored at both ends; the old pattern
					# /^the|an?/ also fired on any part containing an "a".
					if( $title =~ /^\s*(?:the|an?)\s*$/i ) {
						my $val = shift @$arr;
						$val = "" unless defined $val;
						$title .= " $val";
					}

					# remaining parts are set off with ' : ' when the text
					# so far ends in a word character
					for my $part (@$arr) {
						$title .= ' : ' if ($title =~ /\w\s*$/);
						$title .= " $part";
					}

					$data->{$class}->{$type} = $title;
				} else {
					$data->{$class}->{$type} = $arr;
				}
			}
		}
	}

	{
		my $class = "author";
		$data->{$class} = {};
		for my $type (keys %{$xpathset->{$class}}) {
			my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
			$data->{$class}->{$type} = $value[0];
		}
	}

	{
		my $class = "series";
		$data->{$class} = {};
		for my $type (keys %{$xpathset->{$class}}) {
			my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
			for my $arr (@value) {
				# multi-part series titles are flattened into one string
				push( @{$data->{$class}->{$type}}, ref($arr) ? join(" ", @$arr) : $arr );
			}
		}
	}

	return $data;
}
225
226
227
228
# ---------------------------------------------------------------------------
# Grabs the data 'we want' from the MODS doc and returns it in hash form
#
# $modsperl is a hash ref shaped like the result of modsdoc_to_values.
# Returns { title, author, subject, series } where
#   title   : first non-empty of proper, translated, abbreviated, uniform,
#             any ("" when none is set)
#   author  : first non-empty of personal, other, corporate, conference
#             ("" when none is set)
#   subject : hash ref mapping the first part of each subject entry to the
#             number of entries that start with it
#   series  : the array ref stored under series => series, or [] when absent
# ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {
	my( $self, $modsperl ) = @_;

	my $title   = "";
	my $author  = "";
	my $subject = {};
	my $series  = [];

	if( my $titles = $modsperl->{title} ) {
		for my $type (qw/proper translated abbreviated uniform any/) {
			my $candidate = $titles->{$type};
			next unless defined($candidate) and length($candidate);
			$title = $candidate;
			last;
		}
	}

	if( my $authors = $modsperl->{author} ) {
		for my $type (qw/personal other corporate conference/) {
			my $candidate = $authors->{$type};
			next unless defined($candidate) and length($candidate);
			$author = $candidate;
			last;
		}
	}

	if( my $subjects = $modsperl->{subject} ) {
		for my $key (keys %{$subjects}) {
			next unless $subjects->{$key};
			for my $entry (@{$subjects->{$key}}) {
				# entries are normally array refs of subject parts, but a
				# plain string is tolerated too
				my $heading = (ref($entry) eq 'ARRAY') ? $entry->[0] : $entry;
				next unless defined $heading;

				# tally under the same key we store under; the earlier code
				# tested a different key, so the count never passed 1
				$subject->{$heading}++;
			}
		}
	}

	my $serieses = $modsperl->{'series'};
	if( $serieses and $serieses->{'series'} ) {
		$series = $serieses->{'series'};
	}

	return { series => $series, title => $title, 
			author => $author, subject => $subject };
}
282
283
284
# ---------------------------------------------------------------------------
# Initializes a MARC -> Unified MODS batch process
#
# $master_doc is the MARCXML string of the lead record.  When it is false the
# batch is reset ($self->{master_doc} = undef) and nothing else happens.
# Otherwise the record is transformed to MODS and $self->{master_doc} is
# filled with the slimmed-down values plus the single-record fields below.
# ---------------------------------------------------------------------------

sub start_mods_batch {

	my( $self, $master_doc ) = @_;

	unless( $master_doc ) {
		$self->{master_doc} = undef;
		return;
	}

	# the stylesheet is compiled on first use and then kept
	unless( $mods_sheet ) {
		my $xsl_dir = OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl');
		my $xslt_doc = $parser->parse_file( $xsl_dir . "/MARC21slim2MODS3.xsl" );
		$mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
	}

	my $mods = $mods_sheet->transform( $parser->parse_string($master_doc) );

	# $doc and $self->{master_doc} refer to the same hash from here on
	my $doc = $self->{master_doc} = $self->modsdoc_to_values( $mods );
	$doc = $self->{master_doc} = $self->mods_values_to_mods_slim( $doc );

	# single-valued fields keep only the first match
	($doc->{isbn}) = $self->get_field_value( $mods, $isbn_xpath );

	$doc->{type_of_resource} = [ $self->get_field_value( $mods, $resource_xpath ) ];

	for my $pair (
		[ tcn       => $tcn_xpath ],
		[ pubdate   => $pub_xpath ],
		[ publisher => $publisher_xpath ],
		[ edition   => $edition_xpath ],
	) {
		my( $field, $xpath ) = @$pair;
		($doc->{$field}) = $self->get_field_value( $mods, $xpath );
	}

	# holds an array of [ link, title, link, title, ... ]
	$doc->{online_loc} = [ $self->get_field_value( $mods, $online_loc_xpath ) ];

	($doc->{synopsis}) = $self->get_field_value( $mods, $abstract_xpath );

	# all physical description pieces collapse into one space-separated string
	$doc->{physical_description} =
		join( ' ', $self->get_field_value( $mods, $physical_desc ) );

	($doc->{toc}) = $self->get_field_value($mods, $toc_xpath);

}
349
350
351
# ---------------------------------------------------------------------------
# Takes a MARCXML string and adds it to the growing MODS doc
#
# Folds the record's subjects and types of resource into
# $self->{master_doc}, and adopts its ISBN when the master doc has none.
# ---------------------------------------------------------------------------
sub push_mods_batch {
	my( $self, $marcxml ) = @_;

	# Compile the stylesheet here too, so this method does not depend on an
	# earlier start_mods_batch call having done it.
	if(!$mods_sheet) {
		my $xslt_doc = $parser->parse_file(
			OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') .  "/MARC21slim2MODS3.xsl");
		$mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
	}

	my $xmldoc = $parser->parse_string($marcxml);
	my $mods = $mods_sheet->transform($xmldoc);

	my $xmlperl = $self->modsdoc_to_values( $mods );
	$xmlperl = $self->mods_values_to_mods_slim( $xmlperl );

	# for backwards compatibility, remove the array part when all is decided
	if(ref($xmlperl->{subject}) eq 'ARRAY' ) {
		for my $subject( @{$xmlperl->{subject}} ) {
			push @{$self->{master_doc}->{subject}}, $subject;
		}
	} else {
		# Make sure the tally lives in the master doc; without this a
		# missing subject hash was autovivified in a local copy and lost.
		my $s = ( $self->{master_doc}->{subject} ||= {} );
		for my $subject ( keys %{$xmlperl->{subject}} ) {
			$s->{$subject}++;
		}
	}

	push( @{$self->{master_doc}->{type_of_resource}}, 
		$self->get_field_value( $mods, $resource_xpath ));

	if(!($self->{master_doc}->{isbn}) ) {
		($self->{master_doc}->{isbn}) = 
			$self->get_field_value( $mods, $isbn_xpath );
	}
}
384
385
386 # ---------------------------------------------------------------------------
387 # Completes a MARC -> Unified MODS batch process and returns the perl hash
388 # ---------------------------------------------------------------------------
# Builds a fresh Fieldmapper::metabib::virtual_record whose subject,
# types_of_resource and call_numbers fields start out as empty arrays.
sub init_virtual_record {
	# Explicit class-method call: this package defines its own new(), so the
	# indirect-object form "new Class" is open to being parsed as a call to it.
	my $record = Fieldmapper::metabib::virtual_record->new;
	$record->subject([]);
	$record->types_of_resource([]);
	$record->call_numbers([]);
	return $record;
}
396
# Ends the current batch: copies the accumulated $self->{master_doc} hash
# into a virtual record object, clears the batch state and returns the
# record.  Returns undef when no batch is in progress.
sub finish_mods_batch {
	my( $self ) = @_;

	my $perl = $self->{master_doc};
	return undef unless $perl;

	my $record = init_virtual_record();

	# strip [bracketed] text from the title and (parenthesised) text from the author
	my $title = $perl->{title};
	$title =~ s/\[.*?\]//g;
	my $author = $perl->{author};
	$author =~ s/\(.*?\)//g;

	# keep only what precedes the first ';' of each series entry
	my @series;
	for my $entry (@{$perl->{series}}) {
		push( @series, (split( /\s*;/, $entry ))[0] );
	}

	# uniquify the types of resource
	my @all_types = @{$perl->{type_of_resource}};
	my %seen;
	@seen{@all_types} = (1) x @all_types;
	my $rtypes = [ keys %seen ];

	# turn the hash into a fieldmapper object
	$record->title($title);
	$record->author($author);

	for my $field (qw/doc_id isbn pubdate publisher tcn edition subject/) {
		$record->$field( $perl->{$field} );
	}

	$record->types_of_resource($rtypes);
	$record->series(\@series);

	for my $field (qw/online_loc synopsis physical_description toc/) {
		$record->$field( $perl->{$field} );
	}

	$self->{master_doc} = undef;
	return $record;
}
442
443
444