2bc47d87a6c39229713933bc07431e8f289f4c8c
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use Data::Dumper;
11
# Shared XML tooling for the whole module: one parser and one XSLT processor,
# both created once when this file is loaded.
my $parser = XML::LibXML->new;
my $xslt   = XML::LibXSLT->new;

# Compiled stylesheet; stays undefined until start_mods_batch() first fills it in.
my $mods_sheet;
15
# ----------------------------------------------------------------------------------------
# XPath expressions used to pull individual values out of a MODS document.
# All of them rely on the "mods" namespace prefix being bound on the document.
# ----------------------------------------------------------------------------------------
my $isbn_xpath      = q{//mods:mods/mods:identifier[@type='isbn']};
my $resource_xpath  = q{//mods:mods/mods:typeOfResource};

# either a MARC-encoded issue date or the first issue date of an originInfo
my $pub_xpath = join( '|',
	q{//mods:mods/mods:originInfo//mods:dateIssued[@encoding='marc']},
	q{//mods:mods/mods:originInfo//mods:dateIssued[1]},
);

my $tcn_xpath       = q{//mods:mods/mods:recordInfo/mods:recordIdentifier};
my $publisher_xpath = q{//mods:mods/mods:originInfo//mods:publisher[1]};
my $edition_xpath   = q{//mods:mods/mods:originInfo//mods:edition[1]};
my $abstract_xpath  = q{//mods:mods/mods:abstract};
my $related_xpath   = q{};

# URL elements together with their displayLabel attributes
my $online_loc_xpath = '(' . join( '|',
	q{//mods:location/mods:url},
	q{//mods:location/mods:url/@displayLabel},
) . ')';

# every physicalDescription sub-element of interest, OR-ed into one expression
my $physical_desc = '(' . join( '|',
	map { "//mods:physicalDescription/mods:$_" }
		qw/form extent reformattingQuality internetMediaType digitalOrigin/
) . ')';

my $toc_xpath = q{//mods:tableOfContents};
32
# XPath expressions grouped by class (title, author, subject, series) and,
# within each class, by type.  modsdoc_to_values() walks this table and
# evaluates every expression against the MODS document.
my $xpathset = {

	title => {
		# MODS spells this titleInfo type "abbreviated"
		abbreviated => 
			"//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
		translated =>
			"//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
		uniform =>
			"//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
		# a titleInfo without any type attribute
		proper =>
			"//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
		any =>
			"//mods:mods/mods:titleInfo",
	},

	author => {
		# first namePart of a name whose role text is 'creator', per name type
		corporate => 
			"//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
				"[../mods:role/mods:text[text()='creator']][1]",
		personal => 
			"//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
				"[../mods:role/mods:text[text()='creator']][1]",
		conference => 
			"//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
				"[../mods:role/mods:text[text()='creator']][1]",
		# any personal namePart, regardless of role
		other => 
			"//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
	},

	subject => {

		# subject elements having at least one child that is not a geographicCode
		topic => 
			"//mods:mods/mods:subject/*[local-name()!='geographicCode']/parent::mods:subject",

#		geographic => 
#			"//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
#		name => 
#			"//mods:mods/*[local-name()='subject']/*[local-name()='name']",
#		temporal => 
#			"//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
#		topic => 
#			"//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
	},
	#keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

	series => {
		series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
	}
};
82 # ----------------------------------------------------------------------------------------
83
84
85
# Constructor: returns an empty hash blessed into the invoking class.
sub new {
	my $class = shift;
	return bless( {}, $class );
}
87
# Evaluates $xpath against the root element of the document $mods and returns
# one list entry per matching node:
#   - an array ref of strings (one per non-text child) when the node has
#     non-text children; a child that itself has children contributes the
#     space-joined text of those children
#   - otherwise the node's own text content as a plain string
# The MODS v3 namespace is set on the root under the "mods" prefix first.
sub get_field_value {

	my( $self, $mods, $xpath ) = @_;

	my $root = $mods->documentElement;
	$root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

	my @found;
	for my $node ( $root->findnodes( $xpath ) ) {

		my @parts;
		for my $kid ( $node->childNodes() ) {

			# node type 3 is a DOM text node; only other children are collected
			next if $kid->nodeType == 3;

			if( $kid->childNodes ) {
				push( @parts, join( ' ', map { $_->textContent } @{$kid->childNodes} ) );
			} else {
				push( @parts, $kid->textContent );
			}
		}

		if( @parts ) {
			push( @found, \@parts );
		} else {
			# nothing but text below this node: use its text as-is
			push( @found, $node->textContent );
		}
	}

	return @found;
}
128
=pod

Legacy generic version of modsdoc_to_values, disabled by this POD block and
kept for reference only.

130 sub _modsdoc_to_values {
131         my( $self, $mods ) = @_;
132         my $data = {};
133         for my $class (keys %$xpathset) {
134                 $data->{$class} = {};
135                 for my $type(keys %{$xpathset->{$class}}) {
136                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
137                         if( $class eq "subject" ) {
138                                 push( @{$data->{$class}->{$type}},  @value );
139                         } else {
140                                 $data->{$class}->{$type} = $value[0];
141                         }
142                 }
143         }
144         return $data;
145 }
146 =cut
147
# Joins the text parts of one titleInfo node into a single title string.
# A first part that is nothing but an article ("the", "a", "an") is glued to
# the following part with a space; every later part is appended after " : "
# when the title so far ends in a word character.
sub _assemble_title {
	my( $parts ) = @_;

	my @rest  = @$parts;
	my $title = shift @rest;

	# The whole first part must be the article.  An unanchored /^the|an?/
	# would match any title that merely contains the letter "a".
	if( @rest and $title =~ /^\s*(?:the|an?)\s*$/i ) {
		$title .= ' ' . shift @rest;
	}

	for my $part (@rest) {
		$title .= ' : ' if ($title =~ /\w\s*$/);
		$title .= " $part";
	}

	return $title;
}

# Runs every XPath in $xpathset against the MODS document $mods and returns
# a hash ref keyed by class, then by type:
#   subject => { type => [ values as returned by get_field_value ] }
#   title   => { type => assembled title string (last matching node wins) }
#   author  => { type => first value found, or undef }
#   series  => { type => [ one string per matching node ] }
# A subject or series type with no match has no key; a title type with no
# match has no key either.
sub modsdoc_to_values {
	my( $self, $mods ) = @_;
	my $data = {};

	{
		my $class = "subject";
		$data->{$class} = {};
		for my $type(keys %{$xpathset->{$class}}) {
			my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
			# guard keeps the key absent when nothing matched
			push( @{$data->{$class}->{$type}}, @value ) if @value;
		}
	}

	{
		my $class = "title";
		$data->{$class} = {};
		for my $type(keys %{$xpathset->{$class}}) {
			my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
			for my $arr (@value) {
				$data->{$class}->{$type} = ref($arr) ? _assemble_title($arr) : $arr;
			}
		}
	}

	{
		my $class = "author";
		$data->{$class} = {};
		for my $type(keys %{$xpathset->{$class}}) {
			my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
			$data->{$class}->{$type} = $value[0];
		}
	}

	{
		my $class = "series";
		$data->{$class} = {};
		for my $type(keys %{$xpathset->{$class}}) {
			my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
			for my $arr (@value) {
				# multi-part series titles are flattened to one string
				push( @{$data->{$class}->{$type}}, ref($arr) ? join(" ", @$arr) : $arr );
			}
		}
	}

	return $data;
}
210
211
212
213
# ---------------------------------------------------------------------------
# Grabs the data 'we want' from the MODS doc and returns it in hash form
#
# $modsperl is the class/type hash built by modsdoc_to_values().  Returns a
# hash ref with the keys:
#   title   => first true value among proper, translated, abbreviated,
#              uniform, any ("" when there is no title class at all)
#   author  => first true value among personal, other, corporate, conference
#              ("" when there is no author class at all)
#   subject => { heading => number of times that heading occurred }
#   series  => array ref of series strings (empty when none)
# ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {
	my( $self, $modsperl ) = @_;

	my $title = "";
	if( my $titles = $modsperl->{title} ) {
		for my $type (qw/proper translated abbreviated uniform any/) {
			$title = $titles->{$type};
			last if $title;
		}
	}

	my $author = "";
	if( my $authors = $modsperl->{author} ) {
		for my $type (qw/personal other corporate conference/) {
			$author = $authors->{$type};
			last if $author;
		}
	}

	my $subject = {};
	if( my $subjects = $modsperl->{subject} ) {
		for my $type (keys %$subjects) {
			next unless $subjects->{$type};
			for my $entry (@{$subjects->{$type}}) {
				# an entry is normally an array ref of parts and the first part is
				# the heading; plain strings are counted under their own value
				my $heading = ref($entry) eq 'ARRAY' ? $entry->[0] : $entry;
				next unless defined $heading;
				# tally under the same key that is being tested, so repeats add up
				$subject->{$heading}++;
			}
		}
	}

	# always hand back an array ref, even when no series was found
	my $series = [];
	if( my $tmp = $modsperl->{'series'} ) {
		$series = $tmp->{'series'} || [];
	}

	return { series => $series, title => $title, 
			author => $author, subject => $subject };
}
267
268
269
# ---------------------------------------------------------------------------
# Initializes a MARC -> Unified MODS batch process
#
# $master_doc is a MARCXML string.  It is transformed to MODS with the
# MARC21slim2MODS3.xsl stylesheet (compiled on first use and cached in
# $mods_sheet) and the extracted values are stored in $self->{master_doc}.
# ---------------------------------------------------------------------------

sub start_mods_batch {

	my( $self, $master_doc ) = @_;

	unless( $mods_sheet ) {
		my $xsl_path = 
			OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') .  "/MARC21slim2MODS3.xsl";
		my $xslt_doc = $parser->parse_file( $xsl_path );
		$mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
	}

	my $xmldoc = $parser->parse_string($master_doc);
	my $mods = $mods_sheet->transform($xmldoc);

#	warn "-" x 100 . "\n";
#	warn "MODS " . $mods->toString(1) . "\n";
#	warn "-" x 100 . "\n";

	$self->{master_doc} = $self->modsdoc_to_values( $mods );
	my $doc = $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );

	($doc->{isbn}) = $self->get_field_value( $mods, $isbn_xpath );

	$doc->{type_of_resource} = [ $self->get_field_value( $mods, $resource_xpath ) ];

	# fields that keep only the first value found
	my @single_valued = (
		[ tcn       => $tcn_xpath ],
		[ pubdate   => $pub_xpath ],
		[ publisher => $publisher_xpath ],
		[ edition   => $edition_xpath ],
	);
	for my $pair (@single_valued) {
		my( $field, $xpath ) = @$pair;
		($doc->{$field}) = $self->get_field_value( $mods, $xpath );
	}

	# ------------------------------
	# holds an array of [ link, title, link, title, ... ]
	$doc->{online_loc} = [ $self->get_field_value( $mods, $online_loc_xpath ) ];

	($doc->{synopsis}) = $self->get_field_value( $mods, $abstract_xpath );

	# all physical description pieces collapsed into one space-separated string
	$doc->{physical_description} = 
		join( ' ', $self->get_field_value( $mods, $physical_desc ) );

	($doc->{toc}) = $self->get_field_value($mods, $toc_xpath);

}
334
335
336
# ---------------------------------------------------------------------------
# Takes a MARCXML string and adds it to the growing MODS doc
#
# Merges the record's subjects and types of resource into
# $self->{master_doc}, and takes its ISBN if the master doc has none yet.
# Uses the stylesheet cached in $mods_sheet by start_mods_batch().
# ---------------------------------------------------------------------------
sub push_mods_batch {
	my( $self, $marcxml ) = @_;

	my $xmldoc = $parser->parse_string($marcxml);
	my $mods = $mods_sheet->transform($xmldoc);

	my $values = $self->modsdoc_to_values( $mods );
	my $slim = $self->mods_values_to_mods_slim( $values );

	# for backwards compatibility, remove the array part when all is decided
	if( ref($slim->{subject}) eq 'ARRAY' ) {
		push @{$self->{master_doc}->{subject}}, $_ for @{$slim->{subject}};
	} else {
		for my $heading ( keys %{$slim->{subject}} ) {
			my $counts = $self->{master_doc}->{subject};
			# one more occurrence of this heading in the batch
			defined($counts->{$heading})
				? $counts->{$heading}++
				: ( $counts->{$heading} = 1 );
		}
	}

	push( @{$self->{master_doc}->{type_of_resource}}, 
		$self->get_field_value( $mods, $resource_xpath ));

	# first record that supplies an ISBN wins
	if(!($self->{master_doc}->{isbn}) ) {
		($self->{master_doc}->{isbn}) = 
			$self->get_field_value( $mods, $isbn_xpath );
	}
}
369
370
371 # ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process: finish_mods_batch returns
# the collected data as a Fieldmapper::metabib::virtual_record object
373 # ---------------------------------------------------------------------------
# Creates a Fieldmapper::metabib::virtual_record whose subject,
# types_of_resource and call_numbers fields start out as empty array refs.
sub init_virtual_record {
	# Explicit class-method call.  The indirect form "new Class" is ambiguous
	# here because this package defines its own sub new.
	my $record = Fieldmapper::metabib::virtual_record->new;
	$record->subject([]);
	$record->types_of_resource([]);
	$record->call_numbers([]);
	return $record;
}
381
# Ends the current batch: copies the values collected in $self->{master_doc}
# into a fresh virtual record (see init_virtual_record), clears
# $self->{master_doc} and returns the record.
#   - bracketed text is stripped from the title, parenthesised text from the author
#   - each series entry is cut at its first ';'
#   - types of resource are de-duplicated, keeping first-seen order
sub finish_mods_batch {
	my $self = shift;
	my $perl = $self->{master_doc};
	my $record = init_virtual_record();

	# turn the hash into a fieldmapper object
	# (title and author may be undefined when the record had none)
	my $title = $perl->{title};
	$title =~ s/\[.*?\]//g if defined $title;

	my $author = $perl->{author};
	$author =~ s/\(.*?\)//g if defined $author;

	my @series;
	for my $s (@{ $perl->{series} || [] }) {
		next unless defined $s;
		push @series, (split( /\s*;/, $s ))[0];
	}

	# uniquify the types of resource
	my %seen;
	my $rtypes = [ grep { !$seen{$_}++ } @{ $perl->{type_of_resource} || [] } ];

	$record->title($title);
	$record->author($author);

	$record->doc_id($perl->{doc_id});
	$record->isbn($perl->{isbn});
	$record->pubdate($perl->{pubdate});
	$record->publisher($perl->{publisher});
	$record->tcn($perl->{tcn});

	$record->edition($perl->{edition});

	$record->subject($perl->{subject});
	$record->types_of_resource($rtypes);
	$record->series(\@series);

	$record->online_loc($perl->{online_loc});
	$record->synopsis($perl->{synopsis});
	$record->physical_description($perl->{physical_description});
	$record->toc($perl->{toc});

	$self->{master_doc} = undef;
	return $record;
}
427
428
429