adjust authority stuff; separate title and subtitle
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use Data::Dumper;
11
# Module-wide XML machinery shared by all the subs in this file.
my $parser = XML::LibXML->new;     # parses the stylesheet file and MARCXML strings
my $xslt   = XML::LibXSLT->new;    # turns the parsed stylesheet into a transformer
my $mods_sheet;                    # compiled stylesheet; set on first start_mods_batch call
15
16 # ----------------------------------------------------------------------------------------
17 # XPATH for extracting info from a MODS doc
# Single-value XPath expressions.  q{} quoting keeps the '@' of attribute
# tests literal, so no backslash escapes are needed.
my $isbn_xpath     = q{//mods:mods/mods:identifier[@type='isbn']};
my $resource_xpath = q{//mods:mods/mods:typeOfResource};

# Union of the MARC-encoded dateIssued and the first dateIssued of originInfo.
my $pub_xpath = join( '|',
    q{//mods:mods/mods:originInfo//mods:dateIssued[@encoding='marc']},
    q{//mods:mods/mods:originInfo//mods:dateIssued[1]},
);

my $tcn_xpath       = q{//mods:mods/mods:recordInfo/mods:recordIdentifier};
my $publisher_xpath = q{//mods:mods/mods:originInfo//mods:publisher[1]};
my $edition_xpath   = q{//mods:mods/mods:originInfo//mods:edition[1]};
my $abstract_xpath  = q{//mods:mods/mods:abstract};

# Placeholders: no expression has been defined for these two yet.
my $toc_xpath     = q{};
my $related_xpath = q{};

# Matches both the url elements and their displayLabel attributes.
my $online_loc_xpath = q{(//mods:location/mods:url|//mods:location/mods:url/@displayLabel)};
29
# XPath expressions grouped by field class, then by type within the class.
# modsdoc_to_values() evaluates the title, author, subject and series classes.
my $xpathset = {

    title => {
        # The MODS titleInfo type value is spelled 'abbreviated'; the former
        # 'abreviated' spelling could never match a conforming document.
        abbreviated =>
            q{//mods:mods/mods:titleInfo[mods:title and (@type='abbreviated')]},
        translated =>
            q{//mods:mods/mods:titleInfo[mods:title and (@type='translated')]},
        uniform =>
            q{//mods:mods/mods:titleInfo[mods:title and (@type='uniform')]},
        proper =>
            q{//mods:mods/mods:titleInfo[mods:title and not (@type)]},
    },

    author => {
        # The typed entries pick the first namePart of a name whose role
        # text is 'creator'.
        corporate =>
            q{//mods:mods/mods:name[@type='corporate']/*[local-name()='namePart']} .
                q{[../mods:role/mods:text[text()='creator']][1]},
        personal =>
            q{//mods:mods/mods:name[@type='personal']/*[local-name()='namePart']} .
                q{[../mods:role/mods:text[text()='creator']][1]},
        conference =>
            q{//mods:mods/mods:name[@type='conference']/*[local-name()='namePart']} .
                q{[../mods:role/mods:text[text()='creator']][1]},
        # Any personal namePart, whatever the role.
        other =>
            q{//mods:mods/mods:name[@type='personal']/*[local-name()='namePart']},
    },

    subject => {

        # Every subject element that has a child other than geographicCode.
        topic =>
            q{//mods:mods/mods:subject/*[local-name()!='geographicCode']/parent::mods:subject},

#       geographic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
#       name =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
#       temporal =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
#       topic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },
    #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

    series => {
        series => q{//mods:mods/mods:relatedItem[@type='series']/mods:titleInfo}
    }
};
77 # ----------------------------------------------------------------------------------------
78
79
80
# Constructor: returns an empty hash blessed into the invoking class.
sub new {
    my $class = shift;
    return bless {}, $class;
}
82
# Evaluates $xpath against the root element of the document $mods and returns
# a list with one entry per matching node:
#   * an array ref holding one string per non-text child of the node, when the
#     node has at least one such child (a child that itself has children
#     contributes their text contents joined with single spaces);
#   * otherwise the node's own text content.
# The "mods" prefix is bound to the MODS v3 namespace on the root element
# before the expression is evaluated.
sub get_field_value {

    my( $self, $mods, $xpath ) = @_;

    my $root = $mods->documentElement;
    $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

    my @results;
    for my $node ( $root->findnodes( $xpath ) ) {

        my @parts;
        for my $kid ( $node->childNodes() ) {

            # node type 3 is a text node; only non-text children are collected
            next if $kid->nodeType == 3;

            if( $kid->childNodes ) {
                push @parts, join( ' ', map { $_->textContent } @{ $kid->childNodes } );
            }
            else {
                push @parts, $kid->textContent;
            }
        }

        # no non-text children: fall back to the node's own text
        push @results, ( @parts ? \@parts : $node->textContent );
    }

    return @results;
}
123
=begin comment

Disabled generic variant of modsdoc_to_values, kept for reference only.

sub _modsdoc_to_values {
        my( $self, $mods ) = @_;
        my $data = {};
        for my $class (keys %$xpathset) {
                $data->{$class} = {};
                for my $type(keys %{$xpathset->{$class}}) {
                        my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
                        if( $class eq "subject" ) {
                                push( @{$data->{$class}->{$type}},  @value );
                        } else {
                                $data->{$class}->{$type} = $value[0];
                        }
                }
        }
        return $data;
}

=end comment

=cut
142
# Runs every expression in $xpathset against the MODS document $mods and
# returns a hash ref keyed by class, then by type:
#   subject => every match, exactly as get_field_value() returned it
#   title   => one string per type (a later match overwrites an earlier one)
#   author  => the first match per type
#   series  => array ref of strings, one per match
sub modsdoc_to_values {
    my( $self, $mods ) = @_;
    my $data = {};

    {
        my $class = "subject";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            # only create the per-type list when something matched
            push( @{$data->{$class}->{$type}}, @value ) if @value;
        }
    }

    {
        my $class = "title";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $entry (@value) {

                if( !ref($entry) ) {
                    $data->{$class}->{$type} = $entry;
                    next;
                }

                my @parts = @$entry;
                my $title = shift @parts;

                # Drop a leading element that is nothing but an article, and
                # only when another element follows it.  The old pattern
                # /^the|an?/ parsed as (^the)|(an?), so it fired on any text
                # containing the letter "a" and could leave the title undef.
                $title = shift @parts
                    if @parts && $title =~ /^\W*(?:the|an?)\W*$/i;

                # remaining elements are appended; ' : ' is inserted when the
                # text so far ends in a word character
                for my $part (@parts) {
                    $title .= ' : ' if ($title =~ /\w\s*$/);
                    $title .= " $part";
                }

                $data->{$class}->{$type} = $title;
            }
        }
    }

    {
        my $class = "author";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            $data->{$class}->{$type} = $value[0];
        }
    }

    {
        my $class = "series";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $entry (@value) {
                # multi-part matches are flattened into one space-joined string
                push( @{$data->{$class}->{$type}},
                    ref($entry) ? join(" ", @$entry) : $entry );
            }
        }
    }

    return $data;
}
205
206
207
208
209 # ---------------------------------------------------------------------------
210 # Grabs the data 'we want' from the MODS doc and returns it in hash form
211 # ---------------------------------------------------------------------------
# $modsperl is a hash ref shaped like the result of modsdoc_to_values().
# Returns a hash ref with four keys:
#   title   => first true value among the proper, translated, abbreviated and
#              uniform titles
#   author  => first true value among the personal, other, corporate and
#              conference authors
#   subject => hash ref mapping each subject heading (the first element of a
#              subject entry) to the number of times it occurs
#   series  => array ref of series strings (empty when there are none)
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $title  = "";
    my $author = "";

    if( my $titles = $modsperl->{title} ) {
        for my $type (qw/proper translated abbreviated uniform/) {
            $title = $titles->{$type};
            last if $title;
        }
    }

    if( my $authors = $modsperl->{author} ) {
        for my $type (qw/personal other corporate conference/) {
            $author = $authors->{$type};
            last if $author;
        }
    }

    # Tally the headings.  The old code tested $subh->{$s} (a stringified
    # array ref, never defined) before writing $subh->{$s->[0]}, so every
    # count stayed at 1.
    my %count_for;
    if( my $subjects = $modsperl->{subject} ) {
        for my $type (keys %$subjects) {
            next unless $subjects->{$type};
            for my $entry ( @{ $subjects->{$type} } ) {
                # tolerate a plain-string entry as well as an array ref
                my $heading = ref($entry) eq 'ARRAY' ? $entry->[0] : $entry;
                next unless defined $heading;
                $count_for{$heading}++;
            }
        }
    }

    # always hand back an array ref, even when no series was found
    my $series = [];
    if( my $all_series = $modsperl->{'series'} ) {
        $series = $all_series->{'series'} || [];
    }

    return { series => $series, title => $title,
            author => $author, subject => \%count_for };
}
261
262
263
264 # ---------------------------------------------------------------------------
265 # Initializes a MARC -> Unified MODS batch process
266 # ---------------------------------------------------------------------------
267
# Parses the MARCXML string $master_doc, transforms it with the
# MARC21slim2MODS3.xsl stylesheet (compiled on the first call and cached in
# $mods_sheet) and stores the extracted values in $self->{master_doc}.
sub start_mods_batch {

    my( $self, $master_doc ) = @_;

    # compile the stylesheet only once per process
    if( !$mods_sheet ) {
        my $xsl_dir  = OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl');
        my $xslt_doc = $parser->parse_file( $xsl_dir . "/MARC21slim2MODS3.xsl" );
        $mods_sheet  = $xslt->parse_stylesheet( $xslt_doc );
    }

    my $marc_doc = $parser->parse_string($master_doc);
    my $mods     = $mods_sheet->transform($marc_doc);

    # title / author / subject / series
    $self->{master_doc} = $self->modsdoc_to_values( $mods );
    my $doc = $self->{master_doc} =
        $self->mods_values_to_mods_slim( $self->{master_doc} );

    # fields that keep only the first match of their expression
    my %xpath_for = (
        isbn      => $isbn_xpath,
        tcn       => $tcn_xpath,
        pubdate   => $pub_xpath,
        publisher => $publisher_xpath,
        edition   => $edition_xpath,
    );
    for my $field (qw/isbn tcn pubdate publisher edition/) {
        ($doc->{$field}) = $self->get_field_value( $mods, $xpath_for{$field} );
    }

    # fields that keep every match
    $doc->{type_of_resource} = [ $self->get_field_value( $mods, $resource_xpath ) ];

    # holds an array of [ link, title, link, title, ... ]
    $doc->{online_loc} = [ $self->get_field_value( $mods, $online_loc_xpath ) ];

    ($doc->{synopsis}) =
        $self->get_field_value( $mods, $abstract_xpath );

}
320
321
322
323 # ---------------------------------------------------------------------------
324 # Takes a MARCXML string and adds it to the growing MODS doc
325 # ---------------------------------------------------------------------------
# Transforms the MARCXML string $marcxml with the already compiled stylesheet
# and merges its subjects and resource types into $self->{master_doc}.  The
# ISBN is copied only when the master record does not have one yet.
sub push_mods_batch {
    my( $self, $marcxml ) = @_;

    my $marc_doc = $parser->parse_string($marcxml);
    my $mods     = $mods_sheet->transform($marc_doc);

    my $slim = $self->modsdoc_to_values( $mods );
    $slim    = $self->mods_values_to_mods_slim( $slim );

    my $master = ( $self->{master_doc} ||= {} );

    # for backwards compatibility, remove the array part when all is decided
    if( ref($slim->{subject}) eq 'ARRAY' ) {
        for my $heading ( @{ $slim->{subject} } ) {
            push @{ $master->{subject} }, $heading;
        }
    } else {
        # each heading of this record adds one to the master tally
        my $tally = $master->{subject};
        for my $heading ( keys %{ $slim->{subject} } ) {
            $tally->{$heading}++;
        }
    }

    push( @{ $master->{type_of_resource} },
        $self->get_field_value( $mods, $resource_xpath ) );

    if( !($master->{isbn}) ) {
        ($master->{isbn}) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
355
356
357 # ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process: finish_mods_batch returns
# the collected values as a metabib::virtual_record object
359 # ---------------------------------------------------------------------------
# Helper: creates a Fieldmapper::metabib::virtual_record whose subject,
# types_of_resource and call_numbers fields start out as empty array refs.
sub init_virtual_record {
    # direct method call instead of the ambiguous indirect-object "new Class"
    my $record = Fieldmapper::metabib::virtual_record->new;
    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);
    return $record;
}
367
# Copies the values collected in $self->{master_doc} into a new virtual
# record, clears $self->{master_doc} and returns the record.
#   * bracketed text is stripped from the title, parenthesised text from the
#     author
#   * each series string is cut at its first ';'
#   * duplicate resource types are removed, keeping first-seen order
sub finish_mods_batch {
    my $self = shift;
    my $perl = $self->{master_doc};
    my $record = init_virtual_record();

    # turn the hash into a fieldmapper object
    # (the substitutions are skipped for undef values so no "uninitialized"
    # warning is raised; undef is still passed through unchanged)
    my $title = $perl->{title};
    $title =~ s/\[.*?\]//g if defined $title;

    my $author = $perl->{author};
    $author =~ s/\(.*?\)//g if defined $author;

    my @series;
    for my $s ( @{ $perl->{series} || [] } ) {
        next unless defined $s;
        push @series, (split( /\s*;/, $s ))[0];
    }

    # uniquify the types of resource; grep keeps the order stable where
    # "keys %hash" returned them in arbitrary order
    my %seen;
    my $rtypes = [ grep { !$seen{$_}++ } @{ $perl->{type_of_resource} || [] } ];

    $record->title($title);
    $record->author($author);

    $record->doc_id($perl->{doc_id});
    $record->isbn($perl->{isbn});
    $record->pubdate($perl->{pubdate});
    $record->publisher($perl->{publisher});
    $record->tcn($perl->{tcn});

    $record->edition($perl->{edition});

    $record->subject($perl->{subject});
    $record->types_of_resource($rtypes);
    $record->series(\@series);

    $record->online_loc($perl->{online_loc});
    $record->synopsis($perl->{synopsis});

    $self->{master_doc} = undef;
    return $record;
}
408
409
410