]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Utils/ModsParser.pm
f2c7c31f7e9950df5f264b845d593eec8eb7fc10
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use Data::Dumper;
11
12 my $parser              = XML::LibXML->new();
13 my $xslt                        = XML::LibXSLT->new();
14 my $mods_sheet;
15
16 # ----------------------------------------------------------------------------------------
17 # XPATH for extracting info from a MODS doc
18 my $isbn_xpath                  = "//mods:mods/mods:identifier[\@type='isbn']";
19 my $resource_xpath      = "//mods:mods/mods:typeOfResource";
20 my $pub_xpath                   = "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']|" . 
21                                                                 "//mods:mods/mods:originInfo//mods:dateIssued[1]";
22 my $tcn_xpath                   = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
23 my $publisher_xpath     = "//mods:mods/mods:originInfo//mods:publisher[1]";
24 my $edition_xpath               = "//mods:mods/mods:originInfo//mods:edition[1]";
25 my $abstract_xpath      = "//mods:mods/mods:abstract";
26 my $toc_xpath                   = "";
27 my $related_xpath               = "";
28 my $online_loc_xpath = "(//mods:location/mods:url|//mods:location/mods:url/\@displayLabel)";
29
30 my $xpathset = {
31
32         title => {
33                 abbreviated => 
34                         "//mods:mods/mods:titleInfo[mods:title and (\@type='abreviated')]",
35                 translated =>
36                         "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
37                 uniform =>
38                         "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
39                 proper =>
40                         "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
41                 any =>
42                         "//mods:mods/mods:titleInfo",
43         },
44
45         author => {
46                 corporate => 
47                         "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
48                                 "[../mods:role/mods:text[text()='creator']][1]",
49                 personal => 
50                         "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
51                                 "[../mods:role/mods:text[text()='creator']][1]",
52                 conference => 
53                         "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
54                                 "[../mods:role/mods:text[text()='creator']][1]",
55                 other => 
56                         "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
57         },
58
59         subject => {
60
61                 topic => 
62                         "//mods:mods/mods:subject/*[local-name()!='geographicCode']/parent::mods:subject",
63
64 #               geographic => 
65 #                       "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
66 #               name => 
67 #                       "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
68 #               temporal => 
69 #                       "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
70 #               topic => 
71 #                       "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
72         },
73         #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },
74
75         series => {
76                 series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
77         }
78 };
79 # ----------------------------------------------------------------------------------------
80
81
82
83 sub new { return bless( {}, shift() ); }
84
85 sub get_field_value {
86
87         my( $self, $mods, $xpath ) = @_;
88
89         my @string;
90         my $root = $mods->documentElement;
91         $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );
92
93         # grab the set of matching nodes
94         my @nodes = $root->findnodes( $xpath );
95         for my $value (@nodes) {
96
97                 # grab all children of the node
98                 my @children = $value->childNodes();
99                 my @child_text;
100                 for my $child (@children) {
101                         next unless( $child->nodeType != 3 );
102
103                         if($child->childNodes) {
104                                 my @a;
105                                 for my $c (@{$child->childNodes}){
106                                         push @a, $c->textContent;
107                                 }
108                                 push(@child_text, join(' ', @a));
109
110                         } else {
111                                 push(@child_text, $child->textContent); 
112                         }
113
114                 }
115                 if(@child_text) {
116                         push(@string, \@child_text);
117                 }
118
119                 if( !@child_text  ) {
120                         push(@string, $value->textContent );
121                 }
122         }
123         return @string;
124 }
125
126 =head
127 sub _modsdoc_to_values {
128         my( $self, $mods ) = @_;
129         my $data = {};
130         for my $class (keys %$xpathset) {
131                 $data->{$class} = {};
132                 for my $type(keys %{$xpathset->{$class}}) {
133                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
134                         if( $class eq "subject" ) {
135                                 push( @{$data->{$class}->{$type}},  @value );
136                         } else {
137                                 $data->{$class}->{$type} = $value[0];
138                         }
139                 }
140         }
141         return $data;
142 }
143 =cut
144
145 sub modsdoc_to_values {
146         my( $self, $mods ) = @_;
147         my $data = {};
148
149         {
150                 my $class = "subject";
151                 $data->{$class} = {};
152                 for my $type(keys %{$xpathset->{$class}}) {
153                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
154                         for my $arr (@value) {
155                                 push( @{$data->{$class}->{$type}},  $arr);
156                         }
157                 }
158         }
159
160         {
161                 my $class = "title";
162                 $data->{$class} = {};
163                 for my $type(keys %{$xpathset->{$class}}) {
164                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
165                         for my $arr (@value) {
166                                 if( ref($arr) ) {
167                                         $data->{$class}->{$type} = shift @$arr;
168                                         $data->{$class}->{$type} .= shift @$arr if (lc($data->{$class}->{$type}) =~ /^the|an?/o);
169                                         for my $t (@$arr) {
170                                                 $data->{$class}->{$type} .= ' : ' if ($data->{$class}->{$type} =~ /\w\s*$/o);
171                                                 $data->{$class}->{$type} .= " $t";
172                                         }
173                                 } else {
174                                         $data->{$class}->{$type} = $arr;
175                                 }
176                         }
177                 }
178         }
179
180         {
181                 my $class = "author";
182                 $data->{$class} = {};
183                 for my $type(keys %{$xpathset->{$class}}) {
184                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
185                         $data->{$class}->{$type} = $value[0];
186                 }
187         }
188
189         {
190                 my $class = "series";
191                 $data->{$class} = {};
192                 for my $type(keys %{$xpathset->{$class}}) {
193                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
194                         for my $arr (@value) {
195                                 if( ref($arr) ) {
196                                         push(@{$data->{$class}->{$type}}, join(" ", @$arr));
197                                 } else {
198                                         push( @{$data->{$class}->{$type}}, $arr );
199                                 }
200                         }
201                 }
202
203         }
204
205         return $data;
206 }
207
208
209
210
211 # ---------------------------------------------------------------------------
212 # Grabs the data 'we want' from the MODS doc and returns it in hash form
213 # ---------------------------------------------------------------------------
214 sub mods_values_to_mods_slim {
215         my( $self, $modsperl ) = @_;
216
217         my $title = "";
218         my $author = "";
219         my $subject = [];
220         my $series      = [];
221
222         my $tmp = $modsperl->{title};
223
224
225         if(!$tmp) { $title = ""; }
226         else {
227                 ($title = $tmp->{proper}) ||
228                 ($title = $tmp->{translated}) ||
229                 ($title = $tmp->{abbreviated}) ||
230                 ($title = $tmp->{uniform}) ||
231                 ($title = $tmp->{any});
232         }
233
234         $tmp = $modsperl->{author};
235         if(!$tmp) { $author = ""; }
236         else {
237                 ($author = $tmp->{personal}) ||
238                 ($author = $tmp->{other}) ||
239                 ($author = $tmp->{corporate}) ||
240                 ($author = $tmp->{conference}); 
241         }
242
243         $tmp = $modsperl->{subject};
244         if(!$tmp) { $subject = {}; } 
245         else {
246                 for my $key( keys %{$tmp}) {
247                         push(@$subject, @{$tmp->{$key}}) if ($tmp->{$key});
248                 }
249                 my $subh = {};
250                 for my $s (@$subject) {
251                         if(defined($subh->{$s})) { $subh->{$s->[0]}++ } else { $subh->{$s->[0]} = 1;}
252                 }
253                 $subject = $subh
254         }
255
256         $tmp = $modsperl->{'series'};
257         if(!$tmp) { $series = []; }
258         else { $series = $tmp->{'series'}; }
259
260
261         return { series => $series, title => $title, 
262                         author => $author, subject => $subject };
263 }
264
265
266
267 # ---------------------------------------------------------------------------
268 # Initializes a MARC -> Unified MODS batch process
269 # ---------------------------------------------------------------------------
270
271 sub start_mods_batch {
272
273         my( $self, $master_doc ) = @_;
274
275
276         if(!$mods_sheet) {
277                  my $xslt_doc = $parser->parse_file(
278                         OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') .  "/MARC21slim2MODS3.xsl");
279                 $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
280         }
281
282
283         my $xmldoc = $parser->parse_string($master_doc);
284         my $mods = $mods_sheet->transform($xmldoc);
285
286 #       warn "-" x 100 . "\n";
287 #       warn "MODS " . $mods->toString(1) . "\n";
288 #       warn "-" x 100 . "\n";
289
290         $self->{master_doc} = $self->modsdoc_to_values( $mods );
291         $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );
292
293         ($self->{master_doc}->{isbn}) = 
294                 $self->get_field_value( $mods, $isbn_xpath );
295
296         $self->{master_doc}->{type_of_resource} = 
297                 [ $self->get_field_value( $mods, $resource_xpath ) ];
298
299         ($self->{master_doc}->{tcn}) = 
300                 $self->get_field_value( $mods, $tcn_xpath );
301
302         ($self->{master_doc}->{pubdate}) = 
303                 $self->get_field_value( $mods, $pub_xpath );
304
305         ($self->{master_doc}->{publisher}) = 
306                 $self->get_field_value( $mods, $publisher_xpath );
307
308         ($self->{master_doc}->{edition}) =
309                 $self->get_field_value( $mods, $edition_xpath );
310
311
312
313 # ------------------------------
314         # holds an array of [ link, title, link, title, ... ]
315         $self->{master_doc}->{online_loc} = [];
316         push(@{$self->{master_doc}->{online_loc}},
317                 $self->get_field_value( $mods, $online_loc_xpath ));
318
319         ($self->{master_doc}->{synopsis}) = 
320                 $self->get_field_value( $mods, $abstract_xpath );
321
322 }
323
324
325
326 # ---------------------------------------------------------------------------
327 # Takes a MARCXML string and adds it to the growing MODS doc
328 # ---------------------------------------------------------------------------
329 sub push_mods_batch {
330         my( $self, $marcxml ) = @_;
331
332         my $xmldoc = $parser->parse_string($marcxml);
333         my $mods = $mods_sheet->transform($xmldoc);
334
335         my $xmlperl = $self->modsdoc_to_values( $mods );
336         $xmlperl = $self->mods_values_to_mods_slim( $xmlperl );
337
338         # for backwards compatibility, remove the array part when all is decided
339         if(ref($xmlperl->{subject}) eq 'ARRAY' ) {
340                 for my $subject( @{$xmlperl->{subject}} ) {
341                         push @{$self->{master_doc}->{subject}}, $subject;
342                 }
343         } else {
344                 for my $subject ( keys %{$xmlperl->{subject}} ) {
345                         my $s = $self->{master_doc}->{subject};
346                         if(defined($s->{$subject})) { $s->{$subject}++; } else { $s->{$subject} = 1; }
347                 }
348         }
349
350         push( @{$self->{master_doc}->{type_of_resource}}, 
351                 $self->get_field_value( $mods, $resource_xpath ));
352
353         if(!($self->{master_doc}->{isbn}) ) {
354                 ($self->{master_doc}->{isbn}) = 
355                         $self->get_field_value( $mods, $isbn_xpath );
356         }
357 }
358
359
360 # ---------------------------------------------------------------------------
361 # Completes a MARC -> Unified MODS batch process and returns the perl hash
362 # ---------------------------------------------------------------------------
363 sub init_virtual_record {
364         my $record = new Fieldmapper::metabib::virtual_record;
365         $record->subject([]);
366         $record->types_of_resource([]);
367         $record->call_numbers([]);
368         return $record;
369 }
370
371 sub finish_mods_batch {
372         my $self = shift;
373         my $perl = $self->{master_doc};
374         my $record = init_virtual_record();
375
376         # turn the hash into a fieldmapper object
377         (my $title = $perl->{title}) =~ s/\[.*?\]//og;
378         (my $author = $perl->{author}) =~ s/\(.*?\)//og;
379
380         my @series;
381         for my $s (@{$perl->{series}}) {
382                 push @series, (split( /\s*;/, $s ))[0];
383         }
384
385         # uniquify the types of resource
386         my $rtypes = $perl->{type_of_resource};
387         my %hash = map { ($_ => 1) } @$rtypes;
388         $rtypes = [ keys %hash ];
389
390         $record->title($title);
391         $record->author($author);
392
393         $record->doc_id($perl->{doc_id});
394         $record->isbn($perl->{isbn});
395         $record->pubdate($perl->{pubdate});
396         $record->publisher($perl->{publisher});
397         $record->tcn($perl->{tcn});
398
399         $record->edition($perl->{edition});
400
401         $record->subject($perl->{subject});
402         $record->types_of_resource($rtypes);
403         $record->series(\@series);
404
405         $record->online_loc($perl->{online_loc});
406         $record->synopsis($perl->{synopsis});
407
408         $self->{master_doc} = undef;
409         return $record;
410 }
411
412
413