1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
4 use OpenSRF::EX qw/:try/;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use OpenSRF::Utils::Logger qw/$logger/;
# Shared XML parser and XSLT processor, created once when the module loads.
13 my $parser = XML::LibXML->new();
14 my $xslt = XML::LibXSLT->new();
17 # ----------------------------------------------------------------------------------------
18 # XPATH for extracting info from a MODS doc
# Every expression below uses the "mods" namespace prefix, which
# get_field_value declares on the document root before evaluating an xpath.
# ISBN identifiers that carry no "invalid" attribute.
19 my $isbn_xpath = "//mods:mods/mods:identifier[\@type='isbn' and not(\@invalid)]";
# typeOfResource elements of the record.
20 my $resource_xpath = "//mods:mods/mods:typeOfResource";
# Issue date: dateIssued elements with encoding='marc', unioned with any
# dateIssued that is the first such child of its parent under originInfo.
21 my $pub_xpath = "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']|" .
22 "//mods:mods/mods:originInfo//mods:dateIssued[1]";
# Record identifier; start_mods_batch stores it under the "tcn" key.
23 my $tcn_xpath = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
# Publisher / edition elements that are the first such child of their parent
# under originInfo.
24 my $publisher_xpath = "//mods:mods/mods:originInfo//mods:publisher[1]";
25 my $edition_xpath = "//mods:mods/mods:originInfo//mods:edition[1]";
# Abstract; start_mods_batch stores it under the "synopsis" key.
26 my $abstract_xpath = "//mods:mods/mods:abstract";
# Empty, and not referenced anywhere in the visible part of this file.
27 my $related_xpath = "";
# URL elements under any mods:location (unlike the other expressions this one
# is not anchored to mods:mods).
28 my $online_loc_xpath = "//mods:location/mods:url";
# Union of the physicalDescription children: form, extent,
# reformattingQuality, internetMediaType and digitalOrigin.
29 my $physical_desc = "(//mods:mods/mods:physicalDescription/mods:form|//mods:mods/mods:physicalDescription/mods:extent|".
30 "//mods:mods/mods:physicalDescription/mods:reformattingQuality|//mods:mods/mods:physicalDescription/mods:internetMediaType|".
31 "//mods:mods/mods:physicalDescription/mods:digitalOrigin)";
# tableOfContents elements anywhere in the document.
32 my $toc_xpath = "//mods:tableOfContents";
# NOTE(review): the line that opens this structure and the lines naming each
# entry are not visible in this view. The groupings described below are read
# off the expressions themselves and off modsdoc_to_values, which looks entries
# up as $xpathset->{$class}->{$type} — confirm against the full file.
#
# titleInfo expressions, distinguished by the type attribute: abbreviated,
# translated, uniform, no type attribute at all, and finally any titleInfo.
38 "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
40 "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
42 "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
44 "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
46 "//mods:mods/mods:titleInfo",
# namePart children of mods:name elements. The first three expressions select
# corporate, personal and conference names and filter on a role whose text is
# 'creator' (either role/text or a marcrelator roleTerm); the last two select
# any personal namePart and any namePart that is the first such child of its
# parent name.
51 "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
52 "[../mods:role/mods:text[text()='creator']".
53 " or ../mods:role/mods:roleTerm[".
55 " and \@authority='marcrelator'".
56 " and text()='creator']".
59 "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
60 "[../mods:role/mods:text[text()='creator']".
61 " or ../mods:role/mods:roleTerm[".
63 " and \@authority='marcrelator'".
64 " and text()='creator']".
67 "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
68 "[../mods:role/mods:text[text()='creator']".
69 " or ../mods:role/mods:roleTerm[".
71 " and \@authority='marcrelator'".
72 " and text()='creator']".
75 "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
77 "//mods:mods/mods:name/*[local-name()='namePart'][1]",
# mods:subject elements that have at least one geographic, name, temporal or
# topic child; the expression steps back up to the parent subject so the whole
# subject element is returned rather than the individual child.
83 "//mods:mods/mods:subject/*[".
84 " local-name()='geographic'".
85 " or local-name()='name'".
86 " or local-name()='temporal'".
87 " or local-name()='topic'".
88 "]/parent::mods:subject",
# Disabled per-child subject expressions and a disabled keyword class.
91 # "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
93 # "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
95 # "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
97 # "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
99 #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },
# titleInfo of related items whose type is 'series'.
102 series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
105 # ----------------------------------------------------------------------------------------
# Constructor: blesses an empty hash into the invoking class (the first
# argument). No constructor arguments are read.
109 sub new { return bless( {}, shift() ); }
# Evaluates $xpath against a MODS document and gathers the text of the
# matching nodes into @string.
#   $mods  - document object; its documentElement is used as the search root
#   $xpath - XPath expression written with the "mods" prefix
#   $type  - optional; when it equals 'title', " : " is pushed ahead of the
#            text of any child whose node name matches subTitle
# NOTE(review): the declarations of @string, @child_text and @a, the
# if/else and error-handling structure, and the return statement are not
# visible in this view. Callers use the result in list context, so this
# presumably returns @string — confirm against the full file.
111 sub get_field_value {
113 my( $self, $mods, $xpath, $type) = @_;
117 my $root = $mods->documentElement;
# Declare the "mods" prefix for the MODS v3 namespace URI on the root element
# so that the prefixed xpaths can be evaluated against it.
118 $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );
121 # grab the set of matching nodes
122 my @nodes = $root->findnodes( $xpath );
123 for my $value (@nodes) {
125 # grab all children of the node
126 my @children = $value->childNodes();
128 for my $child (@children) {
129 # MODS strips the punctuation from 245abc, which often
130 # results in "title subtitle" rather than "title : subtitle";
131 # this hack gets it back for us
132 if ($type && $type eq 'title' && $child->nodeName =~ m/subTitle/) {
133 push(@child_text, " : ");
# Skip children whose nodeType is 3 (text nodes in XML::LibXML's node-type
# numbering); only the remaining children contribute text below.
135 next unless( $child->nodeType != 3 );
# A child that itself has child nodes contributes the space-joined text of
# those nodes ...
137 if($child->childNodes) {
139 for my $c (@{$child->childNodes}){
140 push @a, $c->textContent;
142 push(@child_text, join(' ', @a));
# ... otherwise (presumably the else branch — not visible here) its own text
# content is used.
145 push(@child_text, $child->textContent);
# The collected pieces for this matched node are stored as one array ref.
150 push(@string, \@child_text);
# Alternative path that stores the matched node's plain text instead of an
# array ref. NOTE(review): the condition selecting between the two pushes is
# not visible — confirm.
154 push(@string, $value->textContent );
# Diagnostic logging. NOTE(review): presumably inside an error handler where
# shift() yields the caught error; the enclosing construct is not visible.
158 $logger->info("MODS-izing failure: ".shift());
159 $logger->info("Failed MODS xml: ".$root->toString);
160 $logger->info("Failed MODS xpath: $xpath");
165 =head1 old implementation
167 sub _modsdoc_to_values {
168 my( $self, $mods ) = @_;
170 for my $class (keys %$xpathset) {
171 $data->{$class} = {};
172 for my $type(keys %{$xpathset->{$class}}) {
173 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
174 if( $class eq "subject" ) {
175 push( @{$data->{$class}->{$type}}, @value );
177 $data->{$class}->{$type} = $value[0];
# Walks the xpaths registered in $xpathset for the subject, title, author and
# series classes and stores what get_field_value finds for each of them in
# $data->{$class}->{$type}.
#   $mods - MODS document object, passed straight through to get_field_value
# NOTE(review): the declaration of $data, several block boundaries and the
# return statement are not visible in this view; callers treat the result as
# a hash ref keyed by class, so this presumably returns $data — confirm.
186 sub modsdoc_to_values {
187 my( $self, $mods ) = @_;
# Subjects: every value returned for a type is appended to that type's list.
191 my $class = "subject";
192 $data->{$class} = {};
193 for my $type(keys %{$xpathset->{$class}}) {
194 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
195 for my $arr (@value) {
196 push( @{$data->{$class}->{$type}}, $arr);
# Titles (the line assigning $class for this section is not visible). The
# "title" argument makes get_field_value insert " : " ahead of subtitles.
203 $data->{$class} = {};
204 for my $type(keys %{$xpathset->{$class}}) {
205 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type}, "title" );
206 for my $arr (@value) {
# Start the title with the first chunk of the value.
208 $data->{$class}->{$type} = shift @$arr;
# If that first chunk looks like a leading article, glue the next chunk on.
# NOTE(review): the alternation in this pattern is not grouped, so ^ applies
# only to the first alternative and \s?$ only to the last; a first chunk that
# merely contains e.g. "la", "el" or "the" also matches. Confirm whether a
# (?:...) group around the alternatives was intended.
210 my $t = lc($data->{$class}->{$type});
211 if($t and $t =~ /^l[eoa]s|l[ae]|el|the|un[ae]?|an?\s?$/o ) {
212 my $val = shift @$arr || "";
213 $data->{$class}->{$type} .= " $val" if $data->{$class}->{$type};
214 $data->{$class}->{$type} = " $val" unless $data->{$class}->{$type};
# NOTE(review): presumably appends each remaining chunk inside a loop whose
# header is not visible here — confirm.
218 $data->{$class}->{$type} .= " $t";
# Alternative assignment that stores the value as-is. NOTE(review): the
# condition selecting this path is not visible — confirm.
221 $data->{$class}->{$type} = $arr;
# Collapse runs of whitespace in the assembled title to single spaces.
224 $data->{$class}->{$type} =~ s/\s+/ /go if ($data->{$class}->{$type});
# Authors: only the first value returned for each type is kept.
229 my $class = "author";
230 $data->{$class} = {};
231 for my $type(keys %{$xpathset->{$class}}) {
232 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
233 $data->{$class}->{$type} = $value[0];
# Series: each value is appended to the type's list, either space-joined or
# unchanged. NOTE(review): the condition choosing between the two pushes is
# not visible — confirm.
238 my $class = "series";
239 $data->{$class} = {};
240 for my $type(keys %{$xpathset->{$class}}) {
241 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
242 for my $arr (@value) {
244 push(@{$data->{$class}->{$type}}, join(" ", @$arr));
246 push( @{$data->{$class}->{$type}}, $arr );
259 # ---------------------------------------------------------------------------
260 # Grabs the data 'we want' from the MODS doc and returns it in hash form
261 # ---------------------------------------------------------------------------
# Reduces the per-class hash built by modsdoc_to_values to a single title, a
# single author, the subjects and the series.
#   $modsperl - hash ref with optional title, author, subject and series keys
# Returns a hash ref with the keys series, title, author and subject.
# NOTE(review): the declarations of $title, $author, $subject, $series and
# $subh and several block boundaries are not visible in this view.
262 sub mods_values_to_mods_slim {
263 my( $self, $modsperl ) = @_;
# Title: the first true value among proper, translated, abbreviated, uniform
# and any; an empty string when there is no title data at all.
270 my $tmp = $modsperl->{title};
273 if(!$tmp) { $title = ""; }
275 ($title = $tmp->{proper}) ||
276 ($title = $tmp->{translated}) ||
277 ($title = $tmp->{abbreviated}) ||
278 ($title = $tmp->{uniform}) ||
279 ($title = $tmp->{any});
# Author: same first-true-value selection over personal, corporate,
# conference, other and any.
282 $tmp = $modsperl->{author};
283 if(!$tmp) { $author = ""; }
285 ($author = $tmp->{personal}) ||
286 ($author = $tmp->{corporate}) ||
287 ($author = $tmp->{conference}) ||
288 ($author = $tmp->{other}) ||
289 ($author = $tmp->{any});
# Subjects: the lists of every subject type are flattened into @$subject and
# then tallied in $subh, keyed by the first element of each entry.
# NOTE(review): with no subject data $subject is set to a hash ref, whereas
# the flattening below dereferences it as an array — presumably these are
# separate branches; confirm.
292 $tmp = $modsperl->{subject};
293 if(!$tmp) { $subject = {}; }
295 for my $key( keys %{$tmp}) {
296 push(@$subject, @{$tmp->{$key}}) if ($tmp->{$key});
# NOTE(review): the defined() test keys on $s (the entry itself) while both
# counter updates key on $s->[0], so the increment branch looks unreachable
# and every count would end up as 1 — confirm and key the test on $s->[0].
299 for my $s (@$subject) {
300 if(defined($subh->{$s})) { $subh->{$s->[0]}++ } else { $subh->{$s->[0]} = 1;}
# Series: the list stored under the 'series' type, or an empty list.
305 $tmp = $modsperl->{'series'};
306 if(!$tmp) { $series = []; }
307 else { $series = $tmp->{'series'}; }
310 return { series => $series, title => $title,
311 author => $author, subject => $subject };
316 # ---------------------------------------------------------------------------
317 # Initializes a MARC -> Unified MODS batch process
318 # ---------------------------------------------------------------------------
# Starts a batch: transforms the master MARCXML record to MODS and stores the
# extracted values in $self->{master_doc}.
#   $master_doc - MARCXML as a string (it is handed to parse_string)
# NOTE(review): some lines, including any guard around the stylesheet load
# and the end of the sub, are not visible in this view.
320 sub start_mods_batch {
322 my( $self, $master_doc ) = @_;
# Discard any state left over from a previous batch.
325 $self->{master_doc} = undef;
# Load the MARC21slim -> MODS 3.2 stylesheet from the configured xsl
# directory and compile it into the module-level $mods_sheet.
330 my $xslt_doc = $parser->parse_file(
331 OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') . "/MARC21slim2MODS32.xsl");
332 $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
336 my $xmldoc = $parser->parse_string($master_doc);
337 my $mods = $mods_sheet->transform($xmldoc);
# Title, author, subject and series come from the two-step reduction.
339 $self->{master_doc} = $self->modsdoc_to_values( $mods );
340 $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );
# The parenthesised assignments below keep only the first value returned by
# get_field_value; type_of_resource keeps all of them in an array ref.
342 ($self->{master_doc}->{isbn}) =
343 $self->get_field_value( $mods, $isbn_xpath );
345 $self->{master_doc}->{type_of_resource} =
346 [ $self->get_field_value( $mods, $resource_xpath ) ];
348 ($self->{master_doc}->{tcn}) =
349 $self->get_field_value( $mods, $tcn_xpath );
351 ($self->{master_doc}->{pubdate}) =
352 $self->get_field_value( $mods, $pub_xpath );
354 ($self->{master_doc}->{publisher}) =
355 $self->get_field_value( $mods, $publisher_xpath );
357 ($self->{master_doc}->{edition}) =
358 $self->get_field_value( $mods, $edition_xpath );
362 # ------------------------------
363 # holds a flat array of [ link, title, note, link, title, note, ... ]
364 $self->{master_doc}->{online_loc} = [];
# Three entries per URL: its text, its displayLabel attribute and its note
# attribute, with '' substituted for a missing or empty attribute.
365 for my $url ($mods->findnodes($online_loc_xpath)) {
366 push(@{$self->{master_doc}->{online_loc}}, $url->textContent);
367 push(@{$self->{master_doc}->{online_loc}}, $url->getAttribute('displayLabel') || '');
368 push(@{$self->{master_doc}->{online_loc}}, $url->getAttribute('note') || '');
371 ($self->{master_doc}->{synopsis}) =
372 $self->get_field_value( $mods, $abstract_xpath );
# Physical description: all matching values joined into one space-separated
# string.
374 $self->{master_doc}->{physical_description} = [];
375 push(@{$self->{master_doc}->{physical_description}},
376 $self->get_field_value( $mods, $physical_desc ) );
377 $self->{master_doc}->{physical_description} =
378 join( ' ', @{$self->{master_doc}->{physical_description}});
380 ($self->{master_doc}->{toc}) = $self->get_field_value($mods, $toc_xpath);
386 # ---------------------------------------------------------------------------
387 # Takes a MARCXML string and adds it to the growing MODS doc
388 # ---------------------------------------------------------------------------
# Transforms one more MARCXML record to MODS and merges its subjects, types
# of resource and (if still missing) ISBN into $self->{master_doc}.
#   $marcxml - MARCXML as a string (it is handed to parse_string)
# Uses the module-level $mods_sheet, which start_mods_batch assigns.
# NOTE(review): the else line between the two subject loops and the end of
# the sub are not visible in this view.
389 sub push_mods_batch {
390 my( $self, $marcxml ) = @_;
392 my $xmldoc = $parser->parse_string($marcxml);
393 my $mods = $mods_sheet->transform($xmldoc);
395 my $xmlperl = $self->modsdoc_to_values( $mods );
396 $xmlperl = $self->mods_values_to_mods_slim( $xmlperl );
398 # for backwards compatibility, remove the array part when all is decided
# Array form: append every subject to the master list.
399 if(ref($xmlperl->{subject}) eq 'ARRAY' ) {
400 for my $subject( @{$xmlperl->{subject}} ) {
401 push @{$self->{master_doc}->{subject}}, $subject;
# Hash form (presumably the else branch): bump the master count for each
# subject key, starting at 1 for a key not seen before.
404 for my $subject ( keys %{$xmlperl->{subject}} ) {
405 my $s = $self->{master_doc}->{subject};
406 if(defined($s->{$subject})) { $s->{$subject}++; } else { $s->{$subject} = 1; }
# Accumulate this record's types of resource; duplicates are removed later in
# finish_mods_batch.
410 push( @{$self->{master_doc}->{type_of_resource}},
411 $self->get_field_value( $mods, $resource_xpath ));
# Take this record's first ISBN only when the master record has none yet.
413 if(!($self->{master_doc}->{isbn}) ) {
414 ($self->{master_doc}->{isbn}) =
415 $self->get_field_value( $mods, $isbn_xpath );
420 # ---------------------------------------------------------------------------
421 # Completes a MARC -> Unified MODS batch process and returns the perl hash
422 # ---------------------------------------------------------------------------
# Creates a Fieldmapper::metabib::virtual_record and initialises its subject,
# types_of_resource and call_numbers fields to empty array refs.
# Takes no arguments. The banner comment above this sub describes
# finish_mods_batch, not this helper.
# NOTE(review): the return statement is not visible in this view;
# finish_mods_batch uses the result as the record, so this presumably returns
# $record — confirm.
423 sub init_virtual_record {
424 my $record = Fieldmapper::metabib::virtual_record->new;
425 $record->subject([]);
426 $record->types_of_resource([]);
427 $record->call_numbers([]);
# Ends a batch: copies the values accumulated in $self->{master_doc} into a
# virtual record object and clears the batch state.
# Returns undef when no batch is in progress.
# NOTE(review): the unpacking of $self, the declaration of @series and the
# final return are not visible in this view; presumably the populated
# $record is returned — confirm.
431 sub finish_mods_batch {
434 return undef unless $self->{master_doc};
436 my $perl = $self->{master_doc};
437 my $record = init_virtual_record();
439 # turn the hash into a fieldmapper object
440 #(my $title = $perl->{title}) =~ s/\[.*?\]//og;
441 #(my $author = $perl->{author}) =~ s/\(.*?\)//og;
442 my $title = $perl->{title};
443 my $author = $perl->{author};
# Keep only the part of each series string that precedes the first
# semicolon (whitespace directly before the semicolon is dropped too).
446 for my $s (@{$perl->{series}}) {
447 push @series, (split( /\s*;/, $s ))[0];
450 # uniquify the types of resource
# Going through hash keys removes duplicates but does not keep the original
# order.
451 my $rtypes = $perl->{type_of_resource};
452 my %hash = map { ($_ => 1) } @$rtypes;
453 $rtypes = [ keys %hash ];
455 $record->title($title);
456 $record->author($author);
# doc_id is copied from the master hash; nothing in the visible part of this
# file sets it.
458 $record->doc_id($perl->{doc_id});
459 $record->isbn($perl->{isbn});
460 $record->pubdate($perl->{pubdate});
461 $record->publisher($perl->{publisher});
462 $record->tcn($perl->{tcn});
464 $record->edition($perl->{edition});
466 $record->subject($perl->{subject});
467 $record->types_of_resource($rtypes);
468 $record->series(\@series);
470 $record->online_loc($perl->{online_loc});
471 $record->synopsis($perl->{synopsis});
472 $record->physical_description($perl->{physical_description});
473 $record->toc($perl->{toc});
# The batch is finished; drop the accumulated state.
475 $self->{master_doc} = undef;