1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
4 use OpenSRF::EX qw/:try/;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use OpenSRF::Utils::Logger qw/$logger/;
# Shared XML parser and XSLT processor, created once when the module loads.
# NOTE(review): XML::LibXML and XML::LibXSLT are not in the visible `use`
# list -- presumably loaded elsewhere; confirm.
my $parser = XML::LibXML->new();
my $xslt   = XML::LibXSLT->new();

# Compiled MARC -> MODS stylesheet.  start_mods_batch() assigns it and
# push_mods_batch() reads it, so under `use strict` it has to be declared
# as a file-scoped lexical.
my $mods_sheet;

# ----------------------------------------------------------------------------------------
# XPATH for extracting info from a MODS doc
my $isbn_xpath      = "//mods:mods/mods:identifier[\@type='isbn']";
my $resource_xpath  = "//mods:mods/mods:typeOfResource";

# dateIssued with encoding='marc', or the first dateIssued
my $pub_xpath       = "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']|" .
                      "//mods:mods/mods:originInfo//mods:dateIssued[1]";

my $tcn_xpath       = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
my $publisher_xpath = "//mods:mods/mods:originInfo//mods:publisher[1]";
my $edition_xpath   = "//mods:mods/mods:originInfo//mods:edition[1]";
my $abstract_xpath  = "//mods:mods/mods:abstract";

# Not referenced by the code visible in this file; kept in case other code uses it.
my $related_xpath   = "";

# url elements together with their displayLabel attributes
my $online_loc_xpath = "(//mods:location/mods:url|//mods:location/mods:url/\@displayLabel)";

# every physicalDescription sub-element of interest
my $physical_desc   = "(//mods:physicalDescription/mods:form|//mods:physicalDescription/mods:extent|".
                      "//mods:physicalDescription/mods:reformattingQuality|//mods:physicalDescription/mods:internetMediaType|".
                      "//mods:physicalDescription/mods:digitalOrigin)";

my $toc_xpath       = "//mods:tableOfContents";
# class => { type => XPath } lookup table that drives modsdoc_to_values().
# NOTE(review): the class/type key names are reconstructed from the keys that
# modsdoc_to_values() and mods_values_to_mods_slim() read; confirm them.
my $xpathset = {

    title => {

        # MODS spells this titleInfo type "abbreviated"; the former
        # 'abreviated' literal could never match an abbreviated title.
        abbreviated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",

        translated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",

        uniform =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",

        # a titleInfo that carries no type attribute
        proper =>
            "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",

        any =>
            "//mods:mods/mods:titleInfo",
    },

    author => {

        corporate =>
            "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
            "[../mods:role/mods:text[text()='creator']][1]",

        personal =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
            "[../mods:role/mods:text[text()='creator']][1]",

        conference =>
            "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
            "[../mods:role/mods:text[text()='creator']][1]",

        # any personal name part, whatever its role
        other =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
    },

    subject => {

        # Selects each mods:subject that has at least one geographic, name,
        # temporal or topic child.
        # NOTE(review): the key name 'topic' is an assumption; the code in this
        # file iterates over all subject keys and never reads the name itself.
        topic =>
            "//mods:mods/mods:subject/*[local-name()='geographic' or local-name()='name' or local-name()='temporal' or local-name()='topic']/parent::mods:subject",

        # Disabled per-facet alternatives:
        # "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
        # "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
        # "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
        # "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },

    #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

    series => {
        series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
    },
};
# ----------------------------------------------------------------------------------------
# Constructor: returns an empty hash blessed into the invoking class.
sub new {
    my $class = shift;
    my $self  = {};
    return bless $self, $class;
}
# ---------------------------------------------------------------------------
# Runs $xpath against the MODS document $mods and returns a list with one
# entry per matching node:
#   - an array ref holding the text of each non-text child (for a child that
#     has children of its own, their texts joined with a single space), or
#   - the node's own text content when it has no non-text children.
# Errors raised while extracting are logged and swallowed; whatever was
# collected up to that point is returned.
# ---------------------------------------------------------------------------
sub get_field_value {
    my( $self, $mods, $xpath ) = @_;

    my $root = $mods->documentElement;

    # bind the "mods" prefix used by the XPath expressions in this file
    $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

    my @results;

    try {

        for my $node ( $root->findnodes( $xpath ) ) {

            my @texts;
            for my $kid ( $node->childNodes() ) {

                # node type 3 is a text node; only non-text children count
                next if $kid->nodeType == 3;

                if( $kid->childNodes ) {
                    push( @texts, join( ' ', map { $_->textContent } @{$kid->childNodes} ) );
                } else {
                    push( @texts, $kid->textContent );
                }
            }

            # nodes without non-text children contribute their own text
            push( @results, ( @texts ? \@texts : $node->textContent ) );
        }

    # NOTE(review): assumes a catch-all handler clause; confirm.
    } otherwise {
        my $err = shift;
        $logger->info("MODS-izing failure: ".$err);
        $logger->info("Failed MODS xml: ".$root->toString);
        $logger->info("Failed MODS xpath: $xpath");
    };

    return @results;
}
# Generic extraction driven entirely by $xpathset: returns a hash ref of
# class => { type => value }.  For the "subject" class every match is kept
# (array ref); for every other class only the first match is stored.
sub _modsdoc_to_values {
    my( $self, $mods ) = @_;

    my %values_for;

    for my $class (keys %$xpathset) {

        my $by_type = $values_for{$class} = {};

        for my $type (keys %{$xpathset->{$class}}) {
            my @found = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );

            if( $class ne "subject" ) {
                $by_type->{$type} = $found[0];
                next;
            }

            push( @{$by_type->{$type}}, @found );
        }
    }

    return \%values_for;
}
# Joins the text parts of one title match (array ref) into a single string.
sub _title_from_parts {
    my( $parts ) = @_;

    my @rest  = @$parts;    # work on a copy, leave the caller's list alone
    my $title = shift @rest;

    # A first part that is nothing but an article ("the", "a", "an") is glued
    # to the following part with a plain space.  The pattern is grouped and
    # anchored on purpose: /^the|an?/ parses as "starts with 'the' OR contains
    # 'a'", which fired for almost every title.
    if( $title and lc($title) =~ /^\s*(?:the|an?)\s*$/ ) {
        my $next = shift(@rest) || "";
        $title .= " $next";
    }

    # every remaining part is appended, with ' : ' inserted when the title so
    # far ends in a word character
    for my $piece (@rest) {
        $title .= ' : ' if( $title =~ /\w\s*$/ );
        $title .= " $piece";
    }

    return $title;
}

# ---------------------------------------------------------------------------
# Extracts subject, title, author and series data from a MODS document using
# the XPath table in $xpathset.  Returns a hash ref of the form
#   { subject => { type => [ match, ... ] },
#     title   => { type => string },
#     author  => { type => first match },
#     series  => { type => [ string, ... ] } }
# ---------------------------------------------------------------------------
sub modsdoc_to_values {
    my( $self, $mods ) = @_;
    my $data = {};

    # subjects: keep every match exactly as get_field_value() returned it
    {
        my $class = "subject";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            # no matches => no key, as before
            push( @{$data->{$class}->{$type}}, @value ) if @value;
        }
    }

    # titles: one string per type; when several nodes match, the last one wins
    {
        my $class = "title";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {
                $data->{$class}->{$type} = ref($arr) ? _title_from_parts($arr) : $arr;
            }
        }
    }

    # authors: first match only
    {
        my $class = "author";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            $data->{$class}->{$type} = $value[0];
        }
    }

    # series: every match, multi-part matches joined with a space
    {
        my $class = "series";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {
                push( @{$data->{$class}->{$type}}, ( ref($arr) ? join(" ", @$arr) : $arr ) );
            }
        }
    }

    return $data;
}
# ---------------------------------------------------------------------------
# Grabs the data 'we want' from the MODS doc and returns it in hash form
#
# $modsperl is the class => { type => value } hash produced by
# modsdoc_to_values().  Returns a hash ref with the keys
#   title   - first true value of proper, translated, abbreviated, uniform, any
#   author  - first true value of personal, other, corporate, conference
#   subject - hash ref mapping a subject heading to its number of occurrences
#   series  - array ref of series strings (empty when there are none)
# ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $title   = "";
    my $author  = "";
    my $subject = {};
    my $series  = [];

    # An || chain yields its last operand when nothing is true, so a class
    # hash without any usable entry yields whatever the final key holds.
    if( my $titles = $modsperl->{title} ) {
        $title = $titles->{proper}
              || $titles->{translated}
              || $titles->{abbreviated}
              || $titles->{uniform}
              || $titles->{any};
    }

    if( my $authors = $modsperl->{author} ) {
        $author = $authors->{personal}
               || $authors->{other}
               || $authors->{corporate}
               || $authors->{conference};
    }

    if( my $subjects = $modsperl->{subject} ) {
        for my $type (keys %{$subjects}) {
            for my $entry ( @{ $subjects->{$type} || [] } ) {

                # an entry is normally an array ref of parts whose first
                # element is the heading; accept a bare string as well
                my $heading = ref($entry) eq 'ARRAY' ? $entry->[0] : $entry;
                next unless defined $heading;

                # Count by heading.  The previous test looked the array ref
                # itself up in the hash, never found it, and reset every
                # count to 1.
                $subject->{$heading}++;
            }
        }
    }

    if( my $all_series = $modsperl->{'series'} ) {
        # no series matches leave the key unset; hand back an empty list
        $series = $all_series->{'series'} || [];
    }

    return { series => $series, title => $title,
            author => $author, subject => $subject };
}
# ---------------------------------------------------------------------------
# Initializes a MARC -> Unified MODS batch process
#
# $master_doc is a MARCXML string.  It is transformed to MODS with the
# MARC21slim2MODS3.xsl stylesheet found in the configured 'xsl' directory,
# and the extracted values are stored in $self->{master_doc}.
# ---------------------------------------------------------------------------
sub start_mods_batch {
    my( $self, $master_doc ) = @_;

    $self->{master_doc} = undef;

    # The compiled stylesheet lives in a file-scoped variable that
    # push_mods_batch() reads as well, so parse and compile it only once
    # instead of on every batch.
    unless( $mods_sheet ) {
        my $xslt_doc = $parser->parse_file(
            OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') . "/MARC21slim2MODS3.xsl");
        $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
    }

    my $xmldoc = $parser->parse_string($master_doc);
    my $mods   = $mods_sheet->transform($xmldoc);

    $self->{master_doc} = $self->modsdoc_to_values( $mods );
    $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );

    my $doc = $self->{master_doc};

    # single-valued fields: keep the first match only (undef when none)
    my @first_match = (
        [ isbn      => $isbn_xpath      ],
        [ tcn       => $tcn_xpath       ],
        [ pubdate   => $pub_xpath       ],
        [ publisher => $publisher_xpath ],
        [ edition   => $edition_xpath   ],
        [ synopsis  => $abstract_xpath  ],
    );
    for my $pair (@first_match) {
        my( $field, $xpath ) = @$pair;
        ( $doc->{$field} ) = $self->get_field_value( $mods, $xpath );
    }

    # every type of resource; push_mods_batch() appends to this list
    $doc->{type_of_resource} = [ $self->get_field_value( $mods, $resource_xpath ) ];

    # ------------------------------
    # holds an array of [ link, title, link, title, ... ]
    $doc->{online_loc} = [ $self->get_field_value( $mods, $online_loc_xpath ) ];

    # all physical description pieces flattened into one string
    $doc->{physical_description} =
        join( ' ', $self->get_field_value( $mods, $physical_desc ) );

    ($doc->{toc}) = $self->get_field_value($mods, $toc_xpath);
}
# ---------------------------------------------------------------------------
# Takes a MARCXML string and adds it to the growing MODS doc
#
# Merges the record's subjects and types of resource into
# $self->{master_doc}, and takes its ISBN when the master doc has none yet.
# Relies on the stylesheet compiled by start_mods_batch().
# ---------------------------------------------------------------------------
sub push_mods_batch {
    my( $self, $marcxml ) = @_;

    my $xmldoc = $parser->parse_string($marcxml);
    my $mods   = $mods_sheet->transform($xmldoc);

    my $xmlperl = $self->modsdoc_to_values( $mods );
    $xmlperl = $self->mods_values_to_mods_slim( $xmlperl );

    my $master = $self->{master_doc};

    # for backwards compatibility, remove the array part when all is decided
    if(ref($xmlperl->{subject}) eq 'ARRAY' ) {

        for my $subject ( @{$xmlperl->{subject}} ) {
            push @{$self->{master_doc}->{subject}}, $subject;
        }

    } else {

        # Merge straight into the master hash (creating it if needed, so the
        # counts are never written to a throwaway hash) and add the incoming
        # count rather than a flat 1.
        my $counts   = $self->{master_doc}->{subject} ||= {};
        my $incoming = $xmlperl->{subject} || {};
        for my $heading ( keys %{$incoming} ) {
            $counts->{$heading} += $incoming->{$heading};
        }
    }

    push( @{$self->{master_doc}->{type_of_resource}},
        $self->get_field_value( $mods, $resource_xpath ));

    # first record that has an ISBN wins
    if(!($self->{master_doc}->{isbn}) ) {
        ($self->{master_doc}->{isbn}) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
# ---------------------------------------------------------------------------
# Builds the empty virtual record that finish_mods_batch() fills in when it
# completes a MARC -> Unified MODS batch process.  The subject,
# types_of_resource and call_numbers fields start out as empty array refs.
# ---------------------------------------------------------------------------
sub init_virtual_record {
    # direct method call instead of the ambiguous indirect-object `new Class`
    my $record = Fieldmapper::metabib::virtual_record->new;
    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);
    return $record;
}
# ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process: copies the collected
# master doc into a virtual record object, clears the master doc and returns
# the record.  Returns undef when no batch is in progress.
# ---------------------------------------------------------------------------
sub finish_mods_batch {
    my $self = shift;

    # explicit undef is kept so list-context callers still get one element
    return undef unless $self->{master_doc};

    my $perl   = $self->{master_doc};
    my $record = init_virtual_record();

    # turn the hash into a fieldmapper object

    # strip [bracketed] text from the title and (parenthesised) text from the
    # author; skip the substitution when the value is undef to avoid an
    # "uninitialized value" warning
    my $title = $perl->{title};
    $title =~ s/\[.*?\]//g if defined $title;

    my $author = $perl->{author};
    $author =~ s/\(.*?\)//g if defined $author;

    # keep only the part of each series string before the first ';'
    my @series;
    for my $s (@{$perl->{series}}) {
        push @series, (split( /\s*;/, $s ))[0];
    }

    # uniquify the types of resource, keeping first-seen order so the result
    # is deterministic
    my %seen;
    my $rtypes = [ grep { !$seen{$_}++ } @{ $perl->{type_of_resource} || [] } ];

    $record->title($title);
    $record->author($author);

    $record->doc_id($perl->{doc_id});
    $record->isbn($perl->{isbn});
    $record->pubdate($perl->{pubdate});
    $record->publisher($perl->{publisher});
    $record->tcn($perl->{tcn});

    $record->edition($perl->{edition});

    $record->subject($perl->{subject});
    $record->types_of_resource($rtypes);
    $record->series(\@series);

    $record->online_loc($perl->{online_loc});
    $record->synopsis($perl->{synopsis});
    $record->physical_description($perl->{physical_description});
    $record->toc($perl->{toc});

    # the batch is finished; a new one starts with start_mods_batch()
    $self->{master_doc} = undef;

    return $record;
}