1 package OpenILS::Utils::ModsParser;
use strict; use warnings;
use OpenSRF::EX qw/:try/;
use XML::LibXML;
use XML::LibXSLT;
use Time::HiRes qw(time);
use OpenILS::Utils::Fieldmapper;
use OpenSRF::Utils::SettingsClient;
use Data::Dumper;
# Module-wide XML parser and XSLT processor, shared by every batch.
my $parser = XML::LibXML->new();
my $xslt   = XML::LibXSLT->new();

# Compiled MARC21slim -> MODS3 stylesheet. Assigned by start_mods_batch and
# read by push_mods_batch, so it must be declared at file scope.
my $mods_sheet;
16 # ----------------------------------------------------------------------------------------
17 # XPATH for extracting info from a MODS doc
# Single-valued and list-valued lookups used by the batch subs.
# Single-quoted so that '@' needs no escaping.
my $isbn_xpath      = q{//mods:mods/mods:identifier[@type='isbn']};
my $resource_xpath  = q{//mods:mods/mods:typeOfResource};

# Either a MARC-encoded issue date or the first issue date.
my $pub_xpath = join '|',
    q{//mods:mods/mods:originInfo//mods:dateIssued[@encoding='marc']},
    q{//mods:mods/mods:originInfo//mods:dateIssued[1]};

my $tcn_xpath       = q{//mods:mods/mods:recordInfo/mods:recordIdentifier};
my $publisher_xpath = q{//mods:mods/mods:originInfo//mods:publisher[1]};
my $edition_xpath   = q{//mods:mods/mods:originInfo//mods:edition[1]};
my $abstract_xpath  = q{//mods:mods/mods:abstract};
my $related_xpath   = q{};

# Each URL together with its displayLabel attribute.
my $online_loc_xpath =
    q{(//mods:location/mods:url|//mods:location/mods:url/@displayLabel)};

# Union of the physicalDescription children we report.
my $physical_desc = '(' . join( '|',
    map { "//mods:physicalDescription/mods:$_" }
        qw(form extent reformattingQuality internetMediaType digitalOrigin)
) . ')';

my $toc_xpath = q{//mods:tableOfContents};
# XPath table, keyed by class (title, author, subject, series) and then by
# type within the class.  The title and author type names are the ones
# mods_values_to_mods_slim looks up.
my $xpathset = {

    title => {

        # MODS spells this titleInfo type "abbreviated"
        abbreviated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",

        translated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",

        uniform =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",

        proper =>
            "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",

        any =>
            "//mods:mods/mods:titleInfo",
    },

    author => {

        corporate =>
            "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",

        personal =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",

        conference =>
            "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",

        # personal names regardless of role
        other =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
    },

    subject => {

        # NOTE(review): key name assumed to be 'topic' -- confirm.  Consumers
        # in this file iterate over every subject key, so the name itself is
        # not looked up anywhere visible.
        topic =>
            "//mods:mods/mods:subject/*[local-name()!='geographicCode']/parent::mods:subject",

        # Disabled per-facet alternatives:
        #   "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
        #   "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
        #   "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
        #   "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },

    #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

    series => {
        series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
    },
};
82 # ----------------------------------------------------------------------------------------
# Constructor: returns an empty hash blessed into the invoking class.
sub new {
    my $class = shift();
    my %state;
    return bless( \%state, $class );
}
# Evaluates $xpath against the MODS document $mods and returns one list
# entry per matching node:
#   - an array ref of strings (one per non-text child) when the node has
#     non-text children, or
#   - the node's own text content when it has none.
sub get_field_value {
    my( $self, $mods, $xpath ) = @_;

    # bind the "mods" prefix used by every xpath in this file
    my $root = $mods->documentElement;
    $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

    my @found;
    for my $node ( $root->findnodes( $xpath ) ) {

        my @parts;
        for my $kid ( $node->childNodes() ) {

            # node type 3 is a text node; only non-text children count
            next if $kid->nodeType == 3;

            if( $kid->childNodes ) {
                # flatten the grandchildren into one space-separated string
                push( @parts,
                    join( ' ', map { $_->textContent } @{$kid->childNodes} ) );
            } else {
                push( @parts, $kid->textContent );
            }
        }

        if( @parts ) {
            push( @found, \@parts );
        } else {
            push( @found, $node->textContent );
        }
    }

    return @found;
}
# Generic extractor: runs every xpath in $xpathset against $mods.
# Returns { class => { type => value } }; "subject" types collect every
# match in an array, all other classes keep only the first match.
sub _modsdoc_to_values {
    my( $self, $mods ) = @_;

    my %extracted;
    for my $class ( keys %$xpathset ) {

        my $paths  = $xpathset->{$class};
        my $bucket = $extracted{$class} = {};

        for my $type ( keys %{$paths} ) {
            my @hits = $self->get_field_value( $mods, $paths->{$type} );

            if( $class ne "subject" ) {
                $bucket->{$type} = $hits[0];
                next;
            }
            push( @{$bucket->{$type}}, @hits );
        }
    }

    return \%extracted;
}
# Extracts subject, title, author and series data from the MODS document.
# Returns { subject => {...}, title => {...}, author => {...}, series => {...} },
# each keyed by the types listed for that class in $xpathset:
#   subject - array of every match, stored as get_field_value returned it
#   title   - one string; the parts of the last matching node are assembled
#   author  - the first match only
#   series  - array of strings, the parts of each match joined with a space
sub modsdoc_to_values {
    my( $self, $mods ) = @_;
    my $data = {};

    {
        my $class = "subject";
        $data->{$class} = {};
        for my $type ( keys %{$xpathset->{$class}} ) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            push( @{$data->{$class}->{$type}}, @value ) if @value;
        }
    }

    {
        my $class = "title";
        $data->{$class} = {};
        for my $type ( keys %{$xpathset->{$class}} ) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {

                if( !ref($arr) ) {
                    $data->{$class}->{$type} = $arr;
                    next;
                }

                my @parts = @$arr;
                my $title = shift @parts;

                # A first part that is nothing but an article ("The", "A",
                # "An") is joined to the next part with a space.  The match
                # is anchored at both ends so that an ordinary title merely
                # containing "a", or starting with "the...", is not mistaken
                # for an article.
                if( @parts and lc($title) =~ /^(?:the|an?)\s*$/ ) {
                    $title .= ' ' . shift @parts;
                }

                # remaining parts are set off with " : "
                for my $t (@parts) {
                    $title .= ' : ' if( $title =~ /\w\s*$/ );
                    $title .= " $t";
                }

                $data->{$class}->{$type} = $title;
            }
        }
    }

    {
        my $class = "author";
        $data->{$class} = {};
        for my $type ( keys %{$xpathset->{$class}} ) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            $data->{$class}->{$type} = $value[0];
        }
    }

    {
        my $class = "series";
        $data->{$class} = {};
        for my $type ( keys %{$xpathset->{$class}} ) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {
                push( @{$data->{$class}->{$type}},
                    ref($arr) ? join(" ", @$arr) : $arr );
            }
        }
    }

    return $data;
}
# ---------------------------------------------------------------------------
# Grabs the data 'we want' from the MODS doc and returns it in hash form
#
# Takes the per-class hash of extracted values and returns
#   { title => string, author => string, subject => { heading => count },
#     series => array ref }
# title:   first true value of proper, translated, abbreviated, uniform, any
# author:  first true value of personal, other, corporate, conference
# subject: number of occurrences of each heading; for an array-ref subject
#          the heading is its first element
# ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $title   = "";
    my $author  = "";
    my $subject = {};
    my $series  = [];

    if( my $titles = $modsperl->{title} ) {
        for my $kind (qw/proper translated abbreviated uniform any/) {
            $title = $titles->{$kind};
            last if $title;
        }
    }

    if( my $authors = $modsperl->{author} ) {
        for my $kind (qw/personal other corporate conference/) {
            $author = $authors->{$kind};
            last if $author;
        }
    }

    if( my $subjects = $modsperl->{subject} ) {
        for my $key ( keys %{$subjects} ) {
            next unless $subjects->{$key};
            for my $s ( @{$subjects->{$key}} ) {
                # a subject is either [ heading, ... ] or a plain string
                my $heading = ( ref($s) eq 'ARRAY' ) ? $s->[0] : $s;
                next unless defined $heading;
                # test and increment the same key so repeats are counted
                $subject->{$heading}++;
            }
        }
    }

    if( my $tmp = $modsperl->{'series'} ) {
        $series = $tmp->{'series'};
    }

    return { series => $series, title => $title,
             author => $author, subject => $subject };
}
# ---------------------------------------------------------------------------
# Initializes a MARC -> Unified MODS batch process
#
# $master_doc is a MARCXML string.  It is transformed to MODS and the
# extracted values are stored in $self->{master_doc}.
# ---------------------------------------------------------------------------
sub start_mods_batch {
    my( $self, $master_doc ) = @_;

    # Compile the stylesheet once and keep it in the file-level $mods_sheet,
    # which push_mods_batch also uses.
    if( !$mods_sheet ) {
        my $xslt_doc = $parser->parse_file(
            OpenSRF::Utils::SettingsClient->new->config_value(dirs => 'xsl') . "/MARC21slim2MODS3.xsl");
        $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
    }

    my $xmldoc = $parser->parse_string($master_doc);
    my $mods   = $mods_sheet->transform($xmldoc);

    # warn "-" x 100 . "\n";
    # warn "MODS " . $mods->toString(1) . "\n";
    # warn "-" x 100 . "\n";

    $self->{master_doc} = $self->modsdoc_to_values( $mods );
    $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );
    my $doc = $self->{master_doc};

    # single-valued fields: list assignment keeps only the first match
    for my $pair (
        [ isbn      => $isbn_xpath ],
        [ tcn       => $tcn_xpath ],
        [ pubdate   => $pub_xpath ],
        [ publisher => $publisher_xpath ],
        [ edition   => $edition_xpath ],
        [ synopsis  => $abstract_xpath ],
    ) {
        my( $field, $xpath ) = @$pair;
        ($doc->{$field}) = $self->get_field_value( $mods, $xpath );
    }

    # every type of resource; finish_mods_batch removes duplicates
    $doc->{type_of_resource} =
        [ $self->get_field_value( $mods, $resource_xpath ) ];

    # ------------------------------
    # holds an array of [ link, title, link, title, ... ]
    $doc->{online_loc} =
        [ $self->get_field_value( $mods, $online_loc_xpath ) ];

    # all physical description parts as one space-separated string
    $doc->{physical_description} =
        join( ' ', $self->get_field_value( $mods, $physical_desc ) );

    ($doc->{toc}) = $self->get_field_value( $mods, $toc_xpath );
}
# ---------------------------------------------------------------------------
# Takes a MARCXML string and adds it to the growing MODS doc
#
# Merges the record's subjects and types of resource into
# $self->{master_doc}, and adopts its ISBN if the master has none yet.
# Uses the stylesheet held in $mods_sheet.
# ---------------------------------------------------------------------------
sub push_mods_batch {
    my( $self, $marcxml ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($marcxml) );
    my $slim = $self->mods_values_to_mods_slim( $self->modsdoc_to_values( $mods ) );

    # for backwards compatibility, remove the array part when all is decided
    my $incoming = $slim->{subject};
    if( ref($incoming) eq 'ARRAY' ) {
        push @{$self->{master_doc}->{subject}}, @$incoming;
    } else {
        # bump the master's count once per heading present in this record
        my $counts = $self->{master_doc}->{subject};
        $counts->{$_}++ for keys %{$incoming};
    }

    my @types = $self->get_field_value( $mods, $resource_xpath );
    push( @{$self->{master_doc}->{type_of_resource}}, @types );

    unless( $self->{master_doc}->{isbn} ) {
        ($self->{master_doc}->{isbn}) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
# ---------------------------------------------------------------------------
# Builds a new virtual_record whose subject, types_of_resource and
# call_numbers fields start out as empty arrays
# ---------------------------------------------------------------------------
sub init_virtual_record {
    # direct method call rather than indirect-object "new Class"
    my $record = Fieldmapper::metabib::virtual_record->new;
    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);
    return $record;
}
# ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process: copies the accumulated
# $self->{master_doc} values into a virtual_record, clears the master doc
# and returns the record
# ---------------------------------------------------------------------------
sub finish_mods_batch {
    my $self   = shift;
    my $perl   = $self->{master_doc};
    my $record = init_virtual_record();

    # turn the hash into a fieldmapper object

    # drop [bracketed] text from the title and (parenthesized) text from
    # the author; a missing value is treated as an empty string
    my $title = defined $perl->{title} ? $perl->{title} : '';
    $title =~ s/\[.*?\]//g;
    my $author = defined $perl->{author} ? $perl->{author} : '';
    $author =~ s/\(.*?\)//g;

    # keep only the part of each series statement before the first ';'
    my @series;
    for my $s ( @{ $perl->{series} || [] } ) {
        push @series, (split( /\s*;/, $s ))[0];
    }

    # uniquify the types of resource (sorted so the order is stable)
    my %seen   = map { ($_ => 1) } @{ $perl->{type_of_resource} || [] };
    my $rtypes = [ sort keys %seen ];

    $record->title($title);
    $record->author($author);
    $record->types_of_resource($rtypes);
    $record->series(\@series);

    # fields copied through unchanged; accessor and hash key share a name
    for my $field (qw/doc_id isbn pubdate publisher tcn edition subject
                      online_loc synopsis physical_description toc/) {
        $record->$field( $perl->{$field} );
    }

    $self->{master_doc} = undef;
    return $record;
}