1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
4 use OpenSRF::EX qw/:try/;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
# File-scope XML tooling used by the batch methods below: one XML::LibXML
# parser, one XML::LibXSLT processor, and the stylesheet compiled from
# MARC21slim2MODS3.xsl.
# NOTE(review): the stylesheet location is a hard-coded absolute path --
# confirm it exists on every host that loads this module.
11 my $parser = XML::LibXML->new();
12 my $xslt = XML::LibXSLT->new();
13 my $xslt_doc = $parser->parse_file(
14 "/pines/cvs/ILS/Open-ILS/xsl/MARC21slim2MODS3.xsl" );
# Compiled once here; start_mods_batch and push_mods_batch call
# $mods_sheet->transform on each parsed document.
15 my $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
17 # ----------------------------------------------------------------------------------------
18 # XPATH for extracting info from a MODS doc
# identifier elements whose type attribute is 'isbn'
19 my $isbn_xpath = "//mods:mods/mods:identifier[\@type='isbn']";
# typeOfResource elements
20 my $resource_xpath = "//mods:mods/mods:typeOfResource";
# dateIssued elements under originInfo: those with encoding='marc', unioned
# with those in position [1]
21 my $pub_xpath = "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']|" .
22 "//mods:mods/mods:originInfo//mods:dateIssued[1]";
# recordIdentifier under recordInfo (stored as 'tcn' by start_mods_batch)
23 my $tcn_xpath = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
# publisher and edition elements in position [1] under originInfo
24 my $publisher_xpath = "//mods:mods/mods:originInfo//mods:publisher[1]";
25 my $edition_xpath = "//mods:mods/mods:originInfo//mods:edition[1]";
# abstract elements (stored as 'synopsis' by start_mods_batch)
26 my $abstract_xpath = "//mods:mods/mods:abstract";
# empty placeholder; no use of it is visible in this section of the file
28 my $related_xpath = "";
# location/url elements together with their displayLabel attributes
29 my $online_loc_xpath = "(//mods:location/mods:url|//mods:location/mods:url/\@displayLabel)";
# XPath expressions for the per-class lookups. Most of the hash keys that
# label these entries are not visible in this section of the file, so the
# notes below describe only what each expression selects.
#
# Titles: titleInfo elements that contain a mods:title, split by their type
# attribute. The first expression used to test for the misspelling
# 'abreviated', which can never equal the MODS type value 'abbreviated', so
# the abbreviated title read by mods_values_to_mods_slim was never populated.
35 "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
37 "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
39 "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
41 "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
# Authors: namePart children of corporate, personal and conference names
# whose role text is 'creator' (position [1]), followed by an expression that
# selects every personal namePart regardless of role.
46 "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
47 "[../mods:role/mods:text[text()='creator']][1]",
49 "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
50 "[../mods:role/mods:text[text()='creator']][1]",
52 "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
53 "[../mods:role/mods:text[text()='creator']][1]",
55 "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
# Subjects: subject elements that have at least one child not named
# geographicCode. The commented-out lines are disabled per-facet expressions.
61 "//mods:mods/mods:subject/*[local-name()!='geographicCode']/parent::mods:subject",
64 # "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
66 # "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
68 # "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
70 # "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
72 #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },
# Series: titleInfo of relatedItem elements whose type is 'series'.
75 series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
78 # ----------------------------------------------------------------------------------------
# Constructor: returns a new, empty hash blessed into the class it was
# invoked on.
sub new {
    my $class = shift;
    return bless {}, $class;
}
# Evaluates $xpath against the root element of $mods and collects a value
# for every matching node.
# NOTE(review): the enclosing `sub` line, the declarations of @string,
# @child_text and @a, and the return are not visible in this section; the
# ($self, $mods, $xpath) arguments match the get_field_value call sites --
# confirm.
86 my( $self, $mods, $xpath ) = @_;
# Declare the "mods" prefix for the MODS v3 namespace on the root element;
# every XPath in this file uses that prefix.
89 my $root = $mods->documentElement;
90 $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );
92 # grab the set of matching nodes
93 my @nodes = $root->findnodes( $xpath );
94 for my $value (@nodes) {
96 # grab all children of the node
97 my @children = $value->childNodes();
99 for my $child (@children) {
# skip children whose nodeType is 3 (the DOM code for a text node)
100 next unless( $child->nodeType != 3 );
# a child that has children of its own contributes their text contents
# joined with single spaces
102 if($child->childNodes) {
104 for my $c (@{$child->childNodes}){
105 push @a, $c->textContent;
107 push(@child_text, join(' ', @a));
# otherwise the child's own text is used -- presumably an else branch; the
# line introducing it is not visible here
110 push(@child_text, $child->textContent);
# A match is recorded either as a reference to the list of its children's
# text or as its own text content.
# NOTE(review): the conditions selecting between these two pushes are not
# visible in this section.
115 push(@string, \@child_text);
119 push(@string, $value->textContent );
# Runs get_field_value for every class and type in $xpathset and stores the
# results in $data, keyed by class and then by type.
# NOTE(review): the declaration of $data and the return are not visible in
# this section.
126 sub _modsdoc_to_values {
127 my( $self, $mods ) = @_;
129 for my $class (keys %$xpathset) {
130 $data->{$class} = {};
131 for my $type(keys %{$xpathset->{$class}}) {
132 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
# the subject class keeps every value returned for the type
133 if( $class eq "subject" ) {
134 push( @{$data->{$class}->{$type}}, @value );
# only the first value is stored here -- presumably the else branch for all
# other classes; the line introducing it is not visible
136 $data->{$class}->{$type} = $value[0];
# Builds the per-class value hash for a MODS document by running the
# $xpathset expressions through get_field_value, one class at a time.
# NOTE(review): the declaration of $data, the block boundaries between the
# classes and the return are not visible in this section.
144 sub modsdoc_to_values {
145 my( $self, $mods ) = @_;
# subject: every value returned for a type is appended to that type's list
149 my $class = "subject";
150 $data->{$class} = {};
151 for my $type(keys %{$xpathset->{$class}}) {
152 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
153 for my $arr (@value) {
154 push( @{$data->{$class}->{$type}}, $arr);
# NOTE(review): the line that sets $class for this section is not visible
# (presumably "title" -- confirm). Each type holds a single scalar: one
# assignment joins the elements of an array ref with spaces, the other stores
# the value as is; the test choosing between them is not visible. Because the
# assignment happens inside the loop, a later value replaces an earlier one.
161 $data->{$class} = {};
162 for my $type(keys %{$xpathset->{$class}}) {
163 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
164 for my $arr (@value) {
166 $data->{$class}->{$type} = join(" ", @$arr);
168 $data->{$class}->{$type} = $arr;
# author: only the first value returned for each type is kept
175 my $class = "author";
176 $data->{$class} = {};
177 for my $type(keys %{$xpathset->{$class}}) {
178 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
179 $data->{$class}->{$type} = $value[0];
# series: every value is appended to the type's list, either joined with
# spaces or as is (the test choosing between the two pushes is not visible)
184 my $class = "series";
185 $data->{$class} = {};
186 for my $type(keys %{$xpathset->{$class}}) {
187 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
188 for my $arr (@value) {
190 push(@{$data->{$class}->{$type}}, join(" ", @$arr));
192 push( @{$data->{$class}->{$type}}, $arr );
205 # ---------------------------------------------------------------------------
206 # Grabs the data 'we want' from the MODS doc and returns it in hash form
207 # ---------------------------------------------------------------------------
# Reduces the per-class hash in $modsperl to a hash ref with four keys:
# title, author, subject and series.
# NOTE(review): the declarations of $title, $author, $subject and $series and
# the else lines pairing with the if(!$tmp) tests are not visible in this
# section.
208 sub mods_values_to_mods_slim {
209 my( $self, $modsperl ) = @_;
# Title: the empty string when $modsperl has no title entry; otherwise each
# candidate is assigned in turn until one is true. If none is true, $title is
# left holding the 'uniform' value.
216 my $tmp = $modsperl->{title};
219 if(!$tmp) { $title = ""; }
221 ($title = $tmp->{proper}) ||
222 ($title = $tmp->{translated}) ||
223 ($title = $tmp->{abbreviated}) ||
224 ($title = $tmp->{uniform});
# Author: same pattern, trying personal, other, corporate, then conference.
227 $tmp = $modsperl->{author};
228 if(!$tmp) { $author = ""; }
230 ($author = $tmp->{personal}) ||
231 ($author = $tmp->{other}) ||
232 ($author = $tmp->{corporate}) ||
233 ($author = $tmp->{conference});
# Subject: the lists stored under every key are flattened into one list.
236 $tmp = $modsperl->{subject};
237 if(!$tmp) { $subject = []; }
239 for my $key( keys %{$tmp}) {
240 push(@$subject, @{$tmp->{$key}}) if ($tmp->{$key});
# Series: the value stored under the 'series' key, or an empty list when
# $modsperl has no series entry.
244 $tmp = $modsperl->{'series'};
245 if(!$tmp) { $series = []; }
246 else { $series = $tmp->{'series'}; }
249 return { series => $series, title => $title,
250 author => $author, subject => $subject };
255 # ---------------------------------------------------------------------------
256 # Initializes a MARC -> Unified MODS batch process
257 # ---------------------------------------------------------------------------
# Parses $master_doc as XML, transforms it with $mods_sheet, and stores the
# extracted values in $self->{master_doc}.
259 sub start_mods_batch {
261 my( $self, $master_doc ) = @_;
263 my $xmldoc = $parser->parse_string($master_doc);
264 my $mods = $mods_sheet->transform($xmldoc);
266 # warn "-" x 100 . "\n";
267 # warn "MODS " . $mods->toString(1) . "\n";
268 # warn "-" x 100 . "\n";
# two steps: the full per-class values first, then the slim hash
270 $self->{master_doc} = $self->modsdoc_to_values( $mods );
271 $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );
# The parenthesised (list) assignments below keep only the first value that
# get_field_value returns.
273 ($self->{master_doc}->{isbn}) =
274 $self->get_field_value( $mods, $isbn_xpath );
# type_of_resource keeps every returned value, wrapped in an array ref
276 $self->{master_doc}->{type_of_resource} =
277 [ $self->get_field_value( $mods, $resource_xpath ) ];
279 ($self->{master_doc}->{tcn}) =
280 $self->get_field_value( $mods, $tcn_xpath );
282 ($self->{master_doc}->{pubdate}) =
283 $self->get_field_value( $mods, $pub_xpath );
285 ($self->{master_doc}->{publisher}) =
286 $self->get_field_value( $mods, $publisher_xpath );
288 ($self->{master_doc}->{edition}) =
289 $self->get_field_value( $mods, $edition_xpath );
293 # ------------------------------
294 # holds an array of [ link, title, link, title, ... ]
295 $self->{master_doc}->{online_loc} = [];
296 push(@{$self->{master_doc}->{online_loc}},
297 $self->get_field_value( $mods, $online_loc_xpath ));
# the abstract is stored under the 'synopsis' key
299 ($self->{master_doc}->{synopsis}) =
300 $self->get_field_value( $mods, $abstract_xpath );
306 # ---------------------------------------------------------------------------
307 # Takes a MARCXML string and adds it to the growing MODS doc
308 # ---------------------------------------------------------------------------
# Parses and transforms $marcxml the same way start_mods_batch does, then
# merges the result into the existing $self->{master_doc}.
309 sub push_mods_batch {
310 my( $self, $marcxml ) = @_;
312 my $xmldoc = $parser->parse_string($marcxml);
313 my $mods = $mods_sheet->transform($xmldoc);
315 my $xmlperl = $self->modsdoc_to_values( $mods );
316 $xmlperl = $self->mods_values_to_mods_slim( $xmlperl );
# subjects and type_of_resource values are appended to the master lists
318 for my $subject( @{$xmlperl->{subject}} ) {
319 push @{$self->{master_doc}->{subject}}, $subject;
322 push( @{$self->{master_doc}->{type_of_resource}},
323 $self->get_field_value( $mods, $resource_xpath ));
# isbn is taken from this record only when master_doc has no true isbn yet
325 if(!($self->{master_doc}->{isbn}) ) {
326 ($self->{master_doc}->{isbn}) =
327 $self->get_field_value( $mods, $isbn_xpath );
332 # ---------------------------------------------------------------------------
333 # Completes a MARC -> Unified MODS batch process: finish_mods_batch (below) turns the accumulated perl hash into a virtual_record object
334 # ---------------------------------------------------------------------------
# Creates a Fieldmapper::metabib::virtual_record and initialises its subject,
# types_of_resource and call_numbers fields to empty array refs.
# The constructor is invoked with explicit Class->new syntax: this package
# defines its own new(), which makes the indirect-object form "new Class"
# ambiguous to the Perl parser.
335 sub init_virtual_record {
336 my $record = Fieldmapper::metabib::virtual_record->new;
337 $record->subject([]);
338 $record->types_of_resource([]);
339 $record->call_numbers([]);
# Copies the fields accumulated in $self->{master_doc} onto a record obtained
# from init_virtual_record, then resets master_doc to undef.
# NOTE(review): the lines that set $self, declare @series and return a value
# are not visible in this section.
343 sub finish_mods_batch {
345 my $perl = $self->{master_doc};
346 my $record = init_virtual_record();
348 # turn the hash into a fieldmapper object
# Remove [bracketed] text from the title and (parenthesised) text from the
# author, working on copies so master_doc itself is not modified.
# NOTE(review): the /o flag has no effect here because nothing is
# interpolated into either pattern.
349 (my $title = $perl->{title}) =~ s/\[.*?\]//og;
350 (my $author = $perl->{author}) =~ s/\(.*?\)//og;
# keep only the part of each series entry that precedes the first ';'
353 for my $s (@{$perl->{series}}) {
354 push @series, (split( /\s*;/, $s ))[0];
357 # uniquify the types of resource
# NOTE(review): going through hash keys does not preserve the original order.
358 my $rtypes = $perl->{type_of_resource};
359 my %hash = map { ($_ => 1) } @$rtypes;
360 $rtypes = [ keys %hash ];
362 $record->title($title);
363 $record->author($author);
# NOTE(review): no visible code in this section sets doc_id on master_doc --
# confirm where it comes from.
365 $record->doc_id($perl->{doc_id});
366 $record->isbn($perl->{isbn});
367 $record->pubdate($perl->{pubdate});
368 $record->publisher($perl->{publisher});
369 $record->tcn($perl->{tcn});
371 $record->edition($perl->{edition});
373 $record->subject($perl->{subject});
374 $record->types_of_resource($rtypes);
375 $record->series(\@series);
377 $record->online_loc($perl->{online_loc});
378 $record->synopsis($perl->{synopsis});
# the batch is finished: drop the accumulated state
380 $self->{master_doc} = undef;