package OpenILS::Utils::ModsParser;
use strict; use warnings;

use XML::LibXML;
use XML::LibXSLT;
use Time::HiRes qw(time);

use OpenSRF::EX qw/:try/;
use OpenSRF::Utils::SettingsClient;
use OpenILS::Utils::Fieldmapper;
# ----------------------------------------------------------------------------------------
# Shared XML machinery, built once when the module is loaded.
#   $parser     - XML::LibXML parser (stylesheet and incoming MARCXML)
#   $xslt       - XML::LibXSLT engine
#   $xslt_doc   - parsed MARC21slim2MODS3.xsl, read from the directory that the
#                 settings client returns for dirs => 'xsl'
#   $mods_sheet - compiled stylesheet used to transform MARCXML into MODS
# ----------------------------------------------------------------------------------------
my $parser = XML::LibXML->new();
my $xslt   = XML::LibXSLT->new();

# NOTE(review): the settings client is instantiated before the lookup; this
# assumes the OpenSRF settings are reachable at module load time -- confirm.
my $xslt_doc = $parser->parse_file(
    OpenSRF::Utils::SettingsClient
        ->new
        ->config_value( dirs => 'xsl' ) . "/MARC21slim2MODS3.xsl"
);

my $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
# ----------------------------------------------------------------------------------------
# XPATH for extracting info from a MODS doc.
# Every expression uses the "mods" namespace prefix, so it must be evaluated
# against a document whose root has that prefix bound.
# ----------------------------------------------------------------------------------------
my $isbn_xpath      = "//mods:mods/mods:identifier[\@type='isbn']";
my $resource_xpath  = "//mods:mods/mods:typeOfResource";

# Union of the MARC-encoded issue date and the first issue date of each originInfo.
my $pub_xpath       = "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']|" .
                      "//mods:mods/mods:originInfo//mods:dateIssued[1]";

my $tcn_xpath       = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
my $publisher_xpath = "//mods:mods/mods:originInfo//mods:publisher[1]";
my $edition_xpath   = "//mods:mods/mods:originInfo//mods:edition[1]";
my $abstract_xpath  = "//mods:mods/mods:abstract";

# Placeholder: intentionally empty.
my $related_xpath   = "";

# Selects each location URL and, separately, its displayLabel attribute.
my $online_loc_xpath = "(//mods:location/mods:url|//mods:location/mods:url/\@displayLabel)";
# Table of XPath expressions grouped as class => { type => xpath }.
# The class/type names are the keys under which extracted values are stored.
my $xpathset = {

    title => {
        # MODS spells this titleInfo type "abbreviated"; the former spelling
        # 'abreviated' could never match a conforming document.
        abbreviated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
        translated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
        uniform =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
        # A titleInfo with no type attribute is the record's proper title.
        proper =>
            "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
    },

    author => {
        # The first three only accept a name whose role text is 'creator'.
        corporate =>
            "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        personal =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        conference =>
            "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        # Any personal name, regardless of role.
        other =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
    },

    subject => {
        # Whole subject elements that have at least one child other than
        # geographicCode.
        # NOTE(review): the key name 'topic' is assumed -- confirm against callers.
        topic =>
            "//mods:mods/mods:subject/*[local-name()!='geographicCode']/parent::mods:subject",
        #geographic =>
        #    "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
        #name =>
        #    "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
        #temporal =>
        #    "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
        #topic =>
        #    "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },

    #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

    series => {
        series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
    },
};
# ----------------------------------------------------------------------------------------
# Constructor. Returns a new object backed by an empty hash and blessed into
# the class it was invoked on.
sub new {
    my $class = shift;
    return bless( {}, $class );
}
# Evaluates $xpath against the MODS document $mods and returns one list entry
# per matching node:
#   - if the node has non-text children, an array ref holding one string per
#     such child (the text of that child's own children joined with a space,
#     or the child's text content when it has no children);
#   - otherwise the node's text content as a plain string.
# Returns an empty list when nothing matches.
sub get_field_value {
    my( $self, $mods, $xpath ) = @_;

    my @string;
    my $root = $mods->documentElement;

    # bind the "mods" prefix that every xpath in this module relies on
    $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

    # walk the set of matching nodes
    for my $value ( $root->findnodes( $xpath ) ) {

        my @child_text;
        for my $child ( $value->childNodes() ) {

            # nodeType 3 is a DOM text node; only real child nodes count here
            next if $child->nodeType == 3;

            my @grandchildren = $child->childNodes;
            if( @grandchildren ) {
                push( @child_text,
                    join( ' ', map { $_->textContent } @grandchildren ) );
            } else {
                push( @child_text, $child->textContent );
            }
        }

        if( @child_text ) {
            push( @string, \@child_text );
        } else {
            push( @string, $value->textContent );
        }
    }

    return @string;
}
# Generic extraction: runs every xpath in $xpathset against $mods and returns
# a hash ref shaped as { class => { type => value } }.
# For the "subject" class every match is kept (array ref); for all other
# classes only the first match is stored.
sub _modsdoc_to_values {
    my( $self, $mods ) = @_;
    my $data = {};

    for my $class (keys %$xpathset) {
        $data->{$class} = {};

        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );

            if( $class eq "subject" ) {
                push( @{$data->{$class}->{$type}}, @value );
            } else {
                $data->{$class}->{$type} = $value[0];
            }
        }
    }

    return $data;
}
# Extracts the title, author, subject and series data from the MODS document
# $mods and returns a hash ref shaped as { class => { type => value } }:
#   subject - array ref of every match, stored as returned by get_field_value
#   title   - single string; the last match wins, multi-part matches are
#             joined with a space
#   author  - the first match only
#   series  - array ref of strings; multi-part matches are joined with a space
sub modsdoc_to_values {
    my( $self, $mods ) = @_;
    my $data = {};

    {
        my $class = "subject";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {
                push( @{$data->{$class}->{$type}}, $arr );
            }
        }
    }

    {
        my $class = "title";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {
                if( ref($arr) ) {
                    $data->{$class}->{$type} = join(" ", @$arr);
                } else {
                    $data->{$class}->{$type} = $arr;
                }
            }
        }
    }

    {
        my $class = "author";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            $data->{$class}->{$type} = $value[0];
        }
    }

    {
        my $class = "series";
        $data->{$class} = {};
        for my $type (keys %{$xpathset->{$class}}) {
            my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
            for my $arr (@value) {
                if( ref($arr) ) {
                    push( @{$data->{$class}->{$type}}, join(" ", @$arr) );
                } else {
                    push( @{$data->{$class}->{$type}}, $arr );
                }
            }
        }
    }

    return $data;
}
# ---------------------------------------------------------------------------
# Grabs the data 'we want' from the MODS doc and returns it in hash form
#
# $modsperl is a hash ref shaped as { class => { type => value } }.
# Returns { title => STRING, author => STRING, subject => ARRAYREF,
#           series => ARRAYREF }.
#   title   - first true value of: proper, translated, abbreviated, uniform
#   author  - first true value of: personal, other, corporate, conference
#   subject - the entries of every subject type, flattened into one list
#   series  - the 'series' entry of the series class
# Missing data always comes back as "" (title, author) or [] (subject, series).
# ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $title   = "";
    my $author  = "";
    my $subject = [];
    my $series  = [];

    my $tmp = $modsperl->{title};
    if( $tmp ) {
        ($title = $tmp->{proper}) ||
        ($title = $tmp->{translated}) ||
        ($title = $tmp->{abbreviated}) ||
        ($title = $tmp->{uniform});
    }
    # the chain leaves undef behind when no title type is present
    $title = "" unless defined $title;

    $tmp = $modsperl->{author};
    if( $tmp ) {
        ($author = $tmp->{personal}) ||
        ($author = $tmp->{other}) ||
        ($author = $tmp->{corporate}) ||
        ($author = $tmp->{conference});
    }
    $author = "" unless defined $author;

    $tmp = $modsperl->{subject};
    if( $tmp ) {
        for my $key (keys %{$tmp}) {
            push( @$subject, @{$tmp->{$key}} ) if ($tmp->{$key});
        }
    }

    $tmp = $modsperl->{'series'};
    # the series class may exist without a 'series' entry when nothing matched
    $series = $tmp->{'series'} || [] if $tmp;

    return { series => $series, title => $title,
             author => $author, subject => $subject };
}
# ---------------------------------------------------------------------------
# Initializes a MARC -> Unified MODS batch process
#
# $master_doc is a MARCXML string. It is parsed, transformed to MODS with the
# module's stylesheet, and the extracted values are stored in
# $self->{master_doc}, replacing anything stored there before.
# ---------------------------------------------------------------------------
sub start_mods_batch {
    my( $self, $master_doc ) = @_;

    my $xmldoc = $parser->parse_string($master_doc);
    my $mods   = $mods_sheet->transform($xmldoc);

    # warn "-" x 100 . "\n";
    # warn "MODS " . $mods->toString(1) . "\n";
    # warn "-" x 100 . "\n";

    # title / author / subject / series
    my $doc = $self->mods_values_to_mods_slim( $self->modsdoc_to_values( $mods ) );
    $self->{master_doc} = $doc;

    # single-valued fields: the list assignment keeps the first match only
    ($doc->{isbn}) =
        $self->get_field_value( $mods, $isbn_xpath );

    # every match is kept here; more are appended by push_mods_batch
    $doc->{type_of_resource} =
        [ $self->get_field_value( $mods, $resource_xpath ) ];

    ($doc->{tcn}) =
        $self->get_field_value( $mods, $tcn_xpath );

    ($doc->{pubdate}) =
        $self->get_field_value( $mods, $pub_xpath );

    ($doc->{publisher}) =
        $self->get_field_value( $mods, $publisher_xpath );

    ($doc->{edition}) =
        $self->get_field_value( $mods, $edition_xpath );

    # ------------------------------
    # holds an array of [ link, title, link, title, ... ]
    $doc->{online_loc} = [];
    push( @{$doc->{online_loc}},
        $self->get_field_value( $mods, $online_loc_xpath ) );

    ($doc->{synopsis}) =
        $self->get_field_value( $mods, $abstract_xpath );
}
# ---------------------------------------------------------------------------
# Takes a MARCXML string and adds it to the growing MODS doc
#
# Only three things are merged into $self->{master_doc}: the record's
# subjects and types of resource are appended, and its ISBN is used when the
# master doc has none yet.
# ---------------------------------------------------------------------------
sub push_mods_batch {
    my( $self, $marcxml ) = @_;

    my $xmldoc = $parser->parse_string($marcxml);
    my $mods   = $mods_sheet->transform($xmldoc);

    my $xmlperl = $self->modsdoc_to_values( $mods );
    $xmlperl = $self->mods_values_to_mods_slim( $xmlperl );

    my $master = $self->{master_doc};

    for my $subject ( @{$xmlperl->{subject}} ) {
        push @{$master->{subject}}, $subject;
    }

    push( @{$master->{type_of_resource}},
        $self->get_field_value( $mods, $resource_xpath ) );

    # keep the first ISBN seen across the whole batch
    if( !($master->{isbn}) ) {
        ($master->{isbn}) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
# ---------------------------------------------------------------------------
# Creates a new Fieldmapper::metabib::virtual_record whose subject,
# types_of_resource and call_numbers fields are set to empty array refs,
# and returns it. Plain function: takes no arguments.
# ---------------------------------------------------------------------------
sub init_virtual_record {
    # direct method call; the indirect-object form ("new Class") depends on
    # how the parser happens to resolve the bareword
    my $record = Fieldmapper::metabib::virtual_record->new;
    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);
    return $record;
}

# ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process and returns the perl hash
# ---------------------------------------------------------------------------
# Builds a virtual record from the values accumulated in $self->{master_doc}
# and then clears $self->{master_doc}.
# NOTE(review): the declaration of $self, the declaration of @series and the
# end of this sub are not present in this listing -- confirm against the
# complete file before editing the code.
348 sub finish_mods_batch {
350 my $perl = $self->{master_doc};
351 my $record = init_virtual_record();
353 # turn the hash into a fieldmapper object
# Title: drop every [bracketed] span. Author: drop every (parenthesised) span.
# NOTE(review): the /o flag has no effect on these constant patterns, and an
# undefined title or author will raise an "uninitialized" warning here.
354 (my $title = $perl->{title}) =~ s/\[.*?\]//og;
355 (my $author = $perl->{author}) =~ s/\(.*?\)//og;
# Keep only the part of each series statement that precedes the first
# semicolon (and any whitespace directly before it).
358 for my $s (@{$perl->{series}}) {
359 push @series, (split( /\s*;/, $s ))[0];
362 # uniquify the types of resource
# (hash keys are unordered, so the original order is not preserved)
363 my $rtypes = $perl->{type_of_resource};
364 my %hash = map { ($_ => 1) } @$rtypes;
365 $rtypes = [ keys %hash ];
# Copy the cleaned-up and pass-through values onto the record.
367 $record->title($title);
368 $record->author($author);
# NOTE(review): nothing in the visible code stores doc_id in master_doc;
# presumably a caller sets it -- confirm.
370 $record->doc_id($perl->{doc_id});
371 $record->isbn($perl->{isbn});
372 $record->pubdate($perl->{pubdate});
373 $record->publisher($perl->{publisher});
374 $record->tcn($perl->{tcn});
376 $record->edition($perl->{edition});
378 $record->subject($perl->{subject});
379 $record->types_of_resource($rtypes);
380 $record->series(\@series);
382 $record->online_loc($perl->{online_loc});
383 $record->synopsis($perl->{synopsis});
# Reset the batch state so the next batch starts clean.
385 $self->{master_doc} = undef;