1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
4 use OpenSRF::EX qw/:try/;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
# Module-level XML machinery shared by every method in this file:
#   $parser     - XML::LibXML parser, used for the stylesheet and for incoming records
#   $xslt       - XML::LibXSLT processor
#   $mods_sheet - the MARC21slim -> MODS stylesheet, compiled once from $xslt_doc
# These are file-scoped lexicals, so the stylesheet is read when this file is loaded.
# NOTE(review): the stylesheet location is a hard-coded absolute path; presumably
# loading the module fails on hosts where that file is absent -- consider making
# it configurable.
11 my $parser = XML::LibXML->new();
12 my $xslt = XML::LibXSLT->new();
13 my $xslt_doc = $parser->parse_file(
14 "/pines/cvs/ILS/Open-ILS/xsl/MARC21slim2MODS.xsl" );
15 my $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
17 # ----------------------------------------------------------------------------------------
18 # XXX get me from the database and cache me ...
# XPath expressions for the single-valued fields pulled directly from a MODS
# document by start_mods_batch / push_mods_batch. All of them rely on the
# "mods" namespace prefix.

# identifier elements whose type attribute is 'isbn'
19 my $isbn_xpath = "//mods:mods/mods:identifier[\@type='isbn']";
# every typeOfResource element
20 my $resource_xpath = "//mods:mods/mods:typeOfResource";
# union of: dateIssued elements with encoding='marc', and the first dateIssued
# found under originInfo
21 my $pub_xpath = "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']|" .
22 "//mods:mods/mods:originInfo//mods:dateIssued[1]";
# recordInfo/recordIdentifier (stored by start_mods_batch under the key 'tcn')
23 my $tcn_xpath = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
# first publisher element under originInfo
24 my $publisher_xpath = "//mods:mods/mods:originInfo//mods:publisher[1]";
# first edition element under originInfo
26 my $edition_xpath = "//mods:mods/mods:originInfo//mods:edition[1]";
# XPath expressions grouped by class. The opening of the hash and the type
# keys are not visible in this excerpt; presumably these strings are the values
# of $xpathset, which the *_to_values methods iterate as
# $xpathset->{$class}->{$type} -- TODO confirm.
#
# Title expressions: titleInfo elements that contain a title, selected by
# their type attribute (the last one matches titleInfo with no type at all).
# NOTE(review): 'abreviated' below is spelled with a single 'b'; the MODS
# titleInfo type value is 'abbreviated', so this expression presumably never
# matches -- confirm against the stylesheet output before correcting it.
33 "//mods:mods/mods:titleInfo[mods:title and (\@type='abreviated')]",
35 "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
37 "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
39 "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
# Author expressions: the first namePart of a corporate / personal /
# conference name whose role text is 'creator'.
44 "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
45 "[../mods:role/mods:text[text()='creator']][1]",
47 "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
48 "[../mods:role/mods:text[text()='creator']][1]",
50 "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
51 "[../mods:role/mods:text[text()='creator']][1]",
# Any namePart of a personal name, regardless of role.
53 "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
# Subject expression: whole subject elements, matched by local name. The
# narrower per-child expressions that follow are disabled.
59 "//mods:mods/*[local-name()='subject']",
62 # "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
64 # "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
66 # "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
68 # "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
70 #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },
# Series expression: titleInfo of relatedItem elements typed 'series'.
73 series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
76 # ----------------------------------------------------------------------------------------
# Constructor. Takes no arguments beyond the class name and returns an
# empty hash-based object blessed into that class.
sub new {
    my $class = shift;
    my $self  = {};
    return bless $self, $class;
}
# Body of the routine that evaluates one XPath expression against a MODS
# document and collects text for each matching node. The `sub` line is not
# visible in this excerpt; presumably this is get_field_value, which the rest
# of the file calls as $self->get_field_value( $mods, $xpath ).
#
#   $mods  - document object providing documentElement
#   $xpath - XPath expression using the "mods" prefix
#
# The declarations of @string, @child_text and @a, the branch conditions and
# the return statement are also outside the visible lines.
84 my( $self, $mods, $xpath ) = @_;
# Set the "mods" prefix / namespace URI on the root element, presumably so the
# prefixed XPath expressions resolve when evaluated from it.
87 my $root = $mods->documentElement;
88 $root->setNamespace( "http://www.loc.gov/mods/", "mods", 1 );
90 # grab the set of matching nodes
91 my @nodes = $root->findnodes( $xpath );
92 for my $value (@nodes) {
94 # grab all children of the node
95 my @children = $value->childNodes();
97 for my $child (@children) {
# skip children whose nodeType is 3 (text nodes in DOM numbering)
98 next unless( $child->nodeType != 3 );
# a child with children of its own contributes the text of each grandchild,
# joined with single spaces
100 if($child->childNodes) {
102 for my $c (@{$child->childNodes}){
103 push @a, $c->textContent;
105 push(@child_text, join(' ', @a));
# presumably the else branch: a child without children contributes its own text
108 push(@child_text, $child->textContent);
# the collected child strings are stored as one array reference per matched node
113 push(@string, \@child_text);
# the whole text of the matched node is stored as a plain string; presumably
# this is the fallback when no child text was collected -- TODO confirm
117 push(@string, $value->textContent );
# Walks every class and type in $xpathset and stores the matching values from
# the MODS document in $data->{$class}->{$type}.
#
#   $mods - MODS document passed through to get_field_value
#
# The declaration of $data, the else branch and the return statement are not
# visible in this excerpt.
124 sub _modsdoc_to_values {
125 my( $self, $mods ) = @_;
127 for my $class (keys %$xpathset) {
128 $data->{$class} = {};
129 for my $type(keys %{$xpathset->{$class}}) {
130 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
# subjects keep every value returned for the type
131 if( $class eq "subject" ) {
132 push( @{$data->{$class}->{$type}}, @value );
# presumably the else branch: every other class keeps only the first value
134 $data->{$class}->{$type} = $value[0];
# Extracts values from a MODS document into $data->{$class}->{$type}, handling
# each class of $xpathset in its own section.
#
#   $mods - MODS document passed through to get_field_value
#
# The declaration of $data, the class assignment of the second section, the
# conditions choosing between the paired statements, and the return statement
# are not visible in this excerpt.
142 sub modsdoc_to_values {
143 my( $self, $mods ) = @_;
# subject: every value returned for a type is appended to that type's list
147 my $class = "subject";
148 $data->{$class} = {};
149 for my $type(keys %{$xpathset->{$class}}) {
150 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
151 for my $arr (@value) {
152 push( @{$data->{$class}->{$type}}, $arr);
# second section (presumably the title class -- TODO confirm): each value
# overwrites the previous one, so the last value returned for a type wins
159 $data->{$class} = {};
160 for my $type(keys %{$xpathset->{$class}}) {
161 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
162 for my $arr (@value) {
# presumably: array-reference values are flattened into one space-joined
# string, plain values are stored as they are
164 $data->{$class}->{$type} = join(" ", @$arr);
166 $data->{$class}->{$type} = $arr;
# author: only the first value returned for a type is kept
173 my $class = "author";
174 $data->{$class} = {};
175 for my $type(keys %{$xpathset->{$class}}) {
176 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
177 $data->{$class}->{$type} = $value[0];
# series: every value is appended to the type's list; presumably array-reference
# values are space-joined first and plain values are appended unchanged
182 my $class = "series";
183 $data->{$class} = {};
184 for my $type(keys %{$xpathset->{$class}}) {
185 my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
186 for my $arr (@value) {
188 push(@{$data->{$class}->{$type}}, join(" ", @$arr));
190 push( @{$data->{$class}->{$type}}, $arr );
# ---------------------------------------------------------------------------
# Reduces the per-class hash of MODS values to the "slim" form:
#
#   { title => $string, author => $string, subject => \@list, series => \@list }
#
# $modsperl - hash ref keyed by class (title, author, subject, series), each
#             class mapping type names to values. Any class may be missing.
#
# title   - first usable value among proper, translated, abbreviated, uniform
# author  - first usable value among personal, other, corporate, conference
# subject - the lists of every subject type, concatenated in sorted type order
# series  - the list stored under the 'series' type
#
# A value is usable when it is defined and non-empty, so "0" is accepted.
# Every slot falls back to its empty default ("" or []) instead of undef when
# nothing usable is present.
# ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $titles   = $modsperl->{title}   || {};
    my $authors  = $modsperl->{author}  || {};
    my $subjects = $modsperl->{subject} || {};
    my $serieses = $modsperl->{'series'} || {};

    # Preferred order of title variants; the first usable one wins.
    my $title = "";
    for my $type (qw/proper translated abbreviated uniform/) {
        my $candidate = $titles->{$type};
        next unless defined $candidate && length $candidate;
        $title = $candidate;
        last;
    }

    # Preferred order of author variants; the first usable one wins.
    my $author = "";
    for my $type (qw/personal other corporate conference/) {
        my $candidate = $authors->{$type};
        next unless defined $candidate && length $candidate;
        $author = $candidate;
        last;
    }

    # Sorted keys keep the combined subject list stable between runs.
    my $subject = [];
    for my $type (sort keys %{$subjects}) {
        my $values = $subjects->{$type};
        next unless $values;
        # tolerate a lone scalar where a list was expected
        push( @$subject, ref($values) eq 'ARRAY' ? @$values : $values );
    }

    my $series = $serieses->{'series'} || [];

    return { series => $series, title => $title, author => $author, subject => $subject };
}
# ---------------------------------------------------------------------------
# Begins a MARC -> Unified MODS batch.
#
# $master_doc is an XML string. It is parsed, run through the MODS stylesheet,
# reduced to the slim hash and stored in $self->{master_doc}. That hash is
# then extended with isbn, tcn, pubdate, publisher and edition (first match
# each) and type_of_resource (array ref of all matches), read straight from
# the MODS document.
# ---------------------------------------------------------------------------
sub start_mods_batch {
    my $self       = shift;
    my $master_doc = shift;

    my $marc_dom = $parser->parse_string($master_doc);
    my $mods     = $mods_sheet->transform($marc_dom);

    # debugging aid, left disabled:
    # warn "-" x 100 . "\n";
    # warn "MODS " . $mods->toString(1) . "\n";
    # warn "-" x 100 . "\n";

    $self->{master_doc} = $self->modsdoc_to_values($mods);
    $self->{master_doc} = $self->mods_values_to_mods_slim( $self->{master_doc} );

    my $doc = $self->{master_doc};

    # The parenthesised left-hand sides keep only the first value returned.
    ( $doc->{isbn} ) = $self->get_field_value( $mods, $isbn_xpath );

    my @resource_types = $self->get_field_value( $mods, $resource_xpath );
    $doc->{type_of_resource} = [@resource_types];

    ( $doc->{tcn} )       = $self->get_field_value( $mods, $tcn_xpath );
    ( $doc->{pubdate} )   = $self->get_field_value( $mods, $pub_xpath );
    ( $doc->{publisher} ) = $self->get_field_value( $mods, $publisher_xpath );
    ( $doc->{edition} )   = $self->get_field_value( $mods, $edition_xpath );
}
# ---------------------------------------------------------------------------
# Merges one more MARCXML record into the batch held in $self->{master_doc}:
# its subjects and types of resource are appended, and its ISBN is adopted
# only when the batch does not have one yet.
#
# $marcxml is the MARCXML string of the record to add.
# ---------------------------------------------------------------------------
sub push_mods_batch {
    my $self    = shift;
    my $marcxml = shift;

    my $marc_dom = $parser->parse_string($marcxml);
    my $mods     = $mods_sheet->transform($marc_dom);

    my $slim = $self->modsdoc_to_values($mods);
    $slim = $self->mods_values_to_mods_slim($slim);

    push @{ $self->{master_doc}->{subject} }, $_
        for @{ $slim->{subject} };

    my @resource_types = $self->get_field_value( $mods, $resource_xpath );
    push @{ $self->{master_doc}->{type_of_resource} }, @resource_types;

    # the first ISBN seen in the batch is kept
    if ( !$self->{master_doc}->{isbn} ) {
        ( $self->{master_doc}->{isbn} ) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
317 # ---------------------------------------------------------------------------
318 # Completes a MARC -> Unified MODS batch process and returns the perl hash
319 # ---------------------------------------------------------------------------
# Creates a Fieldmapper::metabib::virtual_record whose list-valued fields
# (subject, types_of_resource, call_numbers) start out as empty arrays, and
# returns it.
sub init_virtual_record {
    # Explicit class-method call: the indirect-object form "new Class" is
    # ambiguous in a package that defines its own new().
    my $record = Fieldmapper::metabib::virtual_record->new;
    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);
    return $record;
}
# Converts the accumulated $self->{master_doc} hash into a virtual record
# object and clears the batch state.
#
# The declarations of $self and @series and the return statement are not
# visible in this excerpt; presumably the populated $record is returned --
# TODO confirm.
328 sub finish_mods_batch {
330 my $perl = $self->{master_doc};
331 my $record = init_virtual_record();
333 # turn the hash into a fieldmapper object
# drop every [bracketed] span from the title and every (parenthesised) span
# from the author
# NOTE(review): with warnings enabled, an undefined title or author presumably
# triggers an "uninitialized value" warning here -- confirm.
334 (my $title = $perl->{title}) =~ s/\[.*?\]//og;
335 (my $author = $perl->{author}) =~ s/\(.*?\)//og;
# keep only the part of each series entry that precedes the first ';'
338 for my $s (@{$perl->{series}}) {
339 push @series, (split( /\s*;/, $s ))[0];
342 # uniquify the types of resource
343 my $rtypes = $perl->{type_of_resource};
344 my %hash = map { ($_ => 1) } @$rtypes;
# the resulting order follows hash key order, not the order of first appearance
345 $rtypes = [ keys %hash ];
347 $record->title($title);
348 $record->author($author);
# NOTE(review): no visible code stores doc_id in master_doc; presumably it is
# set elsewhere -- verify against callers.
350 $record->doc_id($perl->{doc_id});
351 $record->isbn($perl->{isbn});
352 $record->pubdate($perl->{pubdate});
353 $record->publisher($perl->{publisher});
354 $record->tcn($perl->{tcn});
356 $record->edition($perl->{edition});
358 $record->subject($perl->{subject});
359 $record->types_of_resource($rtypes);
360 $record->series(\@series);
# reset the batch state now that the record has been built
362 $self->{master_doc} = undef;