]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Utils/ModsParser.pm
use the settings server to get the MODS xsl
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use OpenSRF::Utils::SettingsClient;
10 use Data::Dumper;
11
# Module-wide XML parser and XSLT processor, created once when the module
# is loaded and shared by every ModsParser instance.
my $parser = XML::LibXML->new();
my $xslt   = XML::LibXSLT->new();

# The MARC21slim -> MODS3 stylesheet lives in the 'xsl' directory that the
# settings server reports under its 'dirs' configuration section.
my $xsl_dir  = OpenSRF::Utils::SettingsClient->new->config_value( dirs => 'xsl' );
my $xslt_doc = $parser->parse_file( "$xsl_dir/MARC21slim2MODS3.xsl" );

# Compiled stylesheet used for every MARCXML -> MODS transformation below.
my $mods_sheet = $xslt->parse_stylesheet( $xslt_doc );
21
22 # ----------------------------------------------------------------------------------------
23 # XPATH for extracting info from a MODS doc
# Single-value XPath expressions.  All of them rely on the "mods" prefix
# being bound to the MODS v3 namespace on the document root, which
# get_field_value() arranges before evaluating them.
my $isbn_xpath       = "//mods:mods/mods:identifier[\@type='isbn']";
my $resource_xpath   = "//mods:mods/mods:typeOfResource";
my $pub_xpath        = "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']|" .
                       "//mods:mods/mods:originInfo//mods:dateIssued[1]";
my $tcn_xpath        = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
my $publisher_xpath  = "//mods:mods/mods:originInfo//mods:publisher[1]";
my $edition_xpath    = "//mods:mods/mods:originInfo//mods:edition[1]";
my $abstract_xpath   = "//mods:mods/mods:abstract";
my $toc_xpath        = "";    # placeholder: no expression defined yet
my $related_xpath    = "";    # placeholder: no expression defined yet
my $online_loc_xpath = "(//mods:location/mods:url|//mods:location/mods:url/\@displayLabel)";

# XPath expressions grouped by class (title, author, subject, series) and,
# within each class, by type.  modsdoc_to_values() walks this structure.
my $xpathset = {

    title => {
        # The MODS titleInfo type value is spelled "abbreviated"; the
        # former spelling "abreviated" could never match a titleInfo node.
        abbreviated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
        translated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
        uniform =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
        proper =>
            "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
    },

    author => {
        corporate =>
            "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        personal =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        conference =>
            "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        # any personal name, whether or not it carries the creator role
        other =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
    },

    subject => {

        # every mods:subject that has at least one child other than
        # geographicCode
        topic =>
            "//mods:mods/mods:subject/*[local-name()!='geographicCode']/parent::mods:subject",

#       geographic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
#       name =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
#       temporal =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
#       topic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },
    #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

    series => {
        series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
    }
};
83 # ----------------------------------------------------------------------------------------
84
85
86
# Constructor.  Returns a new, empty hash-based object blessed into the
# class given as the first argument.
sub new {
    my $class = shift();
    my $self  = {};
    return bless( $self, $class );
}
88
# Evaluates $xpath against the root element of $mods and returns one list
# entry per matching node:
#   - when the node has non-text children, an array ref holding one string
#     per such child (the child's own children's text joined with a single
#     space, or the child's text content when it has no children);
#   - otherwise the node's text content as a plain string.
# The "mods" prefix is bound to the MODS v3 namespace on the root element
# before the expression is evaluated.
sub get_field_value {

    my( $self, $mods, $xpath ) = @_;

    my $root = $mods->documentElement;
    $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

    my @results;
    for my $match ( $root->findnodes( $xpath ) ) {

        my @parts;
        for my $kid ( $match->childNodes() ) {

            # node type 3 is a text node; those are skipped at this level
            next if $kid->nodeType == 3;

            my @grandkids = $kid->childNodes();
            push( @parts,
                @grandkids
                    ? join( ' ', map { $_->textContent } @grandkids )
                    : $kid->textContent );
        }

        # structured nodes yield their parts, flat nodes their own text
        push( @results, @parts ? \@parts : $match->textContent );
    }

    return @results;
}
129
130 =head
131 sub _modsdoc_to_values {
132         my( $self, $mods ) = @_;
133         my $data = {};
134         for my $class (keys %$xpathset) {
135                 $data->{$class} = {};
136                 for my $type(keys %{$xpathset->{$class}}) {
137                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
138                         if( $class eq "subject" ) {
139                                 push( @{$data->{$class}->{$type}},  @value );
140                         } else {
141                                 $data->{$class}->{$type} = $value[0];
142                         }
143                 }
144         }
145         return $data;
146 }
147 =cut
148
# Runs every XPath in $xpathset against the MODS document $mods and returns
# a hash ref keyed by class (subject, title, author, series), each holding a
# hash keyed by type:
#   subject - array of the raw values returned by get_field_value()
#   title   - a single string; when several nodes match, the last one wins
#   author  - the first value returned by get_field_value()
#   series  - array of strings
# Array-ref values are flattened to strings by joining with a single space
# for title and series.  A subject, title or series type with no matching
# node gets no key at all; an author type with no match is set to undef.
sub modsdoc_to_values {
    my( $self, $mods ) = @_;

    my %values;

    # subjects: keep each match exactly as get_field_value() produced it
    $values{subject} = {};
    for my $kind ( keys %{ $xpathset->{subject} } ) {
        my @found = $self->get_field_value( $mods, $xpathset->{subject}->{$kind} );
        push( @{ $values{subject}->{$kind} }, @found ) if @found;
    }

    # titles: one flattened string per type
    $values{title} = {};
    for my $kind ( keys %{ $xpathset->{title} } ) {
        my @found = $self->get_field_value( $mods, $xpathset->{title}->{$kind} );
        for my $item (@found) {
            $values{title}->{$kind} = ref($item) ? join( " ", @$item ) : $item;
        }
    }

    # authors: only the first match of each type is of interest
    $values{author} = {};
    for my $kind ( keys %{ $xpathset->{author} } ) {
        my @found = $self->get_field_value( $mods, $xpathset->{author}->{$kind} );
        $values{author}->{$kind} = $found[0];
    }

    # series: every match, flattened to a string
    $values{series} = {};
    for my $kind ( keys %{ $xpathset->{series} } ) {
        my @found = $self->get_field_value( $mods, $xpathset->{series}->{$kind} );
        next unless @found;
        push( @{ $values{series}->{$kind} },
            map { ref($_) ? join( " ", @$_ ) : $_ } @found );
    }

    return \%values;
}
206
207
208
209
210 # ---------------------------------------------------------------------------
211 # Grabs the data 'we want' from the MODS doc and returns it in hash form
212 # ---------------------------------------------------------------------------
# Reduces the per-class/per-type hash built by modsdoc_to_values() to the
# handful of values wanted downstream and returns them as a hash ref:
#   title   - first true value among proper, translated, abbreviated,
#             uniform; "" when none is present
#   author  - first true value among personal, other, corporate,
#             conference; "" when none is present
#   subject - array ref with the subject values of every type combined
#   series  - array ref of series strings; [] when there are none
# Missing classes or types never produce undef: the defaults declared at
# the top of the sub are kept instead.
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $title   = "";
    my $author  = "";
    my $subject = [];
    my $series  = [];

    if( my $titles = $modsperl->{title} ) {
        # order expresses preference: the first type with a value wins
        for my $kind (qw/proper translated abbreviated uniform/) {
            next unless $titles->{$kind};
            $title = $titles->{$kind};
            last;
        }
    }

    if( my $authors = $modsperl->{author} ) {
        for my $kind (qw/personal other corporate conference/) {
            next unless $authors->{$kind};
            $author = $authors->{$kind};
            last;
        }
    }

    if( my $subjects = $modsperl->{subject} ) {
        for my $kind ( keys %{$subjects} ) {
            push( @$subject, @{ $subjects->{$kind} } ) if $subjects->{$kind};
        }
    }

    if( my $all_series = $modsperl->{'series'} ) {
        # keep the empty-array default when the record has no series
        $series = $all_series->{'series'} if $all_series->{'series'};
    }

    return { series => $series, title => $title,
            author => $author, subject => $subject };
}
257
258
259
260 # ---------------------------------------------------------------------------
261 # Initializes a MARC -> Unified MODS batch process
262 # ---------------------------------------------------------------------------
263
# Begins a batch: transforms the MARCXML string $master_doc to MODS and
# stores the extracted values in $self->{master_doc}.  Besides the slim
# title/author/subject/series hash, the master doc receives isbn, tcn,
# pubdate, publisher, edition and synopsis (first match of each),
# type_of_resource (array of all matches) and online_loc.
sub start_mods_batch {

    my( $self, $master_doc ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($master_doc) );

#   warn "-" x 100 . "\n";
#   warn "MODS " . $mods->toString(1) . "\n";
#   warn "-" x 100 . "\n";

    $self->{master_doc} = $self->modsdoc_to_values( $mods );
    my $doc = $self->{master_doc} =
        $self->mods_values_to_mods_slim( $self->{master_doc} );

    # list assignment to a single element keeps only the first match
    ($doc->{isbn}) = $self->get_field_value( $mods, $isbn_xpath );

    $doc->{type_of_resource} = [ $self->get_field_value( $mods, $resource_xpath ) ];

    for my $pair (
        [ tcn       => $tcn_xpath ],
        [ pubdate   => $pub_xpath ],
        [ publisher => $publisher_xpath ],
        [ edition   => $edition_xpath ],
    ) {
        my( $field, $xpath ) = @$pair;
        ($doc->{$field}) = $self->get_field_value( $mods, $xpath );
    }

    # holds an array of [ link, title, link, title, ... ]
    $doc->{online_loc} = [ $self->get_field_value( $mods, $online_loc_xpath ) ];

    ($doc->{synopsis}) =
        $self->get_field_value( $mods, $abstract_xpath );

}
308
309
310
311 # ---------------------------------------------------------------------------
312 # Takes a MARCXML string and adds it to the growing MODS doc
313 # ---------------------------------------------------------------------------
# Folds one more MARCXML record into the batch held in $self->{master_doc}:
# its subjects and types of resource are appended to the master doc, and
# its ISBN is adopted only when the master doc does not have one yet.
sub push_mods_batch {
    my( $self, $marcxml ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($marcxml) );

    my $slim = $self->mods_values_to_mods_slim(
        $self->modsdoc_to_values( $mods ) );

    push( @{ $self->{master_doc}->{subject} }, $_ )
        for @{ $slim->{subject} };

    push( @{ $self->{master_doc}->{type_of_resource} },
        $self->get_field_value( $mods, $resource_xpath ) );

    # the first record that supplies an ISBN wins
    if( !$self->{master_doc}->{isbn} ) {
        ($self->{master_doc}->{isbn}) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
335
336
337 # ---------------------------------------------------------------------------
338 # Completes a MARC -> Unified MODS batch process and returns the perl hash
339 # ---------------------------------------------------------------------------
# Creates a Fieldmapper::metabib::virtual_record whose subject,
# types_of_resource and call_numbers fields start out as empty arrays.
sub init_virtual_record {
    # Explicit Class->new rather than indirect-object "new Class": this
    # package defines its own new(), and the indirect form leaves it to
    # Perl's parsing heuristics whether the class method or the local
    # sub gets called.
    my $record = Fieldmapper::metabib::virtual_record->new;
    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);
    return $record;
}
347
# Ends a batch: copies the values gathered in $self->{master_doc} into a
# virtual_record object, clears the master doc and returns the record.
# Along the way:
#   - bracketed text ("[...]") is removed from the title and parenthesised
#     text ("(...)") from the author;
#   - each series string is cut at its first ';' (with preceding spaces);
#   - duplicate types of resource are dropped.
sub finish_mods_batch {
    my $self = shift;
    my $perl = $self->{master_doc};
    my $record = init_virtual_record();

    # turn the hash into a fieldmapper object

    # Only clean up values that exist: running s/// on an undefined title
    # or author raises "uninitialized value" warnings.
    my $title = $perl->{title};
    $title =~ s/\[.*?\]//g if defined $title;

    my $author = $perl->{author};
    $author =~ s/\(.*?\)//g if defined $author;

    my @series;
    for my $s (@{$perl->{series}}) {
        push @series, (split( /\s*;/, $s ))[0];
    }

    # uniquify the types of resource
    my $rtypes = $perl->{type_of_resource};
    my %hash = map { ($_ => 1) } @$rtypes;
    $rtypes = [ keys %hash ];

    $record->title($title);
    $record->author($author);

    $record->doc_id($perl->{doc_id});
    $record->isbn($perl->{isbn});
    $record->pubdate($perl->{pubdate});
    $record->publisher($perl->{publisher});
    $record->tcn($perl->{tcn});

    $record->edition($perl->{edition});

    $record->subject($perl->{subject});
    $record->types_of_resource($rtypes);
    $record->series(\@series);

    $record->online_loc($perl->{online_loc});
    $record->synopsis($perl->{synopsis});

    # the batch is over; the next one starts from scratch
    $self->{master_doc} = undef;
    return $record;
}
388
389
390