]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Utils/ModsParser.pm
updated mods parser with new virtual_record fields
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use Data::Dumper;
10
# ----------------------------------------------------------------------------------------
# Shared XML machinery.  The parser and the compiled MARC21slim -> MODS3
# stylesheet are created once, when this module is loaded, and reused by
# every batch.
# NOTE(review): the stylesheet location is a fixed absolute path; confirm it
# is valid on the deployment host.
my $parser   = XML::LibXML->new();
my $xslt     = XML::LibXSLT->new();
my $xslt_doc = $parser->parse_file(
    "/pines/cvs/ILS/Open-ILS/xsl/MARC21slim2MODS3.xsl"
);
my $mods_sheet = $xslt->parse_stylesheet($xslt_doc);

# ----------------------------------------------------------------------------------------
# XPATH for extracting info from a MODS doc
# (all expressions rely on the "mods" namespace prefix)

# identifiers and record-level information
my $isbn_xpath     = "//mods:mods/mods:identifier[\@type='isbn']";
my $tcn_xpath      = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
my $resource_xpath = "//mods:mods/mods:typeOfResource";

# publication details, all found below originInfo
my $pub_xpath = join( "|",
    "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']",
    "//mods:mods/mods:originInfo//mods:dateIssued[1]",
);
my $publisher_xpath = "//mods:mods/mods:originInfo//mods:publisher[1]";
my $edition_xpath   = "//mods:mods/mods:originInfo//mods:edition[1]";

# descriptive extras; the two empty expressions are placeholders
my $abstract_xpath = "//mods:mods/mods:abstract";
my $toc_xpath      = "";
my $related_xpath  = "";

# online locations: each url element plus its displayLabel attribute
my $online_loc_xpath =
    "(//mods:location/mods:url|//mods:location/mods:url/\@displayLabel)";
30
# Table of XPath expressions, keyed first by field class (title, author,
# subject, series) and then by the type within that class.
# modsdoc_to_values() evaluates every expression listed here.
my $xpathset = {

    title => {
        # MODS spells this attribute value "abbreviated"; the previous
        # spelling ("abreviated") could never match a titleInfo element.
        abbreviated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
        translated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
        uniform =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
        # a titleInfo without any type attribute is the proper title
        proper =>
            "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
    },

    author => {
        # the first three only accept names carrying the 'creator' role text
        corporate =>
            "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        personal =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        conference =>
            "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        # any personal name, whatever its role
        other =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
    },

    subject => {

        # every subject element that has a child other than geographicCode
        topic =>
            "//mods:mods/mods:subject/*[local-name()!='geographicCode']/parent::mods:subject",

#       geographic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
#       name =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
#       temporal =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
#       topic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },
    #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

    series => {
        series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
    }
};
78 # ----------------------------------------------------------------------------------------
79
80
81
# Constructor: returns a new, empty hash-based object blessed into the
# class it was invoked on.
sub new {
    my $class = shift;
    return bless {}, $class;
}
83
# Evaluates $xpath against the root element of the document $mods and
# returns a list with one entry per matching node:
#   - an array ref of strings, one per non-text child of the match, when
#     the match has at least one such child (a child that itself has
#     children contributes the text of those children joined by a space);
#   - otherwise the text content of the matching node itself.
# The root element is bound to the MODS v3 namespace under the "mods"
# prefix before the expression is evaluated.
sub get_field_value {
    my ( $self, $mods, $xpath ) = @_;

    my $root = $mods->documentElement;
    $root->setNamespace( "http://www.loc.gov/mods/v3", "mods", 1 );

    my @results;
    for my $match ( $root->findnodes($xpath) ) {

        my @parts;
        for my $kid ( $match->childNodes() ) {

            # node type 3 is a text node; only other node types are collected
            next if $kid->nodeType == 3;

            if ( $kid->childNodes ) {
                push @parts,
                    join( ' ', map { $_->textContent } @{ $kid->childNodes } );
            }
            else {
                push @parts, $kid->textContent;
            }
        }

        # structured matches yield their parts, flat ones their own text
        push @results, ( @parts ? \@parts : $match->textContent );
    }

    return @results;
}
124
125 =head
126 sub _modsdoc_to_values {
127         my( $self, $mods ) = @_;
128         my $data = {};
129         for my $class (keys %$xpathset) {
130                 $data->{$class} = {};
131                 for my $type(keys %{$xpathset->{$class}}) {
132                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
133                         if( $class eq "subject" ) {
134                                 push( @{$data->{$class}->{$type}},  @value );
135                         } else {
136                                 $data->{$class}->{$type} = $value[0];
137                         }
138                 }
139         }
140         return $data;
141 }
142 =cut
143
# Evaluates every expression in $xpathset against the MODS document $mods
# and returns a hash ref keyed by class (subject, title, author, series),
# each holding a hash keyed by type:
#   subject - array ref of every match, exactly as get_field_value gave it
#   title   - a single string taken from the last match
#   author  - the first match, or undef when nothing matched
#   series  - array ref with one string per match
# Multi-part matches are flattened to strings by joining with a space.
# Subject, title and series types without any match get no key at all.
sub modsdoc_to_values {
    my ( $self, $mods ) = @_;

    # each class starts out as an empty hash
    my %values = map { $_ => {} } qw(subject title author series);

    # turns one get_field_value() entry into a plain string
    my $as_string = sub {
        my ($entry) = @_;
        return ref($entry) ? join( " ", @$entry ) : $entry;
    };

    # runs the expression registered for ($class, $type) on the document
    my $lookup = sub {
        my ( $class, $type ) = @_;
        return $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
    };

    for my $type ( keys %{ $xpathset->{subject} } ) {
        my @found = $lookup->( 'subject', $type );
        push @{ $values{subject}{$type} }, @found if @found;
    }

    for my $type ( keys %{ $xpathset->{title} } ) {
        my @found = $lookup->( 'title', $type );

        # when several nodes match, the last one supplies the title
        $values{title}{$type} = $as_string->( $found[-1] ) if @found;
    }

    for my $type ( keys %{ $xpathset->{author} } ) {
        my @found = $lookup->( 'author', $type );
        $values{author}{$type} = $found[0];
    }

    for my $type ( keys %{ $xpathset->{series} } ) {
        my @found = $lookup->( 'series', $type );
        push @{ $values{series}{$type} }, map { $as_string->($_) } @found
            if @found;
    }

    return \%values;
}
201
202
203
204
205 # ---------------------------------------------------------------------------
206 # Grabs the data 'we want' from the MODS doc and returns it in hash form
207 # ---------------------------------------------------------------------------
# Returns the value of the first key in @keys whose entry in the hash ref
# $values is defined and non-empty, or "" when there is no such entry (or
# $values is not set at all).
sub _first_nonempty_value {
    my ( $values, @keys ) = @_;
    return "" unless $values;

    for my $key (@keys) {
        my $candidate = $values->{$key};
        return $candidate if defined $candidate && length $candidate;
    }
    return "";
}

# Reduces the structure built by modsdoc_to_values() to a flat hash ref:
#   title   - proper, translated, abbreviated or uniform title (first one
#             present, in that order), "" when there is none
#   author  - personal, other, corporate or conference author (first one
#             present, in that order), "" when there is none
#   subject - array ref with the subjects of every subject type combined
#   series  - array ref of series entries, empty when there are none
# title and author are never undef and series is always an array ref, so
# callers can use the result without checking for missing pieces.
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $title = _first_nonempty_value(
        $modsperl->{title}, qw(proper translated abbreviated uniform) );

    my $author = _first_nonempty_value(
        $modsperl->{author}, qw(personal other corporate conference) );

    my $subject = [];
    if( my $by_type = $modsperl->{subject} ) {
        for my $type ( keys %$by_type ) {
            push( @$subject, @{ $by_type->{$type} } ) if $by_type->{$type};
        }
    }

    # look inside the series hash only when it exists, so that the caller's
    # structure is not autovivified; fall back to an empty list
    my $series_by_type = $modsperl->{series};
    my $series = ( $series_by_type && $series_by_type->{series} ) || [];

    return { series => $series, title => $title,
            author => $author, subject => $subject };
}
252
253
254
255 # ---------------------------------------------------------------------------
256 # Initializes a MARC -> Unified MODS batch process
257 # ---------------------------------------------------------------------------
258
# Starts a batch from the master MARCXML string $master_doc: the record is
# transformed to MODS, written to STDERR via warn, and its slim values
# (title, author, subject, series) plus the extra fields gathered below
# are stored in $self->{master_doc}.
sub start_mods_batch {
    my ( $self, $master_doc ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($master_doc) );

    # dump the generated MODS between two separator lines
    my $separator = "-" x 100 . "\n";
    warn $separator;
    warn "MODS " . $mods->toString(1) . "\n";
    warn $separator;

    my $doc = $self->mods_values_to_mods_slim(
        $self->modsdoc_to_values($mods) );
    $self->{master_doc} = $doc;

    # fields that keep only the first match of their expression
    my @first_match_only = (
        [ isbn      => $isbn_xpath ],
        [ tcn       => $tcn_xpath ],
        [ pubdate   => $pub_xpath ],
        [ publisher => $publisher_xpath ],
        [ edition   => $edition_xpath ],
    );
    for my $pair (@first_match_only) {
        my ( $field, $xpath ) = @$pair;
        ( $doc->{$field} ) = $self->get_field_value( $mods, $xpath );
    }

    # fields that keep every match
    $doc->{type_of_resource} =
        [ $self->get_field_value( $mods, $resource_xpath ) ];

    # holds an array of [ link, title, link, title, ... ]
    $doc->{online_loc} =
        [ $self->get_field_value( $mods, $online_loc_xpath ) ];

    ( $doc->{synopsis} ) = $self->get_field_value( $mods, $abstract_xpath );
}
303
304
305
306 # ---------------------------------------------------------------------------
307 # Takes a MARCXML string and adds it to the growing MODS doc
308 # ---------------------------------------------------------------------------
# Merges one more MARCXML record into the batch: its subjects and types
# of resource are appended to those of the master record, and its ISBN is
# adopted when the master record has none.
sub push_mods_batch {
    my ( $self, $marcxml ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($marcxml) );
    my $slim = $self->mods_values_to_mods_slim(
        $self->modsdoc_to_values($mods) );

    my @new_subjects = @{ $slim->{subject} };
    push @{ $self->{master_doc}->{subject} }, @new_subjects
        if @new_subjects;

    my @new_types = $self->get_field_value( $mods, $resource_xpath );
    push @{ $self->{master_doc}->{type_of_resource} }, @new_types;

    # only fill in the ISBN when the batch does not have one yet
    if ( !$self->{master_doc}->{isbn} ) {
        ( $self->{master_doc}->{isbn} ) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
330
331
332 # ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process and returns the result as
# a virtual_record object (see finish_mods_batch)
334 # ---------------------------------------------------------------------------
# Creates a Fieldmapper::metabib::virtual_record whose subject,
# types_of_resource and call_numbers fields are set to empty array refs.
sub init_virtual_record {

    # Explicit class-method call.  The indirect-object form
    # ("new Fieldmapper::metabib::virtual_record") is ambiguous in a
    # package that defines its own new(): Perl may parse it as a call to
    # that local sub instead of the record class constructor.
    my $record = Fieldmapper::metabib::virtual_record->new;

    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);
    return $record;
}
342
# Ends the current batch: copies the values collected in
# $self->{master_doc} into a new virtual_record object, clears
# $self->{master_doc} and returns the record.
#   - bracketed text is removed from the title, parenthesised text from
#     the author
#   - each series entry is cut at its first ";"
#   - duplicate types of resource are dropped, keeping first-seen order
sub finish_mods_batch {
    my $self = shift;
    my $perl = $self->{master_doc};
    my $record = init_virtual_record();

    # turn the hash into a fieldmapper object
    (my $title = $perl->{title}) =~ s/\[.*?\]//og;
    (my $author = $perl->{author}) =~ s/\(.*?\)//og;

    # keep only the part of each series entry before the first ";"
    # (an empty entry contributes nothing)
    my @series = map { (split( /\s*;/, $_ ))[0] } @{ $perl->{series} || [] };

    # uniquify the types of resource.  Going through "keys" of a hash would
    # return them in an unspecified order, so duplicates are filtered in
    # place instead; a batch without any types yields an empty list rather
    # than dying on an undefined array reference.
    my %seen;
    my $rtypes = [ grep { !$seen{$_}++ } @{ $perl->{type_of_resource} || [] } ];

    $record->title($title);
    $record->author($author);

    $record->doc_id($perl->{doc_id});
    $record->isbn($perl->{isbn});
    $record->pubdate($perl->{pubdate});
    $record->publisher($perl->{publisher});
    $record->tcn($perl->{tcn});

    $record->edition($perl->{edition});

    $record->subject($perl->{subject});
    $record->types_of_resource($rtypes);
    $record->series(\@series);

    $record->online_loc($perl->{online_loc});
    $record->synopsis($perl->{synopsis});

    # the batch is over; the next one has to begin with start_mods_batch
    $self->{master_doc} = undef;
    return $record;
}
383
384
385