]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Utils/ModsParser.pm
tuning, added serials
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use Data::Dumper;
10
# Module-wide XML machinery, built once at load time: a shared parser and
# the compiled MARC21slim -> MODS stylesheet used by the batch methods.
my $parser     = XML::LibXML->new();
my $xslt       = XML::LibXSLT->new();
my $xslt_doc   = $parser->parse_file("/pines/cvs/ILS/Open-ILS/xsl/MARC21slim2MODS.xsl");
my $mods_sheet = $xslt->parse_stylesheet($xslt_doc);
16
17 # ----------------------------------------------------------------------------------------
18 # XXX get me from the database and cache me ...
# XPath expressions for the single-valued fields read directly from a MODS doc.
my $isbn_xpath     = "//mods:mods/mods:identifier[\@type='isbn']";
my $resource_xpath = "//mods:mods/mods:typeOfResource";

# Publication date: union of the MARC-encoded dateIssued nodes and the
# first dateIssued under originInfo.
my $pub_xpath = join( '|',
    "//mods:mods/mods:originInfo//mods:dateIssued[\@encoding='marc']",
    "//mods:mods/mods:originInfo//mods:dateIssued[1]",
);

my $tcn_xpath       = "//mods:mods/mods:recordInfo/mods:recordIdentifier";
my $publisher_xpath = "//mods:mods/mods:originInfo//mods:publisher[1]";
25
26
# Table of XPath expressions grouped by field class (title, author, subject,
# series) and then by type within the class.  modsdoc_to_values() walks this
# table to extract values from a MODS document.
my $xpathset = {
    title => {
        # MODS spells this titleInfo type "abbreviated"; the previous
        # 'abreviated' spelling could never match a real document.
        abbreviated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
        translated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
        uniform =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
        # the proper title is the titleInfo that carries no type attribute
        proper =>
            "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
    },
    author => {
        # first namePart of a name of the given type whose role text is 'creator'
        corporate =>
            "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        personal =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        conference =>
            "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        # any personal namePart, with no role restriction
        other =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
    },
    subject => {

        # every subject element is collected under the single 'topic' type
        topic =>
            "//mods:mods/*[local-name()='subject']",

#       geographic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
#       name =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
#       temporal =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
#       topic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },
    #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

    series => {
        series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
    }
};
71 # ----------------------------------------------------------------------------------------
72
73
74
# Constructor: returns an empty hash blessed into the invoking class.
sub new {
    my $class = shift();
    my $self  = {};
    return bless( $self, $class );
}
76
# ---------------------------------------------------------------------------
# Runs $xpath against the root element of the MODS document $mods and returns
# one entry per matching node:
#   - an array ref of strings, one per non-text child of the node, when the
#     node has such children (a child's own children are joined with ' ');
#   - otherwise the plain text content of the node.
# ---------------------------------------------------------------------------
sub get_field_value {

    my( $self, $mods, $xpath ) = @_;

    my $root = $mods->documentElement;
    $root->setNamespace( "http://www.loc.gov/mods/", "mods", 1 );

    my @results;
    foreach my $node ( $root->findnodes( $xpath ) ) {

        my @parts;
        foreach my $kid ( $node->childNodes() ) {

            # node type 3 is a text node; only the other children are collected
            next if $kid->nodeType == 3;

            if( $kid->childNodes ) {
                push( @parts,
                    join( ' ', map { $_->textContent } @{ $kid->childNodes } ) );
            } else {
                push( @parts, $kid->textContent );
            }
        }

        # no usable children: fall back to the node's own text
        push( @results, ( @parts ? \@parts : $node->textContent ) );
    }

    return @results;
}
117
118 =head
119 sub _modsdoc_to_values {
120         my( $self, $mods ) = @_;
121         my $data = {};
122         for my $class (keys %$xpathset) {
123                 $data->{$class} = {};
124                 for my $type(keys %{$xpathset->{$class}}) {
125                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
126                         if( $class eq "subject" ) {
127                                 push( @{$data->{$class}->{$type}},  @value );
128                         } else {
129                                 $data->{$class}->{$type} = $value[0];
130                         }
131                 }
132         }
133         return $data;
134 }
135 =cut
136
# ---------------------------------------------------------------------------
# Extracts every class/type listed in $xpathset from the MODS document $mods
# and returns a hash ref of the form { class => { type => value(s) } }:
#   subject - list of raw get_field_value() results per type
#   title   - a single string per type (the last match wins)
#   author  - the first get_field_value() result per type
#   series  - list of strings per type
# ---------------------------------------------------------------------------
sub modsdoc_to_values {
    my( $self, $mods ) = @_;

    my %data;

    # fetch the values for one class/type pair
    my $fetch = sub {
        my( $class, $type ) = @_;
        return $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
    };

    # collapse a list-of-parts result into a single space separated string
    my $as_string = sub {
        my $val = shift;
        return ref($val) ? join(" ", @$val) : $val;
    };

    $data{subject} = {};
    foreach my $type ( keys %{$xpathset->{subject}} ) {
        my @found = $fetch->( "subject", $type );
        push( @{$data{subject}->{$type}}, @found ) if @found;
    }

    $data{title} = {};
    foreach my $type ( keys %{$xpathset->{title}} ) {
        my @found = $fetch->( "title", $type );
        $data{title}->{$type} = $as_string->( $found[-1] ) if @found;
    }

    $data{author} = {};
    foreach my $type ( keys %{$xpathset->{author}} ) {
        my @found = $fetch->( "author", $type );
        $data{author}->{$type} = $found[0];
    }

    $data{series} = {};
    foreach my $type ( keys %{$xpathset->{series}} ) {
        my @found = $fetch->( "series", $type );
        push( @{$data{series}->{$type}}, map { $as_string->($_) } @found ) if @found;
    }

    return \%data;
}
194
195
196
197
# ---------------------------------------------------------------------------
# Grabs the data 'we want' from the MODS doc and returns it in hash form
#
# $modsperl is the { class => { type => value } } hash produced by
# modsdoc_to_values().  Returns a hash ref with the keys:
#   title   - first true value of proper, translated, abbreviated, uniform
#   author  - first true value of personal, other, corporate, conference
#   subject - array ref holding the subjects of every type
#   series  - array ref of series strings
# Missing data is reported as "" (title, author) or [] (subject, series),
# never as undef.
# ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    $modsperl ||= {};

    my $title   = "";
    my $author  = "";
    my $subject = [];
    my $series  = [];

    my $tmp = $modsperl->{title};
    if( $tmp ) {
        $title = $tmp->{proper}      ||
                 $tmp->{translated}  ||
                 $tmp->{abbreviated} ||
                 $tmp->{uniform};
        # the chain yields undef when no title type is present at all
        $title = "" unless defined $title;
    }

    $tmp = $modsperl->{author};
    if( $tmp ) {
        $author = $tmp->{personal}  ||
                  $tmp->{other}     ||
                  $tmp->{corporate} ||
                  $tmp->{conference};
        $author = "" unless defined $author;
    }

    $tmp = $modsperl->{subject};
    if( $tmp ) {
        for my $key ( keys %{$tmp} ) {
            push( @$subject, @{$tmp->{$key}} ) if ( $tmp->{$key} );
        }
    }

    $tmp = $modsperl->{'series'};
    if( $tmp ) {
        # the 'series' type is absent when the document has no series
        $series = $tmp->{'series'} || [];
    }

    return { series => $series, title => $title, author => $author, subject => $subject };

}
245
246
247
# ---------------------------------------------------------------------------
# Initializes a MARC -> Unified MODS batch process
#
# $master_doc is a MARCXML string.  It is transformed to MODS and the slim
# field hash built from it is stored in $self->{master_doc}, together with
# the isbn, type_of_resource (array ref), tcn, pubdate and publisher values.
# ---------------------------------------------------------------------------

sub start_mods_batch {

    my( $self, $master_doc ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($master_doc) );

#   warn "-" x 100 . "\n";
#   warn "MODS " . $mods->toString(1) . "\n";
#   warn "-" x 100 . "\n";

    my $doc = $self->{master_doc} =
        $self->mods_values_to_mods_slim( $self->modsdoc_to_values( $mods ) );

    # single-valued fields keep only the first match
    ($doc->{isbn}) = $self->get_field_value( $mods, $isbn_xpath );

    $doc->{type_of_resource} = [ $self->get_field_value( $mods, $resource_xpath ) ];

    ($doc->{tcn}) = $self->get_field_value( $mods, $tcn_xpath );

    ($doc->{pubdate}) = $self->get_field_value( $mods, $pub_xpath );

    ($doc->{publisher}) = $self->get_field_value( $mods, $publisher_xpath );

}
282
# ---------------------------------------------------------------------------
# Takes a MARCXML string and adds it to the growing MODS doc
#
# Subjects and resource types of the new record are appended to the master
# doc; the isbn is only taken over when the master doc has none yet.
# ---------------------------------------------------------------------------
sub push_mods_batch {
    my( $self, $marcxml ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($marcxml) );

    my $slim = $self->mods_values_to_mods_slim( $self->modsdoc_to_values( $mods ) );

    foreach my $subj ( @{$slim->{subject}} ) {
        push( @{$self->{master_doc}->{subject}}, $subj );
    }

    my @resources = $self->get_field_value( $mods, $resource_xpath );
    push( @{$self->{master_doc}->{type_of_resource}}, @resources );

    if( ! $self->{master_doc}->{isbn} ) {
        ($self->{master_doc}->{isbn}) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
307
308
# ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process and returns the result as a
# Fieldmapper::metabib::virtual_record object
# ---------------------------------------------------------------------------
# Creates a Fieldmapper::metabib::virtual_record whose subject,
# types_of_resource and call_numbers fields start out as empty lists.
sub init_virtual_record {
    # explicit Class->new call; the indirect-object form "new Class" is
    # parsed heuristically and is discouraged by perlobj
    my $record = Fieldmapper::metabib::virtual_record->new;
    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);
    return $record;
}
319
# Turns the hash accumulated in $self->{master_doc} into a virtual record
# object, clears $self->{master_doc} and returns the record.
#   - bracketed text is stripped from the title, parenthesised text from
#     the author
#   - each series string is cut at its first ';'
sub finish_mods_batch {
    my $self = shift;
    my $perl = $self->{master_doc} || {};
    my $record = init_virtual_record();

    # turn the hash into a fieldmapper object
    my $title = $perl->{title};
    $title =~ s/\[.*?\]//g if defined $title;

    my $author = $perl->{author};
    $author =~ s/\(.*?\)//g if defined $author;

    my @series;
    for my $s (@{ $perl->{series} || [] }) {
        push @series, (split( /\s*;/, $s ))[0];
    }

    $record->title($title);
    $record->author($author);

    $record->doc_id($perl->{doc_id});
    $record->isbn($perl->{isbn});
    $record->pubdate($perl->{pubdate});
    $record->publisher($perl->{publisher});
    $record->tcn($perl->{tcn});

    $record->subject($perl->{subject});

    # start_mods_batch/push_mods_batch store the resource types under
    # 'type_of_resource'; the old lookup of 'types_of_resource' never
    # matched, so the collected values were dropped.  The plural key is
    # still honoured in case a caller filled it in directly.
    $record->types_of_resource(
        $perl->{type_of_resource} || $perl->{types_of_resource} || [] );

    $record->series(\@series);

    $self->{master_doc} = undef;
    #return $perl
    return $record;
}
351
352
353