]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Utils/ModsParser.pm
onward and upward
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8 use OpenILS::Utils::Fieldmapper;
9 use Data::Dumper;
10
# XML machinery shared by every parser instance; created once at load time.
my $parser = XML::LibXML->new;
my $xslt   = XML::LibXSLT->new;

# Read the MARC21slim -> MODS stylesheet from disk and hand it to LibXSLT,
# so each record can be transformed with the same stylesheet object.
my $xslt_doc =
    $parser->parse_file("/pines/cvs/ILS/Open-ILS/xsl/MARC21slim2MODS.xsl");
my $mods_sheet = $xslt->parse_stylesheet($xslt_doc);
16
17 # ----------------------------------------------------------------------------------------
18 # XXX get me from the database and cache me ...
# XPath expressions for the individual record fields.  They all use the
# "mods" namespace prefix that get_field_value() sets on the document root.
my $isbn_xpath     = q{//mods:mods/mods:identifier[@type='isbn']};
my $resource_xpath = q{//mods:mods/mods:typeOfResource};

# Union of two expressions: dateIssued elements flagged encoding='marc',
# plus dateIssued elements selected by the [1] position predicate.
my $pub_xpath = join(
    '|',
    q{//mods:mods/mods:originInfo//mods:dateIssued[@encoding='marc']},
    q{//mods:mods/mods:originInfo//mods:dateIssued[1]},
);

my $tcn_xpath       = q{//mods:mods/mods:recordInfo/mods:recordIdentifier};
my $publisher_xpath = q{//mods:mods/mods:originInfo//mods:publisher[1]};
my $edition_xpath   = q{//mods:mods/mods:originInfo//mods:edition[1]};
27
28
# XPath expressions grouped by class (title, author, subject, series) and,
# inside each class, by type.  modsdoc_to_values() evaluates the entries of
# every class listed here.
my $xpathset = {

    title => {
        # The @type value must use the MODS spelling "abbreviated" exactly;
        # any other spelling makes the predicate impossible to satisfy.
        abbreviated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
        translated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
        uniform =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
        # titleInfo elements that carry no type attribute at all
        proper =>
            "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
    },

    author => {
        # corporate / personal / conference only accept namePart elements
        # whose parent name has a role text equal to "creator".
        corporate =>
            "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        personal =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        conference =>
            "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        # any personal namePart, with no role requirement
        other =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
    },

    subject => {

        # Whole subject elements; the finer-grained variants below are
        # kept for reference but are disabled.
        topic =>
            "//mods:mods/*[local-name()='subject']",

#       geographic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
#       name =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
#       temporal =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
#       topic =>
#           "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },
    #keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },

    series => {
        series => "//mods:mods/mods:relatedItem[\@type='series']/mods:titleInfo"
    }
};
76 # ----------------------------------------------------------------------------------------
77
78
79
# Constructor: returns an empty hash blessed into the invoking class.
sub new {
    my $class = shift;
    return bless {}, $class;
}
81
sub get_field_value {

    # Evaluate $xpath against the MODS document $mods and return one list
    # entry per matching node:
    #   - an array ref of strings (one per non-text child) when the node
    #     has at least one non-text child, or
    #   - the node's own text content when it has none.
    my( $self, $mods, $xpath ) = @_;

    # Bind the "mods" prefix on the root so the prefixed XPath resolves.
    my $root = $mods->documentElement;
    $root->setNamespace( "http://www.loc.gov/mods/", "mods", 1 );

    my @results;
    for my $node ( $root->findnodes($xpath) ) {

        my @parts;

        # nodeType 3 is a text node; only the other children contribute.
        for my $kid ( grep { $_->nodeType != 3 } $node->childNodes() ) {

            # A child with children of its own yields their text joined
            # by single spaces; a childless one yields its own text.
            push @parts, (
                $kid->childNodes
                    ? join( ' ', map { $_->textContent } @{ $kid->childNodes } )
                    : $kid->textContent
            );
        }

        push @results, ( @parts ? \@parts : $node->textContent );
    }

    return @results;
}
122
123 =head
124 sub _modsdoc_to_values {
125         my( $self, $mods ) = @_;
126         my $data = {};
127         for my $class (keys %$xpathset) {
128                 $data->{$class} = {};
129                 for my $type(keys %{$xpathset->{$class}}) {
130                         my @value = $self->get_field_value( $mods, $xpathset->{$class}->{$type} );
131                         if( $class eq "subject" ) {
132                                 push( @{$data->{$class}->{$type}},  @value );
133                         } else {
134                                 $data->{$class}->{$type} = $value[0];
135                         }
136                 }
137         }
138         return $data;
139 }
140 =cut
141
sub modsdoc_to_values {

    # Run every XPath in $xpathset for the subject, title, author and
    # series classes against $mods and return the results as a hash ref
    # of the form { class => { type => value } }.
    my( $self, $mods ) = @_;

    # One get_field_value() result is either an array ref of strings or
    # a plain string; this turns either form into a single string.
    my $flatten = sub {
        my $item = shift;
        return ref($item) ? join( " ", @$item ) : $item;
    };

    my %data;

    # subject: every match is kept exactly as returned
    $data{subject} = {};
    for my $type ( keys %{ $xpathset->{subject} } ) {
        my @found = $self->get_field_value( $mods, $xpathset->{subject}->{$type} );
        # guard so a type without matches gets no (empty) entry
        push @{ $data{subject}{$type} }, @found if @found;
    }

    # title: a single string per type; when several nodes match, the
    # last one wins
    $data{title} = {};
    for my $type ( keys %{ $xpathset->{title} } ) {
        my @found = $self->get_field_value( $mods, $xpathset->{title}->{$type} );
        $data{title}{$type} = $flatten->( $found[-1] ) if @found;
    }

    # author: the first match, or undef when nothing matched
    $data{author} = {};
    for my $type ( keys %{ $xpathset->{author} } ) {
        my ($first) = $self->get_field_value( $mods, $xpathset->{author}->{$type} );
        $data{author}{$type} = $first;
    }

    # series: a list of strings, one per match
    $data{series} = {};
    for my $type ( keys %{ $xpathset->{series} } ) {
        my @found = $self->get_field_value( $mods, $xpathset->{series}->{$type} );
        push @{ $data{series}{$type} }, map { $flatten->($_) } @found if @found;
    }

    return \%data;
}
199
200
201
202
203 # ---------------------------------------------------------------------------
204 # Grabs the data 'we want' from the MODS doc and returns it in hash form
205 # ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {

    # Reduce the per-class hash built by modsdoc_to_values() to a single
    # title, a single author, one flat subject list and a series list.
    #
    #   $modsperl - hash ref keyed by class (title, author, subject, series)
    #
    # Returns a hash ref with the keys series, title, author and subject.
    # series and subject are always array refs.  title and author are ""
    # when their class is missing from $modsperl, and hold whatever false
    # value the last candidate had (typically undef) when no type matched.
    my( $self, $modsperl ) = @_;

    # First true value wins, in order of preference.
    my $titles = $modsperl->{title};
    my $title  = !$titles
        ? ""
        : (    $titles->{proper}
            || $titles->{translated}
            || $titles->{abbreviated}
            || $titles->{uniform} );

    my $authors = $modsperl->{author};
    my $author  = !$authors
        ? ""
        : (    $authors->{personal}
            || $authors->{other}
            || $authors->{corporate}
            || $authors->{conference} );

    # Merge the subject lists of every type into one list.
    my @subject;
    if( my $subjects = $modsperl->{subject} ) {
        for my $key ( keys %$subjects ) {
            push @subject, @{ $subjects->{$key} } if $subjects->{$key};
        }
    }

    # A record without series still has the class hash but no "series"
    # entry in it; hand back an empty list in that case too, so callers
    # can always dereference the value as an array.
    my $series_class = $modsperl->{'series'};
    my $series = ( $series_class && $series_class->{'series'} ) || [];

    return {
        series  => $series,
        title   => $title,
        author  => $author,
        subject => \@subject,
    };
}
250
251
252
253 # ---------------------------------------------------------------------------
254 # Initializes a MARC -> Unified MODS batch process
255 # ---------------------------------------------------------------------------
256
sub start_mods_batch {

    # Begin a batch: transform the MARCXML string $master_doc to MODS and
    # store the values extracted from it in $self->{master_doc}.
    my( $self, $master_doc ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($master_doc) );

#   warn "-" x 100 . "\n";
#   warn "MODS " . $mods->toString(1) . "\n";
#   warn "-" x 100 . "\n";

    $self->{master_doc} = $self->modsdoc_to_values( $mods );
    my $master = $self->{master_doc} =
        $self->mods_values_to_mods_slim( $self->{master_doc} );

    # Single-valued field: only the first match is kept.
    ( $master->{isbn} ) = $self->get_field_value( $mods, $isbn_xpath );

    # Every resource type is kept, as a list.
    $master->{type_of_resource} =
        [ $self->get_field_value( $mods, $resource_xpath ) ];

    # More single-valued fields, first match only.
    for my $pair (
        [ tcn       => $tcn_xpath ],
        [ pubdate   => $pub_xpath ],
        [ publisher => $publisher_xpath ],
    ) {
        my( $field, $xpath ) = @$pair;
        ( $master->{$field} ) = $self->get_field_value( $mods, $xpath );
    }

    # Deliberately the final statement: the sub has no explicit return,
    # so this assignment is what a caller would see as its value.
    ( $master->{edition} ) = $self->get_field_value( $mods, $edition_xpath );
}
290
291 # ---------------------------------------------------------------------------
292 # Takes a MARCXML string and adds it to the growing MODS doc
293 # ---------------------------------------------------------------------------
sub push_mods_batch {

    # Fold one more MARCXML record into the running batch: its subjects
    # and resource types are appended, and its ISBN is used only when the
    # batch does not have one yet.
    my( $self, $marcxml ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($marcxml) );
    my $slim = $self->mods_values_to_mods_slim( $self->modsdoc_to_values( $mods ) );

    # Only touch the master subject list when there is something to add.
    if( my @subjects = @{ $slim->{subject} } ) {
        push @{ $self->{master_doc}->{subject} }, @subjects;
    }

    my @types = $self->get_field_value( $mods, $resource_xpath );
    push @{ $self->{master_doc}->{type_of_resource} }, @types;

    # Kept as the last statement so the sub's implicit value is unchanged.
    if( !$self->{master_doc}->{isbn} ) {
        ( $self->{master_doc}->{isbn} ) =
            $self->get_field_value( $mods, $isbn_xpath );
    }
}
315
316
317 # ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process and returns the result as
# the record object created by init_virtual_record (see finish_mods_batch)
319 # ---------------------------------------------------------------------------
sub init_virtual_record {

    # Create a Fieldmapper::metabib::virtual_record whose list-valued
    # fields (subject, types_of_resource, call_numbers) start out as
    # empty array refs.
    #
    # The constructor is invoked with an explicit class method call.  The
    # indirect form ("new Some::Class") is resolved by parser heuristics,
    # and because this package defines a new() of its own it could be
    # read as a call to that local sub instead.
    my $record = Fieldmapper::metabib::virtual_record->new;

    $record->subject([]);
    $record->types_of_resource([]);
    $record->call_numbers([]);

    return $record;
}
327
sub finish_mods_batch {

    # Finish the current batch: copy the values collected in
    # $self->{master_doc} into a fresh record from init_virtual_record(),
    # clear the batch state and return the record.
    my $self = shift;
    my $perl = $self->{master_doc};
    my $record = init_virtual_record();

    # Strip [bracketed] text from the title and (parenthesised) text from
    # the author.  Either value may be undef when the record had none, so
    # the substitution is skipped then instead of warning about an
    # uninitialized value.
    my $title = $perl->{title};
    $title =~ s/\[.*?\]//g if defined $title;

    my $author = $perl->{author};
    $author =~ s/\(.*?\)//g if defined $author;

    # Keep only the part of each series entry before the first ';'.
    # An entry that splits to nothing contributes no element.
    my @series;
    for my $s ( @{ $perl->{series} || [] } ) {
        push @series, ( split( /\s*;/, $s ) )[0];
    }

    # Uniquify the types of resource, keeping first-seen order so the
    # result is deterministic; a missing list is treated as empty.
    my %seen;
    my $rtypes = [ grep { !$seen{$_}++ } @{ $perl->{type_of_resource} || [] } ];

    # turn the hash into a fieldmapper object
    $record->title($title);
    $record->author($author);

    $record->doc_id($perl->{doc_id});
    $record->isbn($perl->{isbn});
    $record->pubdate($perl->{pubdate});
    $record->publisher($perl->{publisher});
    $record->tcn($perl->{tcn});

    $record->edition($perl->{edition});

    $record->subject($perl->{subject});
    $record->types_of_resource($rtypes);
    $record->series(\@series);

    # The batch is finished; drop its state.
    $self->{master_doc} = undef;
    return $record;
}
365
366
367