]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/perlmods/OpenILS/Utils/ModsParser.pm
MODS utility functions used across applications
[Evergreen.git] / Open-ILS / src / perlmods / OpenILS / Utils / ModsParser.pm
1 package OpenILS::Utils::ModsParser;
2 use strict; use warnings;
3
4 use OpenSRF::EX qw/:try/;
5 use XML::LibXML;
6 use XML::LibXSLT;
7 use Time::HiRes qw(time);
8
# Load-time XML machinery shared by every sub in this package: one LibXML
# parser, one LibXSLT engine, and the MARC21slim2MODS stylesheet parsed from
# disk and compiled once.
my $parser     = XML::LibXML->new();
my $xslt       = XML::LibXSLT->new();
my $xslt_doc   = $parser->parse_file("/pines/cvs/ILS/Open-ILS/xsl/MARC21slim2MODS.xsl");
my $mods_sheet = $xslt->parse_stylesheet($xslt_doc);
14
# ----------------------------------------------------------------------------------------
# XPath expressions for the single-valued fields read from a MODS document.
# XXX get me from the database and cache me ...
my $isbn_xpath      = q{//mods:mods/mods:identifier[@type='isbn']};
my $resource_xpath  = q{//mods:mods/mods:typeOfResource};

# Union of the MARC-encoded issue date and the first issue date.
my $pub_xpath       = join '|',
    q{//mods:mods/mods:originInfo//mods:dateIssued[@encoding='marc']},
    q{//mods:mods/mods:originInfo//mods:dateIssued[1]};

my $tcn_xpath       = q{//mods:mods/mods:recordInfo/mods:recordIdentifier};
my $publisher_xpath = q{//mods:mods/mods:originInfo//mods:publisher[1]};
23
24
# Lookup table: class => type => XPath expression.
# modsdoc_to_values() evaluates every leaf against a MODS document and returns
# the extracted text under the same class/type keys.
#
# The abbreviated-title test uses the MODS spelling "abbreviated"; the former
# misspelling ("abreviated") could never match a titleInfo element.
my $xpathset = {

    # titleInfo elements that carry a title, split by their type attribute;
    # "proper" is the titleInfo without any type attribute.
    title => {
        abbreviated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='abbreviated')]",
        translated =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='translated')]",
        uniform =>
            "//mods:mods/mods:titleInfo[mods:title and (\@type='uniform')]",
        proper =>
            "//mods:mods/mods:titleInfo[mods:title and not (\@type)]",
    },

    # First namePart of names whose role text is 'creator', by name type.
    # "other" takes every personal namePart regardless of role.
    author => {
        corporate =>
            "//mods:mods/mods:name[\@type='corporate']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        personal =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        conference =>
            "//mods:mods/mods:name[\@type='conference']/*[local-name()='namePart']".
                "[../mods:role/mods:text[text()='creator']][1]",
        other =>
            "//mods:mods/mods:name[\@type='personal']/*[local-name()='namePart']",
    },

    # Children of subject elements, by child element name.
    subject => {
        geographic =>
            "//mods:mods/*[local-name()='subject']/*[local-name()='geographic']",
        name =>
            "//mods:mods/*[local-name()='subject']/*[local-name()='name']",
        temporal =>
            "//mods:mods/*[local-name()='subject']/*[local-name()='temporal']",
        topic =>
            "//mods:mods/*[local-name()='subject']/*[local-name()='topic']",
    },

    # Every top-level child of the record except originInfo.
    keyword => { keyword => "//mods:mods/*[not(local-name()='originInfo')]", },
};
61 # ----------------------------------------------------------------------------------------
62
63
64
# Constructor: returns an empty hash blessed into the invoking class.
sub new {
    my $class = shift;
    return bless {}, $class;
}
66
# ---------------------------------------------------------------------------
# Returns the text found at $xpath in the document $mods as one string, each
# value followed by a single space ("" when nothing matches).
#
#   $mods  - document object; its documentElement is searched
#   $xpath - XPath expression written with the "mods" prefix
#
# For each matching node the text of every child node is appended unless that
# text already occurs somewhere in the accumulated string.  A matching node
# without children contributes its own text content instead.
# ---------------------------------------------------------------------------
sub get_field_value {

    my( $self, $mods, $xpath ) = @_;

    my $string = "";
    my $root = $mods->documentElement;

    # bind the "mods" prefix on the root before evaluating the prefixed XPath
    $root->setNamespace( "http://www.loc.gov/mods/", "mods", 1 );

    # walk the set of matching nodes
    for my $value ( $root->findnodes( $xpath ) ) {

        my @children = $value->childNodes();

        # leaf node: take its own text, no de-duplication
        if( ! @children ) {
            $string .= $value->textContent . " ";
            next;
        }

        for my $child (@children) {

            my $content = $child->textContent;
            $content = "" unless defined $content;

            # Uniquify: skip text already present in the buffer.  This is a
            # literal substring test; index() is used instead of a regex
            # because an empty interpolated pattern makes Perl reuse the last
            # successfully matched regex, which made the handling of empty
            # child text depend on the caller's regex history.  Empty text is
            # now always skipped.
            # NOTE(review): the original comment also said "don't de-dup for
            # the WORM" -- confirm whether some caller needs duplicates kept.
            next if index( $string, $content ) >= 0;

            $string .= $content . " ";
        }
    }
    return $string;
}
94
95
# Runs every XPath in $xpathset against the document $mods and returns a
# hashref shaped like $xpathset ( class => { type => extracted text } ).
sub modsdoc_to_values {
    my( $self, $mods ) = @_;

    my %values_for;
    for my $class ( keys %$xpathset ) {
        my $xpath_of = $xpathset->{$class};

        # one extracted string per type in this class
        my %found = map {
            ( $_ => scalar $self->get_field_value( $mods, $xpath_of->{$_} ) )
        } keys %$xpath_of;

        $values_for{$class} = \%found;
    }
    return \%values_for;
}
108
109
110
# ---------------------------------------------------------------------------
# Grabs the data 'we want' from the MODS value hash and returns it in hash form
#
#   $modsperl - hashref with optional 'title', 'author' and 'subject' entries,
#               each a hashref of type => text
#
# Returns { title => $string, author => $string, subject => \@strings }:
#   title   - first true value of proper, translated, abbreviated, uniform
#   author  - first true value of personal, other, corporate, conference
#   subject - every true subject value, ordered by subject type name so the
#             result does not depend on hash iteration order
# title and author are always defined ("" when nothing usable is present).
# ---------------------------------------------------------------------------
sub mods_values_to_mods_slim {
    my( $self, $modsperl ) = @_;

    my $title   = "";
    my $author  = "";
    my $subject = [];

    if( my $titles = $modsperl->{title} ) {
        # first true variant wins; otherwise the last one tried is kept
        for my $kind (qw/ proper translated abbreviated uniform /) {
            $title = $titles->{$kind};
            last if $title;
        }
        $title = "" unless defined $title;
    }

    if( my $authors = $modsperl->{author} ) {
        for my $kind (qw/ personal other corporate conference /) {
            $author = $authors->{$kind};
            last if $author;
        }
        $author = "" unless defined $author;
    }

    if( my $subjects = $modsperl->{subject} ) {
        # sorted keys give a stable subject order
        for my $key ( sort keys %{$subjects} ) {
            push( @$subject, $subjects->{$key} ) if $subjects->{$key};
        }
    }

    return { title => $title, author => $author, subject => $subject };
}
151
152
153
# ---------------------------------------------------------------------------
# Initializes a MARC -> Unified MODS batch process
#
# Parses the XML string $master_doc, transforms it with the MODS stylesheet
# and stores the slimmed-down record in $self->{master_doc}, extended with
# the isbn, type_of_resource (arrayref), tcn, pubdate and publisher fields.
# ---------------------------------------------------------------------------
sub start_mods_batch {

    my( $self, $master_doc ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($master_doc) );

    my $values = $self->modsdoc_to_values( $mods );
    my $master = $self->{master_doc} = $self->mods_values_to_mods_slim( $values );

    $master->{isbn} = $self->get_field_value( $mods, $isbn_xpath );

    # kept as a list: push_mods_batch appends one entry per added record
    $master->{type_of_resource} = [ $self->get_field_value( $mods, $resource_xpath ) ];

    $master->{tcn}       = $self->get_field_value( $mods, $tcn_xpath );
    $master->{pubdate}   = $self->get_field_value( $mods, $pub_xpath );
    $master->{publisher} = $self->get_field_value( $mods, $publisher_xpath );
}
184
# ---------------------------------------------------------------------------
# Takes a MARCXML string and folds it into the batch's master record:
# its subjects and type of resource are appended, and its ISBN is used only
# when the master record has none yet.
# ---------------------------------------------------------------------------
sub push_mods_batch {
    my( $self, $marcxml ) = @_;

    my $mods = $mods_sheet->transform( $parser->parse_string($marcxml) );
    my $slim = $self->mods_values_to_mods_slim( $self->modsdoc_to_values( $mods ) );

    # append this record's subjects one by one
    push @{ $self->{master_doc}->{subject} }, $_ for @{ $slim->{subject} };

    push( @{ $self->{master_doc}->{type_of_resource} },
        $self->get_field_value( $mods, $resource_xpath ) );

    # first record that supplies an ISBN wins
    if( !( $self->{master_doc}->{isbn} ) ) {
        $self->{master_doc}->{isbn} = $self->get_field_value( $mods, $isbn_xpath );
    }
}
209
210
# ---------------------------------------------------------------------------
# Completes a MARC -> Unified MODS batch process: hands back the accumulated
# perl hash and resets the object's master_doc slot to undef.
# ---------------------------------------------------------------------------
sub finish_mods_batch {
    my( $self ) = @_;

    my $finished = $self->{master_doc};
    undef $self->{master_doc};

    return $finished;
}
220
221
222