1 package MARC::File::XML;
5 use base qw( MARC::File );
14 our $VERSION = '0.66';
16 my $handler = MARC::File::SAX->new();
17 my $parser = XML::SAX::ParserFactory->parser( Handler => $handler );
18 my $charset = MARC::Charset->new();
23 MARC::File::XML - Work with MARC data encoded as XML
27 ## reading with MARC::Batch
28 my $batch = MARC::Batch->new( 'XML', $filename );
29 my $record = $batch->next();
31 ## or reading with MARC::File::XML explicitly
32 my $file = MARC::File::XML->in( $filename );
33 my $record = $file->next();
35 ## serialize a single MARC::Record object as XML
36 print $record->as_xml();
38 ## write a bunch of records to a file
39 my $file = MARC::File::XML->out( 'myfile.xml' );
40 $file->write( $record1 );
41 $file->write( $record2 );
42 $file->write( $record3 );
45 ## instead of writing to disk, get the xml directly
47 MARC::File::XML::header(),
48 MARC::File::XML::record( $record1 ),
49 MARC::File::XML::record( $record2 ),
50 MARC::File::XML::footer()
55 The MARC-XML distribution is an extension to the MARC-Record distribution for
56 working with MARC21 data that is encoded as XML. The XML encoding used is the
57 MARC21slim schema supplied by the Library of Congress. More information may
58 be obtained here: http://www.loc.gov/standards/marcxml/
60 You must have MARC::Record installed to use MARC::File::XML. In fact
61 once you install the MARC-XML distribution you will most likely not use it
62 directly, but will have an additional file format available to you when you
65 This version of MARC-XML supersedes an the versions ending with 0.25 which
66 were used with the MARC.pm framework. MARC-XML now uses MARC::Record
69 If you have any questions or would like to contribute to this module please
70 sign on to the perl4lib list. More information about perl4lib is available
71 at L<http://perl4lib.perl.org>.
75 When you use MARC::File::XML your MARC::Record objects will have two new
76 additional methods available to them:
80 Returns a MARC::Record object serialized in XML.
82 print $record->as_xml();
86 sub MARC::Record::as_xml {
88 return( MARC::File::XML::encode( $record ) );
93 If you have a chunk of XML and you want a record object for it you can use
94 this method to generate a MARC::Record object.
96 my $record = MARC::Record->new_from_xml( $xml );
98 Note: only works for single record XML chunks.
102 sub MARC::Record::new_from_xml {
104 ## to allow calling as MARC::Record::new_from_xml()
105 ## or MARC::Record->new_from_xml()
106 $xml = shift if ( ref($xml) || ($xml eq "MARC::Record") );
107 return( MARC::File::XML::decode( $xml ) );
112 If you want to write records as XML to a file you can use out() with write()
113 to serialize more than one record as XML.
117 A constructor for creating a MARC::File::XML object that can write XML to a
118 file. You must pass in the name of a file to write XML to.
120 my $file = MARC::XML::File->out( $filename );
125 my ( $class, $filename ) = @_;
126 my $fh = IO::File->new( ">$filename" ) or croak( $! );
128 filename => $filename,
132 return( bless \%self, ref( $class ) || $class );
137 Used in tandem with out() to write records to a file.
139 my $file = MARC::File::XML->out( $filename );
140 $file->write( $record1 );
141 $file->write( $record2 );
146 my ( $self, $record ) = @_;
147 if ( ! $self->{ fh } ) {
148 croak( "MARC::File::XML object not open for writing" );
151 croak( "must pass write() a MARC::Record object" );
153 ## print the XML header if we haven't already
154 if ( ! $self->{ header } ) {
155 $self->{ fh }->print( header() );
156 $self->{ header } = 1;
158 ## print out the record
159 $self->{ fh }->print( record( $record ) ) || croak( $! );
165 When writing records to disk the filehandle is automatically closed when you
166 the MARC::File::XML object goes out of scope. If you want to close it explicitly
167 use the close() method.
174 if ( $self->{ fh } ) {
175 $self->{ fh }->print( footer() ) if $self->{ header };
176 $self->{ fh } = undef;
177 $self->{ filename } = undef;
178 $self->{ header } = undef;
183 ## makes sure that the XML file is closed off
191 If you want to generate batches of records as XML, but don't want to write to
192 disk you'll have to use header(), record() and footer() to generate the
196 MARC::File::XML::header(),
197 MARC::File::XML::record( $record1 ),
198 MARC::File::XML::record( $record2 ),
199 MARC::File::XML::record( $record3 ),
200 MARC::File::XML::footer()
205 Returns a string of XML to use as the header to your XML file.
207 This method takes an optional $encoding parameter to set the output encoding
208 to something other than 'UTF-8'. This is meant mainly to support slightly
209 broken records that are in ISO-8859-1 (ANSI) format with 8-bit characters.
214 my $encoding = shift || 'UTF-8';
215 return( <<MARC_XML_HEADER );
216 <?xml version="1.0" encoding="$encoding"?>
217 <collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" xmlns="http://www.loc.gov/MARC21/slim">
223 Returns a string of XML to use at the end of your XML file.
228 return( "</collection>" );
233 Returns a chunk of XML suitable for placement between the header and the footer.
237 sub _perhaps_encode {
240 $data = Encode::encode('utf8',$charset->to_utf8($data)) unless ($done);
246 my $_is_unicode = shift;
248 push( @xml, "<record>" );
249 push( @xml, " <leader>" . escape( _perhaps_encode($record->leader(), $_is_unicode)) . "</leader>" );
250 foreach my $field ( $record->fields() ) {
251 my $tag = $field->tag();
252 if ( $field->is_control_field() ) {
253 my $data = $field->data;
254 push( @xml, qq( <controlfield tag="$tag">) .
255 escape( _perhaps_encode($data, $_is_unicode) ). qq(</controlfield>) );
257 my $i1 = $field->indicator( 1 );
258 my $i2 = $field->indicator( 2 );
259 push( @xml, qq( <datafield tag="$tag" ind1="$i1" ind2="$i2">) );
260 foreach my $subfield ( $field->subfields() ) {
261 my ( $code, $data ) = @$subfield;
262 push( @xml, qq( <subfield code="$code">).
263 escape( _perhaps_encode($data, $_is_unicode) ).qq(</subfield>) );
265 push( @xml, " </datafield>" );
268 push( @xml, "</record>\n" );
269 return( join( "\n", @xml ) );
279 join( '|', map { $_ = "\Q$_\E" } keys %ESCAPES ) .
285 $string =~ s/($ESCAPE_REGEX)/$ESCAPES{$1}/oge;
291 my $fh = $self->{ fh };
293 ## return undef at the end of the file
296 ## get a chunk of xml for a record
297 local $/ = '</record>';
300 ## trim stuff before the start record element
301 $xml =~ s/.*<record.*?>/<record>/s;
303 ## return undef if there isn't a good chunk of xml
304 return if ( $xml !~ m|<record>.*</record>|s );
306 ## return the chunk of xml
312 You probably don't ever want to call this method directly. If you do
313 you should pass in a chunk of XML as the argument.
315 It is normally invoked by a call to next(), see L<MARC::Batch> or L<MARC::File>.
325 ## see MARC::File::USMARC::decode for explanation of what's going on
327 if ( ref($self) =~ /^MARC::File/ ) {
328 $location = 'in record '.$self->{recnum};
331 $location = 'in record 1';
332 $text = $self=~/MARC::File/ ? shift : $self;
335 $parser->{ tagStack } = [];
336 $parser->{ subfields } = [];
337 $parser->{ Handler }{ record } = MARC::Record->new();
338 $parser->parse_string( $text );
340 return( $parser->{ Handler }{ record } );
344 =head2 encode([$encoding])
346 You probably want to use the as_marc() method on your MARC::Record object
347 instead of calling this directly. But if you want to you just need to
348 pass in the MARC::Record object you wish to encode as XML, and you will be
349 returned the XML as a scalar.
351 This method takes an optional $encoding parameter to set the output encoding
352 to something other than 'UTF-8'. This is meant mainly to support slightly
353 broken records that are in ISO-8859-1 (ANSI) format with 8-bit characters.
359 my $encoding = shift;
362 my $ldr = $record->leader;
365 if (defined $encoding) {
366 # Are we forcing an alternate encoding? Then leave it alone.
368 } elsif (substr($ldr,9,1) eq 'a') {
369 # Does the record think it is already Unicode?
371 if ( my ($unneeded_charset) = $record->field('066') ) {
372 $record->delete_field( $unneeded_charset );
376 # Not forcing an encoding, and it's NOT Unicode. We set the leader to say
377 # Unicode for the conversion, remove any '066' field, and put it back later.
379 # XXX Need to generat a '066' field here, but I don't understand how yet.
380 substr($ldr,9,1,'a');
381 $record->leader( $ldr );
382 if ( ($needed_charset) = $record->field('066') ) {
383 $record->delete_field( $needed_charset );
389 push( @xml, header($encoding) );
390 push( @xml, record( $record, $_is_unicode ) );
391 push( @xml, footer() );
393 if (defined $needed_charset) {
394 $record->insert_fields_ordered($needed_charset);
395 substr($ldr,8,1,' ');
396 $record->leader( $ldr );
399 return( join( "\n", @xml ) );
406 =item * Support for character translation using MARC::Charset.
408 =item * Support for callback filters in decode().
410 =item * Command line utilities marc2xml, etc.
418 =item L<http://www.loc.gov/standards/marcxml/>
420 =item L<MARC::File::USMARC>
424 =item L<MARC::Record>
432 =item * Ed Summers <ehs@pobox.com>