1 package MARC::File::XML;
5 use base qw( MARC::File );
14 our $VERSION = '0.66';
16 my $handler = MARC::File::SAX->new();
17 my $parser = XML::SAX::ParserFactory->parser( Handler => $handler );
18 my $charset = MARC::Charset->new();
23 MARC::File::XML - Work with MARC data encoded as XML
27 ## reading with MARC::Batch
28 my $batch = MARC::Batch->new( 'XML', $filename );
29 my $record = $batch->next();
31 ## or reading with MARC::File::XML explicitly
32 my $file = MARC::File::XML->in( $filename );
33 my $record = $file->next();
35 ## serialize a single MARC::Record object as XML
36 print $record->as_xml();
38 ## write a bunch of records to a file
39 my $file = MARC::File::XML->out( 'myfile.xml' );
40 $file->write( $record1 );
41 $file->write( $record2 );
42 $file->write( $record3 );
45 ## instead of writing to disk, get the xml directly
47 MARC::File::XML::header(),
48 MARC::File::XML::record( $record1 ),
49 MARC::File::XML::record( $record2 ),
50 MARC::File::XML::footer()
55 The MARC-XML distribution is an extension to the MARC-Record distribution for
56 working with MARC21 data that is encoded as XML. The XML encoding used is the
57 MARC21slim schema supplied by the Library of Congress. More information may
58 be obtained here: http://www.loc.gov/standards/marcxml/
60 You must have MARC::Record installed to use MARC::File::XML. In fact
61 once you install the MARC-XML distribution you will most likely not use it
62 directly, but will have an additional file format available to you when you
65 This version of MARC-XML supersedes the versions ending with 0.25 which
66 were used with the MARC.pm framework. MARC-XML now uses MARC::Record
69 If you have any questions or would like to contribute to this module please
70 sign on to the perl4lib list. More information about perl4lib is available
71 at L<http://perl4lib.perl.org>.
75 When you use MARC::File::XML your MARC::Record objects will have two new
76 additional methods available to them:
80 Returns a MARC::Record object serialized in XML.
82 print $record->as_xml();
86 sub MARC::Record::as_xml {
89 return( MARC::File::XML::encode( $record, $enc ) );
94 If you have a chunk of XML and you want a record object for it you can use
95 this method to generate a MARC::Record object.
97 my $record = MARC::Record->new_from_xml( $xml );
99 Note: only works for single record XML chunks.
103 sub MARC::Record::new_from_xml {
105 ## to allow calling as MARC::Record::new_from_xml()
106 ## or MARC::Record->new_from_xml()
107 $xml = shift if ( ref($xml) || ($xml eq "MARC::Record") );
108 return( MARC::File::XML::decode( $xml ) );
113 If you want to write records as XML to a file you can use out() with write()
114 to serialize more than one record as XML.
118 A constructor for creating a MARC::File::XML object that can write XML to a
119 file. You must pass in the name of a file to write XML to.
121 my $file = MARC::File::XML->out( $filename );
126 my ( $class, $filename ) = @_;
127 my $fh = IO::File->new( ">$filename" ) or croak( $! );
129 filename => $filename,
133 return( bless \%self, ref( $class ) || $class );
138 Used in tandem with out() to write records to a file.
140 my $file = MARC::File::XML->out( $filename );
141 $file->write( $record1 );
142 $file->write( $record2 );
147 my ( $self, $record ) = @_;
148 if ( ! $self->{ fh } ) {
149 croak( "MARC::File::XML object not open for writing" );
152 croak( "must pass write() a MARC::Record object" );
154 ## print the XML header if we haven't already
155 if ( ! $self->{ header } ) {
156 $self->{ fh }->print( header() );
157 $self->{ header } = 1;
159 ## print out the record
160 $self->{ fh }->print( record( $record ) ) || croak( $! );
166 When writing records to disk the filehandle is automatically closed when
167 the MARC::File::XML object goes out of scope. If you want to close it explicitly
168 use the close() method.
175 if ( $self->{ fh } ) {
176 $self->{ fh }->print( footer() ) if $self->{ header };
177 $self->{ fh } = undef;
178 $self->{ filename } = undef;
179 $self->{ header } = undef;
184 ## makes sure that the XML file is closed off
192 If you want to generate batches of records as XML, but don't want to write to
193 disk you'll have to use header(), record() and footer() to generate the
197 MARC::File::XML::header(),
198 MARC::File::XML::record( $record1 ),
199 MARC::File::XML::record( $record2 ),
200 MARC::File::XML::record( $record3 ),
201 MARC::File::XML::footer()
206 Returns a string of XML to use as the header to your XML file.
208 This method takes an optional $encoding parameter to set the output encoding
209 to something other than 'UTF-8'. This is meant mainly to support slightly
210 broken records that are in ISO-8859-1 (ANSI) format with 8-bit characters.
215 my $encoding = shift || 'UTF-8';
216 return( <<MARC_XML_HEADER );
217 <?xml version="1.0" encoding="$encoding"?>
218 <collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" xmlns="http://www.loc.gov/MARC21/slim">
224 Returns a string of XML to use at the end of your XML file.
229 return( "</collection>" );
234 Returns a chunk of XML suitable for placement between the header and the footer.
238 sub _perhaps_encode {
241 $data = Encode::encode('utf8',$charset->to_utf8($data)) unless ($done);
247 my $_is_unicode = shift;
249 push( @xml, "<record>" );
250 push( @xml, " <leader>" . escape( _perhaps_encode($record->leader(), $_is_unicode)) . "</leader>" );
251 foreach my $field ( $record->fields() ) {
252 my $tag = $field->tag();
253 if ( $field->is_control_field() ) {
254 my $data = $field->data;
255 push( @xml, qq( <controlfield tag="$tag">) .
256 escape( _perhaps_encode($data, $_is_unicode) ). qq(</controlfield>) );
258 my $i1 = $field->indicator( 1 );
259 my $i2 = $field->indicator( 2 );
260 push( @xml, qq( <datafield tag="$tag" ind1="$i1" ind2="$i2">) );
261 foreach my $subfield ( $field->subfields() ) {
262 my ( $code, $data ) = @$subfield;
263 push( @xml, qq( <subfield code="$code">).
264 escape( _perhaps_encode($data, $_is_unicode) ).qq(</subfield>) );
266 push( @xml, " </datafield>" );
269 push( @xml, "</record>\n" );
270 return( join( "\n", @xml ) );
280 join( '|', map { $_ = "\Q$_\E" } keys %ESCAPES ) .
286 $string =~ s/($ESCAPE_REGEX)/$ESCAPES{$1}/oge;
292 my $fh = $self->{ fh };
294 ## return undef at the end of the file
297 ## get a chunk of xml for a record
298 local $/ = '</record>';
301 ## trim stuff before the start record element
302 $xml =~ s/.*<record.*?>/<record>/s;
304 ## return undef if there isn't a good chunk of xml
305 return if ( $xml !~ m|<record>.*</record>|s );
307 ## return the chunk of xml
313 You probably don't ever want to call this method directly. If you do
314 you should pass in a chunk of XML as the argument.
316 It is normally invoked by a call to next(), see L<MARC::Batch> or L<MARC::File>.
326 ## see MARC::File::USMARC::decode for explanation of what's going on
328 if ( ref($self) =~ /^MARC::File/ ) {
329 $location = 'in record '.$self->{recnum};
332 $location = 'in record 1';
333 $text = $self=~/MARC::File/ ? shift : $self;
336 $parser->{ tagStack } = [];
337 $parser->{ subfields } = [];
338 $parser->{ Handler }{ record } = MARC::Record->new();
339 $parser->parse_string( $text );
341 return( $parser->{ Handler }{ record } );
345 =head2 encode([$encoding])
347 You probably want to use the as_xml() method on your MARC::Record object
348 instead of calling this directly. But if you want to you just need to
349 pass in the MARC::Record object you wish to encode as XML, and you will be
350 returned the XML as a scalar.
352 This method takes an optional $encoding parameter to set the output encoding
353 to something other than 'UTF-8'. This is meant mainly to support slightly
354 broken records that are in ISO-8859-1 (ANSI) format with 8-bit characters.
360 my $encoding = shift;
363 my $ldr = $record->leader;
366 if (defined $encoding) {
367 # Are we forcing an alternate encoding? Then leave it alone.
369 } elsif (substr($ldr,9,1) eq 'a') {
370 # Does the record think it is already Unicode?
372 if ( my ($unneeded_charset) = $record->field('066') ) {
373 $record->delete_field( $unneeded_charset );
377 # Not forcing an encoding, and it's NOT Unicode. We set the leader to say
378 # Unicode for the conversion, remove any '066' field, and put it back later.
380 # XXX Need to generate a '066' field here, but I don't understand how yet.
381 substr($ldr,9,1,'a');
382 $record->leader( $ldr );
383 if ( ($needed_charset) = $record->field('066') ) {
384 $record->delete_field( $needed_charset );
390 push( @xml, header($encoding) );
391 push( @xml, record( $record, $_is_unicode ) );
392 push( @xml, footer() );
394 if (defined $needed_charset) { # only set when the non-Unicode branch removed an '066' field
395 $record->insert_fields_ordered($needed_charset); # put the removed '066' field back on the caller's record
396 substr($ldr,9,1,' '); # undo the temporary 'a' written to leader position 9 earlier (was wrongly blanking position 8)
397 $record->leader( $ldr ); # write the restored leader back to the record
400 return( join( "\n", @xml ) );
407 =item * Support for character translation using MARC::Charset.
409 =item * Support for callback filters in decode().
411 =item * Command line utilities marc2xml, etc.
419 =item L<http://www.loc.gov/standards/marcxml/>
421 =item L<MARC::File::USMARC>
425 =item L<MARC::Record>
433 =item * Ed Summers <ehs@pobox.com>