#!/usr/bin/perl
# Convert a file of binary MARC records into a chain of linked HTML pages.
#
# Each record is converted to MARCXML, records are grouped --split at a time,
# and every group is run through the XSLT stylesheet stored after __DATA__.
# Page N is written to <out_dir>/N.html and is handed the stylesheet
# parameters "prev" and "next" (page numbers; 0 means "no such page").
#
# Options:
#   --marc=FILE      input file of USMARC records (required)
#   --out_dir=DIR    existing directory that receives the pages (required)
#   --split=N        records per page (default 100)
#   --encoding=NAME  handed to MARC::Charset->assume_encoding; charset
#                    errors are then ignored
#   --bad=FILE       receives, as raw MARC, every record whose XML form
#                    does not parse

use strict;
use warnings;

use Error qw/:try/;
use MARC::Batch;
use MARC::File::XML (BinaryEncoding => 'UTF-8');
use XML::LibXSLT;
use XML::LibXML;
use Unicode::Normalize;
use Getopt::Long;
use FileHandle;

my ($split, $enc, $marc, $out, $bad) = (100);

GetOptions(
    'split=i'    => \$split,
    'marc=s'     => \$marc,
    'encoding=s' => \$enc,
    'out_dir=s'  => \$out,
    'bad=s'      => \$bad,
);

if ($enc) {
    MARC::Charset->ignore_errors(1);
    MARC::Charset->assume_encoding($enc);
}

die "gimme some marc!\n"           unless $marc;
die "gimme somewhere to put it!\n" unless $out;
die "--split must be at least 1\n" unless $split > 0;
die "output directory '$out' does not exist\n" unless -d $out;

# The stylesheet lives in this file's __DATA__ section; slurp it whole.
my $xsl = do { local $/; <DATA> };

my $parser     = XML::LibXML->new();
my $xslt       = XML::LibXSLT->new();
my $stylesheet = $xslt->parse_stylesheet( $parser->parse_string($xsl) );

# Optional sink for records that cannot be converted.
my $bad_fh;
if ($bad) {
    $bad_fh = FileHandle->new($bad, '>:raw')
        or die "cannot open '$bad' for writing: $!\n";
}

my $xml     = '';   # MARCXML of the records collected for the current page
my $on_page = 0;    # how many good records $xml currently holds
my $current = 1;    # 1-based index of the record being read, good or bad
my $prev    = 0;    # number of the last page written (0 = none yet)
my $next    = 2;    # number of the page after the one being built

my $batch = MARC::Batch->new( USMARC => $marc );
$batch->strict_off;
$batch->warnings_off;

while (my $r = $batch->next) {
    my $rxml = entityize(MARC::File::XML::record($r));

    # Validate each record on its own so that one broken record cannot
    # poison a whole page.  "otherwise" traps every exception type, not
    # only Error subclasses.
    my $ok = 1;
    try {
        $parser->parse_string($rxml);
    }
    otherwise {
        my $e = shift;
        warn "arg ... bad record $current, skipping: $e\n";
        print {$bad_fh} $r->as_usmarc if $bad_fh;
        $ok = 0;
    };
    $current++;
    next unless $ok;

    $xml .= $rxml;
    $on_page++;

    # Pages are counted in good records, so a skipped record can never make
    # a page boundary disappear.
    next if $on_page < $split;

    write_page($xml, $prev, $next);
    $prev++;
    $next++;
    $xml     = '';
    $on_page = 0;
}

# Whatever is left becomes the last page.  It is written even when empty,
# because the page before it already links to it.
write_page($xml, $prev, 0);

$bad_fh->close if $bad_fh;

# Wrap a run of <record> elements in a single root element so that they
# parse as one document.
# NOTE(review): the wrapper markup was missing from the copy of this script
# under review; the standard MARCXML <collection> element is assumed here.
# Confirm that it matches what the stylesheet expects.
sub wrap_records {
    my ($records) = @_;
    return <<"XML";
<collection xmlns="http://www.loc.gov/MARC21/slim">
$records
</collection>
XML
}

# Build one HTML page from $records and write it to "$out/<$prev + 1>.html".
#   $records  concatenated MARCXML <record> elements (may be empty)
#   $prev     number of the preceding page, passed to the stylesheet
#   $next     number of the following page, passed to the stylesheet
# Dies when the combined document cannot be parsed.
sub write_page {
    my ($records, $prev, $next) = @_;

    my $page     = $prev + 1;
    my $page_xml = wrap_records($records);

    my $doc;
    try {
        $doc = $parser->parse_string($page_xml);
    }
    otherwise {
        my $e = shift;
        warn "ARG! \nDoc failed to parse:\n$e\n-------------------------------------------\n$page_xml\n";
    };
    die "page $page could not be built\n" unless $doc;

    # Stylesheet parameters are XPath expressions, hence the inner quotes.
    my $results = $stylesheet->transform($doc, prev => "'$prev'", next => "'$next'");

    # output_file serialises according to the stylesheet's own output
    # settings, and is used for every page so that they are all alike.
    $stylesheet->output_file($results, "$out/$page.html");
    return;
}

# Return $stuff normalised and reduced to plain ASCII.
#   $stuff  text to convert
#   $form   'D' selects NFD; anything else (or nothing) selects NFC
# Every character above U+007F becomes a hexadecimal character reference and
# the control characters U+0000-U+001F (newlines and tabs included) are
# removed.
sub entityize {
    my ($stuff, $form) = @_;

    $stuff = (defined $form && $form eq 'D') ? NFD($stuff) : NFC($stuff);

    $stuff =~ s/([\x{0080}-\x{fffd}\x{10000}-\x{10ffff}])/sprintf('&#x%X;', ord($1))/sge;
    $stuff =~ s/[\x00-\x1f]//g;

    return $stuff;
}

# NOTE(review): the text after __DATA__ is parsed as an XSLT stylesheet at
# start-up.  In the copy under review it appears to have lost its markup;
# restore the original stylesheet before running.
__DATA__
Previous page | Next page
Previous page | Next page
LDR

.