From c6ce95ada46abe72e561e6c04298788594894cfd Mon Sep 17 00:00:00 2001 From: miker Date: Fri, 17 Aug 2007 17:37:42 +0000 Subject: [PATCH] removing control characters from XML with reckless abandon git-svn-id: svn://svn.open-ils.org/ILS/trunk@7698 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/extras/import/marc2are.pl | 1 + Open-ILS/src/extras/import/marc2bre.pl | 1 + Open-ILS/src/extras/marc2html | 28 ++++++++++++++++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/Open-ILS/src/extras/import/marc2are.pl b/Open-ILS/src/extras/import/marc2are.pl index a9f3e13b02..e01b916c43 100755 --- a/Open-ILS/src/extras/import/marc2are.pl +++ b/Open-ILS/src/extras/import/marc2are.pl @@ -67,6 +67,7 @@ while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { $xml =~ s/>\s+id($id); diff --git a/Open-ILS/src/extras/import/marc2bre.pl b/Open-ILS/src/extras/import/marc2bre.pl index aa2cb9922f..ee09c8b3f5 100755 --- a/Open-ILS/src/extras/import/marc2bre.pl +++ b/Open-ILS/src/extras/import/marc2bre.pl @@ -159,6 +159,7 @@ while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { $xml =~ s/>\s+id($id); diff --git a/Open-ILS/src/extras/marc2html b/Open-ILS/src/extras/marc2html index c076e4c396..f367c70680 100755 --- a/Open-ILS/src/extras/marc2html +++ b/Open-ILS/src/extras/marc2html @@ -1,19 +1,21 @@ #!/usr/bin/perl -use Error; +use Error qw/:try/; use MARC::Batch; use MARC::File::XML; use XML::LibXSLT; use XML::LibXML; use Unicode::Normalize; use Getopt::Long; +use FileHandle; -my ($split,$enc,$marc,$out) = (100); +my ($split,$enc,$marc,$out,$bad) = (100); GetOptions( 'split=i' => \$split, 'marc=s' => \$marc, 'encoding=s' => \$enc, 'out_dir=s' => \$out, + 'bad=s' => \$bad, ); if ($enc) { @@ -31,6 +33,7 @@ my $xslt = XML::LibXSLT->new(); $stylesheet = $xslt->parse_stylesheet( $parser->parse_string($xsl) ); +$bad = new FileHandle( $bad => '>:raw' ) if ($bad); my $xml = ''; my $current = 1; @@ -42,7 +45,20 @@ $marc->strict_off; $marc->warnings_off; while (my $r = $marc->next) { - $xml .= entityize(MARC::File::XML::record($r)); + my $rxml = entityize(MARC::File::XML::record($r)); + $rxml =~ s/[\x00-\x1f]//go; + + try { $doc = $parser->parse_string($rxml); } + catch Error with { + my $e = shift; + warn "arg ... bad record $current, skipping: $e\n"; + $current++; + print $bad $r->as_usmarc if ($bad); + $r = undef; + }; + next unless ($r); + + $xml .= $rxml; unless ($current % $split) { $xml = <<" XML"; @@ -51,7 +67,11 @@ while (my $r = $marc->next) { XML - my $doc = $parser->parse_string($xml); + my $doc; + try { $doc = $parser->parse_string($xml); } + catch Error with { my $e = shift; warn "ARG! Doc failed to parse:\n$e\n-------------------------------------------\n$xml\n"; }; + die unless $doc; + $xml = ''; my $results = $stylesheet->transform($doc, prev => "'$prev'", next => "'$next'"); -- 2.43.2