Fix Unicode mangling in clean_marc function
author Dan Scott <dscott@laurentian.ca>
Sun, 4 Mar 2012 07:41:11 +0000 (02:41 -0500)
committer Dan Scott <dscott@laurentian.ca>
Wed, 7 Mar 2012 21:03:04 +0000 (16:03 -0500)
Calling s/\p{Cc}//go; before entityize() was resulting in all xFFFD
entities being returned for the upper case diacritic characters, which
in turn caused the new unit test to fail (yay unit tests). I added a
corresponding unit test for entityize() to ensure that the problem
wasn't coming from that function. Switching the order of the \p{Cc}
regex and entityize() calls resolved the corruption in the unit test.

This suggests that Vandelay may be introducing significant corruption to
imported records and that backporting of this commit to the inline
Vandelay variants from previous releases may be warranted.

Signed-off-by: Dan Scott <dscott@laurentian.ca>
Signed-off-by: Jason Stephenson <jstephenson@mvlc.org>
Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
Open-ILS/src/perlmods/t/01-OpenILS-Application.t
Open-ILS/src/perlmods/t/14-OpenILS-Utils.t

index 9ddca6e..9035576 100644 (file)
@@ -115,8 +115,8 @@ sub clean_marc {
     $xml =~ s/\n//sog;
     $xml =~ s/^<\?xml.+\?\s*>//go;
     $xml =~ s/>\s+</></go;
-    $xml =~ s/\p{Cc}//go;
     $xml = OpenILS::Application::AppUtils->entityize($xml);
+    $xml =~ s/\p{Cc}//go;
     $xml =~ s/[\x00-\x1f]//go;
     return $xml;
 }
index 06f3ad4..a4367f6 100644 (file)
@@ -1,6 +1,6 @@
 #!perl -T
 
-use Test::More tests => 13;
+use Test::More tests => 14;
 
 BEGIN {
        use_ok( 'OpenILS::Application' );
@@ -18,3 +18,9 @@ use_ok( 'OpenILS::Application::ResolverResolver' );
 use_ok( 'OpenILS::Application::Serial' );
 use_ok( 'OpenILS::Application::SuperCat' );
 use_ok( 'OpenILS::Application::Vandelay' );
+
+is(
+    OpenILS::Application::AppUtils::entityize(0, 'èöçÇÈÀ'),
+    '&#xE8;&#xF6;&#xE7;&#xC7;&#xC8;&#xC0;',
+    'entityize: diacritics'
+);
index 924e2a3..9878956 100644 (file)
@@ -1,6 +1,6 @@
 #!perl -T
 
-use Test::More tests => 22;
+use Test::More tests => 24;
 
 use_ok( 'OpenILS::Utils::Configure' );
 use_ok( 'OpenILS::Utils::Cronscript' );
@@ -43,3 +43,40 @@ is($apostring, "its time", "naco_normalize: strip apostrophes");
 
 my $apos = OpenILS::Utils::Normalize::search_normalize("it's time");
 is($apos, "it s time", "search_normalize: replace apostrophes with space");
+
+my $raw_marcxml = <<RAWMARC;
+<?xml version="1.0" encoding="utf-8"?>
+<record>
+  <leader>01614nmm a22003975u 4500</leader>
+  <controlfield tag="001">978-0-387-35767-6</controlfield>
+  <controlfield tag="003">Springer</controlfield>
+  <controlfield tag="005">20071022150035.8</controlfield>
+  <controlfield tag="007">cr nn 008mamaa</controlfield>
+  <controlfield tag="008">071022s2008    xx         j        eng d</controlfield>
+  <datafield tag="020" ind1=" " ind2=" ">
+    <subfield code="a">9780387685748</subfield>
+  </datafield>
+  <datafield tag="100" ind1="1" ind2=" ">
+    <subfield code="a">Neteler, Markus.</subfield>
+  </datafield>
+  <datafield tag="245" ind1="1" ind2="0">
+    <subfield code="a">Open Source GIS</subfield>
+    <subfield code="h">[electronic resource] :</subfield>
+    <subfield code="b">A GRASS GIS Approach /</subfield>
+    <subfield code="c">edited by Markus Neteler, Helena Mitasova.</subfield>
+  </datafield>
+  <datafield tag="250" ind1=" " ind2=" ">
+    <subfield code="a">Third Edition.</subfield>
+  </datafield>
+  <datafield tag="260" ind1=" " ind2=" ">
+    <subfield code="a">Boston, MA :</subfield>
+    <subfield code="b">Springer Science+Business Media, LLC,</subfield>
+    <subfield code="c">2008.</subfield>
+  </datafield>
+</record>
+RAWMARC
+my $exp_xml = '<record><leader>01614nmm a22003975u 4500</leader><controlfield tag="001">978-0-387-35767-6</controlfield><controlfield tag="003">Springer</controlfield><controlfield tag="005">20071022150035.8</controlfield><controlfield tag="007">cr nn 008mamaa</controlfield><controlfield tag="008">071022s2008    xx         j        eng d</controlfield><datafield tag="020" ind1=" " ind2=" "><subfield code="a">9780387685748</subfield></datafield><datafield tag="100" ind1="1" ind2=" "><subfield code="a">Neteler, Markus.</subfield></datafield><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Open Source GIS</subfield><subfield code="h">[electronic resource] :</subfield><subfield code="b">A GRASS GIS Approach /</subfield><subfield code="c">edited by Markus Neteler, Helena Mitasova.</subfield></datafield><datafield tag="250" ind1=" " ind2=" "><subfield code="a">Third Edition.</subfield></datafield><datafield tag="260" ind1=" " ind2=" "><subfield code="a">Boston, MA :</subfield><subfield code="b">Springer Science+Business Media, LLC,</subfield><subfield code="c">2008.</subfield></datafield></record>';
+my $clean_xml = OpenILS::Utils::Normalize::clean_marc($raw_marcxml);
+is($clean_xml, $exp_xml, "clean_marc: header and space normalization");
+
+is(OpenILS::Utils::Normalize::clean_marc('èöçÇÈÀ'), '&#xE8;&#xF6;&#xE7;&#xC7;&#xC8;&#xC0;', 'clean_marc: diacritics');