using Unicode::Normalize to get all incoming text into Unicode normal form D
author miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Mon, 29 Aug 2005 16:24:52 +0000 (16:24 +0000)
committer miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Mon, 29 Aug 2005 16:24:52 +0000 (16:24 +0000)
git-svn-id: svn://svn.open-ils.org/ILS/trunk@1758 dcc99617-32d9-48b4-a31d-7c20da2025e4

Open-ILS/src/perlmods/OpenILS/Application/Storage/Driver/Pg/fts.pm
Open-ILS/src/perlmods/OpenILS/Application/Storage/WORM.pm

index c6c64df..c508346 100644 (file)
@@ -3,11 +3,12 @@
        #-------------------------------------------------------------------------------
        package OpenILS::Application::Storage::FTS;
        use OpenSRF::Utils::Logger qw/:level/;
+       use Unicode::Normalize;
        my $log = 'OpenSRF::Utils::Logger';
 
        sub compile {
                my $self = shift;
-               my $term = shift;
+               my $term = NFD(shift());
 
                $log->debug("Raw term: $term",DEBUG);
 
@@ -17,7 +18,7 @@
                $self = ref($self) || $self;
                $self = bless {} => $self;
 
-               $term =~ s/\pM//gos;
+               $term =~ s/(\pM+)//gos;
                $self->decompose($term);
 
                my $newterm = join('&', $self->words);
index 2f6eb42..54d44b3 100644 (file)
@@ -2,6 +2,7 @@ package OpenILS::Application::Storage::WORM;
 use base qw/OpenILS::Application::Storage/;
 use strict; use warnings;
 
+use Unicode::Normalize;
 use OpenSRF::EX qw/:try/;
 
 use OpenSRF::Utils::SettingsClient;
@@ -49,13 +50,13 @@ my @fp_mods_xpath = (
                                        ],
                                        fixup   => '
                                                        do {
-                                                               $text = lc($text);
+                                                               $text = lc(NFD($text));
+                                                               $text =~ s/\pM+//gso;
                                                                $text =~ s/\s+/ /sgo;
                                                                $text =~ s/^\s*(.+)\s*$/$1/sgo;
                                                                $text =~ s/\b(?:the|an?)\b//sgo;
                                                                $text =~ s/\[.[^\]]+\]//sgo;
                                                                $text =~ s/\s*[;\/\.]*$//sgo;
-                                                               $text =~ s/\pM+//gso;
                                                        };
                                        ',
                        },
@@ -66,7 +67,8 @@ my @fp_mods_xpath = (
                                        ],
                                        fixup   => '
                                                        do {
-                                                               $text = lc($text);
+                                                               $text = lc(NFD($text));
+                                                               $text =~ s/\pM+//gso;
                                                                $text =~ s/\s+/ /sgo;
                                                                $text =~ s/^\s*(.+)\s*$/$1/sgo;
                                                                $text =~ s/,?\s+.*$//sgo;
@@ -90,7 +92,8 @@ my @fp_mods_xpath = (
                                        ],
                                        fixup   => '
                                                        do {
-                                                               $text = lc($text);
+                                                               $text = lc(NFD($text));
+                                                               $text =~ s/\pM+//gso;
                                                                $text =~ s/\s+/ /sgo;
                                                                $text =~ s/^\s*(.+)\s*$/$1/sgo;
                                                                $text =~ s/\b(?:the|an?)\b//sgo;
@@ -109,7 +112,8 @@ my @fp_mods_xpath = (
                                        ],
                                        fixup   => '
                                                        do {
-                                                               $text = lc($text);
+                                                               $text = lc(NFD($text));
+                                                               $text =~ s/\pM+//gso;
                                                                $text =~ s/\s+/ /sgo;
                                                                $text =~ s/^\s*(.+)\s*$/$1/sgo;
                                                                $text =~ s/,?\s+.*$//sgo;
@@ -479,8 +483,8 @@ sub _marcxml_to_full_rows {
                my $ns = new Fieldmapper::metabib::full_rec;
 
                $ns->tag( 'LDR' );
-               my $val = $tagline->textContent;
-               $val =~ s/(\pM)//gso;
+               my $val = NFD($tagline->textContent);
+               $val =~ s/(\pM+)//gso;
                $ns->value( $val );
 
                push @ns_list, $ns;
@@ -492,8 +496,8 @@ sub _marcxml_to_full_rows {
                my $ns = new Fieldmapper::metabib::full_rec;
 
                $ns->tag( $tagline->getAttribute( "tag" ) );
-               my $val = $tagline->textContent;
-               $val =~ s/(\pM)//gso;
+               my $val = NFD($tagline->textContent);
+               $val =~ s/(\pM+)//gso;
                $ns->value( $val );
 
                push @ns_list, $ns;
@@ -515,8 +519,8 @@ sub _marcxml_to_full_rows {
                        $ns->ind1( $ind1 );
                        $ns->ind2( $ind2 );
                        $ns->subfield( $data->getAttribute( "code" ) );
-                       my $val = $data->textContent;
-                       $val =~ s/(\pM)//gso;
+                       my $val = NFD($data->textContent);
+                       $val =~ s/(\pM+)//gso;
                        $ns->value( lc($val) );
 
                        push @ns_list, $ns;
@@ -548,6 +552,7 @@ sub _get_field_value {
                        $string .= $value->textContent . " ";
                }
        }
+       $string = NFD($string);
        $string =~ s/(\pM)//gso;
        return lc($string);
 }