From 34aa2ada316921b94ee81282606c7a5aa15312c9 Mon Sep 17 00:00:00 2001 From: miker Date: Mon, 10 Mar 2008 01:06:23 +0000 Subject: [PATCH 1/1] some normalization functions for use in in-DB ingest, when it happens git-svn-id: svn://svn.open-ils.org/ILS/trunk@8944 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/sql/Pg/020.schema.functions.sql | 35 +++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index fc07bba17b..8974c7e36b 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -17,9 +17,12 @@ CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS $$ LANGUAGE SQL STRICT IMMUTABLE; CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ + use Unicode::Normalize; + my $txt = lc(shift); my $sf = shift; + $txt = NFD($txt); $txt =~ s/\pM+//go; # Remove diacritics $txt =~ s/\xE6/AE/go; # Convert ae digraph @@ -33,7 +36,7 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $ $txt =~ tr/\x{2113}\xF0\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LD /; # Convert Misc $txt =~ tr/\'\[\]\|//d; # Remove Misc - if ($sf =~ /^a/o) { + if ($sf && $sf =~ /^a/o) { my $commapos = index($txt,','); if ($commapos > -1) { if ($commapos != length($txt) - 1) { @@ -59,6 +62,36 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$ SELECT public.naco_normalize($1,''); $func$ LANGUAGE 'sql' STRICT IMMUTABLE; +CREATE OR REPLACE FUNCTION public.normalize_space( TEXT ) RETURNS TEXT AS $$ + SELECT regexp_replace(regexp_replace(regexp_replace($1, E'\\n', ' ', 'g'), E'(?:^\\s+)|(\\s+$)', '', 'g'), E'\\s+', ' ', 'g'); +$$ LANGUAGE SQL; + +CREATE OR REPLACE FUNCTION public.lowercase( TEXT ) RETURNS TEXT AS $$ + return lc(shift); +$$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION public.uppercase( TEXT ) RETURNS TEXT AS $$ + return uc(shift); +$$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION public.remove_diacritics( TEXT ) RETURNS TEXT AS $$ + use Unicode::Normalize; + + my $x = NFD(shift); + $x =~ s/\pM+//go; + return $x; + +$$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION public.entityize( TEXT ) RETURNS TEXT AS $$ + use Unicode::Normalize; + + my $x = NFC(shift); + $x =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; + return $x; + +$$ LANGUAGE PLPERLU; + CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT ) RETURNS TEXT AS $$ my $txt = shift; $txt =~ s/^\s+//o; -- 2.43.2