]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql
LP1894131 Sticky catalog holdings org select
[Evergreen.git] / Open-ILS / src / sql / Pg / upgrade / 0446.schema.naco-normalize-modifiers.sql
-- Open the transaction that wraps this upgrade script.
BEGIN;

-- Log this upgrade's version number in config.upgrade_log.
INSERT INTO config.upgrade_log (version)
    VALUES ('0446'); -- gmc
4
CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
    # NACO-style normalization of a text value.
    #
    # Arguments:
    #   $_[0]  the text to normalize (decoded from UTF-8 below)
    #   $_[1]  a subfield code; when it starts with "a", the first comma in
    #          the text is kept (unless it is the final character) and every
    #          later comma becomes a space.  For any other value, all commas
    #          become spaces.
    #
    # Returns the normalized text, UTF-8 encoded: lower-cased, combining
    # marks removed, selected symbols mapped to ASCII or to a space, and
    # whitespace collapsed and trimmed.
    #
    # The function is declared STRICT, so a NULL in either argument yields
    # NULL without this body being run.
    #
    # NOTE(review): the function is IMMUTABLE; values that were normalized
    # with an earlier definition (e.g. in indexes) may need to be rebuilt
    # after replacing it -- confirm against the callers.
    use strict;
    use warnings;
    use Unicode::Normalize;
    use Encode;

    # Decode the UTF-8 input into a Perl character string first, so that
    # lc() and the character classes below operate on characters.
    my $txt = lc(decode_utf8(shift));
    my $sf = shift;

    # Canonical decomposition splits base letters from their combining
    # marks, which can then be removed.
    $txt = NFD($txt);
    $txt =~ s/\pM+//g;      # Remove diacritics

    # Remove non-combining diacritics.
    # This list of characters follows the NACO normalization spec;
    # a looser but more comprehensive version might be
    # $txt =~ s/\p{Lm}+//g;
    $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;

    # The text is already lower case, so the replacements are lower case
    # too; otherwise "ae" and the ae digraph would normalize differently.
    $txt =~ s/\xE6/ae/g;    # Convert ae digraph
    $txt =~ s/\x{153}/oe/g; # Convert oe digraph
    $txt =~ s/\xFE/th/g;    # Convert Icelandic thorn

    # Convert superscript numbers.  Superscript one, two and three live in
    # Latin-1 Supplement (U+00B9, U+00B2, U+00B3), not in the U+207x range,
    # so they are mapped separately.
    $txt =~ tr/\xB9\xB2\xB3/123/;
    $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;
    # Convert subscript numbers
    $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456789+-/;

    # Convert Latin and Greek
    $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/aabgg/;
    # Convert Misc: the first three characters map to l, d, d; tr///
    # repeats the final replacement character, so everything after them
    # becomes a space.
    $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/ldd /;
    # Remove Misc
    $txt =~ tr/\'\[\]\|//d;

    if ($sf && $sf =~ /^a/) {
        my $commapos = index($txt, ',');
        if ($commapos > -1) {
            if ($commapos != length($txt) - 1) {
                # Keep the first comma; later commas become spaces.
                my @list = split /,/, $txt;
                my $first = shift @list;
                $txt = $first . ',' . join(' ', @list);
            } else {
                # The only comma is the final character: drop it.
                $txt =~ s/,/ /g;
            }
        }
    } else {
        $txt =~ s/,/ /g;
    }

    $txt =~ s/\s+/ /g;      # Compress multiple spaces
    $txt =~ s/^\s+//;       # Remove leading space
    $txt =~ s/\s+$//;       # Remove trailing space

    # Encode the result back to UTF-8.  Characters not handled above
    # (for example letters from non-Latin scripts) pass through unchanged,
    # so the result is not guaranteed to be ASCII.
    return encode_utf8($txt);
$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
57
-- Close the transaction opened at the top of this script
-- (COMMIT is the standard spelling of PostgreSQL's END).
COMMIT;