2 * Copyright (C) 2004-2008 Georgia Public Library Service
3 * Copyright (C) 2007-2008 Equinox Software, Inc.
4 * Mike Rylander <miker@esilibrary.com>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
18 CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS TEXT AS $$
30 WHEN $2::INT NOT BETWEEN 48 AND 57 THEN 1
31 ELSE $2::TEXT::INT + 1
34 $$ LANGUAGE SQL STRICT IMMUTABLE;
36 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
37 use Unicode::Normalize;
40 # When working with Unicode data, the first step is to decode it to
41 # a byte string; after that, lowercasing is safe
42 my $txt = lc(decode_utf8(shift));
46 $txt =~ s/\pM+//go; # Remove diacritics
48 $txt =~ s/\xE6/AE/go; # Convert ae digraph
49 $txt =~ s/\x{153}/OE/go;# Convert oe digraph
50 $txt =~ s/\xFE/TH/go; # Convert Icelandic thorn
52 $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
53 $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
55 $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek
56 $txt =~ tr/\x{2113}\xF0\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LD /; # Convert Misc
57 $txt =~ tr/\'\[\]\|//d; # Remove Misc
59 if ($sf && $sf =~ /^a/o) {
60 my $commapos = index($txt,',');
62 if ($commapos != length($txt) - 1) {
63 my @list = split /,/, $txt;
64 my $first = shift @list;
65 $txt = $first . ',' . join(' ', @list);
74 $txt =~ s/\s+/ /go; # Compress multiple spaces
75 $txt =~ s/^\s+//o; # Remove leading space
76 $txt =~ s/\s+$//o; # Remove trailing space
78 # Encoding the outgoing string is good practice, but not strictly
79 # necessary in this case because we've stripped everything from it
80 return encode_utf8($txt);
81 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
83 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$
84 SELECT public.naco_normalize($1,'');
85 $func$ LANGUAGE 'sql' STRICT IMMUTABLE;
87 CREATE OR REPLACE FUNCTION public.first_word ( TEXT ) RETURNS TEXT AS $$
88 SELECT COALESCE(SUBSTRING( $1 FROM $_$^\S+$_$), '');
89 $$ LANGUAGE SQL STRICT IMMUTABLE;
91 CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
92 SELECT public.naco_normalize($1,'a');
93 $func$ LANGUAGE SQL STRICT IMMUTABLE;
95 CREATE OR REPLACE FUNCTION public.normalize_space( TEXT ) RETURNS TEXT AS $$
96 SELECT regexp_replace(regexp_replace(regexp_replace($1, E'\\n', ' ', 'g'), E'(?:^\\s+)|(\\s+$)', '', 'g'), E'\\s+', ' ', 'g');
97 $$ LANGUAGE SQL STRICT IMMUTABLE;
99 CREATE OR REPLACE FUNCTION public.remove_commas( TEXT ) RETURNS TEXT AS $$
100 SELECT regexp_replace($1, ',', '', 'g');
101 $$ LANGUAGE SQL STRICT IMMUTABLE;
103 CREATE OR REPLACE FUNCTION public.remove_paren_substring( TEXT ) RETURNS TEXT AS $func$
104 SELECT regexp_replace($1, $$\([^)]+\)$$, '', 'g');
105 $func$ LANGUAGE SQL STRICT IMMUTABLE;
107 CREATE OR REPLACE FUNCTION public.remove_whitespace( TEXT ) RETURNS TEXT AS $$
108 SELECT regexp_replace(normalize_space($1), E'\\s+', '', 'g');
109 $$ LANGUAGE SQL STRICT IMMUTABLE;
111 CREATE OR REPLACE FUNCTION public.lowercase( TEXT ) RETURNS TEXT AS $$
113 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
115 CREATE OR REPLACE FUNCTION public.uppercase( TEXT ) RETURNS TEXT AS $$
117 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
119 CREATE OR REPLACE FUNCTION public.remove_diacritics( TEXT ) RETURNS TEXT AS $$
120 use Unicode::Normalize;
126 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
128 CREATE OR REPLACE FUNCTION public.entityize( TEXT ) RETURNS TEXT AS $$
129 use Unicode::Normalize;
132 $x =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe;
135 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
137 CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT ) RETURNS TEXT AS $$
140 $txt =~ s/[\[\]\{\}\(\)`'"#<>\*\?\-\+\$\\]+//og;
142 if ($txt =~ /(\d{3}(?:\.\d+)?)/o) {
145 return (split /\s+/, $txt)[0];
147 $$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
149 CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT, INT ) RETURNS TEXT AS $$
150 SELECT SUBSTRING(call_number_dewey($1) FROM 1 FOR $2);
151 $$ LANGUAGE SQL STRICT IMMUTABLE;
153 CREATE OR REPLACE FUNCTION tableoid2name ( oid ) RETURNS TEXT AS $$
157 $$ language 'plpgsql';
160 CREATE OR REPLACE FUNCTION actor.org_unit_descendants ( INT ) RETURNS SETOF actor.org_unit AS $$
162 FROM connectby('actor.org_unit'::text,'id'::text,'parent_ou'::text,'name'::text,$1::text,100,'.'::text)
163 AS t(keyid text, parent_keyid text, level int, branch text,pos int)
164 JOIN actor.org_unit a ON a.id::text = t.keyid::text
165 ORDER BY CASE WHEN a.parent_ou IS NULL THEN 0 ELSE 1 END, a.name;
166 $$ LANGUAGE SQL STABLE;
168 CREATE OR REPLACE FUNCTION actor.org_unit_ancestors ( INT ) RETURNS SETOF actor.org_unit AS $$
170 FROM connectby('actor.org_unit'::text,'parent_ou'::text,'id'::text,'name'::text,$1::text,100,'.'::text)
171 AS t(keyid text, parent_keyid text, level int, branch text,pos int)
172 JOIN actor.org_unit a ON a.id::text = t.keyid::text
173 JOIN actor.org_unit_type tp ON tp.id = a.ou_type
174 ORDER BY tp.depth, a.name;
175 $$ LANGUAGE SQL STABLE;
177 CREATE OR REPLACE FUNCTION actor.org_unit_descendants ( INT,INT ) RETURNS SETOF actor.org_unit AS $$
179 FROM connectby('actor.org_unit'::text,'id'::text,'parent_ou'::text,'name'::text,
181 FROM actor.org_unit_ancestors($1) x
182 JOIN actor.org_unit_type y ON x.ou_type = y.id
183 WHERE y.depth = $2)::text
185 AS t(keyid text, parent_keyid text, level int, branch text,pos int)
186 JOIN actor.org_unit a ON a.id::text = t.keyid::text
187 ORDER BY CASE WHEN a.parent_ou IS NULL THEN 0 ELSE 1 END, a.name;
188 $$ LANGUAGE SQL STABLE;
190 CREATE OR REPLACE FUNCTION actor.org_unit_ancestor_at_depth ( INT,INT ) RETURNS actor.org_unit AS $$
192 FROM actor.org_unit a
193 WHERE id = ( SELECT FIRST(x.id)
194 FROM actor.org_unit_ancestors($1) x
195 JOIN actor.org_unit_type y
196 ON x.ou_type = y.id AND y.depth = $2);
197 $$ LANGUAGE SQL STABLE;
199 CREATE OR REPLACE FUNCTION actor.org_unit_full_path ( INT ) RETURNS SETOF actor.org_unit AS $$
201 FROM actor.org_unit_ancestors($1)
204 FROM actor.org_unit_descendants($1);
205 $$ LANGUAGE SQL STABLE;
207 CREATE OR REPLACE FUNCTION actor.org_unit_full_path ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
208 SELECT * FROM actor.org_unit_full_path((actor.org_unit_ancestor_at_depth($1, $2)).id)
209 $$ LANGUAGE SQL STABLE;
211 CREATE OR REPLACE FUNCTION actor.org_unit_combined_ancestors ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
213 FROM actor.org_unit_ancestors($1)
216 FROM actor.org_unit_ancestors($2);
217 $$ LANGUAGE SQL STABLE;
219 CREATE OR REPLACE FUNCTION actor.org_unit_common_ancestors ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
221 FROM actor.org_unit_ancestors($1)
224 FROM actor.org_unit_ancestors($2);
225 $$ LANGUAGE SQL STABLE;
227 CREATE OR REPLACE FUNCTION actor.org_unit_proximity ( INT, INT ) RETURNS INT AS $$
228 SELECT COUNT(id)::INT FROM (
229 SELECT id FROM actor.org_unit_combined_ancestors($1, $2)
231 SELECT id FROM actor.org_unit_common_ancestors($1, $2)
233 $$ LANGUAGE SQL STABLE;
235 CREATE OR REPLACE FUNCTION actor.org_unit_ancestor_setting( setting_name TEXT, org_id INT ) RETURNS SETOF actor.org_unit_setting AS $$
242 SELECT INTO setting * FROM actor.org_unit_setting WHERE org_unit = cur_org AND name = setting_name;
246 SELECT INTO cur_org parent_ou FROM actor.org_unit WHERE id = cur_org;
247 EXIT WHEN cur_org IS NULL;
251 $$ LANGUAGE plpgsql STABLE;
253 COMMENT ON FUNCTION actor.org_unit_ancestor_setting( TEXT, INT) IS $$
255 * Search "up" the org_unit tree until we find the first occurrence of an
256 * org_unit_setting with the given name.
260 -- Intended to be used in a unique index on authority.record_entry like so:
261 -- CREATE UNIQUE INDEX unique_by_heading_and_thesaurus
262 -- ON authority.record_entry (authority.normalize_heading(marc))
263 -- WHERE deleted IS FALSE or deleted = FALSE;
264 CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
270 use MARC::File::XML (BinaryEncoding => 'UTF8');
271 use UUID::Tiny ':std';
273 my $xml = shift() or return undef;
277 # Prevent errors in XML parsing from blowing out ungracefully
279 $r = MARC::Record->new_from_xml( $xml );
282 return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
286 return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
289 # From http://www.loc.gov/standards/sourcelist/subject.html
290 my $thes_code_map = {
296 n => 'notapplicable',
302 # Default to "No attempt to code" if the leader is horribly broken
303 my $fixed_field = $r->field('008');
306 $thes_char = substr($fixed_field->data(), 11, 1) || '|';
309 my $thes_code = 'UNDEFINED';
311 if ($thes_char eq 'z') {
312 # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
313 $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
314 } elsif ($thes_code_map->{$thes_char}) {
315 $thes_code = $thes_code_map->{$thes_char};
319 my $head = $r->field('1..');
321 # Concatenate all of these subfields together, prefixed by their code
322 # to prevent collisions along the lines of "Fiction, North Carolina"
323 foreach my $sf ($head->subfields()) {
324 $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
328 # Perhaps better to parameterize the spi and pass as a parameter
329 $auth_txt =~ s/'//go;
332 my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
333 my $norm_txt = $result->{rows}[0]->{norm_text};
334 return $head->tag() . "_" . $thes_code . " " . $norm_txt;
337 return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
338 $func$ LANGUAGE 'plperlu' IMMUTABLE;
340 COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$
342 * Extract the authority heading, thesaurus, and NACO-normalized values
343 * from an authority record. The primary purpose is to build a unique
344 * index to defend against duplicated authority records from the same