2 * Copyright (C) 2004-2008 Georgia Public Library Service
3 * Copyright (C) 2007-2008 Equinox Software, Inc.
4 * Mike Rylander <miker@esilibrary.com>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
18 CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS TEXT AS $$
30 WHEN $2::INT NOT BETWEEN 48 AND 57 THEN 1
31 ELSE $2::TEXT::INT + 1
34 $$ LANGUAGE SQL STRICT IMMUTABLE;
36 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
37 use Unicode::Normalize;
40 # When working with Unicode data, the first step is to decode it to
41 # a byte string; after that, lowercasing is safe
42 my $txt = lc(decode_utf8(shift));
46 $txt =~ s/\pM+//go; # Remove diacritics
48 $txt =~ s/\xE6/AE/go; # Convert ae digraph
49 $txt =~ s/\x{153}/OE/go;# Convert oe digraph
50 $txt =~ s/\xFE/TH/go; # Convert Icelandic thorn
52 $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
53 $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
55 $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek
56 $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc
57 $txt =~ tr/\'\[\]\|//d; # Remove Misc
59 if ($sf && $sf =~ /^a/o) {
60 my $commapos = index($txt,',');
62 if ($commapos != length($txt) - 1) {
63 my @list = split /,/, $txt;
64 my $first = shift @list;
65 $txt = $first . ',' . join(' ', @list);
74 $txt =~ s/\s+/ /go; # Compress multiple spaces
75 $txt =~ s/^\s+//o; # Remove leading space
76 $txt =~ s/\s+$//o; # Remove trailing space
78 # Encoding the outgoing string is good practice, but not strictly
79 # necessary in this case because we've stripped everything from it
80 return encode_utf8($txt);
81 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
83 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$
84 SELECT public.naco_normalize($1,'');
85 $func$ LANGUAGE 'sql' STRICT IMMUTABLE;
87 CREATE OR REPLACE FUNCTION public.first_word ( TEXT ) RETURNS TEXT AS $$
88 SELECT COALESCE(SUBSTRING( $1 FROM $_$^\S+$_$), '');
89 $$ LANGUAGE SQL STRICT IMMUTABLE;
91 CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
92 SELECT public.naco_normalize($1,'a');
93 $func$ LANGUAGE SQL STRICT IMMUTABLE;
95 CREATE OR REPLACE FUNCTION public.normalize_space( TEXT ) RETURNS TEXT AS $$
96 SELECT regexp_replace(regexp_replace(regexp_replace($1, E'\\n', ' ', 'g'), E'(?:^\\s+)|(\\s+$)', '', 'g'), E'\\s+', ' ', 'g');
97 $$ LANGUAGE SQL STRICT IMMUTABLE;
99 CREATE OR REPLACE FUNCTION public.remove_commas( TEXT ) RETURNS TEXT AS $$
100 SELECT regexp_replace($1, ',', '', 'g');
101 $$ LANGUAGE SQL STRICT IMMUTABLE;
103 CREATE OR REPLACE FUNCTION public.remove_paren_substring( TEXT ) RETURNS TEXT AS $func$
104 SELECT regexp_replace($1, $$\([^)]+\)$$, '', 'g');
105 $func$ LANGUAGE SQL STRICT IMMUTABLE;
107 CREATE OR REPLACE FUNCTION public.remove_whitespace( TEXT ) RETURNS TEXT AS $$
108 SELECT regexp_replace(normalize_space($1), E'\\s+', '', 'g');
109 $$ LANGUAGE SQL STRICT IMMUTABLE;
111 CREATE OR REPLACE FUNCTION public.lowercase( TEXT ) RETURNS TEXT AS $$
113 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
115 CREATE OR REPLACE FUNCTION public.uppercase( TEXT ) RETURNS TEXT AS $$
117 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
119 CREATE OR REPLACE FUNCTION public.remove_diacritics( TEXT ) RETURNS TEXT AS $$
120 use Unicode::Normalize;
126 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
128 CREATE OR REPLACE FUNCTION public.entityize( TEXT ) RETURNS TEXT AS $$
129 use Unicode::Normalize;
132 $x =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe;
135 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
137 CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT ) RETURNS TEXT AS $$
140 $txt =~ s/[\[\]\{\}\(\)`'"#<>\*\?\-\+\$\\]+//og;
142 if ($txt =~ /(\d{3}(?:\.\d+)?)/o) {
145 return (split /\s+/, $txt)[0];
147 $$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
149 CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT, INT ) RETURNS TEXT AS $$
150 SELECT SUBSTRING(call_number_dewey($1) FROM 1 FOR $2);
151 $$ LANGUAGE SQL STRICT IMMUTABLE;
153 CREATE OR REPLACE FUNCTION tableoid2name ( oid ) RETURNS TEXT AS $$
157 $$ language 'plpgsql';
159 CREATE OR REPLACE FUNCTION actor.org_unit_descendants( INT, INT ) RETURNS SETOF actor.org_unit AS $$
160 WITH RECURSIVE descendant_depth AS (
164 FROM actor.org_unit ou
165 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
166 JOIN anscestor_depth ad ON (ad.id = ou.id)
172 FROM actor.org_unit ou
173 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
174 JOIN descendant_depth ot ON (ot.id = ou.parent_ou)
175 ), anscestor_depth AS (
179 FROM actor.org_unit ou
180 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
186 FROM actor.org_unit ou
187 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
188 JOIN anscestor_depth ot ON (ot.parent_ou = ou.id)
189 ) SELECT ou.* FROM actor.org_unit ou JOIN descendant_depth USING (id);
192 CREATE OR REPLACE FUNCTION actor.org_unit_descendants( INT ) RETURNS SETOF actor.org_unit AS $$
193 WITH RECURSIVE descendant_depth AS (
197 FROM actor.org_unit ou
198 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
204 FROM actor.org_unit ou
205 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
206 JOIN descendant_depth ot ON (ot.id = ou.parent_ou)
207 ) SELECT ou.* FROM actor.org_unit ou JOIN descendant_depth USING (id);
210 CREATE OR REPLACE FUNCTION actor.org_unit_ancestors( INT ) RETURNS SETOF actor.org_unit AS $$
211 WITH RECURSIVE anscestor_depth AS (
214 FROM actor.org_unit ou
219 FROM actor.org_unit ou
220 JOIN anscestor_depth ot ON (ot.parent_ou = ou.id)
221 ) SELECT ou.* FROM actor.org_unit ou JOIN anscestor_depth USING (id);
224 CREATE OR REPLACE FUNCTION actor.org_unit_ancestor_at_depth ( INT,INT ) RETURNS actor.org_unit AS $$
226 FROM actor.org_unit a
227 WHERE id = ( SELECT FIRST(x.id)
228 FROM actor.org_unit_ancestors($1) x
229 JOIN actor.org_unit_type y
230 ON x.ou_type = y.id AND y.depth = $2);
231 $$ LANGUAGE SQL STABLE;
233 CREATE OR REPLACE FUNCTION actor.org_unit_full_path ( INT ) RETURNS SETOF actor.org_unit AS $$
235 FROM actor.org_unit_ancestors($1)
238 FROM actor.org_unit_descendants($1);
239 $$ LANGUAGE SQL STABLE;
241 CREATE OR REPLACE FUNCTION actor.org_unit_full_path ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
242 SELECT * FROM actor.org_unit_full_path((actor.org_unit_ancestor_at_depth($1, $2)).id)
243 $$ LANGUAGE SQL STABLE;
245 CREATE OR REPLACE FUNCTION actor.org_unit_combined_ancestors ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
247 FROM actor.org_unit_ancestors($1)
250 FROM actor.org_unit_ancestors($2);
251 $$ LANGUAGE SQL STABLE;
253 CREATE OR REPLACE FUNCTION actor.org_unit_common_ancestors ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
255 FROM actor.org_unit_ancestors($1)
258 FROM actor.org_unit_ancestors($2);
259 $$ LANGUAGE SQL STABLE;
261 CREATE OR REPLACE FUNCTION actor.org_unit_proximity ( INT, INT ) RETURNS INT AS $$
262 SELECT COUNT(id)::INT FROM (
263 SELECT id FROM actor.org_unit_combined_ancestors($1, $2)
265 SELECT id FROM actor.org_unit_common_ancestors($1, $2)
267 $$ LANGUAGE SQL STABLE;
269 CREATE OR REPLACE FUNCTION actor.org_unit_ancestor_setting( setting_name TEXT, org_id INT ) RETURNS SETOF actor.org_unit_setting AS $$
276 SELECT INTO setting * FROM actor.org_unit_setting WHERE org_unit = cur_org AND name = setting_name;
280 SELECT INTO cur_org parent_ou FROM actor.org_unit WHERE id = cur_org;
281 EXIT WHEN cur_org IS NULL;
285 $$ LANGUAGE plpgsql STABLE;
287 COMMENT ON FUNCTION actor.org_unit_ancestor_setting( TEXT, INT) IS $$
289 * Search "up" the org_unit tree until we find the first occurrence of an
290 * org_unit_setting with the given name.
294 -- Intended to be used in a unique index on authority.record_entry like so:
295 -- CREATE UNIQUE INDEX unique_by_heading_and_thesaurus
296 -- ON authority.record_entry (authority.normalize_heading(marc))
297 -- WHERE deleted IS FALSE or deleted = FALSE;
298 CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
304 use MARC::File::XML (BinaryEncoding => 'UTF8');
305 use UUID::Tiny ':std';
307 my $xml = shift() or return undef;
311 # Prevent errors in XML parsing from blowing out ungracefully
313 $r = MARC::Record->new_from_xml( $xml );
316 return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
320 return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
323 # From http://www.loc.gov/standards/sourcelist/subject.html
324 my $thes_code_map = {
330 n => 'notapplicable',
336 # Default to "No attempt to code" if the leader is horribly broken
337 my $fixed_field = $r->field('008');
340 $thes_char = substr($fixed_field->data(), 11, 1) || '|';
343 my $thes_code = 'UNDEFINED';
345 if ($thes_char eq 'z') {
346 # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
347 $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
348 } elsif ($thes_code_map->{$thes_char}) {
349 $thes_code = $thes_code_map->{$thes_char};
353 my $head = $r->field('1..');
355 # Concatenate all of these subfields together, prefixed by their code
356 # to prevent collisions along the lines of "Fiction, North Carolina"
357 foreach my $sf ($head->subfields()) {
358 $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
362 # Perhaps better to parameterize the spi and pass as a parameter
363 $auth_txt =~ s/'//go;
366 my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
367 my $norm_txt = $result->{rows}[0]->{norm_text};
368 return $head->tag() . "_" . $thes_code . " " . $norm_txt;
371 return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
372 $func$ LANGUAGE 'plperlu' IMMUTABLE;
374 COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$
376 * Extract the authority heading, thesaurus, and NACO-normalized values
377 * from an authority record. The primary purpose is to build a unique
378 * index to defend against duplicated authority records from the same