2 * Copyright (C) 2004-2008 Georgia Public Library Service
3 * Copyright (C) 2007-2008 Equinox Software, Inc.
4 * Mike Rylander <miker@esilibrary.com>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
18 CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS TEXT AS $$
30 WHEN $2::INT NOT BETWEEN 48 AND 57 THEN 1
31 ELSE $2::TEXT::INT + 1
34 $$ LANGUAGE SQL STRICT IMMUTABLE;
36 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
37 use Unicode::Normalize;
40 # When working with Unicode data, the first step is to decode it to
41 # a byte string; after that, lowercasing is safe
42 my $txt = lc(decode_utf8(shift));
46 $txt =~ s/\pM+//go; # Remove diacritics
48 # remove non-combining diacritics
49 # this list of characters follows the NACO normalization spec,
50 # but a looser but more comprehensive version might be
51 # $txt =~ s/\pLm+//go;
52 $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
54 $txt =~ s/\xE6/AE/go; # Convert ae digraph
55 $txt =~ s/\x{153}/OE/go;# Convert oe digraph
56 $txt =~ s/\xFE/TH/go; # Convert Icelandic thorn
58 $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
59 $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
61 $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek
62 $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc
63 $txt =~ tr/\'\[\]\|//d; # Remove Misc
65 if ($sf && $sf =~ /^a/o) {
66 my $commapos = index($txt,',');
68 if ($commapos != length($txt) - 1) {
69 my @list = split /,/, $txt;
70 my $first = shift @list;
71 $txt = $first . ',' . join(' ', @list);
80 $txt =~ s/\s+/ /go; # Compress multiple spaces
81 $txt =~ s/^\s+//o; # Remove leading space
82 $txt =~ s/\s+$//o; # Remove trailing space
84 # Encoding the outgoing string is good practice, but not strictly
85 # necessary in this case because we've stripped everything from it
86 return encode_utf8($txt);
87 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
89 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$
90 SELECT public.naco_normalize($1,'');
91 $func$ LANGUAGE 'sql' STRICT IMMUTABLE;
93 CREATE OR REPLACE FUNCTION public.first_word ( TEXT ) RETURNS TEXT AS $$
94 SELECT COALESCE(SUBSTRING( $1 FROM $_$^\S+$_$), '');
95 $$ LANGUAGE SQL STRICT IMMUTABLE;
97 CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
98 SELECT public.naco_normalize($1,'a');
99 $func$ LANGUAGE SQL STRICT IMMUTABLE;
101 CREATE OR REPLACE FUNCTION public.normalize_space( TEXT ) RETURNS TEXT AS $$
102 SELECT regexp_replace(regexp_replace(regexp_replace($1, E'\\n', ' ', 'g'), E'(?:^\\s+)|(\\s+$)', '', 'g'), E'\\s+', ' ', 'g');
103 $$ LANGUAGE SQL STRICT IMMUTABLE;
105 CREATE OR REPLACE FUNCTION public.remove_commas( TEXT ) RETURNS TEXT AS $$
106 SELECT regexp_replace($1, ',', '', 'g');
107 $$ LANGUAGE SQL STRICT IMMUTABLE;
109 CREATE OR REPLACE FUNCTION public.remove_paren_substring( TEXT ) RETURNS TEXT AS $func$
110 SELECT regexp_replace($1, $$\([^)]+\)$$, '', 'g');
111 $func$ LANGUAGE SQL STRICT IMMUTABLE;
113 CREATE OR REPLACE FUNCTION public.remove_whitespace( TEXT ) RETURNS TEXT AS $$
114 SELECT regexp_replace(normalize_space($1), E'\\s+', '', 'g');
115 $$ LANGUAGE SQL STRICT IMMUTABLE;
117 CREATE OR REPLACE FUNCTION public.lowercase( TEXT ) RETURNS TEXT AS $$
119 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
121 CREATE OR REPLACE FUNCTION public.uppercase( TEXT ) RETURNS TEXT AS $$
123 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
125 CREATE OR REPLACE FUNCTION public.remove_diacritics( TEXT ) RETURNS TEXT AS $$
126 use Unicode::Normalize;
132 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
134 CREATE OR REPLACE FUNCTION public.entityize( TEXT ) RETURNS TEXT AS $$
135 use Unicode::Normalize;
138 $x =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe;
141 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
143 CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT ) RETURNS TEXT AS $$
146 $txt =~ s/[\[\]\{\}\(\)`'"#<>\*\?\-\+\$\\]+//og;
148 if ($txt =~ /(\d{3}(?:\.\d+)?)/o) {
151 return (split /\s+/, $txt)[0];
153 $$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
155 CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT, INT ) RETURNS TEXT AS $$
156 SELECT SUBSTRING(call_number_dewey($1) FROM 1 FOR $2);
157 $$ LANGUAGE SQL STRICT IMMUTABLE;
159 CREATE OR REPLACE FUNCTION tableoid2name ( oid ) RETURNS TEXT AS $$
163 $$ language 'plpgsql';
165 CREATE OR REPLACE FUNCTION actor.org_unit_descendants( INT, INT ) RETURNS SETOF actor.org_unit AS $$
166 WITH RECURSIVE descendant_depth AS (
170 FROM actor.org_unit ou
171 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
172 JOIN anscestor_depth ad ON (ad.id = ou.id)
178 FROM actor.org_unit ou
179 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
180 JOIN descendant_depth ot ON (ot.id = ou.parent_ou)
181 ), anscestor_depth AS (
185 FROM actor.org_unit ou
186 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
192 FROM actor.org_unit ou
193 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
194 JOIN anscestor_depth ot ON (ot.parent_ou = ou.id)
195 ) SELECT ou.* FROM actor.org_unit ou JOIN descendant_depth USING (id);
198 CREATE OR REPLACE FUNCTION actor.org_unit_descendants( INT ) RETURNS SETOF actor.org_unit AS $$
199 WITH RECURSIVE descendant_depth AS (
203 FROM actor.org_unit ou
204 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
210 FROM actor.org_unit ou
211 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
212 JOIN descendant_depth ot ON (ot.id = ou.parent_ou)
213 ) SELECT ou.* FROM actor.org_unit ou JOIN descendant_depth USING (id);
216 CREATE OR REPLACE FUNCTION actor.org_unit_ancestors( INT ) RETURNS SETOF actor.org_unit AS $$
217 WITH RECURSIVE anscestor_depth AS (
220 FROM actor.org_unit ou
225 FROM actor.org_unit ou
226 JOIN anscestor_depth ot ON (ot.parent_ou = ou.id)
227 ) SELECT ou.* FROM actor.org_unit ou JOIN anscestor_depth USING (id);
230 CREATE OR REPLACE FUNCTION actor.org_unit_ancestor_at_depth ( INT,INT ) RETURNS actor.org_unit AS $$
232 FROM actor.org_unit a
233 WHERE id = ( SELECT FIRST(x.id)
234 FROM actor.org_unit_ancestors($1) x
235 JOIN actor.org_unit_type y
236 ON x.ou_type = y.id AND y.depth = $2);
237 $$ LANGUAGE SQL STABLE;
239 CREATE OR REPLACE FUNCTION actor.org_unit_full_path ( INT ) RETURNS SETOF actor.org_unit AS $$
241 FROM actor.org_unit_ancestors($1)
244 FROM actor.org_unit_descendants($1);
245 $$ LANGUAGE SQL STABLE;
247 CREATE OR REPLACE FUNCTION actor.org_unit_full_path ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
248 SELECT * FROM actor.org_unit_full_path((actor.org_unit_ancestor_at_depth($1, $2)).id)
249 $$ LANGUAGE SQL STABLE;
251 CREATE OR REPLACE FUNCTION actor.org_unit_combined_ancestors ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
253 FROM actor.org_unit_ancestors($1)
256 FROM actor.org_unit_ancestors($2);
257 $$ LANGUAGE SQL STABLE;
259 CREATE OR REPLACE FUNCTION actor.org_unit_common_ancestors ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
261 FROM actor.org_unit_ancestors($1)
264 FROM actor.org_unit_ancestors($2);
265 $$ LANGUAGE SQL STABLE;
267 CREATE OR REPLACE FUNCTION actor.org_unit_proximity ( INT, INT ) RETURNS INT AS $$
268 SELECT COUNT(id)::INT FROM (
269 SELECT id FROM actor.org_unit_combined_ancestors($1, $2)
271 SELECT id FROM actor.org_unit_common_ancestors($1, $2)
273 $$ LANGUAGE SQL STABLE;
275 CREATE OR REPLACE FUNCTION actor.org_unit_ancestor_setting( setting_name TEXT, org_id INT ) RETURNS SETOF actor.org_unit_setting AS $$
282 SELECT INTO setting * FROM actor.org_unit_setting WHERE org_unit = cur_org AND name = setting_name;
286 SELECT INTO cur_org parent_ou FROM actor.org_unit WHERE id = cur_org;
287 EXIT WHEN cur_org IS NULL;
291 $$ LANGUAGE plpgsql STABLE;
293 COMMENT ON FUNCTION actor.org_unit_ancestor_setting( TEXT, INT) IS $$
295 * Search "up" the org_unit tree until we find the first occurrence of an
296 * org_unit_setting with the given name.
300 -- Intended to be used in a unique index on authority.record_entry like so:
301 -- CREATE UNIQUE INDEX unique_by_heading_and_thesaurus
302 -- ON authority.record_entry (authority.normalize_heading(marc))
303 -- WHERE deleted IS FALSE or deleted = FALSE;
304 CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
310 use MARC::File::XML (BinaryEncoding => 'UTF8');
311 use UUID::Tiny ':std';
313 my $xml = shift() or return undef;
317 # Prevent errors in XML parsing from blowing out ungracefully
319 $r = MARC::Record->new_from_xml( $xml );
322 return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
326 return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
329 # From http://www.loc.gov/standards/sourcelist/subject.html
330 my $thes_code_map = {
336 n => 'notapplicable',
342 # Default to "No attempt to code" if the leader is horribly broken
343 my $fixed_field = $r->field('008');
346 $thes_char = substr($fixed_field->data(), 11, 1) || '|';
349 my $thes_code = 'UNDEFINED';
351 if ($thes_char eq 'z') {
352 # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
353 $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
354 } elsif ($thes_code_map->{$thes_char}) {
355 $thes_code = $thes_code_map->{$thes_char};
359 my $head = $r->field('1..');
361 # Concatenate all of these subfields together, prefixed by their code
362 # to prevent collisions along the lines of "Fiction, North Carolina"
363 foreach my $sf ($head->subfields()) {
364 $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
368 # Perhaps better to parameterize the spi and pass as a parameter
369 $auth_txt =~ s/'//go;
372 my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
373 my $norm_txt = $result->{rows}[0]->{norm_text};
374 return $head->tag() . "_" . $thes_code . " " . $norm_txt;
377 return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
378 $func$ LANGUAGE 'plperlu' IMMUTABLE;
380 COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$
382 * Extract the authority heading, thesaurus, and NACO-normalized values
383 * from an authority record. The primary purpose is to build a unique
384 * index to defend against duplicated authority records from the same