Open-ILS/src/sql/Pg/020.schema.functions.sql

   1 /*
   2  * Copyright (C) 2004-2008  Georgia Public Library Service
   3  * Copyright (C) 2007-2008  Equinox Software, Inc.
   4  * Mike Rylander <miker@esilibrary.com>
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * as published by the Free Software Foundation; either version 2
   9  * of the License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  */
  17
  18 CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS TEXT AS $$
  19         SELECT  SUBSTRING(
  20                         REGEXP_REPLACE(
  21                                 REGEXP_REPLACE(
  22                                         $1,
  23                                         E'\W*$',
  24                                         ''
  25                                 ),
  26                                 '  ',
  27                                 ' '
  28                         ),
  29                         CASE
  30                                 WHEN $2::INT NOT BETWEEN 48 AND 57 THEN 1
  31                                 ELSE $2::TEXT::INT + 1
  32                         END
  33                 );
  34 $$ LANGUAGE SQL STRICT IMMUTABLE;
  35
  36 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
  37         use Unicode::Normalize;
  38         use Encode;
  39
  40         # When working with Unicode data, the first step is to decode it to
  41         # a byte string; after that, lowercasing is safe
  42         my $txt = lc(decode_utf8(shift));
  43         my $sf = shift;
  44
  45         $txt = NFD($txt);
  46         $txt =~ s/\pM+//go;     # Remove diacritics
  47
  48         # remove non-combining diacritics
  49         # this list of characters follows the NACO normalization spec,
  50         # but a looser but more comprehensive version might be
  51         # $txt =~ s/\pLm+//go;
  52         $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
  53
  54         $txt =~ s/\xE6/AE/go;   # Convert ae digraph
  55         $txt =~ s/\x{153}/OE/go;# Convert oe digraph
  56         $txt =~ s/\xFE/TH/go;   # Convert Icelandic thorn
  57
  58         $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
  59         $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
  60
  61         $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;             # Convert Latin and Greek
  62         $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /;     # Convert Misc
  63         $txt =~ tr/\'\[\]\|//d;                                                 # Remove Misc
  64
  65         if ($sf && $sf =~ /^a/o) {
  66                 my $commapos = index($txt,',');
  67                 if ($commapos > -1) {
  68                         if ($commapos != length($txt) - 1) {
  69                                 my @list = split /,/, $txt;
  70                                 my $first = shift @list;
  71                                 $txt = $first . ',' . join(' ', @list);
  72                         } else {
  73                                 $txt =~ s/,/ /go;
  74                         }
  75                 }
  76         } else {
  77                 $txt =~ s/,/ /go;
  78         }
  79
  80         $txt =~ s/\s+/ /go;     # Compress multiple spaces
  81         $txt =~ s/^\s+//o;      # Remove leading space
  82         $txt =~ s/\s+$//o;      # Remove trailing space
  83
  84         # Encoding the outgoing string is good practice, but not strictly
  85         # necessary in this case because we've stripped everything from it
  86         return encode_utf8($txt);
  87 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
  88
  89 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$
  90         SELECT public.naco_normalize($1,'');
  91 $func$ LANGUAGE 'sql' STRICT IMMUTABLE;
  92
  93 CREATE OR REPLACE FUNCTION public.first_word ( TEXT ) RETURNS TEXT AS $$
  94         SELECT COALESCE(SUBSTRING( $1 FROM $_$^\S+$_$), '');
  95 $$ LANGUAGE SQL STRICT IMMUTABLE;
  96
  97 CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
  98         SELECT public.naco_normalize($1,'a');
  99 $func$ LANGUAGE SQL STRICT IMMUTABLE;
 100
 101 CREATE OR REPLACE FUNCTION public.normalize_space( TEXT ) RETURNS TEXT AS $$
 102     SELECT regexp_replace(regexp_replace(regexp_replace($1, E'\\n', ' ', 'g'), E'(?:^\\s+)|(\\s+$)', '', 'g'), E'\\s+', ' ', 'g');
 103 $$ LANGUAGE SQL STRICT IMMUTABLE;
 104
 105 CREATE OR REPLACE FUNCTION public.remove_commas( TEXT ) RETURNS TEXT AS $$
 106     SELECT regexp_replace($1, ',', '', 'g');
 107 $$ LANGUAGE SQL STRICT IMMUTABLE;
 108
 109 CREATE OR REPLACE FUNCTION public.remove_paren_substring( TEXT ) RETURNS TEXT AS $func$
 110     SELECT regexp_replace($1, $$\([^)]+\)$$, '', 'g');
 111 $func$ LANGUAGE SQL STRICT IMMUTABLE;
 112
 113 CREATE OR REPLACE FUNCTION public.remove_whitespace( TEXT ) RETURNS TEXT AS $$
 114     SELECT regexp_replace(normalize_space($1), E'\\s+', '', 'g');
 115 $$ LANGUAGE SQL STRICT IMMUTABLE;
 116
 117 CREATE OR REPLACE FUNCTION public.lowercase( TEXT ) RETURNS TEXT AS $$
 118     return lc(shift);
 119 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
 120
 121 CREATE OR REPLACE FUNCTION public.uppercase( TEXT ) RETURNS TEXT AS $$
 122     return uc(shift);
 123 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
 124
 125 CREATE OR REPLACE FUNCTION public.remove_diacritics( TEXT ) RETURNS TEXT AS $$
 126     use Unicode::Normalize;
 127
 128     my $x = NFD(shift);
 129     $x =~ s/\pM+//go;
 130     return $x;
 131
 132 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
 133
 134 CREATE OR REPLACE FUNCTION public.entityize( TEXT ) RETURNS TEXT AS $$
 135     use Unicode::Normalize;
 136
 137     my $x = NFC(shift);
 138     $x =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe;
 139     return $x;
 140
 141 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
 142
 143 CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT ) RETURNS TEXT AS $$
 144         my $txt = shift;
 145         $txt =~ s/^\s+//o;
 146         $txt =~ s/[\[\]\{\}\(\)`'"#<>\*\?\-\+\$\\]+//og;
 147         $txt =~ s/\s+$//o;
 148         if ($txt =~ /(\d{3}(?:\.\d+)?)/o) {
 149                 return $1;
 150         } else {
 151                 return (split /\s+/, $txt)[0];
 152         }
 153 $$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
 154
 155 CREATE OR REPLACE FUNCTION public.call_number_dewey( TEXT, INT ) RETURNS TEXT AS $$
 156         SELECT SUBSTRING(call_number_dewey($1) FROM 1 FOR $2);
 157 $$ LANGUAGE SQL STRICT IMMUTABLE;
 158
 159 CREATE OR REPLACE FUNCTION tableoid2name ( oid ) RETURNS TEXT AS $$
 160         BEGIN
 161                 RETURN $1::regclass;
 162         END;
 163 $$ language 'plpgsql';
 164
 165 CREATE OR REPLACE FUNCTION actor.org_unit_descendants( INT, INT ) RETURNS SETOF actor.org_unit AS $$
 166     WITH RECURSIVE descendant_depth AS (
 167         SELECT  ou.id,
 168                 ou.parent_ou,
 169                 out.depth
 170           FROM  actor.org_unit ou
 171                 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
 172                 JOIN anscestor_depth ad ON (ad.id = ou.id)
 173           WHERE ad.depth = $2
 174             UNION ALL
 175         SELECT  ou.id,
 176                 ou.parent_ou,
 177                 out.depth
 178           FROM  actor.org_unit ou
 179                 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
 180                 JOIN descendant_depth ot ON (ot.id = ou.parent_ou)
 181     ), anscestor_depth AS (
 182         SELECT  ou.id,
 183                 ou.parent_ou,
 184                 out.depth
 185           FROM  actor.org_unit ou
 186                 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
 187           WHERE ou.id = $1
 188             UNION ALL
 189         SELECT  ou.id,
 190                 ou.parent_ou,
 191                 out.depth
 192           FROM  actor.org_unit ou
 193                 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
 194                 JOIN anscestor_depth ot ON (ot.parent_ou = ou.id)
 195     ) SELECT ou.* FROM actor.org_unit ou JOIN descendant_depth USING (id);
 196 $$ LANGUAGE SQL;
 197
 198 CREATE OR REPLACE FUNCTION actor.org_unit_descendants( INT ) RETURNS SETOF actor.org_unit AS $$
 199     WITH RECURSIVE descendant_depth AS (
 200         SELECT  ou.id,
 201                 ou.parent_ou,
 202                 out.depth
 203           FROM  actor.org_unit ou
 204                 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
 205           WHERE ou.id = $1
 206             UNION ALL
 207         SELECT  ou.id,
 208                 ou.parent_ou,
 209                 out.depth
 210           FROM  actor.org_unit ou
 211                 JOIN actor.org_unit_type out ON (out.id = ou.ou_type)
 212                 JOIN descendant_depth ot ON (ot.id = ou.parent_ou)
 213     ) SELECT ou.* FROM actor.org_unit ou JOIN descendant_depth USING (id);
 214 $$ LANGUAGE SQL;
 215
 216 CREATE OR REPLACE FUNCTION actor.org_unit_ancestors( INT ) RETURNS SETOF actor.org_unit AS $$
 217     WITH RECURSIVE anscestor_depth AS (
 218         SELECT  ou.id,
 219                 ou.parent_ou
 220           FROM  actor.org_unit ou
 221           WHERE ou.id = $1
 222             UNION ALL
 223         SELECT  ou.id,
 224                 ou.parent_ou
 225           FROM  actor.org_unit ou
 226                 JOIN anscestor_depth ot ON (ot.parent_ou = ou.id)
 227     ) SELECT ou.* FROM actor.org_unit ou JOIN anscestor_depth USING (id);
 228 $$ LANGUAGE SQL;
 229
 230 CREATE OR REPLACE FUNCTION actor.org_unit_ancestor_at_depth ( INT,INT ) RETURNS actor.org_unit AS $$
 231         SELECT  a.*
 232           FROM  actor.org_unit a
 233           WHERE id = ( SELECT FIRST(x.id)
 234                          FROM   actor.org_unit_ancestors($1) x
 235                                 JOIN actor.org_unit_type y
 236                                         ON x.ou_type = y.id AND y.depth = $2);
 237 $$ LANGUAGE SQL STABLE;
 238
 239 CREATE OR REPLACE FUNCTION actor.org_unit_full_path ( INT ) RETURNS SETOF actor.org_unit AS $$
 240         SELECT  *
 241           FROM  actor.org_unit_ancestors($1)
 242                         UNION
 243         SELECT  *
 244           FROM  actor.org_unit_descendants($1);
 245 $$ LANGUAGE SQL STABLE;
 246
 247 CREATE OR REPLACE FUNCTION actor.org_unit_full_path ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
 248         SELECT  * FROM actor.org_unit_full_path((actor.org_unit_ancestor_at_depth($1, $2)).id)
 249 $$ LANGUAGE SQL STABLE;
 250
 251 CREATE OR REPLACE FUNCTION actor.org_unit_combined_ancestors ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
 252         SELECT  *
 253           FROM  actor.org_unit_ancestors($1)
 254                         UNION
 255         SELECT  *
 256           FROM  actor.org_unit_ancestors($2);
 257 $$ LANGUAGE SQL STABLE;
 258
 259 CREATE OR REPLACE FUNCTION actor.org_unit_common_ancestors ( INT, INT ) RETURNS SETOF actor.org_unit AS $$
 260         SELECT  *
 261           FROM  actor.org_unit_ancestors($1)
 262                         INTERSECT
 263         SELECT  *
 264           FROM  actor.org_unit_ancestors($2);
 265 $$ LANGUAGE SQL STABLE;
 266
 267 CREATE OR REPLACE FUNCTION actor.org_unit_proximity ( INT, INT ) RETURNS INT AS $$
 268         SELECT COUNT(id)::INT FROM (
 269                 SELECT id FROM actor.org_unit_combined_ancestors($1, $2)
 270                         EXCEPT
 271                 SELECT id FROM actor.org_unit_common_ancestors($1, $2)
 272         ) z;
 273 $$ LANGUAGE SQL STABLE;
 274
 275 CREATE OR REPLACE FUNCTION actor.org_unit_ancestor_setting( setting_name TEXT, org_id INT ) RETURNS SETOF actor.org_unit_setting AS $$
 276 DECLARE
 277     setting RECORD;
 278     cur_org INT;
 279 BEGIN
 280     cur_org := org_id;
 281     LOOP
 282         SELECT INTO setting * FROM actor.org_unit_setting WHERE org_unit = cur_org AND name = setting_name;
 283         IF FOUND THEN
 284             RETURN NEXT setting;
 285         END IF;
 286         SELECT INTO cur_org parent_ou FROM actor.org_unit WHERE id = cur_org;
 287         EXIT WHEN cur_org IS NULL;
 288     END LOOP;
 289     RETURN;
 290 END;
 291 $$ LANGUAGE plpgsql STABLE;
 292
 293 COMMENT ON FUNCTION actor.org_unit_ancestor_setting( TEXT, INT) IS $$
 294 /**
 295 * Search "up" the org_unit tree until we find the first occurrence of an
 296 * org_unit_setting with the given name.
 297 */
 298 $$;
 299
 300 -- Intended to be used in a unique index on authority.record_entry like so:
 301 -- CREATE UNIQUE INDEX unique_by_heading_and_thesaurus
 302 --   ON authority.record_entry (authority.normalize_heading(marc))
 303 --   WHERE deleted IS FALSE or deleted = FALSE;
 304 CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
 305     use strict;
 306     use warnings;
 307
 308     use utf8;
 309     use MARC::Record;
 310     use MARC::File::XML (BinaryEncoding => 'UTF8');
 311     use UUID::Tiny ':std';
 312
 313     my $xml = shift() or return undef;
 314
 315     my $r;
 316
 317     # Prevent errors in XML parsing from blowing out ungracefully
 318     eval {
 319         $r = MARC::Record->new_from_xml( $xml );
 320         1;
 321     } or do {
 322        return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
 323     };
 324
 325     if (!$r) {
 326        return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
 327     }
 328
 329     # From http://www.loc.gov/standards/sourcelist/subject.html
 330     my $thes_code_map = {
 331         a => 'lcsh',
 332         b => 'lcshac',
 333         c => 'mesh',
 334         d => 'nal',
 335         k => 'cash',
 336         n => 'notapplicable',
 337         r => 'aat',
 338         s => 'sears',
 339         v => 'rvm',
 340     };
 341
 342     # Default to "No attempt to code" if the leader is horribly broken
 343     my $fixed_field = $r->field('008');
 344     my $thes_char = '|';
 345     if ($fixed_field) {
 346         $thes_char = substr($fixed_field->data(), 11, 1) || '|';
 347     }
 348
 349     my $thes_code = 'UNDEFINED';
 350
 351     if ($thes_char eq 'z') {
 352         # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
 353         $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
 354     } elsif ($thes_code_map->{$thes_char}) {
 355         $thes_code = $thes_code_map->{$thes_char};
 356     }
 357
 358     my $auth_txt = '';
 359     my $head = $r->field('1..');
 360     if ($head) {
 361         # Concatenate all of these subfields together, prefixed by their code
 362         # to prevent collisions along the lines of "Fiction, North Carolina"
 363         foreach my $sf ($head->subfields()) {
 364             $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
 365         }
 366     }
 367
 368     # Perhaps better to parameterize the spi and pass as a parameter
 369     $auth_txt =~ s/'//go;
 370
 371     if ($auth_txt) {
 372         my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
 373         my $norm_txt = $result->{rows}[0]->{norm_text};
 374         return $head->tag() . "_" . $thes_code . " " . $norm_txt;
 375     }
 376
 377     return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
 378 $func$ LANGUAGE 'plperlu' IMMUTABLE;
 379
 380 COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$
 381 /**
 382 * Extract the authority heading, thesaurus, and NACO-normalized values
 383 * from an authority record. The primary purpose is to build a unique
 384 * index to defend against duplicated authority records from the same
 385 * thesaurus.
 386 */
 387 $$;