3 -- Make the authority heading normalization routine more defensive
4 -- Also drop back to a plain index for 2.0; we will get more restrictive over time
-- Record version '0402' in config.upgrade_log.
-- NOTE(review): presumably this marks the upgrade script as applied — confirm against the upgrade tooling.
6 INSERT INTO config.upgrade_log (version) VALUES ('0402'); -- dbs
-- authority.normalize_heading(TEXT) RETURNS TEXT
--
-- Builds a text key for an authority record from its MARCXML.
--   Argument: the record's MARCXML as TEXT.
--   Returns one of:
--     * NULL (Perl undef) when the argument is missing or false-valued;
--     * 'BAD_MARCXML_' || <MD5-based UUID of the XML>;
--     * '<heading tag>_<thesaurus code> <naco_normalize()d heading text>';
--     * 'NOHEADING_<thesaurus code> ' || <MD5-based UUID of the XML>.
-- Written in untrusted PL/Perl (plperlu) and declared IMMUTABLE; it is used as
-- an index expression on authority.record_entry later in this script.
-- NOTE(review): the body runs a query through SPI (public.naco_normalize), so
-- IMMUTABLE holds only if that function is itself deterministic — confirm.
8 CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
14 use MARC::File::XML (BinaryEncoding => 'UTF8');
15 use UUID::Tiny ':std';
# Take the single argument; bail out with undef when it is absent or false.
17 my $xml = shift() or return undef;
21 # Prevent errors in XML parsing from blowing out ungracefully
23 $r = MARC::Record->new_from_xml( $xml );
# Both returns below yield a placeholder key: the fixed prefix 'BAD_MARCXML_'
# followed by an MD5-based UUID computed from the input XML.
26 return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
30 return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
33 # From http://www.loc.gov/standards/sourcelist/subject.html
46 # Default to "No attempt to code" if the leader is horribly broken
47 my $fixed_field = $r->field('008');
# The thesaurus indicator is the single character at offset 11 of the 008 data;
# '|' is substituted when that character is empty or otherwise false.
50 $thes_char = substr($fixed_field->data(), 11, 1) || '|';
# Thesaurus code stays 'UNDEFINED' unless one of the branches below sets it.
53 my $thes_code = 'UNDEFINED';
55 if ($thes_char eq 'z') {
56 # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
57 $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
58 } elsif ($thes_code_map->{$thes_char}) {
# Otherwise translate the indicator character through $thes_code_map.
59 $thes_code = $thes_code_map->{$thes_char};
# Heading field: field() is called with the tag pattern '1..' in scalar context.
63 my $head = $r->field('1..');
65 # Concatenate all of these subfields together, prefixed by their code
66 # to prevent collisions along the lines of "Fiction, North Carolina"
67 foreach my $sf ($head->subfields()) {
# Each subfield contributes: '‡', its code, a space, then its value.
68 $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
72 # Perhaps better to parameterize the spi and pass as a parameter
# NOTE(review): $auth_txt is interpolated straight into a single-quoted SQL
# literal here; verify that quotes are neutralised before this point, or switch
# to spi_prepare/spi_exec_prepared with a bound TEXT parameter.
76 my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
# Take norm_text from the first returned row.
77 my $norm_txt = $result->{rows}[0]->{norm_text};
# Key format: '<tag>_<thesaurus code> <normalized text>'.
78 return $head->tag() . "_" . $thes_code . " " . $norm_txt;
# Placeholder key for this return path: 'NOHEADING_<thesaurus code> ' plus an
# MD5-based UUID of the input XML.
81 return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
82 $func$ LANGUAGE 'plperlu' IMMUTABLE;
-- Drop the index named unique_by_heading_and_thesaurus and create a plain
-- (non-UNIQUE) index in its place.
84 DROP INDEX authority.unique_by_heading_and_thesaurus;
-- Partial expression index on authority.normalize_heading(marc), restricted to
-- rows where deleted is false. The predicate spells that condition two ways
-- (IS FALSE and = FALSE).
-- NOTE(review): presumably both spellings are listed so that queries written
-- either way can match the partial index — confirm before simplifying.
86 CREATE INDEX by_heading_and_thesaurus
87 ON authority.record_entry (authority.normalize_heading(marc))
88 WHERE deleted IS FALSE or deleted = FALSE