From 300ba62e32de6d5f24649c449dd4ab381c37bb61 Mon Sep 17 00:00:00 2001 From: dbs Date: Tue, 3 May 2011 16:29:16 +0000 Subject: [PATCH] Avoid data loss by setting MARC::Charset->assume_unicode(1) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When using MARC::File::XML, MARC::Charset is used to perform character conversions; however, MARC::File::XML does not tell MARC::Charset that it is handling Unicode data. If we do not tell MARC::Charset that it is handling Unicode data, it can return an error which results in the loss of data (typically a subfield containing one or more characters which MARC::Charset does not have an equivalent mapping outside of Unicode). This problem could be reproduced in authority_control_fields.pl with a subfield like "von Hans-Christian Müơller" - when this subfield was encountered without assume_unicode(1), a null string was returned for that subfield, and if the record was written back to the database due to an authority match being found in a different field, the only recourse was to restore the record from auditor.biblio_record_entry_history. The same sort of problems could occur for any other script or function that modifies the data being handed to it using MARC::File::XML and BinaryEncoding => UTF8. Signed-off-by: Dan Scott git-svn-id: svn://svn.open-ils.org/ILS/trunk@20385 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/sql/Pg/002.functions.config.sql | 3 + Open-ILS/src/sql/Pg/002.schema.config.sql | 2 +- Open-ILS/src/sql/Pg/011.schema.authority.sql | 3 + Open-ILS/src/sql/Pg/012.schema.vandelay.sql | 6 + Open-ILS/src/sql/Pg/020.schema.functions.sql | 3 + Open-ILS/src/sql/Pg/030.schema.metabib.sql | 3 + Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql | 413 +++++++++++++++ Open-ILS/src/sql/Pg/999.functions.global.sql | 3 + .../0528.schema.functions_assume_unicode.sql | 491 ++++++++++++++++++ .../authority_control_fields.pl.in | 3 + 10 files changed, 929 insertions(+), 1 deletion(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0528.schema.functions_assume_unicode.sql diff --git a/Open-ILS/src/sql/Pg/002.functions.config.sql b/Open-ILS/src/sql/Pg/002.functions.config.sql index a9eca28831..c97a1dd579 100644 --- a/Open-ILS/src/sql/Pg/002.functions.config.sql +++ b/Open-ILS/src/sql/Pg/002.functions.config.sql @@ -491,9 +491,12 @@ CREATE OR REPLACE FUNCTION maintain_control_numbers() RETURNS TRIGGER AS $func$ use strict; use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; use Encode; use Unicode::Normalize; +MARC::Charset->assume_unicode(1); + my $record = MARC::Record->new_from_xml($_TD->{new}{marc}); my $schema = $_TD->{table_schema}; my $rec_id = $_TD->{new}{id}; diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 91e46e46fa..c0bc8c1d34 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -86,7 +86,7 @@ CREATE TRIGGER no_overlapping_deps BEFORE INSERT OR UPDATE ON config.db_patch_dependencies FOR EACH ROW EXECUTE PROCEDURE evergreen.array_overlap_check ('deprecates'); -INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0526', :eg_version); -- miker +INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0528', :eg_version); -- dbs CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/011.schema.authority.sql b/Open-ILS/src/sql/Pg/011.schema.authority.sql index 3f3f3e563f..133a2438f7 100644 --- a/Open-ILS/src/sql/Pg/011.schema.authority.sql +++ b/Open-ILS/src/sql/Pg/011.schema.authority.sql @@ -121,6 +121,9 @@ CREATE OR REPLACE FUNCTION authority.generate_overlay_template ( TEXT, BIGINT ) use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF-8'); + use MARC::Charset; + + MARC::Charset->assume_unicode(1); my $xml = shift; my $r = MARC::Record->new_from_xml( $xml ); diff --git a/Open-ILS/src/sql/Pg/012.schema.vandelay.sql b/Open-ILS/src/sql/Pg/012.schema.vandelay.sql index 476ee07745..784117002e 100644 --- a/Open-ILS/src/sql/Pg/012.schema.vandelay.sql +++ b/Open-ILS/src/sql/Pg/012.schema.vandelay.sql @@ -309,8 +309,11 @@ CREATE OR REPLACE FUNCTION vandelay.add_field ( target_xml TEXT, source_xml TEXT use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF-8'); + use MARC::Charset; use strict; + MARC::Charset->assume_unicode(1); + my $target_xml = shift; my $source_xml = shift; my $field_spec = shift; @@ -386,8 +389,11 @@ CREATE OR REPLACE FUNCTION vandelay.strip_field ( xml TEXT, field TEXT ) RETURNS use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF-8'); + use MARC::Charset; use strict; + MARC::Charset->assume_unicode(1); + my $xml = shift; my $r = MARC::Record->new_from_xml( $xml ); diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index f7c43c5ffa..981a364e2b 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -338,8 +338,11 @@ CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $ use utf8; use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF8'); + use MARC::Charset; use UUID::Tiny ':std'; + MARC::Charset->assume_unicode(1); + my $xml = shift() or return undef; my $r; diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 2f6539f145..5dc4761691 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -430,6 +430,9 @@ CREATE OR REPLACE FUNCTION biblio.flatten_marc ( TEXT ) RETURNS SETOF metabib.fu use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; + +MARC::Charset->assume_unicode(1); my $xml = shift; my $r = MARC::Record->new_from_xml( $xml ); diff --git a/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql b/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql index ad5376d54e..6d6451edbd 100644 --- a/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql +++ b/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql @@ -18705,8 +18705,11 @@ CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $ use utf8; use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF8'); + use MARC::Charset; use UUID::Tiny ':std'; + MARC::Charset->assume_unicode(1); + my $xml = shift() or return undef; my $r; @@ -19193,6 +19196,416 @@ CREATE INDEX cp_create_date ON asset.copy (create_date); -- Speed up call number browsing CREATE INDEX asset_call_number_label_sortkey_browse ON asset.call_number(oils_text_as_bytea(label_sortkey), oils_text_as_bytea(label), id, owning_lib) WHERE deleted IS FALSE OR deleted = FALSE; +-- Add MARC::Charset->assume_unicode(1) to improve handling of Unicode characters +CREATE OR REPLACE FUNCTION maintain_control_numbers() RETURNS TRIGGER AS $func$ +use strict; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; +use Encode; +use Unicode::Normalize; + +MARC::Charset->assume_unicode(1); + +my $record = MARC::Record->new_from_xml($_TD->{new}{marc}); +my $schema = $_TD->{table_schema}; +my $rec_id = $_TD->{new}{id}; + +# Short-circuit if maintaining control numbers per MARC21 spec is not enabled +my $enable = spi_exec_query("SELECT enabled FROM config.global_flag WHERE name = 'cat.maintain_control_numbers'"); +if (!($enable->{processed}) or $enable->{rows}[0]->{enabled} eq 'f') { + return; +} + +# Get the control number identifier from an OU setting based on $_TD->{new}{owner} +my $ou_cni = 'EVRGRN'; + +my $owner; +if ($schema eq 'serial') { + $owner = $_TD->{new}{owning_lib}; +} else { + # are.owner and bre.owner can be null, so fall back to the consortial setting + $owner = $_TD->{new}{owner} || 1; +} + +my $ous_rv = spi_exec_query("SELECT value FROM actor.org_unit_ancestor_setting('cat.marc_control_number_identifier', $owner)"); +if ($ous_rv->{processed}) { + $ou_cni = $ous_rv->{rows}[0]->{value}; + $ou_cni =~ s/"//g; # Stupid VIM syntax highlighting" +} else { + # Fall back to the shortname of the OU if there was no OU setting + $ous_rv = spi_exec_query("SELECT shortname FROM actor.org_unit WHERE id = $owner"); + if ($ous_rv->{processed}) { + $ou_cni = $ous_rv->{rows}[0]->{shortname}; + } +} + +my ($create, $munge) = (0, 0); + +my @scns = $record->field('035'); + +foreach my $id_field ('001', '003') { + my $spec_value; + my @controls = $record->field($id_field); + + if ($id_field eq '001') { + $spec_value = $rec_id; + } else { + $spec_value = $ou_cni; + } + + # Create the 001/003 if none exist + if (scalar(@controls) == 1) { + # Only one field; check to see if we need to munge it + unless (grep $_->data() eq $spec_value, @controls) { + $munge = 1; + } + } else { + # Delete the other fields, as with more than 1 001/003 we do not know which 003/001 to match + foreach my $control (@controls) { + unless ($control->data() eq $spec_value) { + $record->delete_field($control); + } + } + $record->insert_fields_ordered(MARC::Field->new($id_field, $spec_value)); + $create = 1; + } +} + +# Now, if we need to munge the 001, we will first push the existing 001/003 +# into the 035; but if the record did not have one (and one only) 001 and 003 +# to begin with, skip this process +if ($munge and not $create) { + my $scn = "(" . $record->field('003')->data() . ")" . $record->field('001')->data(); + + # Do not create duplicate 035 fields + unless (grep $_->subfield('a') eq $scn, @scns) { + $record->insert_fields_ordered(MARC::Field->new('035', '', '', 'a' => $scn)); + } +} + +# Set the 001/003 and update the MARC +if ($create or $munge) { + $record->field('001')->data($rec_id); + $record->field('003')->data($ou_cni); + + my $xml = $record->as_xml_record(); + $xml =~ s/\n//sgo; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize() + # to avoid having to set PERL5LIB for PostgreSQL as well + + # If we are going to convert non-ASCII characters to XML entities, + # we had better be dealing with a UTF8 string to begin with + $xml = decode_utf8($xml); + + $xml = NFC($xml); + + # Convert raw ampersands to entities + $xml =~ s/&(?!\S+;)/&/gso; + + # Convert Unicode characters to entities + $xml =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; + + $xml =~ s/[\x00-\x1f]//go; + $_TD->{new}{marc} = $xml; + + return "MODIFY"; +} + +return; +$func$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION authority.generate_overlay_template ( TEXT, BIGINT ) RETURNS TEXT AS $func$ + + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF-8'); + use MARC::Charset; + + MARC::Charset->assume_unicode(1); + + my $xml = shift; + my $r = MARC::Record->new_from_xml( $xml ); + + return undef unless ($r); + + my $id = shift() || $r->subfield( '901' => 'c' ); + $id =~ s/^\s*(?:\([^)]+\))?\s*(.+)\s*?$/$1/; + return undef unless ($id); # We need an ID! + + my $tmpl = MARC::Record->new(); + $tmpl->encoding( 'UTF-8' ); + + my @rule_fields; + for my $field ( $r->field( '1..' ) ) { # Get main entry fields from the authority record + + my $tag = $field->tag; + my $i1 = $field->indicator(1); + my $i2 = $field->indicator(2); + my $sf = join '', map { $_->[0] } $field->subfields; + my @data = map { @$_ } $field->subfields; + + my @replace_them; + + # Map the authority field to bib fields it can control. + if ($tag >= 100 and $tag <= 111) { # names + @replace_them = map { $tag + $_ } (0, 300, 500, 600, 700); + } elsif ($tag eq '130') { # uniform title + @replace_them = qw/130 240 440 730 830/; + } elsif ($tag >= 150 and $tag <= 155) { # subjects + @replace_them = ($tag + 500); + } elsif ($tag >= 180 and $tag <= 185) { # floating subdivisions + @replace_them = qw/100 400 600 700 800 110 410 610 710 810 111 411 611 711 811 130 240 440 730 830 650 651 655/; + } else { + next; + } + + # Dummy up the bib-side data + $tmpl->append_fields( + map { + MARC::Field->new( $_, $i1, $i2, @data ) + } @replace_them + ); + + # Construct some 'replace' rules + push @rule_fields, map { $_ . $sf . '[0~\)' .$id . '$]' } @replace_them; + } + + # Insert the replace rules into the template + $tmpl->append_fields( + MARC::Field->new( '905' => ' ' => ' ' => 'r' => join(',', @rule_fields ) ) + ); + + $xml = $tmpl->as_xml_record; + $xml =~ s/^<\?.+?\?>$//mo; + $xml =~ s/\n//sgo; + $xml =~ s/>\s+ 'UTF-8'); + use MARC::Charset; + use strict; + + MARC::Charset->assume_unicode(1); + + my $target_xml = shift; + my $source_xml = shift; + my $field_spec = shift; + my $force_add = shift || 0; + + my $target_r = MARC::Record->new_from_xml( $target_xml ); + my $source_r = MARC::Record->new_from_xml( $source_xml ); + + return $target_xml unless ($target_r && $source_r); + + my @field_list = split(',', $field_spec); + + my %fields; + for my $f (@field_list) { + $f =~ s/^\s*//; $f =~ s/\s*$//; + if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) { + my $field = $1; + $field =~ s/\s+//; + my $sf = $2; + $sf =~ s/\s+//; + my $match = $3; + $match =~ s/^\s*//; $match =~ s/\s*$//; + $fields{$field} = { sf => [ split('', $sf) ] }; + if ($match) { + my ($msf,$mre) = split('~', $match); + if (length($msf) > 0 and length($mre) > 0) { + $msf =~ s/^\s*//; $msf =~ s/\s*$//; + $mre =~ s/^\s*//; $mre =~ s/\s*$//; + $fields{$field}{match} = { sf => $msf, re => qr/$mre/ }; + } + } + } + } + + for my $f ( keys %fields) { + if ( @{$fields{$f}{sf}} ) { + for my $from_field ($source_r->field( $f )) { + my @tos = $target_r->field( $f ); + if (!@tos) { + next if (exists($fields{$f}{match}) and !$force_add); + my @new_fields = map { $_->clone } $source_r->field( $f ); + $target_r->insert_fields_ordered( @new_fields ); + } else { + for my $to_field (@tos) { + if (exists($fields{$f}{match})) { + next unless (grep { $_ =~ $fields{$f}{match}{re} } $to_field->subfield($fields{$f}{match}{sf})); + } + my @new_sf = map { ($_ => $from_field->subfield($_)) } @{$fields{$f}{sf}}; + $to_field->add_subfields( @new_sf ); + } + } + } + } else { + my @new_fields = map { $_->clone } $source_r->field( $f ); + $target_r->insert_fields_ordered( @new_fields ); + } + } + + $target_xml = $target_r->as_xml_record; + $target_xml =~ s/^<\?.+?\?>$//mo; + $target_xml =~ s/\n//sgo; + $target_xml =~ s/>\s+ 'UTF-8'); + use MARC::Charset; + use strict; + + MARC::Charset->assume_unicode(1); + + my $xml = shift; + my $r = MARC::Record->new_from_xml( $xml ); + + return $xml unless ($r); + + my $field_spec = shift; + my @field_list = split(',', $field_spec); + + my %fields; + for my $f (@field_list) { + $f =~ s/^\s*//; $f =~ s/\s*$//; + if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) { + my $field = $1; + $field =~ s/\s+//; + my $sf = $2; + $sf =~ s/\s+//; + my $match = $3; + $match =~ s/^\s*//; $match =~ s/\s*$//; + $fields{$field} = { sf => [ split('', $sf) ] }; + if ($match) { + my ($msf,$mre) = split('~', $match); + if (length($msf) > 0 and length($mre) > 0) { + $msf =~ s/^\s*//; $msf =~ s/\s*$//; + $mre =~ s/^\s*//; $mre =~ s/\s*$//; + $fields{$field}{match} = { sf => $msf, re => qr/$mre/ }; + } + } + } + } + + for my $f ( keys %fields) { + for my $to_field ($r->field( $f )) { + if (exists($fields{$f}{match})) { + next unless (grep { $_ =~ $fields{$f}{match}{re} } $to_field->subfield($fields{$f}{match}{sf})); + } + + if ( @{$fields{$f}{sf}} ) { + $to_field->delete_subfield(code => $fields{$f}{sf}); + } else { + $r->delete_field( $to_field ); + } + } + } + + $xml = $r->as_xml_record; + $xml =~ s/^<\?.+?\?>$//mo; + $xml =~ s/\n//sgo; + $xml =~ s/>\s+ 'UTF-8'); +use MARC::Charset; + +MARC::Charset->assume_unicode(1); + +my $xml = shift; +my $r = MARC::Record->new_from_xml( $xml ); + +return_next( { tag => 'LDR', value => $r->leader } ); + +for my $f ( $r->fields ) { + if ($f->is_control_field) { + return_next({ tag => $f->tag, value => $f->data }); + } else { + for my $s ($f->subfields) { + return_next({ + tag => $f->tag, + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => $s->[0], + value => $s->[1] + }); + + if ( $f->tag eq '245' and $s->[0] eq 'a' ) { + my $trim = $f->indicator(2) || 0; + return_next({ + tag => 'tnf', + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => 'a', + value => substr( $s->[1], $trim ) + }); + } + } + } +} + +return undef; + +$func$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION authority.flatten_marc ( TEXT ) RETURNS SETOF authority.full_rec AS $func$ + +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; + +MARC::Charset->assume_unicode(1); + +my $xml = shift; +my $r = MARC::Record->new_from_xml( $xml ); + +return_next( { tag => 'LDR', value => $r->leader } ); + +for my $f ( $r->fields ) { + if ($f->is_control_field) { + return_next({ tag => $f->tag, value => $f->data }); + } else { + for my $s ($f->subfields) { + return_next({ + tag => $f->tag, + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => $s->[0], + value => $s->[1] + }); + + } + } +} + +return undef; + +$func$ LANGUAGE PLPERLU; + + + \qecho Upgrade script completed. \qecho But wait, there's more: please run reingest-1.6-2.0.pl \qecho in order to create an SQL script to run to partially reindex diff --git a/Open-ILS/src/sql/Pg/999.functions.global.sql b/Open-ILS/src/sql/Pg/999.functions.global.sql index 71c45c3c63..eeb7284ef4 100644 --- a/Open-ILS/src/sql/Pg/999.functions.global.sql +++ b/Open-ILS/src/sql/Pg/999.functions.global.sql @@ -1371,6 +1371,9 @@ CREATE OR REPLACE FUNCTION authority.flatten_marc ( TEXT ) RETURNS SETOF authori use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; + +MARC::Charset->assume_unicode(1); my $xml = shift; my $r = MARC::Record->new_from_xml( $xml ); diff --git a/Open-ILS/src/sql/Pg/upgrade/0528.schema.functions_assume_unicode.sql b/Open-ILS/src/sql/Pg/upgrade/0528.schema.functions_assume_unicode.sql new file mode 100644 index 0000000000..fb1935694b --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0528.schema.functions_assume_unicode.sql @@ -0,0 +1,491 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0528'); -- dbs + +CREATE OR REPLACE FUNCTION maintain_control_numbers() RETURNS TRIGGER AS $func$ +use strict; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; +use Encode; +use Unicode::Normalize; + +MARC::Charset->assume_unicode(1); + +my $record = MARC::Record->new_from_xml($_TD->{new}{marc}); +my $schema = $_TD->{table_schema}; +my $rec_id = $_TD->{new}{id}; + +# Short-circuit if maintaining control numbers per MARC21 spec is not enabled +my $enable = spi_exec_query("SELECT enabled FROM config.global_flag WHERE name = 'cat.maintain_control_numbers'"); +if (!($enable->{processed}) or $enable->{rows}[0]->{enabled} eq 'f') { + return; +} + +# Get the control number identifier from an OU setting based on $_TD->{new}{owner} +my $ou_cni = 'EVRGRN'; + +my $owner; +if ($schema eq 'serial') { + $owner = $_TD->{new}{owning_lib}; +} else { + # are.owner and bre.owner can be null, so fall back to the consortial setting + $owner = $_TD->{new}{owner} || 1; +} + +my $ous_rv = spi_exec_query("SELECT value FROM actor.org_unit_ancestor_setting('cat.marc_control_number_identifier', $owner)"); +if ($ous_rv->{processed}) { + $ou_cni = $ous_rv->{rows}[0]->{value}; + $ou_cni =~ s/"//g; # Stupid VIM syntax highlighting" +} else { + # Fall back to the shortname of the OU if there was no OU setting + $ous_rv = spi_exec_query("SELECT shortname FROM actor.org_unit WHERE id = $owner"); + if ($ous_rv->{processed}) { + $ou_cni = $ous_rv->{rows}[0]->{shortname}; + } +} + +my ($create, $munge) = (0, 0); + +my @scns = $record->field('035'); + +foreach my $id_field ('001', '003') { + my $spec_value; + my @controls = $record->field($id_field); + + if ($id_field eq '001') { + $spec_value = $rec_id; + } else { + $spec_value = $ou_cni; + } + + # Create the 001/003 if none exist + if (scalar(@controls) == 1) { + # Only one field; check to see if we need to munge it + unless (grep $_->data() eq $spec_value, @controls) { + $munge = 1; + } + } else { + # Delete the other fields, as with more than 1 001/003 we do not know which 003/001 to match + foreach my $control (@controls) { + unless ($control->data() eq $spec_value) { + $record->delete_field($control); + } + } + $record->insert_fields_ordered(MARC::Field->new($id_field, $spec_value)); + $create = 1; + } +} + +# Now, if we need to munge the 001, we will first push the existing 001/003 +# into the 035; but if the record did not have one (and one only) 001 and 003 +# to begin with, skip this process +if ($munge and not $create) { + my $scn = "(" . $record->field('003')->data() . ")" . $record->field('001')->data(); + + # Do not create duplicate 035 fields + unless (grep $_->subfield('a') eq $scn, @scns) { + $record->insert_fields_ordered(MARC::Field->new('035', '', '', 'a' => $scn)); + } +} + +# Set the 001/003 and update the MARC +if ($create or $munge) { + $record->field('001')->data($rec_id); + $record->field('003')->data($ou_cni); + + my $xml = $record->as_xml_record(); + $xml =~ s/\n//sgo; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize() + # to avoid having to set PERL5LIB for PostgreSQL as well + + # If we are going to convert non-ASCII characters to XML entities, + # we had better be dealing with a UTF8 string to begin with + $xml = decode_utf8($xml); + + $xml = NFC($xml); + + # Convert raw ampersands to entities + $xml =~ s/&(?!\S+;)/&/gso; + + # Convert Unicode characters to entities + $xml =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; + + $xml =~ s/[\x00-\x1f]//go; + $_TD->{new}{marc} = $xml; + + return "MODIFY"; +} + +return; +$func$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION authority.generate_overlay_template ( TEXT, BIGINT ) RETURNS TEXT AS $func$ + + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF-8'); + use MARC::Charset; + + MARC::Charset->assume_unicode(1); + + my $xml = shift; + my $r = MARC::Record->new_from_xml( $xml ); + + return undef unless ($r); + + my $id = shift() || $r->subfield( '901' => 'c' ); + $id =~ s/^\s*(?:\([^)]+\))?\s*(.+)\s*?$/$1/; + return undef unless ($id); # We need an ID! + + my $tmpl = MARC::Record->new(); + $tmpl->encoding( 'UTF-8' ); + + my @rule_fields; + for my $field ( $r->field( '1..' ) ) { # Get main entry fields from the authority record + + my $tag = $field->tag; + my $i1 = $field->indicator(1); + my $i2 = $field->indicator(2); + my $sf = join '', map { $_->[0] } $field->subfields; + my @data = map { @$_ } $field->subfields; + + my @replace_them; + + # Map the authority field to bib fields it can control. + if ($tag >= 100 and $tag <= 111) { # names + @replace_them = map { $tag + $_ } (0, 300, 500, 600, 700); + } elsif ($tag eq '130') { # uniform title + @replace_them = qw/130 240 440 730 830/; + } elsif ($tag >= 150 and $tag <= 155) { # subjects + @replace_them = ($tag + 500); + } elsif ($tag >= 180 and $tag <= 185) { # floating subdivisions + @replace_them = qw/100 400 600 700 800 110 410 610 710 810 111 411 611 711 811 130 240 440 730 830 650 651 655/; + } else { + next; + } + + # Dummy up the bib-side data + $tmpl->append_fields( + map { + MARC::Field->new( $_, $i1, $i2, @data ) + } @replace_them + ); + + # Construct some 'replace' rules + push @rule_fields, map { $_ . $sf . '[0~\)' .$id . '$]' } @replace_them; + } + + # Insert the replace rules into the template + $tmpl->append_fields( + MARC::Field->new( '905' => ' ' => ' ' => 'r' => join(',', @rule_fields ) ) + ); + + $xml = $tmpl->as_xml_record; + $xml =~ s/^<\?.+?\?>$//mo; + $xml =~ s/\n//sgo; + $xml =~ s/>\s+ 'UTF-8'); + use MARC::Charset; + use strict; + + MARC::Charset->assume_unicode(1); + + my $target_xml = shift; + my $source_xml = shift; + my $field_spec = shift; + my $force_add = shift || 0; + + my $target_r = MARC::Record->new_from_xml( $target_xml ); + my $source_r = MARC::Record->new_from_xml( $source_xml ); + + return $target_xml unless ($target_r && $source_r); + + my @field_list = split(',', $field_spec); + + my %fields; + for my $f (@field_list) { + $f =~ s/^\s*//; $f =~ s/\s*$//; + if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) { + my $field = $1; + $field =~ s/\s+//; + my $sf = $2; + $sf =~ s/\s+//; + my $match = $3; + $match =~ s/^\s*//; $match =~ s/\s*$//; + $fields{$field} = { sf => [ split('', $sf) ] }; + if ($match) { + my ($msf,$mre) = split('~', $match); + if (length($msf) > 0 and length($mre) > 0) { + $msf =~ s/^\s*//; $msf =~ s/\s*$//; + $mre =~ s/^\s*//; $mre =~ s/\s*$//; + $fields{$field}{match} = { sf => $msf, re => qr/$mre/ }; + } + } + } + } + + for my $f ( keys %fields) { + if ( @{$fields{$f}{sf}} ) { + for my $from_field ($source_r->field( $f )) { + my @tos = $target_r->field( $f ); + if (!@tos) { + next if (exists($fields{$f}{match}) and !$force_add); + my @new_fields = map { $_->clone } $source_r->field( $f ); + $target_r->insert_fields_ordered( @new_fields ); + } else { + for my $to_field (@tos) { + if (exists($fields{$f}{match})) { + next unless (grep { $_ =~ $fields{$f}{match}{re} } $to_field->subfield($fields{$f}{match}{sf})); + } + my @new_sf = map { ($_ => $from_field->subfield($_)) } @{$fields{$f}{sf}}; + $to_field->add_subfields( @new_sf ); + } + } + } + } else { + my @new_fields = map { $_->clone } $source_r->field( $f ); + $target_r->insert_fields_ordered( @new_fields ); + } + } + + $target_xml = $target_r->as_xml_record; + $target_xml =~ s/^<\?.+?\?>$//mo; + $target_xml =~ s/\n//sgo; + $target_xml =~ s/>\s+ 'UTF8'); + use MARC::Charset; + use UUID::Tiny ':std'; + + MARC::Charset->assume_unicode(1); + + my $xml = shift() or return undef; + + my $r; + + # Prevent errors in XML parsing from blowing out ungracefully + eval { + $r = MARC::Record->new_from_xml( $xml ); + 1; + } or do { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + }; + + if (!$r) { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + } + + # From http://www.loc.gov/standards/sourcelist/subject.html + my $thes_code_map = { + a => 'lcsh', + b => 'lcshac', + c => 'mesh', + d => 'nal', + k => 'cash', + n => 'notapplicable', + r => 'aat', + s => 'sears', + v => 'rvm', + }; + + # Default to "No attempt to code" if the leader is horribly broken + my $fixed_field = $r->field('008'); + my $thes_char = '|'; + if ($fixed_field) { + $thes_char = substr($fixed_field->data(), 11, 1) || '|'; + } + + my $thes_code = 'UNDEFINED'; + + if ($thes_char eq 'z') { + # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html + $thes_code = $r->subfield('040', 'f') || 'UNDEFINED'; + } elsif ($thes_code_map->{$thes_char}) { + $thes_code = $thes_code_map->{$thes_char}; + } + + my $auth_txt = ''; + my $head = $r->field('1..'); + if ($head) { + # Concatenate all of these subfields together, prefixed by their code + # to prevent collisions along the lines of "Fiction, North Carolina" + foreach my $sf ($head->subfields()) { + $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1]; + } + } + + if ($auth_txt) { + my $stmt = spi_prepare('SELECT public.naco_normalize($1) AS norm_text', 'TEXT'); + my $result = spi_exec_prepared($stmt, $auth_txt); + my $norm_txt = $result->{rows}[0]->{norm_text}; + spi_freeplan($stmt); + undef($stmt); + return $head->tag() . "_" . $thes_code . " " . $norm_txt; + } + + return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml); +$func$ LANGUAGE 'plperlu' IMMUTABLE; + +CREATE OR REPLACE FUNCTION vandelay.strip_field ( xml TEXT, field TEXT ) RETURNS TEXT AS $_$ + + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF-8'); + use MARC::Charset; + use strict; + + MARC::Charset->assume_unicode(1); + + my $xml = shift; + my $r = MARC::Record->new_from_xml( $xml ); + + return $xml unless ($r); + + my $field_spec = shift; + my @field_list = split(',', $field_spec); + + my %fields; + for my $f (@field_list) { + $f =~ s/^\s*//; $f =~ s/\s*$//; + if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) { + my $field = $1; + $field =~ s/\s+//; + my $sf = $2; + $sf =~ s/\s+//; + my $match = $3; + $match =~ s/^\s*//; $match =~ s/\s*$//; + $fields{$field} = { sf => [ split('', $sf) ] }; + if ($match) { + my ($msf,$mre) = split('~', $match); + if (length($msf) > 0 and length($mre) > 0) { + $msf =~ s/^\s*//; $msf =~ s/\s*$//; + $mre =~ s/^\s*//; $mre =~ s/\s*$//; + $fields{$field}{match} = { sf => $msf, re => qr/$mre/ }; + } + } + } + } + + for my $f ( keys %fields) { + for my $to_field ($r->field( $f )) { + if (exists($fields{$f}{match})) { + next unless (grep { $_ =~ $fields{$f}{match}{re} } $to_field->subfield($fields{$f}{match}{sf})); + } + + if ( @{$fields{$f}{sf}} ) { + $to_field->delete_subfield(code => $fields{$f}{sf}); + } else { + $r->delete_field( $to_field ); + } + } + } + + $xml = $r->as_xml_record; + $xml =~ s/^<\?.+?\?>$//mo; + $xml =~ s/\n//sgo; + $xml =~ s/>\s+ 'UTF-8'); +use MARC::Charset; + +MARC::Charset->assume_unicode(1); + +my $xml = shift; +my $r = MARC::Record->new_from_xml( $xml ); + +return_next( { tag => 'LDR', value => $r->leader } ); + +for my $f ( $r->fields ) { + if ($f->is_control_field) { + return_next({ tag => $f->tag, value => $f->data }); + } else { + for my $s ($f->subfields) { + return_next({ + tag => $f->tag, + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => $s->[0], + value => $s->[1] + }); + + if ( $f->tag eq '245' and $s->[0] eq 'a' ) { + my $trim = $f->indicator(2) || 0; + return_next({ + tag => 'tnf', + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => 'a', + value => substr( $s->[1], $trim ) + }); + } + } + } +} + +return undef; + +$func$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION authority.flatten_marc ( TEXT ) RETURNS SETOF authority.full_rec AS $func$ + +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; + +MARC::Charset->assume_unicode(1); + +my $xml = shift; +my $r = MARC::Record->new_from_xml( $xml ); + +return_next( { tag => 'LDR', value => $r->leader } ); + +for my $f ( $r->fields ) { + if ($f->is_control_field) { + return_next({ tag => $f->tag, value => $f->data }); + } else { + for my $s ($f->subfields) { + return_next({ + tag => $f->tag, + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => $s->[0], + value => $s->[1] + }); + + } + } +} + +return undef; + +$func$ LANGUAGE PLPERLU; + +COMMIT; diff --git a/Open-ILS/src/support-scripts/authority_control_fields.pl.in b/Open-ILS/src/support-scripts/authority_control_fields.pl.in index 07c9f77f10..ad3cfde8be 100755 --- a/Open-ILS/src/support-scripts/authority_control_fields.pl.in +++ b/Open-ILS/src/support-scripts/authority_control_fields.pl.in @@ -19,6 +19,7 @@ use DBI; use Getopt::Long; use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; use OpenSRF::System; use OpenILS::Utils::Fieldmapper; use OpenSRF::Utils::SettingsClient; @@ -29,6 +30,8 @@ use OpenILS::Application::AppUtils; use Data::Dumper; use Pod::Usage qw/ pod2usage /; +MARC::Charset->assume_unicode(1); + my ($start_id, $end_id); my $bootstrap = '@sysconfdir@/opensrf_core.xml'; my @records; -- 2.43.2