From cacb6861baa23d622a36b8b0240b6b96f2b291d1 Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Fri, 28 Oct 2016 13:00:45 -0400 Subject: [PATCH] LP#1528901: avoid accidental bib fingerprint collisions This patch fixes a problem where the bib fingerprint algorithm could end up putting completely different works in the same metarecord. For example, 100 $a Steel, Danielle 245 $a Blue and *no 1XX 245 $a Blue steel previously (with stock config.biblio_fingerprint settings) got a fingerprint of "bluesteel". With this patch, their fingerprints are now: "Title:blue Author:steel" and "Title:bluesteel Author:" The upgrade script supplied with this patch remaps the metarecords after updating the fingerprints. While existing metarecord holds may get moved, note that there is no known way of ensuring that a metarecord hold placed on a collided metarecord will end up attach to whatever work the patron intended to request. To test: [1] Add records for "Blue" and "Blue steel". [2] Note that they end up on the same metarecord. [3] Apply the patch and perform the update. [4] The two bibs should now be on separate metarecords. Signed-off-by: Galen Charlton Signed-off-by: Rogan Hamby Signed-off-by: Kathy Lussier Signed-off-by: Mike Rylander --- Open-ILS/src/sql/Pg/030.schema.metabib.sql | 5 +- .../t/lp1528901_more_precise_fingerprints.pg | 50 +++++++++++ .../XXXX.schema.update_fingerprinting.sql | 86 +++++++++++++++++++ 3 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/t/lp1528901_more_precise_fingerprints.pg create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.update_fingerprinting.sql diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 71bab563bf..f79d09e3a8 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -1073,11 +1073,12 @@ BEGIN raw_text := REGEXP_REPLACE(raw_text, E'^(\\w+).*?$', E'\\1'); END IF; - output_text := output_text || REGEXP_REPLACE(raw_text, E'\\s+', '', 'g'); + output_text := output_text || idx.name || ':' || + REGEXP_REPLACE(raw_text, E'\\s+', '', 'g') || ' '; END LOOP; - RETURN output_text; + RETURN BTRIM(output_text); END; $func$ LANGUAGE PLPGSQL; diff --git a/Open-ILS/src/sql/Pg/t/lp1528901_more_precise_fingerprints.pg b/Open-ILS/src/sql/Pg/t/lp1528901_more_precise_fingerprints.pg new file mode 100644 index 0000000000..bb06244d23 --- /dev/null +++ b/Open-ILS/src/sql/Pg/t/lp1528901_more_precise_fingerprints.pg @@ -0,0 +1,50 @@ +-- tests to verify biblio fingerprints avoid conflating +-- works where words coming from titles and authors might overlap +BEGIN; + +SELECT plan(1); + +INSERT INTO biblio.record_entry (last_xact_id, marc) VALUES ( + 'bib-fingerprint-test-1', + $record$ + 02137cam a2200457 a 4500 + ocn694080497 + 20160729104757.0 + 101217s2011 txu b 001 0 eng + + Jasper, Frances + + + Gzarniblat + +$record$); + +INSERT INTO biblio.record_entry (last_xact_id, marc) VALUES ( + 'bib-fingerprint-test-2', + $record$ + 02137cam a2200457 a 4500 + ocn694080497 + 20160729104757.0 + 101217s2011 txu b 001 0 eng + + Gzarniblat Jasper + +$record$); + +SELECT results_ne( + $$ + SELECT metarecord FROM metabib.metarecord_source_map + WHERE source = ( + SELECT id FROM biblio.record_entry WHERE last_xact_id = 'bib-fingerprint-test-1' + ) + $$, + $$ + SELECT metarecord FROM metabib.metarecord_source_map + WHERE source = ( + SELECT id FROM biblio.record_entry WHERE last_xact_id = 'bib-fingerprint-test-2' + ) + $$, + 'LP#1528901: same words in title and author do not stick different bibs in same metarecord' +); + +ROLLBACK; diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.update_fingerprinting.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.update_fingerprinting.sql new file mode 100644 index 0000000000..2eb5ac889e --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.update_fingerprinting.sql @@ -0,0 +1,86 @@ +BEGIN; + +--- SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); + +CREATE OR REPLACE FUNCTION biblio.extract_fingerprint ( marc text ) RETURNS TEXT AS $func$ +DECLARE + idx config.biblio_fingerprint%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + xml_node TEXT; + xml_node_list TEXT[]; + raw_text TEXT; + output_text TEXT := ''; +BEGIN + + IF marc IS NULL OR marc = '' THEN + RETURN NULL; + END IF; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM config.biblio_fingerprint ORDER BY format, id LOOP + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(marc,xfrm.xslt); + ELSE + transformed_xml := marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + raw_text := COALESCE( + naco_normalize( + ARRAY_TO_STRING( + oils_xpath( + '//text()', + (oils_xpath( + idx.xpath, + transformed_xml, + ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] + ))[1] + ), + '' + ) + ), + '' + ); + + raw_text := REGEXP_REPLACE(raw_text, E'\\[.+?\\]', E''); + raw_text := REGEXP_REPLACE(raw_text, E'\\mthe\\M|\\man?d?d\\M', E'', 'g'); -- arg! the pain! + + IF idx.first_word IS TRUE THEN + raw_text := REGEXP_REPLACE(raw_text, E'^(\\w+).*?$', E'\\1'); + END IF; + + output_text := output_text || idx.name || ':' || + REGEXP_REPLACE(raw_text, E'\\s+', '', 'g') || ' '; + + END LOOP; + + RETURN BTRIM(output_text); + +END; +$func$ LANGUAGE PLPGSQL; + +COMMIT; + +\qecho Recalculating bib fingerprints +ALTER TABLE biblio.record_entry DISABLE TRIGGER USER; +UPDATE biblio.record_entry SET fingerprint = biblio.extract_fingerprint(marc) WHERE NOT deleted; +ALTER TABLE biblio.record_entry ENABLE TRIGGER USER; + +SELECT metabib.remap_metarecord_for_bib(id, fingerprint) +FROM biblio.record_entry +WHERE NOT deleted; + +\qecho Remapping metarecords +SELECT metabib.remap_metarecord_for_bib(id, fingerprint) +FROM biblio.record_entry +WHERE NOT deleted; -- 2.43.2