From a0d07c885904e2e42850dae93691e22921176f23 Mon Sep 17 00:00:00 2001 From: Mike Rylander Date: Tue, 6 Mar 2012 13:24:51 -0500 Subject: [PATCH] Speed up autosuggest in large data environments The autosuggest infrastructure was assuming the the Postgres query planner would be able to cope with large datasets without any additional fiddling. Unfortunately, that proved to be untrue. We also needed a few indexing changes. * At the suggestion of Ben Shum, ignore the identifier search class for autosuggest. * Added indexes to all joined columns of metabib.browse_entry_def_map. * Switched from GIST to GIN indexing of metabib.browse_entry.index_vector because GIN, being an inverted index, is /much/ better for prefix matching which, in turn, is extremely important for browse and autosuggest. * Apply some reasonable sanity-checking limits on suggest queries. This means you can't use autosuggest as a reporting tool -- but that's OK because it's not one. Signed-off-by: Mike Rylander Signed-off-by: Lebbeous Fogle-Weekley --- Open-ILS/src/sql/Pg/002.schema.config.sql | 2 +- Open-ILS/src/sql/Pg/030.schema.metabib.sql | 58 ++++--- Open-ILS/src/sql/Pg/950.data.seed-values.sql | 56 +++--- ...80.schema.autosuggest-big_data-speedup.sql | 160 ++++++++++++++++++ 4 files changed, 226 insertions(+), 50 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0680.schema.autosuggest-big_data-speedup.sql diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 4c1bc1e89b..112a50b1a6 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -86,7 +86,7 @@ CREATE TRIGGER no_overlapping_deps BEFORE INSERT OR UPDATE ON config.db_patch_dependencies FOR EACH ROW EXECUTE PROCEDURE evergreen.array_overlap_check ('deprecates'); -INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0679', :eg_version); -- berick/miker +INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0680', :eg_version); -- miker/senator CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index b956ab8673..c3514ef32a 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -140,7 +140,7 @@ CREATE TABLE metabib.browse_entry ( value TEXT unique, index_vector tsvector ); -CREATE INDEX metabib_browse_entry_index_vector_idx ON metabib.browse_entry USING GIST (index_vector); +CREATE INDEX metabib_browse_entry_index_vector_idx ON metabib.browse_entry USING GIN (index_vector); CREATE TRIGGER metabib_browse_entry_fti_trigger BEFORE INSERT OR UPDATE ON metabib.browse_entry FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('keyword'); @@ -152,6 +152,10 @@ CREATE TABLE metabib.browse_entry_def_map ( def INT REFERENCES config.metabib_field (id), source BIGINT REFERENCES biblio.record_entry (id) ); +CREATE INDEX browse_entry_def_map_def_idx ON metabib.browse_entry_def_map (def); +CREATE INDEX browse_entry_def_map_entry_idx ON metabib.browse_entry_def_map (entry); +CREATE INDEX browse_entry_def_map_source_idx ON metabib.browse_entry_def_map (source); + CREATE OR REPLACE FUNCTION metabib.facet_normalize_trigger () RETURNS TRIGGER AS $$ @@ -1294,7 +1298,7 @@ BEGIN END IF; END IF; END; -$func$ LANGUAGE PLPGSQL; +$func$ LANGUAGE PLPGSQL ROWS 1; -- Given a string such as a user might type into a search box, prepare @@ -1372,7 +1376,7 @@ BEGIN IF visibility_org IS NOT NULL THEN opac_visibility_join := ' JOIN asset.opac_visible_copies aovc ON ( - aovc.record = mbedm.source AND + aovc.record = x.source AND aovc.circ_lib IN (SELECT id FROM actor.org_unit_descendants($4)) )'; ELSE @@ -1420,24 +1424,36 @@ BEGIN '; END IF; - RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT - mbe.value, - cmf.id, - cmc.buoyant AND _registered.field_class IS NOT NULL, - _registered.field = cmf.id, - cmf.weight, - TS_RANK_CD(mbe.index_vector, $1, $6), - cmc.buoyant - FROM metabib.browse_entry_def_map mbedm - JOIN metabib.browse_entry mbe ON (mbe.id = mbedm.entry) - JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) - JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name) - ' || search_class_join || opac_visibility_join || - ' WHERE $1 @@ mbe.index_vector - ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC - LIMIT $5) x - ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC - ' -- sic, repeat the order by clause in the outer select too + RETURN QUERY EXECUTE ' +SELECT DISTINCT + x.value, + x.id, + x.push, + x.restrict, + x.weight, + x.ts_rank_cd, + x.buoyant, + TS_HEADLINE(value, $7, $3) + FROM (SELECT DISTINCT + mbe.value, + cmf.id, + cmc.buoyant AND _registered.field_class IS NOT NULL AS push, + _registered.field = cmf.id AS restrict, + cmf.weight, + TS_RANK_CD(mbe.index_vector, $1, $6), + cmc.buoyant, + mbedm.source + FROM metabib.browse_entry_def_map mbedm + JOIN (SELECT * FROM metabib.browse_entry WHERE index_vector @@ $1 LIMIT 10000) mbe ON (mbe.id = mbedm.entry) + JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) + JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name) + ' || search_class_join || ' + ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC + LIMIT 1000) AS x + ' || opac_visibility_join || ' + ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC + LIMIT $5 +' -- sic, repeat the order by clause in the outer select too USING query, search_class, headline_opts, visibility_org, query_limit, normalization, plain_query diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql index 0aacca1817..48964754e4 100644 --- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql +++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql @@ -146,34 +146,34 @@ INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES (16, 'subject', 'complete', oils_i18n_gettext(16, 'All Subjects', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject$$, FALSE ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (17, 'identifier', 'accession', oils_i18n_gettext(17, 'Accession Number', 'cmf', 'label'), 'marcxml', $$//marc:controlfield[@tag='001']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (18, 'identifier', 'isbn', oils_i18n_gettext(18, 'ISBN', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='020']/marc:subfield[@code='a' or @code='z']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (19, 'identifier', 'issn', oils_i18n_gettext(19, 'ISSN', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='022']/marc:subfield[@code='a' or @code='z']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (20, 'identifier', 'upc', oils_i18n_gettext(20, 'UPC', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='1']/marc:subfield[@code='a' or @code='z']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (21, 'identifier', 'ismn', oils_i18n_gettext(21, 'ISMN', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='2']/marc:subfield[@code='a' or @code='z']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (22, 'identifier', 'ean', oils_i18n_gettext(22, 'EAN', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='3']/marc:subfield[@code='a' or @code='z']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (23, 'identifier', 'isrc', oils_i18n_gettext(23, 'ISRC', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='0']/marc:subfield[@code='a' or @code='z']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (24, 'identifier', 'sici', oils_i18n_gettext(24, 'SICI', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='4']/marc:subfield[@code='a' or @code='z']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (25, 'identifier', 'bibcn', oils_i18n_gettext(25, 'Local Free-Text Call Number', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='099']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (26, 'identifier', 'tcn', oils_i18n_gettext(26, 'Title Control Number', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='901']/marc:subfield[@code='a']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (27, 'identifier', 'bibid', oils_i18n_gettext(27, 'Internal ID', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='901']/marc:subfield[@code='c']$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, search_field, facet_field) VALUES - (28, 'identifier', 'authority_id', oils_i18n_gettext(28, 'Authority Record ID', 'cmf', 'label'), 'marcxml', '//marc:datafield/marc:subfield[@code="0"]', FALSE, TRUE); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath) VALUES - (29, 'identifier', 'scn', oils_i18n_gettext(29, 'System Control Number', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='035']/marc:subfield[@code="a"]$$); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath) VALUES - (30, 'identifier', 'lccn', oils_i18n_gettext(30, 'LC Control Number', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='010']/marc:subfield[@code="a" or @code='z']$$); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (17, 'identifier', 'accession', oils_i18n_gettext(17, 'Accession Number', 'cmf', 'label'), 'marcxml', $$//marc:controlfield[@tag='001']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (18, 'identifier', 'isbn', oils_i18n_gettext(18, 'ISBN', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='020']/marc:subfield[@code='a' or @code='z']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (19, 'identifier', 'issn', oils_i18n_gettext(19, 'ISSN', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='022']/marc:subfield[@code='a' or @code='z']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (20, 'identifier', 'upc', oils_i18n_gettext(20, 'UPC', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='1']/marc:subfield[@code='a' or @code='z']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (21, 'identifier', 'ismn', oils_i18n_gettext(21, 'ISMN', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='2']/marc:subfield[@code='a' or @code='z']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (22, 'identifier', 'ean', oils_i18n_gettext(22, 'EAN', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='3']/marc:subfield[@code='a' or @code='z']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (23, 'identifier', 'isrc', oils_i18n_gettext(23, 'ISRC', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='0']/marc:subfield[@code='a' or @code='z']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (24, 'identifier', 'sici', oils_i18n_gettext(24, 'SICI', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='024' and @ind1='4']/marc:subfield[@code='a' or @code='z']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (25, 'identifier', 'bibcn', oils_i18n_gettext(25, 'Local Free-Text Call Number', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='099']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (26, 'identifier', 'tcn', oils_i18n_gettext(26, 'Title Control Number', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='901']/marc:subfield[@code='a']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field ) VALUES + (27, 'identifier', 'bibid', oils_i18n_gettext(27, 'Internal ID', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='901']/marc:subfield[@code='c']$$, FALSE ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, search_field, facet_field, browse_field) VALUES + (28, 'identifier', 'authority_id', oils_i18n_gettext(28, 'Authority Record ID', 'cmf', 'label'), 'marcxml', '//marc:datafield/marc:subfield[@code="0"]', FALSE, TRUE, FALSE); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field) VALUES + (29, 'identifier', 'scn', oils_i18n_gettext(29, 'System Control Number', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='035']/marc:subfield[@code="a"]$$, FALSE); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, browse_field) VALUES + (30, 'identifier', 'lccn', oils_i18n_gettext(30, 'LC Control Number', 'cmf', 'label'), 'marcxml', $$//marc:datafield[@tag='010']/marc:subfield[@code="a" or @code='z']$$, FALSE); SELECT SETVAL('config.metabib_field_id_seq'::TEXT, (SELECT MAX(id) FROM config.metabib_field), TRUE); diff --git a/Open-ILS/src/sql/Pg/upgrade/0680.schema.autosuggest-big_data-speedup.sql b/Open-ILS/src/sql/Pg/upgrade/0680.schema.autosuggest-big_data-speedup.sql new file mode 100644 index 0000000000..126c4f1273 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0680.schema.autosuggest-big_data-speedup.sql @@ -0,0 +1,160 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('0680', :eg_version); + +-- Not much use in having identifier-class fields be suggestions. Credit for the idea goes to Ben Shum. +UPDATE config.metabib_field SET browse_field = FALSE WHERE id < 100 AND field_class = 'identifier'; + + +--------------------------------------------------------------------------- +-- The rest of this was tested on Evergreen Indiana's dev server, which has +-- a large data set of 2.6M bibs, and was instrumental in sussing out the +-- needed adjustments. Thanks, EG-IN! +--------------------------------------------------------------------------- + +-- GIN indexes are /much/ better for prefix matching, which is important for browse and autosuggest +DROP INDEX metabib.metabib_browse_entry_index_vector_idx; +CREATE INDEX metabib_browse_entry_index_vector_idx ON metabib.browse_entry USING GIN (index_vector); + + +-- We need thes to make the autosuggest limiting joins fast +CREATE INDEX browse_entry_def_map_def_idx ON metabib.browse_entry_def_map (def); +CREATE INDEX browse_entry_def_map_entry_idx ON metabib.browse_entry_def_map (entry); +CREATE INDEX browse_entry_def_map_source_idx ON metabib.browse_entry_def_map (source); + +-- In practice this will always be ~1 row, and the default of 1000 causes terrible plans +ALTER FUNCTION metabib.search_class_to_registered_components(text) ROWS 1; + +-- Reworking of the generated query to act in a sane manner in the face of large datasets +CREATE OR REPLACE + FUNCTION metabib.suggest_browse_entries( + raw_query_text TEXT, -- actually typed by humans at the UI level + search_class TEXT, -- 'alias' or 'class' or 'class|field..', etc + headline_opts TEXT, -- markup options for ts_headline() + visibility_org INTEGER,-- null if you don't want opac visibility test + query_limit INTEGER,-- use in LIMIT clause of interal query + normalization INTEGER -- argument to TS_RANK_CD() + ) RETURNS TABLE ( + value TEXT, -- plain + field INTEGER, + buoyant_and_class_match BOOL, + field_match BOOL, + field_weight INTEGER, + rank REAL, + buoyant BOOL, + match TEXT -- marked up + ) AS $func$ +DECLARE + prepared_query_texts TEXT[]; + query TSQUERY; + plain_query TSQUERY; + opac_visibility_join TEXT; + search_class_join TEXT; + r_fields RECORD; +BEGIN + prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text); + + query := TO_TSQUERY('keyword', prepared_query_texts[1]); + plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]); + + IF visibility_org IS NOT NULL THEN + opac_visibility_join := ' + JOIN asset.opac_visible_copies aovc ON ( + aovc.record = x.source AND + aovc.circ_lib IN (SELECT id FROM actor.org_unit_descendants($4)) + )'; + ELSE + opac_visibility_join := ''; + END IF; + + -- The following determines whether we only provide suggestsons matching + -- the user's selected search_class, or whether we show other suggestions + -- too. The reason for MIN() is that for search_classes like + -- 'title|proper|uniform' you would otherwise get multiple rows. The + -- implication is that if title as a class doesn't have restrict, + -- nor does the proper field, but the uniform field does, you're going + -- to get 'false' for your overall evaluation of 'should we restrict?' + -- To invert that, change from MIN() to MAX(). + + SELECT + INTO r_fields + MIN(cmc.restrict::INT) AS restrict_class, + MIN(cmf.restrict::INT) AS restrict_field + FROM metabib.search_class_to_registered_components(search_class) + AS _registered (field_class TEXT, field INT) + JOIN + config.metabib_class cmc ON (cmc.name = _registered.field_class) + LEFT JOIN + config.metabib_field cmf ON (cmf.id = _registered.field); + + -- evaluate 'should we restrict?' + IF r_fields.restrict_field::BOOL OR r_fields.restrict_class::BOOL THEN + search_class_join := ' + JOIN + metabib.search_class_to_registered_components($2) + AS _registered (field_class TEXT, field INT) ON ( + (_registered.field IS NULL AND + _registered.field_class = cmf.field_class) OR + (_registered.field = cmf.id) + ) + '; + ELSE + search_class_join := ' + LEFT JOIN + metabib.search_class_to_registered_components($2) + AS _registered (field_class TEXT, field INT) ON ( + _registered.field_class = cmc.name + ) + '; + END IF; + + RETURN QUERY EXECUTE ' +SELECT DISTINCT + x.value, + x.id, + x.push, + x.restrict, + x.weight, + x.ts_rank_cd, + x.buoyant, + TS_HEADLINE(value, $7, $3) + FROM (SELECT DISTINCT + mbe.value, + cmf.id, + cmc.buoyant AND _registered.field_class IS NOT NULL AS push, + _registered.field = cmf.id AS restrict, + cmf.weight, + TS_RANK_CD(mbe.index_vector, $1, $6), + cmc.buoyant, + mbedm.source + FROM metabib.browse_entry_def_map mbedm + + -- Start with a pre-limited set of 10k possible suggestions. More than that is not going to be useful anyway + JOIN (SELECT * FROM metabib.browse_entry WHERE index_vector @@ $1 LIMIT 10000) mbe ON (mbe.id = mbedm.entry) + + JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) + JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name) + ' || search_class_join || ' + ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC + LIMIT 1000) AS x -- This outer limit makes testing for opac visibility usably fast + ' || opac_visibility_join || ' + ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC + LIMIT $5 +' -- sic, repeat the order by clause in the outer select too + USING + query, search_class, headline_opts, + visibility_org, query_limit, normalization, plain_query + ; + + -- sort order: + -- buoyant AND chosen class = match class + -- chosen field = match field + -- field weight + -- rank + -- buoyancy + -- value itself + +END; +$func$ LANGUAGE PLPGSQL; + +COMMIT; -- 2.43.2