From a67ad7f9a2419301e16d79a078a4a41dcf36d788 Mon Sep 17 00:00:00 2001 From: Mike Rylander Date: Mon, 7 Jun 2021 15:57:32 -0400 Subject: [PATCH] LP#1931162: DYM needs some situational optimization For some data sets and some queries the Did You Mean search suggestion logic can be much too slow. This is mainly in cases where a "misspelled" word of sufficient length greater than the symspell prefix length is checked against many short prefixes that have many long suggestions attached to them. This commit optimizes for that case in particular by testing the length of suggestions and prefix keys against the user input to avoid unnecessary tests. Futher, it captures the edit distance of suggestions that pass that test in-line, avoiding expensive retesting, and caches the short-cutoff edit distance when in low-verbosity mode to avoid future different-but-not-too-different suggestions coming from the same prefix key. It additionally provides a general optimization by batching the capture of suggest counts to avoid per-suggestion secondary lookups, and a micro-optimization of ordering suggestions by length at distance cache time. Signed-off-by: Mike Rylander Signed-off-by: Shula Link Signed-off-by: Jason Boyer --- .../src/sql/Pg/300.schema.staged_search.sql | 205 +++++++------- ...XXX.function.did_you_mean_optimization.sql | 268 ++++++++++++++++++ 2 files changed, 375 insertions(+), 98 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.function.did_you_mean_optimization.sql diff --git a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql index eba7b96d46..ece24083ba 100644 --- a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql +++ b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql @@ -1587,19 +1587,21 @@ CREATE TYPE search.symspell_lookup_output AS ( word_pos INT ); -CREATE OR REPLACE FUNCTION search.symspell_lookup ( - raw_input TEXT, - search_class TEXT, - verbosity INT DEFAULT 2, - xfer_case BOOL DEFAULT FALSE, - count_threshold INT DEFAULT 1, - soundex_weight INT DEFAULT 0, - pg_trgm_weight INT DEFAULT 0, - kbdist_weight INT DEFAULT 0 -) RETURNS SETOF search.symspell_lookup_output AS $F$ + +CREATE OR REPLACE FUNCTION search.symspell_lookup( + raw_input text, + search_class text, + verbosity integer DEFAULT 2, + xfer_case boolean DEFAULT false, + count_threshold integer DEFAULT 1, + soundex_weight integer DEFAULT 0, + pg_trgm_weight integer DEFAULT 0, + kbdist_weight integer DEFAULT 0 +) RETURNS SETOF search.symspell_lookup_output LANGUAGE plpgsql AS $function$ DECLARE prefix_length INT; maxED INT; + good_suggs HSTORE; word_list TEXT[]; edit_list TEXT[] := '{}'; seen_list TEXT[] := '{}'; @@ -1614,6 +1616,8 @@ DECLARE w_pos INT := -1; smallest_ed INT := -1; global_ed INT; + i_len INT; + l_maxED INT; BEGIN SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled; prefix_length := COALESCE(prefix_length, 6); @@ -1631,8 +1635,8 @@ BEGIN prefix_key FROM search.symspell_dictionary WHERE prefix_key = $1 - AND '||search_class||'_count >= $2 - AND '||search_class||'_suggestions @> ARRAY[$1]' + AND '||search_class||'_count >= $2 + AND '||search_class||'_suggestions @> ARRAY[$1]' INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1); IF entry.prefix_key IS NOT NULL THEN output.lev_distance := 0; -- definitionally @@ -1658,12 +1662,14 @@ BEGIN FOREACH word IN ARRAY word_list LOOP w_pos := w_pos + 1; input := evergreen.lowercase(word); + i_len := CHARACTER_LENGTH(input); + l_maxED := maxED; IF CHARACTER_LENGTH(input) > prefix_length THEN prefix_key := SUBSTRING(input FROM 1 FOR prefix_length); - edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, maxED); + edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED); ELSE - edit_list := input || search.symspell_generate_edits(input, 1, maxED); + edit_list := input || search.symspell_generate_edits(input, 1, l_maxED); END IF; SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x; @@ -1678,106 +1684,109 @@ BEGIN IF global_ed IS NOT NULL THEN smallest_ed := global_ed; END IF; + FOR entry IN EXECUTE 'SELECT '||search_class||'_suggestions AS suggestions, '||search_class||'_count AS count, prefix_key FROM search.symspell_dictionary WHERE prefix_key = $1 - AND '||search_class||'_suggestions IS NOT NULL' + AND '||search_class||'_suggestions IS NOT NULL' USING entry_key LOOP - FOREACH sugg IN ARRAY entry.suggestions LOOP - IF NOT seen_list @> ARRAY[sugg] THEN - seen_list := seen_list || sugg; - IF input = sugg THEN -- exact match, no need to spend time on a call - output.lev_distance := 0; - output.suggestion_count = entry.count; - ELSIF ABS(CHARACTER_LENGTH(input) - CHARACTER_LENGTH(sugg)) > maxED THEN - -- They are definitionally too different to consider, just move on. - CONTINUE; - ELSE - --output.lev_distance := levenshtein_less_equal( - output.lev_distance := evergreen.levenshtein_damerau_edistance( - input, - sugg, - maxED - ); - IF output.lev_distance < 0 THEN - -- The Perl module returns -1 for "more distant than max". - output.lev_distance := maxED + 1; - -- This short-circuit's the count test below for speed, bypassing - -- a couple useless tests. - output.suggestion_count := -1; - ELSE - EXECUTE 'SELECT '||search_class||'_count FROM search.symspell_dictionary WHERE prefix_key = $1' - INTO output.suggestion_count USING sugg; - END IF; - END IF; - -- The caller passes a minimum suggestion count threshold (or uses - -- the default of 0) and if the suggestion has that many or less uses - -- then we move on to the next suggestion, since this one is too rare. - CONTINUE WHEN output.suggestion_count < COALESCE(count_threshold,1); + SELECT HSTORE( + ARRAY_AGG( + ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT] + ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC + ) + ) + INTO good_suggs + FROM UNNEST(entry.suggestions) s + WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxEd AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED) + AND NOT seen_list @> ARRAY[s]; + + CONTINUE WHEN good_suggs IS NULL; + + FOR sugg, output.suggestion_count IN EXECUTE + 'SELECT prefix_key, '||search_class||'_count + FROM search.symspell_dictionary + WHERE prefix_key = ANY ($1) + AND '||search_class||'_count >= $2' + USING AKEYS(good_suggs), COALESCE(count_threshold,1) + LOOP + + output.lev_distance := good_suggs->sugg; + seen_list := seen_list || sugg; + + -- Track the smallest edit distance among suggestions from this prefix key. + IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN + smallest_ed := output.lev_distance; + END IF; - -- Track the smallest edit distance among suggestions from this prefix key. - IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN - smallest_ed := output.lev_distance; + -- Track the smallest edit distance for all prefix keys for this word. + IF global_ed IS NULL OR smallest_ed < global_ed THEN + global_ed = smallest_ed; + -- And if low verbosity, ignore suggs with a larger distance from here on. + IF verbosity <= 1 THEN + l_maxED := global_ed; END IF; + END IF; - -- Track the smallest edit distance for all prefix keys for this word. - IF global_ed IS NULL OR smallest_ed < global_ed THEN - global_ed = smallest_ed; + -- Lev distance is our main similarity measure. While + -- trgm or soundex similarity could be the main filter, + -- Lev is both language agnostic and faster. + -- + -- Here we will skip suggestions that have a longer edit distance + -- than the shortest we've already found. This is simply an + -- optimization that allows us to avoid further processing + -- of this entry. It would be filtered out later. + CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1; + + -- If we have an exact match on the suggestion key we can also avoid + -- some function calls. + IF output.lev_distance = 0 THEN + output.qwerty_kb_match := 1; + output.pg_trgm_sim := 1; + output.soundex_sim := 1; + ELSE + IF kbdist_weight THEN + output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg); + ELSE + output.qwerty_kb_match := 0; END IF; + IF pg_trgm_weight THEN + output.pg_trgm_sim := similarity(input, sugg); + ELSE + output.pg_trgm_sim := 0; + END IF; + IF soundex_weight THEN + output.soundex_sim := difference(input, sugg) / 4.0; + ELSE + output.soundex_sim := 0; + END IF; + END IF; - -- Only proceed if the edit distance is <= the max for the dictionary. - IF output.lev_distance <= maxED THEN - IF output.lev_distance > global_ed AND verbosity <= 1 THEN - -- Lev distance is our main similarity measure. While - -- trgm or soundex similarity could be the main filter, - -- Lev is both language agnostic and faster. - -- - -- Here we will skip suggestions that have a longer edit distance - -- than the shortest we've already found. This is simply an - -- optimization that allows us to avoid further processing - -- of this entry. It would be filtered out later. + -- Fill in some fields + IF xfer_case AND input <> word THEN + output.suggestion := search.symspell_transfer_casing(word, sugg); + ELSE + output.suggestion := sugg; + END IF; + output.prefix_key := entry.prefix_key; + output.prefix_key_count := entry.count; + output.input := word; + output.norm_input := input; + output.word_pos := w_pos; - CONTINUE; - END IF; + -- We can't "cache" a set of generated records directly, so + -- here we build up an array of search.symspell_lookup_output + -- records that we can revivicate later as a table using UNNEST(). + output_list := output_list || output; - -- If we have an exact match on the suggestion key we can also avoid - -- some function calls. - IF output.lev_distance = 0 THEN - output.qwerty_kb_match := 1; - output.pg_trgm_sim := 1; - output.soundex_sim := 1; - ELSE - output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg); - output.pg_trgm_sim := similarity(input, sugg); - output.soundex_sim := difference(input, sugg) / 4.0; - END IF; + EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit + CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key - -- Fill in some fields - IF xfer_case THEN - output.suggestion := search.symspell_transfer_casing(word, sugg); - ELSE - output.suggestion := sugg; - END IF; - output.prefix_key := entry.prefix_key; - output.prefix_key_count := entry.count; - output.input := word; - output.norm_input := input; - output.word_pos := w_pos; - - -- We can't "cache" a set of generated records directly, so - -- here we build up an array of search.symspell_lookup_output - -- records that we can revivicate later as a table using UNNEST(). - output_list := output_list || output; - - EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit - CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key - END IF; -- maxED test - END IF; -- suggestion not seen test END LOOP; -- loop over suggestions END LOOP; -- loop over entries END LOOP; -- loop over entry_keys @@ -1837,7 +1846,7 @@ BEGIN END IF; END LOOP; -- loop over words END; -$F$ LANGUAGE PLPGSQL; +$function$; CREATE OR REPLACE FUNCTION search.symspell_build_raw_entry ( raw_input TEXT, diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.function.did_you_mean_optimization.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.function.did_you_mean_optimization.sql new file mode 100644 index 0000000000..a72c76f709 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.function.did_you_mean_optimization.sql @@ -0,0 +1,268 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); + +CREATE OR REPLACE FUNCTION search.symspell_lookup( + raw_input text, + search_class text, + verbosity integer DEFAULT 2, + xfer_case boolean DEFAULT false, + count_threshold integer DEFAULT 1, + soundex_weight integer DEFAULT 0, + pg_trgm_weight integer DEFAULT 0, + kbdist_weight integer DEFAULT 0 +) RETURNS SETOF search.symspell_lookup_output + LANGUAGE plpgsql +AS $function$ +DECLARE + prefix_length INT; + maxED INT; + good_suggs HSTORE; + word_list TEXT[]; + edit_list TEXT[] := '{}'; + seen_list TEXT[] := '{}'; + output search.symspell_lookup_output; + output_list search.symspell_lookup_output[]; + entry RECORD; + entry_key TEXT; + prefix_key TEXT; + sugg TEXT; + input TEXT; + word TEXT; + w_pos INT := -1; + smallest_ed INT := -1; + global_ed INT; + i_len INT; + l_maxED INT; +BEGIN + SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled; + prefix_length := COALESCE(prefix_length, 6); + + SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled; + maxED := COALESCE(maxED, 3); + + word_list := ARRAY_AGG(x) FROM search.symspell_parse_words(raw_input) x; + + -- Common case exact match test for preformance + IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN + EXECUTE + 'SELECT '||search_class||'_suggestions AS suggestions, + '||search_class||'_count AS count, + prefix_key + FROM search.symspell_dictionary + WHERE prefix_key = $1 + AND '||search_class||'_count >= $2 + AND '||search_class||'_suggestions @> ARRAY[$1]' + INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1); + IF entry.prefix_key IS NOT NULL THEN + output.lev_distance := 0; -- definitionally + output.prefix_key := entry.prefix_key; + output.prefix_key_count := entry.count; + output.suggestion_count := entry.count; + output.input := word_list[1]; + IF xfer_case THEN + output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key); + ELSE + output.suggestion := entry.prefix_key; + END IF; + output.norm_input := entry.prefix_key; + output.qwerty_kb_match := 1; + output.pg_trgm_sim := 1; + output.soundex_sim := 1; + RETURN NEXT output; + RETURN; + END IF; + END IF; + + <> + FOREACH word IN ARRAY word_list LOOP + w_pos := w_pos + 1; + input := evergreen.lowercase(word); + i_len := CHARACTER_LENGTH(input); + l_maxED := maxED; + + IF CHARACTER_LENGTH(input) > prefix_length THEN + prefix_key := SUBSTRING(input FROM 1 FOR prefix_length); + edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED); + ELSE + edit_list := input || search.symspell_generate_edits(input, 1, l_maxED); + END IF; + + SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x; + + output_list := '{}'; + seen_list := '{}'; + global_ed := NULL; + + <> + FOREACH entry_key IN ARRAY edit_list LOOP + smallest_ed := -1; + IF global_ed IS NOT NULL THEN + smallest_ed := global_ed; + END IF; + + FOR entry IN EXECUTE + 'SELECT '||search_class||'_suggestions AS suggestions, + '||search_class||'_count AS count, + prefix_key + FROM search.symspell_dictionary + WHERE prefix_key = $1 + AND '||search_class||'_suggestions IS NOT NULL' + USING entry_key + LOOP + + SELECT HSTORE( + ARRAY_AGG( + ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT] + ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC + ) + ) + INTO good_suggs + FROM UNNEST(entry.suggestions) s + WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxEd AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED) + AND NOT seen_list @> ARRAY[s]; + + CONTINUE WHEN good_suggs IS NULL; + + FOR sugg, output.suggestion_count IN EXECUTE + 'SELECT prefix_key, '||search_class||'_count + FROM search.symspell_dictionary + WHERE prefix_key = ANY ($1) + AND '||search_class||'_count >= $2' + USING AKEYS(good_suggs), COALESCE(count_threshold,1) + LOOP + + output.lev_distance := good_suggs->sugg; + seen_list := seen_list || sugg; + + -- Track the smallest edit distance among suggestions from this prefix key. + IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN + smallest_ed := output.lev_distance; + END IF; + + -- Track the smallest edit distance for all prefix keys for this word. + IF global_ed IS NULL OR smallest_ed < global_ed THEN + global_ed = smallest_ed; + -- And if low verbosity, ignore suggs with a larger distance from here on. + IF verbosity <= 1 THEN + l_maxED := global_ed; + END IF; + END IF; + + -- Lev distance is our main similarity measure. While + -- trgm or soundex similarity could be the main filter, + -- Lev is both language agnostic and faster. + -- + -- Here we will skip suggestions that have a longer edit distance + -- than the shortest we've already found. This is simply an + -- optimization that allows us to avoid further processing + -- of this entry. It would be filtered out later. + CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1; + + -- If we have an exact match on the suggestion key we can also avoid + -- some function calls. + IF output.lev_distance = 0 THEN + output.qwerty_kb_match := 1; + output.pg_trgm_sim := 1; + output.soundex_sim := 1; + ELSE + IF kbdist_weight THEN + output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg); + ELSE + output.qwerty_kb_match := 0; + END IF; + IF pg_trgm_weight THEN + output.pg_trgm_sim := similarity(input, sugg); + ELSE + output.pg_trgm_sim := 0; + END IF; + IF soundex_weight THEN + output.soundex_sim := difference(input, sugg) / 4.0; + ELSE + output.soundex_sim := 0; + END IF; + END IF; + + -- Fill in some fields + IF xfer_case AND input <> word THEN + output.suggestion := search.symspell_transfer_casing(word, sugg); + ELSE + output.suggestion := sugg; + END IF; + output.prefix_key := entry.prefix_key; + output.prefix_key_count := entry.count; + output.input := word; + output.norm_input := input; + output.word_pos := w_pos; + + -- We can't "cache" a set of generated records directly, so + -- here we build up an array of search.symspell_lookup_output + -- records that we can revivicate later as a table using UNNEST(). + output_list := output_list || output; + + EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit + CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key + + END LOOP; -- loop over suggestions + END LOOP; -- loop over entries + END LOOP; -- loop over entry_keys + + -- Now we're done examining this word + IF verbosity = 0 THEN + -- Return the "best" suggestion from the smallest edit + -- distance group. We define best based on the weighting + -- of the non-lev similarity measures and use the suggestion + -- use count to break ties. + RETURN QUERY + SELECT * FROM UNNEST(output_list) + ORDER BY lev_distance, + (soundex_sim * COALESCE(soundex_weight,0)) + + (pg_trgm_sim * COALESCE(pg_trgm_weight,0)) + + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC, + suggestion_count DESC + LIMIT 1; + ELSIF verbosity = 1 THEN + -- Return all suggestions from the smallest + -- edit distance group. + RETURN QUERY + SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed + ORDER BY (soundex_sim * COALESCE(soundex_weight,0)) + + (pg_trgm_sim * COALESCE(pg_trgm_weight,0)) + + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC, + suggestion_count DESC; + ELSIF verbosity = 2 THEN + -- Return everything we find, along with relevant stats + RETURN QUERY + SELECT * FROM UNNEST(output_list) + ORDER BY lev_distance, + (soundex_sim * COALESCE(soundex_weight,0)) + + (pg_trgm_sim * COALESCE(pg_trgm_weight,0)) + + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC, + suggestion_count DESC; + ELSIF verbosity = 3 THEN + -- Return everything we find from the two smallest edit distance groups + RETURN QUERY + SELECT * FROM UNNEST(output_list) + WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY 1 LIMIT 2) + ORDER BY lev_distance, + (soundex_sim * COALESCE(soundex_weight,0)) + + (pg_trgm_sim * COALESCE(pg_trgm_weight,0)) + + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC, + suggestion_count DESC; + ELSIF verbosity = 4 THEN + -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance + RETURN QUERY + SELECT * FROM UNNEST(output_list) + WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY 1 LIMIT 2) + ORDER BY lev_distance, + (soundex_sim * COALESCE(soundex_weight,0)) + + (pg_trgm_sim * COALESCE(pg_trgm_weight,0)) + + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC, + suggestion_count DESC; + END IF; + END LOOP; -- loop over words +END; +$function$; + +COMMIT; + -- 2.43.2