3 SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
5 CREATE OR REPLACE FUNCTION search.symspell_lookup(
8 verbosity integer DEFAULT 2,
9 xfer_case boolean DEFAULT false,
10 count_threshold integer DEFAULT 1,
11 soundex_weight integer DEFAULT 0,
12 pg_trgm_weight integer DEFAULT 0,
13 kbdist_weight integer DEFAULT 0
14 ) RETURNS SETOF search.symspell_lookup_output
22 edit_list TEXT[] := '{}';
23 seen_list TEXT[] := '{}';
24 output search.symspell_lookup_output;
25 output_list search.symspell_lookup_output[];
33 smallest_ed INT := -1;
38 SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
39 prefix_length := COALESCE(prefix_length, 6);
41 SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
42 maxED := COALESCE(maxED, 3);
44 word_list := ARRAY_AGG(x) FROM search.symspell_parse_words(raw_input) x;
46 -- Common case exact match test for preformance
47 IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
49 'SELECT '||search_class||'_suggestions AS suggestions,
50 '||search_class||'_count AS count,
52 FROM search.symspell_dictionary
54 AND '||search_class||'_count >= $2
55 AND '||search_class||'_suggestions @> ARRAY[$1]'
56 INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1);
57 IF entry.prefix_key IS NOT NULL THEN
58 output.lev_distance := 0; -- definitionally
59 output.prefix_key := entry.prefix_key;
60 output.prefix_key_count := entry.count;
61 output.suggestion_count := entry.count;
62 output.input := word_list[1];
64 output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key);
66 output.suggestion := entry.prefix_key;
68 output.norm_input := entry.prefix_key;
69 output.qwerty_kb_match := 1;
70 output.pg_trgm_sim := 1;
71 output.soundex_sim := 1;
78 FOREACH word IN ARRAY word_list LOOP
80 input := evergreen.lowercase(word);
81 i_len := CHARACTER_LENGTH(input);
84 IF CHARACTER_LENGTH(input) > prefix_length THEN
85 prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
86 edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED);
88 edit_list := input || search.symspell_generate_edits(input, 1, l_maxED);
91 SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;
98 FOREACH entry_key IN ARRAY edit_list LOOP
100 IF global_ed IS NOT NULL THEN
101 smallest_ed := global_ed;
105 'SELECT '||search_class||'_suggestions AS suggestions,
106 '||search_class||'_count AS count,
108 FROM search.symspell_dictionary
109 WHERE prefix_key = $1
110 AND '||search_class||'_suggestions IS NOT NULL'
116 ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT]
117 ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC
121 FROM UNNEST(entry.suggestions) s
122 WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxEd AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED)
123 AND NOT seen_list @> ARRAY[s];
125 CONTINUE WHEN good_suggs IS NULL;
127 FOR sugg, output.suggestion_count IN EXECUTE
128 'SELECT prefix_key, '||search_class||'_count
129 FROM search.symspell_dictionary
130 WHERE prefix_key = ANY ($1)
131 AND '||search_class||'_count >= $2'
132 USING AKEYS(good_suggs), COALESCE(count_threshold,1)
135 output.lev_distance := good_suggs->sugg;
136 seen_list := seen_list || sugg;
138 -- Track the smallest edit distance among suggestions from this prefix key.
139 IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
140 smallest_ed := output.lev_distance;
143 -- Track the smallest edit distance for all prefix keys for this word.
144 IF global_ed IS NULL OR smallest_ed < global_ed THEN
145 global_ed = smallest_ed;
146 -- And if low verbosity, ignore suggs with a larger distance from here on.
147 IF verbosity <= 1 THEN
148 l_maxED := global_ed;
152 -- Lev distance is our main similarity measure. While
153 -- trgm or soundex similarity could be the main filter,
154 -- Lev is both language agnostic and faster.
156 -- Here we will skip suggestions that have a longer edit distance
157 -- than the shortest we've already found. This is simply an
158 -- optimization that allows us to avoid further processing
159 -- of this entry. It would be filtered out later.
160 CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1;
162 -- If we have an exact match on the suggestion key we can also avoid
163 -- some function calls.
164 IF output.lev_distance = 0 THEN
165 output.qwerty_kb_match := 1;
166 output.pg_trgm_sim := 1;
167 output.soundex_sim := 1;
169 IF kbdist_weight THEN
170 output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
172 output.qwerty_kb_match := 0;
174 IF pg_trgm_weight THEN
175 output.pg_trgm_sim := similarity(input, sugg);
177 output.pg_trgm_sim := 0;
179 IF soundex_weight THEN
180 output.soundex_sim := difference(input, sugg) / 4.0;
182 output.soundex_sim := 0;
186 -- Fill in some fields
187 IF xfer_case AND input <> word THEN
188 output.suggestion := search.symspell_transfer_casing(word, sugg);
190 output.suggestion := sugg;
192 output.prefix_key := entry.prefix_key;
193 output.prefix_key_count := entry.count;
194 output.input := word;
195 output.norm_input := input;
196 output.word_pos := w_pos;
198 -- We can't "cache" a set of generated records directly, so
199 -- here we build up an array of search.symspell_lookup_output
200 -- records that we can revivicate later as a table using UNNEST().
201 output_list := output_list || output;
203 EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
204 CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
206 END LOOP; -- loop over suggestions
207 END LOOP; -- loop over entries
208 END LOOP; -- loop over entry_keys
210 -- Now we're done examining this word
211 IF verbosity = 0 THEN
212 -- Return the "best" suggestion from the smallest edit
213 -- distance group. We define best based on the weighting
214 -- of the non-lev similarity measures and use the suggestion
215 -- use count to break ties.
217 SELECT * FROM UNNEST(output_list)
218 ORDER BY lev_distance,
219 (soundex_sim * COALESCE(soundex_weight,0))
220 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
221 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
222 suggestion_count DESC
224 ELSIF verbosity = 1 THEN
225 -- Return all suggestions from the smallest
226 -- edit distance group.
228 SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed
229 ORDER BY (soundex_sim * COALESCE(soundex_weight,0))
230 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
231 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
232 suggestion_count DESC;
233 ELSIF verbosity = 2 THEN
234 -- Return everything we find, along with relevant stats
236 SELECT * FROM UNNEST(output_list)
237 ORDER BY lev_distance,
238 (soundex_sim * COALESCE(soundex_weight,0))
239 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
240 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
241 suggestion_count DESC;
242 ELSIF verbosity = 3 THEN
243 -- Return everything we find from the two smallest edit distance groups
245 SELECT * FROM UNNEST(output_list)
246 WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY 1 LIMIT 2)
247 ORDER BY lev_distance,
248 (soundex_sim * COALESCE(soundex_weight,0))
249 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
250 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
251 suggestion_count DESC;
252 ELSIF verbosity = 4 THEN
253 -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance
255 SELECT * FROM UNNEST(output_list)
256 WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY 1 LIMIT 2)
257 ORDER BY lev_distance,
258 (soundex_sim * COALESCE(soundex_weight,0))
259 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
260 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
261 suggestion_count DESC;
263 END LOOP; -- loop over words