]> git.evergreen-ils.org Git - Evergreen.git/blob - Open-ILS/src/sql/Pg/upgrade/XXXX.function.did_you_mean_optimization.sql
LP#1931162: DYM needs some situational optimization
[Evergreen.git] / Open-ILS / src / sql / Pg / upgrade / XXXX.function.did_you_mean_optimization.sql
1 BEGIN;
2
3 SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
4
5 CREATE OR REPLACE FUNCTION search.symspell_lookup(
6         raw_input text,
7         search_class text,
8         verbosity integer DEFAULT 2,
9         xfer_case boolean DEFAULT false,
10         count_threshold integer DEFAULT 1,
11         soundex_weight integer DEFAULT 0,
12         pg_trgm_weight integer DEFAULT 0,
13         kbdist_weight integer DEFAULT 0
14 ) RETURNS SETOF search.symspell_lookup_output
15  LANGUAGE plpgsql
16 AS $function$
17 DECLARE
18     prefix_length INT;
19     maxED         INT;
20     good_suggs  HSTORE;
21     word_list   TEXT[];
22     edit_list   TEXT[] := '{}';
23     seen_list   TEXT[] := '{}';
24     output      search.symspell_lookup_output;
25     output_list search.symspell_lookup_output[];
26     entry       RECORD;
27     entry_key   TEXT;
28     prefix_key  TEXT;
29     sugg        TEXT;
30     input       TEXT;
31     word        TEXT;
32     w_pos       INT := -1;
33     smallest_ed INT := -1;
34     global_ed   INT;
35     i_len       INT;
36     l_maxED     INT;
37 BEGIN
38     SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
39     prefix_length := COALESCE(prefix_length, 6);
40
41     SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
42     maxED := COALESCE(maxED, 3);
43
44     word_list := ARRAY_AGG(x) FROM search.symspell_parse_words(raw_input) x;
45
46     -- Common case exact match test for preformance
47     IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
48         EXECUTE
49           'SELECT  '||search_class||'_suggestions AS suggestions,
50                    '||search_class||'_count AS count,
51                    prefix_key
52              FROM  search.symspell_dictionary
53              WHERE prefix_key = $1
54                    AND '||search_class||'_count >= $2
55                    AND '||search_class||'_suggestions @> ARRAY[$1]'
56           INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1);
57         IF entry.prefix_key IS NOT NULL THEN
58             output.lev_distance := 0; -- definitionally
59             output.prefix_key := entry.prefix_key;
60             output.prefix_key_count := entry.count;
61             output.suggestion_count := entry.count;
62             output.input := word_list[1];
63             IF xfer_case THEN
64                 output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key);
65             ELSE
66                 output.suggestion := entry.prefix_key;
67             END IF;
68             output.norm_input := entry.prefix_key;
69             output.qwerty_kb_match := 1;
70             output.pg_trgm_sim := 1;
71             output.soundex_sim := 1;
72             RETURN NEXT output;
73             RETURN;
74         END IF;
75     END IF;
76
77     <<word_loop>>
78     FOREACH word IN ARRAY word_list LOOP
79         w_pos := w_pos + 1;
80         input := evergreen.lowercase(word);
81         i_len := CHARACTER_LENGTH(input);
82         l_maxED := maxED;
83
84         IF CHARACTER_LENGTH(input) > prefix_length THEN
85             prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
86             edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED);
87         ELSE
88             edit_list := input || search.symspell_generate_edits(input, 1, l_maxED);
89         END IF;
90
91         SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;
92
93         output_list := '{}';
94         seen_list := '{}';
95         global_ed := NULL;
96
97         <<entry_key_loop>>
98         FOREACH entry_key IN ARRAY edit_list LOOP
99             smallest_ed := -1;
100             IF global_ed IS NOT NULL THEN
101                 smallest_ed := global_ed;
102             END IF;
103
104             FOR entry IN EXECUTE
105                 'SELECT  '||search_class||'_suggestions AS suggestions,
106                          '||search_class||'_count AS count,
107                          prefix_key
108                    FROM  search.symspell_dictionary
109                    WHERE prefix_key = $1
110                          AND '||search_class||'_suggestions IS NOT NULL'
111                 USING entry_key
112             LOOP
113
114                 SELECT  HSTORE(
115                             ARRAY_AGG(
116                                 ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT]
117                                     ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC
118                             )
119                         )
120                   INTO  good_suggs
121                   FROM  UNNEST(entry.suggestions) s
122                   WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxEd AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED)
123                         AND NOT seen_list @> ARRAY[s];
124
125                 CONTINUE WHEN good_suggs IS NULL;
126
127                 FOR sugg, output.suggestion_count IN EXECUTE
128                     'SELECT  prefix_key, '||search_class||'_count
129                        FROM  search.symspell_dictionary
130                        WHERE prefix_key = ANY ($1)
131                              AND '||search_class||'_count >= $2'
132                     USING AKEYS(good_suggs), COALESCE(count_threshold,1)
133                 LOOP
134
135                     output.lev_distance := good_suggs->sugg;
136                     seen_list := seen_list || sugg;
137
138                     -- Track the smallest edit distance among suggestions from this prefix key.
139                     IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
140                         smallest_ed := output.lev_distance;
141                     END IF;
142
143                     -- Track the smallest edit distance for all prefix keys for this word.
144                     IF global_ed IS NULL OR smallest_ed < global_ed THEN
145                         global_ed = smallest_ed;
146                         -- And if low verbosity, ignore suggs with a larger distance from here on.
147                         IF verbosity <= 1 THEN
148                             l_maxED := global_ed;
149                         END IF;
150                     END IF;
151
152                     -- Lev distance is our main similarity measure. While
153                     -- trgm or soundex similarity could be the main filter,
154                     -- Lev is both language agnostic and faster.
155                     --
156                     -- Here we will skip suggestions that have a longer edit distance
157                     -- than the shortest we've already found. This is simply an
158                     -- optimization that allows us to avoid further processing
159                     -- of this entry. It would be filtered out later.
160                     CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1;
161
162                     -- If we have an exact match on the suggestion key we can also avoid
163                     -- some function calls.
164                     IF output.lev_distance = 0 THEN
165                         output.qwerty_kb_match := 1;
166                         output.pg_trgm_sim := 1;
167                         output.soundex_sim := 1;
168                     ELSE
169                         IF kbdist_weight THEN
170                             output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
171                         ELSE
172                             output.qwerty_kb_match := 0;
173                         END IF;
174                         IF pg_trgm_weight THEN
175                             output.pg_trgm_sim := similarity(input, sugg);
176                         ELSE
177                             output.pg_trgm_sim := 0;
178                         END IF;
179                         IF soundex_weight THEN
180                             output.soundex_sim := difference(input, sugg) / 4.0;
181                         ELSE
182                             output.soundex_sim := 0;
183                         END IF;
184                     END IF;
185
186                     -- Fill in some fields
187                     IF xfer_case AND input <> word THEN
188                         output.suggestion := search.symspell_transfer_casing(word, sugg);
189                     ELSE
190                         output.suggestion := sugg;
191                     END IF;
192                     output.prefix_key := entry.prefix_key;
193                     output.prefix_key_count := entry.count;
194                     output.input := word;
195                     output.norm_input := input;
196                     output.word_pos := w_pos;
197
198                     -- We can't "cache" a set of generated records directly, so
199                     -- here we build up an array of search.symspell_lookup_output
200                     -- records that we can revivicate later as a table using UNNEST().
201                     output_list := output_list || output;
202
203                     EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
204                     CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
205
206                 END LOOP; -- loop over suggestions
207             END LOOP; -- loop over entries
208         END LOOP; -- loop over entry_keys
209
210         -- Now we're done examining this word
211         IF verbosity = 0 THEN
212             -- Return the "best" suggestion from the smallest edit
213             -- distance group.  We define best based on the weighting
214             -- of the non-lev similarity measures and use the suggestion
215             -- use count to break ties.
216             RETURN QUERY
217                 SELECT * FROM UNNEST(output_list)
218                     ORDER BY lev_distance,
219                         (soundex_sim * COALESCE(soundex_weight,0))
220                             + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
221                             + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
222                         suggestion_count DESC
223                         LIMIT 1;
224         ELSIF verbosity = 1 THEN
225             -- Return all suggestions from the smallest
226             -- edit distance group.
227             RETURN QUERY
228                 SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed
229                     ORDER BY (soundex_sim * COALESCE(soundex_weight,0))
230                             + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
231                             + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
232                         suggestion_count DESC;
233         ELSIF verbosity = 2 THEN
234             -- Return everything we find, along with relevant stats
235             RETURN QUERY
236                 SELECT * FROM UNNEST(output_list)
237                     ORDER BY lev_distance,
238                         (soundex_sim * COALESCE(soundex_weight,0))
239                             + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
240                             + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
241                         suggestion_count DESC;
242         ELSIF verbosity = 3 THEN
243             -- Return everything we find from the two smallest edit distance groups
244             RETURN QUERY
245                 SELECT * FROM UNNEST(output_list)
246                     WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY 1 LIMIT 2)
247                     ORDER BY lev_distance,
248                         (soundex_sim * COALESCE(soundex_weight,0))
249                             + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
250                             + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
251                         suggestion_count DESC;
252         ELSIF verbosity = 4 THEN
253             -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance
254             RETURN QUERY
255                 SELECT * FROM UNNEST(output_list)
256                     WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY 1 LIMIT 2)
257                     ORDER BY lev_distance,
258                         (soundex_sim * COALESCE(soundex_weight,0))
259                             + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
260                             + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
261                         suggestion_count DESC;
262         END IF;
263     END LOOP; -- loop over words
264 END;
265 $function$;
266
267 COMMIT;
268