3 CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
4 CREATE EXTENSION IF NOT EXISTS pg_trgm;
6 INSERT INTO config.internal_flag (name, value, enabled) VALUES ('symspell.prefix_length', '6', TRUE);
7 INSERT INTO config.internal_flag (name, value, enabled) VALUES ('symspell.max_edit_distance', '3', TRUE);
9 INSERT into config.org_unit_setting_type
10 ( name, grp, label, description, datatype )
12 ( 'opac.did_you_mean.max_suggestions', 'opac',
14 'opac.did_you_mean.max_suggestions',
15 'Maximum number of spelling suggestions that may be offered',
18 'opac.did_you_mean.max_suggestions',
19 'If set to -1, provide "best" suggestion if mispelled; if set higher than 0, the maximum suggestions that can be provided; if set to 0, disable suggestions.',
20 'coust', 'description'),
23 INSERT into config.org_unit_setting_type
24 ( name, grp, label, description, datatype )
26 ( 'opac.did_you_mean.low_result_threshold', 'opac',
28 'opac.did_you_mean.low_result_threshold',
29 'Maximum search result count at which spelling suggestions may be offered',
32 'opac.did_you_mean.low_result_threshold',
33 'If a search results in this number or fewer results, and there are correctable spelling mistakes, a suggested search may be provided.',
34 'coust', 'description'),
37 INSERT into config.org_unit_setting_type
38 ( name, grp, label, description, datatype )
40 ( 'search.symspell.min_suggestion_use_threshold', 'opac',
42 'search.symspell.min_suggestion_use_threshold',
43 'Minimum required uses of a spelling suggestions that may be offered',
46 'search.symspell.min_suggestion_use_threshold',
47 'The number of bibliographic records (more or less) that a spelling suggestion must appear in to be considered before offering it to a user. Defaults to 1 (must appear in the bib data).',
48 'coust', 'description'),
51 INSERT into config.org_unit_setting_type
52 ( name, grp, label, description, datatype )
54 ( 'search.symspell.soundex.weight', 'opac',
56 'search.symspell.soundex.weight',
57 'Soundex score weighting in OPAC spelling suggestions.',
60 'search.symspell.soundex.weight',
61 'Soundex, trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled soundex component. Defaults to 0 for "off".',
62 'coust', 'description'),
65 INSERT into config.org_unit_setting_type
66 ( name, grp, label, description, datatype )
68 ( 'search.symspell.pg_trgm.weight', 'opac',
70 'search.symspell.pg_trgm.weight',
71 'Pg_trgm score weighting in OPAC spelling suggestions.',
74 'search.symspell.pg_trgm.weight',
75 'Soundex, pg_trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled pg_trgm component. Defaults to 0 for "off".',
76 'coust', 'description'),
79 INSERT into config.org_unit_setting_type
80 ( name, grp, label, description, datatype )
82 ( 'search.symspell.keyboard_distance.weight', 'opac',
84 'search.symspell.keyboard_distance.weight',
85 'Keyboard distance score weighting in OPAC spelling suggestions.',
88 'search.symspell.keyboard_distance.weight',
89 'Soundex, trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled keyboard distance component. Defaults to 0 for "off".',
90 'coust', 'description'),
93 CREATE OR REPLACE FUNCTION evergreen.uppercase( TEXT ) RETURNS TEXT AS $$
95 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
97 CREATE OR REPLACE FUNCTION evergreen.text_array_merge_unique (
99 ) RETURNS TEXT[] AS $F$
101 SELECT * FROM UNNEST($1) x WHERE x IS NOT NULL
103 SELECT * FROM UNNEST($2) y WHERE y IS NOT NULL
107 CREATE OR REPLACE FUNCTION evergreen.qwerty_keyboard_distance ( a TEXT, b TEXT ) RETURNS NUMERIC AS $F$
108 use String::KeyboardDistance qw(:all);
109 return qwerty_keyboard_distance(@_);
110 $F$ LANGUAGE PLPERLU STRICT IMMUTABLE;
112 CREATE OR REPLACE FUNCTION evergreen.qwerty_keyboard_distance_match ( a TEXT, b TEXT ) RETURNS NUMERIC AS $F$
113 use String::KeyboardDistance qw(:all);
114 return qwerty_keyboard_distance_match(@_);
115 $F$ LANGUAGE PLPERLU STRICT IMMUTABLE;
117 CREATE OR REPLACE FUNCTION evergreen.levenshtein_damerau_edistance ( a TEXT, b TEXT, INT ) RETURNS NUMERIC AS $F$
118 use Text::Levenshtein::Damerau::XS qw/xs_edistance/;
119 return xs_edistance(@_);
120 $F$ LANGUAGE PLPERLU STRICT IMMUTABLE;
122 CREATE TABLE search.symspell_dictionary (
123 keyword_count INT NOT NULL DEFAULT 0,
124 title_count INT NOT NULL DEFAULT 0,
125 author_count INT NOT NULL DEFAULT 0,
126 subject_count INT NOT NULL DEFAULT 0,
127 series_count INT NOT NULL DEFAULT 0,
128 identifier_count INT NOT NULL DEFAULT 0,
130 prefix_key TEXT PRIMARY KEY,
132 keyword_suggestions TEXT[],
133 title_suggestions TEXT[],
134 author_suggestions TEXT[],
135 subject_suggestions TEXT[],
136 series_suggestions TEXT[],
137 identifier_suggestions TEXT[]
138 ) WITH (fillfactor = 80);
140 CREATE OR REPLACE FUNCTION search.symspell_parse_words ( phrase TEXT )
141 RETURNS SETOF TEXT AS $F$
142 SELECT UNNEST(x) FROM regexp_matches($1, '([[:alnum:]]+''*[[:alnum:]]*)', 'g') x;
143 $F$ LANGUAGE SQL STRICT IMMUTABLE;
145 -- This version does not preserve input word order!
146 CREATE OR REPLACE FUNCTION search.symspell_parse_words_distinct ( phrase TEXT )
147 RETURNS SETOF TEXT AS $F$
148 SELECT DISTINCT UNNEST(x) FROM regexp_matches($1, '([[:alnum:]]+''*[[:alnum:]]*)', 'g') x;
149 $F$ LANGUAGE SQL STRICT IMMUTABLE;
151 CREATE OR REPLACE FUNCTION search.symspell_transfer_casing ( withCase TEXT, withoutCase TEXT )
158 woChars := regexp_split_to_array(withoutCase,'');
159 FOR curr IN SELECT x FROM regexp_split_to_table(withCase, '') x LOOP
160 IF curr = evergreen.uppercase(curr) THEN
161 woChars[ind] := evergreen.uppercase(woChars[ind]);
165 RETURN ARRAY_TO_STRING(woChars,'');
167 $F$ LANGUAGE PLPGSQL STRICT IMMUTABLE;
169 CREATE OR REPLACE FUNCTION search.symspell_generate_edits (
173 ) RETURNS TEXT[] AS $F$
177 sublist TEXT[] := '{}';
179 FOR I IN 1 .. CHARACTER_LENGTH(raw_word) LOOP
180 item := SUBSTRING(raw_word FROM 1 FOR I - 1) || SUBSTRING(raw_word FROM I + 1);
181 IF NOT list @> ARRAY[item] THEN
182 list := item || list;
183 IF dist < maxED AND CHARACTER_LENGTH(raw_word) > dist + 1 THEN
184 sublist := search.symspell_generate_edits(item, dist + 1, maxED) || sublist;
190 RETURN evergreen.text_array_merge_unique(list, sublist);
192 RETURN list || sublist;
195 $F$ LANGUAGE PLPGSQL STRICT IMMUTABLE;
197 -- DROP TYPE search.symspell_lookup_output CASCADE;
198 CREATE TYPE search.symspell_lookup_output AS (
200 suggestion_count INT,
203 qwerty_kb_match NUMERIC,
208 prefix_key_count INT,
212 CREATE OR REPLACE FUNCTION search.symspell_lookup (
215 verbosity INT DEFAULT 2,
216 xfer_case BOOL DEFAULT FALSE,
217 count_threshold INT DEFAULT 1,
218 soundex_weight INT DEFAULT 0,
219 pg_trgm_weight INT DEFAULT 0,
220 kbdist_weight INT DEFAULT 0
221 ) RETURNS SETOF search.symspell_lookup_output AS $F$
226 edit_list TEXT[] := '{}';
227 seen_list TEXT[] := '{}';
228 output search.symspell_lookup_output;
229 output_list search.symspell_lookup_output[];
237 smallest_ed INT := -1;
240 SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
241 prefix_length := COALESCE(prefix_length, 6);
243 SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
244 maxED := COALESCE(maxED, 3);
246 word_list := ARRAY_AGG(x) FROM search.symspell_parse_words(raw_input) x;
248 -- Common case exact match test for preformance
249 IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
251 'SELECT '||search_class||'_suggestions AS suggestions,
252 '||search_class||'_count AS count,
254 FROM search.symspell_dictionary
255 WHERE prefix_key = $1
256 AND '||search_class||'_count >= $2
257 AND '||search_class||'_suggestions @> ARRAY[$1]'
258 INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1);
259 IF entry.prefix_key IS NOT NULL THEN
260 output.lev_distance := 0; -- definitionally
261 output.prefix_key := entry.prefix_key;
262 output.prefix_key_count := entry.count;
263 output.suggestion_count := entry.count;
264 output.input := word_list[1];
266 output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key);
268 output.suggestion := entry.prefix_key;
270 output.norm_input := entry.prefix_key;
271 output.qwerty_kb_match := 1;
272 output.pg_trgm_sim := 1;
273 output.soundex_sim := 1;
280 FOREACH word IN ARRAY word_list LOOP
282 input := evergreen.lowercase(word);
284 IF CHARACTER_LENGTH(input) > prefix_length THEN
285 prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
286 edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, maxED);
288 edit_list := input || search.symspell_generate_edits(input, 1, maxED);
291 SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;
298 FOREACH entry_key IN ARRAY edit_list LOOP
300 IF global_ed IS NOT NULL THEN
301 smallest_ed := global_ed;
304 'SELECT '||search_class||'_suggestions AS suggestions,
305 '||search_class||'_count AS count,
307 FROM search.symspell_dictionary
308 WHERE prefix_key = $1
309 AND '||search_class||'_suggestions IS NOT NULL'
312 FOREACH sugg IN ARRAY entry.suggestions LOOP
313 IF NOT seen_list @> ARRAY[sugg] THEN
314 seen_list := seen_list || sugg;
315 IF input = sugg THEN -- exact match, no need to spend time on a call
316 output.lev_distance := 0;
317 output.suggestion_count = entry.count;
318 ELSIF ABS(CHARACTER_LENGTH(input) - CHARACTER_LENGTH(sugg)) > maxED THEN
319 -- They are definitionally too different to consider, just move on.
322 --output.lev_distance := levenshtein_less_equal(
323 output.lev_distance := evergreen.levenshtein_damerau_edistance(
328 IF output.lev_distance < 0 THEN
329 -- The Perl module returns -1 for "more distant than max".
330 output.lev_distance := maxED + 1;
331 -- This short-circuit's the count test below for speed, bypassing
332 -- a couple useless tests.
333 output.suggestion_count := -1;
335 EXECUTE 'SELECT '||search_class||'_count FROM search.symspell_dictionary WHERE prefix_key = $1'
336 INTO output.suggestion_count USING sugg;
340 -- The caller passes a minimum suggestion count threshold (or uses
341 -- the default of 0) and if the suggestion has that many or less uses
342 -- then we move on to the next suggestion, since this one is too rare.
343 CONTINUE WHEN output.suggestion_count < COALESCE(count_threshold,1);
345 -- Track the smallest edit distance among suggestions from this prefix key.
346 IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
347 smallest_ed := output.lev_distance;
350 -- Track the smallest edit distance for all prefix keys for this word.
351 IF global_ed IS NULL OR smallest_ed < global_ed THEN
352 global_ed = smallest_ed;
355 -- Only proceed if the edit distance is <= the max for the dictionary.
356 IF output.lev_distance <= maxED THEN
357 IF output.lev_distance > global_ed AND verbosity <= 1 THEN
358 -- Lev distance is our main similarity measure. While
359 -- trgm or soundex similarity could be the main filter,
360 -- Lev is both language agnostic and faster.
362 -- Here we will skip suggestions that have a longer edit distance
363 -- than the shortest we've already found. This is simply an
364 -- optimization that allows us to avoid further processing
365 -- of this entry. It would be filtered out later.
370 -- If we have an exact match on the suggestion key we can also avoid
371 -- some function calls.
372 IF output.lev_distance = 0 THEN
373 output.qwerty_kb_match := 1;
374 output.pg_trgm_sim := 1;
375 output.soundex_sim := 1;
377 output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
378 output.pg_trgm_sim := similarity(input, sugg);
379 output.soundex_sim := difference(input, sugg) / 4.0;
382 -- Fill in some fields
384 output.suggestion := search.symspell_transfer_casing(word, sugg);
386 output.suggestion := sugg;
388 output.prefix_key := entry.prefix_key;
389 output.prefix_key_count := entry.count;
390 output.input := word;
391 output.norm_input := input;
392 output.word_pos := w_pos;
394 -- We can't "cache" a set of generated records directly, so
395 -- here we build up an array of search.symspell_lookup_output
396 -- records that we can revivicate later as a table using UNNEST().
397 output_list := output_list || output;
399 EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
400 CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
401 END IF; -- maxED test
402 END IF; -- suggestion not seen test
403 END LOOP; -- loop over suggestions
404 END LOOP; -- loop over entries
405 END LOOP; -- loop over entry_keys
407 -- Now we're done examining this word
408 IF verbosity = 0 THEN
409 -- Return the "best" suggestion from the smallest edit
410 -- distance group. We define best based on the weighting
411 -- of the non-lev similarity measures and use the suggestion
412 -- use count to break ties.
414 SELECT * FROM UNNEST(output_list)
415 ORDER BY lev_distance,
416 (soundex_sim * COALESCE(soundex_weight,0))
417 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
418 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
419 suggestion_count DESC
421 ELSIF verbosity = 1 THEN
422 -- Return all suggestions from the smallest
423 -- edit distance group.
425 SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed
426 ORDER BY (soundex_sim * COALESCE(soundex_weight,0))
427 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
428 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
429 suggestion_count DESC;
430 ELSIF verbosity = 2 THEN
431 -- Return everything we find, along with relevant stats
433 SELECT * FROM UNNEST(output_list)
434 ORDER BY lev_distance,
435 (soundex_sim * COALESCE(soundex_weight,0))
436 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
437 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
438 suggestion_count DESC;
439 ELSIF verbosity = 3 THEN
440 -- Return everything we find from the two smallest edit distance groups
442 SELECT * FROM UNNEST(output_list)
443 WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY 1 LIMIT 2)
444 ORDER BY lev_distance,
445 (soundex_sim * COALESCE(soundex_weight,0))
446 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
447 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
448 suggestion_count DESC;
449 ELSIF verbosity = 4 THEN
450 -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance
452 SELECT * FROM UNNEST(output_list)
453 WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY 1 LIMIT 2)
454 ORDER BY lev_distance,
455 (soundex_sim * COALESCE(soundex_weight,0))
456 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
457 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
458 suggestion_count DESC;
460 END LOOP; -- loop over words
462 $F$ LANGUAGE PLPGSQL;
464 CREATE OR REPLACE FUNCTION search.symspell_build_raw_entry (
467 no_limit BOOL DEFAULT FALSE,
468 prefix_length INT DEFAULT 6,
470 ) RETURNS SETOF search.symspell_dictionary AS $F$
475 entry search.symspell_dictionary%ROWTYPE;
479 IF NOT no_limit AND CHARACTER_LENGTH(raw_input) > prefix_length THEN
480 key := SUBSTRING(key FROM 1 FOR prefix_length);
481 key_list := ARRAY[raw_input, key];
483 key_list := ARRAY[key];
486 FOREACH del_key IN ARRAY key_list LOOP
487 entry.prefix_key := del_key;
489 entry.keyword_count := 0;
490 entry.title_count := 0;
491 entry.author_count := 0;
492 entry.subject_count := 0;
493 entry.series_count := 0;
494 entry.identifier_count := 0;
496 entry.keyword_suggestions := '{}';
497 entry.title_suggestions := '{}';
498 entry.author_suggestions := '{}';
499 entry.subject_suggestions := '{}';
500 entry.series_suggestions := '{}';
501 entry.identifier_suggestions := '{}';
503 IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF;
504 IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF;
505 IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF;
506 IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF;
507 IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF;
508 IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF;
509 IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF;
511 IF del_key = raw_input THEN
512 IF source_class = 'keyword' THEN entry.keyword_count := 1; END IF;
513 IF source_class = 'title' THEN entry.title_count := 1; END IF;
514 IF source_class = 'author' THEN entry.author_count := 1; END IF;
515 IF source_class = 'subject' THEN entry.subject_count := 1; END IF;
516 IF source_class = 'series' THEN entry.series_count := 1; END IF;
517 IF source_class = 'identifier' THEN entry.identifier_count := 1; END IF;
523 FOR del_key IN SELECT x FROM UNNEST(search.symspell_generate_edits(key, 1, maxED)) x LOOP
525 entry.keyword_suggestions := '{}';
526 entry.title_suggestions := '{}';
527 entry.author_suggestions := '{}';
528 entry.subject_suggestions := '{}';
529 entry.series_suggestions := '{}';
530 entry.identifier_suggestions := '{}';
532 IF source_class = 'keyword' THEN entry.keyword_count := 0; END IF;
533 IF source_class = 'title' THEN entry.title_count := 0; END IF;
534 IF source_class = 'author' THEN entry.author_count := 0; END IF;
535 IF source_class = 'subject' THEN entry.subject_count := 0; END IF;
536 IF source_class = 'series' THEN entry.series_count := 0; END IF;
537 IF source_class = 'identifier' THEN entry.identifier_count := 0; END IF;
539 entry.prefix_key := del_key;
541 IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF;
542 IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF;
543 IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF;
544 IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF;
545 IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF;
546 IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF;
547 IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF;
553 $F$ LANGUAGE PLPGSQL STRICT IMMUTABLE;
555 CREATE OR REPLACE FUNCTION search.symspell_build_entries (
558 old_input TEXT DEFAULT NULL,
559 include_phrases BOOL DEFAULT FALSE
560 ) RETURNS SETOF search.symspell_dictionary AS $F$
567 entry search.symspell_dictionary;
569 IF full_input IS NOT NULL THEN
570 SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
571 prefix_length := COALESCE(prefix_length, 6);
573 SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
574 maxED := COALESCE(maxED, 3);
576 input := evergreen.lowercase(full_input);
577 word_list := ARRAY_AGG(x) FROM search.symspell_parse_words_distinct(input) x;
579 IF CARDINALITY(word_list) > 1 AND include_phrases THEN
580 RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(input, source_class, TRUE, prefix_length, maxED);
583 FOREACH word IN ARRAY word_list LOOP
584 RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(word, source_class, FALSE, prefix_length, maxED);
588 IF old_input IS NOT NULL THEN
589 input := evergreen.lowercase(old_input);
591 FOR word IN SELECT x FROM search.symspell_parse_words_distinct(input) x LOOP
592 entry.prefix_key := word;
594 entry.keyword_count := 0;
595 entry.title_count := 0;
596 entry.author_count := 0;
597 entry.subject_count := 0;
598 entry.series_count := 0;
599 entry.identifier_count := 0;
601 entry.keyword_suggestions := '{}';
602 entry.title_suggestions := '{}';
603 entry.author_suggestions := '{}';
604 entry.subject_suggestions := '{}';
605 entry.series_suggestions := '{}';
606 entry.identifier_suggestions := '{}';
608 IF source_class = 'keyword' THEN entry.keyword_count := -1; END IF;
609 IF source_class = 'title' THEN entry.title_count := -1; END IF;
610 IF source_class = 'author' THEN entry.author_count := -1; END IF;
611 IF source_class = 'subject' THEN entry.subject_count := -1; END IF;
612 IF source_class = 'series' THEN entry.series_count := -1; END IF;
613 IF source_class = 'identifier' THEN entry.identifier_count := -1; END IF;
619 $F$ LANGUAGE PLPGSQL;
621 CREATE OR REPLACE FUNCTION search.symspell_build_and_merge_entries (
624 old_input TEXT DEFAULT NULL,
625 include_phrases BOOL DEFAULT FALSE
626 ) RETURNS SETOF search.symspell_dictionary AS $F$
629 conflict_entry RECORD;
632 IF full_input = old_input THEN -- neither NULL, and are the same
636 FOR new_entry IN EXECUTE $q$
639 evergreen.text_array_merge_unique(s,'{}') suggestions
640 FROM (SELECT prefix_key,
641 ARRAY_AGG($q$ || source_class || $q$_suggestions[1]) s,
642 SUM($q$ || source_class || $q$_count) count
643 FROM search.symspell_build_entries($1, $2, $3, $4)
645 $q$ USING full_input, source_class, old_input, include_phrases
649 $q$ || source_class || $q$_suggestions suggestions,
650 $q$ || source_class || $q$_count count
651 FROM search.symspell_dictionary
652 WHERE prefix_key = $1 $q$
654 USING new_entry.prefix_key;
656 IF new_entry.count <> 0 THEN -- Real word, and count changed
657 IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating
658 IF conflict_entry.count > 0 THEN -- it's a real word
659 RETURN QUERY EXECUTE $q$
660 UPDATE search.symspell_dictionary
661 SET $q$ || source_class || $q$_count = $2
662 WHERE prefix_key = $1
664 USING new_entry.prefix_key, GREATEST(0, new_entry.count + conflict_entry.count);
665 ELSE -- it was a prefix key or delete-emptied word before
666 IF conflict_entry.suggestions @> new_entry.suggestions THEN -- already have all suggestions here...
667 RETURN QUERY EXECUTE $q$
668 UPDATE search.symspell_dictionary
669 SET $q$ || source_class || $q$_count = $2
670 WHERE prefix_key = $1
672 USING new_entry.prefix_key, GREATEST(0, new_entry.count);
673 ELSE -- new suggestion!
674 RETURN QUERY EXECUTE $q$
675 UPDATE search.symspell_dictionary
676 SET $q$ || source_class || $q$_count = $2,
677 $q$ || source_class || $q$_suggestions = $3
678 WHERE prefix_key = $1
680 USING new_entry.prefix_key, GREATEST(0, new_entry.count), evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions);
684 -- We keep the on-conflict clause just in case...
685 RETURN QUERY EXECUTE $q$
686 INSERT INTO search.symspell_dictionary AS d (
687 $q$ || source_class || $q$_count,
689 $q$ || source_class || $q$_suggestions
690 ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO
691 UPDATE SET $q$ || source_class || $q$_count = d.$q$ || source_class || $q$_count + EXCLUDED.$q$ || source_class || $q$_count,
692 $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions)
694 USING new_entry.count, new_entry.prefix_key, new_entry.suggestions;
696 ELSE -- key only, or no change
697 IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating
698 IF NOT conflict_entry.suggestions @> new_entry.suggestions THEN -- There are new suggestions
699 RETURN QUERY EXECUTE $q$
700 UPDATE search.symspell_dictionary
701 SET $q$ || source_class || $q$_suggestions = $2
702 WHERE prefix_key = $1
704 USING new_entry.prefix_key, evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions);
707 RETURN QUERY EXECUTE $q$
708 INSERT INTO search.symspell_dictionary AS d (
709 $q$ || source_class || $q$_count,
711 $q$ || source_class || $q$_suggestions
712 ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO -- key exists, suggestions may be added due to this entry
713 UPDATE SET $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions)
715 USING new_entry.count, new_entry.prefix_key, new_entry.suggestions;
720 $F$ LANGUAGE PLPGSQL;
722 CREATE OR REPLACE FUNCTION search.symspell_maintain_entries () RETURNS TRIGGER AS $f$
725 new_value TEXT := NULL;
726 old_value TEXT := NULL;
728 search_class := COALESCE(TG_ARGV[0], SPLIT_PART(TG_TABLE_NAME,'_',1));
730 IF TG_OP IN ('INSERT', 'UPDATE') THEN
731 new_value := NEW.value;
734 IF TG_OP IN ('DELETE', 'UPDATE') THEN
735 old_value := OLD.value;
738 PERFORM * FROM search.symspell_build_and_merge_entries(new_value, search_class, old_value);
740 RETURN NULL; -- always fired AFTER
742 $f$ LANGUAGE PLPGSQL;
744 CREATE TRIGGER maintain_symspell_entries_tgr
745 AFTER INSERT OR UPDATE OR DELETE ON metabib.title_field_entry
746 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
748 CREATE TRIGGER maintain_symspell_entries_tgr
749 AFTER INSERT OR UPDATE OR DELETE ON metabib.author_field_entry
750 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
752 CREATE TRIGGER maintain_symspell_entries_tgr
753 AFTER INSERT OR UPDATE OR DELETE ON metabib.subject_field_entry
754 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
756 CREATE TRIGGER maintain_symspell_entries_tgr
757 AFTER INSERT OR UPDATE OR DELETE ON metabib.series_field_entry
758 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
760 CREATE TRIGGER maintain_symspell_entries_tgr
761 AFTER INSERT OR UPDATE OR DELETE ON metabib.keyword_field_entry
762 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
764 CREATE TRIGGER maintain_symspell_entries_tgr
765 AFTER INSERT OR UPDATE OR DELETE ON metabib.identifier_field_entry
766 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
770 /* This will generate the queries needed to generate the /file/ that can
771 * be used to populate the dictionary table.
773 select $z$select $y$select $y$||x.id||$y$, '$z$||x.x||$z$', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '$z$||x||$z$');$y$ from metabib.$z$||x||$z$_field_entry x;$z$ from (select 'keyword'::text x union select 'title' union select 'author' union select 'subject' union select 'series' union select 'identifier') x;
778 \qecho 'The following should be run at the end of the upgrade before any'
779 \qecho 'reingest occurs. Because new triggers are installed already,'
780 \qecho 'updates to indexed strings will cause zero-count dictionary entries'
781 \qecho 'to be recorded which will require updating every row again (or'
782 \qecho 'starting from scratch) so best to do this before other batch'
783 \qecho 'changes. A later reingest that does not significantly change'
784 \qecho 'indexed strings will /not/ cause table bloat here, and will be'
785 \qecho 'as fast as normal. A copy of the SQL in a ready-to-use, non-escaped'
786 \qecho 'form is available inside a comment at the end of this upgrade sub-'
787 \qecho 'script so you do not need to copy this comment from the psql ouptut.'
789 \qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;'
790 \qecho 'TRUNCATE search.symspell_dictionary;'
794 \qecho '\\o dym.prime.sql'
795 \qecho select $y$select $y$||x.id||$y$, '''title''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''title''');$y$ from metabib.title_field_entry x;
796 \qecho select $y$select $y$||x.id||$y$, '''author''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''author''');$y$ from metabib.author_field_entry x;
797 \qecho select $y$select $y$||x.id||$y$, '''series''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''series''');$y$ from metabib.series_field_entry x;
798 \qecho select $y$select $y$||x.id||$y$, '''identifier''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''identifier''');$y$ from metabib.identifier_field_entry x;
799 \qecho select $y$select $y$||x.id||$y$, '''keyword''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''keyword''');$y$ from metabib.keyword_field_entry x;
800 \qecho select $y$select $y$||x.id||$y$, '''subject''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''subject''');$y$ from metabib.subject_field_entry x;
803 \qecho '\\i dym.prime.sql'
805 \qecho 'CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;'
806 \qecho 'REINDEX TABLE search.symspell_dictionary;'
807 \qecho 'ALTER TABLE search.symspell_dictionary SET LOGGED;'
808 \qecho 'VACUUM ANALYZE search.symspell_dictionary;'
813 ALTER TABLE search.symspell_dictionary SET UNLOGGED;
814 TRUNCATE search.symspell_dictionary;
819 select $y$select $y$||x.id||$y$, 'title', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'title');$y$ from metabib.title_field_entry x;
820 select $y$select $y$||x.id||$y$, 'author', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'author');$y$ from metabib.author_field_entry x;
821 select $y$select $y$||x.id||$y$, 'series', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'series');$y$ from metabib.series_field_entry x;
822 select $y$select $y$||x.id||$y$, 'identifier', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'identifier');$y$ from metabib.identifier_field_entry x;
823 select $y$select $y$||x.id||$y$, 'keyword', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'keyword');$y$ from metabib.keyword_field_entry x;
824 select $y$select $y$||x.id||$y$, 'subject', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'subject');$y$ from metabib.subject_field_entry x;
829 CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;
830 REINDEX TABLE search.symspell_dictionary;
831 ALTER TABLE search.symspell_dictionary SET LOGGED;
832 VACUUM ANALYZE search.symspell_dictionary;