3 SELECT evergreen.upgrade_deps_block_check('1256', :eg_version);
5 CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
6 CREATE EXTENSION IF NOT EXISTS pg_trgm;
8 INSERT INTO config.internal_flag (name, value, enabled) VALUES ('symspell.prefix_length', '6', TRUE);
9 INSERT INTO config.internal_flag (name, value, enabled) VALUES ('symspell.max_edit_distance', '3', TRUE);
11 INSERT into config.org_unit_setting_type
12 ( name, grp, label, description, datatype )
14 ( 'opac.did_you_mean.max_suggestions', 'opac',
16 'opac.did_you_mean.max_suggestions',
17 'Maximum number of spelling suggestions that may be offered',
20 'opac.did_you_mean.max_suggestions',
21 'If set to -1, provide "best" suggestion if mispelled; if set higher than 0, the maximum suggestions that can be provided; if set to 0, disable suggestions.',
22 'coust', 'description'),
25 INSERT into config.org_unit_setting_type
26 ( name, grp, label, description, datatype )
28 ( 'opac.did_you_mean.low_result_threshold', 'opac',
30 'opac.did_you_mean.low_result_threshold',
31 'Maximum search result count at which spelling suggestions may be offered',
34 'opac.did_you_mean.low_result_threshold',
35 'If a search results in this number or fewer results, and there are correctable spelling mistakes, a suggested search may be provided.',
36 'coust', 'description'),
39 INSERT into config.org_unit_setting_type
40 ( name, grp, label, description, datatype )
42 ( 'search.symspell.min_suggestion_use_threshold', 'opac',
44 'search.symspell.min_suggestion_use_threshold',
45 'Minimum required uses of a spelling suggestions that may be offered',
48 'search.symspell.min_suggestion_use_threshold',
49 'The number of bibliographic records (more or less) that a spelling suggestion must appear in to be considered before offering it to a user. Defaults to 1 (must appear in the bib data).',
50 'coust', 'description'),
53 INSERT into config.org_unit_setting_type
54 ( name, grp, label, description, datatype )
56 ( 'search.symspell.soundex.weight', 'opac',
58 'search.symspell.soundex.weight',
59 'Soundex score weighting in OPAC spelling suggestions.',
62 'search.symspell.soundex.weight',
63 'Soundex, trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled soundex component. Defaults to 0 for "off".',
64 'coust', 'description'),
67 INSERT into config.org_unit_setting_type
68 ( name, grp, label, description, datatype )
70 ( 'search.symspell.pg_trgm.weight', 'opac',
72 'search.symspell.pg_trgm.weight',
73 'Pg_trgm score weighting in OPAC spelling suggestions.',
76 'search.symspell.pg_trgm.weight',
77 'Soundex, pg_trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled pg_trgm component. Defaults to 0 for "off".',
78 'coust', 'description'),
81 INSERT into config.org_unit_setting_type
82 ( name, grp, label, description, datatype )
84 ( 'search.symspell.keyboard_distance.weight', 'opac',
86 'search.symspell.keyboard_distance.weight',
87 'Keyboard distance score weighting in OPAC spelling suggestions.',
90 'search.symspell.keyboard_distance.weight',
91 'Soundex, trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled keyboard distance component. Defaults to 0 for "off".',
92 'coust', 'description'),
95 CREATE OR REPLACE FUNCTION evergreen.uppercase( TEXT ) RETURNS TEXT AS $$
97 $$ LANGUAGE PLPERLU STRICT IMMUTABLE;
99 CREATE OR REPLACE FUNCTION evergreen.text_array_merge_unique (
101 ) RETURNS TEXT[] AS $F$
103 SELECT * FROM UNNEST($1) x WHERE x IS NOT NULL
105 SELECT * FROM UNNEST($2) y WHERE y IS NOT NULL
109 CREATE OR REPLACE FUNCTION evergreen.qwerty_keyboard_distance ( a TEXT, b TEXT ) RETURNS NUMERIC AS $F$
110 use String::KeyboardDistance qw(:all);
111 return qwerty_keyboard_distance(@_);
112 $F$ LANGUAGE PLPERLU STRICT IMMUTABLE;
114 CREATE OR REPLACE FUNCTION evergreen.qwerty_keyboard_distance_match ( a TEXT, b TEXT ) RETURNS NUMERIC AS $F$
115 use String::KeyboardDistance qw(:all);
116 return qwerty_keyboard_distance_match(@_);
117 $F$ LANGUAGE PLPERLU STRICT IMMUTABLE;
119 CREATE OR REPLACE FUNCTION evergreen.levenshtein_damerau_edistance ( a TEXT, b TEXT, INT ) RETURNS NUMERIC AS $F$
120 use Text::Levenshtein::Damerau::XS qw/xs_edistance/;
121 return xs_edistance(@_);
122 $F$ LANGUAGE PLPERLU STRICT IMMUTABLE;
124 CREATE TABLE search.symspell_dictionary (
125 keyword_count INT NOT NULL DEFAULT 0,
126 title_count INT NOT NULL DEFAULT 0,
127 author_count INT NOT NULL DEFAULT 0,
128 subject_count INT NOT NULL DEFAULT 0,
129 series_count INT NOT NULL DEFAULT 0,
130 identifier_count INT NOT NULL DEFAULT 0,
132 prefix_key TEXT PRIMARY KEY,
134 keyword_suggestions TEXT[],
135 title_suggestions TEXT[],
136 author_suggestions TEXT[],
137 subject_suggestions TEXT[],
138 series_suggestions TEXT[],
139 identifier_suggestions TEXT[]
140 ) WITH (fillfactor = 80);
142 CREATE OR REPLACE FUNCTION search.symspell_parse_words ( phrase TEXT )
143 RETURNS SETOF TEXT AS $F$
144 SELECT UNNEST(x) FROM regexp_matches($1, '([[:alnum:]]+''*[[:alnum:]]*)', 'g') x;
145 $F$ LANGUAGE SQL STRICT IMMUTABLE;
147 -- This version does not preserve input word order!
148 CREATE OR REPLACE FUNCTION search.symspell_parse_words_distinct ( phrase TEXT )
149 RETURNS SETOF TEXT AS $F$
150 SELECT DISTINCT UNNEST(x) FROM regexp_matches($1, '([[:alnum:]]+''*[[:alnum:]]*)', 'g') x;
151 $F$ LANGUAGE SQL STRICT IMMUTABLE;
153 CREATE OR REPLACE FUNCTION search.symspell_transfer_casing ( withCase TEXT, withoutCase TEXT )
160 woChars := regexp_split_to_array(withoutCase,'');
161 FOR curr IN SELECT x FROM regexp_split_to_table(withCase, '') x LOOP
162 IF curr = evergreen.uppercase(curr) THEN
163 woChars[ind] := evergreen.uppercase(woChars[ind]);
167 RETURN ARRAY_TO_STRING(woChars,'');
169 $F$ LANGUAGE PLPGSQL STRICT IMMUTABLE;
171 CREATE OR REPLACE FUNCTION search.symspell_generate_edits (
175 ) RETURNS TEXT[] AS $F$
179 sublist TEXT[] := '{}';
181 FOR I IN 1 .. CHARACTER_LENGTH(raw_word) LOOP
182 item := SUBSTRING(raw_word FROM 1 FOR I - 1) || SUBSTRING(raw_word FROM I + 1);
183 IF NOT list @> ARRAY[item] THEN
184 list := item || list;
185 IF dist < maxED AND CHARACTER_LENGTH(raw_word) > dist + 1 THEN
186 sublist := search.symspell_generate_edits(item, dist + 1, maxED) || sublist;
192 RETURN evergreen.text_array_merge_unique(list, sublist);
194 RETURN list || sublist;
197 $F$ LANGUAGE PLPGSQL STRICT IMMUTABLE;
199 -- DROP TYPE search.symspell_lookup_output CASCADE;
200 CREATE TYPE search.symspell_lookup_output AS (
202 suggestion_count INT,
205 qwerty_kb_match NUMERIC,
210 prefix_key_count INT,
214 CREATE OR REPLACE FUNCTION search.symspell_lookup (
217 verbosity INT DEFAULT 2,
218 xfer_case BOOL DEFAULT FALSE,
219 count_threshold INT DEFAULT 1,
220 soundex_weight INT DEFAULT 0,
221 pg_trgm_weight INT DEFAULT 0,
222 kbdist_weight INT DEFAULT 0
223 ) RETURNS SETOF search.symspell_lookup_output AS $F$
228 edit_list TEXT[] := '{}';
229 seen_list TEXT[] := '{}';
230 output search.symspell_lookup_output;
231 output_list search.symspell_lookup_output[];
239 smallest_ed INT := -1;
242 SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
243 prefix_length := COALESCE(prefix_length, 6);
245 SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
246 maxED := COALESCE(maxED, 3);
248 word_list := ARRAY_AGG(x) FROM search.symspell_parse_words(raw_input) x;
250 -- Common case exact match test for preformance
251 IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
253 'SELECT '||search_class||'_suggestions AS suggestions,
254 '||search_class||'_count AS count,
256 FROM search.symspell_dictionary
257 WHERE prefix_key = $1
258 AND '||search_class||'_count >= $2
259 AND '||search_class||'_suggestions @> ARRAY[$1]'
260 INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1);
261 IF entry.prefix_key IS NOT NULL THEN
262 output.lev_distance := 0; -- definitionally
263 output.prefix_key := entry.prefix_key;
264 output.prefix_key_count := entry.count;
265 output.suggestion_count := entry.count;
266 output.input := word_list[1];
268 output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key);
270 output.suggestion := entry.prefix_key;
272 output.norm_input := entry.prefix_key;
273 output.qwerty_kb_match := 1;
274 output.pg_trgm_sim := 1;
275 output.soundex_sim := 1;
282 FOREACH word IN ARRAY word_list LOOP
284 input := evergreen.lowercase(word);
286 IF CHARACTER_LENGTH(input) > prefix_length THEN
287 prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
288 edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, maxED);
290 edit_list := input || search.symspell_generate_edits(input, 1, maxED);
293 SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;
300 FOREACH entry_key IN ARRAY edit_list LOOP
302 IF global_ed IS NOT NULL THEN
303 smallest_ed := global_ed;
306 'SELECT '||search_class||'_suggestions AS suggestions,
307 '||search_class||'_count AS count,
309 FROM search.symspell_dictionary
310 WHERE prefix_key = $1
311 AND '||search_class||'_suggestions IS NOT NULL'
314 FOREACH sugg IN ARRAY entry.suggestions LOOP
315 IF NOT seen_list @> ARRAY[sugg] THEN
316 seen_list := seen_list || sugg;
317 IF input = sugg THEN -- exact match, no need to spend time on a call
318 output.lev_distance := 0;
319 output.suggestion_count = entry.count;
320 ELSIF ABS(CHARACTER_LENGTH(input) - CHARACTER_LENGTH(sugg)) > maxED THEN
321 -- They are definitionally too different to consider, just move on.
324 --output.lev_distance := levenshtein_less_equal(
325 output.lev_distance := evergreen.levenshtein_damerau_edistance(
330 IF output.lev_distance < 0 THEN
331 -- The Perl module returns -1 for "more distant than max".
332 output.lev_distance := maxED + 1;
333 -- This short-circuit's the count test below for speed, bypassing
334 -- a couple useless tests.
335 output.suggestion_count := -1;
337 EXECUTE 'SELECT '||search_class||'_count FROM search.symspell_dictionary WHERE prefix_key = $1'
338 INTO output.suggestion_count USING sugg;
342 -- The caller passes a minimum suggestion count threshold (or uses
343 -- the default of 0) and if the suggestion has that many or less uses
344 -- then we move on to the next suggestion, since this one is too rare.
345 CONTINUE WHEN output.suggestion_count < COALESCE(count_threshold,1);
347 -- Track the smallest edit distance among suggestions from this prefix key.
348 IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
349 smallest_ed := output.lev_distance;
352 -- Track the smallest edit distance for all prefix keys for this word.
353 IF global_ed IS NULL OR smallest_ed < global_ed THEN
354 global_ed = smallest_ed;
357 -- Only proceed if the edit distance is <= the max for the dictionary.
358 IF output.lev_distance <= maxED THEN
359 IF output.lev_distance > global_ed AND verbosity <= 1 THEN
360 -- Lev distance is our main similarity measure. While
361 -- trgm or soundex similarity could be the main filter,
362 -- Lev is both language agnostic and faster.
364 -- Here we will skip suggestions that have a longer edit distance
365 -- than the shortest we've already found. This is simply an
366 -- optimization that allows us to avoid further processing
367 -- of this entry. It would be filtered out later.
372 -- If we have an exact match on the suggestion key we can also avoid
373 -- some function calls.
374 IF output.lev_distance = 0 THEN
375 output.qwerty_kb_match := 1;
376 output.pg_trgm_sim := 1;
377 output.soundex_sim := 1;
379 output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
380 output.pg_trgm_sim := similarity(input, sugg);
381 output.soundex_sim := difference(input, sugg) / 4.0;
384 -- Fill in some fields
386 output.suggestion := search.symspell_transfer_casing(word, sugg);
388 output.suggestion := sugg;
390 output.prefix_key := entry.prefix_key;
391 output.prefix_key_count := entry.count;
392 output.input := word;
393 output.norm_input := input;
394 output.word_pos := w_pos;
396 -- We can't "cache" a set of generated records directly, so
397 -- here we build up an array of search.symspell_lookup_output
398 -- records that we can revivicate later as a table using UNNEST().
399 output_list := output_list || output;
401 EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
402 CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
403 END IF; -- maxED test
404 END IF; -- suggestion not seen test
405 END LOOP; -- loop over suggestions
406 END LOOP; -- loop over entries
407 END LOOP; -- loop over entry_keys
409 -- Now we're done examining this word
410 IF verbosity = 0 THEN
411 -- Return the "best" suggestion from the smallest edit
412 -- distance group. We define best based on the weighting
413 -- of the non-lev similarity measures and use the suggestion
414 -- use count to break ties.
416 SELECT * FROM UNNEST(output_list)
417 ORDER BY lev_distance,
418 (soundex_sim * COALESCE(soundex_weight,0))
419 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
420 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
421 suggestion_count DESC
423 ELSIF verbosity = 1 THEN
424 -- Return all suggestions from the smallest
425 -- edit distance group.
427 SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed
428 ORDER BY (soundex_sim * COALESCE(soundex_weight,0))
429 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
430 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
431 suggestion_count DESC;
432 ELSIF verbosity = 2 THEN
433 -- Return everything we find, along with relevant stats
435 SELECT * FROM UNNEST(output_list)
436 ORDER BY lev_distance,
437 (soundex_sim * COALESCE(soundex_weight,0))
438 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
439 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
440 suggestion_count DESC;
441 ELSIF verbosity = 3 THEN
442 -- Return everything we find from the two smallest edit distance groups
444 SELECT * FROM UNNEST(output_list)
445 WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY 1 LIMIT 2)
446 ORDER BY lev_distance,
447 (soundex_sim * COALESCE(soundex_weight,0))
448 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
449 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
450 suggestion_count DESC;
451 ELSIF verbosity = 4 THEN
452 -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance
454 SELECT * FROM UNNEST(output_list)
455 WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY 1 LIMIT 2)
456 ORDER BY lev_distance,
457 (soundex_sim * COALESCE(soundex_weight,0))
458 + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
459 + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
460 suggestion_count DESC;
462 END LOOP; -- loop over words
464 $F$ LANGUAGE PLPGSQL;
466 CREATE OR REPLACE FUNCTION search.symspell_build_raw_entry (
469 no_limit BOOL DEFAULT FALSE,
470 prefix_length INT DEFAULT 6,
472 ) RETURNS SETOF search.symspell_dictionary AS $F$
477 entry search.symspell_dictionary%ROWTYPE;
481 IF NOT no_limit AND CHARACTER_LENGTH(raw_input) > prefix_length THEN
482 key := SUBSTRING(key FROM 1 FOR prefix_length);
483 key_list := ARRAY[raw_input, key];
485 key_list := ARRAY[key];
488 FOREACH del_key IN ARRAY key_list LOOP
489 entry.prefix_key := del_key;
491 entry.keyword_count := 0;
492 entry.title_count := 0;
493 entry.author_count := 0;
494 entry.subject_count := 0;
495 entry.series_count := 0;
496 entry.identifier_count := 0;
498 entry.keyword_suggestions := '{}';
499 entry.title_suggestions := '{}';
500 entry.author_suggestions := '{}';
501 entry.subject_suggestions := '{}';
502 entry.series_suggestions := '{}';
503 entry.identifier_suggestions := '{}';
505 IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF;
506 IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF;
507 IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF;
508 IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF;
509 IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF;
510 IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF;
511 IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF;
513 IF del_key = raw_input THEN
514 IF source_class = 'keyword' THEN entry.keyword_count := 1; END IF;
515 IF source_class = 'title' THEN entry.title_count := 1; END IF;
516 IF source_class = 'author' THEN entry.author_count := 1; END IF;
517 IF source_class = 'subject' THEN entry.subject_count := 1; END IF;
518 IF source_class = 'series' THEN entry.series_count := 1; END IF;
519 IF source_class = 'identifier' THEN entry.identifier_count := 1; END IF;
525 FOR del_key IN SELECT x FROM UNNEST(search.symspell_generate_edits(key, 1, maxED)) x LOOP
527 entry.keyword_suggestions := '{}';
528 entry.title_suggestions := '{}';
529 entry.author_suggestions := '{}';
530 entry.subject_suggestions := '{}';
531 entry.series_suggestions := '{}';
532 entry.identifier_suggestions := '{}';
534 IF source_class = 'keyword' THEN entry.keyword_count := 0; END IF;
535 IF source_class = 'title' THEN entry.title_count := 0; END IF;
536 IF source_class = 'author' THEN entry.author_count := 0; END IF;
537 IF source_class = 'subject' THEN entry.subject_count := 0; END IF;
538 IF source_class = 'series' THEN entry.series_count := 0; END IF;
539 IF source_class = 'identifier' THEN entry.identifier_count := 0; END IF;
541 entry.prefix_key := del_key;
543 IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF;
544 IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF;
545 IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF;
546 IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF;
547 IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF;
548 IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF;
549 IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF;
555 $F$ LANGUAGE PLPGSQL STRICT IMMUTABLE;
557 CREATE OR REPLACE FUNCTION search.symspell_build_entries (
560 old_input TEXT DEFAULT NULL,
561 include_phrases BOOL DEFAULT FALSE
562 ) RETURNS SETOF search.symspell_dictionary AS $F$
569 entry search.symspell_dictionary;
571 IF full_input IS NOT NULL THEN
572 SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
573 prefix_length := COALESCE(prefix_length, 6);
575 SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
576 maxED := COALESCE(maxED, 3);
578 input := evergreen.lowercase(full_input);
579 word_list := ARRAY_AGG(x) FROM search.symspell_parse_words_distinct(input) x;
581 IF CARDINALITY(word_list) > 1 AND include_phrases THEN
582 RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(input, source_class, TRUE, prefix_length, maxED);
585 FOREACH word IN ARRAY word_list LOOP
586 RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(word, source_class, FALSE, prefix_length, maxED);
590 IF old_input IS NOT NULL THEN
591 input := evergreen.lowercase(old_input);
593 FOR word IN SELECT x FROM search.symspell_parse_words_distinct(input) x LOOP
594 entry.prefix_key := word;
596 entry.keyword_count := 0;
597 entry.title_count := 0;
598 entry.author_count := 0;
599 entry.subject_count := 0;
600 entry.series_count := 0;
601 entry.identifier_count := 0;
603 entry.keyword_suggestions := '{}';
604 entry.title_suggestions := '{}';
605 entry.author_suggestions := '{}';
606 entry.subject_suggestions := '{}';
607 entry.series_suggestions := '{}';
608 entry.identifier_suggestions := '{}';
610 IF source_class = 'keyword' THEN entry.keyword_count := -1; END IF;
611 IF source_class = 'title' THEN entry.title_count := -1; END IF;
612 IF source_class = 'author' THEN entry.author_count := -1; END IF;
613 IF source_class = 'subject' THEN entry.subject_count := -1; END IF;
614 IF source_class = 'series' THEN entry.series_count := -1; END IF;
615 IF source_class = 'identifier' THEN entry.identifier_count := -1; END IF;
621 $F$ LANGUAGE PLPGSQL;
623 CREATE OR REPLACE FUNCTION search.symspell_build_and_merge_entries (
626 old_input TEXT DEFAULT NULL,
627 include_phrases BOOL DEFAULT FALSE
628 ) RETURNS SETOF search.symspell_dictionary AS $F$
631 conflict_entry RECORD;
634 IF full_input = old_input THEN -- neither NULL, and are the same
638 FOR new_entry IN EXECUTE $q$
641 evergreen.text_array_merge_unique(s,'{}') suggestions
642 FROM (SELECT prefix_key,
643 ARRAY_AGG($q$ || source_class || $q$_suggestions[1]) s,
644 SUM($q$ || source_class || $q$_count) count
645 FROM search.symspell_build_entries($1, $2, $3, $4)
647 $q$ USING full_input, source_class, old_input, include_phrases
651 $q$ || source_class || $q$_suggestions suggestions,
652 $q$ || source_class || $q$_count count
653 FROM search.symspell_dictionary
654 WHERE prefix_key = $1 $q$
656 USING new_entry.prefix_key;
658 IF new_entry.count <> 0 THEN -- Real word, and count changed
659 IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating
660 IF conflict_entry.count > 0 THEN -- it's a real word
661 RETURN QUERY EXECUTE $q$
662 UPDATE search.symspell_dictionary
663 SET $q$ || source_class || $q$_count = $2
664 WHERE prefix_key = $1
666 USING new_entry.prefix_key, GREATEST(0, new_entry.count + conflict_entry.count);
667 ELSE -- it was a prefix key or delete-emptied word before
668 IF conflict_entry.suggestions @> new_entry.suggestions THEN -- already have all suggestions here...
669 RETURN QUERY EXECUTE $q$
670 UPDATE search.symspell_dictionary
671 SET $q$ || source_class || $q$_count = $2
672 WHERE prefix_key = $1
674 USING new_entry.prefix_key, GREATEST(0, new_entry.count);
675 ELSE -- new suggestion!
676 RETURN QUERY EXECUTE $q$
677 UPDATE search.symspell_dictionary
678 SET $q$ || source_class || $q$_count = $2,
679 $q$ || source_class || $q$_suggestions = $3
680 WHERE prefix_key = $1
682 USING new_entry.prefix_key, GREATEST(0, new_entry.count), evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions);
686 -- We keep the on-conflict clause just in case...
687 RETURN QUERY EXECUTE $q$
688 INSERT INTO search.symspell_dictionary AS d (
689 $q$ || source_class || $q$_count,
691 $q$ || source_class || $q$_suggestions
692 ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO
693 UPDATE SET $q$ || source_class || $q$_count = d.$q$ || source_class || $q$_count + EXCLUDED.$q$ || source_class || $q$_count,
694 $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions)
696 USING new_entry.count, new_entry.prefix_key, new_entry.suggestions;
698 ELSE -- key only, or no change
699 IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating
700 IF NOT conflict_entry.suggestions @> new_entry.suggestions THEN -- There are new suggestions
701 RETURN QUERY EXECUTE $q$
702 UPDATE search.symspell_dictionary
703 SET $q$ || source_class || $q$_suggestions = $2
704 WHERE prefix_key = $1
706 USING new_entry.prefix_key, evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions);
709 RETURN QUERY EXECUTE $q$
710 INSERT INTO search.symspell_dictionary AS d (
711 $q$ || source_class || $q$_count,
713 $q$ || source_class || $q$_suggestions
714 ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO -- key exists, suggestions may be added due to this entry
715 UPDATE SET $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions)
717 USING new_entry.count, new_entry.prefix_key, new_entry.suggestions;
722 $F$ LANGUAGE PLPGSQL;
724 CREATE OR REPLACE FUNCTION search.symspell_maintain_entries () RETURNS TRIGGER AS $f$
727 new_value TEXT := NULL;
728 old_value TEXT := NULL;
730 search_class := COALESCE(TG_ARGV[0], SPLIT_PART(TG_TABLE_NAME,'_',1));
732 IF TG_OP IN ('INSERT', 'UPDATE') THEN
733 new_value := NEW.value;
736 IF TG_OP IN ('DELETE', 'UPDATE') THEN
737 old_value := OLD.value;
740 PERFORM * FROM search.symspell_build_and_merge_entries(new_value, search_class, old_value);
742 RETURN NULL; -- always fired AFTER
744 $f$ LANGUAGE PLPGSQL;
746 CREATE TRIGGER maintain_symspell_entries_tgr
747 AFTER INSERT OR UPDATE OR DELETE ON metabib.title_field_entry
748 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
750 CREATE TRIGGER maintain_symspell_entries_tgr
751 AFTER INSERT OR UPDATE OR DELETE ON metabib.author_field_entry
752 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
754 CREATE TRIGGER maintain_symspell_entries_tgr
755 AFTER INSERT OR UPDATE OR DELETE ON metabib.subject_field_entry
756 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
758 CREATE TRIGGER maintain_symspell_entries_tgr
759 AFTER INSERT OR UPDATE OR DELETE ON metabib.series_field_entry
760 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
762 CREATE TRIGGER maintain_symspell_entries_tgr
763 AFTER INSERT OR UPDATE OR DELETE ON metabib.keyword_field_entry
764 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
766 CREATE TRIGGER maintain_symspell_entries_tgr
767 AFTER INSERT OR UPDATE OR DELETE ON metabib.identifier_field_entry
768 FOR EACH ROW EXECUTE PROCEDURE search.symspell_maintain_entries();
772 /* This will generate the queries needed to generate the /file/ that can
773 * be used to populate the dictionary table.
775 select $z$select $y$select $y$||x.id||$y$, '$z$||x.x||$z$', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '$z$||x||$z$');$y$ from metabib.$z$||x||$z$_field_entry x;$z$ from (select 'keyword'::text x union select 'title' union select 'author' union select 'subject' union select 'series' union select 'identifier') x;
780 \qecho 'The following should be run at the end of the upgrade before any'
781 \qecho 'reingest occurs. Because new triggers are installed already,'
782 \qecho 'updates to indexed strings will cause zero-count dictionary entries'
783 \qecho 'to be recorded which will require updating every row again (or'
784 \qecho 'starting from scratch) so best to do this before other batch'
785 \qecho 'changes. A later reingest that does not significantly change'
786 \qecho 'indexed strings will /not/ cause table bloat here, and will be'
787 \qecho 'as fast as normal. A copy of the SQL in a ready-to-use, non-escaped'
788 \qecho 'form is available inside a comment at the end of this upgrade sub-'
789 \qecho 'script so you do not need to copy this comment from the psql ouptut.'
795 \qecho 'select value from metabib.title_field_entry;'
797 \qecho 'select value from metabib.author_field_entry;'
799 \qecho 'select value from metabib.subject_field_entry;'
801 \qecho 'select value from metabib.series_field_entry;'
802 \qecho '\\o identifier'
803 \qecho 'select value from metabib.identifier_field_entry;'
805 \qecho 'select value from metabib.keyword_field_entry;'
811 \qecho '// Then, at the command line:'
813 \qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql'
814 \qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql'
815 \qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql'
816 \qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql'
817 \qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql'
818 \qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql'
820 \qecho '// And, back in psql'
822 \qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;'
823 \qecho 'TRUNCATE search.symspell_dictionary;'
825 \qecho '\\i identifier.sql'
826 \qecho '\\i author.sql'
827 \qecho '\\i title.sql'
828 \qecho '\\i subject.sql'
829 \qecho '\\i series.sql'
830 \qecho '\\i keyword.sql'
832 \qecho 'CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;'
833 \qecho 'REINDEX TABLE search.symspell_dictionary;'
834 \qecho 'ALTER TABLE search.symspell_dictionary SET LOGGED;'
835 \qecho 'VACUUM ANALYZE search.symspell_dictionary;'
837 \qecho 'DROP TABLE search.symspell_dictionary_partial_title;'
838 \qecho 'DROP TABLE search.symspell_dictionary_partial_author;'
839 \qecho 'DROP TABLE search.symspell_dictionary_partial_subject;'
840 \qecho 'DROP TABLE search.symspell_dictionary_partial_series;'
841 \qecho 'DROP TABLE search.symspell_dictionary_partial_identifier;'
842 \qecho 'DROP TABLE search.symspell_dictionary_partial_keyword;'
850 select value from metabib.title_field_entry;
853 select value from metabib.author_field_entry;
856 select value from metabib.subject_field_entry;
859 select value from metabib.series_field_entry;
862 select value from metabib.identifier_field_entry;
865 select value from metabib.keyword_field_entry;
871 // Then, at the command line:
873 $ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql
874 $ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql
875 $ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql
876 $ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql
877 $ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql
878 $ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql
880 // To the extent your hardware allows, the above commands can be run in
881 // in parallel, in different shells. Each will use a full CPU, and RAM
882 // may be a limiting resource, so keep an eye on that with `top`.
887 ALTER TABLE search.symspell_dictionary SET UNLOGGED;
888 TRUNCATE search.symspell_dictionary;
897 CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;
898 REINDEX TABLE search.symspell_dictionary;
899 ALTER TABLE search.symspell_dictionary SET LOGGED;
900 VACUUM ANALYZE search.symspell_dictionary;
902 DROP TABLE search.symspell_dictionary_partial_title;
903 DROP TABLE search.symspell_dictionary_partial_author;
904 DROP TABLE search.symspell_dictionary_partial_subject;
905 DROP TABLE search.symspell_dictionary_partial_series;
906 DROP TABLE search.symspell_dictionary_partial_identifier;
907 DROP TABLE search.symspell_dictionary_partial_keyword;