4 use List::MoreUtils qw/uniq/;
# Build %dict from the input read through <>, then write it out as rows for
# a partial dictionary table and merge that into search.symspell_dictionary.
# Each %dict value is a two-element array ref, initialised as [0,[]]:
# element 0 is compared against 1 and printed as a scalar, element 1 is a
# list of raw tokens (presumably the count and suggestions columns; verify).
# NOTE(review): several statements of this script are not visible in this
# view (declarations, where $key, @words, $line and $etime are set, and the
# closing braces), so the comments below describe only the visible lines.
18 while (my $data = <>) {
# Drop the trailing newline and fold to lower case before tokenising.
21 chomp($data); $data=lc($data);
# Tokens are a run of word characters, optionally followed by apostrophes
# and more word characters. \w already matches digits, so \d is redundant.
24 while( $data =~ m/([\w\d]+'*[\w\d]*)/g ) {
# Visit each distinct entry of @words once.
# NOTE(review): presumably @words is filled from the match above; confirm.
28 for my $raw (uniq @words) {
30 my $rlen = length($raw);
# Skip tokens containing five or more consecutive digits. The length test
# cannot fail once the pattern has matched, so it adds no extra filtering.
31 next if ($raw =~ /\d{5,}/ and $rlen > 4);
# NOTE(review): $key is assigned on a line not shown here, presumably to
# $raw, since it is only reassigned to the prefix further down; verify.
33 $dict{$key} //= [0,[]];
# NOTE(review): element 0 is presumably incremented on a line not shown
# between the initialisation above and this test; confirm.
36 if ($dict{$key}[0] == 1) { # first time we've seen it, need to generate prefix keys
# Record the token in the list stored under the current key.
37 push @{$dict{$key}[1]}, $raw;
# Prefix key: the first $plen characters of the token.
40 $key = substr($raw,0,$plen);
41 $dict{$key} //= [0,[]];
42 push @{$dict{$key}[1]}, $raw;
# Also record the token under every edit generated from the prefix key.
# NOTE(review): the second argument looks like the starting edit distance,
# judging by the recursive call that passes $dist + 1; confirm.
45 for my $edit (symspell_generate_edits($key, 1)) {
# Ignore empty edits.
46 next unless length($edit);
# Ignore edits no longer than $plen - $maxed when the token itself is
# longer than $plen.
47 next if (length($edit) <= ($plen - $maxed) and $rlen > $plen);
48 $dict{$edit} //= [0,[]];
49 push @{$dict{$edit}[1]}, $raw;
# Progress report on STDERR whenever $line is a multiple of 10000.
54 unless ($line % 10000) {
56 $secs = $etime - $stime;
57 warn "$line lines consumed from input in $secs seconds...\n";
# Report the elapsed time once the dictionary has been built.
62 $secs = $etime - $stime;
63 warn "Dictionary built in $secs seconds, writing...\n";
69 CREATE UNLOGGED TABLE search.symspell_dictionary_partial_$class (
72 ${class}_suggestions TEXT[]
75 COPY search.symspell_dictionary_partial_$class FROM STDIN;
78 while ( my ($key, $cl_dict) = each %dict ) {
# One tab-separated row per key: the key, element 0, and element 1 rendered
# as a brace-delimited, comma-separated list with duplicates removed, or \N
# when the list is empty.
# NOTE(review): list elements are joined without any quoting or escaping;
# presumably these rows are the data for the COPY statement above; confirm.
80 print join( "\t", $key, $$cl_dict[0], (scalar(@{$$cl_dict[1]}) ? '{'.join(',', uniq @{$$cl_dict[1]}).'}' : '\N')) . "\n";
87 INSERT INTO search.symspell_dictionary (prefix_key, ${class}_count, ${class}_suggestions)
88 SELECT * FROM search.symspell_dictionary_partial_$class
89 ON CONFLICT (prefix_key) DO UPDATE
90 SET ${class}_count = EXCLUDED.${class}_count,
91 ${class}_suggestions = EXCLUDED.${class}_suggestions;
96 $secs = $etime - $stime;
# Final summary on STDERR; $counter is maintained on a line not shown here.
97 warn "$counter dictionary prefix key entries written in $secs seconds.\n";
99 sub symspell_generate_edits {
105 my $len = length($word);
107 while ( $c <= $len ) {
108 my $item = substr($word, 0, $c - 1) . substr($word, $c);
110 if ($dist < $maxed) {
111 push @sublist, symspell_generate_edits($item, $dist + 1);
116 push @list, @sublist;
119 #warn join(', ', uniq @list) . "\n";