2 * @brief Spelling correction data for a honey database.
4 /* Copyright (C) 2004-2024 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <xapian/error.h>
24 #include <xapian/types.h>
26 #include "expand/expandweight.h"
27 #include "expand/termlistmerger.h"
28 #include "honey_spelling.h"
32 #include "../prefix_compressed_strings.h"
40 #include <string_view>
42 using namespace Honey
;
46 HoneySpellingTable::merge_changes()
48 for (auto i
: termlist_deltas
) {
49 const string
& key
= i
.first
;
50 const set
<string
>& changes
= i
.second
;
52 auto d
= changes
.begin();
53 if (d
== changes
.end()) continue;
57 PrefixCompressedStringWriter
out(updated
);
58 if (get_exact_entry(key
, current
)) {
59 PrefixCompressedStringItor
in(current
, key
);
60 updated
.reserve(current
.size()); // FIXME plus some?
61 while (!in
.at_end() && d
!= changes
.end()) {
62 const string
& word
= *in
;
63 Assert(d
!= changes
.end());
64 int cmp
= word
.compare(*d
);
72 // If an existing entry is in the changes list, that means
73 // we should remove it.
79 // FIXME : easy to optimise this to a fix-up and substring copy.
80 while (!in
.at_end()) {
85 while (d
!= changes
.end()) {
88 if (!updated
.empty()) {
94 termlist_deltas
.clear();
96 for (auto j
= wordfreq_changes
.begin(); j
!= wordfreq_changes
.end(); ++j
) {
97 const string
& key
= make_spelling_wordlist_key(j
->first
);
98 Xapian::termcount wordfreq
= j
->second
;
101 pack_uint_last(tag
, wordfreq
);
103 if (wordfreq
> wordfreq_upper_bound
)
104 wordfreq_upper_bound
= wordfreq
;
109 wordfreq_changes
.clear();
113 HoneySpellingTable::toggle_fragment(fragment frag
, const string
& word
)
115 auto i
= termlist_deltas
.find(frag
);
116 if (i
== termlist_deltas
.end()) {
117 i
= termlist_deltas
.insert(make_pair(frag
, set
<string
>())).first
;
119 // The commonest case is that we're adding lots of words, so try insert
120 // first and if that reports that the word already exists, remove it.
121 auto res
= i
->second
.insert(word
);
123 // word is already in the set, so remove it.
124 i
->second
.erase(res
.first
);
129 HoneySpellingTable::add_word(const string
& word
, Xapian::termcount freqinc
)
131 if (word
.size() <= 1) return;
133 auto i
= wordfreq_changes
.find(word
);
134 if (i
!= wordfreq_changes
.end()) {
135 // Word "word" already exists and has been modified.
137 i
->second
+= freqinc
;
140 // If "word" is currently modified such that it no longer exists, so
141 // we need to execute the code below to re-add trigrams for it.
145 if (get_exact_entry(make_spelling_wordlist_key(word
), data
)) {
146 // Word "word" already exists, so increment its count.
147 Xapian::termcount freq
;
148 const char* p
= data
.data();
149 if (!unpack_uint_last(&p
, p
+ data
.size(), &freq
) || freq
== 0) {
150 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
152 wordfreq_changes
[word
] = freq
+ freqinc
;
155 wordfreq_changes
[word
] = freqinc
;
158 // Add trigrams for word.
163 HoneySpellingTable::remove_word(const string
& word
, Xapian::termcount freqdec
)
165 if (word
.size() <= 1) return freqdec
;
167 auto i
= wordfreq_changes
.find(word
);
168 if (i
!= wordfreq_changes
.end()) {
169 if (i
->second
== 0) {
170 // Word has already been deleted.
173 // Word "word" exists and has been modified.
174 if (freqdec
< i
->second
) {
175 i
->second
-= freqdec
;
178 freqdec
-= i
->second
;
180 // Mark word as deleted.
184 if (!get_exact_entry(make_spelling_wordlist_key(word
), data
)) {
185 // This word doesn't exist.
189 Xapian::termcount freq
;
190 const char* p
= data
.data();
191 if (!unpack_uint_last(&p
, p
+ data
.size(), &freq
)) {
192 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
194 if (freqdec
< freq
) {
195 wordfreq_changes
[word
] = freq
- freqdec
;
200 // Mark word as deleted.
201 wordfreq_changes
[word
] = 0;
204 // Remove trigrams for word.
211 HoneySpellingTable::toggle_word(const string
& word
)
215 if (word
.size() <= 4) {
216 // We also generate 'bookends' for two, three, and four character
217 // terms so we can handle transposition of the middle two characters
218 // of a four character word, substitution or deletion of the middle
219 // character of a three character word, or insertion in the middle of a
220 // two character word.
222 buf
[0] = KEY_PREFIX_BOOKEND
;
224 buf
[2] = word
[word
.size() - 1];
225 toggle_fragment(buf
, word
);
229 buf
[0] = KEY_PREFIX_HEAD
;
232 toggle_fragment(buf
, word
);
235 buf
[0] = KEY_PREFIX_TAIL
;
236 buf
[1] = word
[word
.size() - 2];
237 buf
[2] = word
[word
.size() - 1];
238 toggle_fragment(buf
, word
);
240 if (word
.size() > 2) {
243 buf
[0] = KEY_PREFIX_MIDDLE
;
244 for (size_t start
= 0; start
<= word
.size() - 3; ++start
) {
245 memcpy(buf
.data
+ 1, word
.data() + start
, 3);
246 // Don't toggle the same fragment twice or it will cancel out.
247 // Bug fixed in 1.2.6.
248 if (done
.insert(buf
).second
)
249 toggle_fragment(buf
, word
);
254 struct TermListGreaterApproxSize
{
255 bool operator()(const TermList
* a
, const TermList
* b
) const {
256 return a
->get_approx_size() > b
->get_approx_size();
261 HoneySpellingTable::open_termlist(string_view word
)
263 // This should have been handled by Database::get_spelling_suggestion().
264 AssertRel(word
.size(),>,1);
266 // Merge any pending changes to disk, but don't call commit() so they
267 // won't be switched live.
268 if (!wordfreq_changes
.empty()) merge_changes();
270 vector
<TermList
*> termlists
;
275 if (word
.size() <= 4) {
276 // We also generate 'bookends' for two, three, and four character
277 // terms so we can handle transposition of the middle two
278 // characters of a four character word, substitution or deletion of
279 // the middle character of a three character word, or insertion in
280 // the middle of a two character word.
281 buf
[0] = KEY_PREFIX_BOOKEND
;
283 buf
[2] = word
[word
.size() - 1];
284 if (get_exact_entry(string(buf
), data
))
285 termlists
.push_back(new HoneySpellingTermList(data
, buf
.data
));
289 buf
[0] = KEY_PREFIX_HEAD
;
292 if (get_exact_entry(string(buf
), data
))
293 termlists
.push_back(new HoneySpellingTermList(data
, buf
.data
));
295 if (word
.size() == 2) {
296 // For two letter words, we generate H and T terms for the
297 // transposed form so that we can produce good spelling
302 if (get_exact_entry(string(buf
), data
))
303 termlists
.push_back(new HoneySpellingTermList(data
, buf
.data
));
304 buf
[0] = KEY_PREFIX_TAIL
;
305 if (get_exact_entry(string(buf
), data
))
306 termlists
.push_back(new HoneySpellingTermList(data
, buf
.data
));
310 buf
[0] = KEY_PREFIX_TAIL
;
311 buf
[1] = word
[word
.size() - 2];
312 buf
[2] = word
[word
.size() - 1];
313 if (get_exact_entry(string(buf
), data
))
314 termlists
.push_back(new HoneySpellingTermList(data
, buf
.data
));
316 if (word
.size() > 2) {
318 buf
[0] = KEY_PREFIX_MIDDLE
;
319 for (size_t start
= 0; start
<= word
.size() - 3; ++start
) {
320 memcpy(buf
.data
+ 1, word
.data() + start
, 3);
321 if (get_exact_entry(string(buf
), data
))
322 termlists
.push_back(new HoneySpellingTermList(data
));
325 if (word
.size() == 3) {
326 // For three letter words, we generate the two "single
327 // transposition" forms too, so that we can produce good
328 // spelling suggestions.
332 if (get_exact_entry(string(buf
), data
))
333 termlists
.push_back(new HoneySpellingTermList(data
));
338 if (get_exact_entry(string(buf
), data
))
339 termlists
.push_back(new HoneySpellingTermList(data
));
343 return make_termlist_merger(termlists
);
345 // Make sure we delete all the TermList objects to avoid leaking
347 for (auto& t
: termlists
) {
355 HoneySpellingTable::get_word_frequency(string_view word
) const
357 auto i
= wordfreq_changes
.find(word
);
358 if (i
!= wordfreq_changes
.end()) {
359 // Modified frequency for word:
364 if (get_exact_entry(make_spelling_wordlist_key(word
), data
)) {
365 // Word "word" already exists.
366 Xapian::termcount freq
;
367 const char* p
= data
.data();
368 if (!unpack_uint_last(&p
, p
+ data
.size(), &freq
)) {
369 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
377 ///////////////////////////////////////////////////////////////////////////
380 HoneySpellingTermList::get_approx_size() const
382 // This is only used to decide how to build a OR-tree of TermList objects
383 // so we just need to return "sizes" which are ordered roughly correctly.
388 HoneySpellingTermList::get_wdf() const
394 HoneySpellingTermList::get_termfreq() const
400 HoneySpellingTermList::next()
402 if (p
== data
.size()) {
407 if (rare(tail
< 0)) {
409 keep
= current_term
.size() - tail
;
410 } else if (usual(!current_term
.empty())) {
411 keep
= data
[p
++] ^ MAGIC_XOR_VALUE
;
414 if (p
== data
.size() ||
415 (add
= data
[p
] ^ MAGIC_XOR_VALUE
) >= data
.size() - p
) {
416 throw Xapian::DatabaseCorruptError("Bad spelling data (too little "
419 if (rare(keep
+ tail
> current_term
.size())) {
420 // The initial part to keep overlaps with the tail part which is an
421 // unusual case requiring special handling.
422 string
tail_string(current_term
, current_term
.size() - tail
);
423 current_term
.replace(keep
, string::npos
, data
.data() + p
+ 1, add
);
424 current_term
+= tail_string
;
426 current_term
.replace(keep
, current_term
.size() - tail
- keep
,
427 data
.data() + p
+ 1, add
);
435 HoneySpellingTermList::skip_to(string_view term
)
437 while (current_term
< term
) {
438 if (HoneySpellingTermList::next())
445 HoneySpellingTermList::positionlist_count() const
448 Xapian::UnimplementedError("HoneySpellingTermList::"
449 "positionlist_count() "
454 HoneySpellingTermList::positionlist_begin() const
457 Xapian::UnimplementedError("HoneySpellingTermList::"
458 "positionlist_begin() "