Update to Unicode 16.0.0
[xapian.git] / xapian-core / backends / honey / honey_spelling.cc
blob8354baf07785f31f74d745480c71b85a07eebe44
1 /** @file
2 * @brief Spelling correction data for a honey database.
3 */
4 /* Copyright (C) 2004-2024 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include <xapian/error.h>
24 #include <xapian/types.h>
26 #include "expand/expandweight.h"
27 #include "expand/termlistmerger.h"
28 #include "honey_spelling.h"
29 #include "omassert.h"
30 #include "pack.h"
32 #include "../prefix_compressed_strings.h"
34 #include <algorithm>
35 #include <map>
36 #include <queue>
37 #include <vector>
38 #include <set>
39 #include <string>
40 #include <string_view>
42 using namespace Honey;
43 using namespace std;
45 void
46 HoneySpellingTable::merge_changes()
48 for (auto i : termlist_deltas) {
49 const string& key = i.first;
50 const set<string>& changes = i.second;
52 auto d = changes.begin();
53 if (d == changes.end()) continue;
55 string updated;
56 string current;
57 PrefixCompressedStringWriter out(updated);
58 if (get_exact_entry(key, current)) {
59 PrefixCompressedStringItor in(current, key);
60 updated.reserve(current.size()); // FIXME plus some?
61 while (!in.at_end() && d != changes.end()) {
62 const string& word = *in;
63 Assert(d != changes.end());
64 int cmp = word.compare(*d);
65 if (cmp < 0) {
66 out.append(word);
67 ++in;
68 } else if (cmp > 0) {
69 out.append(*d);
70 ++d;
71 } else {
72 // If an existing entry is in the changes list, that means
73 // we should remove it.
74 ++in;
75 ++d;
78 if (!in.at_end()) {
79 // FIXME : easy to optimise this to a fix-up and substring copy.
80 while (!in.at_end()) {
81 out.append(*in++);
85 while (d != changes.end()) {
86 out.append(*d++);
88 if (!updated.empty()) {
89 add(key, updated);
90 } else {
91 del(key);
94 termlist_deltas.clear();
96 for (auto j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
97 const string& key = make_spelling_wordlist_key(j->first);
98 Xapian::termcount wordfreq = j->second;
99 if (wordfreq) {
100 string tag;
101 pack_uint_last(tag, wordfreq);
102 add(key, tag);
103 if (wordfreq > wordfreq_upper_bound)
104 wordfreq_upper_bound = wordfreq;
105 } else {
106 del(key);
109 wordfreq_changes.clear();
112 void
113 HoneySpellingTable::toggle_fragment(fragment frag, const string& word)
115 auto i = termlist_deltas.find(frag);
116 if (i == termlist_deltas.end()) {
117 i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
119 // The commonest case is that we're adding lots of words, so try insert
120 // first and if that reports that the word already exists, remove it.
121 auto res = i->second.insert(word);
122 if (!res.second) {
123 // word is already in the set, so remove it.
124 i->second.erase(res.first);
128 void
129 HoneySpellingTable::add_word(const string& word, Xapian::termcount freqinc)
131 if (word.size() <= 1) return;
133 auto i = wordfreq_changes.find(word);
134 if (i != wordfreq_changes.end()) {
135 // Word "word" already exists and has been modified.
136 if (i->second) {
137 i->second += freqinc;
138 return;
140 // If "word" is currently modified such that it no longer exists, so
141 // we need to execute the code below to re-add trigrams for it.
142 i->second = freqinc;
143 } else {
144 string data;
145 if (get_exact_entry(make_spelling_wordlist_key(word), data)) {
146 // Word "word" already exists, so increment its count.
147 Xapian::termcount freq;
148 const char* p = data.data();
149 if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
150 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
152 wordfreq_changes[word] = freq + freqinc;
153 return;
155 wordfreq_changes[word] = freqinc;
158 // Add trigrams for word.
159 toggle_word(word);
162 Xapian::termcount
163 HoneySpellingTable::remove_word(const string& word, Xapian::termcount freqdec)
165 if (word.size() <= 1) return freqdec;
167 auto i = wordfreq_changes.find(word);
168 if (i != wordfreq_changes.end()) {
169 if (i->second == 0) {
170 // Word has already been deleted.
171 return freqdec;
173 // Word "word" exists and has been modified.
174 if (freqdec < i->second) {
175 i->second -= freqdec;
176 return 0;
178 freqdec -= i->second;
180 // Mark word as deleted.
181 i->second = 0;
182 } else {
183 string data;
184 if (!get_exact_entry(make_spelling_wordlist_key(word), data)) {
185 // This word doesn't exist.
186 return freqdec;
189 Xapian::termcount freq;
190 const char* p = data.data();
191 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
192 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
194 if (freqdec < freq) {
195 wordfreq_changes[word] = freq - freqdec;
196 return 0;
198 freqdec -= freq;
200 // Mark word as deleted.
201 wordfreq_changes[word] = 0;
204 // Remove trigrams for word.
205 toggle_word(word);
207 return freqdec;
210 void
211 HoneySpellingTable::toggle_word(const string& word)
213 fragment buf(0);
215 if (word.size() <= 4) {
216 // We also generate 'bookends' for two, three, and four character
217 // terms so we can handle transposition of the middle two characters
218 // of a four character word, substitution or deletion of the middle
219 // character of a three character word, or insertion in the middle of a
220 // two character word.
221 // 'Bookends':
222 buf[0] = KEY_PREFIX_BOOKEND;
223 buf[1] = word[0];
224 buf[2] = word[word.size() - 1];
225 toggle_fragment(buf, word);
228 // Head:
229 buf[0] = KEY_PREFIX_HEAD;
230 buf[1] = word[0];
231 buf[2] = word[1];
232 toggle_fragment(buf, word);
234 // Tail:
235 buf[0] = KEY_PREFIX_TAIL;
236 buf[1] = word[word.size() - 2];
237 buf[2] = word[word.size() - 1];
238 toggle_fragment(buf, word);
240 if (word.size() > 2) {
241 set<fragment> done;
242 // Middles:
243 buf[0] = KEY_PREFIX_MIDDLE;
244 for (size_t start = 0; start <= word.size() - 3; ++start) {
245 memcpy(buf.data + 1, word.data() + start, 3);
246 // Don't toggle the same fragment twice or it will cancel out.
247 // Bug fixed in 1.2.6.
248 if (done.insert(buf).second)
249 toggle_fragment(buf, word);
254 struct TermListGreaterApproxSize {
255 bool operator()(const TermList* a, const TermList* b) const {
256 return a->get_approx_size() > b->get_approx_size();
260 TermList*
261 HoneySpellingTable::open_termlist(string_view word)
263 // This should have been handled by Database::get_spelling_suggestion().
264 AssertRel(word.size(),>,1);
266 // Merge any pending changes to disk, but don't call commit() so they
267 // won't be switched live.
268 if (!wordfreq_changes.empty()) merge_changes();
270 vector<TermList*> termlists;
271 try {
272 string data;
273 fragment buf(0);
275 if (word.size() <= 4) {
276 // We also generate 'bookends' for two, three, and four character
277 // terms so we can handle transposition of the middle two
278 // characters of a four character word, substitution or deletion of
279 // the middle character of a three character word, or insertion in
280 // the middle of a two character word.
281 buf[0] = KEY_PREFIX_BOOKEND;
282 buf[1] = word[0];
283 buf[2] = word[word.size() - 1];
284 if (get_exact_entry(string(buf), data))
285 termlists.push_back(new HoneySpellingTermList(data, buf.data));
288 // Head:
289 buf[0] = KEY_PREFIX_HEAD;
290 buf[1] = word[0];
291 buf[2] = word[1];
292 if (get_exact_entry(string(buf), data))
293 termlists.push_back(new HoneySpellingTermList(data, buf.data));
295 if (word.size() == 2) {
296 // For two letter words, we generate H and T terms for the
297 // transposed form so that we can produce good spelling
298 // suggestions.
299 // AB -> BA
300 buf[1] = word[1];
301 buf[2] = word[0];
302 if (get_exact_entry(string(buf), data))
303 termlists.push_back(new HoneySpellingTermList(data, buf.data));
304 buf[0] = KEY_PREFIX_TAIL;
305 if (get_exact_entry(string(buf), data))
306 termlists.push_back(new HoneySpellingTermList(data, buf.data));
309 // Tail:
310 buf[0] = KEY_PREFIX_TAIL;
311 buf[1] = word[word.size() - 2];
312 buf[2] = word[word.size() - 1];
313 if (get_exact_entry(string(buf), data))
314 termlists.push_back(new HoneySpellingTermList(data, buf.data));
316 if (word.size() > 2) {
317 // Middles:
318 buf[0] = KEY_PREFIX_MIDDLE;
319 for (size_t start = 0; start <= word.size() - 3; ++start) {
320 memcpy(buf.data + 1, word.data() + start, 3);
321 if (get_exact_entry(string(buf), data))
322 termlists.push_back(new HoneySpellingTermList(data));
325 if (word.size() == 3) {
326 // For three letter words, we generate the two "single
327 // transposition" forms too, so that we can produce good
328 // spelling suggestions.
329 // ABC -> BAC
330 buf[1] = word[1];
331 buf[2] = word[0];
332 if (get_exact_entry(string(buf), data))
333 termlists.push_back(new HoneySpellingTermList(data));
334 // ABC -> ACB
335 buf[1] = word[0];
336 buf[2] = word[2];
337 buf[3] = word[1];
338 if (get_exact_entry(string(buf), data))
339 termlists.push_back(new HoneySpellingTermList(data));
343 return make_termlist_merger(termlists);
344 } catch (...) {
345 // Make sure we delete all the TermList objects to avoid leaking
346 // memory.
347 for (auto& t : termlists) {
348 delete t;
350 throw;
354 Xapian::doccount
355 HoneySpellingTable::get_word_frequency(string_view word) const
357 auto i = wordfreq_changes.find(word);
358 if (i != wordfreq_changes.end()) {
359 // Modified frequency for word:
360 return i->second;
363 string data;
364 if (get_exact_entry(make_spelling_wordlist_key(word), data)) {
365 // Word "word" already exists.
366 Xapian::termcount freq;
367 const char* p = data.data();
368 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
369 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
371 return freq;
374 return 0;
377 ///////////////////////////////////////////////////////////////////////////
379 Xapian::termcount
380 HoneySpellingTermList::get_approx_size() const
382 // This is only used to decide how to build a OR-tree of TermList objects
383 // so we just need to return "sizes" which are ordered roughly correctly.
384 return data.size();
387 Xapian::termcount
388 HoneySpellingTermList::get_wdf() const
390 return 1;
393 Xapian::doccount
394 HoneySpellingTermList::get_termfreq() const
396 return 1;
399 TermList*
400 HoneySpellingTermList::next()
402 if (p == data.size()) {
403 return this;
406 size_t keep = 0;
407 if (rare(tail < 0)) {
408 tail += 2;
409 keep = current_term.size() - tail;
410 } else if (usual(!current_term.empty())) {
411 keep = data[p++] ^ MAGIC_XOR_VALUE;
413 size_t add;
414 if (p == data.size() ||
415 (add = data[p] ^ MAGIC_XOR_VALUE) >= data.size() - p) {
416 throw Xapian::DatabaseCorruptError("Bad spelling data (too little "
417 "left)");
419 if (rare(keep + tail > current_term.size())) {
420 // The initial part to keep overlaps with the tail part which is an
421 // unusual case requiring special handling.
422 string tail_string(current_term, current_term.size() - tail);
423 current_term.replace(keep, string::npos, data.data() + p + 1, add);
424 current_term += tail_string;
425 } else {
426 current_term.replace(keep, current_term.size() - tail - keep,
427 data.data() + p + 1, add);
429 p += add + 1;
431 return NULL;
434 TermList*
435 HoneySpellingTermList::skip_to(string_view term)
437 while (current_term < term) {
438 if (HoneySpellingTermList::next())
439 return this;
441 return NULL;
444 Xapian::termcount
445 HoneySpellingTermList::positionlist_count() const
447 throw
448 Xapian::UnimplementedError("HoneySpellingTermList::"
449 "positionlist_count() "
450 "not implemented");
453 PositionList*
454 HoneySpellingTermList::positionlist_begin() const
456 throw
457 Xapian::UnimplementedError("HoneySpellingTermList::"
458 "positionlist_begin() "
459 "not implemented");