[honey] Improve spelling table key encoding
[xapian.git] / xapian-core / backends / honey / honey_spelling.cc
blob35c2f9c51704e3a312191d1a8ccd063f081ceede
1 /** @file honey_spelling.cc
2 * @brief Spelling correction data for a honey database.
3 */
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2015,2017,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include <xapian/error.h>
24 #include <xapian/types.h>
26 #include "expand/expandweight.h"
27 #include "honey_spelling.h"
28 #include "omassert.h"
29 #include "expand/ortermlist.h"
30 #include "pack.h"
32 #include "../prefix_compressed_strings.h"
34 #include <algorithm>
35 #include <map>
36 #include <queue>
37 #include <vector>
38 #include <set>
39 #include <string>
41 using namespace Honey;
42 using namespace std;
44 void
45 HoneySpellingTable::merge_changes()
47 map<fragment, set<string> >::const_iterator i;
48 for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
49 string key = i->first;
50 const set<string> & changes = i->second;
52 set<string>::const_iterator d = changes.begin();
53 if (d == changes.end()) continue;
55 string updated;
56 string current;
57 PrefixCompressedStringWriter out(updated);
58 if (get_exact_entry(key, current)) {
59 PrefixCompressedStringItor in(current);
60 updated.reserve(current.size()); // FIXME plus some?
61 while (!in.at_end() && d != changes.end()) {
62 const string & word = *in;
63 Assert(d != changes.end());
64 int cmp = word.compare(*d);
65 if (cmp < 0) {
66 out.append(word);
67 ++in;
68 } else if (cmp > 0) {
69 out.append(*d);
70 ++d;
71 } else {
72 // If an existing entry is in the changes list, that means
73 // we should remove it.
74 ++in;
75 ++d;
78 if (!in.at_end()) {
79 // FIXME : easy to optimise this to a fix-up and substring copy.
80 while (!in.at_end()) {
81 out.append(*in++);
85 while (d != changes.end()) {
86 out.append(*d++);
88 if (!updated.empty()) {
89 add(key, updated);
90 } else {
91 del(key);
94 termlist_deltas.clear();
96 map<string, Xapian::termcount>::const_iterator j;
97 for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
98 const string& key = make_spelling_wordlist_key(j->first);
99 Xapian::termcount wordfreq = j->second;
100 if (wordfreq) {
101 string tag;
102 pack_uint_last(tag, wordfreq);
103 add(key, tag);
104 if (wordfreq > wordfreq_upper_bound)
105 wordfreq_upper_bound = wordfreq;
106 } else {
107 del(key);
110 wordfreq_changes.clear();
113 void
114 HoneySpellingTable::toggle_fragment(fragment frag, const string & word)
116 map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
117 if (i == termlist_deltas.end()) {
118 i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
120 // The commonest case is that we're adding lots of words, so try insert
121 // first and if that reports that the word already exists, remove it.
122 pair<set<string>::iterator, bool> res = i->second.insert(word);
123 if (!res.second) {
124 // word is already in the set, so remove it.
125 i->second.erase(res.first);
129 void
130 HoneySpellingTable::add_word(const string & word, Xapian::termcount freqinc)
132 if (word.size() <= 1) return;
134 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
135 if (i != wordfreq_changes.end()) {
136 // Word "word" already exists and has been modified.
137 if (i->second) {
138 i->second += freqinc;
139 return;
141 // If "word" is currently modified such that it no longer exists, so
142 // we need to execute the code below to re-add trigrams for it.
143 i->second = freqinc;
144 } else {
145 string data;
146 if (get_exact_entry(make_spelling_wordlist_key(word), data)) {
147 // Word "word" already exists, so increment its count.
148 Xapian::termcount freq;
149 const char * p = data.data();
150 if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
151 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
153 wordfreq_changes[word] = freq + freqinc;
154 return;
156 wordfreq_changes[word] = freqinc;
159 // Add trigrams for word.
160 toggle_word(word);
163 Xapian::termcount
164 HoneySpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
166 if (word.size() <= 1) return freqdec;
168 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
169 if (i != wordfreq_changes.end()) {
170 if (i->second == 0) {
171 // Word has already been deleted.
172 return freqdec;
174 // Word "word" exists and has been modified.
175 if (freqdec < i->second) {
176 i->second -= freqdec;
177 return 0;
179 freqdec -= i->second;
181 // Mark word as deleted.
182 i->second = 0;
183 } else {
184 string data;
185 if (!get_exact_entry(make_spelling_wordlist_key(word), data)) {
186 // This word doesn't exist.
187 return freqdec;
190 Xapian::termcount freq;
191 const char *p = data.data();
192 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
193 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
195 if (freqdec < freq) {
196 wordfreq_changes[word] = freq - freqdec;
197 return 0;
199 freqdec -= freq;
201 // Mark word as deleted.
202 wordfreq_changes[word] = 0;
205 // Remove trigrams for word.
206 toggle_word(word);
208 return freqdec;
211 void
212 HoneySpellingTable::toggle_word(const string & word)
214 fragment buf(0);
216 if (word.size() <= 4) {
217 // We also generate 'bookends' for two, three, and four character
218 // terms so we can handle transposition of the middle two characters
219 // of a four character word, substitution or deletion of the middle
220 // character of a three character word, or insertion in the middle of a
221 // two character word.
222 // 'Bookends':
223 buf[0] = '\x00';
224 buf[1] = word[0];
225 buf[2] = word[word.size() - 1];
226 toggle_fragment(buf, word);
229 // Head:
230 buf[0] = '\x01';
231 buf[1] = word[0];
232 buf[2] = word[1];
233 toggle_fragment(buf, word);
235 // Tail:
236 buf[0] = '\x03';
237 buf[1] = word[word.size() - 2];
238 buf[2] = word[word.size() - 1];
239 toggle_fragment(buf, word);
241 if (word.size() > 2) {
242 set<fragment> done;
243 // Middles:
244 buf[0] = '\x02';
245 for (size_t start = 0; start <= word.size() - 3; ++start) {
246 memcpy(buf.data + 1, word.data() + start, 3);
247 // Don't toggle the same fragment twice or it will cancel out.
248 // Bug fixed in 1.2.6.
249 if (done.insert(buf).second)
250 toggle_fragment(buf, word);
255 struct TermListGreaterApproxSize {
256 bool operator()(const TermList *a, const TermList *b) const {
257 return a->get_approx_size() > b->get_approx_size();
261 TermList *
262 HoneySpellingTable::open_termlist(const string & word)
264 // This should have been handled by Database::get_spelling_suggestion().
265 AssertRel(word.size(),>,1);
267 // Merge any pending changes to disk, but don't call commit() so they
268 // won't be switched live.
269 if (!wordfreq_changes.empty()) merge_changes();
271 // Build a priority queue of TermList objects which returns those of
272 // greatest approximate size first.
273 priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
274 try {
275 string data;
276 fragment buf(0);
278 if (word.size() <= 4) {
279 // We also generate 'bookends' for two, three, and four character
280 // terms so we can handle transposition of the middle two
281 // characters of a four character word, substitution or deletion of
282 // the middle character of a three character word, or insertion in
283 // the middle of a two character word.
284 buf[0] = '\x00';
285 buf[1] = word[0];
286 buf[2] = word[word.size() - 1];
287 if (get_exact_entry(string(buf), data))
288 pq.push(new HoneySpellingTermList(data));
291 // Head:
292 buf[0] = '\x01';
293 buf[1] = word[0];
294 buf[2] = word[1];
295 if (get_exact_entry(string(buf), data))
296 pq.push(new HoneySpellingTermList(data));
298 if (word.size() == 2) {
299 // For two letter words, we generate H and T terms for the
300 // transposed form so that we can produce good spelling
301 // suggestions.
302 // AB -> BA
303 buf[1] = word[1];
304 buf[2] = word[0];
305 if (get_exact_entry(string(buf), data))
306 pq.push(new HoneySpellingTermList(data));
307 buf[0] = '\x02';
308 if (get_exact_entry(string(buf), data))
309 pq.push(new HoneySpellingTermList(data));
312 // Tail:
313 buf[0] = '\x02';
314 buf[1] = word[word.size() - 2];
315 buf[2] = word[word.size() - 1];
316 if (get_exact_entry(string(buf), data))
317 pq.push(new HoneySpellingTermList(data));
319 if (word.size() > 2) {
320 // Middles:
321 buf[0] = '\x03';
322 for (size_t start = 0; start <= word.size() - 3; ++start) {
323 memcpy(buf.data + 1, word.data() + start, 3);
324 if (get_exact_entry(string(buf), data))
325 pq.push(new HoneySpellingTermList(data));
328 if (word.size() == 3) {
329 // For three letter words, we generate the two "single
330 // transposition" forms too, so that we can produce good
331 // spelling suggestions.
332 // ABC -> BAC
333 buf[1] = word[1];
334 buf[2] = word[0];
335 if (get_exact_entry(string(buf), data))
336 pq.push(new HoneySpellingTermList(data));
337 // ABC -> ACB
338 buf[1] = word[0];
339 buf[2] = word[2];
340 buf[3] = word[1];
341 if (get_exact_entry(string(buf), data))
342 pq.push(new HoneySpellingTermList(data));
346 if (pq.empty()) return NULL;
348 // Build up an OrTermList tree by combine leaves and/or branches in
349 // pairs. The tree is balanced by the approximated sizes of the leaf
350 // HoneySpellingTermList objects - the way the tree is built are very
351 // similar to how an optimal Huffman code is often constructed.
353 // Balancing the tree like this should tend to minimise the amount of
354 // work done.
355 while (pq.size() > 1) {
356 // Build the tree such that left is always >= right so that
357 // OrTermList can rely on this when trying to minimise work.
358 TermList * termlist = pq.top();
359 pq.pop();
361 termlist = new OrTermList(pq.top(), termlist);
362 pq.pop();
363 pq.push(termlist);
366 return pq.top();
367 } catch (...) {
368 // Make sure we delete all the TermList objects to avoid leaking
369 // memory.
370 while (!pq.empty()) {
371 delete pq.top();
372 pq.pop();
374 throw;
378 Xapian::doccount
379 HoneySpellingTable::get_word_frequency(const string & word) const
381 map<string, Xapian::termcount>::const_iterator i;
382 i = wordfreq_changes.find(word);
383 if (i != wordfreq_changes.end()) {
384 // Modified frequency for word:
385 return i->second;
388 string data;
389 if (get_exact_entry(make_spelling_wordlist_key(word), data)) {
390 // Word "word" already exists.
391 Xapian::termcount freq;
392 const char *p = data.data();
393 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
394 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
396 return freq;
399 return 0;
402 ///////////////////////////////////////////////////////////////////////////
404 Xapian::termcount
405 HoneySpellingTermList::get_approx_size() const
407 // This is only used to decide how to build a OR-tree of TermList objects
408 // so we just need to return "sizes" which are ordered roughly correctly.
409 return data.size();
412 std::string
413 HoneySpellingTermList::get_termname() const
415 return current_term;
418 Xapian::termcount
419 HoneySpellingTermList::get_wdf() const
421 return 1;
424 Xapian::doccount
425 HoneySpellingTermList::get_termfreq() const
427 return 1;
430 Xapian::termcount
431 HoneySpellingTermList::get_collection_freq() const
433 return 1;
436 TermList *
437 HoneySpellingTermList::next()
439 if (p == data.size()) {
440 p = 0;
441 data.resize(0);
442 return NULL;
444 if (!current_term.empty()) {
445 if (p == data.size())
446 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
447 current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
449 size_t add;
450 if (p == data.size() ||
451 (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
452 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
453 current_term.append(data.data() + p + 1, add);
454 p += add + 1;
455 return NULL;
458 TermList *
459 HoneySpellingTermList::skip_to(const string & term)
461 while (!data.empty() && current_term < term) {
462 (void)HoneySpellingTermList::next();
464 return NULL;
467 bool
468 HoneySpellingTermList::at_end() const
470 return data.empty();
473 Xapian::termcount
474 HoneySpellingTermList::positionlist_count() const
476 throw
477 Xapian::UnimplementedError("HoneySpellingTermList::"
478 "positionlist_count() "
479 "not implemented");
482 PositionList*
483 HoneySpellingTermList::positionlist_begin() const
485 throw
486 Xapian::UnimplementedError("HoneySpellingTermList::"
487 "positionlist_begin() "
488 "not implemented");