1 /** @file honey_spelling.cc
2 * @brief Spelling correction data for a honey database.
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2015,2017,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <xapian/error.h>
24 #include <xapian/types.h>
26 #include "expand/expandweight.h"
27 #include "honey_spelling.h"
29 #include "expand/ortermlist.h"
32 #include "../prefix_compressed_strings.h"
41 using namespace Honey
;
45 HoneySpellingTable::merge_changes()
47 map
<fragment
, set
<string
> >::const_iterator i
;
48 for (i
= termlist_deltas
.begin(); i
!= termlist_deltas
.end(); ++i
) {
49 string key
= i
->first
;
50 const set
<string
> & changes
= i
->second
;
52 set
<string
>::const_iterator d
= changes
.begin();
53 if (d
== changes
.end()) continue;
57 PrefixCompressedStringWriter
out(updated
);
58 if (get_exact_entry(key
, current
)) {
59 PrefixCompressedStringItor
in(current
);
60 updated
.reserve(current
.size()); // FIXME plus some?
61 while (!in
.at_end() && d
!= changes
.end()) {
62 const string
& word
= *in
;
63 Assert(d
!= changes
.end());
64 int cmp
= word
.compare(*d
);
72 // If an existing entry is in the changes list, that means
73 // we should remove it.
79 // FIXME : easy to optimise this to a fix-up and substring copy.
80 while (!in
.at_end()) {
85 while (d
!= changes
.end()) {
88 if (!updated
.empty()) {
94 termlist_deltas
.clear();
96 map
<string
, Xapian::termcount
>::const_iterator j
;
97 for (j
= wordfreq_changes
.begin(); j
!= wordfreq_changes
.end(); ++j
) {
98 const string
& key
= make_spelling_wordlist_key(j
->first
);
99 Xapian::termcount wordfreq
= j
->second
;
102 pack_uint_last(tag
, wordfreq
);
104 if (wordfreq
> wordfreq_upper_bound
)
105 wordfreq_upper_bound
= wordfreq
;
110 wordfreq_changes
.clear();
114 HoneySpellingTable::toggle_fragment(fragment frag
, const string
& word
)
116 map
<fragment
, set
<string
> >::iterator i
= termlist_deltas
.find(frag
);
117 if (i
== termlist_deltas
.end()) {
118 i
= termlist_deltas
.insert(make_pair(frag
, set
<string
>())).first
;
120 // The commonest case is that we're adding lots of words, so try insert
121 // first and if that reports that the word already exists, remove it.
122 pair
<set
<string
>::iterator
, bool> res
= i
->second
.insert(word
);
124 // word is already in the set, so remove it.
125 i
->second
.erase(res
.first
);
130 HoneySpellingTable::add_word(const string
& word
, Xapian::termcount freqinc
)
132 if (word
.size() <= 1) return;
134 map
<string
, Xapian::termcount
>::iterator i
= wordfreq_changes
.find(word
);
135 if (i
!= wordfreq_changes
.end()) {
136 // Word "word" already exists and has been modified.
138 i
->second
+= freqinc
;
141 // If "word" is currently modified such that it no longer exists, so
142 // we need to execute the code below to re-add trigrams for it.
146 if (get_exact_entry(make_spelling_wordlist_key(word
), data
)) {
147 // Word "word" already exists, so increment its count.
148 Xapian::termcount freq
;
149 const char * p
= data
.data();
150 if (!unpack_uint_last(&p
, p
+ data
.size(), &freq
) || freq
== 0) {
151 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
153 wordfreq_changes
[word
] = freq
+ freqinc
;
156 wordfreq_changes
[word
] = freqinc
;
159 // Add trigrams for word.
164 HoneySpellingTable::remove_word(const string
& word
, Xapian::termcount freqdec
)
166 if (word
.size() <= 1) return freqdec
;
168 map
<string
, Xapian::termcount
>::iterator i
= wordfreq_changes
.find(word
);
169 if (i
!= wordfreq_changes
.end()) {
170 if (i
->second
== 0) {
171 // Word has already been deleted.
174 // Word "word" exists and has been modified.
175 if (freqdec
< i
->second
) {
176 i
->second
-= freqdec
;
179 freqdec
-= i
->second
;
181 // Mark word as deleted.
185 if (!get_exact_entry(make_spelling_wordlist_key(word
), data
)) {
186 // This word doesn't exist.
190 Xapian::termcount freq
;
191 const char *p
= data
.data();
192 if (!unpack_uint_last(&p
, p
+ data
.size(), &freq
)) {
193 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
195 if (freqdec
< freq
) {
196 wordfreq_changes
[word
] = freq
- freqdec
;
201 // Mark word as deleted.
202 wordfreq_changes
[word
] = 0;
205 // Remove trigrams for word.
212 HoneySpellingTable::toggle_word(const string
& word
)
216 if (word
.size() <= 4) {
217 // We also generate 'bookends' for two, three, and four character
218 // terms so we can handle transposition of the middle two characters
219 // of a four character word, substitution or deletion of the middle
220 // character of a three character word, or insertion in the middle of a
221 // two character word.
225 buf
[2] = word
[word
.size() - 1];
226 toggle_fragment(buf
, word
);
233 toggle_fragment(buf
, word
);
237 buf
[1] = word
[word
.size() - 2];
238 buf
[2] = word
[word
.size() - 1];
239 toggle_fragment(buf
, word
);
241 if (word
.size() > 2) {
245 for (size_t start
= 0; start
<= word
.size() - 3; ++start
) {
246 memcpy(buf
.data
+ 1, word
.data() + start
, 3);
247 // Don't toggle the same fragment twice or it will cancel out.
248 // Bug fixed in 1.2.6.
249 if (done
.insert(buf
).second
)
250 toggle_fragment(buf
, word
);
255 struct TermListGreaterApproxSize
{
256 bool operator()(const TermList
*a
, const TermList
*b
) const {
257 return a
->get_approx_size() > b
->get_approx_size();
262 HoneySpellingTable::open_termlist(const string
& word
)
264 // This should have been handled by Database::get_spelling_suggestion().
265 AssertRel(word
.size(),>,1);
267 // Merge any pending changes to disk, but don't call commit() so they
268 // won't be switched live.
269 if (!wordfreq_changes
.empty()) merge_changes();
271 // Build a priority queue of TermList objects which returns those of
272 // greatest approximate size first.
273 priority_queue
<TermList
*, vector
<TermList
*>, TermListGreaterApproxSize
> pq
;
278 if (word
.size() <= 4) {
279 // We also generate 'bookends' for two, three, and four character
280 // terms so we can handle transposition of the middle two
281 // characters of a four character word, substitution or deletion of
282 // the middle character of a three character word, or insertion in
283 // the middle of a two character word.
286 buf
[2] = word
[word
.size() - 1];
287 if (get_exact_entry(string(buf
), data
))
288 pq
.push(new HoneySpellingTermList(data
));
295 if (get_exact_entry(string(buf
), data
))
296 pq
.push(new HoneySpellingTermList(data
));
298 if (word
.size() == 2) {
299 // For two letter words, we generate H and T terms for the
300 // transposed form so that we can produce good spelling
305 if (get_exact_entry(string(buf
), data
))
306 pq
.push(new HoneySpellingTermList(data
));
308 if (get_exact_entry(string(buf
), data
))
309 pq
.push(new HoneySpellingTermList(data
));
314 buf
[1] = word
[word
.size() - 2];
315 buf
[2] = word
[word
.size() - 1];
316 if (get_exact_entry(string(buf
), data
))
317 pq
.push(new HoneySpellingTermList(data
));
319 if (word
.size() > 2) {
322 for (size_t start
= 0; start
<= word
.size() - 3; ++start
) {
323 memcpy(buf
.data
+ 1, word
.data() + start
, 3);
324 if (get_exact_entry(string(buf
), data
))
325 pq
.push(new HoneySpellingTermList(data
));
328 if (word
.size() == 3) {
329 // For three letter words, we generate the two "single
330 // transposition" forms too, so that we can produce good
331 // spelling suggestions.
335 if (get_exact_entry(string(buf
), data
))
336 pq
.push(new HoneySpellingTermList(data
));
341 if (get_exact_entry(string(buf
), data
))
342 pq
.push(new HoneySpellingTermList(data
));
346 if (pq
.empty()) return NULL
;
348 // Build up an OrTermList tree by combine leaves and/or branches in
349 // pairs. The tree is balanced by the approximated sizes of the leaf
350 // HoneySpellingTermList objects - the way the tree is built are very
351 // similar to how an optimal Huffman code is often constructed.
353 // Balancing the tree like this should tend to minimise the amount of
355 while (pq
.size() > 1) {
356 // Build the tree such that left is always >= right so that
357 // OrTermList can rely on this when trying to minimise work.
358 TermList
* termlist
= pq
.top();
361 termlist
= new OrTermList(pq
.top(), termlist
);
368 // Make sure we delete all the TermList objects to avoid leaking
370 while (!pq
.empty()) {
379 HoneySpellingTable::get_word_frequency(const string
& word
) const
381 map
<string
, Xapian::termcount
>::const_iterator i
;
382 i
= wordfreq_changes
.find(word
);
383 if (i
!= wordfreq_changes
.end()) {
384 // Modified frequency for word:
389 if (get_exact_entry(make_spelling_wordlist_key(word
), data
)) {
390 // Word "word" already exists.
391 Xapian::termcount freq
;
392 const char *p
= data
.data();
393 if (!unpack_uint_last(&p
, p
+ data
.size(), &freq
)) {
394 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
402 ///////////////////////////////////////////////////////////////////////////
405 HoneySpellingTermList::get_approx_size() const
407 // This is only used to decide how to build a OR-tree of TermList objects
408 // so we just need to return "sizes" which are ordered roughly correctly.
413 HoneySpellingTermList::get_termname() const
419 HoneySpellingTermList::get_wdf() const
425 HoneySpellingTermList::get_termfreq() const
431 HoneySpellingTermList::get_collection_freq() const
437 HoneySpellingTermList::next()
439 if (p
== data
.size()) {
444 if (!current_term
.empty()) {
445 if (p
== data
.size())
446 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
447 current_term
.resize(byte(data
[p
++]) ^ MAGIC_XOR_VALUE
);
450 if (p
== data
.size() ||
451 (add
= byte(data
[p
]) ^ MAGIC_XOR_VALUE
) >= data
.size() - p
)
452 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
453 current_term
.append(data
.data() + p
+ 1, add
);
459 HoneySpellingTermList::skip_to(const string
& term
)
461 while (!data
.empty() && current_term
< term
) {
462 (void)HoneySpellingTermList::next();
468 HoneySpellingTermList::at_end() const
474 HoneySpellingTermList::positionlist_count() const
477 Xapian::UnimplementedError("HoneySpellingTermList::"
478 "positionlist_count() "
483 HoneySpellingTermList::positionlist_begin() const
486 Xapian::UnimplementedError("HoneySpellingTermList::"
487 "positionlist_begin() "