xapian-core/backends/honey/honey_spelling.cc

   1 /** @file honey_spelling.cc
   2  * @brief Spelling correction data for a honey database.
   3  */
   4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2015,2017,2018 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include <xapian/error.h>
  24 #include <xapian/types.h>
  25
  26 #include "expand/expandweight.h"
  27 #include "honey_spelling.h"
  28 #include "omassert.h"
  29 #include "expand/ortermlist.h"
  30 #include "pack.h"
  31
  32 #include "../prefix_compressed_strings.h"
  33
  34 #include <algorithm>
  35 #include <map>
  36 #include <queue>
  37 #include <vector>
  38 #include <set>
  39 #include <string>
  40
  41 using namespace Honey;
  42 using namespace std;
  43
  44 void
  45 HoneySpellingTable::merge_changes()
  46 {
  47     map<fragment, set<string> >::const_iterator i;
  48     for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
  49         string key = i->first;
  50         const set<string> & changes = i->second;
  51
  52         set<string>::const_iterator d = changes.begin();
  53         if (d == changes.end()) continue;
  54
  55         string updated;
  56         string current;
  57         PrefixCompressedStringWriter out(updated);
  58         if (get_exact_entry(key, current)) {
  59             PrefixCompressedStringItor in(current);
  60             updated.reserve(current.size()); // FIXME plus some?
  61             while (!in.at_end() && d != changes.end()) {
  62                 const string & word = *in;
  63                 Assert(d != changes.end());
  64                 int cmp = word.compare(*d);
  65                 if (cmp < 0) {
  66                     out.append(word);
  67                     ++in;
  68                 } else if (cmp > 0) {
  69                     out.append(*d);
  70                     ++d;
  71                 } else {
  72                     // If an existing entry is in the changes list, that means
  73                     // we should remove it.
  74                     ++in;
  75                     ++d;
  76                 }
  77             }
  78             if (!in.at_end()) {
  79                 // FIXME : easy to optimise this to a fix-up and substring copy.
  80                 while (!in.at_end()) {
  81                     out.append(*in++);
  82                 }
  83             }
  84         }
  85         while (d != changes.end()) {
  86             out.append(*d++);
  87         }
  88         if (!updated.empty()) {
  89             add(key, updated);
  90         } else {
  91             del(key);
  92         }
  93     }
  94     termlist_deltas.clear();
  95
  96     map<string, Xapian::termcount>::const_iterator j;
  97     for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
  98         const string& key = make_spelling_wordlist_key(j->first);
  99         Xapian::termcount wordfreq = j->second;
 100         if (wordfreq) {
 101             string tag;
 102             pack_uint_last(tag, wordfreq);
 103             add(key, tag);
 104             if (wordfreq > wordfreq_upper_bound)
 105                 wordfreq_upper_bound = wordfreq;
 106         } else {
 107             del(key);
 108         }
 109     }
 110     wordfreq_changes.clear();
 111 }
 112
 113 void
 114 HoneySpellingTable::toggle_fragment(fragment frag, const string & word)
 115 {
 116     map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
 117     if (i == termlist_deltas.end()) {
 118         i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
 119     }
 120     // The commonest case is that we're adding lots of words, so try insert
 121     // first and if that reports that the word already exists, remove it.
 122     pair<set<string>::iterator, bool> res = i->second.insert(word);
 123     if (!res.second) {
 124         // word is already in the set, so remove it.
 125         i->second.erase(res.first);
 126     }
 127 }
 128
 129 void
 130 HoneySpellingTable::add_word(const string & word, Xapian::termcount freqinc)
 131 {
 132     if (word.size() <= 1) return;
 133
 134     map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
 135     if (i != wordfreq_changes.end()) {
 136         // Word "word" already exists and has been modified.
 137         if (i->second) {
 138             i->second += freqinc;
 139             return;
 140         }
 141         // If "word" is currently modified such that it no longer exists, so
 142         // we need to execute the code below to re-add trigrams for it.
 143         i->second = freqinc;
 144     } else {
 145         string data;
 146         if (get_exact_entry(make_spelling_wordlist_key(word), data)) {
 147             // Word "word" already exists, so increment its count.
 148             Xapian::termcount freq;
 149             const char * p = data.data();
 150             if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
 151                 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
 152             }
 153             wordfreq_changes[word] = freq + freqinc;
 154             return;
 155         }
 156         wordfreq_changes[word] = freqinc;
 157     }
 158
 159     // Add trigrams for word.
 160     toggle_word(word);
 161 }
 162
 163 Xapian::termcount
 164 HoneySpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
 165 {
 166     if (word.size() <= 1) return freqdec;
 167
 168     map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
 169     if (i != wordfreq_changes.end()) {
 170         if (i->second == 0) {
 171             // Word has already been deleted.
 172             return freqdec;
 173         }
 174         // Word "word" exists and has been modified.
 175         if (freqdec < i->second) {
 176             i->second -= freqdec;
 177             return 0;
 178         }
 179         freqdec -= i->second;
 180
 181         // Mark word as deleted.
 182         i->second = 0;
 183     } else {
 184         string data;
 185         if (!get_exact_entry(make_spelling_wordlist_key(word), data)) {
 186             // This word doesn't exist.
 187             return freqdec;
 188         }
 189
 190         Xapian::termcount freq;
 191         const char *p = data.data();
 192         if (!unpack_uint_last(&p, p + data.size(), &freq)) {
 193             throw Xapian::DatabaseCorruptError("Bad spelling word freq");
 194         }
 195         if (freqdec < freq) {
 196             wordfreq_changes[word] = freq - freqdec;
 197             return 0;
 198         }
 199         freqdec -= freq;
 200
 201         // Mark word as deleted.
 202         wordfreq_changes[word] = 0;
 203     }
 204
 205     // Remove trigrams for word.
 206     toggle_word(word);
 207
 208     return freqdec;
 209 }
 210
 211 void
 212 HoneySpellingTable::toggle_word(const string & word)
 213 {
 214     fragment buf(0);
 215
 216     if (word.size() <= 4) {
 217         // We also generate 'bookends' for two, three, and four character
 218         // terms so we can handle transposition of the middle two characters
 219         // of a four character word, substitution or deletion of the middle
 220         // character of a three character word, or insertion in the middle of a
 221         // two character word.
 222         // 'Bookends':
 223         buf[0] = '\x00';
 224         buf[1] = word[0];
 225         buf[2] = word[word.size() - 1];
 226         toggle_fragment(buf, word);
 227     }
 228
 229     // Head:
 230     buf[0] = '\x01';
 231     buf[1] = word[0];
 232     buf[2] = word[1];
 233     toggle_fragment(buf, word);
 234
 235     // Tail:
 236     buf[0] = '\x03';
 237     buf[1] = word[word.size() - 2];
 238     buf[2] = word[word.size() - 1];
 239     toggle_fragment(buf, word);
 240
 241     if (word.size() > 2) {
 242         set<fragment> done;
 243         // Middles:
 244         buf[0] = '\x02';
 245         for (size_t start = 0; start <= word.size() - 3; ++start) {
 246             memcpy(buf.data + 1, word.data() + start, 3);
 247             // Don't toggle the same fragment twice or it will cancel out.
 248             // Bug fixed in 1.2.6.
 249             if (done.insert(buf).second)
 250                 toggle_fragment(buf, word);
 251         }
 252     }
 253 }
 254
 255 struct TermListGreaterApproxSize {
 256     bool operator()(const TermList *a, const TermList *b) const {
 257         return a->get_approx_size() > b->get_approx_size();
 258     }
 259 };
 260
 261 TermList *
 262 HoneySpellingTable::open_termlist(const string & word)
 263 {
 264     // This should have been handled by Database::get_spelling_suggestion().
 265     AssertRel(word.size(),>,1);
 266
 267     // Merge any pending changes to disk, but don't call commit() so they
 268     // won't be switched live.
 269     if (!wordfreq_changes.empty()) merge_changes();
 270
 271     // Build a priority queue of TermList objects which returns those of
 272     // greatest approximate size first.
 273     priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
 274     try {
 275         string data;
 276         fragment buf(0);
 277
 278         if (word.size() <= 4) {
 279             // We also generate 'bookends' for two, three, and four character
 280             // terms so we can handle transposition of the middle two
 281             // characters of a four character word, substitution or deletion of
 282             // the middle character of a three character word, or insertion in
 283             // the middle of a two character word.
 284             buf[0] = '\x00';
 285             buf[1] = word[0];
 286             buf[2] = word[word.size() - 1];
 287             if (get_exact_entry(string(buf), data))
 288                 pq.push(new HoneySpellingTermList(data));
 289         }
 290
 291         // Head:
 292         buf[0] = '\x01';
 293         buf[1] = word[0];
 294         buf[2] = word[1];
 295         if (get_exact_entry(string(buf), data))
 296             pq.push(new HoneySpellingTermList(data));
 297
 298         if (word.size() == 2) {
 299             // For two letter words, we generate H and T terms for the
 300             // transposed form so that we can produce good spelling
 301             // suggestions.
 302             // AB -> BA
 303             buf[1] = word[1];
 304             buf[2] = word[0];
 305             if (get_exact_entry(string(buf), data))
 306                 pq.push(new HoneySpellingTermList(data));
 307             buf[0] = '\x02';
 308             if (get_exact_entry(string(buf), data))
 309                 pq.push(new HoneySpellingTermList(data));
 310         }
 311
 312         // Tail:
 313         buf[0] = '\x02';
 314         buf[1] = word[word.size() - 2];
 315         buf[2] = word[word.size() - 1];
 316         if (get_exact_entry(string(buf), data))
 317             pq.push(new HoneySpellingTermList(data));
 318
 319         if (word.size() > 2) {
 320             // Middles:
 321             buf[0] = '\x03';
 322             for (size_t start = 0; start <= word.size() - 3; ++start) {
 323                 memcpy(buf.data + 1, word.data() + start, 3);
 324                 if (get_exact_entry(string(buf), data))
 325                     pq.push(new HoneySpellingTermList(data));
 326             }
 327
 328             if (word.size() == 3) {
 329                 // For three letter words, we generate the two "single
 330                 // transposition" forms too, so that we can produce good
 331                 // spelling suggestions.
 332                 // ABC -> BAC
 333                 buf[1] = word[1];
 334                 buf[2] = word[0];
 335                 if (get_exact_entry(string(buf), data))
 336                     pq.push(new HoneySpellingTermList(data));
 337                 // ABC -> ACB
 338                 buf[1] = word[0];
 339                 buf[2] = word[2];
 340                 buf[3] = word[1];
 341                 if (get_exact_entry(string(buf), data))
 342                     pq.push(new HoneySpellingTermList(data));
 343             }
 344         }
 345
 346         if (pq.empty()) return NULL;
 347
 348         // Build up an OrTermList tree by combine leaves and/or branches in
 349         // pairs.  The tree is balanced by the approximated sizes of the leaf
 350         // HoneySpellingTermList objects - the way the tree is built are very
 351         // similar to how an optimal Huffman code is often constructed.
 352         //
 353         // Balancing the tree like this should tend to minimise the amount of
 354         // work done.
 355         while (pq.size() > 1) {
 356             // Build the tree such that left is always >= right so that
 357             // OrTermList can rely on this when trying to minimise work.
 358             TermList * termlist = pq.top();
 359             pq.pop();
 360
 361             termlist = new OrTermList(pq.top(), termlist);
 362             pq.pop();
 363             pq.push(termlist);
 364         }
 365
 366         return pq.top();
 367     } catch (...) {
 368         // Make sure we delete all the TermList objects to avoid leaking
 369         // memory.
 370         while (!pq.empty()) {
 371             delete pq.top();
 372             pq.pop();
 373         }
 374         throw;
 375     }
 376 }
 377
 378 Xapian::doccount
 379 HoneySpellingTable::get_word_frequency(const string & word) const
 380 {
 381     map<string, Xapian::termcount>::const_iterator i;
 382     i = wordfreq_changes.find(word);
 383     if (i != wordfreq_changes.end()) {
 384         // Modified frequency for word:
 385         return i->second;
 386     }
 387
 388     string data;
 389     if (get_exact_entry(make_spelling_wordlist_key(word), data)) {
 390         // Word "word" already exists.
 391         Xapian::termcount freq;
 392         const char *p = data.data();
 393         if (!unpack_uint_last(&p, p + data.size(), &freq)) {
 394             throw Xapian::DatabaseCorruptError("Bad spelling word freq");
 395         }
 396         return freq;
 397     }
 398
 399     return 0;
 400 }
 401
 402 ///////////////////////////////////////////////////////////////////////////
 403
 404 Xapian::termcount
 405 HoneySpellingTermList::get_approx_size() const
 406 {
 407     // This is only used to decide how to build a OR-tree of TermList objects
 408     // so we just need to return "sizes" which are ordered roughly correctly.
 409     return data.size();
 410 }
 411
 412 std::string
 413 HoneySpellingTermList::get_termname() const
 414 {
 415     return current_term;
 416 }
 417
 418 Xapian::termcount
 419 HoneySpellingTermList::get_wdf() const
 420 {
 421     return 1;
 422 }
 423
 424 Xapian::doccount
 425 HoneySpellingTermList::get_termfreq() const
 426 {
 427     return 1;
 428 }
 429
 430 Xapian::termcount
 431 HoneySpellingTermList::get_collection_freq() const
 432 {
 433     return 1;
 434 }
 435
 436 TermList *
 437 HoneySpellingTermList::next()
 438 {
 439     if (p == data.size()) {
 440         p = 0;
 441         data.resize(0);
 442         return NULL;
 443     }
 444     if (!current_term.empty()) {
 445         if (p == data.size())
 446             throw Xapian::DatabaseCorruptError("Bad spelling termlist");
 447         current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
 448     }
 449     size_t add;
 450     if (p == data.size() ||
 451         (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
 452         throw Xapian::DatabaseCorruptError("Bad spelling termlist");
 453     current_term.append(data.data() + p + 1, add);
 454     p += add + 1;
 455     return NULL;
 456 }
 457
 458 TermList *
 459 HoneySpellingTermList::skip_to(const string & term)
 460 {
 461     while (!data.empty() && current_term < term) {
 462         (void)HoneySpellingTermList::next();
 463     }
 464     return NULL;
 465 }
 466
 467 bool
 468 HoneySpellingTermList::at_end() const
 469 {
 470     return data.empty();
 471 }
 472
 473 Xapian::termcount
 474 HoneySpellingTermList::positionlist_count() const
 475 {
 476     throw
 477         Xapian::UnimplementedError("HoneySpellingTermList::"
 478                                    "positionlist_count() "
 479                                    "not implemented");
 480 }
 481
 482 PositionList*
 483 HoneySpellingTermList::positionlist_begin() const
 484 {
 485     throw
 486         Xapian::UnimplementedError("HoneySpellingTermList::"
 487                                    "positionlist_begin() "
 488                                    "not implemented");
 489 }