xapian-core/queryparser/termgenerator_internal.cc

   1 /** @file termgenerator_internal.cc
   2  * @brief TermGenerator class internals
   3  */
   4 /* Copyright (C) 2007,2010,2011,2012,2015,2016,2017 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include "termgenerator_internal.h"
  24
  25 #include "api/msetinternal.h"
  26 #include "api/queryinternal.h"
  27
  28 #include <xapian/document.h>
  29 #include <xapian/queryparser.h>
  30 #include <xapian/stem.h>
  31 #include <xapian/unicode.h>
  32
  33 #include "stringutils.h"
  34
  35 #include <algorithm>
  36 #include <cmath>
  37 #include <deque>
  38 #include <limits>
  39 #include <list>
  40 #include <string>
  41 #include <unordered_map>
  42 #include <vector>
  43
  44 #include "cjk-tokenizer.h"
  45
  46 using namespace std;
  47
  48 namespace Xapian {
  49
  50 inline bool
  51 U_isupper(unsigned ch) {
  52     return (ch < 128 && C_isupper(static_cast<unsigned char>(ch)));
  53 }
  54
  55 inline unsigned check_wordchar(unsigned ch) {
  56     if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
  57     return 0;
  58 }
  59
  60 inline bool
  61 should_stem(const std::string & term)
  62 {
  63     const unsigned int SHOULD_STEM_MASK =
  64         (1 << Unicode::LOWERCASE_LETTER) |
  65         (1 << Unicode::TITLECASE_LETTER) |
  66         (1 << Unicode::MODIFIER_LETTER) |
  67         (1 << Unicode::OTHER_LETTER);
  68     Utf8Iterator u(term);
  69     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
  70 }
  71
  72 /** Value representing "ignore this" when returned by check_infix() or
  73  *  check_infix_digit().
  74  */
  75 const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
  76
  77 inline unsigned check_infix(unsigned ch) {
  78     if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
  79         // Unicode includes all these except '&' in its word boundary rules,
  80         // as well as 0x2019 (which we handle below) and ':' (for Swedish
  81         // apparently, but we ignore this for now as it's problematic in
  82         // real world cases).
  83         return ch;
  84     }
  85     // 0x2019 is Unicode apostrophe and single closing quote.
  86     // 0x201b is Unicode single opening quote with the tail rising.
  87     if (ch == 0x2019 || ch == 0x201b) return '\'';
  88     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
  89         return UNICODE_IGNORE;
  90     return 0;
  91 }
  92
  93 inline unsigned check_infix_digit(unsigned ch) {
  94     // This list of characters comes from Unicode's word identifying algorithm.
  95     switch (ch) {
  96         case ',':
  97         case '.':
  98         case ';':
  99         case 0x037e: // GREEK QUESTION MARK
 100         case 0x0589: // ARMENIAN FULL STOP
 101         case 0x060D: // ARABIC DATE SEPARATOR
 102         case 0x07F8: // NKO COMMA
 103         case 0x2044: // FRACTION SLASH
 104         case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
 105         case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
 106         case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
 107             return ch;
 108     }
 109     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
 110         return UNICODE_IGNORE;
 111     return 0;
 112 }
 113
 114 inline bool
 115 is_digit(unsigned ch) {
 116     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
 117 }
 118
 119 inline unsigned check_suffix(unsigned ch) {
 120     if (ch == '+' || ch == '#') return ch;
 121     // FIXME: what about '-'?
 122     return 0;
 123 }
 124
 125 /** Templated framework for processing terms.
 126  *
 127  *  Calls action(term, positional) for each term to add, where term is a
 128  *  std::string holding the term, and positional is a bool indicating
 129  *  if this term carries positional information.
 130  */
 131 template<typename ACTION> void
 132 parse_terms(Utf8Iterator itor, bool cjk_ngram, bool with_positions, ACTION action)
 133 {
 134     while (true) {
 135         // Advance to the start of the next term.
 136         unsigned ch;
 137         while (true) {
 138             if (itor == Utf8Iterator()) return;
 139             ch = check_wordchar(*itor);
 140             if (ch) break;
 141             ++itor;
 142         }
 143
 144         string term;
 145         // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
 146         // Don't worry if there's a trailing '.' or not.
 147         if (U_isupper(*itor)) {
 148             const Utf8Iterator end;
 149             Utf8Iterator p = itor;
 150             do {
 151                 Unicode::append_utf8(term, Unicode::tolower(*p++));
 152             } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
 153             // One letter does not make an acronym!  If we handled a single
 154             // uppercase letter here, we wouldn't catch M&S below.
 155             if (term.size() > 1) {
 156                 // Check there's not a (lower case) letter or digit
 157                 // immediately after it.
 158                 if (p == end || !Unicode::is_wordchar(*p)) {
 159                     itor = p;
 160                     goto endofterm;
 161                 }
 162             }
 163             term.resize(0);
 164         }
 165
 166         while (true) {
 167             if (cjk_ngram &&
 168                 CJK::codepoint_is_cjk(*itor) &&
 169                 Unicode::is_wordchar(*itor)) {
 170                 const string & cjk = CJK::get_cjk(itor);
 171                 for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
 172                     const string & cjk_token = *tk;
 173                     if (!action(cjk_token, with_positions && tk.get_length() == 1, itor))
 174                         return;
 175                 }
 176                 while (true) {
 177                     if (itor == Utf8Iterator()) return;
 178                     ch = check_wordchar(*itor);
 179                     if (ch) break;
 180                     ++itor;
 181                 }
 182                 continue;
 183             }
 184             unsigned prevch;
 185             do {
 186                 Unicode::append_utf8(term, ch);
 187                 prevch = ch;
 188                 if (++itor == Utf8Iterator() ||
 189                     (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
 190                     goto endofterm;
 191                 ch = check_wordchar(*itor);
 192             } while (ch);
 193
 194             Utf8Iterator next(itor);
 195             ++next;
 196             if (next == Utf8Iterator()) break;
 197             unsigned nextch = check_wordchar(*next);
 198             if (!nextch) break;
 199             unsigned infix_ch = *itor;
 200             if (is_digit(prevch) && is_digit(*next)) {
 201                 infix_ch = check_infix_digit(infix_ch);
 202             } else {
 203                 // Handle things like '&' in AT&T, apostrophes, etc.
 204                 infix_ch = check_infix(infix_ch);
 205             }
 206             if (!infix_ch) break;
 207             if (infix_ch != UNICODE_IGNORE)
 208                 Unicode::append_utf8(term, infix_ch);
 209             ch = nextch;
 210             itor = next;
 211         }
 212
 213         {
 214             size_t len = term.size();
 215             unsigned count = 0;
 216             while ((ch = check_suffix(*itor))) {
 217                 if (++count > 3) {
 218                     term.resize(len);
 219                     break;
 220                 }
 221                 Unicode::append_utf8(term, ch);
 222                 if (++itor == Utf8Iterator()) goto endofterm;
 223             }
 224             // Don't index fish+chips as fish+ chips.
 225             if (Unicode::is_wordchar(*itor))
 226                 term.resize(len);
 227         }
 228
 229 endofterm:
 230         if (!action(term, with_positions, itor))
 231             return;
 232     }
 233 }
 234
 235 void
 236 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
 237                                     const string & prefix, bool with_positions)
 238 {
 239     bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();
 240
 241     stop_strategy current_stop_mode;
 242     if (!stopper.get()) {
 243         current_stop_mode = TermGenerator::STOP_NONE;
 244     } else {
 245         current_stop_mode = stop_mode;
 246     }
 247
 248     parse_terms(itor, cjk_ngram, with_positions,
 249         [=](const string & term, bool positional, const Utf8Iterator &) {
 250             if (term.size() > max_word_length) return true;
 251
 252             if (current_stop_mode == TermGenerator::STOP_ALL && (*stopper)(term))
 253                 return true;
 254
 255             if (strategy == TermGenerator::STEM_SOME ||
 256                 strategy == TermGenerator::STEM_NONE) {
 257                 if (positional) {
 258                     doc.add_posting(prefix + term, ++termpos, wdf_inc);
 259                 } else {
 260                     doc.add_term(prefix + term, wdf_inc);
 261                 }
 262             }
 263
 264             // MSVC seems to need "this->" on member variables in this
 265             // situation.
 266             if ((this->flags & FLAG_SPELLING) && prefix.empty())
 267                 db.add_spelling(term);
 268
 269             if (strategy == TermGenerator::STEM_NONE ||
 270                 !stemmer.internal.get()) return true;
 271
 272             if (strategy == TermGenerator::STEM_SOME) {
 273                 if (current_stop_mode == TermGenerator::STOP_STEMMED &&
 274                     (*stopper)(term))
 275                     return true;
 276
 277                 // Note, this uses the lowercased term, but that's OK as we
 278                 // only want to avoid stemming terms starting with a digit.
 279                 if (!should_stem(term)) return true;
 280             }
 281
 282             // Add stemmed form without positional information.
 283             const string& stem = stemmer(term);
 284             if (rare(stem.empty())) return true;
 285             string stemmed_term;
 286             if (strategy != TermGenerator::STEM_ALL) {
 287                 stemmed_term += "Z";
 288             }
 289             stemmed_term += prefix;
 290             stemmed_term += stem;
 291             if (strategy != TermGenerator::STEM_SOME && with_positions) {
 292                 doc.add_posting(stemmed_term, ++termpos, wdf_inc);
 293             } else {
 294                 doc.add_term(stemmed_term, wdf_inc);
 295             }
 296             return true;
 297         });
 298 }
 299
 300 struct Sniplet {
 301     double* relevance;
 302
 303     size_t term_end;
 304
 305     size_t highlight;
 306
 307     Sniplet(double* r, size_t t, size_t h)
 308         : relevance(r), term_end(t), highlight(h) { }
 309 };
 310
 311 class SnipPipe {
 312     deque<Sniplet> pipe;
 313     deque<Sniplet> best_pipe;
 314
 315     // Requested length for snippet.
 316     size_t length;
 317
 318     // Position in text of start of current pipe contents.
 319     size_t begin = 0;
 320
 321     // Rolling sum of the current pipe contents.
 322     double sum = 0;
 323
 324     size_t phrase_len = 0;
 325
 326   public:
 327     size_t best_begin = 0;
 328
 329     size_t best_end = 0;
 330
 331     double best_sum = 0;
 332
 333     // Add one to length to allow for inter-word space.
 334     // FIXME: We ought to correctly allow for multiple spaces.
 335     explicit SnipPipe(size_t length_) : length(length_ + 1) { }
 336
 337     bool pump(double* r, size_t t, size_t h, unsigned flags);
 338
 339     void done();
 340
 341     bool drain(const string & input,
 342                const string & hi_start,
 343                const string & hi_end,
 344                const string & omit,
 345                string & output);
 346 };
 347
 348 #define DECAY 2.0
 349
 350 inline bool
 351 SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
 352 {
 353     if (h > 1) {
 354         if (pipe.size() >= h - 1) {
 355             // The final term of a phrase is entering the window.  Peg the
 356             // phrase's relevance onto the first term of the phrase, so it'll
 357             // be removed from `sum` when the phrase starts to leave the
 358             // window.
 359             auto & phrase_start = pipe[pipe.size() - (h - 1)];
 360             if (phrase_start.relevance) {
 361                 *phrase_start.relevance *= DECAY;
 362                 sum -= *phrase_start.relevance;
 363             }
 364             sum += *r;
 365             phrase_start.relevance = r;
 366             phrase_start.highlight = h;
 367             *r /= DECAY;
 368         }
 369         r = NULL;
 370         h = 0;
 371     }
 372     pipe.emplace_back(r, t, h);
 373     if (r) {
 374         sum += *r;
 375         *r /= DECAY;
 376     }
 377
 378     // If necessary, discard words from the start of the pipe until it has the
 379     // desired length.
 380     // FIXME: Also shrink the window past words with relevance < 0?
 381     while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
 382         const Sniplet& word = pipe.front();
 383         if (word.relevance) {
 384             *word.relevance *= DECAY;
 385             sum -= *word.relevance;
 386         }
 387         begin = word.term_end;
 388         if (best_end >= begin)
 389             best_pipe.push_back(word);
 390         pipe.pop_front();
 391         // E.g. can happen if the current term is longer than the requested
 392         // length!
 393         if (rare(pipe.empty())) break;
 394     }
 395
 396     // Using > here doesn't work well, as we don't extend a snippet over terms
 397     // with 0 weight.
 398     if (sum >= best_sum) {
 399         // Discard any part of `best_pipe` which is before `begin`.
 400         if (begin >= best_end) {
 401             best_pipe.clear();
 402         } else {
 403             while (!best_pipe.empty() &&
 404                    best_pipe.front().term_end <= begin) {
 405                 best_pipe.pop_front();
 406             }
 407         }
 408         best_sum = sum;
 409         best_begin = begin;
 410         best_end = t;
 411     } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
 412         if (best_sum > 0 && best_end < begin) {
 413             // We found something, and we aren't still looking near it.
 414             // FIXME: Benchmark this and adjust if necessary.
 415             return false;
 416         }
 417     }
 418     return true;
 419 }
 420
 421 inline void
 422 SnipPipe::done()
 423 {
 424     // Discard any part of `pipe` which is after `best_end`.
 425     if (begin >= best_end) {
 426         pipe.clear();
 427     } else {
 428         // We should never empty the pipe (as that case should be handled
 429         // above).
 430         while (rare(!pipe.empty()) &&
 431                pipe.back().term_end > best_end) {
 432             pipe.pop_back();
 433         }
 434     }
 435 }
 436
 437 // Check if a non-word character is should be included at the start of the
 438 // snippet.  We want to include certain leading non-word characters, but not
 439 // others.
 440 inline bool
 441 snippet_check_leading_nonwordchar(unsigned ch) {
 442     if (Unicode::is_currency(ch) ||
 443         Unicode::get_category(ch) == Unicode::OPEN_PUNCTUATION ||
 444         Unicode::get_category(ch) == Unicode::INITIAL_QUOTE_PUNCTUATION) {
 445         return true;
 446     }
 447     switch (ch) {
 448         case '"':
 449         case '#':
 450         case '%':
 451         case '&':
 452         case '\'':
 453         case '+':
 454         case '-':
 455         case '/':
 456         case '<':
 457         case '@':
 458         case '\\':
 459         case '`':
 460         case '~':
 461         case 0x00A1: // INVERTED EXCLAMATION MARK
 462         case 0x00A7: // SECTION SIGN
 463         case 0x00BF: // INVERTED QUESTION MARK
 464             return true;
 465     }
 466     return false;
 467 }
 468
 469 inline void
 470 append_escaping_xml(const char* p, const char* end, string& output)
 471 {
 472     while (p != end) {
 473         char ch = *p++;
 474         switch (ch) {
 475             case '&':
 476                 output += "&amp;";
 477                 break;
 478             case '<':
 479                 output += "&lt;";
 480                 break;
 481             case '>':
 482                 output += "&gt;";
 483                 break;
 484             default:
 485                 output += ch;
 486         }
 487     }
 488 }
 489
 490 inline bool
 491 SnipPipe::drain(const string & input,
 492                 const string & hi_start,
 493                 const string & hi_end,
 494                 const string & omit,
 495                 string & output)
 496 {
 497     if (best_pipe.empty() && !pipe.empty()) {
 498         swap(best_pipe, pipe);
 499     }
 500
 501     if (best_pipe.empty()) {
 502         size_t tail_len = input.size() - best_end;
 503         if (tail_len == 0) return false;
 504
 505         // See if this is the end of a sentence.
 506         // FIXME: This is quite simplistic - look at the Unicode rules:
 507         // http://unicode.org/reports/tr29/#Sentence_Boundaries
 508         bool punc = false;
 509         Utf8Iterator i(input.data() + best_end, tail_len);
 510         while (i != Utf8Iterator()) {
 511             unsigned ch = *i;
 512             if (punc && Unicode::is_whitespace(ch)) break;
 513
 514             // Allow "...", "!!", "!?!", etc...
 515             punc = (ch == '.' || ch == '?' || ch == '!');
 516
 517             if (Unicode::is_wordchar(ch)) break;
 518             ++i;
 519         }
 520
 521         if (punc) {
 522             // Include end of sentence punctuation.
 523             append_escaping_xml(input.data() + best_end, i.raw(), output);
 524         } else {
 525             // Append "..." or equivalent if this doesn't seem to be the start
 526             // of a sentence.
 527             output += omit;
 528         }
 529
 530         return false;
 531     }
 532
 533     const Sniplet & word = best_pipe.front();
 534
 535     if (output.empty()) {
 536         // Start of the snippet.
 537         enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;
 538
 539         Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
 540         while (i != Utf8Iterator()) {
 541             unsigned ch = *i;
 542             switch (sentence_boundary) {
 543                 case NO:
 544                     if (ch == '.' || ch == '?' || ch == '!') {
 545                         sentence_boundary = PUNC;
 546                     }
 547                     break;
 548                 case PUNC:
 549                     if (Unicode::is_whitespace(ch)) {
 550                         sentence_boundary = YES;
 551                     } else if (ch == '.' || ch == '?' || ch == '!') {
 552                         // Allow "...", "!!", "!?!", etc...
 553                     } else {
 554                         sentence_boundary = NO;
 555                     }
 556                     break;
 557                 case YES:
 558                     break;
 559             }
 560
 561             // Start the snippet at the start of the first word, but include
 562             // certain punctuation too.
 563             if (Unicode::is_wordchar(ch)) {
 564                 // But limit how much leading punctuation we include.
 565                 size_t word_begin = i.raw() - input.data();
 566                 if (word_begin - best_begin > 4) {
 567                     best_begin = word_begin;
 568                 }
 569                 break;
 570             }
 571             ++i;
 572             if (!snippet_check_leading_nonwordchar(ch)) {
 573                 best_begin = i.raw() - input.data();
 574             }
 575         }
 576
 577         // Add "..." or equivalent if this doesn't seem to be the start of a
 578         // sentence.
 579         if (sentence_boundary != YES) {
 580             output += omit;
 581         }
 582     }
 583
 584     if (word.highlight) {
 585         // Don't include inter-word characters in the highlight.
 586         Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
 587         while (i != Utf8Iterator()) {
 588             unsigned ch = *i;
 589             if (Unicode::is_wordchar(ch)) {
 590                 append_escaping_xml(input.data() + best_begin, i.raw(), output);
 591                 best_begin = i.raw() - input.data();
 592                 break;
 593             }
 594             ++i;
 595         }
 596     }
 597
 598     if (!phrase_len) {
 599         phrase_len = word.highlight;
 600         if (phrase_len) output += hi_start;
 601     }
 602
 603     const char* p = input.data();
 604     append_escaping_xml(p + best_begin, p + word.term_end, output);
 605     best_begin = word.term_end;
 606
 607     if (phrase_len && --phrase_len == 0) output += hi_end;
 608
 609     best_pipe.pop_front();
 610     return true;
 611 }
 612
 613 static void
 614 check_query(const Xapian::Query & query,
 615             list<vector<string>> & exact_phrases,
 616             unordered_map<string, double> & loose_terms,
 617             list<string> & wildcards,
 618             size_t & longest_phrase)
 619 {
 620     // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
 621     size_t n_subqs = query.get_num_subqueries();
 622     Xapian::Query::op op = query.get_type();
 623     if (op == query.LEAF_TERM) {
 624         const Xapian::Internal::QueryTerm & qt =
 625             *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
 626         loose_terms.insert(make_pair(qt.get_term(), 0));
 627     } else if (op == query.OP_WILDCARD) {
 628         const Xapian::Internal::QueryWildcard & qw =
 629             *static_cast<const Xapian::Internal::QueryWildcard *>(query.internal.get());
 630         wildcards.push_back(qw.get_pattern());
 631     } else if (op == query.OP_PHRASE) {
 632         const Xapian::Internal::QueryPhrase & phrase =
 633             *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
 634         if (phrase.get_window() == n_subqs) {
 635             // Tight phrase.
 636             for (size_t i = 0; i != n_subqs; ++i) {
 637                 if (query.get_subquery(i).get_type() != query.LEAF_TERM)
 638                     goto non_term_subquery;
 639             }
 640
 641             // Tight phrase of terms.
 642             exact_phrases.push_back(vector<string>());
 643             vector<string> & terms = exact_phrases.back();
 644             terms.reserve(n_subqs);
 645             for (size_t i = 0; i != n_subqs; ++i) {
 646                 Xapian::Query q = query.get_subquery(i);
 647                 const Xapian::Internal::QueryTerm & qt =
 648                     *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
 649                 terms.push_back(qt.get_term());
 650             }
 651             if (n_subqs > longest_phrase) longest_phrase = n_subqs;
 652             return;
 653         }
 654     }
 655 non_term_subquery:
 656     for (size_t i = 0; i != n_subqs; ++i)
 657         check_query(query.get_subquery(i), exact_phrases, loose_terms,
 658                     wildcards, longest_phrase);
 659 }
 660
 661 static double*
 662 check_term(unordered_map<string, double> & loose_terms,
 663            const Xapian::Weight::Internal * stats,
 664            const string & term,
 665            double max_tw)
 666 {
 667     auto it = loose_terms.find(term);
 668     if (it == loose_terms.end()) return NULL;
 669
 670     if (it->second == 0.0) {
 671         double relevance;
 672         if (!stats->get_termweight(term, relevance)) {
 673             // FIXME: Assert?
 674             loose_terms.erase(it);
 675             return NULL;
 676         }
 677
 678         it->second = relevance + max_tw;
 679     }
 680     return &it->second;
 681 }
 682
 683 string
 684 MSet::Internal::snippet(const string & text,
 685                         size_t length,
 686                         const Xapian::Stem & stemmer,
 687                         unsigned flags,
 688                         const string & hi_start,
 689                         const string & hi_end,
 690                         const string & omit) const
 691 {
 692     if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
 693         // Too easy!
 694         return text;
 695     }
 696
 697     bool cjk_ngram = CJK::is_cjk_enabled();
 698
 699     size_t term_start = 0;
 700     double min_tw = 0, max_tw = 0;
 701     if (stats) stats->get_max_termweight(min_tw, max_tw);
 702     if (max_tw == 0.0) {
 703         max_tw = 1.0;
 704     } else {
 705         // Scale up by (1 + 1/64) so that highlighting works better for terms
 706         // with termweight 0 (which happens for terms not in the database, and
 707         // also with some weighting schemes for terms which occur in almost all
 708         // documents.
 709         max_tw *= 1.015625;
 710     }
 711
 712     SnipPipe snip(length);
 713
 714     list<vector<string>> exact_phrases;
 715     unordered_map<string, double> loose_terms;
 716     list<string> wildcards;
 717     size_t longest_phrase = 0;
 718     check_query(enquire->query, exact_phrases, loose_terms,
 719                 wildcards, longest_phrase);
 720
 721     vector<double> exact_phrases_relevance;
 722     exact_phrases_relevance.reserve(exact_phrases.size());
 723     for (auto&& terms : exact_phrases) {
 724         // FIXME: What relevance to use?
 725         exact_phrases_relevance.push_back(max_tw * terms.size());
 726     }
 727
 728     vector<double> wildcards_relevance;
 729     wildcards_relevance.reserve(exact_phrases.size());
 730     for (auto&& pattern : wildcards) {
 731         // FIXME: What relevance to use?
 732         (void)pattern;
 733         wildcards_relevance.push_back(max_tw + min_tw);
 734     }
 735
 736     // Background relevance is the same for a given MSet, so cache it
 737     // between calls to MSet::snippet() on the same object.
 738     unordered_map<string, double>& background = snippet_bg_relevance;
 739
 740     vector<string> phrase;
 741     if (longest_phrase) phrase.resize(longest_phrase - 1);
 742     size_t phrase_next = 0;
 743     bool matchfound = false;
 744     parse_terms(Utf8Iterator(text), cjk_ngram, true,
 745         [&](const string & term, bool positional, const Utf8Iterator & it) {
 746             // FIXME: Don't hardcode this here.
 747             const size_t max_word_length = 64;
 748
 749             if (!positional) return true;
 750             if (term.size() > max_word_length) return true;
 751
 752             // We get segments with any "inter-word" characters in front of
 753             // each word, e.g.:
 754             // [The][ cat][ sat][ on][ the][ mat]
 755             size_t term_end = text.size() - it.left();
 756
 757             double* relevance = 0;
 758             size_t highlight = 0;
 759             if (stats) {
 760                 size_t i = 0;
 761                 for (auto&& terms : exact_phrases) {
 762                     if (term == terms.back()) {
 763                         size_t n = terms.size() - 1;
 764                         bool match = true;
 765                         while (n--) {
 766                             if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
 767                                 match = false;
 768                                 break;
 769                             }
 770                         }
 771                         if (match) {
 772                             // FIXME: Sort phrases, highest score first!
 773                             relevance = &exact_phrases_relevance[i];
 774                             highlight = terms.size();
 775                             goto relevance_done;
 776                         }
 777                     }
 778                     ++i;
 779                 }
 780
 781                 relevance = check_term(loose_terms, stats.get(), term, max_tw);
 782                 if (relevance) {
 783                     // Matched unstemmed term.
 784                     highlight = 1;
 785                     goto relevance_done;
 786                 }
 787
 788                 string stem = "Z";
 789                 stem += stemmer(term);
 790                 relevance = check_term(loose_terms, stats.get(), stem, max_tw);
 791                 if (relevance) {
 792                     // Matched stemmed term.
 793                     highlight = 1;
 794                     goto relevance_done;
 795                 }
 796
 797                 // Check wildcards.
 798                 // FIXME: Sort wildcards, shortest pattern first or something?
 799                 i = 0;
 800                 for (auto&& pattern : wildcards) {
 801                     if (startswith(term, pattern)) {
 802                         relevance = &wildcards_relevance[i];
 803                         highlight = 1;
 804                         goto relevance_done;
 805                     }
 806                     ++i;
 807                 }
 808
 809                 if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
 810                     // Background document model.
 811                     auto bgit = background.find(term);
 812                     if (bgit == background.end()) bgit = background.find(stem);
 813                     if (bgit == background.end()) {
 814                         Xapian::doccount tf = enquire->db.get_termfreq(term);
 815                         if (!tf) {
 816                             tf = enquire->db.get_termfreq(stem);
 817                         } else {
 818                             stem = term;
 819                         }
 820                         double r = 0.0;
 821                         if (tf) {
 822                             // Add one to avoid log(0) when a term indexes all
 823                             // documents.
 824                             Xapian::doccount num_docs = stats->collection_size + 1;
 825                             r = max_tw * log((num_docs - tf) / double(tf));
 826                             r /= (length + 1) * log(double(num_docs));
 827 #if 0
 828                             if (r <= 0) {
 829                                 Utf8Iterator i(text.data() + term_start, text.data() + term_end);
 830                                 while (i != Utf8Iterator()) {
 831                                     if (Unicode::get_category(*i++) == Unicode::UPPERCASE_LETTER) {
 832                                         r = max_tw * 0.05;
 833                                     }
 834                                 }
 835                             }
 836 #endif
 837                         }
 838                         bgit = background.emplace(make_pair(stem, r)).first;
 839                     }
 840                     relevance = &bgit->second;
 841                 }
 842             } else {
 843 #if 0
 844                 // In the absence of weight information, assume longer terms
 845                 // are more relevant, and that unstemmed matches are a bit more
 846                 // relevant than stemmed matches.
 847                 if (queryterms.find(term) != queryterms.end()) {
 848                     relevance = term.size() * 3;
 849                 } else {
 850                     string stem = "Z";
 851                     stem += stemmer(term);
 852                     if (queryterms.find(stem) != queryterms.end()) {
 853                         relevance = term.size() * 2;
 854                     }
 855                 }
 856 #endif
 857             }
 858
 859             // FIXME: Allow Enquire without a DB set or an empty MSet() to be
 860             // used if you don't want the collection model?
 861
 862 #if 0
 863             // FIXME: Punctuation should somehow be included in the model, but this
 864             // approach is problematic - we don't want the first word of a sentence
 865             // to be favoured when it's at the end of the window.
 866
 867             // Give first word in each sentence a relevance boost.
 868             if (term_start == 0) {
 869                 relevance += 10;
 870             } else {
 871                 for (size_t i = term_start; i + term.size() < term_end; ++i) {
 872                     if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
 873                         relevance += 10;
 874                         break;
 875                     }
 876                 }
 877             }
 878 #endif
 879
 880 relevance_done:
 881             if (longest_phrase) {
 882                 phrase[phrase_next] = term;
 883                 phrase_next = (phrase_next + 1) % (longest_phrase - 1);
 884             }
 885
 886             if (highlight) matchfound = true;
 887
 888             if (!snip.pump(relevance, term_end, highlight, flags)) return false;
 889
 890             term_start = term_end;
 891             return true;
 892         });
 893
 894     snip.done();
 895
 896     // Put together the snippet.
 897     string result;
 898     if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
 899         while (snip.drain(text, hi_start, hi_end, omit, result)) { }
 900     }
 901
 902     return result;
 903 }
 904
 905 }