[ci] Enable IRC notifications from travis
[xapian.git] / xapian-core / queryparser / termgenerator_internal.cc
blob5747d0dfb81e64c5ab376194a52fe9fb8182a16e
1 /** @file termgenerator_internal.cc
2 * @brief TermGenerator class internals
3 */
4 /* Copyright (C) 2007,2010,2011,2012,2015,2016,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "termgenerator_internal.h"
25 #include "api/msetinternal.h"
26 #include "api/queryinternal.h"
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
33 #include "stringutils.h"
35 #include <algorithm>
36 #include <cmath>
37 #include <deque>
38 #include <limits>
39 #include <list>
40 #include <string>
41 #include <unordered_map>
42 #include <vector>
44 #include "cjk-tokenizer.h"
46 using namespace std;
48 namespace Xapian {
50 inline bool
51 U_isupper(unsigned ch) {
52 return (ch < 128 && C_isupper(static_cast<unsigned char>(ch)));
55 inline unsigned check_wordchar(unsigned ch) {
56 if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
57 return 0;
60 inline bool
61 should_stem(const std::string & term)
63 const unsigned int SHOULD_STEM_MASK =
64 (1 << Unicode::LOWERCASE_LETTER) |
65 (1 << Unicode::TITLECASE_LETTER) |
66 (1 << Unicode::MODIFIER_LETTER) |
67 (1 << Unicode::OTHER_LETTER);
68 Utf8Iterator u(term);
69 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
/** Sentinel meaning "ignore this character" when returned by check_infix()
 *  or check_infix_digit().
 *
 *  It is the largest value an unsigned can hold.
 */
constexpr unsigned UNICODE_IGNORE = std::numeric_limits<unsigned>::max();
77 inline unsigned check_infix(unsigned ch) {
78 if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
79 // Unicode includes all these except '&' in its word boundary rules,
80 // as well as 0x2019 (which we handle below) and ':' (for Swedish
81 // apparently, but we ignore this for now as it's problematic in
82 // real world cases).
83 return ch;
85 // 0x2019 is Unicode apostrophe and single closing quote.
86 // 0x201b is Unicode single opening quote with the tail rising.
87 if (ch == 0x2019 || ch == 0x201b) return '\'';
88 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
89 return UNICODE_IGNORE;
90 return 0;
93 inline unsigned check_infix_digit(unsigned ch) {
94 // This list of characters comes from Unicode's word identifying algorithm.
95 switch (ch) {
96 case ',':
97 case '.':
98 case ';':
99 case 0x037e: // GREEK QUESTION MARK
100 case 0x0589: // ARMENIAN FULL STOP
101 case 0x060D: // ARABIC DATE SEPARATOR
102 case 0x07F8: // NKO COMMA
103 case 0x2044: // FRACTION SLASH
104 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
105 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
106 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
107 return ch;
109 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
110 return UNICODE_IGNORE;
111 return 0;
114 inline bool
115 is_digit(unsigned ch) {
116 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
/** Check whether a character may be appended to the end of a term.
 *
 *  @param ch  Unicode code point following a term.
 *
 *  @return @a ch if it is '+' or '#', otherwise 0.
 */
inline unsigned check_suffix(unsigned ch) {
    switch (ch) {
        case '+':
        case '#':
            return ch;
        default:
            // FIXME: what about '-'?
            return 0;
    }
}
125 /** Templated framework for processing terms.
127 * Calls action(term, positional) for each term to add, where term is a
128 * std::string holding the term, and positional is a bool indicating
129 * if this term carries positional information.
131 template<typename ACTION> void
132 parse_terms(Utf8Iterator itor, bool cjk_ngram, bool with_positions, ACTION action)
134 while (true) {
135 // Advance to the start of the next term.
136 unsigned ch;
137 while (true) {
138 if (itor == Utf8Iterator()) return;
139 ch = check_wordchar(*itor);
140 if (ch) break;
141 ++itor;
144 string term;
145 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
146 // Don't worry if there's a trailing '.' or not.
147 if (U_isupper(*itor)) {
148 const Utf8Iterator end;
149 Utf8Iterator p = itor;
150 do {
151 Unicode::append_utf8(term, Unicode::tolower(*p++));
152 } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
153 // One letter does not make an acronym! If we handled a single
154 // uppercase letter here, we wouldn't catch M&S below.
155 if (term.size() > 1) {
156 // Check there's not a (lower case) letter or digit
157 // immediately after it.
158 if (p == end || !Unicode::is_wordchar(*p)) {
159 itor = p;
160 goto endofterm;
163 term.resize(0);
166 while (true) {
167 if (cjk_ngram &&
168 CJK::codepoint_is_cjk(*itor) &&
169 Unicode::is_wordchar(*itor)) {
170 const string & cjk = CJK::get_cjk(itor);
171 for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
172 const string & cjk_token = *tk;
173 if (!action(cjk_token, with_positions && tk.get_length() == 1, itor))
174 return;
176 while (true) {
177 if (itor == Utf8Iterator()) return;
178 ch = check_wordchar(*itor);
179 if (ch) break;
180 ++itor;
182 continue;
184 unsigned prevch;
185 do {
186 Unicode::append_utf8(term, ch);
187 prevch = ch;
188 if (++itor == Utf8Iterator() ||
189 (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
190 goto endofterm;
191 ch = check_wordchar(*itor);
192 } while (ch);
194 Utf8Iterator next(itor);
195 ++next;
196 if (next == Utf8Iterator()) break;
197 unsigned nextch = check_wordchar(*next);
198 if (!nextch) break;
199 unsigned infix_ch = *itor;
200 if (is_digit(prevch) && is_digit(*next)) {
201 infix_ch = check_infix_digit(infix_ch);
202 } else {
203 // Handle things like '&' in AT&T, apostrophes, etc.
204 infix_ch = check_infix(infix_ch);
206 if (!infix_ch) break;
207 if (infix_ch != UNICODE_IGNORE)
208 Unicode::append_utf8(term, infix_ch);
209 ch = nextch;
210 itor = next;
214 size_t len = term.size();
215 unsigned count = 0;
216 while ((ch = check_suffix(*itor))) {
217 if (++count > 3) {
218 term.resize(len);
219 break;
221 Unicode::append_utf8(term, ch);
222 if (++itor == Utf8Iterator()) goto endofterm;
224 // Don't index fish+chips as fish+ chips.
225 if (Unicode::is_wordchar(*itor))
226 term.resize(len);
229 endofterm:
230 if (!action(term, with_positions, itor))
231 return;
235 void
236 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
237 const string & prefix, bool with_positions)
239 bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();
241 stop_strategy current_stop_mode;
242 if (!stopper.get()) {
243 current_stop_mode = TermGenerator::STOP_NONE;
244 } else {
245 current_stop_mode = stop_mode;
248 parse_terms(itor, cjk_ngram, with_positions,
249 [=](const string & term, bool positional, const Utf8Iterator &) {
250 if (term.size() > max_word_length) return true;
252 if (current_stop_mode == TermGenerator::STOP_ALL && (*stopper)(term))
253 return true;
255 if (strategy == TermGenerator::STEM_SOME ||
256 strategy == TermGenerator::STEM_NONE) {
257 if (positional) {
258 doc.add_posting(prefix + term, ++termpos, wdf_inc);
259 } else {
260 doc.add_term(prefix + term, wdf_inc);
264 // MSVC seems to need "this->" on member variables in this
265 // situation.
266 if ((this->flags & FLAG_SPELLING) && prefix.empty())
267 db.add_spelling(term);
269 if (strategy == TermGenerator::STEM_NONE ||
270 !stemmer.internal.get()) return true;
272 if (strategy == TermGenerator::STEM_SOME) {
273 if (current_stop_mode == TermGenerator::STOP_STEMMED &&
274 (*stopper)(term))
275 return true;
277 // Note, this uses the lowercased term, but that's OK as we
278 // only want to avoid stemming terms starting with a digit.
279 if (!should_stem(term)) return true;
282 // Add stemmed form without positional information.
283 const string& stem = stemmer(term);
284 if (rare(stem.empty())) return true;
285 string stemmed_term;
286 if (strategy != TermGenerator::STEM_ALL) {
287 stemmed_term += "Z";
289 stemmed_term += prefix;
290 stemmed_term += stem;
291 if (strategy != TermGenerator::STEM_SOME && with_positions) {
292 doc.add_posting(stemmed_term, ++termpos, wdf_inc);
293 } else {
294 doc.add_term(stemmed_term, wdf_inc);
296 return true;
/** One entry in a SnipPipe: a term plus the text leading up to it. */
struct Sniplet {
    /// Pointer to the relevance value for this entry (may be null).
    double* relevance;

    /// Offset in the text of the end of this entry's term.
    size_t term_end;

    /// Highlight value for this entry (0 means no highlight).
    size_t highlight;

    /// Construct from relevance pointer, end offset and highlight value.
    Sniplet(double* relevance_, size_t term_end_, size_t highlight_)
        : relevance(relevance_),
          term_end(term_end_),
          highlight(highlight_)
    {
    }
};
311 class SnipPipe {
312 deque<Sniplet> pipe;
313 deque<Sniplet> best_pipe;
315 // Requested length for snippet.
316 size_t length;
318 // Position in text of start of current pipe contents.
319 size_t begin = 0;
321 // Rolling sum of the current pipe contents.
322 double sum = 0;
324 size_t phrase_len = 0;
326 public:
327 size_t best_begin = 0;
329 size_t best_end = 0;
331 double best_sum = 0;
333 // Add one to length to allow for inter-word space.
334 // FIXME: We ought to correctly allow for multiple spaces.
335 explicit SnipPipe(size_t length_) : length(length_ + 1) { }
337 bool pump(double* r, size_t t, size_t h, unsigned flags);
339 void done();
341 bool drain(const string & input,
342 const string & hi_start,
343 const string & hi_end,
344 const string & omit,
345 string & output);
348 #define DECAY 2.0
350 inline bool
351 SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
353 if (h > 1) {
354 if (pipe.size() >= h - 1) {
355 // The final term of a phrase is entering the window. Peg the
356 // phrase's relevance onto the first term of the phrase, so it'll
357 // be removed from `sum` when the phrase starts to leave the
358 // window.
359 auto & phrase_start = pipe[pipe.size() - (h - 1)];
360 if (phrase_start.relevance) {
361 *phrase_start.relevance *= DECAY;
362 sum -= *phrase_start.relevance;
364 sum += *r;
365 phrase_start.relevance = r;
366 phrase_start.highlight = h;
367 *r /= DECAY;
369 r = NULL;
370 h = 0;
372 pipe.emplace_back(r, t, h);
373 if (r) {
374 sum += *r;
375 *r /= DECAY;
378 // If necessary, discard words from the start of the pipe until it has the
379 // desired length.
380 // FIXME: Also shrink the window past words with relevance < 0?
381 while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
382 const Sniplet& word = pipe.front();
383 if (word.relevance) {
384 *word.relevance *= DECAY;
385 sum -= *word.relevance;
387 begin = word.term_end;
388 if (best_end >= begin)
389 best_pipe.push_back(word);
390 pipe.pop_front();
391 // E.g. can happen if the current term is longer than the requested
392 // length!
393 if (rare(pipe.empty())) break;
396 // Using > here doesn't work well, as we don't extend a snippet over terms
397 // with 0 weight.
398 if (sum >= best_sum) {
399 // Discard any part of `best_pipe` which is before `begin`.
400 if (begin >= best_end) {
401 best_pipe.clear();
402 } else {
403 while (!best_pipe.empty() &&
404 best_pipe.front().term_end <= begin) {
405 best_pipe.pop_front();
408 best_sum = sum;
409 best_begin = begin;
410 best_end = t;
411 } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
412 if (best_sum > 0 && best_end < begin) {
413 // We found something, and we aren't still looking near it.
414 // FIXME: Benchmark this and adjust if necessary.
415 return false;
418 return true;
421 inline void
422 SnipPipe::done()
424 // Discard any part of `pipe` which is after `best_end`.
425 if (begin >= best_end) {
426 pipe.clear();
427 } else {
428 // We should never empty the pipe (as that case should be handled
429 // above).
430 while (rare(!pipe.empty()) &&
431 pipe.back().term_end > best_end) {
432 pipe.pop_back();
437 // Check if a non-word character is should be included at the start of the
438 // snippet. We want to include certain leading non-word characters, but not
439 // others.
440 inline bool
441 snippet_check_leading_nonwordchar(unsigned ch) {
442 if (Unicode::is_currency(ch) ||
443 Unicode::get_category(ch) == Unicode::OPEN_PUNCTUATION ||
444 Unicode::get_category(ch) == Unicode::INITIAL_QUOTE_PUNCTUATION) {
445 return true;
447 switch (ch) {
448 case '"':
449 case '#':
450 case '%':
451 case '&':
452 case '\'':
453 case '+':
454 case '-':
455 case '/':
456 case '<':
457 case '@':
458 case '\\':
459 case '`':
460 case '~':
461 case 0x00A1: // INVERTED EXCLAMATION MARK
462 case 0x00A7: // SECTION SIGN
463 case 0x00BF: // INVERTED QUESTION MARK
464 return true;
466 return false;
/** Append a range of characters to a string, escaping XML metacharacters.
 *
 *  '&', '<' and '>' are replaced by "&amp;", "&lt;" and "&gt;"; every other
 *  byte is copied unchanged.
 *
 *  @param p       Start of the range to append.
 *  @param end     One past the end of the range to append.
 *  @param output  String to append to.
 */
inline void
append_escaping_xml(const char* p, const char* end, std::string& output)
{
    for (; p != end; ++p) {
        const char ch = *p;
        if (ch == '&') {
            output += "&amp;";
        } else if (ch == '<') {
            output += "&lt;";
        } else if (ch == '>') {
            output += "&gt;";
        } else {
            output += ch;
        }
    }
}
490 inline bool
491 SnipPipe::drain(const string & input,
492 const string & hi_start,
493 const string & hi_end,
494 const string & omit,
495 string & output)
497 if (best_pipe.empty() && !pipe.empty()) {
498 swap(best_pipe, pipe);
501 if (best_pipe.empty()) {
502 size_t tail_len = input.size() - best_end;
503 if (tail_len == 0) return false;
505 // See if this is the end of a sentence.
506 // FIXME: This is quite simplistic - look at the Unicode rules:
507 // http://unicode.org/reports/tr29/#Sentence_Boundaries
508 bool punc = false;
509 Utf8Iterator i(input.data() + best_end, tail_len);
510 while (i != Utf8Iterator()) {
511 unsigned ch = *i;
512 if (punc && Unicode::is_whitespace(ch)) break;
514 // Allow "...", "!!", "!?!", etc...
515 punc = (ch == '.' || ch == '?' || ch == '!');
517 if (Unicode::is_wordchar(ch)) break;
518 ++i;
521 if (punc) {
522 // Include end of sentence punctuation.
523 append_escaping_xml(input.data() + best_end, i.raw(), output);
524 } else {
525 // Append "..." or equivalent if this doesn't seem to be the start
526 // of a sentence.
527 output += omit;
530 return false;
533 const Sniplet & word = best_pipe.front();
535 if (output.empty()) {
536 // Start of the snippet.
537 enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;
539 Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
540 while (i != Utf8Iterator()) {
541 unsigned ch = *i;
542 switch (sentence_boundary) {
543 case NO:
544 if (ch == '.' || ch == '?' || ch == '!') {
545 sentence_boundary = PUNC;
547 break;
548 case PUNC:
549 if (Unicode::is_whitespace(ch)) {
550 sentence_boundary = YES;
551 } else if (ch == '.' || ch == '?' || ch == '!') {
552 // Allow "...", "!!", "!?!", etc...
553 } else {
554 sentence_boundary = NO;
556 break;
557 case YES:
558 break;
561 // Start the snippet at the start of the first word, but include
562 // certain punctuation too.
563 if (Unicode::is_wordchar(ch)) {
564 // But limit how much leading punctuation we include.
565 size_t word_begin = i.raw() - input.data();
566 if (word_begin - best_begin > 4) {
567 best_begin = word_begin;
569 break;
571 ++i;
572 if (!snippet_check_leading_nonwordchar(ch)) {
573 best_begin = i.raw() - input.data();
577 // Add "..." or equivalent if this doesn't seem to be the start of a
578 // sentence.
579 if (sentence_boundary != YES) {
580 output += omit;
584 if (word.highlight) {
585 // Don't include inter-word characters in the highlight.
586 Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
587 while (i != Utf8Iterator()) {
588 unsigned ch = *i;
589 if (Unicode::is_wordchar(ch)) {
590 append_escaping_xml(input.data() + best_begin, i.raw(), output);
591 best_begin = i.raw() - input.data();
592 break;
594 ++i;
598 if (!phrase_len) {
599 phrase_len = word.highlight;
600 if (phrase_len) output += hi_start;
603 const char* p = input.data();
604 append_escaping_xml(p + best_begin, p + word.term_end, output);
605 best_begin = word.term_end;
607 if (phrase_len && --phrase_len == 0) output += hi_end;
609 best_pipe.pop_front();
610 return true;
613 static void
614 check_query(const Xapian::Query & query,
615 list<vector<string>> & exact_phrases,
616 unordered_map<string, double> & loose_terms,
617 list<string> & wildcards,
618 size_t & longest_phrase)
620 // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
621 size_t n_subqs = query.get_num_subqueries();
622 Xapian::Query::op op = query.get_type();
623 if (op == query.LEAF_TERM) {
624 const Xapian::Internal::QueryTerm & qt =
625 *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
626 loose_terms.insert(make_pair(qt.get_term(), 0));
627 } else if (op == query.OP_WILDCARD) {
628 const Xapian::Internal::QueryWildcard & qw =
629 *static_cast<const Xapian::Internal::QueryWildcard *>(query.internal.get());
630 wildcards.push_back(qw.get_pattern());
631 } else if (op == query.OP_PHRASE) {
632 const Xapian::Internal::QueryPhrase & phrase =
633 *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
634 if (phrase.get_window() == n_subqs) {
635 // Tight phrase.
636 for (size_t i = 0; i != n_subqs; ++i) {
637 if (query.get_subquery(i).get_type() != query.LEAF_TERM)
638 goto non_term_subquery;
641 // Tight phrase of terms.
642 exact_phrases.push_back(vector<string>());
643 vector<string> & terms = exact_phrases.back();
644 terms.reserve(n_subqs);
645 for (size_t i = 0; i != n_subqs; ++i) {
646 Xapian::Query q = query.get_subquery(i);
647 const Xapian::Internal::QueryTerm & qt =
648 *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
649 terms.push_back(qt.get_term());
651 if (n_subqs > longest_phrase) longest_phrase = n_subqs;
652 return;
655 non_term_subquery:
656 for (size_t i = 0; i != n_subqs; ++i)
657 check_query(query.get_subquery(i), exact_phrases, loose_terms,
658 wildcards, longest_phrase);
661 static double*
662 check_term(unordered_map<string, double> & loose_terms,
663 const Xapian::Weight::Internal * stats,
664 const string & term,
665 double max_tw)
667 auto it = loose_terms.find(term);
668 if (it == loose_terms.end()) return NULL;
670 if (it->second == 0.0) {
671 double relevance;
672 if (!stats->get_termweight(term, relevance)) {
673 // FIXME: Assert?
674 loose_terms.erase(it);
675 return NULL;
678 it->second = relevance + max_tw;
680 return &it->second;
683 string
684 MSet::Internal::snippet(const string & text,
685 size_t length,
686 const Xapian::Stem & stemmer,
687 unsigned flags,
688 const string & hi_start,
689 const string & hi_end,
690 const string & omit) const
692 if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
693 // Too easy!
694 return text;
697 bool cjk_ngram = CJK::is_cjk_enabled();
699 size_t term_start = 0;
700 double min_tw = 0, max_tw = 0;
701 if (stats) stats->get_max_termweight(min_tw, max_tw);
702 if (max_tw == 0.0) {
703 max_tw = 1.0;
704 } else {
705 // Scale up by (1 + 1/64) so that highlighting works better for terms
706 // with termweight 0 (which happens for terms not in the database, and
707 // also with some weighting schemes for terms which occur in almost all
708 // documents.
709 max_tw *= 1.015625;
712 SnipPipe snip(length);
714 list<vector<string>> exact_phrases;
715 unordered_map<string, double> loose_terms;
716 list<string> wildcards;
717 size_t longest_phrase = 0;
718 check_query(enquire->query, exact_phrases, loose_terms,
719 wildcards, longest_phrase);
721 vector<double> exact_phrases_relevance;
722 exact_phrases_relevance.reserve(exact_phrases.size());
723 for (auto&& terms : exact_phrases) {
724 // FIXME: What relevance to use?
725 exact_phrases_relevance.push_back(max_tw * terms.size());
728 vector<double> wildcards_relevance;
729 wildcards_relevance.reserve(exact_phrases.size());
730 for (auto&& pattern : wildcards) {
731 // FIXME: What relevance to use?
732 (void)pattern;
733 wildcards_relevance.push_back(max_tw + min_tw);
736 // Background relevance is the same for a given MSet, so cache it
737 // between calls to MSet::snippet() on the same object.
738 unordered_map<string, double>& background = snippet_bg_relevance;
740 vector<string> phrase;
741 if (longest_phrase) phrase.resize(longest_phrase - 1);
742 size_t phrase_next = 0;
743 bool matchfound = false;
744 parse_terms(Utf8Iterator(text), cjk_ngram, true,
745 [&](const string & term, bool positional, const Utf8Iterator & it) {
746 // FIXME: Don't hardcode this here.
747 const size_t max_word_length = 64;
749 if (!positional) return true;
750 if (term.size() > max_word_length) return true;
752 // We get segments with any "inter-word" characters in front of
753 // each word, e.g.:
754 // [The][ cat][ sat][ on][ the][ mat]
755 size_t term_end = text.size() - it.left();
757 double* relevance = 0;
758 size_t highlight = 0;
759 if (stats) {
760 size_t i = 0;
761 for (auto&& terms : exact_phrases) {
762 if (term == terms.back()) {
763 size_t n = terms.size() - 1;
764 bool match = true;
765 while (n--) {
766 if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
767 match = false;
768 break;
771 if (match) {
772 // FIXME: Sort phrases, highest score first!
773 relevance = &exact_phrases_relevance[i];
774 highlight = terms.size();
775 goto relevance_done;
778 ++i;
781 relevance = check_term(loose_terms, stats.get(), term, max_tw);
782 if (relevance) {
783 // Matched unstemmed term.
784 highlight = 1;
785 goto relevance_done;
788 string stem = "Z";
789 stem += stemmer(term);
790 relevance = check_term(loose_terms, stats.get(), stem, max_tw);
791 if (relevance) {
792 // Matched stemmed term.
793 highlight = 1;
794 goto relevance_done;
797 // Check wildcards.
798 // FIXME: Sort wildcards, shortest pattern first or something?
799 i = 0;
800 for (auto&& pattern : wildcards) {
801 if (startswith(term, pattern)) {
802 relevance = &wildcards_relevance[i];
803 highlight = 1;
804 goto relevance_done;
806 ++i;
809 if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
810 // Background document model.
811 auto bgit = background.find(term);
812 if (bgit == background.end()) bgit = background.find(stem);
813 if (bgit == background.end()) {
814 Xapian::doccount tf = enquire->db.get_termfreq(term);
815 if (!tf) {
816 tf = enquire->db.get_termfreq(stem);
817 } else {
818 stem = term;
820 double r = 0.0;
821 if (tf) {
822 // Add one to avoid log(0) when a term indexes all
823 // documents.
824 Xapian::doccount num_docs = stats->collection_size + 1;
825 r = max_tw * log((num_docs - tf) / double(tf));
826 r /= (length + 1) * log(double(num_docs));
827 #if 0
828 if (r <= 0) {
829 Utf8Iterator i(text.data() + term_start, text.data() + term_end);
830 while (i != Utf8Iterator()) {
831 if (Unicode::get_category(*i++) == Unicode::UPPERCASE_LETTER) {
832 r = max_tw * 0.05;
836 #endif
838 bgit = background.emplace(make_pair(stem, r)).first;
840 relevance = &bgit->second;
842 } else {
843 #if 0
844 // In the absence of weight information, assume longer terms
845 // are more relevant, and that unstemmed matches are a bit more
846 // relevant than stemmed matches.
847 if (queryterms.find(term) != queryterms.end()) {
848 relevance = term.size() * 3;
849 } else {
850 string stem = "Z";
851 stem += stemmer(term);
852 if (queryterms.find(stem) != queryterms.end()) {
853 relevance = term.size() * 2;
856 #endif
859 // FIXME: Allow Enquire without a DB set or an empty MSet() to be
860 // used if you don't want the collection model?
862 #if 0
863 // FIXME: Punctuation should somehow be included in the model, but this
864 // approach is problematic - we don't want the first word of a sentence
865 // to be favoured when it's at the end of the window.
867 // Give first word in each sentence a relevance boost.
868 if (term_start == 0) {
869 relevance += 10;
870 } else {
871 for (size_t i = term_start; i + term.size() < term_end; ++i) {
872 if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
873 relevance += 10;
874 break;
878 #endif
880 relevance_done:
881 if (longest_phrase) {
882 phrase[phrase_next] = term;
883 phrase_next = (phrase_next + 1) % (longest_phrase - 1);
886 if (highlight) matchfound = true;
888 if (!snip.pump(relevance, term_end, highlight, flags)) return false;
890 term_start = term_end;
891 return true;
894 snip.done();
896 // Put together the snippet.
897 string result;
898 if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
899 while (snip.drain(text, hi_start, hi_end, omit, result)) { }
902 return result;