1 /** @file termgenerator_internal.cc
2 * @brief TermGenerator class internals
4 /* Copyright (C) 2007,2010,2011,2012,2015,2016,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "termgenerator_internal.h"
25 #include "api/msetinternal.h"
26 #include "api/queryinternal.h"
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
33 #include "stringutils.h"
41 #include <unordered_map>
44 #include "cjk-tokenizer.h"
51 U_isupper(unsigned ch
) {
52 return (ch
< 128 && C_isupper(static_cast<unsigned char>(ch
)));
55 inline unsigned check_wordchar(unsigned ch
) {
56 if (Unicode::is_wordchar(ch
)) return Unicode::tolower(ch
);
61 should_stem(const std::string
& term
)
63 const unsigned int SHOULD_STEM_MASK
=
64 (1 << Unicode::LOWERCASE_LETTER
) |
65 (1 << Unicode::TITLECASE_LETTER
) |
66 (1 << Unicode::MODIFIER_LETTER
) |
67 (1 << Unicode::OTHER_LETTER
);
69 return ((SHOULD_STEM_MASK
>> Unicode::get_category(*u
)) & 1);
/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
const unsigned UNICODE_IGNORE = std::numeric_limits<unsigned>::max();

/** Handle a character seen between two word characters.
 *
 *  Returns the character to append to the term (which may differ from ch),
 *  UNICODE_IGNORE if the character should be skipped entirely, or 0 if it
 *  ends the term.
 */
inline unsigned check_infix(unsigned ch) {
    if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
	// Unicode includes all these except '&' in its word boundary rules,
	// as well as 0x2019 (which we handle below) and ':' (for Swedish
	// apparently, but we ignore this for now as it's problematic in
	// real world cases).
	return ch;
    }
    // 0x2019 is Unicode apostrophe and single closing quote.
    // 0x201b is Unicode single opening quote with the tail rising.
    if (ch == 0x2019 || ch == 0x201b) return '\'';
    // Ignore zero-width space/joiner/non-joiner, word joiner and zero-width
    // no-break space (BOM).
    if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
	return UNICODE_IGNORE;
    return 0;
}
93 inline unsigned check_infix_digit(unsigned ch
) {
94 // This list of characters comes from Unicode's word identifying algorithm.
99 case 0x037e: // GREEK QUESTION MARK
100 case 0x0589: // ARMENIAN FULL STOP
101 case 0x060D: // ARABIC DATE SEPARATOR
102 case 0x07F8: // NKO COMMA
103 case 0x2044: // FRACTION SLASH
104 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
105 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
106 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
109 if (ch
>= 0x200b && (ch
<= 0x200d || ch
== 0x2060 || ch
== 0xfeff))
110 return UNICODE_IGNORE
;
115 is_digit(unsigned ch
) {
116 return (Unicode::get_category(ch
) == Unicode::DECIMAL_DIGIT_NUMBER
);
/** Handle a character seen just after the end of a word.
 *
 *  Returns the character to append to the term if it's a recognised suffix
 *  character ('+' or '#', e.g. for "C++" or "C#"), or 0 otherwise.
 */
inline unsigned check_suffix(unsigned ch) {
    if (ch == '+' || ch == '#') return ch;
    // FIXME: what about '-'?
    return 0;
}
125 /** Templated framework for processing terms.
127 * Calls action(term, positional) for each term to add, where term is a
128 * std::string holding the term, and positional is a bool indicating
129 * if this term carries positional information.
 *
 * NOTE(review): this extraction has dropped source lines (the embedded
 * original line numbers skip), so some declarations, braces, loops and
 * returns are absent below.
 */
131 template<typename ACTION
> void
132 parse_terms(Utf8Iterator itor
, bool cjk_ngram
, bool with_positions
, ACTION action
)
135 // Advance to the start of the next term.
138 if (itor
== Utf8Iterator()) return;
139 ch
= check_wordchar(*itor
);
145 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
146 // Don't worry if there's a trailing '.' or not.
147 if (U_isupper(*itor
)) {
148 const Utf8Iterator end
;
149 Utf8Iterator p
= itor
;
151 Unicode::append_utf8(term
, Unicode::tolower(*p
++));
152 } while (p
!= end
&& *p
== '.' && ++p
!= end
&& U_isupper(*p
));
153 // One letter does not make an acronym! If we handled a single
154 // uppercase letter here, we wouldn't catch M&S below.
155 if (term
.size() > 1) {
156 // Check there's not a (lower case) letter or digit
157 // immediately after it.
158 if (p
== end
|| !Unicode::is_wordchar(*p
)) {
// CJK n-gram mode: split a run of CJK text into tokens and feed each one
// to action().
168 CJK::codepoint_is_cjk(*itor
) &&
169 Unicode::is_wordchar(*itor
)) {
170 const string
& cjk
= CJK::get_cjk(itor
);
171 for (CJKTokenIterator
tk(cjk
); tk
!= CJKTokenIterator(); ++tk
) {
172 const string
& cjk_token
= *tk
;
173 if (!action(cjk_token
, with_positions
&& tk
.get_length() == 1, itor
))
177 if (itor
== Utf8Iterator()) return;
178 ch
= check_wordchar(*itor
);
186 Unicode::append_utf8(term
, ch
);
188 if (++itor
== Utf8Iterator() ||
189 (cjk_ngram
&& CJK::codepoint_is_cjk(*itor
)))
191 ch
= check_wordchar(*itor
);
// Peek one character ahead to decide whether *itor is an infix character
// (one which joins two word characters into a single term).
194 Utf8Iterator
next(itor
);
196 if (next
== Utf8Iterator()) break;
197 unsigned nextch
= check_wordchar(*next
);
199 unsigned infix_ch
= *itor
;
200 if (is_digit(prevch
) && is_digit(*next
)) {
201 infix_ch
= check_infix_digit(infix_ch
);
203 // Handle things like '&' in AT&T, apostrophes, etc.
204 infix_ch
= check_infix(infix_ch
);
206 if (!infix_ch
) break;
207 if (infix_ch
!= UNICODE_IGNORE
)
208 Unicode::append_utf8(term
, infix_ch
);
// NOTE(review): len presumably lets a bogus trailing suffix be trimmed off
// again; the line which uses it is missing from this extraction - confirm
// against upstream.
214 size_t len
= term
.size();
216 while ((ch
= check_suffix(*itor
))) {
221 Unicode::append_utf8(term
, ch
);
222 if (++itor
== Utf8Iterator()) goto endofterm
;
224 // Don't index fish+chips as fish+ chips.
225 if (Unicode::is_wordchar(*itor
))
// Hand the completed term to action(); a false return presumably stops
// parsing (cf. the lambdas in index_text() and MSet::Internal::snippet()).
230 if (!action(term
, with_positions
, itor
))
// Tokenise text from `itor` and index it into this->doc with the given term
// prefix and wdf increment, honouring the stopper/stop_mode and the stemming
// strategy.  Spelling data is added when FLAG_SPELLING is set and there is
// no prefix.
// NOTE(review): this extraction is missing lines (the embedded original line
// numbers skip), so some braces and else branches are absent below.
236 TermGenerator::Internal::index_text(Utf8Iterator itor
, termcount wdf_inc
,
237 const string
& prefix
, bool with_positions
)
239 bool cjk_ngram
= (flags
& FLAG_CJK_NGRAM
) || CJK::is_cjk_enabled();
241 stop_strategy current_stop_mode
;
// With no stopper set, stopping is disabled regardless of stop_mode.
242 if (!stopper
.get()) {
243 current_stop_mode
= TermGenerator::STOP_NONE
;
245 current_stop_mode
= stop_mode
;
248 parse_terms(itor
, cjk_ngram
, with_positions
,
249 [=](const string
& term
, bool positional
, const Utf8Iterator
&) {
250 if (term
.size() > max_word_length
) return true;
// NOTE(review): the branch body for STOP_ALL stopwords is missing from this
// extraction.
252 if (current_stop_mode
== TermGenerator::STOP_ALL
&& (*stopper
)(term
))
// Unstemmed form is added for the STEM_SOME and STEM_NONE strategies.
255 if (strategy
== TermGenerator::STEM_SOME
||
256 strategy
== TermGenerator::STEM_NONE
) {
258 doc
.add_posting(prefix
+ term
, ++termpos
, wdf_inc
);
260 doc
.add_term(prefix
+ term
, wdf_inc
);
264 // MSVC seems to need "this->" on member variables in this
266 if ((this->flags
& FLAG_SPELLING
) && prefix
.empty())
267 db
.add_spelling(term
);
// Stop here if no stemmed form is wanted or no stemmer is set.
269 if (strategy
== TermGenerator::STEM_NONE
||
270 !stemmer
.internal
.get()) return true;
272 if (strategy
== TermGenerator::STEM_SOME
) {
273 if (current_stop_mode
== TermGenerator::STOP_STEMMED
&&
277 // Note, this uses the lowercased term, but that's OK as we
278 // only want to avoid stemming terms starting with a digit.
279 if (!should_stem(term
)) return true;
282 // Add stemmed form without positional information.
283 const string
& stem
= stemmer(term
);
284 if (rare(stem
.empty())) return true;
// NOTE(review): stemmed_term's declaration/initialisation is missing from
// this extraction.
286 if (strategy
!= TermGenerator::STEM_ALL
) {
289 stemmed_term
+= prefix
;
290 stemmed_term
+= stem
;
291 if (strategy
!= TermGenerator::STEM_SOME
&& with_positions
) {
292 doc
.add_posting(stemmed_term
, ++termpos
, wdf_inc
);
294 doc
.add_term(stemmed_term
, wdf_inc
);
307 Sniplet(double* r
, size_t t
, size_t h
)
308 : relevance(r
), term_end(t
), highlight(h
) { }
// Sliding-window pipeline used to build a snippet: candidate terms enter
// `pipe`, and the best-scoring window seen so far is kept in `best_pipe`
// (see pump() below, which compares `sum` against `best_sum`).
// NOTE(review): declarations for several members used by pump()/drain()
// (length, begin, sum, best_sum, best_end, output, ...) are missing from
// this extraction.
313 deque
<Sniplet
> best_pipe
;
315 // Requested length for snippet.
318 // Position in text of start of current pipe contents.
321 // Rolling sum of the current pipe contents.
324 size_t phrase_len
= 0;
327 size_t best_begin
= 0;
333 // Add one to length to allow for inter-word space.
334 // FIXME: We ought to correctly allow for multiple spaces.
335 explicit SnipPipe(size_t length_
) : length(length_
+ 1) { }
337 bool pump(double* r
, size_t t
, size_t h
, unsigned flags
);
// NOTE(review): drain()'s trailing parameters (omit, output - cf. the call
// site in MSet::Internal::snippet) are missing from this extraction.
341 bool drain(const string
& input
,
342 const string
& hi_start
,
343 const string
& hi_end
,
// Push one term into the sliding window: r points at its relevance score
// (may be null), t is the offset just past it in the text, h is its
// highlight/phrase-length counter.  The caller stops feeding terms when this
// returns false (cf. its use in MSet::Internal::snippet).
// NOTE(review): this extraction is missing lines (the embedded original line
// numbers skip), so some branches and returns are absent below.
351 SnipPipe::pump(double* r
, size_t t
, size_t h
, unsigned flags
)
354 if (pipe
.size() >= h
- 1) {
355 // The final term of a phrase is entering the window. Peg the
356 // phrase's relevance onto the first term of the phrase, so it'll
357 // be removed from `sum` when the phrase starts to leave the
359 auto & phrase_start
= pipe
[pipe
.size() - (h
- 1)];
360 if (phrase_start
.relevance
) {
361 *phrase_start
.relevance
*= DECAY
;
362 sum
-= *phrase_start
.relevance
;
365 phrase_start
.relevance
= r
;
366 phrase_start
.highlight
= h
;
372 pipe
.emplace_back(r
, t
, h
);
378 // If necessary, discard words from the start of the pipe until it has the
380 // FIXME: Also shrink the window past words with relevance < 0?
381 while (t
- begin
> length
/* || pipe.front().relevance < 0.0 */) {
382 const Sniplet
& word
= pipe
.front();
383 if (word
.relevance
) {
384 *word
.relevance
*= DECAY
;
385 sum
-= *word
.relevance
;
387 begin
= word
.term_end
;
388 if (best_end
>= begin
)
389 best_pipe
.push_back(word
);
391 // E.g. can happen if the current term is longer than the requested
393 if (rare(pipe
.empty())) break;
396 // Using > here doesn't work well, as we don't extend a snippet over terms
// Keep the current window as the new best when its score ties or beats the
// best seen so far.
398 if (sum
>= best_sum
) {
399 // Discard any part of `best_pipe` which is before `begin`.
400 if (begin
>= best_end
) {
403 while (!best_pipe
.empty() &&
404 best_pipe
.front().term_end
<= begin
) {
405 best_pipe
.pop_front();
411 } else if ((flags
& Xapian::MSet::SNIPPET_EXHAUSTIVE
) == 0) {
412 if (best_sum
> 0 && best_end
< begin
) {
413 // We found something, and we aren't still looking near it.
414 // FIXME: Benchmark this and adjust if necessary.
424 // Discard any part of `pipe` which is after `best_end`.
425 if (begin
>= best_end
) {
428 // We should never empty the pipe (as that case should be handled
430 while (rare(!pipe
.empty()) &&
431 pipe
.back().term_end
> best_end
) {
// Emit the chosen snippet into the output string, wrapping highlighted
// words in hi_start/hi_end.  Returns false once there is nothing more to
// emit (callers loop: `while (snip.drain(...)) { }`).
// NOTE(review): this extraction is missing lines (the embedded original line
// numbers skip), including drain()'s trailing parameters (omit, output -
// cf. the call site in MSet::Internal::snippet) and some branches below.
438 SnipPipe::drain(const string
& input
,
439 const string
& hi_start
,
440 const string
& hi_end
,
// If nothing ever qualified as "best", fall back to whatever remains in the
// working pipe.
444 if (best_pipe
.empty() && !pipe
.empty()) {
445 swap(best_pipe
, pipe
);
448 if (best_pipe
.empty()) {
449 size_t tail_len
= input
.size() - best_end
;
450 if (tail_len
== 0) return false;
452 // See if this is the end of a sentence.
453 // FIXME: This is quite simplistic - look at the Unicode rules:
454 // http://unicode.org/reports/tr29/#Sentence_Boundaries
456 Utf8Iterator
i(input
.data() + best_end
, tail_len
);
457 while (i
!= Utf8Iterator()) {
459 if (punc
&& Unicode::is_whitespace(ch
)) break;
461 // Allow "...", "!!", "!?!", etc...
462 punc
= (ch
== '.' || ch
== '?' || ch
== '!');
464 if (Unicode::is_wordchar(ch
)) break;
469 // Include end of sentence punctuation.
470 output
.append(input
.data() + best_end
, i
.raw());
472 // Append "..." or equivalent if this doesn't seem to be the start
480 const Sniplet
& word
= best_pipe
.front();
// Start of the snippet: scan forward to trim back to a sentence/word start
// where possible.
482 if (output
.empty()) {
483 // Start of the snippet.
484 enum { NO
, PUNC
, YES
} sentence_boundary
= (best_begin
== 0) ? YES
: NO
;
486 Utf8Iterator
i(input
.data() + best_begin
, word
.term_end
- best_begin
);
487 while (i
!= Utf8Iterator()) {
489 switch (sentence_boundary
) {
491 if (ch
== '.' || ch
== '?' || ch
== '!') {
492 sentence_boundary
= PUNC
;
496 if (Unicode::is_whitespace(ch
)) {
497 sentence_boundary
= YES
;
498 } else if (ch
== '.' || ch
== '?' || ch
== '!') {
499 // Allow "...", "!!", "!?!", etc...
501 sentence_boundary
= NO
;
507 if (Unicode::is_wordchar(ch
)) {
508 // Start the snippet at the start of the first word.
509 best_begin
= i
.raw() - input
.data();
515 // Add "..." or equivalent if this doesn't seem to be the start of a
517 if (sentence_boundary
!= YES
) {
// word.highlight non-zero means this word starts a highlighted phrase of
// that many terms (set by pump(); counted down below via phrase_len).
522 if (word
.highlight
) {
523 // Don't include inter-word characters in the highlight.
524 Utf8Iterator
i(input
.data() + best_begin
, input
.size() - best_begin
);
525 while (i
!= Utf8Iterator()) {
527 if (Unicode::is_wordchar(ch
)) {
528 const char * p
= input
.data() + best_begin
;
529 output
.append(p
, i
.raw() - p
);
530 best_begin
= i
.raw() - input
.data();
538 phrase_len
= word
.highlight
;
539 if (phrase_len
) output
+= hi_start
;
// Copy the word's text into the output, character by character.
542 while (best_begin
!= word
.term_end
) {
543 char ch
= input
[best_begin
++];
559 if (phrase_len
&& --phrase_len
== 0) output
+= hi_end
;
561 best_pipe
.pop_front();
// Recursively decompose `query`, collecting: tight phrases of plain terms
// into exact_phrases, leaf terms into loose_terms (with an initial weight of
// 0), wildcard patterns into wildcards, and the longest phrase length into
// longest_phrase.
// NOTE(review): this extraction is missing lines (the embedded original line
// numbers skip), including the `non_term_subquery` label targeted below.
566 check_query(const Xapian::Query
& query
,
567 list
<vector
<string
>> & exact_phrases
,
568 unordered_map
<string
, double> & loose_terms
,
569 list
<string
> & wildcards
,
570 size_t & longest_phrase
)
572 // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
573 size_t n_subqs
= query
.get_num_subqueries();
574 Xapian::Query::op op
= query
.get_type();
575 if (op
== query
.LEAF_TERM
) {
576 const Xapian::Internal::QueryTerm
& qt
=
577 *static_cast<const Xapian::Internal::QueryTerm
*>(query
.internal
.get());
578 loose_terms
.insert(make_pair(qt
.get_term(), 0));
579 } else if (op
== query
.OP_WILDCARD
) {
580 const Xapian::Internal::QueryWildcard
& qw
=
581 *static_cast<const Xapian::Internal::QueryWildcard
*>(query
.internal
.get());
582 wildcards
.push_back(qw
.get_pattern());
583 } else if (op
== query
.OP_PHRASE
) {
584 const Xapian::Internal::QueryPhrase
& phrase
=
585 *static_cast<const Xapian::Internal::QueryPhrase
*>(query
.internal
.get());
// A "tight" phrase (window equal to the number of subqueries) consisting
// solely of terms is recorded verbatim in exact_phrases.
586 if (phrase
.get_window() == n_subqs
) {
588 for (size_t i
= 0; i
!= n_subqs
; ++i
) {
589 if (query
.get_subquery(i
).get_type() != query
.LEAF_TERM
)
590 goto non_term_subquery
;
593 // Tight phrase of terms.
594 exact_phrases
.push_back(vector
<string
>());
595 vector
<string
> & terms
= exact_phrases
.back();
596 terms
.reserve(n_subqs
);
597 for (size_t i
= 0; i
!= n_subqs
; ++i
) {
598 Xapian::Query q
= query
.get_subquery(i
);
599 const Xapian::Internal::QueryTerm
& qt
=
600 *static_cast<const Xapian::Internal::QueryTerm
*>(q
.internal
.get());
601 terms
.push_back(qt
.get_term());
603 if (n_subqs
> longest_phrase
) longest_phrase
= n_subqs
;
// Any other operator: recurse into each subquery.
608 for (size_t i
= 0; i
!= n_subqs
; ++i
)
609 check_query(query
.get_subquery(i
), exact_phrases
, loose_terms
,
610 wildcards
, longest_phrase
);
// Look up a term in loose_terms, lazily computing its weight the first time
// it is seen (weight still 0.0): the termweight is fetched from stats, and
// terms with no weight there are erased (so callers see NULL for them).
// Callers use the result as a pointer into loose_terms' mapped values (see
// MSet::Internal::snippet).
// NOTE(review): this extraction is missing lines (the embedded original line
// numbers skip), including the `term`/`max_tw` parameters, the `double
// relevance` local, and the final return.
614 check_term(unordered_map
<string
, double> & loose_terms
,
615 const Xapian::Weight::Internal
* stats
,
619 auto it
= loose_terms
.find(term
);
620 if (it
== loose_terms
.end()) return NULL
;
622 if (it
->second
== 0.0) {
624 if (!stats
->get_termweight(term
, relevance
)) {
626 loose_terms
.erase(it
);
630 it
->second
= relevance
+ max_tw
;
636 MSet::Internal::snippet(const string
& text
,
638 const Xapian::Stem
& stemmer
,
640 const string
& hi_start
,
641 const string
& hi_end
,
642 const string
& omit
) const
644 if (hi_start
.empty() && hi_end
.empty() && text
.size() <= length
) {
649 bool cjk_ngram
= CJK::is_cjk_enabled();
651 size_t term_start
= 0;
652 double min_tw
= 0, max_tw
= 0;
653 if (stats
) stats
->get_max_termweight(min_tw
, max_tw
);
657 // Scale up by (1 + 1/64) so that highlighting works better for terms
658 // with termweight 0 (which happens for terms not in the database, and
659 // also with some weighting schemes for terms which occur in almost all
664 SnipPipe
snip(length
);
666 list
<vector
<string
>> exact_phrases
;
667 unordered_map
<string
, double> loose_terms
;
668 list
<string
> wildcards
;
669 size_t longest_phrase
= 0;
670 check_query(enquire
->query
, exact_phrases
, loose_terms
,
671 wildcards
, longest_phrase
);
673 vector
<double> exact_phrases_relevance
;
674 exact_phrases_relevance
.reserve(exact_phrases
.size());
675 for (auto&& terms
: exact_phrases
) {
676 // FIXME: What relevance to use?
677 exact_phrases_relevance
.push_back(max_tw
* terms
.size());
680 vector
<double> wildcards_relevance
;
681 wildcards_relevance
.reserve(exact_phrases
.size());
682 for (auto&& pattern
: wildcards
) {
683 // FIXME: What relevance to use?
685 wildcards_relevance
.push_back(max_tw
+ min_tw
);
688 // Background relevance is the same for a given MSet, so cache it
689 // between calls to MSet::snippet() on the same object.
690 unordered_map
<string
, double>& background
= snippet_bg_relevance
;
692 vector
<string
> phrase
;
693 if (longest_phrase
) phrase
.resize(longest_phrase
- 1);
694 size_t phrase_next
= 0;
695 bool matchfound
= false;
696 parse_terms(Utf8Iterator(text
), cjk_ngram
, true,
697 [&](const string
& term
, bool positional
, const Utf8Iterator
& it
) {
698 // FIXME: Don't hardcode this here.
699 const size_t max_word_length
= 64;
701 if (!positional
) return true;
702 if (term
.size() > max_word_length
) return true;
704 // We get segments with any "inter-word" characters in front of
706 // [The][ cat][ sat][ on][ the][ mat]
707 size_t term_end
= text
.size() - it
.left();
709 double* relevance
= 0;
710 size_t highlight
= 0;
713 for (auto&& terms
: exact_phrases
) {
714 if (term
== terms
.back()) {
715 size_t n
= terms
.size() - 1;
718 if (terms
[n
] != phrase
[(n
+ phrase_next
) % (longest_phrase
- 1)]) {
724 // FIXME: Sort phrases, highest score first!
725 relevance
= &exact_phrases_relevance
[i
];
726 highlight
= terms
.size();
733 relevance
= check_term(loose_terms
, stats
.get(), term
, max_tw
);
735 // Matched unstemmed term.
741 stem
+= stemmer(term
);
742 relevance
= check_term(loose_terms
, stats
.get(), stem
, max_tw
);
744 // Matched stemmed term.
750 // FIXME: Sort wildcards, shortest pattern first or something?
752 for (auto&& pattern
: wildcards
) {
753 if (startswith(term
, pattern
)) {
754 relevance
= &wildcards_relevance
[i
];
761 if (flags
& Xapian::MSet::SNIPPET_BACKGROUND_MODEL
) {
762 // Background document model.
763 auto bgit
= background
.find(term
);
764 if (bgit
== background
.end()) bgit
= background
.find(stem
);
765 if (bgit
== background
.end()) {
766 Xapian::doccount tf
= enquire
->db
.get_termfreq(term
);
768 tf
= enquire
->db
.get_termfreq(stem
);
774 // Add one to avoid log(0) when a term indexes all
776 Xapian::doccount num_docs
= stats
->collection_size
+ 1;
777 r
= max_tw
* log((num_docs
- tf
) / double(tf
));
778 r
/= (length
+ 1) * log(double(num_docs
));
781 Utf8Iterator
i(text
.data() + term_start
, text
.data() + term_end
);
782 while (i
!= Utf8Iterator()) {
783 if (Unicode::get_category(*i
++) == Unicode::UPPERCASE_LETTER
) {
790 bgit
= background
.emplace(make_pair(stem
, r
)).first
;
792 relevance
= &bgit
->second
;
796 // In the absence of weight information, assume longer terms
797 // are more relevant, and that unstemmed matches are a bit more
798 // relevant than stemmed matches.
799 if (queryterms
.find(term
) != queryterms
.end()) {
800 relevance
= term
.size() * 3;
803 stem
+= stemmer(term
);
804 if (queryterms
.find(stem
) != queryterms
.end()) {
805 relevance
= term
.size() * 2;
811 // FIXME: Allow Enquire without a DB set or an empty MSet() to be
812 // used if you don't want the collection model?
815 // FIXME: Punctuation should somehow be included in the model, but this
816 // approach is problematic - we don't want the first word of a sentence
817 // to be favoured when it's at the end of the window.
819 // Give first word in each sentence a relevance boost.
820 if (term_start
== 0) {
823 for (size_t i
= term_start
; i
+ term
.size() < term_end
; ++i
) {
824 if (text
[i
] == '.' && Unicode::is_whitespace(text
[i
+ 1])) {
833 if (longest_phrase
) {
834 phrase
[phrase_next
] = term
;
835 phrase_next
= (phrase_next
+ 1) % (longest_phrase
- 1);
838 if (highlight
) matchfound
= true;
840 if (!snip
.pump(relevance
, term_end
, highlight
, flags
)) return false;
842 term_start
= term_end
;
848 // Put together the snippet.
850 if (matchfound
|| (flags
& SNIPPET_EMPTY_WITHOUT_MATCH
) == 0) {
851 while (snip
.drain(text
, hi_start
, hi_end
, omit
, result
)) { }