/** @file termgenerator_internal.cc
 * @brief TermGenerator class internals
 */
/* Copyright (C) 2007,2010,2011,2012,2015,2016,2017,2018 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
23 #include "termgenerator_internal.h"
25 #include "api/msetinternal.h"
26 #include "api/queryinternal.h"
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
33 #include "stringutils.h"
41 #include <unordered_map>
44 #include "cjk-tokenizer.h"
51 U_isupper(unsigned ch
)
53 return (ch
< 128 && C_isupper(static_cast<unsigned char>(ch
)));
56 static inline unsigned
57 check_wordchar(unsigned ch
)
59 if (Unicode::is_wordchar(ch
)) return Unicode::tolower(ch
);
64 should_stem(const std::string
& term
)
66 const unsigned int SHOULD_STEM_MASK
=
67 (1 << Unicode::LOWERCASE_LETTER
) |
68 (1 << Unicode::TITLECASE_LETTER
) |
69 (1 << Unicode::MODIFIER_LETTER
) |
70 (1 << Unicode::OTHER_LETTER
);
72 return ((SHOULD_STEM_MASK
>> Unicode::get_category(*u
)) & 1);
/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
// NOTE(review): reconstructed from hard-wrapped source; `numeric_limits`
// qualified with std:: so the declaration is self-contained.
static const unsigned UNICODE_IGNORE = std::numeric_limits<unsigned>::max();
80 static inline unsigned
81 check_infix(unsigned ch
)
83 if (ch
== '\'' || ch
== '&' || ch
== 0xb7 || ch
== 0x5f4 || ch
== 0x2027) {
84 // Unicode includes all these except '&' in its word boundary rules,
85 // as well as 0x2019 (which we handle below) and ':' (for Swedish
86 // apparently, but we ignore this for now as it's problematic in
90 // 0x2019 is Unicode apostrophe and single closing quote.
91 // 0x201b is Unicode single opening quote with the tail rising.
92 if (ch
== 0x2019 || ch
== 0x201b) return '\'';
93 if (ch
>= 0x200b && (ch
<= 0x200d || ch
== 0x2060 || ch
== 0xfeff))
94 return UNICODE_IGNORE
;
98 static inline unsigned
99 check_infix_digit(unsigned ch
)
101 // This list of characters comes from Unicode's word identifying algorithm.
106 case 0x037e: // GREEK QUESTION MARK
107 case 0x0589: // ARMENIAN FULL STOP
108 case 0x060D: // ARABIC DATE SEPARATOR
109 case 0x07F8: // NKO COMMA
110 case 0x2044: // FRACTION SLASH
111 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
112 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
113 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
116 if (ch
>= 0x200b && (ch
<= 0x200d || ch
== 0x2060 || ch
== 0xfeff))
117 return UNICODE_IGNORE
;
122 is_digit(unsigned ch
) {
123 return (Unicode::get_category(ch
) == Unicode::DECIMAL_DIGIT_NUMBER
);
// NOTE(review): reconstructed from hard-wrapped source; the opening brace,
// final `return 0;` and closing brace were missing and have been restored.
/// Classify a character just after the end of a word: '+' and '#' are
/// allowed as term suffixes (e.g. "C++", "C#") and are returned verbatim;
/// anything else returns 0 meaning "not part of the term".
static inline unsigned
check_suffix(unsigned ch)
{
    if (ch == '+' || ch == '#') return ch;
    // FIXME: what about '-'?
    return 0;
}
// NOTE(review): this region is corrupted — logical lines are hard-wrapped
// mid-token, the upstream file's line numbers are fused into the text, and
// many interior lines are missing.  Code is left byte-identical; only
// comments are added.  Visible fragments show: acronym detection
// (U_isupper/'.' sequences), CJK n-gram tokenisation via CJKTokenIterator,
// infix handling (check_infix/check_infix_digit), and suffix handling
// (check_suffix), with each term passed to action(term, positional, itor).
134 /** Templated framework for processing terms.
136 * Calls action(term, positional) for each term to add, where term is a
137 * std::string holding the term, and positional is a bool indicating
138 * if this term carries positional information.
140 template<typename ACTION
>
142 parse_terms(Utf8Iterator itor
, bool cjk_ngram
, bool with_positions
, ACTION action
)
145 // Advance to the start of the next term.
148 if (itor
== Utf8Iterator()) return;
149 ch
= check_wordchar(*itor
);
// Acronym handling: runs of single upper-case letters separated by '.'.
155 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
156 // Don't worry if there's a trailing '.' or not.
157 if (U_isupper(*itor
)) {
158 const Utf8Iterator end
;
159 Utf8Iterator p
= itor
;
161 Unicode::append_utf8(term
, Unicode::tolower(*p
++));
162 } while (p
!= end
&& *p
== '.' && ++p
!= end
&& U_isupper(*p
));
163 // One letter does not make an acronym! If we handled a single
164 // uppercase letter here, we wouldn't catch M&S below.
165 if (term
.size() > 1) {
166 // Check there's not a (lower case) letter or digit
167 // immediately after it.
168 if (p
== end
|| !Unicode::is_wordchar(*p
)) {
// CJK segment: tokenise with CJKTokenIterator; only unigrams carry
// positional information (tk.get_length() == 1).
178 CJK::codepoint_is_cjk(*itor
) &&
179 Unicode::is_wordchar(*itor
)) {
180 const string
& cjk
= CJK::get_cjk(itor
);
181 for (CJKTokenIterator
tk(cjk
); tk
!= CJKTokenIterator(); ++tk
) {
182 const string
& cjk_token
= *tk
;
183 if (!action(cjk_token
, with_positions
&& tk
.get_length() == 1, itor
))
187 if (itor
== Utf8Iterator()) return;
188 ch
= check_wordchar(*itor
);
// Main word loop: accumulate lower-cased word characters into `term`.
196 Unicode::append_utf8(term
, ch
);
198 if (++itor
== Utf8Iterator() ||
199 (cjk_ngram
&& CJK::codepoint_is_cjk(*itor
)))
201 ch
= check_wordchar(*itor
);
204 Utf8Iterator
next(itor
);
206 if (next
== Utf8Iterator()) break;
207 unsigned nextch
= check_wordchar(*next
);
// Infix handling: a single non-word character between word characters may
// join the term (digit separators vs. '&'/apostrophe style joiners).
209 unsigned infix_ch
= *itor
;
210 if (is_digit(prevch
) && is_digit(*next
)) {
211 infix_ch
= check_infix_digit(infix_ch
);
213 // Handle things like '&' in AT&T, apostrophes, etc.
214 infix_ch
= check_infix(infix_ch
);
216 if (!infix_ch
) break;
217 if (infix_ch
!= UNICODE_IGNORE
)
218 Unicode::append_utf8(term
, infix_ch
);
// Suffix handling: allow trailing '+'/'#' (see check_suffix()).
224 size_t len
= term
.size();
226 while ((ch
= check_suffix(*itor
))) {
231 Unicode::append_utf8(term
, ch
);
232 if (++itor
== Utf8Iterator()) goto endofterm
;
234 // Don't index fish+chips as fish+ chips.
235 if (Unicode::is_wordchar(*itor
))
// Emit the completed term; action returning false aborts parsing.
240 if (!action(term
, with_positions
, itor
))
// NOTE(review): corrupted (hard-wrapped, interior lines missing); code left
// byte-identical, comments only added.  This is
// TermGenerator::Internal::index_text(): it drives parse_terms() with a
// lambda which adds unstemmed and (depending on `strategy`) stemmed forms
// of each term to `doc`, honouring the stopper, spelling flag and prefix.
246 TermGenerator::Internal::index_text(Utf8Iterator itor
, termcount wdf_inc
,
247 const string
& prefix
, bool with_positions
)
// CJK n-grams enabled either via FLAG_CJK_NGRAM or the global CJK switch.
249 bool cjk_ngram
= (flags
& FLAG_CJK_NGRAM
) || CJK::is_cjk_enabled();
// With no stopper configured, stopping is disabled regardless of stop_mode.
251 stop_strategy current_stop_mode
;
252 if (!stopper
.get()) {
253 current_stop_mode
= TermGenerator::STOP_NONE
;
255 current_stop_mode
= stop_mode
;
258 parse_terms(itor
, cjk_ngram
, with_positions
,
259 [=](const string
& term
, bool positional
, const Utf8Iterator
&) {
// Over-long terms are silently skipped (return true keeps parsing).
260 if (term
.size() > max_word_length
) return true;
262 if (current_stop_mode
== TermGenerator::STOP_ALL
&& (*stopper
)(term
))
// Unstemmed form is added for all strategies except STEM_ALL variants.
265 if (strategy
== TermGenerator::STEM_SOME
||
266 strategy
== TermGenerator::STEM_NONE
||
267 strategy
== TermGenerator::STEM_SOME_FULL_POS
) {
269 doc
.add_posting(prefix
+ term
, ++cur_pos
, wdf_inc
);
271 doc
.add_term(prefix
+ term
, wdf_inc
);
275 // MSVC seems to need "this->" on member variables in this
277 if ((this->flags
& FLAG_SPELLING
) && prefix
.empty())
278 db
.add_spelling(term
);
// No stemming if disabled or no stemmer is configured.
280 if (strategy
== TermGenerator::STEM_NONE
||
281 !stemmer
.internal
.get()) return true;
283 if (strategy
== TermGenerator::STEM_SOME
||
284 strategy
== TermGenerator::STEM_SOME_FULL_POS
) {
285 if (current_stop_mode
== TermGenerator::STOP_STEMMED
&&
289 // Note, this uses the lowercased term, but that's OK as we
290 // only want to avoid stemming terms starting with a digit.
291 if (!should_stem(term
)) return true;
294 // Add stemmed form without positional information.
295 const string
& stem
= stemmer(term
);
296 if (rare(stem
.empty())) return true;
// The stemmed form carries a "Z" style prefix unless STEM_ALL is used
// (presumably — the prefixing lines are missing from this corrupted copy).
298 if (strategy
!= TermGenerator::STEM_ALL
) {
301 stemmed_term
+= prefix
;
302 stemmed_term
+= stem
;
303 if (strategy
!= TermGenerator::STEM_SOME
&& with_positions
) {
304 if (strategy
!= TermGenerator::STEM_SOME_FULL_POS
) ++cur_pos
;
305 doc
.add_posting(stemmed_term
, cur_pos
, wdf_inc
);
307 doc
.add_term(stemmed_term
, wdf_inc
);
320 Sniplet(double* r
, size_t t
, size_t h
)
321 : relevance(r
), term_end(t
), highlight(h
) { }
// NOTE(review): corrupted fragment of the SnipPipe class interior (the
// class head and several member declarations are missing from this copy);
// code left byte-identical, comments only added.  Visible members: the
// queue of best Sniplets, snippet-length bookkeeping, the constructor, and
// declarations of pump() and drain().
326 deque
<Sniplet
> best_pipe
;
328 // Requested length for snippet.
331 // Position in text of start of current pipe contents.
334 // Rolling sum of the current pipe contents.
// Remaining highlight length of a phrase currently being emitted.
337 size_t phrase_len
= 0;
340 size_t best_begin
= 0;
346 // Add one to length to allow for inter-word space.
347 // FIXME: We ought to correctly allow for multiple spaces.
348 explicit SnipPipe(size_t length_
) : length(length_
+ 1) { }
// Feed one term into the sliding window; returns false to stop early.
350 bool pump(double* r
, size_t t
, size_t h
, unsigned flags
);
// Emit the chosen snippet text; returns false when nothing is left.
354 bool drain(const string
& input
,
355 const string
& hi_start
,
356 const string
& hi_end
,
// NOTE(review): corrupted (hard-wrapped, interior lines missing); code left
// byte-identical, comments only added.  pump() slides a window of Sniplets
// over the text, maintaining a rolling relevance `sum` (with DECAY applied
// as terms are re-counted) and tracking the best-scoring window seen so far
// in `best_pipe`.
364 SnipPipe::pump(double* r
, size_t t
, size_t h
, unsigned flags
)
// Phrase handling: when the final term of an h-term phrase arrives, its
// relevance is pegged onto the phrase's first term in the window.
367 if (pipe
.size() >= h
- 1) {
368 // The final term of a phrase is entering the window. Peg the
369 // phrase's relevance onto the first term of the phrase, so it'll
370 // be removed from `sum` when the phrase starts to leave the
372 auto & phrase_start
= pipe
[pipe
.size() - (h
- 1)];
373 if (phrase_start
.relevance
) {
374 *phrase_start
.relevance
*= DECAY
;
375 sum
-= *phrase_start
.relevance
;
378 phrase_start
.relevance
= r
;
379 phrase_start
.highlight
= h
;
385 pipe
.emplace_back(r
, t
, h
);
391 // If necessary, discard words from the start of the pipe until it has the
393 // FIXME: Also shrink the window past words with relevance < 0?
394 while (t
- begin
> length
/* || pipe.front().relevance < 0.0 */) {
395 const Sniplet
& word
= pipe
.front();
396 if (word
.relevance
) {
397 *word
.relevance
*= DECAY
;
398 sum
-= *word
.relevance
;
400 begin
= word
.term_end
;
401 if (best_end
>= begin
)
402 best_pipe
.push_back(word
);
404 // E.g. can happen if the current term is longer than the requested
406 if (rare(pipe
.empty())) break;
// Best-window tracking: >= (not >) so later equally-good windows win,
// per the comment below.
409 // Using > here doesn't work well, as we don't extend a snippet over terms
411 if (sum
>= best_sum
) {
412 // Discard any part of `best_pipe` which is before `begin`.
413 if (begin
>= best_end
) {
416 while (!best_pipe
.empty() &&
417 best_pipe
.front().term_end
<= begin
) {
418 best_pipe
.pop_front();
// Without SNIPPET_EXHAUSTIVE, searching can stop early once a good
// window has been found and left behind.
424 } else if ((flags
& Xapian::MSet::SNIPPET_EXHAUSTIVE
) == 0) {
425 if (best_sum
> 0 && best_end
< begin
) {
426 // We found something, and we aren't still looking near it.
427 // FIXME: Benchmark this and adjust if necessary.
437 // Discard any part of `pipe` which is after `best_end`.
438 if (begin
>= best_end
) {
441 // We should never empty the pipe (as that case should be handled
443 while (rare(!pipe
.empty()) &&
444 pipe
.back().term_end
> best_end
) {
// NOTE(review): corrupted fragment (the switch head, most case labels and
// the return statements are missing from this copy); code left
// byte-identical, comments only added.
450 // Check if a non-word character should be included at the start of the
451 // snippet. We want to include certain leading non-word characters, but not
454 snippet_check_leading_nonwordchar(unsigned ch
) {
// Currency symbols, opening punctuation and initial quotes are kept.
455 if (Unicode::is_currency(ch
) ||
456 Unicode::get_category(ch
) == Unicode::OPEN_PUNCTUATION
||
457 Unicode::get_category(ch
) == Unicode::INITIAL_QUOTE_PUNCTUATION
) {
// Additional individually-listed characters which are also kept:
474 case 0x00A1: // INVERTED EXCLAMATION MARK
475 case 0x00A7: // SECTION SIGN
476 case 0x00BF: // INVERTED QUESTION MARK
// NOTE(review): only the signature survives in this corrupted copy — the
// entire function body is missing.  From the name and the call sites in
// SnipPipe::drain(), this presumably appends [p, end) to `output` with XML
// special characters escaped — confirm against upstream before relying on
// this description.
483 append_escaping_xml(const char* p
, const char* end
, string
& output
)
// NOTE(review): corrupted (hard-wrapped, interior lines missing); code left
// byte-identical, comments only added.  drain() emits the selected snippet
// one Sniplet at a time into `output`, wrapping highlighted words in
// hi_start/hi_end and trimming/extending the snippet edges at sentence
// boundaries.
504 SnipPipe::drain(const string
& input
,
505 const string
& hi_start
,
506 const string
& hi_end
,
// If no best window was recorded, fall back to whatever is in `pipe`.
510 if (best_pipe
.empty() && !pipe
.empty()) {
511 swap(best_pipe
, pipe
);
514 if (best_pipe
.empty()) {
515 size_t tail_len
= input
.size() - best_end
;
516 if (tail_len
== 0) return false;
518 // See if this is the end of a sentence.
519 // FIXME: This is quite simplistic - look at the Unicode rules:
520 // http://unicode.org/reports/tr29/#Sentence_Boundaries
522 Utf8Iterator
i(input
.data() + best_end
, tail_len
);
523 while (i
!= Utf8Iterator()) {
525 if (punc
&& Unicode::is_whitespace(ch
)) break;
527 // Allow "...", "!!", "!?!", etc...
528 punc
= (ch
== '.' || ch
== '?' || ch
== '!');
530 if (Unicode::is_wordchar(ch
)) break;
535 // Include end of sentence punctuation.
536 append_escaping_xml(input
.data() + best_end
, i
.raw(), output
);
538 // Append "..." or equivalent if this doesn't seem to be the start
// Per-word emission path starts here.
546 const Sniplet
& word
= best_pipe
.front();
548 if (output
.empty()) {
549 // Start of the snippet.
550 enum { NO
, PUNC
, YES
} sentence_boundary
= (best_begin
== 0) ? YES
: NO
;
552 Utf8Iterator
i(input
.data() + best_begin
, word
.term_end
- best_begin
);
553 while (i
!= Utf8Iterator()) {
555 switch (sentence_boundary
) {
557 if (ch
== '.' || ch
== '?' || ch
== '!') {
558 sentence_boundary
= PUNC
;
562 if (Unicode::is_whitespace(ch
)) {
563 sentence_boundary
= YES
;
564 } else if (ch
== '.' || ch
== '?' || ch
== '!') {
565 // Allow "...", "!!", "!?!", etc...
567 sentence_boundary
= NO
;
574 // Start the snippet at the start of the first word, but include
575 // certain punctuation too.
576 if (Unicode::is_wordchar(ch
)) {
577 // But limit how much leading punctuation we include.
578 size_t word_begin
= i
.raw() - input
.data();
579 if (word_begin
- best_begin
> 4) {
580 best_begin
= word_begin
;
585 if (!snippet_check_leading_nonwordchar(ch
)) {
586 best_begin
= i
.raw() - input
.data();
590 // Add "..." or equivalent if this doesn't seem to be the start of a
592 if (sentence_boundary
!= YES
) {
// Highlight handling: skip inter-word characters before opening the
// highlight markup.
597 if (word
.highlight
) {
598 // Don't include inter-word characters in the highlight.
599 Utf8Iterator
i(input
.data() + best_begin
, input
.size() - best_begin
);
600 while (i
!= Utf8Iterator()) {
602 if (Unicode::is_wordchar(ch
)) {
603 append_escaping_xml(input
.data() + best_begin
, i
.raw(), output
);
604 best_begin
= i
.raw() - input
.data();
612 phrase_len
= word
.highlight
;
613 if (phrase_len
) output
+= hi_start
;
616 const char* p
= input
.data();
617 append_escaping_xml(p
+ best_begin
, p
+ word
.term_end
, output
);
618 best_begin
= word
.term_end
;
620 if (phrase_len
&& --phrase_len
== 0) output
+= hi_end
;
622 best_pipe
.pop_front();
// NOTE(review): corrupted (hard-wrapped, interior lines missing); code left
// byte-identical, comments only added.  check_query() walks a query tree
// collecting leaf terms into `loose_terms`, wildcard patterns into
// `wildcards`, tight phrases of plain terms into `exact_phrases`, and the
// longest phrase length into `longest_phrase`, recursing into subqueries.
627 check_query(const Xapian::Query
& query
,
628 list
<vector
<string
>> & exact_phrases
,
629 unordered_map
<string
, double> & loose_terms
,
630 list
<string
> & wildcards
,
631 size_t & longest_phrase
)
633 // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
634 size_t n_subqs
= query
.get_num_subqueries();
635 Xapian::Query::op op
= query
.get_type();
// Leaf term: record with initial weight 0 (resolved later by check_term()).
636 if (op
== query
.LEAF_TERM
) {
637 const Xapian::Internal::QueryTerm
& qt
=
638 *static_cast<const Xapian::Internal::QueryTerm
*>(query
.internal
.get());
639 loose_terms
.insert(make_pair(qt
.get_term(), 0));
640 } else if (op
== query
.OP_WILDCARD
) {
641 const Xapian::Internal::QueryWildcard
& qw
=
642 *static_cast<const Xapian::Internal::QueryWildcard
*>(query
.internal
.get());
643 wildcards
.push_back(qw
.get_pattern());
644 } else if (op
== query
.OP_PHRASE
) {
645 const Xapian::Internal::QueryPhrase
& phrase
=
646 *static_cast<const Xapian::Internal::QueryPhrase
*>(query
.internal
.get());
// A "tight" phrase (window == number of subqueries) of plain terms is
// recorded as an exact phrase.
647 if (phrase
.get_window() == n_subqs
) {
649 for (size_t i
= 0; i
!= n_subqs
; ++i
) {
650 if (query
.get_subquery(i
).get_type() != query
.LEAF_TERM
)
651 goto non_term_subquery
;
654 // Tight phrase of terms.
655 exact_phrases
.push_back(vector
<string
>());
656 vector
<string
> & terms
= exact_phrases
.back();
657 terms
.reserve(n_subqs
);
658 for (size_t i
= 0; i
!= n_subqs
; ++i
) {
659 Xapian::Query q
= query
.get_subquery(i
);
660 const Xapian::Internal::QueryTerm
& qt
=
661 *static_cast<const Xapian::Internal::QueryTerm
*>(q
.internal
.get());
662 terms
.push_back(qt
.get_term());
664 if (n_subqs
> longest_phrase
) longest_phrase
= n_subqs
;
// Default: recurse into all subqueries.
669 for (size_t i
= 0; i
!= n_subqs
; ++i
)
670 check_query(query
.get_subquery(i
), exact_phrases
, loose_terms
,
671 wildcards
, longest_phrase
);
// NOTE(review): corrupted fragment (return type, later parameters, some
// returns and braces missing); code left byte-identical, comments only
// added.  check_term() looks a term up in `loose_terms`; on first use it
// resolves the term's weight via stats->get_termweight() (erasing terms not
// in the query) and caches relevance + max_tw; it returns a pointer to the
// cached weight, or NULL when the term is not a (query) loose term.
675 check_term(unordered_map
<string
, double> & loose_terms
,
676 const Xapian::Weight::Internal
* stats
,
680 auto it
= loose_terms
.find(term
);
681 if (it
== loose_terms
.end()) return NULL
;
// Weight 0.0 marks "not yet resolved" (set by check_query()).
683 if (it
->second
== 0.0) {
685 if (!stats
->get_termweight(term
, relevance
)) {
687 loose_terms
.erase(it
);
691 it
->second
= relevance
+ max_tw
;
// NOTE(review): corrupted (hard-wrapped, interior lines missing, and the
// function is truncated at the end of this copy); code left byte-identical,
// comments only added.  snippet() scores each term of `text` against the
// query's exact phrases, loose terms, wildcards and (optionally) a
// background collection model, feeds the scores through a SnipPipe, and
// drains the best window into the result with hi_start/hi_end highlighting.
697 MSet::Internal::snippet(const string
& text
,
699 const Xapian::Stem
& stemmer
,
701 const string
& hi_start
,
702 const string
& hi_end
,
703 const string
& omit
) const
// Short-circuit: text already short enough and no markup requested.
705 if (hi_start
.empty() && hi_end
.empty() && text
.size() <= length
) {
710 bool cjk_ngram
= CJK::is_cjk_enabled();
712 size_t term_start
= 0;
713 double min_tw
= 0, max_tw
= 0;
714 if (stats
) stats
->get_max_termweight(min_tw
, max_tw
);
718 // Scale up by (1 + 1/64) so that highlighting works better for terms
719 // with termweight 0 (which happens for terms not in the database, and
720 // also with some weighting schemes for terms which occur in almost all
725 SnipPipe
snip(length
);
// Decompose the query into the pieces the scorer below matches against.
727 list
<vector
<string
>> exact_phrases
;
728 unordered_map
<string
, double> loose_terms
;
729 list
<string
> wildcards
;
730 size_t longest_phrase
= 0;
731 check_query(enquire
->query
, exact_phrases
, loose_terms
,
732 wildcards
, longest_phrase
);
734 vector
<double> exact_phrases_relevance
;
735 exact_phrases_relevance
.reserve(exact_phrases
.size());
736 for (auto&& terms
: exact_phrases
) {
737 // FIXME: What relevance to use?
738 exact_phrases_relevance
.push_back(max_tw
* terms
.size());
741 vector
<double> wildcards_relevance
;
742 wildcards_relevance
.reserve(exact_phrases
.size());
743 for (auto&& pattern
: wildcards
) {
744 // FIXME: What relevance to use?
746 wildcards_relevance
.push_back(max_tw
+ min_tw
);
749 // Background relevance is the same for a given MSet, so cache it
750 // between calls to MSet::snippet() on the same object.
751 unordered_map
<string
, double>& background
= snippet_bg_relevance
;
// `phrase` is a ring buffer of the last (longest_phrase - 1) terms seen,
// used to recognise exact phrases ending at the current term.
753 vector
<string
> phrase
;
754 if (longest_phrase
) phrase
.resize(longest_phrase
- 1);
755 size_t phrase_next
= 0;
756 bool matchfound
= false;
757 parse_terms(Utf8Iterator(text
), cjk_ngram
, true,
758 [&](const string
& term
, bool positional
, const Utf8Iterator
& it
) {
759 // FIXME: Don't hardcode this here.
760 const size_t max_word_length
= 64;
762 if (!positional
) return true;
763 if (term
.size() > max_word_length
) return true;
765 // We get segments with any "inter-word" characters in front of
767 // [The][ cat][ sat][ on][ the][ mat]
768 size_t term_end
= text
.size() - it
.left();
770 double* relevance
= 0;
771 size_t highlight
= 0;
// 1) Exact-phrase match ending at this term.
774 for (auto&& terms
: exact_phrases
) {
775 if (term
== terms
.back()) {
776 size_t n
= terms
.size() - 1;
779 if (terms
[n
] != phrase
[(n
+ phrase_next
) % (longest_phrase
- 1)]) {
785 // FIXME: Sort phrases, highest score first!
786 relevance
= &exact_phrases_relevance
[i
];
787 highlight
= terms
.size();
// 2) Loose-term match (unstemmed, then stemmed).
794 relevance
= check_term(loose_terms
, stats
.get(), term
, max_tw
);
796 // Matched unstemmed term.
802 stem
+= stemmer(term
);
803 relevance
= check_term(loose_terms
, stats
.get(), stem
, max_tw
);
805 // Matched stemmed term.
// 3) Wildcard prefix match.
811 // FIXME: Sort wildcards, shortest pattern first or something?
813 for (auto&& pattern
: wildcards
) {
814 if (startswith(term
, pattern
)) {
815 relevance
= &wildcards_relevance
[i
];
// 4) Background collection model (optional).
822 if (flags
& Xapian::MSet::SNIPPET_BACKGROUND_MODEL
) {
823 // Background document model.
824 auto bgit
= background
.find(term
);
825 if (bgit
== background
.end()) bgit
= background
.find(stem
);
826 if (bgit
== background
.end()) {
827 Xapian::doccount tf
= enquire
->db
.get_termfreq(term
);
829 tf
= enquire
->db
.get_termfreq(stem
);
835 // Add one to avoid log(0) when a term indexes all
837 Xapian::doccount num_docs
= stats
->collection_size
+ 1;
838 r
= max_tw
* log((num_docs
- tf
) / double(tf
));
839 r
/= (length
+ 1) * log(double(num_docs
));
842 Utf8Iterator
i(text
.data() + term_start
, text
.data() + term_end
);
843 while (i
!= Utf8Iterator()) {
844 if (Unicode::get_category(*i
++) == Unicode::UPPERCASE_LETTER
) {
851 bgit
= background
.emplace(make_pair(stem
, r
)).first
;
853 relevance
= &bgit
->second
;
// 5) Fallback heuristics with no weight information available.
857 // In the absence of weight information, assume longer terms
858 // are more relevant, and that unstemmed matches are a bit more
859 // relevant than stemmed matches.
860 if (queryterms
.find(term
) != queryterms
.end()) {
861 relevance
= term
.size() * 3;
864 stem
+= stemmer(term
);
865 if (queryterms
.find(stem
) != queryterms
.end()) {
866 relevance
= term
.size() * 2;
872 // FIXME: Allow Enquire without a DB set or an empty MSet() to be
873 // used if you don't want the collection model?
876 // FIXME: Punctuation should somehow be included in the model, but this
877 // approach is problematic - we don't want the first word of a sentence
878 // to be favoured when it's at the end of the window.
880 // Give first word in each sentence a relevance boost.
881 if (term_start
== 0) {
884 for (size_t i
= term_start
; i
+ term
.size() < term_end
; ++i
) {
885 if (text
[i
] == '.' && Unicode::is_whitespace(text
[i
+ 1])) {
// Update the phrase ring buffer and feed this term into the pipe.
894 if (longest_phrase
) {
895 phrase
[phrase_next
] = term
;
896 phrase_next
= (phrase_next
+ 1) % (longest_phrase
- 1);
899 if (highlight
) matchfound
= true;
901 if (!snip
.pump(relevance
, term_end
, highlight
, flags
)) return false;
903 term_start
= term_end
;
909 // Put together the snippet.
911 if (matchfound
|| (flags
& SNIPPET_EMPTY_WITHOUT_MATCH
) == 0) {
912 while (snip
.drain(text
, hi_start
, hi_end
, omit
, result
)) { }