2 /* queryparser.lemony: build a Xapian::Query object from a user query string.
4 * Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015,2016,2018 Olly Betts
5 * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
6 * Copyright (C) 2010 Adam Sjøgren
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 #include "queryparser_internal.h"
28 #include "api/queryinternal.h"
31 #include "stringutils.h"
32 #include "xapian/error.h"
33 #include "xapian/unicode.h"
35 // Include the list of token values lemon generates.
36 #include "queryparser_token.h"
38 #include "cjk-tokenizer.h"
47 // We create the yyParser on the stack.
48 #define Parse_ENGINEALWAYSONSTACK
52 using namespace Xapian;
// True only for code points below 128 that C_isupper() accepts; anything at
// or above 128 is rejected without calling C_isupper().
// NOTE(review): the return-type line and closing brace are absent from this
// listing - confirm against the complete file.
55 U_isupper(unsigned ch) {
56 return (ch < 128 && C_isupper(static_cast<unsigned char>(ch)));
// True only for code points below 128 that C_isdigit() accepts; anything at
// or above 128 is rejected without calling C_isdigit().
// NOTE(review): the return-type line and closing brace are absent from this
// listing - confirm against the complete file.
60 U_isdigit(unsigned ch) {
61 return (ch < 128 && C_isdigit(static_cast<unsigned char>(ch)));
// True only for code points below 128 that C_isalpha() accepts; anything at
// or above 128 is rejected without calling C_isalpha().
// NOTE(review): the return-type line and closing brace are absent from this
// listing - confirm against the complete file.
65 U_isalpha(unsigned ch) {
66 return (ch < 128 && C_isalpha(static_cast<unsigned char>(ch)));
69 using Xapian::Unicode::is_whitespace;
// Logical negation of Xapian::Unicode::is_whitespace().  parse_query() passes
// this to find_if() to skip over a run of whitespace.
// NOTE(review): the return-type line and closing brace are absent from this
// listing.
72 is_not_whitespace(unsigned ch) {
73 return !is_whitespace(ch);
76 using Xapian::Unicode::is_wordchar;
// Logical negation of Xapian::Unicode::is_wordchar().  parse_query() passes
// this to find_if() to locate the end of a candidate field name.
// NOTE(review): the return-type line and closing brace are absent from this
// listing.
79 is_not_wordchar(unsigned ch) {
80 return !is_wordchar(ch);
// True when the Unicode general category of ch is DECIMAL_DIGIT_NUMBER.
// Unlike U_isdigit() above, this is not restricted to code points below 128.
// NOTE(review): the return-type line and closing brace are absent from this
// listing.
84 is_digit(unsigned ch) {
85 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
// FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
// and there's the risk of hyphens getting stuck onto the end of terms...

/** Test whether a character may be kept as a trailing suffix on a term.
 *
 *  @param ch  Unicode code point to test.
 *  @return true for '+' (e.g. C++, Na+) and '#' (e.g. C#), false otherwise.
 */
inline bool
is_suffix(unsigned ch) {
    return ch == '+' || ch == '#';
}
/** Test whether a character is one of the recognised double quotes.
 *
 *  @param ch  Unicode code point to test.
 *  @return true for ASCII '"', U+201C and U+201D; false otherwise.
 */
inline bool
is_double_quote(unsigned ch) {
    // All double quotes are treated as equivalent, which is a bit crude: no
    // attempt is made here to pair an opening quote with a closing one of the
    // same kind.
    //
    // 0x201c is Unicode opening double quote.
    // 0x201d is Unicode closing double quote.
    return ch == '"' || ch == 0x201c || ch == 0x201d;
}
// Decide whether a ':' has to be inserted between `prefix` and a term whose
// first character is `ch`.
// Returns true only when ch passes U_isupper() or is ':' itself, AND the
// prefix is longer than one character, AND the prefix does not already end
// in ':'.
// NOTE(review): the return-type line and the braces are absent from this
// listing.
107 prefix_needs_colon(const string & prefix, unsigned ch)
// No separator is added for any other leading character.
109 if (!U_isupper(ch) && ch != ':') return false;
110 string::size_type len = prefix.length();
// The len > 1 test also protects the prefix[len - 1] access from an empty prefix.
111 return (len > 1 && prefix[len - 1] != ':');
114 using Unicode::is_currency;
// True for the two operators compared here: OP_PHRASE and OP_NEAR.
// NOTE(review): the return-type line and the braces are absent from this
// listing.
117 is_positional(Xapian::Query::op op)
119 return (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR);
// NOTE(review): this listing omits the class header, the closing braces and
// the declarations of several members which the constructors below
// initialise (state, name, unstemmed, query, pos) - confirm against the
// complete file.
124 /** Class used to pass information about a token from lexer to parser.
126 * Generally an instance of this class carries term information, but it can be
127 * used for a range query, and with some operators (e.g. the distance in
128 * NEAR/3 or ADJ/3, etc).
// Field information for this token; get_grouping() and the query-building
// members read its grouping, prefixes and procs.
135 const FieldInfo * field_info;
// Stemming strategy consulted by make_term().
137 QueryParser::stem_strategy stem;
// Name plus position, no stemming.
// NOTE(review): field_info is not initialised by this constructor (nor by the
// two single-argument ones below), so members which dereference it should not
// be called on objects built this way.
141 Term(const string &name_, termpos pos_)
142 : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
// Name only; position 0.
143 explicit Term(const string &name_)
144 : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
// Name plus field information; position 0.
145 Term(const string &name_, const FieldInfo * field_info_)
146 : name(name_), field_info(field_info_),
147 stem(QueryParser::STEM_NONE), pos(0) { }
// Position only; parse_query() uses this to carry the width given after
// NEAR and ADJ.
148 explicit Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
// Fully specified form, used by parse_query() for ordinary terms and for
// boolean filters.
149 Term(State * state_, const string &name_, const FieldInfo * field_info_,
150 const string &unstemmed_,
151 QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
153 : state(state_), name(name_), field_info(field_info_),
154 unstemmed(unstemmed_), stem(stem_), pos(pos_) { }
// Wraps an already-built query; State::range() uses this, storing the
// grouping string in `name`.
156 Term(const Xapian::Query & q, const string & grouping)
157 : name(grouping), query(q) { }
// Build the term string for this token with `prefix` applied.
159 string make_term(const string & prefix) const;
// Downgrade STEM_SOME to STEM_NONE; every other strategy is left unchanged.
161 void need_positions() {
162 if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
// Position stored for this term.
165 termpos get_termpos() const { return pos; }
// Grouping string taken from field_info (which must therefore be set).
167 string get_grouping() const {
168 return field_info->grouping;
// Build an OP_SYNONYM query over OP_WILDCARD subqueries, one per prefix.
171 Query * as_wildcarded_query(State * state) const;
173 /** Build a query for a term at the very end of the query string when
174 * FLAG_PARTIAL is in use.
176 * This query should match documents containing any terms which start with
177 * the characters specified, but should give a higher score to exact
178 * matches (since the user might have finished typing - we simply don't
181 Query * as_partial_query(State * state_) const;
183 /** Build a query for a string of CJK characters. */
184 Query * as_cjk_query() const;
186 /** Handle a CJK character string in a positional context. */
187 void as_positional_cjk_term(Terms * terms) const;
// NOTE(review): the body of this member is not visible in this listing -
// presumably it hands back the query stored by the (query, grouping)
// constructor; confirm.
190 Query as_range_query() const;
// OR together the make_term() result for each prefix, or pass the name to the
// first field processor when there are no prefixes.
192 Query get_query() const;
// As get_query(), widened with synonyms looked up in the database.
194 Query get_query_with_synonyms() const;
// Uses get_query_with_synonyms() when either auto-synonym flag is set.
196 Query get_query_with_auto_synonyms() const;
199 /// Parser State shared between the lexer and the parser.
// NOTE(review): the class header, the closing braces and the declarations of
// the `error` and `flags` members (both initialised by the constructor) are
// absent from this listing.
// The QueryParser internals which the methods below forward to.
201 QueryParser::Internal * qpi;
// Start with no error recorded and with the supplied parse flags.
208 State(QueryParser::Internal * qpi_, unsigned flags_)
209 : qpi(qpi_), error(NULL), flags(flags_) { }
// Apply the parser's stemmer to `term`.
211 string stem_term(const string &term) {
212 return qpi->stemmer(term);
// Append the term's name to the parser's stoplist.
215 void add_to_stoplist(const Term * term) {
216 qpi->stoplist.push_back(term->name);
// Record a (term, unstemmed form) pair in the parser's unstem container.
219 void add_to_unstem(const string & term, const string & unstemmed) {
220 qpi->unstem.insert(make_pair(term, unstemmed));
// Offer the range a..b to each registered range processor in turn and wrap
// the first usable result in a newly allocated Term.
// NOTE(review): `auto i` copies each rangeprocs element on every iteration; a
// const reference would avoid that - confirm the element type first.
223 Term * range(const string &a, const string &b) {
224 for (auto i : qpi->rangeprocs) {
225 Xapian::Query range_query = (i.proc)->check_range(a, b);
226 Xapian::Query::op op = range_query.get_type();
// This processor did not produce a query (the statement for this case is not
// visible here; presumably the loop moves on to the next processor).
228 case Xapian::Query::OP_INVALID:
// Value-range results: when default_grouping is set, the grouping is the
// value slot number converted to a string.
230 case Xapian::Query::OP_VALUE_RANGE:
231 case Xapian::Query::OP_VALUE_GE:
232 case Xapian::Query::OP_VALUE_LE:
233 if (i.default_grouping) {
234 Xapian::Internal::QueryValueBase * base =
235 static_cast<Xapian::Internal::QueryValueBase*>(
236 range_query.internal.get());
237 Xapian::valueno slot = base->get_slot();
238 return new Term(range_query, str(slot));
// A plain term result uses the processor's own grouping.
// NOTE(review): the value-range cases above presumably fall through to here
// when default_grouping is false - the intervening lines are not visible.
241 case Xapian::Query::LEAF_TERM:
242 return new Term(range_query, i.grouping);
// Remaining query types (presumably the default case) get an empty grouping.
244 return new Term(range_query, string());
// Default operator configured on the parser.
250 Query::op default_op() const { return qpi->default_op; }
// True when a stopper is set and it reports the term's name as a stopword.
252 bool is_stopword(const Term *term) const {
253 return qpi->stopper.get() && (*qpi->stopper)(term->name);
// NOTE(review): the body is not visible in this listing - presumably it
// returns the parser's database.
256 Database get_database() const {
// Raw pointer to the parser's stopper, as returned by get().
260 const Stopper * get_stopper() const {
261 return qpi->stopper.get();
// Current number of entries on the parser's stoplist.
264 size_t stoplist_size() const {
265 return qpi->stoplist.size();
// Resize the parser's stoplist to s entries.
268 void stoplist_resize(size_t s) {
269 qpi->stoplist.resize(s);
// Accessors for the wildcard expansion limit and its limit type.
272 Xapian::termcount get_max_wildcard_expansion() const {
273 return qpi->max_wildcard_expansion;
276 int get_max_wildcard_type() const {
277 return qpi->max_wildcard_type;
// Accessors for the partial-match expansion limit and its limit type.
280 Xapian::termcount get_max_partial_expansion() const {
281 return qpi->max_partial_expansion;
284 int get_max_partial_type() const {
285 return qpi->max_partial_type;
// Build the term string for this token under `prefix`.
// A ':' is appended after a non-empty prefix when prefix_needs_colon() asks
// for one, and the stemmed name is used whenever the strategy is not
// STEM_NONE.  As a side effect, when an unstemmed form was recorded the
// (term, unstemmed) pair is registered through State::add_to_unstem().
// NOTE(review): several short lines of this function are absent from this
// listing (return type, braces, the declaration of `term`, and some of the
// append statements) - confirm details against the complete file.
290 Term::make_term(const string & prefix) const
// Strategies other than STEM_NONE and STEM_ALL get extra handling here; the
// statement itself is not visible in this listing.
293 if (stem != QueryParser::STEM_NONE && stem != QueryParser::STEM_ALL)
295 if (!prefix.empty()) {
// Only the first byte of the name is examined.
297 if (prefix_needs_colon(prefix, name[0])) term += ':';
299 if (stem != QueryParser::STEM_NONE) {
300 term += state->stem_term(name);
305 if (!unstemmed.empty())
306 state->add_to_unstem(term, unstemmed);
310 // Iterator shim to allow building a synonym query from a TermIterator pair.
// Dereferencing yields *first while `first` is set, otherwise
// Query(*i, 1, pos) for the current synonym term.
// NOTE(review): the declaration of `pos`, the access specifier and the closing
// braces are absent from this listing.
311 class SynonymIterator {
// Underlying synonym term iterator.
312 Xapian::TermIterator i;
// Optional query to yield in place of a synonym term; may be NULL.
316 const Xapian::Query * first;
319 SynonymIterator(const Xapian::TermIterator & i_,
320 Xapian::termpos pos_ = 0,
321 const Xapian::Query * first_ = NULL)
322 : i(i_), pos(pos_), first(first_) { }
// NOTE(review): the body of the increment operator is not visible here.
324 SynonymIterator & operator++() {
332 const Xapian::Query operator*() const {
333 if (first) return *first;
334 return Xapian::Query(*i, 1, pos);
// Equal when both the term iterators and the `first` pointers match.
337 bool operator==(const SynonymIterator & o) const {
338 return i == o.i && first == o.first;
341 bool operator!=(const SynonymIterator & o) const {
342 return !(*this == o);
// Iterator trait typedefs; the class is handed to Query's iterator-range
// constructor in Term::get_query_with_synonyms().
345 typedef std::input_iterator_tag iterator_category;
346 typedef Xapian::Query value_type;
347 typedef Xapian::termcount_diff difference_type;
348 typedef Xapian::Query * pointer;
349 typedef Xapian::Query & reference;
// Build the query for this term and widen it with synonyms looked up in the
// database, once per prefix.  When the field has no prefixes the name is
// handed to the first field processor instead.
// NOTE(review): the return type, braces, the declaration of the local `term`
// and some of its append statements are absent from this listing.
353 Term::get_query_with_synonyms() const
355 // Handle single-word synonyms with each prefix.
356 const list<string> & prefixes = field_info->prefixes;
357 if (prefixes.empty()) {
358 // FIXME: handle multiple here
359 Assert(!field_info->procs.empty());
360 return (**field_info->procs.begin())(name);
// Start from the plain query; each pass of the loop below wraps it again.
363 Query q = get_query();
365 list<string>::const_iterator piter;
366 for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
367 // First try the unstemmed term:
369 if (!piter->empty()) {
371 if (prefix_needs_colon(*piter, name[0])) term += ':';
375 Xapian::Database db = state->get_database();
376 Xapian::TermIterator syn = db.synonyms_begin(term);
377 Xapian::TermIterator end = db.synonyms_end(term);
378 if (syn == end && stem != QueryParser::STEM_NONE) {
379 // If that has no synonyms, try the stemmed form:
381 if (!piter->empty()) {
383 if (prefix_needs_colon(*piter, name[0])) term += ':';
385 term += state->stem_term(name);
386 syn = db.synonyms_begin(term);
387 end = db.synonyms_end(term);
// The current q is supplied as the `first` element of the range, so it is
// yielded ahead of the synonym terms, which are given this term's position.
389 q = Query(q.OP_SYNONYM,
390 SynonymIterator(syn, pos, &q),
391 SynonymIterator(end));
// Return the synonym-expanded query when either FLAG_AUTO_SYNONYMS or
// FLAG_AUTO_MULTIWORD_SYNONYMS is set in the parse flags.
// NOTE(review): the return type, braces and the statement taken when neither
// flag is set are absent from this listing.
397 Term::get_query_with_auto_synonyms() const
399 const unsigned MASK_ENABLE_AUTO_SYNONYMS =
400 QueryParser::FLAG_AUTO_SYNONYMS |
401 QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
402 if (state->flags & MASK_ENABLE_AUTO_SYNONYMS)
403 return get_query_with_synonyms();
// Combine the query pointed to by `term` into the accumulated query q using
// operator `op`.
// NOTE(review): only the signature and the general-case assignment are visible
// in this listing; how a null q is handled and who owns `term` afterwards
// cannot be confirmed from here.
409 add_to_query(Query *& q, Query::op op, Query * term)
413 *q = Query(op, *q, *term);
// Overload taking the query to add by const reference rather than by pointer.
// NOTE(review): only the signature and the general-case assignment are visible
// in this listing; how a null q is handled cannot be confirmed from here.
421 add_to_query(Query *& q, Query::op op, const Query & term)
424 *q = Query(op, *q, term);
// Build the query for this term: one Query(make_term(prefix), 1, pos) per
// prefix, combined with OP_OR.  When the field has no prefixes the name is
// handed to the first field processor instead.
// NOTE(review): the return type, braces and final return are absent from this
// listing.
431 Term::get_query() const
433 const list<string> & prefixes = field_info->prefixes;
434 if (prefixes.empty()) {
435 // FIXME: handle multiple here
436 Assert(!field_info->procs.empty());
437 return (**field_info->procs.begin())(name);
// prefixes is non-empty here, so dereferencing begin() is valid.
439 list<string>::const_iterator piter = prefixes.begin();
440 Query q(make_term(*piter), 1, pos);
441 while (++piter != prefixes.end()) {
442 q = Query(Query::OP_OR, q, Query(make_term(*piter), 1, pos));
// Build a heap-allocated OP_SYNONYM query over one OP_WILDCARD subquery per
// prefix, using the wildcard expansion limit and limit type from state_.
// NOTE(review): the return type, braces, the declaration of `subqs`, the
// statement extending `root`, the tail of the OP_WILDCARD constructor call and
// everything after the allocation of q are absent from this listing.
448 Term::as_wildcarded_query(State * state_) const
450 const list<string> & prefixes = field_info->prefixes;
451 list<string>::const_iterator piter;
452 Xapian::termcount max = state_->get_max_wildcard_expansion();
453 int max_type = state_->get_max_wildcard_type();
455 subqs.reserve(prefixes.size());
456 for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
457 string root = *piter;
459 // Combine with OP_OR, and apply OP_SYNONYM afterwards.
460 subqs.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
463 Query * q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
// Build a heap-allocated query which ORs together (a) a synonym of wildcard
// expansions of this term under each prefix and (b) a synonym of the term as
// it would normally be generated under each prefix.  The expansion limit and
// limit type come from the partial-match settings on state_.
// NOTE(review): the return type, braces, the statement extending `root`, the
// tail of the OP_WILDCARD constructor call and everything after the
// allocation of q are absent from this listing.
469 Term::as_partial_query(State * state_) const
471 Xapian::termcount max = state_->get_max_partial_expansion();
472 int max_type = state_->get_max_partial_type();
473 vector<Query> subqs_partial; // A synonym of all the partial terms.
474 vector<Query> subqs_full; // A synonym of all the full terms.
476 const list<string> & prefixes = field_info->prefixes;
477 list<string>::const_iterator piter;
478 for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
479 string root = *piter;
481 // Combine with OP_OR, and apply OP_SYNONYM afterwards.
482 subqs_partial.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
484 // Add the term, as it would normally be handled, as an alternative.
485 subqs_full.push_back(Query(make_term(*piter), 1, pos));
487 Query * q = new Query(Query::OP_OR,
488 Query(Query::OP_SYNONYM,
489 subqs_partial.begin(), subqs_partial.end()),
490 Query(Query::OP_SYNONYM,
491 subqs_full.begin(), subqs_full.end()));
// Build a heap-allocated query for a run of CJK characters: for each prefix
// the tokens produced by CJKTokenIterator over the name are ANDed together,
// and the per-prefix results are then ORed.
// NOTE(review): the return type, braces and everything after the allocation
// of q are absent from this listing.
497 Term::as_cjk_query() const
499 vector<Query> prefix_subqs;
500 vector<Query> cjk_subqs;
501 const list<string> & prefixes = field_info->prefixes;
502 list<string>::const_iterator piter;
503 for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
504 const string& prefix = *piter;
505 for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
506 cjk_subqs.push_back(Query(prefix + *tk, 1, pos));
508 prefix_subqs.push_back(Query(Query::OP_AND,
509 cjk_subqs.begin(), cjk_subqs.end()));
// NOTE(review): cjk_subqs has to be emptied before the next prefix, otherwise
// tokens from earlier prefixes leak into later AND groups; the line doing so
// is not visible in this listing - confirm it exists.
512 Query * q = new Query(Query::OP_OR,
513 prefix_subqs.begin(), prefix_subqs.end());
// NOTE(review): only the signature line of this member is visible in this
// listing.  Presumably it returns the query stored by the
// Term(query, grouping) constructor - confirm against the complete file.
519 Term::as_range_query() const
/** Test whether a character generates a phrase search.
 *
 *  @param ch  Unicode code point to test.
 *  @return true for '.', '-', '/', ':', '\\' and '@'; false for everything
 *          else, including 0 and any code point of 128 or above.
 */
inline bool
is_phrase_generator(unsigned ch)
{
    // These characters generate a phrase search.
    // Ordered mostly by frequency of calls to this function done when
    // running the testcases in api_queryparser.cc.
    //
    // The ch != 0 test stops strchr() matching the string's terminating NUL,
    // and the ch < 128 test stops a large code point being truncated to a
    // matching char.
    return (ch && ch < 128 && strchr(".-/:\\@", ch) != NULL);
}
/** Test whether a character, when it directly follows a term, prevents that
 *  term from being stemmed.
 *
 *  @param ch  Unicode code point to test.
 *  @return true for any of ( / \ @ < > = * [ { and the ASCII double quote;
 *          false for everything else, including 0 and any code point of 128
 *          or above.
 */
inline bool
is_stem_preventer(unsigned ch)
{
    // The ch != 0 test stops strchr() matching the string's terminating NUL,
    // and the ch < 128 test stops a large code point being truncated to a
    // matching char.
    return (ch && ch < 128 && strchr("(/\\@<>=*[{\"", ch) != NULL);
}
// Decide from its first character whether `term` is a candidate for stemming.
// True when the Unicode category of that character is one of the four
// collected in SHOULD_STEM_MASK: lowercase, titlecase, modifier or other
// letter.  Uppercase letters are not in the mask.
// NOTE(review): the return-type line and the braces are absent from this
// listing.
542 should_stem(const string & term)
// One bit per accepted category, indexed by the category's enum value.
544 const unsigned int SHOULD_STEM_MASK =
545 (1 << Unicode::LOWERCASE_LETTER) |
546 (1 << Unicode::TITLECASE_LETTER) |
547 (1 << Unicode::MODIFIER_LETTER) |
548 (1 << Unicode::OTHER_LETTER);
549 Utf8Iterator u(term);
// Shift the mask so the bit for this character's category lands in bit 0.
550 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
const unsigned UNICODE_IGNORE = std::numeric_limits<unsigned>::max();

/** Classify a non-word character which sits between two word characters.
 *
 *  @param ch  Unicode code point of the embedded character.
 *  @return the character to append to the term (apostrophe-like characters
 *          are normalised to ASCII '\''), UNICODE_IGNORE if the character
 *          should be skipped without ending the term, or 0 if the character
 *          is not an acceptable infix.
 */
inline unsigned check_infix(unsigned ch) {
    if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
        // Unicode includes all these except '&' in its word boundary rules,
        // as well as 0x2019 (which we handle below) and ':' (for Swedish
        // apparently, but we ignore this for now as it's problematic in
        // real world cases).
        return ch;
    }
    // 0x2019 is Unicode apostrophe and single closing quote.
    // 0x201b is Unicode single opening quote with the tail rising.
    if (ch == 0x2019 || ch == 0x201b)
        return '\'';
    // Only U+200B..U+200D, U+2060 and U+FEFF are skipped.  The lower bound is
    // essential: without it every code point up to U+200D (including '-' and
    // '.') would be reported as ignorable and silently glued into the term.
    // This now matches the test in check_infix_digit().
    if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
        return UNICODE_IGNORE;
    return 0;
}
// Variant of check_infix() which parse_term() uses when the characters on
// both sides of ch are digits.  Code points U+200B..U+200D, U+2060 and U+FEFF
// yield UNICODE_IGNORE.
// NOTE(review): the switch header, its leading case labels, the statement
// shared by the listed cases, the final return and the closing braces are
// absent from this listing - confirm them against the complete file.
577 inline unsigned check_infix_digit(unsigned ch) {
578 // This list of characters comes from Unicode's word identifying algorithm.
583 case 0x037e: // GREEK QUESTION MARK
584 case 0x0589: // ARMENIAN FULL STOP
585 case 0x060D: // ARABIC DATE SEPARATOR
586 case 0x07F8: // NKO COMMA
587 case 0x2044: // FRACTION SLASH
588 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
589 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
590 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
// The lower bound keeps ordinary characters below U+200B out of this test.
593 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
594 return UNICODE_IGNORE;
598 // Prototype a function lemon generates, but which we want to call before that
599 // in the generated source code file.
601 static void yy_parse_failed(yyParser *);
// Register `prefix` as a term prefix for the non-boolean field `field`.
// A new field_map entry is created when the field is unknown; otherwise the
// prefix is appended to the existing entry's prefix list.
// Throws Xapian::InvalidOperationError when the field is already registered
// with a type other than NON_BOOLEAN, and Xapian::FeatureUnavailableError
// when it already has FieldProcessor objects attached.
// NOTE(review): the return type, braces and the `else` joining the two
// branches are absent from this listing.
604 QueryParser::Internal::add_prefix(const string &field, const string &prefix)
606 map<string, FieldInfo>::iterator p = field_map.find(field);
607 if (p == field_map.end()) {
608 field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, prefix)));
610 // Check that this is the same type of filter as the existing one(s).
611 if (p->second.type != NON_BOOLEAN) {
612 throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
614 if (!p->second.procs.empty())
615 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
616 p->second.prefixes.push_back(prefix);
// Register the FieldProcessor `proc` for the non-boolean field `field`.
// Only the first registration for a field succeeds: if the field already
// exists, one of the exceptions below is always thrown.
// Throws Xapian::InvalidOperationError when the field is already registered
// with a type other than NON_BOOLEAN, and Xapian::FeatureUnavailableError
// when it already has string prefixes or already has a FieldProcessor.
// NOTE(review): the return type, braces and the `else` joining the two
// branches are absent from this listing.
621 QueryParser::Internal::add_prefix(const string &field, FieldProcessor *proc)
623 map<string, FieldInfo>::iterator p = field_map.find(field);
624 if (p == field_map.end()) {
625 field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, proc)));
627 // Check that this is the same type of filter as the existing one(s).
628 if (p->second.type != NON_BOOLEAN) {
629 throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
631 if (!p->second.prefixes.empty())
632 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
633 throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
634 // p->second.procs.push_back(proc);
// Register `prefix` as a boolean filter prefix for `field`.
// A null `grouping` defaults to the field name; an empty grouping string
// selects type BOOLEAN and a non-empty one selects BOOLEAN_EXCLUSIVE.
// Throws Xapian::UnimplementedError for the empty-prefix case (the guarding
// condition is not visible in this listing), Xapian::InvalidOperationError
// when the field is already registered with a different type, and
// Xapian::FeatureUnavailableError when it already has FieldProcessor objects.
// NOTE(review): the return type, braces, the condition guarding the first
// throw and the `else` joining the two branches are absent from this listing.
639 QueryParser::Internal::add_boolean_prefix(const string &field,
640 const string &prefix,
641 const string* grouping)
643 // Don't allow the empty prefix to be set as boolean as it doesn't
644 // really make sense.
646 throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
647 if (!grouping) grouping = &field;
648 filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
649 map<string, FieldInfo>::iterator p = field_map.find(field);
650 if (p == field_map.end()) {
651 field_map.insert(make_pair(field, FieldInfo(type, prefix, *grouping)));
653 // Check that this is the same type of filter as the existing one(s).
654 if (p->second.type != type) {
655 throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
657 if (!p->second.procs.empty())
658 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
// For an existing entry only the prefix is recorded; the grouping passed to
// this call is not stored.
659 p->second.prefixes.push_back(prefix); // FIXME grouping
// Register the FieldProcessor `proc` as a boolean filter for `field`.
// A null `grouping` defaults to the field name; an empty grouping string
// selects type BOOLEAN and a non-empty one selects BOOLEAN_EXCLUSIVE.
// Only the first registration for a field succeeds: if the field already
// exists, one of the exceptions below is always thrown.
// Throws Xapian::UnimplementedError for the empty-prefix case (the guarding
// condition is not visible in this listing), Xapian::InvalidOperationError
// when the field is already registered with a different type, and
// Xapian::FeatureUnavailableError when it already has string prefixes or
// already has a FieldProcessor.
// NOTE(review): the return type, braces, the condition guarding the first
// throw and the `else` joining the two branches are absent from this listing.
664 QueryParser::Internal::add_boolean_prefix(const string &field,
665 FieldProcessor *proc,
666 const string* grouping)
668 // Don't allow the empty prefix to be set as boolean as it doesn't
669 // really make sense.
671 throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
672 if (!grouping) grouping = &field;
673 filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
674 map<string, FieldInfo>::iterator p = field_map.find(field);
675 if (p == field_map.end()) {
676 field_map.insert(make_pair(field, FieldInfo(type, proc, *grouping)));
678 // Check that this is the same type of filter as the existing one(s).
679 if (p->second.type != type) {
680 throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
682 if (!p->second.prefixes.empty())
683 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
684 throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
685 // p->second.procs.push_back(proc);
// Lex a single term starting at `it`, advancing `it` past what was consumed.
// Three shapes are recognised in turn: dotted initials such as P.T.O., a run
// of CJK characters (only when cjk_ngram is set), and an ordinary word which
// may contain a single embedded infix character and may keep a short trailing
// suffix of '+' or '#' characters.
// parse_query() calls this as parse_term(it, end, cjk_ngram, is_cjk_term,
// was_acronym) and uses the returned string as the term.
// NOTE(review): many short lines of this function are absent from this
// listing (return type, the last parameter, braces, `else` branches, loop
// headers, several assignments and the final return) - treat the comments
// below as a guide to the visible statements only.
690 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
691 bool cjk_ngram, bool & is_cjk_term,
695 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
696 // Don't worry if there's a trailing '.' or not.
697 if (U_isupper(*it)) {
// Collect each uppercase letter into t, stepping over the '.' between them.
701 Unicode::append_utf8(t, *p++);
702 } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
703 // One letter does not make an acronym! If we handled a single
704 // uppercase letter here, we wouldn't catch M&S below.
705 if (t.length() > 1) {
706 // Check there's not a (lower case) letter or digit
707 // immediately after it.
708 // FIXME: should I.B.M..P.T.O be a range search?
709 if (p == end || !is_wordchar(*p)) {
// A non-empty term at this point means the initials form was accepted.
715 was_acronym = !term.empty();
// CJK text is only considered when nothing was lexed as initials.
717 if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
718 term = CJK::get_cjk(it);
// Ordinary word: start with the current character and extend.
723 unsigned prevch = *it;
724 Unicode::append_utf8(term, prevch);
725 while (++it != end) {
// With CJK n-gram handling on, a CJK character ends the current word.
726 if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
728 if (!is_wordchar(ch)) {
729 // Treat a single embedded '&' or "'" or similar as a word
730 // character (e.g. AT&T, Fred's). Also, normalise
731 // apostrophes to ASCII apostrophe.
// The infix only counts if a word character follows it directly.
734 if (p == end || !is_wordchar(*p)) break;
735 unsigned nextch = *p;
// Between two digits the digit-specific infix list applies.
736 if (is_digit(prevch) && is_digit(nextch)) {
737 ch = check_infix_digit(ch);
739 ch = check_infix(ch);
// Ignorable characters are dropped without ending the term.
742 if (ch == UNICODE_IGNORE)
745 Unicode::append_utf8(term, ch);
748 if (it != end && is_suffix(*it)) {
749 string suff_term = term;
751 // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
// Gives up on the suffixed form once three suffix characters have been added.
753 if (suff_term.size() - term.size() == 3) {
758 } while (is_suffix(*++p));
759 if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
760 // If the suffixed term doesn't exist, check that the
761 // non-suffixed term does. This also takes care of
762 // the case when QueryParser::set_database() hasn't
764 bool use_suff_term = false;
765 string lc = Unicode::tolower(suff_term);
766 if (db.term_exists(lc)) {
767 use_suff_term = true;
// Suffixed form is not indexed: still prefer it unless the bare form is.
769 lc = Unicode::tolower(term);
770 if (!db.term_exists(lc)) use_suff_term = true;
783 // Switch to %code to insert at the end of the file so struct yyParser has been
788 QueryParser::Internal::parse_query(const string &qs, unsigned flags,
789 const string &default_prefix)
791 bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();
793 // Set ranges if we may have to handle ranges in the query.
794 bool ranges = !rangeprocs.empty() && (qs.find("..") != string::npos);
796 termpos term_pos = 1;
797 Utf8Iterator it(qs), end;
799 State state(this, flags);
801 // To successfully apply more than one spelling correction to a query
802 // string, we must keep track of the offset due to previous corrections.
803 int correction_offset = 0;
804 corrected_query.resize(0);
806 // Stack of prefixes, used for phrases and subexpressions.
807 list<const FieldInfo *> prefix_stack;
809 // If default_prefix is specified, use it. Otherwise, use any list
810 // that has been set for the empty prefix.
811 const FieldInfo def_pfx(NON_BOOLEAN, default_prefix);
813 const FieldInfo * default_field_info = &def_pfx;
814 if (default_prefix.empty()) {
815 auto f = field_map.find(string());
816 if (f != field_map.end()) default_field_info = &(f->second);
819 // We always have the current prefix on the top of the stack.
820 prefix_stack.push_back(default_field_info);
825 unsigned newprev = ' ';
828 DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
829 IN_GROUP2, EXPLICIT_SYNONYM
831 while (it != end && !state.error) {
832 bool last_was_operator = false;
833 bool last_was_operator_needing_term = false;
834 if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
837 if (it == end) break;
839 last_was_operator_needing_term = false;
840 last_was_operator = true;
843 just_had_operator_needing_term:
844 last_was_operator_needing_term = true;
845 last_was_operator = true;
847 if (mode == IN_PHRASED_TERM) mode = DEFAULT;
848 if (is_whitespace(*it)) {
851 it = find_if(it, end, is_not_whitespace);
852 if (it == end) break;
856 (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
857 // Scan forward to see if this could be the "start of range"
858 // token. Sadly this has O(n^2) tendencies, though at least
859 // "n" is the number of words in a query which is likely to
860 // remain fairly small. FIXME: can we tokenise more elegantly?
861 Utf8Iterator it_initial = it;
865 if (ch == '.' && *p == '.') {
868 Unicode::append_utf8(a, *it++);
870 // Trim off the trailing ".".
871 a.resize(a.size() - 1);
873 // Either end of the range can be empty (for an open-ended
874 // range) but both can't be empty.
875 if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
877 // Allow any character except whitespace and ')' in the
879 while (p != end && *p > ' ' && *p != ')') {
880 Unicode::append_utf8(b, *p++);
882 Term * range = state.range(a, b);
884 state.error = "Unknown range operation";
885 if (a.find(':', 1) == string::npos) {
888 // Might be a boolean filter with ".." in. Leave
889 // state.error in case it isn't.
893 Parse(&parser, RANGE, range, &state);
899 // Allow any character except whitespace and '(' in the lower
901 if (ch <= ' ' || ch == '(') break;
906 if (!is_wordchar(*it)) {
907 unsigned prev = newprev;
910 // Drop out of IN_GROUP mode.
911 if (mode == IN_GROUP || mode == IN_GROUP2)
915 case 0x201c: // Left curly double quote.
916 case 0x201d: // Right curly double quote.
918 if (mode == DEFAULT) {
920 it = find_if(it, end, is_not_whitespace);
922 // Ignore an unmatched " at the end of the query to
923 // avoid generating an empty pair of QUOTEs which will
924 // cause a parse error.
927 if (is_double_quote(*it)) {
928 // Ignore empty "" (but only if we're not already
929 // IN_QUOTES as we don't merge two adjacent quoted
935 if (flags & QueryParser::FLAG_PHRASE) {
936 Parse(&parser, QUOTE, NULL, &state);
937 if (mode == DEFAULT) {
940 // Remove the prefix we pushed for this phrase.
941 if (mode == IN_PREFIXED_QUOTES)
942 prefix_stack.pop_back();
948 case '+': case '-': // Loved or hated term/phrase/subexpression.
949 // Ignore + or - at the end of the query string.
950 if (it == end) goto done;
951 if (prev > ' ' && prev != '(') {
952 // Or if not after whitespace or an open bracket.
955 if (is_whitespace(*it) || *it == '+' || *it == '-') {
956 // Ignore + or - followed by a space, or further + or -.
957 // Postfix + (such as in C++ and H+) is handled as part of
958 // the term lexing code in parse_term().
962 if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
966 } else if (last_was_operator) {
967 token = HATE_AFTER_AND;
971 Parse(&parser, token, NULL, &state);
972 goto just_had_operator_needing_term;
974 // Need to prevent the term after a LOVE or HATE starting a
978 case '(': // Bracketed subexpression.
980 it = find_if(it, end, is_not_whitespace);
981 // Ignore ( at the end of the query string.
982 if (it == end) goto done;
983 if (prev > ' ' && strchr("()+-", prev) == NULL) {
984 // Or if not after whitespace or a bracket or '+' or '-'.
992 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
993 prefix_stack.push_back(prefix_stack.back());
994 Parse(&parser, BRA, NULL, &state);
998 case ')': // End of bracketed subexpression.
999 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1000 // Remove the prefix we pushed for the corresponding BRA.
1001 // If brackets are unmatched, it's a syntax error, but
1002 // that's no excuse to SEGV!
1003 if (prefix_stack.size() > 1) prefix_stack.pop_back();
1004 Parse(&parser, KET, NULL, &state);
1008 case '~': // Synonym expansion.
1009 // Ignore at the end of the query string.
1010 if (it == end) goto done;
1011 if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
1012 if (prev > ' ' && strchr("+-(", prev) == NULL) {
1013 // Or if not after whitespace, +, -, or an open bracket.
1016 if (!is_wordchar(*it)) {
1017 // Ignore if not followed by a word character.
1020 Parse(&parser, SYNONYM, NULL, &state);
1021 mode = EXPLICIT_SYNONYM;
1022 goto just_had_operator_needing_term;
1026 // Skip any other characters.
1030 Assert(is_wordchar(*it));
1032 size_t term_start_index = it.raw() - qs.data();
1034 newprev = 'A'; // Any letter will do...
1036 // A term, a prefix, or a boolean operator.
1037 const FieldInfo * field_info = NULL;
1038 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
1039 !field_map.empty()) {
1040 // Check for a fieldname prefix (e.g. title:historical).
1041 Utf8Iterator p = find_if(it, end, is_not_wordchar);
1042 if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
1046 Unicode::append_utf8(field, *p++);
1047 map<string, FieldInfo>::const_iterator f;
1048 f = field_map.find(field);
1049 if (f != field_map.end()) {
1050 // Special handling for prefixed fields, depending on the
1051 // type of the prefix.
1053 field_info = &(f->second);
1055 if (field_info->type != NON_BOOLEAN) {
1056 // Drop out of IN_GROUP if we're in it.
1057 if (mode == IN_GROUP || mode == IN_GROUP2)
1061 if (it != end && is_double_quote(*it)) {
1062 // Quoted boolean term (can contain any character).
1063 bool fancy = (*it != '"');
1067 // Interpret "" as an escaped ".
1068 if (++it == end || *it != '"')
1070 } else if (fancy && is_double_quote(*it)) {
1071 // If the opening quote was ASCII, then the
1072 // closing one must be too - otherwise
1073 // the user can't protect non-ASCII double
1074 // quote characters by quoting or escaping.
1078 Unicode::append_utf8(name, *it++);
1081 // Can't boolean filter prefix a subexpression, so
1082 // just use anything following the prefix until the
1083 // next space or ')' as part of the boolean filter
1085 while (it != end && *it > ' ' && *it != ')')
1086 Unicode::append_utf8(name, *it++);
1088 // Build the unstemmed form in field.
1091 // Clear any pending range error.
1093 Term * token = new Term(&state, name, field_info, field);
1094 Parse(&parser, BOOLEAN_FILTER, token, &state);
1098 if ((flags & FLAG_PHRASE) && is_double_quote(ch)) {
1099 // Prefixed phrase, e.g.: subject:"space flight"
1100 mode = IN_PREFIXED_QUOTES;
1101 Parse(&parser, QUOTE, NULL, &state);
1105 prefix_stack.push_back(field_info);
1109 if (ch == '(' && (flags & FLAG_BOOLEAN)) {
1110 // Prefixed subexpression, e.g.: title:(fast NEAR food)
1112 Parse(&parser, BRA, NULL, &state);
1116 prefix_stack.push_back(field_info);
1121 // Allow 'path:/usr/local' but not 'foo::bar::baz'.
1122 while (is_phrase_generator(ch)) {
1129 if (is_wordchar(ch)) {
1134 // It looks like a prefix but isn't, so parse it as
1144 bool is_cjk_term = false;
1145 string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
1147 // Boolean operators.
1148 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
1149 (flags & FLAG_BOOLEAN) &&
1150 // Don't want to interpret A.N.D. as an AND operator.
1153 term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {
1156 if (flags & FLAG_BOOLEAN_ANY_CASE) {
1157 for (string::iterator i = op.begin(); i != op.end(); ++i) {
1161 if (op.size() == 3) {
1163 Parse(&parser, AND, NULL, &state);
1164 goto just_had_operator;
1167 Parse(&parser, NOT, NULL, &state);
1168 goto just_had_operator;
1171 Parse(&parser, XOR, NULL, &state);
1172 goto just_had_operator;
1175 if (it != end && *it == '/') {
1177 Utf8Iterator p = it;
1178 while (++p != end && U_isdigit(*p)) {
1179 width = (width * 10) + (*p - '0');
1181 if (width && (p == end || is_whitespace(*p))) {
1183 Parse(&parser, ADJ, new Term(width), &state);
1184 goto just_had_operator;
1187 Parse(&parser, ADJ, NULL, &state);
1188 goto just_had_operator;
1191 } else if (op.size() == 2) {
1193 Parse(&parser, OR, NULL, &state);
1194 goto just_had_operator;
1196 } else if (op.size() == 4) {
1198 if (it != end && *it == '/') {
1200 Utf8Iterator p = it;
1201 while (++p != end && U_isdigit(*p)) {
1202 width = (width * 10) + (*p - '0');
1204 if (width && (p == end || is_whitespace(*p))) {
1206 Parse(&parser, NEAR, new Term(width), &state);
1207 goto just_had_operator;
1210 Parse(&parser, NEAR, NULL, &state);
1211 goto just_had_operator;
1217 // If no prefix is set, use the default one.
1218 if (!field_info) field_info = prefix_stack.back();
1220 Assert(field_info->type == NON_BOOLEAN);
1223 string unstemmed_term(term);
1224 term = Unicode::tolower(term);
1226 // Reuse stem_strategy - STEM_SOME here means "stem terms except
1227 // when used with positional operators".
1228 stem_strategy stem_term = stem_action;
1229 if (stem_term != STEM_NONE) {
1230 if (!stemmer.internal.get()) {
1231 // No stemmer is set.
1232 stem_term = STEM_NONE;
1233 } else if (stem_term == STEM_SOME ||
1234 stem_term == STEM_SOME_FULL_POS) {
1235 if (!should_stem(unstemmed_term) ||
1236 (it != end && is_stem_preventer(*it))) {
1237 // Don't stem this particular term.
1238 stem_term = STEM_NONE;
1243 Term * term_obj = new Term(&state, term, field_info,
1244 unstemmed_term, stem_term, term_pos++);
1247 Parse(&parser, CJKTERM, term_obj, &state);
1248 if (it == end) break;
1252 if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1254 if ((flags & FLAG_WILDCARD) && *it == '*') {
1257 if (p == end || !is_wordchar(*p)) {
1259 if (mode == IN_GROUP || mode == IN_GROUP2) {
1260 // Drop out of IN_GROUP and flag that the group
1261 // can be empty if all members are stopwords.
1262 if (mode == IN_GROUP2)
1263 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1266 // Wildcard at end of term (also known as
1267 // "right truncation").
1268 Parse(&parser, WILD_TERM, term_obj, &state);
1273 if (flags & FLAG_PARTIAL) {
1274 if (mode == IN_GROUP || mode == IN_GROUP2) {
1275 // Drop out of IN_GROUP and flag that the group
1276 // can be empty if all members are stopwords.
1277 if (mode == IN_GROUP2)
1278 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1281 // Final term of a partial match query, with no
1282 // following characters - treat as a wildcard.
1283 Parse(&parser, PARTIAL_TERM, term_obj, &state);
1289 // Check spelling, if we're a normal term, and any of the prefixes
1291 if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
1292 const list<string> & pfxes = field_info->prefixes;
1293 list<string>::const_iterator pfx_it;
1294 for (pfx_it = pfxes.begin(); pfx_it != pfxes.end(); ++pfx_it) {
1295 if (!pfx_it->empty())
1297 const string & suggest = db.get_spelling_suggestion(term);
1298 if (!suggest.empty()) {
1299 if (corrected_query.empty()) corrected_query = qs;
1300 size_t term_end_index = it.raw() - qs.data();
1301 size_t n = term_end_index - term_start_index;
1302 size_t pos = term_start_index + correction_offset;
1303 corrected_query.replace(pos, n, suggest);
1304 correction_offset += suggest.size();
1305 correction_offset -= n;
1311 if (mode == IN_PHRASED_TERM) {
1312 Parse(&parser, PHR_TERM, term_obj, &state);
1314 // See if the next token will be PHR_TERM - if so, this one
1315 // needs to be TERM not GROUP_TERM.
1316 if ((mode == IN_GROUP || mode == IN_GROUP2) &&
1317 is_phrase_generator(*it)) {
1318 // FIXME: can we clean this up?
1319 Utf8Iterator p = it;
1322 } while (p != end && is_phrase_generator(*p));
1323 // Don't generate a phrase unless the phrase generators are
1324 // immediately followed by another term.
1325 if (p != end && is_wordchar(*p)) {
1331 if (mode == IN_GROUP || mode == IN_GROUP2) {
1335 Parse(&parser, token, term_obj, &state);
1336 if (token == TERM && mode != DEFAULT)
1341 if (it == end) break;
1343 if (is_phrase_generator(*it)) {
1344 // Skip multiple phrase generators.
1347 } while (it != end && is_phrase_generator(*it));
1348 // Don't generate a phrase unless the phrase generators are
1349 // immediately followed by another term.
1350 if (it != end && is_wordchar(*it)) {
1351 mode = IN_PHRASED_TERM;
1352 term_start_index = it.raw() - qs.data();
1355 } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1356 int old_mode = mode;
1358 if (!last_was_operator_needing_term && is_whitespace(*it)) {
1360 // Skip multiple whitespace.
1363 } while (it != end && is_whitespace(*it));
1364 // Don't generate a group unless the terms are only separated
1366 if (it != end && is_wordchar(*it)) {
1367 if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
1378 // Implicitly close any unclosed quotes.
1379 if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
1380 Parse(&parser, QUOTE, NULL, &state);
1382 // Implicitly close all unclosed brackets.
1383 while (prefix_stack.size() > 1) {
1384 Parse(&parser, KET, NULL, &state);
1385 prefix_stack.pop_back();
1387 Parse(&parser, 0, NULL, &state);
1390 errmsg = state.error;
/// Query pointer member; NULL unless the one-argument constructor below
/// supplies a different value.
1398 Query* query = NULL;
1401 // filter is a map from prefix to a query for that prefix. Queries with
1402 // the same prefix are combined with OR, and the results of this are
1403 // combined with AND to get the full filter.
// NOTE(review): append_filter() combines with AND rather than OR when the
// map key is the empty string, so "combined with OR" only describes
// non-empty keys.
1404 map<string, Query> filter;
/// Construct with query_ as the initial value of the query member.
1409 ProbQuery(Query* query_) : query(query_) {}
/// Store q as the filter for grouping, overwriting any query already held
/// under that key.
1417 void add_filter(const string& grouping, const Query & q) {
1418 filter[grouping] = q;
/// Merge qnew into the filter stored under grouping.
///
/// If no entry exists for grouping yet, qnew is inserted unchanged.
/// Otherwise the stored query is replaced by (stored OP qnew), where OP is
/// chosen from the grouping key as described below.
1421 void append_filter(const string& grouping, const Query & qnew) {
1422 auto it = filter.find(grouping);
1423 if (it == filter.end()) {
1424 filter.insert(make_pair(grouping, qnew));
1426 Query & q = it->second;
1427 // We OR multiple filters with the same prefix if they're
1428 // exclusive, otherwise we AND them.
// A non-empty grouping key is what is treated as "exclusive" here.
1429 bool exclusive = !grouping.empty();
1430 Query::op op = exclusive ? Query::OP_OR : Query::OP_AND;
1431 q = Query(op, q, qnew);
/// Store range as the filter for grouping, overwriting any query already
/// held under that key.
1435 void add_filter_range(const string& grouping, const Query & range) {
1436 filter[grouping] = range;
/// OR range into the filter stored under grouping.
///
/// Unlike append_filter(), the operator is always OP_OR regardless of
/// whether grouping is empty.
1439 void append_filter_range(const string& grouping, const Query & range) {
// operator[] creates a default-constructed Query entry first if the key is
// not in the map yet.
1440 Query & q = filter[grouping];
1441 q = Query(Query::OP_OR, q, range);
/// Combine every entry of the filter map into one query using OP_AND.
///
/// The map must not be empty; that precondition is checked by the Assert.
1444 Query merge_filters() const {
1445 auto i = filter.begin();
1446 Assert(i != filter.end());
// Start from the first entry, then AND each remaining entry onto it.
1447 Query q = i->second;
1448 while (++i != filter.end()) {
1449 q = Query(Query::OP_AND, q, i->second);
1455 /// A group of terms separated only by whitespace.
/// The Term pointers in this group; add_term() appends to this vector.
1457 vector<Term *> terms;
1459 /** Controls how to handle a group where all terms are stopwords.
1461 * If true, then as_group() returns NULL. If false, then the
1462 * stopword status of the terms is ignored.
/// Constructor: empty_ok starts out false.
/// NOTE(review): presumably t1 and t2 are stored in terms by the
/// constructor body, which is not shown in this listing - confirm.
1466 TermGroup(Term* t1, Term* t2) : empty_ok(false) {
1472 /// Factory function - ensures heap allocation.
1473 static TermGroup* create(Term* t1, Term* t2) {
1474 return new TermGroup(t1, t2);
// Visits every element of terms; the loop body is not shown in this listing.
1478 for (auto&& t : terms) {
1483 /// Add a Term object to this TermGroup object.
1484 void add_term(Term * term) {
1485 terms.push_back(term);
1488 /// Set the empty_ok flag.
1489 void set_empty_ok() { empty_ok = true; }
1491 /// Convert to a Xapian::Query * using default_op.
1492 Query * as_group(State *state) const;
/// Build the query for this group of whitespace-separated terms.
///
/// A term for which the stopper returns true is handed to
/// state->add_to_stoplist() instead of being added as a subquery. When
/// FLAG_AUTO_MULTIWORD_SYNONYMS is set, runs of consecutive terms are also
/// looked up as space-joined synonym keys, and a run with synonyms becomes a
/// single OP_SYNONYM subquery. The collected subqueries are combined with
/// state->default_op().
///
/// @param state  Parser state supplying the stopper, stoplist, flags,
///               database and default operator.
1496 TermGroup::as_group(State *state) const
1498 const Xapian::Stopper * stopper = state->get_stopper();
// Remember the stoplist length so an all-stopword group can be rolled back.
1499 size_t stoplist_size = state->stoplist_size();
1500 bool default_op_is_positional = is_positional(state->default_op());
1502 Query::op default_op = state->default_op();
1503 vector<Query> subqs;
1504 subqs.reserve(terms.size());
1505 if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
1506 // Check for multi-word synonyms.
1507 Database db = state->get_database();
// [begin, i) is the run of terms currently being tried as one synonym key.
1510 vector<Term*>::const_iterator begin = terms.begin();
1511 vector<Term*>::const_iterator i = begin;
1512 while (i != terms.end()) {
1513 TermIterator synkey(db.synonym_keys_begin((*i)->name));
1514 TermIterator synend(db.synonym_keys_end((*i)->name));
1515 if (synkey == synend) {
1516 // No multi-synonym matches.
1517 if (stopper && (*stopper)((*i)->name)) {
1518 state->add_to_stoplist(*i);
1520 if (default_op_is_positional)
1521 (*i)->need_positions();
1522 subqs.push_back((*i)->get_query_with_auto_synonyms());
// Extend the key one term at a time while some synonym key still starts
// with it.
1528 while (i != terms.end()) {
1529 if (!key.empty()) key += ' ';
1532 synkey.skip_to(key);
1533 if (synkey == synend || !startswith(*synkey, key)) break;
1535 // Greedily try to match as many consecutive words as possible.
1536 TermIterator syn, end;
1538 syn = db.synonyms_begin(key);
1539 end = db.synonyms_end(key);
1540 if (syn != end) break;
// No synonyms for this key: drop the last word (and its separating space)
// and retry with the shorter key.
1541 if (--i == begin) break;
1542 key.resize(key.size() - (*i)->name.size() - 1);
1545 // No multi-synonym matches.
1546 if (stopper && (*stopper)((*i)->name)) {
1547 state->add_to_stoplist(*i);
1549 if (default_op_is_positional)
1550 (*i)->need_positions();
1551 subqs.push_back((*i)->get_query_with_auto_synonyms());
// Build the query for the original terms of the matched run [begin, i).
1557 vector<Query> subqs2;
1558 vector<Term*>::const_iterator j;
1559 for (j = begin; j != i; ++j) {
1560 if (stopper && (*stopper)((*j)->name)) {
1561 state->add_to_stoplist(*j);
1563 if (default_op_is_positional)
// Request positions on the term being added, *j. The iterator i is the
// exclusive end of this run and can equal terms.end(), so it must not be
// dereferenced here.
1564 (*j)->need_positions();
1565 subqs2.push_back((*j)->get_query());
1568 Query q_original_terms;
1569 if (default_op_is_positional) {
1570 q_original_terms = Query(default_op,
1571 subqs2.begin(), subqs2.end(),
1574 q_original_terms = Query(default_op,
1575 subqs2.begin(), subqs2.end());
1579 // Use the position of the first term for the synonyms.
1580 Query q(Query::OP_SYNONYM,
1581 SynonymIterator(syn, (*begin)->pos, &q_original_terms),
1582 SynonymIterator(end));
// Multi-word synonyms not enabled: handle each term on its own.
1588 vector<Term*>::const_iterator i;
1589 for (i = terms.begin(); i != terms.end(); ++i) {
1590 if (stopper && (*stopper)((*i)->name)) {
1591 state->add_to_stoplist(*i);
1593 if (default_op_is_positional)
1594 (*i)->need_positions();
1595 subqs.push_back((*i)->get_query_with_auto_synonyms());
// Nothing survived but the stoplist grew, so every term was a stopword.
1600 if (!empty_ok && stopper && subqs.empty() &&
1601 stoplist_size < state->stoplist_size()) {
1602 // This group is all stopwords, so roll-back, disable stopper
1603 // temporarily, and reprocess this group.
1604 state->stoplist_resize(stoplist_size);
1610 if (!subqs.empty()) {
1611 if (default_op_is_positional) {
1612 q = new Query(default_op, subqs.begin(), subqs.end(),
1615 q = new Query(default_op, subqs.begin(), subqs.end());
1622 /// Some terms which form a positional sub-query.
/// The terms of the sub-query, in the order add_positional_term() appended
/// them.
1624 vector<Term *> terms;
1627 /** Keep track of whether the terms added all have the same list of
1628 * prefixes. If so, we'll build a set of phrases, one using each prefix.
1629 * This works around the limitation that a phrase cannot have multiple
1630 * components which are "OR" combinations of terms, but is also probably
1631 * what users expect: i.e., if a user specifies a phrase in a field, and
1632 * that field maps to multiple prefixes, the user probably wants a phrase
1633 * returned with all terms having one of those prefixes, rather than a
1634 * phrase comprised of terms with differing prefixes.
1636 bool uniform_prefixes;
1638 /** The list of prefixes of the terms added.
1639 * This will be NULL if the terms have different prefixes.
1641 const list<string> * prefixes;
1643 /// Convert to a query using the given operator and window size.
///
/// @param op       Operator used to combine the terms.
/// @param w_delta  Added to the number of terms to give the window size.
1644 Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
1646 size_t n_terms = terms.size();
1647 Xapian::termcount w = w_delta + terms.size();
1648 if (uniform_prefixes) {
// All terms share one prefix list: build one windowed query per prefix and
// OR those per-prefix queries together.
1650 list<string>::const_iterator piter;
1651 for (piter = prefixes->begin(); piter != prefixes->end(); ++piter) {
1652 vector<Query> subqs;
1653 subqs.reserve(n_terms);
1654 vector<Term *>::const_iterator titer;
1655 for (titer = terms.begin(); titer != terms.end(); ++titer) {
// The term is generated for the current prefix and keeps its own position.
1657 subqs.push_back(Query(t->make_term(*piter), 1, t->pos));
1659 add_to_query(q, Query::OP_OR,
1660 Query(op, subqs.begin(), subqs.end(), w));
// Prefix lists differ: use each term's own get_query() and build a single
// windowed query from them.
1664 vector<Query> subqs;
1665 subqs.reserve(n_terms);
1666 vector<Term *>::const_iterator titer;
1667 for (titer = terms.begin(); titer != terms.end(); ++titer) {
1668 subqs.push_back((*titer)->get_query());
1670 q = new Query(op, subqs.begin(), subqs.end(), w);
/// Constructor: window starts at 0, uniform_prefixes at true and prefixes
/// at NULL.
1677 Terms() : window(0), uniform_prefixes(true), prefixes(NULL) { }
1680 /// Factory function - ensures heap allocation.
1681 static Terms* create() {
// Visits every element of terms; the loop body is not shown in this listing.
1686 for (auto&& t : terms) {
1691 /// Add an unstemmed Term object to this Terms object.
///
/// Also keeps uniform_prefixes up to date by comparing the new term's
/// prefix list with the one recorded from the first term.
1692 void add_positional_term(Term * term) {
1693 const list<string> & term_prefixes = term->field_info->prefixes;
1694 if (terms.empty()) {
// First term: remember its prefix list as the reference.
1695 prefixes = &term_prefixes;
// Same list object means same prefixes, so the element-wise comparison
// is only needed when the pointers differ.
1696 } else if (uniform_prefixes && prefixes != &term_prefixes) {
1697 if (*prefixes != term_prefixes) {
1699 uniform_prefixes = false;
1702 term->need_positions();
1703 terms.push_back(term);
/// Raise window to alternative_window if that is larger than the current
/// value; a smaller or equal value leaves window unchanged.
1706 void adjust_window(size_t alternative_window) {
1707 if (alternative_window > window) window = alternative_window;
1710 /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
// A w_delta of 0 makes the window equal to the number of terms.
1711 Query * as_phrase_query() const {
1712 return as_opwindow_query(Query::OP_PHRASE, 0);
1715 /// Convert to a Xapian::Query * using OP_NEAR.
1716 Query * as_near_query() const {
1717 // The common meaning of 'a NEAR b' is "a within 10 terms of b", which
1718 // means a window size of 11. For more than 2 terms, we just add one
1719 // to the window size for each extra term.
// NOTE(review): w is defined on lines not shown in this listing -
// presumably derived from the window member; confirm.
1722 return as_opwindow_query(Query::OP_NEAR, w - 1);
1725 /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
1726 Query * as_adj_query() const {
1727 // The common meaning of 'a ADJ b' is "a at most 10 terms before b",
1728 // which means a window size of 11. For more than 2 terms, we just add
1729 // one to the window size for each extra term.
// NOTE(review): w is defined on lines not shown in this listing -
// presumably derived from the window member; confirm.
1732 return as_opwindow_query(Query::OP_PHRASE, w - 1);
/// Add the characters of this term's name to terms as positional terms.
///
/// @param terms  The Terms object which receives the new Term objects.
1737 Term::as_positional_cjk_term(Terms * terms) const
1739 // Add each individual CJK character to the phrase.
// Iterates over name one Unicode character at a time.
1741 for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
1742 Unicode::append_utf8(t, *it);
// The new Term copies this term's field_info, unstemmed form, stem
// strategy and position.
1743 Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
1744 terms->add_positional_term(c);
1748 // FIXME: we want to add the n-grams as filters too for efficiency.
// BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT):
//   E       receives the new Query built as Query(OP, *A, *B).
//   A, B    the left and right operand Query pointers.
//   OP      the Query operator to apply.
//   OP_TXT  must be a string literal: it is concatenated with adjacent
//           literals to form the syntax error message.
// On the error path state->error is set and yy_parse_failed() is called, so
// the macro can only be used where state and yypParser are in scope.
1753 // Helper macro for converting a boolean operation into a Xapian::Query.
1754 #define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
1757 state->error = "Syntax: <expression> " OP_TXT " <expression>";\
1758 yy_parse_failed(yypParser);\
1761 E = new Query(OP, *A, *B);\
// Every token carries a Term * as its value, and the default token
// destructor deletes it.
1768 %token_type {Term *}
1769 %token_destructor {delete $$;}
// Makes a State * named state available inside the grammar actions.
1771 %extra_argument {State * state}
// NOTE(review): the directive lines which introduce the two code fragments
// below are not shown in this listing - presumably the parse-failure and
// syntax-error handlers; confirm.
1774 // If we've not already set an error message, set a default one.
1775 if (!state->error) state->error = "parse error";
1779 yy_parse_failed(yypParser);
1782 // Operators, grouped in order of increasing precedence:
1788 %left LOVE HATE HATE_AFTER_AND SYNONYM.
1790 // Destructors for terminal symbols:
// Each destructor below deletes the Term object attached to a token of the
// named type.
1792 // TERM is a query term, including prefix (if any).
1793 %destructor TERM {delete $$;}
1795 // GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
1796 // is only separated by whitespace characters.
1797 %destructor GROUP_TERM {delete $$;}
1799 // PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
1800 // separated only by one or more phrase generator characters (hyphen and
1801 // apostrophe are common examples - see is_phrase_generator() for the list
1802 // of all punctuation which does this).
1803 %destructor PHR_TERM {delete $$;}
1805 // WILD_TERM is like a TERM, but has a trailing wildcard which needs to be
1807 %destructor WILD_TERM {delete $$;}
1809 // PARTIAL_TERM is like a TERM, but it's at the end of the query string and
1810 // we're doing "search as you type". It expands to something like WILD_TERM
1812 %destructor PARTIAL_TERM {delete $$;}
1814 // BOOLEAN_FILTER is a query term with a prefix registered using
1815 // add_boolean_prefix(). It's added to the query using an OP_FILTER operator,
1816 // (or OP_AND_NOT if it's negated) e.g. site:xapian.org or -site:xapian.org
1817 %destructor BOOLEAN_FILTER {delete $$;}
1821 // query - The whole query - just an expr or nothing.
1823 // query non-terminal doesn't need a type, so just give a dummy one.
// The result is handed back to the caller through state->query rather than
// through a value on the query non-terminal.
1826 query ::= expr(E). {
1827 // Save the parsed query in the State structure so we can return it.
// Stores an empty Query; the condition guarding this assignment is on a
// line not shown in this listing.
1832 state->query = Query();
1837 // Handle a query string with no terms in.
1838 state->query = Query();
1841 // expr - A query expression.
1843 %type expr {Query *}
1844 %destructor expr {delete $$;}
1846 expr(E) ::= prob_expr(E).
// Each binary rule below delegates to BOOL_OP_TO_QUERY, passing the operator
// and the operator text used in the syntax error message.
1848 expr(E) ::= bool_arg(A) AND bool_arg(B).
1849 { BOOL_OP_TO_QUERY(E, A, Query::OP_AND, B, "AND"); }
1851 expr(E) ::= bool_arg(A) NOT bool_arg(B). {
1852 // 'NOT foo' -> '<alldocuments> NOT foo'
// With FLAG_PURE_NOT set, a missing left operand is replaced by a query for
// the empty term.
1853 if (!A && (state->flags & QueryParser::FLAG_PURE_NOT)) {
1854 A = new Query("", 1, 0);
1856 BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "NOT");
// "AND NOT" also maps to OP_AND_NOT; the rule takes the precedence of NOT.
1859 expr(E) ::= bool_arg(A) AND NOT bool_arg(B). [NOT]
1860 { BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND NOT"); }
// AND followed by HATE_AFTER_AND also maps to OP_AND_NOT; the rule takes the
// precedence of AND.
1862 expr(E) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND]
1863 { BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND"); }
1865 expr(E) ::= bool_arg(A) OR bool_arg(B).
1866 { BOOL_OP_TO_QUERY(E, A, Query::OP_OR, B, "OR"); }
1868 expr(E) ::= bool_arg(A) XOR bool_arg(B).
1869 { BOOL_OP_TO_QUERY(E, A, Query::OP_XOR, B, "XOR"); }
1871 // bool_arg - an argument to a boolean operator such as AND or OR.
1873 %type bool_arg {Query *}
1874 %destructor bool_arg {delete $$;}
1876 bool_arg(A) ::= expr(A).
// Empty production: lets an operator with a missing operand still be reduced
// so the expr rules can produce a specific error message.
1878 bool_arg(A) ::= . [ERROR] {
1879 // Set the argument to NULL, which enables the bool_arg-using rules in
1880 // expr above to report uses of AND, OR, etc which don't have two
1887 %type prob_expr {Query *}
1888 %destructor prob_expr {delete $$;}
1890 prob_expr(E) ::= prob(P). {
1893 // Handle any "+ terms".
1895 if (P->love->empty()) {
1901 add_to_query(E, Query::OP_AND_MAYBE, P->love);
1907 // Handle any boolean filters.
1908 if (!P->filter.empty()) {
1910 add_to_query(E, Query::OP_FILTER, P->merge_filters());
1912 // Make the query a boolean one.
1913 E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
1916 // Handle any "- terms".
1917 if (P->hate && !P->hate->empty()) {
1920 yy_parse_failed(yypParser);
1923 *E = Query(Query::OP_AND_NOT, *E, *P->hate);
1928 prob_expr(E) ::= term(E).
1930 // prob - a sub-expression consisting of stop_terms, "+" terms, "-" terms,
1931 // boolean filters, and/or ranges.
1933 // Note: stop_term can also be several other things other than a simple term!
1935 %type prob {ProbQuery *}
1936 %destructor prob {delete $$;}
// A range on its own: start a new ProbQuery and register the range query as
// a filter keyed by R->name.
1938 prob(P) ::= RANGE(R). {
1939 string grouping = R->name;
1940 const Query & range = R->as_range_query();
1941 P = new ProbQuery; /*P-overwrites-R*/
1942 P->add_filter_range(grouping, range);
// A range after existing content: OR it into any filter already stored
// under the same name.
1945 prob(P) ::= stop_prob(P) RANGE(R). {
1946 string grouping = R->name;
1947 const Query & range = R->as_range_query();
1948 P->append_filter_range(grouping, range);
// Two consecutive stop_terms: T seeds the ProbQuery and U is combined with
// it using the default operator.
1951 prob(P) ::= stop_term(T) stop_term(U). {
1952 P = new ProbQuery(T); /*P-overwrites-T*/
1954 Query::op op = state->default_op();
1955 if (P->query && is_positional(op)) {
1956 // If default_op is OP_NEAR or OP_PHRASE, set the window size to
1957 // 11 for the first pair of terms and it will automatically grow
1958 // by one for each subsequent term.
1959 Query * subqs[2] = { P->query, U };
1960 *(P->query) = Query(op, subqs, subqs + 2, 11);
1963 add_to_query(P->query, op, U);
1968 prob(P) ::= prob(P) stop_term(T). {
1969 // If T is a stopword, there's nothing to do here.
1970 if (T) add_to_query(P->query, state->default_op(), T);
// "+term" at the start: where T ends up depends on whether the default
// operator is OP_AND (both branch bodies are on lines not shown in this
// listing).
1973 prob(P) ::= LOVE term(T). {
1975 if (state->default_op() == Query::OP_AND) {
1982 prob(P) ::= stop_prob(P) LOVE term(T). {
1983 if (state->default_op() == Query::OP_AND) {
1984 /* The default op is AND, so we just put loved terms into the query
1985 * (in this case the only effect of love is to ignore the stopword
1987 add_to_query(P->query, Query::OP_AND, T);
// Otherwise loved terms accumulate in P->love, combined with OP_AND.
1989 add_to_query(P->love, Query::OP_AND, T);
1993 prob(P) ::= HATE term(T). {
// Hated terms accumulate in P->hate, combined with OP_OR.
1998 prob(P) ::= stop_prob(P) HATE term(T). {
1999 add_to_query(P->hate, Query::OP_OR, T);
// A hated boolean filter goes into P->hate like a hated term, not into the
// filter map.
2002 prob(P) ::= HATE BOOLEAN_FILTER(T). {
2004 P->hate = new Query(T->get_query());
2008 prob(P) ::= stop_prob(P) HATE BOOLEAN_FILTER(T). {
2009 add_to_query(P->hate, Query::OP_OR, T->get_query());
// Plain boolean filters are stored in the filter map under the term's
// grouping: add_filter() for the first one, append_filter() afterwards.
2013 prob(P) ::= BOOLEAN_FILTER(T). {
2015 P->add_filter(T->get_grouping(), T->get_query());
2019 prob(P) ::= stop_prob(P) BOOLEAN_FILTER(T). {
2020 P->append_filter(T->get_grouping(), T->get_query());
2024 prob(P) ::= LOVE BOOLEAN_FILTER(T). {
2025 // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
// so store it through ProbQuery::add_filter(), exactly as the rule for a
// plain leading BOOLEAN_FILTER does, rather than writing to the filter map
// directly.
2027 P->add_filter(T->get_grouping(), T->get_query());
2031 prob(P) ::= stop_prob(P) LOVE BOOLEAN_FILTER(T). {
2032 // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
// so merge it through ProbQuery::append_filter(), exactly as the rule for
// "stop_prob BOOLEAN_FILTER" does. append_filter() ORs filters which share
// a non-empty grouping and ANDs those with an empty grouping; always using
// OP_OR here made a loved filter with an empty grouping behave differently
// from the same filter written without the "+".
2034 P->append_filter(T->get_grouping(), T->get_query());
2039 // stop_prob - A prob or a stop_term.
2041 %type stop_prob {ProbQuery *}
2042 %destructor stop_prob {delete $$;}
2044 stop_prob(P) ::= prob(P).
// A lone stop_term is wrapped in a new ProbQuery whose query member is T, so
// the rules taking a stop_prob can treat both cases alike.
2046 stop_prob(P) ::= stop_term(T). {
2047 P = new ProbQuery(T); /*P-overwrites-T*/
2050 // stop_term - A term which should be checked against the stopword list,
2051 // or a compound_term.
2053 // If a term is loved, hated, or in a phrase, we don't want to consult the
2054 // stopword list, so stop_term isn't used there (instead term is).
2056 %type stop_term {Query *}
2057 %destructor stop_term {delete $$;}
2059 stop_term(T) ::= TERM(U). {
// A stopword is recorded in the state's stoplist; any other term becomes a
// Query built with automatic synonyms.
2060 if (state->is_stopword(U)) {
2062 state->add_to_stoplist(U);
2064 T = new Query(U->get_query_with_auto_synonyms());
2069 stop_term(T) ::= compound_term(T).
2071 // term - A term or a compound_term.
2073 %type term {Query *}
2074 %destructor term {delete $$;}
// Unlike the TERM rule for stop_term, this rule does not call
// state->is_stopword().
2076 term(T) ::= TERM(U). {
2077 T = new Query(U->get_query_with_auto_synonyms());
2081 term(T) ::= compound_term(T).
2083 // compound_term - A WILD_TERM, a quoted phrase (with or without prefix), a
2084 // phrased_term, group, near_expr, adj_expr, or a bracketed subexpression (with
2085 // or without prefix).
2087 %type compound_term {Query *}
2088 %destructor compound_term {delete $$;}
// Most rules below delegate to a conversion method on the token or
// sub-expression object, which returns the Query * for T.
2090 compound_term(T) ::= WILD_TERM(U).
2091 { T = U->as_wildcarded_query(state); /*T-overwrites-U*/ }
2093 compound_term(T) ::= PARTIAL_TERM(U).
2094 { T = U->as_partial_query(state); /*T-overwrites-U*/ }
2096 compound_term(T) ::= QUOTE phrase(P) QUOTE.
2097 { T = P->as_phrase_query(); }
2099 compound_term(T) ::= phrased_term(P).
2100 { T = P->as_phrase_query(); /*T-overwrites-P*/ }
2102 compound_term(T) ::= group(P).
2103 { T = P->as_group(state); /*T-overwrites-P*/ }
2105 compound_term(T) ::= near_expr(P).
2106 { T = P->as_near_query(); /*T-overwrites-P*/ }
2108 compound_term(T) ::= adj_expr(P).
2109 { T = P->as_adj_query(); /*T-overwrites-P*/ }
2111 compound_term(T) ::= BRA expr(E) KET.
// An explicit SYNONYM token before a TERM uses get_query_with_synonyms()
// rather than the automatic-synonym variant used for plain terms.
2114 compound_term(T) ::= SYNONYM TERM(U). {
2115 T = new Query(U->get_query_with_synonyms());
2119 compound_term(T) ::= CJKTERM(U). {
2120 { T = U->as_cjk_query(); /*T-overwrites-U*/ }
2123 // phrase - The "inside the quotes" part of a double-quoted phrase.
2125 %type phrase {Terms *}
2127 %destructor phrase {delete $$;}
// The first element creates the Terms object; later elements are appended
// to it.
2129 phrase(P) ::= TERM(T). {
2130 P = Terms::create();
2131 P->add_positional_term(T);
// A CJKTERM is added via as_positional_cjk_term(), which creates a Term per
// character instead of adding T itself.
2134 phrase(P) ::= CJKTERM(T). {
2135 P = Terms::create();
2136 T->as_positional_cjk_term(P);
2139 phrase(P) ::= phrase(P) TERM(T). {
2140 P->add_positional_term(T);
2143 phrase(P) ::= phrase(P) CJKTERM(T). {
2144 T->as_positional_cjk_term(P);
2147 // phrased_term - A phrased term works like a single term, but is actually
2148 // 2 or more terms linked together into a phrase by punctuation. There must be
2149 // at least 2 terms in order to be able to have punctuation between the terms!
2151 %type phrased_term {Terms *}
2152 %destructor phrased_term {delete $$;}
// The first TERM/PHR_TERM pair creates the Terms object; each further
// PHR_TERM is appended to it.
2154 phrased_term(P) ::= TERM(T) PHR_TERM(U). {
2155 P = Terms::create();
2156 P->add_positional_term(T);
2157 P->add_positional_term(U);
2160 phrased_term(P) ::= phrased_term(P) PHR_TERM(T). {
2161 P->add_positional_term(T);
2164 // group - A group of terms separated only by whitespace - candidates for
2165 // multi-term synonyms.
2167 %type group {TermGroup *}
2168 %destructor group {delete $$;}
// The first TERM/GROUP_TERM pair creates the TermGroup.
2170 group(P) ::= TERM(T) GROUP_TERM(U). {
2171 P = TermGroup::create(T, U); /*P-overwrites-T*/
// NOTE(review): the action bodies of the two rules below are not shown in
// this listing - presumably they call add_term() and set_empty_ok()
// respectively; confirm.
2174 group(P) ::= group(P) GROUP_TERM(T). {
2178 group(P) ::= group(P) EMPTY_GROUP_OK. {
2182 // near_expr - 2 or more terms with NEAR in between. There must be at least 2
2183 // terms in order for there to be any NEAR operators!
2185 %type near_expr {Terms *}
2186 %destructor near_expr {delete $$;}
2188 near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
2189 P = Terms::create();
2190 P->add_positional_term(T);
2191 P->add_positional_term(U);
// The value carried by the NEAR token is offered as a window size;
// adjust_window() only keeps it if it exceeds the current window.
// NOTE(review): the tokenizer passes NULL for a NEAR without "/<width>", so
// a guard on N is presumably on a line not shown in this listing - confirm.
2193 P->adjust_window(N->get_termpos());
2198 near_expr(P) ::= near_expr(P) NEAR(N) TERM(T). {
2199 P->add_positional_term(T);
2201 P->adjust_window(N->get_termpos());
2206 // adj_expr - 2 or more terms with ADJ in between. There must be at least 2
2207 // terms in order for there to be any ADJ operators!
2209 %type adj_expr {Terms *}
2210 %destructor adj_expr {delete $$;}
2212 adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
2213 P = Terms::create();
2214 P->add_positional_term(T);
2215 P->add_positional_term(U);
// The value carried by the ADJ token is offered as a window size;
// adjust_window() only keeps it if it exceeds the current window.
// NOTE(review): the tokenizer passes NULL for an ADJ without "/<width>", so
// a guard on N is presumably on a line not shown in this listing - confirm.
2217 P->adjust_window(N->get_termpos());
2222 adj_expr(P) ::= adj_expr(P) ADJ(N) TERM(T). {
2223 P->add_positional_term(T);
2225 P->adjust_window(N->get_termpos());
2230 // Select yacc syntax highlighting in vim editor: vim: syntax=yacc
2231 // (lemon syntax colouring isn't supplied by default; yacc does an OK job).