xapian-core/queryparser/queryparser.lemony

   1 %include {
   2 /** @file
   3  * @brief build a Xapian::Query object from a user query string
   4  */
   5 /* Copyright (C) 2004-2024 Olly Betts
   6  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
   7  * Copyright (C) 2010 Adam Sjøgren
   8  *
   9  * This program is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU General Public License as
  11  * published by the Free Software Foundation; either version 2 of the
  12  * License, or (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  22  * USA
  23  */
  24
  25 #include <config.h>
  26
  27 #include "queryparser_internal.h"
  28
  29 #include "api/queryinternal.h"
  30 #include "omassert.h"
  31 #include "str.h"
  32 #include "stringutils.h"
  33 #include "xapian/error.h"
  34 #include "xapian/unicode.h"
  35
  36 // Include the list of token values lemon generates.
  37 #include "queryparser_token.h"
  38
  39 #include "word-breaker.h"
  40
  41 #include <algorithm>
  42 #include <cstring>
  43 #include <limits>
  44 #include <list>
  45 #include <string>
  46 #include <string_view>
  47 #include <vector>
  48
  49 // We create the yyParser on the stack.
  50 #define Parse_ENGINEALWAYSONSTACK
  51
  52 using namespace std;
  53
  54 using namespace Xapian;
  55
  56 static constexpr unsigned NO_EDIT_DISTANCE = unsigned(-1);
  57 static constexpr unsigned DEFAULT_EDIT_DISTANCE = 2;
  58
  59 inline bool
  60 U_isupper(unsigned ch) {
  61     return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
  62 }
  63
  64 inline bool
  65 U_isdigit(unsigned ch) {
  66     return ch < 128 && C_isdigit(static_cast<unsigned char>(ch));
  67 }
  68
  69 inline bool
  70 U_isalpha(unsigned ch) {
  71     return ch < 128 && C_isalpha(static_cast<unsigned char>(ch));
  72 }
  73
  74 using Xapian::Unicode::is_whitespace;
  75
  76 inline bool
  77 is_not_whitespace(unsigned ch) {
  78     return !is_whitespace(ch);
  79 }
  80
  81 using Xapian::Unicode::is_wordchar;
  82
  83 inline bool
  84 is_not_wordchar(unsigned ch) {
  85     return !is_wordchar(ch);
  86 }
  87
  88 inline bool
  89 is_digit(unsigned ch) {
  90     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
  91 }
  92
  93 // FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
  94 // and there's the risk of hyphens getting stuck onto the end of terms...
  95 //
  96 // There are currently assumptions below that this only matches ASCII
  97 // characters.
  98 inline bool
  99 is_suffix(unsigned ch) {
 100     return ch == '+' || ch == '#';
 101 }
 102
 103 inline bool
 104 is_double_quote(unsigned ch) {
 105     // We simply treat all double quotes as equivalent, which is a bit crude,
 106     // but it isn't clear that it would actually better to require them to
 107     // match up exactly.
 108     //
 109     // 0x201c is Unicode opening double quote.
 110     // 0x201d is Unicode closing double quote.
 111     return ch == '"' || ch == 0x201c || ch == 0x201d;
 112 }
 113
 114 inline bool
 115 prefix_needs_colon(const string & prefix, unsigned ch)
 116 {
 117     if (!U_isupper(ch) && ch != ':') return false;
 118     string::size_type len = prefix.length();
 119     return (len > 1 && prefix[len - 1] != ':');
 120 }
 121
 122 using Unicode::is_currency;
 123
 124 inline bool
 125 is_positional(Xapian::Query::op op)
 126 {
 127     return (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR);
 128 }
 129
 130 class Terms;
 131
 132 /** Class used to pass information about a token from lexer to parser.
 133  *
 134  *  Generally an instance of this class carries term information, but it can be
 135  *  used for a range query, and with some operators (e.g. the distance in
 136  *  NEAR/3 or ADJ/3, etc).
 137  */
 138 class Term {
 139     State * state;
 140
 141   public:
 142     string name;
 143     const FieldInfo * field_info;
 144     string unstemmed;
 145     QueryParser::stem_strategy stem;
 146     termpos pos;
 147     Query query;
 148     unsigned edit_distance;
 149
 150     Term(const string &name_, termpos pos_)
 151         : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
 152     explicit Term(const string &name_)
 153         : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
 154     Term(const string &name_, const FieldInfo * field_info_)
 155         : name(name_), field_info(field_info_),
 156           stem(QueryParser::STEM_NONE), pos(0) { }
 157     explicit Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
 158     Term(State * state_, const string &name_, const FieldInfo * field_info_,
 159          const string &unstemmed_,
 160          QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
 161          termpos pos_ = 0,
 162          unsigned edit_distance_ = NO_EDIT_DISTANCE)
 163         : state(state_), name(name_), field_info(field_info_),
 164           unstemmed(unstemmed_), stem(stem_), pos(pos_),
 165           edit_distance(edit_distance_) { }
 166     // For RANGE tokens.
 167     Term(const Xapian::Query & q, const string & grouping)
 168         : name(grouping), query(q) { }
 169
 170     string make_term(const string & prefix) const;
 171
 172     void need_positions() {
 173         if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
 174     }
 175
 176     termpos get_termpos() const { return pos; }
 177
 178     string get_grouping() const {
 179         return field_info->grouping;
 180     }
 181
 182     Query * as_fuzzy_query(State * state) const;
 183
 184     Query * as_wildcarded_query(State * state) const;
 185
 186     /** Build a query for a term at the very end of the query string when
 187      *  FLAG_PARTIAL is in use.
 188      *
 189      *  This query should match documents containing any terms which start with
 190      *  the characters specified, but should give a higher score to exact
 191      *  matches (since the user might have finished typing - we simply don't
 192      *  know).
 193      */
 194     Query * as_partial_query(State * state_) const;
 195
 196     /** Build a query for a string of words without explicit word breaks. */
 197     Query* as_unbroken_query() const;
 198
 199     /** Handle text without explicit word breaks in a positional context. */
 200     void as_positional_unbroken(Terms* terms) const;
 201
 202     /// Range query.
 203     Query as_range_query() const;
 204
 205     Query get_query() const;
 206
 207     Query get_query_with_synonyms() const;
 208
 209     Query get_query_with_auto_synonyms() const;
 210 };
 211
 212 /// Parser State shared between the lexer and the parser.
 213 class State {
 214     QueryParser::Internal * qpi;
 215
 216   public:
 217     Query query;
 218     const char* error = NULL;
 219     unsigned flags;
 220     Query::op effective_default_op;
 221
 222     State(QueryParser::Internal * qpi_, unsigned flags_)
 223         : qpi(qpi_), flags(flags_), effective_default_op(qpi_->default_op)
 224     {
 225         if ((flags & QueryParser::FLAG_NO_POSITIONS)) {
 226             if (is_positional(effective_default_op)) {
 227                 effective_default_op = Query::OP_AND;
 228                 }
 229         }
 230     }
 231
 232     string stem_term(const string &term) {
 233         return qpi->stemmer(term);
 234     }
 235
 236     void add_to_stoplist(const Term * term) {
 237         qpi->stoplist.push_back(term->name);
 238     }
 239
 240     void add_to_unstem(const string & term, const string & unstemmed) {
 241         qpi->unstem.insert(make_pair(term, unstemmed));
 242     }
 243
 244     Term * range(const string &a, const string &b) {
 245         for (auto i : qpi->rangeprocs) {
 246             Xapian::Query range_query = (i.proc)->check_range(a, b);
 247             Xapian::Query::op op = range_query.get_type();
 248             switch (op) {
 249                 case Xapian::Query::OP_INVALID:
 250                     break;
 251                 case Xapian::Query::OP_VALUE_RANGE:
 252                 case Xapian::Query::OP_VALUE_GE:
 253                 case Xapian::Query::OP_VALUE_LE:
 254                     if (i.default_grouping) {
 255                         Xapian::Internal::QueryValueBase * base =
 256                             static_cast<Xapian::Internal::QueryValueBase*>(
 257                                 range_query.internal.get());
 258                         Xapian::valueno slot = base->get_slot();
 259                         return new Term(range_query, str(slot));
 260                     }
 261                     // FALLTHRU
 262                 case Xapian::Query::LEAF_TERM:
 263                     return new Term(range_query, i.grouping);
 264                 default:
 265                     return new Term(range_query, string());
 266             }
 267         }
 268         return NULL;
 269     }
 270
 271     Query::op default_op() const {
 272         return effective_default_op;
 273     }
 274
 275     bool is_stopword(const Term *term) const {
 276         return qpi->stopper && (*qpi->stopper)(term->name);
 277     }
 278
 279     Database get_database() const {
 280         return qpi->db;
 281     }
 282
 283     const Stopper * get_stopper() const {
 284         return qpi->stopper.get();
 285     }
 286
 287     size_t stoplist_size() const {
 288         return qpi->stoplist.size();
 289     }
 290
 291     void stoplist_resize(size_t s) {
 292         qpi->stoplist.resize(s);
 293     }
 294
 295     Xapian::termcount get_max_wildcard_expansion() const {
 296         return qpi->max_wildcard_expansion;
 297     }
 298
 299     int get_max_wildcard_type() const {
 300         return qpi->max_wildcard_type;
 301     }
 302
 303     unsigned get_min_wildcard_prefix_len() const {
 304         return qpi->min_wildcard_prefix_len;
 305     }
 306
 307     Xapian::termcount get_max_partial_expansion() const {
 308         return qpi->max_partial_expansion;
 309     }
 310
 311     int get_max_partial_type() const {
 312         return qpi->max_partial_type;
 313     }
 314
 315     unsigned get_min_partial_prefix_len() const {
 316         return qpi->min_partial_prefix_len;
 317     }
 318
 319     Xapian::termcount get_max_fuzzy_expansion() const {
 320         return qpi->max_fuzzy_expansion;
 321     }
 322
 323     int get_max_fuzzy_type() const {
 324         return qpi->max_fuzzy_type;
 325     }
 326 };
 327
 328 string
 329 Term::make_term(const string & prefix) const
 330 {
 331     string term;
 332     if (stem != QueryParser::STEM_NONE && stem != QueryParser::STEM_ALL)
 333         term += 'Z';
 334     if (!prefix.empty()) {
 335         term += prefix;
 336         if (prefix_needs_colon(prefix, name[0])) term += ':';
 337     }
 338     if (stem != QueryParser::STEM_NONE) {
 339         term += state->stem_term(name);
 340     } else {
 341         term += name;
 342     }
 343
 344     if (!unstemmed.empty())
 345         state->add_to_unstem(term, unstemmed);
 346     return term;
 347 }
 348
 349 // Iterator shim to allow building a synonym query from a TermIterator pair.
 350 class SynonymIterator {
 351     Xapian::TermIterator i;
 352
 353     Xapian::termpos pos;
 354
 355     const Xapian::Query * first;
 356
 357   public:
 358     SynonymIterator(const Xapian::TermIterator & i_,
 359                     Xapian::termpos pos_ = 0,
 360                     const Xapian::Query * first_ = NULL)
 361         : i(i_), pos(pos_), first(first_) { }
 362
 363     SynonymIterator & operator++() {
 364         if (first)
 365             first = NULL;
 366         else
 367             ++i;
 368         return *this;
 369     }
 370
 371     const Xapian::Query operator*() const {
 372         if (first) return *first;
 373         return Xapian::Query(*i, 1, pos);
 374     }
 375
 376     bool operator==(const SynonymIterator & o) const {
 377         return i == o.i && first == o.first;
 378     }
 379
 380     bool operator!=(const SynonymIterator & o) const {
 381         return !(*this == o);
 382     }
 383
 384     typedef std::input_iterator_tag iterator_category;
 385     typedef Xapian::Query value_type;
 386     typedef Xapian::termcount_diff difference_type;
 387     typedef Xapian::Query * pointer;
 388     typedef Xapian::Query & reference;
 389 };
 390
 391 Query
 392 Term::get_query_with_synonyms() const
 393 {
 394     // Handle single-word synonyms with each prefix.
 395     const auto& prefixes = field_info->prefixes;
 396     if (prefixes.empty()) {
 397         Assert(field_info->proc);
 398         return (*field_info->proc)(name);
 399     }
 400
 401     Query q = get_query();
 402
 403     for (auto&& prefix : prefixes) {
 404         // First try the unstemmed term:
 405         string term;
 406         if (!prefix.empty()) {
 407             term += prefix;
 408             if (prefix_needs_colon(prefix, name[0])) term += ':';
 409         }
 410         term += name;
 411
 412         Xapian::Database db = state->get_database();
 413         Xapian::TermIterator syn = db.synonyms_begin(term);
 414         Xapian::TermIterator end = db.synonyms_end(term);
 415         if (syn == end && stem != QueryParser::STEM_NONE) {
 416             // If that has no synonyms, try the stemmed form:
 417             term = 'Z';
 418             if (!prefix.empty()) {
 419                 term += prefix;
 420                 if (prefix_needs_colon(prefix, name[0])) term += ':';
 421             }
 422             term += state->stem_term(name);
 423             syn = db.synonyms_begin(term);
 424             end = db.synonyms_end(term);
 425         }
 426         q = Query(q.OP_SYNONYM,
 427                   SynonymIterator(syn, pos, &q),
 428                   SynonymIterator(end));
 429     }
 430     return q;
 431 }
 432
 433 Query
 434 Term::get_query_with_auto_synonyms() const
 435 {
 436     const unsigned MASK_ENABLE_AUTO_SYNONYMS =
 437         QueryParser::FLAG_AUTO_SYNONYMS |
 438         QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 439     if (state->flags & MASK_ENABLE_AUTO_SYNONYMS)
 440         return get_query_with_synonyms();
 441
 442     return get_query();
 443 }
 444
 445 static void
 446 add_to_query(Query *& q, Query::op op, Query * term)
 447 {
 448     Assert(term);
 449     if (q) {
 450         if (op == Query::OP_OR) {
 451             *q |= *term;
 452         } else if (op == Query::OP_AND) {
 453             *q &= *term;
 454         } else {
 455             *q = Query(op, *q, *term);
 456         }
 457         delete term;
 458     } else {
 459         q = term;
 460     }
 461 }
 462
 463 static void
 464 add_to_query(Query *& q, Query::op op, const Query & term)
 465 {
 466     if (q) {
 467         if (op == Query::OP_OR) {
 468             *q |= term;
 469         } else if (op == Query::OP_AND) {
 470             *q &= term;
 471         } else {
 472             *q = Query(op, *q, term);
 473         }
 474     } else {
 475         q = new Query(term);
 476     }
 477 }
 478
 479 Query
 480 Term::get_query() const
 481 {
 482     const auto& prefixes = field_info->prefixes;
 483     if (prefixes.empty()) {
 484         Assert(field_info->proc);
 485         return (*field_info->proc)(name);
 486     }
 487     auto piter = prefixes.begin();
 488     Query q(make_term(*piter), 1, pos);
 489     while (++piter != prefixes.end()) {
 490         q |= Query(make_term(*piter), 1, pos);
 491     }
 492     return q;
 493 }
 494
 495 Query *
 496 Term::as_fuzzy_query(State* state_) const
 497 {
 498     const auto& prefixes = field_info->prefixes;
 499     Xapian::termcount max = state_->get_max_fuzzy_expansion();
 500     int query_flags = state_->get_max_fuzzy_type();
 501     vector<Query> subqs;
 502     subqs.reserve(prefixes.size());
 503     for (auto&& prefix : prefixes) {
 504         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 505         subqs.emplace_back(Query::OP_EDIT_DISTANCE,
 506                            prefix + name,
 507                            max,
 508                            query_flags,
 509                            Query::OP_OR,
 510                            edit_distance,
 511                            prefix.size());
 512     }
 513     Query* q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
 514     delete this;
 515     return q;
 516 }
 517
 518 Query *
 519 Term::as_wildcarded_query(State * state_) const
 520 {
 521     const auto& prefixes = field_info->prefixes;
 522     Xapian::termcount max = state_->get_max_wildcard_expansion();
 523     int query_flags = state_->get_max_wildcard_type();
 524     if (state_->flags & QueryParser::FLAG_WILDCARD_SINGLE)
 525         query_flags |= Query::WILDCARD_PATTERN_SINGLE;
 526     if (state_->flags & QueryParser::FLAG_WILDCARD_MULTI)
 527         query_flags |= Query::WILDCARD_PATTERN_MULTI;
 528     vector<Query> subqs;
 529     subqs.reserve(prefixes.size());
 530     for (string root : prefixes) {
 531         root += name;
 532         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 533         subqs.push_back(Query(Query::OP_WILDCARD, root, max, query_flags,
 534                               Query::OP_OR));
 535     }
 536     Query * q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
 537     delete this;
 538     return q;
 539 }
 540
 541 Query *
 542 Term::as_partial_query(State * state_) const
 543 {
 544     Xapian::termcount max = state_->get_max_partial_expansion();
 545     int max_type = state_->get_max_partial_type();
 546     vector<Query> subqs_partial; // A synonym of all the partial terms.
 547     vector<Query> subqs_full; // A synonym of all the full terms.
 548
 549     for (const string& prefix : field_info->prefixes) {
 550         string root = prefix;
 551         root += name;
 552         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 553         subqs_partial.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
 554                                       Query::OP_OR));
 555         // Add the term, as it would normally be handled, as an alternative.
 556         subqs_full.push_back(Query(make_term(prefix), 1, pos));
 557     }
 558     Query * q = new Query(Query::OP_OR,
 559                           Query(Query::OP_SYNONYM,
 560                                 subqs_partial.begin(), subqs_partial.end()),
 561                           Query(Query::OP_SYNONYM,
 562                                 subqs_full.begin(), subqs_full.end()));
 563     delete this;
 564     return q;
 565 }
 566
 567 Query *
 568 Term::as_unbroken_query() const
 569 {
 570     const auto& prefixes = field_info->prefixes;
 571     Query *q;
 572     vector<Query> prefix_subqs;
 573
 574 #ifdef USE_ICU
 575     if (state->flags & QueryParser::FLAG_WORD_BREAKS) {
 576         for (WordIterator tk(name); tk != WordIterator(); ++tk) {
 577             const string& token = *tk;
 578             for (const string& prefix : prefixes) {
 579                 prefix_subqs.push_back(Query(prefix + token, 1, pos));
 580             }
 581         }
 582
 583         q = new Query(Query::OP_AND, prefix_subqs.begin(), prefix_subqs.end());
 584
 585         delete this;
 586         return q;
 587     }
 588 #endif
 589
 590     vector<Query> ngram_subqs;
 591
 592     for (const string& prefix : prefixes) {
 593         for (NgramIterator tk(name); tk != NgramIterator(); ++tk) {
 594             ngram_subqs.push_back(Query(prefix + *tk, 1, pos));
 595         }
 596         prefix_subqs.push_back(Query(Query::OP_AND,
 597                                      ngram_subqs.begin(), ngram_subqs.end()));
 598         ngram_subqs.clear();
 599     }
 600     q = new Query(Query::OP_OR, prefix_subqs.begin(), prefix_subqs.end());
 601
 602     delete this;
 603     return q;
 604 }
 605
 606 Query
 607 Term::as_range_query() const
 608 {
 609     Query q = query;
 610     delete this;
 611     return q;
 612 }
 613
 614 inline bool
 615 is_phrase_generator(unsigned ch)
 616 {
 617     // These characters generate a phrase search.
 618     // Ordered mostly by frequency of calls to this function done when
 619     // running the testcases in api_queryparser.cc.
 620     return (ch && ch < 128 && strchr(".-/:\\@", ch) != NULL);
 621 }
 622
 623 inline bool
 624 is_stem_preventer(unsigned ch)
 625 {
 626     return (ch && ch < 128 && strchr("(/\\@<>=*[{\"", ch) != NULL);
 627 }
 628
 629 inline bool
 630 should_stem(const string & term)
 631 {
 632     const unsigned int SHOULD_STEM_MASK =
 633         (1 << Unicode::LOWERCASE_LETTER) |
 634         (1 << Unicode::TITLECASE_LETTER) |
 635         (1 << Unicode::MODIFIER_LETTER) |
 636         (1 << Unicode::OTHER_LETTER);
 637     Utf8Iterator u(term);
 638     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
 639 }
 640
 641 /** Value representing "ignore this" when returned by check_infix() or
 642  *  check_infix_digit().
 643  */
 644 const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
 645
 646 inline unsigned check_infix(unsigned ch) {
 647     if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
 648         // Unicode includes all these except '&' in its word boundary rules,
 649         // as well as 0x2019 (which we handle below) and ':' (for Swedish
 650         // apparently, but we ignore this for now as it's problematic in
 651         // real world cases).
 652         return ch;
 653     }
 654     if (ch >= 0x200b) {
 655         // 0x2019 is Unicode apostrophe and single closing quote.
 656         // 0x201b is Unicode single opening quote with the tail rising.
 657         if (ch == 0x2019 || ch == 0x201b)
 658             return '\'';
 659         if (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff)
 660             return UNICODE_IGNORE;
 661     }
 662     return 0;
 663 }
 664
 665 inline unsigned check_infix_digit(unsigned ch) {
 666     // This list of characters comes from Unicode's word identifying algorithm.
 667     switch (ch) {
 668         case ',':
 669         case '.':
 670         case ';':
 671         case 0x037e: // GREEK QUESTION MARK
 672         case 0x0589: // ARMENIAN FULL STOP
 673         case 0x060D: // ARABIC DATE SEPARATOR
 674         case 0x07F8: // NKO COMMA
 675         case 0x2044: // FRACTION SLASH
 676         case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
 677         case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
 678         case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
 679             return ch;
 680     }
 681     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
 682         return UNICODE_IGNORE;
 683     return 0;
 684 }
 685
 686 // Prototype a function lemon generates, but which we want to call before that
 687 // in the generated source code file.
 688 struct yyParser;
 689 static void yy_parse_failed(yyParser *);
 690
 691 void
 692 QueryParser::Internal::add_prefix(string_view field, string_view prefix)
 693 {
 694     // Allow for optional trailing `:` for consistency with how range prefixes
 695     // are specified.
 696     if (!field.empty() && field.back() == ':') {
 697         field = field.substr(0, field.size() - 1);
 698     }
 699 #ifdef  __cpp_lib_associative_heterogeneous_insertion // C++26
 700     auto [it, inserted] = field_map.try_emplace(field, NON_BOOLEAN);
 701 #else
 702     auto [it, inserted] = field_map.try_emplace(string(field), NON_BOOLEAN);
 703 #endif
 704     auto&& p = it->second;
 705     if (inserted) {
 706         p.append(prefix);
 707         return;
 708     }
 709
 710     // Check that this is the same type of filter as the existing one(s).
 711     if (p.type != NON_BOOLEAN) {
 712         throw Xapian::InvalidOperationError("Can't use add_prefix() and "
 713                                             "add_boolean_prefix() on the "
 714                                             "same field name, or "
 715                                             "add_boolean_prefix() with "
 716                                             "different values of the "
 717                                             "'exclusive' parameter");
 718     }
 719     if (p.proc)
 720         throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects "
 721                                               "and string prefixes currently "
 722                                               "not supported");
 723     // Only add if it's not already there as duplicate entries just result
 724     // in redundant query terms.  This is a linear scan so makes calling
 725     // add_prefix() n times for the same field value with different prefix
 726     // values O(n²), but you wouldn't realistically want to map one field
 727     // to more than a handful of prefixes.
 728     auto& prefixes = p.prefixes;
 729     if (find(prefixes.begin(), prefixes.end(), prefix) == prefixes.end()) {
 730         p.append(prefix);
 731     }
 732 }
 733
 734 void
 735 QueryParser::Internal::add_prefix(string_view field, FieldProcessor* proc)
 736 {
 737     // Allow for optional trailing `:` for consistency with how range prefixes
 738     // are specified.
 739     if (!field.empty() && field.back() == ':') {
 740         field = field.substr(0, field.size() - 1);
 741     }
 742 #ifdef  __cpp_lib_associative_heterogeneous_insertion // C++26
 743     auto [it, inserted] = field_map.try_emplace(field,
 744                                                 NON_BOOLEAN, proc);
 745 #else
 746     auto [it, inserted] = field_map.try_emplace(string(field),
 747                                                 NON_BOOLEAN, proc);
 748 #endif
 749     if (inserted)
 750         return;
 751
 752     auto&& p = it->second;
 753     // Check that this is the same type of filter as the existing one(s).
 754     if (p.type != NON_BOOLEAN) {
 755         throw Xapian::InvalidOperationError("Can't use add_prefix() and "
 756                                             "add_boolean_prefix() on the "
 757                                             "same field name, or "
 758                                             "add_boolean_prefix() with "
 759                                             "different values of the "
 760                                             "'exclusive' parameter");
 761     }
 762     if (!p.prefixes.empty())
 763         throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects "
 764                                               "and string prefixes currently "
 765                                               "not supported");
 766     throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects "
 767                                           "for the same prefix currently not "
 768                                           "supported");
 769 }
 770
 771 void
 772 QueryParser::Internal::add_boolean_prefix(string_view field,
 773                                           string_view prefix,
 774                                           const string* grouping)
 775 {
 776     // Allow for optional trailing `:` for consistency with how range prefixes
 777     // are specified.
 778     if (!field.empty() && field.back() == ':') {
 779         field = field.substr(0, field.size() - 1);
 780     }
 781     // Don't allow the empty prefix to be set as boolean as it doesn't
 782     // really make sense.
 783     if (field.empty())
 784         throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
 785     // If grouping == nullptr then it defaults to field which is not empty().
 786     bool inclusive = grouping && grouping->empty();
 787     filter_type type = inclusive ? BOOLEAN : BOOLEAN_EXCLUSIVE;
 788 #ifdef __cpp_lib_associative_heterogeneous_insertion // C++26
 789     auto [it, inserted] = field_map.try_emplace(field, type,
 790                                                 grouping ? *grouping : field);
 791 #else
 792     auto [it, inserted] = field_map.try_emplace(string(field), type,
 793                                                 grouping ? *grouping : field);
 794 #endif
 795     auto&& p = it->second;
 796     if (inserted) {
 797         p.append(prefix);
 798         return;
 799     }
 800
 801     // Check that this is the same type of filter as the existing one(s).
 802     if (p.type != type) {
 803         throw Xapian::InvalidOperationError("Can't use add_prefix() and "
 804                                             "add_boolean_prefix() on the "
 805                                             "same field name, or "
 806                                             "add_boolean_prefix() with "
 807                                             "different values of the "
 808                                             "'exclusive' parameter"); // FIXME
 809     }
 810     if (p.proc)
 811         throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects "
 812                                               "and string prefixes currently "
 813                                               "not supported");
 814     // Only add if it's not already there as duplicate entries just result
 815     // in redundant query terms.  This is a linear scan so makes calling
 816     // add_prefix() n times for the same field value with different prefix
 817     // values O(n²), but you wouldn't realistically want to map one field
 818     // to more than a handful of prefixes.
 819     auto& prefixes = p.prefixes;
 820     if (find(prefixes.begin(), prefixes.end(), prefix) == prefixes.end()) {
 821         prefixes.emplace_back(prefix); // FIXME grouping
 822     }
 823 }
 824
 825 void
 826 QueryParser::Internal::add_boolean_prefix(string_view field,
 827                                           FieldProcessor *proc,
 828                                           const string* grouping)
 829 {
 830     // Allow for optional trailing `:` for consistency with how range prefixes
 831     // are specified.
 832     if (!field.empty() && field.back() == ':') {
 833         field = field.substr(0, field.size() - 1);
 834     }
 835     // Don't allow the empty prefix to be set as boolean as it doesn't
 836     // really make sense.
 837     if (field.empty())
 838         throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
 839     // If grouping == nullptr then it defaults to field which is not empty().
 840     bool inclusive = grouping && grouping->empty();
 841     filter_type type = inclusive ? BOOLEAN : BOOLEAN_EXCLUSIVE;
 842 #ifdef __cpp_lib_associative_heterogeneous_insertion // C++26
 843     auto [it, inserted] = field_map.try_emplace(field, type, proc,
 844                                                 grouping ? *grouping : field);
 845 #else
 846     auto [it, inserted] = field_map.try_emplace(string(field), type, proc,
 847                                                 grouping ? *grouping : field);
 848 #endif
 849     if (inserted)
 850         return;
 851
 852     auto&& p = it->second;
 853     // Check that this is the same type of filter as the existing one(s).
 854     if (p.type != type) {
 855         throw Xapian::InvalidOperationError("Can't use add_prefix() and "
 856                                             "add_boolean_prefix() on the "
 857                                             "same field name, or "
 858                                             "add_boolean_prefix() with "
 859                                             "different values of the "
 860                                             "'exclusive' parameter"); // FIXME
 861     }
 862     if (!p.prefixes.empty())
 863         throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects "
 864                                               "and string prefixes currently "
 865                                               "not supported");
 866     throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects "
 867                                           "for the same prefix currently not "
 868                                           "supported");
 869 }
 870
 871 inline bool
 872 is_extended_wildcard(unsigned ch, unsigned flags)
 873 {
 874     if (ch == '*') return (flags & QueryParser::FLAG_WILDCARD_MULTI);
 875     if (ch == '?') return (flags & QueryParser::FLAG_WILDCARD_SINGLE);
 876     return false;
 877 }
 878
 879 string
 880 QueryParser::Internal::parse_term(Utf8Iterator& it, const Utf8Iterator& end,
 881                                   bool try_word_break, unsigned flags,
 882                                   bool& needs_word_break, bool& was_acronym,
 883                                   size_t& first_wildcard,
 884                                   size_t& char_count,
 885                                   unsigned& edit_distance)
 886 {
 887     string term;
 888     char_count = 0;
 889     // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
 890     // Don't worry if there's a trailing '.' or not.
 891     if (U_isupper(*it)) {
 892         string t;
 893         Utf8Iterator p = it;
 894         do {
 895             Unicode::append_utf8(t, *p++);
 896             ++char_count;
 897         } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
 898         // One letter does not make an acronym!  If we handled a single
 899         // uppercase letter here, we wouldn't catch M&S below.
 900         if (t.length() > 1) {
 901             // Check there's not a (lower case) letter or digit
 902             // immediately after it.
 903             // FIXME: should I.B.M..P.T.O be a range search?
 904             if (p == end || !is_wordchar(*p)) {
 905                 it = p;
 906                 swap(term, t);
 907             } else {
 908                 char_count = 0;
 909             }
 910         }
 911     }
 912     was_acronym = !term.empty();
 913
 914     if (try_word_break && term.empty() && is_unbroken_script(*it)) {
 915         const char* start = it.raw();
 916         char_count = get_unbroken(it);
 917         term.assign(start, it.raw() - start);
 918         needs_word_break = true;
 919     }
 920
 921     if (term.empty()) {
 922         unsigned prevch = *it;
 923         if (first_wildcard == term.npos &&
 924             is_extended_wildcard(prevch, flags)) {
 925             // Leading wildcard.
 926             first_wildcard = 0;
 927         }
 928         Unicode::append_utf8(term, prevch);
 929         char_count = 1;
 930         while (++it != end) {
 931             if (try_word_break && is_unbroken_script(*it)) break;
 932             unsigned ch = *it;
 933             if (is_extended_wildcard(ch, flags)) {
 934                 if (first_wildcard == term.npos) {
 935                     first_wildcard = char_count;
 936                 }
 937             } else if (!is_wordchar(ch)) {
 938                 // Treat a single embedded '&' or "'" or similar as a word
 939                 // character (e.g. AT&T, Fred's).  Also, normalise
 940                 // apostrophes to ASCII apostrophe.
 941                 Utf8Iterator p = it;
 942                 ++p;
 943                 if (p == end) break;
 944                 unsigned nextch = *p;
 945                 if (is_extended_wildcard(nextch, flags)) {
 946                     // A wildcard follows, which could expand to a digit or a non-digit.
 947                     unsigned ch_orig = ch;
 948                     ch = check_infix(ch);
 949                     if (!ch && is_digit(prevch))
 950                         ch = check_infix_digit(ch_orig);
 951                     if (!ch)
 952                         break;
 953                 } else {
 954                     if (!is_wordchar(nextch)) break;
 955                 }
 956                 if (is_digit(prevch) && is_digit(nextch)) {
 957                     ch = check_infix_digit(ch);
 958                 } else {
 959                     ch = check_infix(ch);
 960                 }
 961                 if (!ch) break;
 962                 if (ch == UNICODE_IGNORE)
 963                     continue;
 964             }
 965             Unicode::append_utf8(term, ch);
 966             ++char_count;
 967             prevch = ch;
 968         }
 969         if (it != end && is_suffix(*it)) {
 970             string suff_term = term;
 971             Utf8Iterator p = it;
 972             // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
 973             do {
 974                 // Assumes is_suffix() only matches ASCII.
 975                 if (suff_term.size() - term.size() == 3) {
 976                     suff_term.resize(0);
 977                     break;
 978                 }
 979                 suff_term += *p;
 980             } while (is_suffix(*++p));
 981             if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
 982                 // If the suffixed term doesn't exist, check that the
 983                 // non-suffixed term does.  This also takes care of
 984                 // the case when QueryParser::set_database() hasn't
 985                 // been called.
 986                 bool use_suff_term = false;
 987                 string lc = Unicode::tolower(suff_term);
 988                 if (db.term_exists(lc)) {
 989                     use_suff_term = true;
 990                 } else {
 991                     lc = Unicode::tolower(term);
 992                     if (!db.term_exists(lc)) use_suff_term = true;
 993                 }
 994                 if (use_suff_term) {
 995                     // Assumes is_suffix() only matches ASCII.
 996                     char_count += (suff_term.size() - term.size());
 997                     term = suff_term;
 998                     it = p;
 999                 }
1000             }
1001         }
1002         if (first_wildcard == term.npos &&
1003             (flags & QueryParser::FLAG_WILDCARD)) {
1004             // Check for right-truncation.
1005             if (it != end && *it == '*') {
1006                 ++it;
1007                 first_wildcard = char_count;
1008             }
1009         }
1010         if (it != end &&
1011             (flags & QueryParser::FLAG_FUZZY) &&
1012             // Not a wildcard.
1013             first_wildcard == string::npos &&
1014             *it == '~') {
1015             Utf8Iterator p = it;
1016             ++p;
1017             unsigned ch = *p;
1018             if (p == end || is_whitespace(ch) || ch == ')') {
1019                 it = p;
1020                 edit_distance = DEFAULT_EDIT_DISTANCE;
1021             } else if (U_isdigit(ch)) {
1022                 unsigned distance = ch - '0';
1023                 while (++p != end && U_isdigit(*p)) {
1024                     distance = distance * 10 + (*p - '0');
1025                 }
1026                 if (p != end && *p == '.') {
1027                     if (distance == 0) goto fractional;
1028                     // Ignore the fractional part on e.g. foo~12.5
1029                     while (++p != end && U_isdigit(*p)) { }
1030                 }
1031                 if (p == end || is_whitespace(ch) || ch == ')') {
1032                     it = p;
1033                     edit_distance = distance;
1034                 }
1035             } else if (ch == '.') {
1036 fractional:
1037                 double fraction = 0.0;
1038                 double digit = 0.1;
1039                 while (++p != end && U_isdigit(*p)) {
1040                     fraction += digit * (*p - '0');
1041                     digit *= 0.1;
1042                 }
1043                 if (p == end || is_whitespace(ch) || ch == ')') {
1044                     it = p;
1045                     unsigned codepoints = 0;
1046                     for (Utf8Iterator u8(term); u8 != Utf8Iterator(); ++u8) {
1047                         ++codepoints;
1048                     }
1049                     edit_distance = unsigned(codepoints * fraction);
1050                 }
1051             }
1052         }
1053     }
1054     return term;
1055 }
1056
1057 }
1058 // Switch to %code to insert at the end of the file so struct yyParser has been
1059 // defined.
1060 %code {
1061
1062 Query
1063 QueryParser::Internal::parse_query(string_view qs, unsigned flags,
1064                                    const string& default_prefix)
1065 {
1066 #ifndef USE_ICU
1067     // Overall it seems best to check for this up front - otherwise we create
1068     // the unhelpful situation where a failure to enable ICU in the build could
1069     // be missed because queries in scripts which don't need word splitting
1070     // still work fine.
1071     if (flags & FLAG_WORD_BREAKS) {
1072         throw Xapian::FeatureUnavailableError("FLAG_WORD_BREAKS requires "
1073                                               "building Xapian to use ICU");
1074     }
1075 #endif
1076     bool try_word_break =
1077         (flags & (FLAG_NGRAMS|FLAG_WORD_BREAKS)) || is_ngram_enabled();
1078
1079     // Set ranges if we may have to handle ranges in the query.
1080     bool ranges = !rangeprocs.empty() && (qs.find("..") != string::npos);
1081
1082     termpos term_pos = 1;
1083     Utf8Iterator it(qs), end;
1084
1085     State state(this, flags);
1086
1087     // To successfully apply more than one spelling correction to a query
1088     // string, we must keep track of the offset due to previous corrections.
1089     int correction_offset = 0;
1090     corrected_query.resize(0);
1091
1092     // Stack of prefixes, used for phrases and subexpressions.
1093     list<const FieldInfo *> prefix_stack;
1094
1095     // If default_prefix is specified, use it.  Otherwise, use any list
1096     // that has been set for the empty prefix.
1097     const FieldInfo def_pfx = FieldInfo{NON_BOOLEAN}.append(default_prefix);
1098     {
1099         const FieldInfo * default_field_info = &def_pfx;
1100         if (default_prefix.empty()) {
1101             auto f = field_map.find(string_view{});
1102             if (f != field_map.end()) default_field_info = &(f->second);
1103         }
1104
1105         // We always have the current prefix on the top of the stack.
1106         prefix_stack.push_back(default_field_info);
1107     }
1108
1109     yyParser parser;
1110
1111     unsigned newprev = ' ';
1112 main_lex_loop:
1113     enum {
1114         DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
1115         IN_GROUP2, EXPLICIT_SYNONYM
1116     } mode = DEFAULT;
1117     while (it != end && !state.error) {
1118         bool last_was_operator = false;
1119         bool last_was_operator_needing_term = false;
1120         if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
1121         if (false) {
1122 just_had_operator:
1123             if (it == end) break;
1124             mode = DEFAULT;
1125             last_was_operator_needing_term = false;
1126             last_was_operator = true;
1127         }
1128         if (false) {
1129 just_had_operator_needing_term:
1130             last_was_operator_needing_term = true;
1131             last_was_operator = true;
1132         }
1133         if (mode == IN_PHRASED_TERM) mode = DEFAULT;
1134         if (is_whitespace(*it)) {
1135             newprev = ' ';
1136             ++it;
1137             it = find_if(it, end, is_not_whitespace);
1138             if (it == end) break;
1139         }
1140
1141         if (ranges &&
1142             (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
1143             // Scan forward to see if this could be the "start of range"
1144             // token.  Sadly this has O(n²) tendencies, though at least
1145             // "n" is the number of words in a query which is likely to
1146             // remain fairly small.  FIXME: can we tokenise more elegantly?
1147             Utf8Iterator it_initial = it;
1148             Utf8Iterator p = it;
1149             unsigned ch = 0;
1150             while (p != end) {
1151                 if (ch == '.' && *p == '.') {
1152                     string a;
1153                     while (it != p) {
1154                         Unicode::append_utf8(a, *it++);
1155                     }
1156                     // Trim off the trailing ".".
1157                     a.resize(a.size() - 1);
1158                     ++p;
1159                     // Either end of the range can be empty (for an open-ended
1160                     // range) but both can't be empty.
1161                     if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
1162                         string b;
1163                         // Allow any character except whitespace and ')' in the
1164                         // upper bound.
1165                         while (p != end && *p > ' ' && *p != ')') {
1166                             Unicode::append_utf8(b, *p++);
1167                         }
1168                         Term * range = state.range(a, b);
1169                         if (!range) {
1170                             state.error = "Unknown range operation";
1171                             if (a.find(':', 1) == string::npos) {
1172                                 goto done;
1173                             }
1174                             // Might be a boolean filter with ".." in.  Leave
1175                             // state.error in case it isn't.
1176                             it = it_initial;
1177                             break;
1178                         }
1179                         Parse(&parser, RANGE, range, &state);
1180                     }
1181                     it = p;
1182                     goto main_lex_loop;
1183                 }
1184                 ch = *p;
1185                 // Allow any character except whitespace and '(' in the lower
1186                 // bound.
1187                 if (ch <= ' ' || ch == '(') break;
1188                 ++p;
1189             }
1190         }
1191
1192         if (!is_wordchar(*it)) {
1193             unsigned prev = newprev;
1194             Utf8Iterator p = it;
1195             unsigned ch = *it++;
1196             newprev = ch;
1197             // Drop out of IN_GROUP mode.
1198             if (mode == IN_GROUP || mode == IN_GROUP2)
1199                 mode = DEFAULT;
1200             switch (ch) {
1201               case '"':
1202               case 0x201c: // Left curly double quote.
1203               case 0x201d: // Right curly double quote.
1204                 // Quoted phrase.
1205                 if (mode == DEFAULT) {
1206                     // Skip whitespace.
1207                     it = find_if(it, end, is_not_whitespace);
1208                     if (it == end) {
1209                         // Ignore an unmatched " at the end of the query to
1210                         // avoid generating an empty pair of QUOTEs which will
1211                         // cause a parse error.
1212                         goto done;
1213                     }
1214                     if (is_double_quote(*it)) {
1215                         // Ignore empty "" (but only if we're not already
1216                         // IN_QUOTES as we don't merge two adjacent quoted
1217                         // phrases!)
1218                         newprev = *it++;
1219                         break;
1220                     }
1221                 }
1222                 if (flags & QueryParser::FLAG_PHRASE) {
1223                     if (ch == '"' && it != end && *it == '"') {
1224                         ++it;
1225                         // Handle "" inside a quoted phrase as an escaped " for
1226                         // consistency with quoted boolean terms.
1227                         break;
1228                     }
1229                     Parse(&parser, QUOTE, NULL, &state);
1230                     if (mode == DEFAULT) {
1231                         mode = IN_QUOTES;
1232                     } else {
1233                         // Remove the prefix we pushed for this phrase.
1234                         if (mode == IN_PREFIXED_QUOTES)
1235                             prefix_stack.pop_back();
1236                         mode = DEFAULT;
1237                     }
1238                 }
1239                 break;
1240
1241               case '+': case '-': // Loved or hated term/phrase/subexpression.
1242                 // Ignore + or - at the end of the query string.
1243                 if (it == end) goto done;
1244                 if (prev > ' ' && prev != '(') {
1245                     // Or if not after whitespace or an open bracket.
1246                     break;
1247                 }
1248                 if (is_whitespace(*it) || *it == '+' || *it == '-') {
1249                     // Ignore + or - followed by a space, or further + or -.
1250                     // Postfix + (such as in C++ and H+) is handled as part of
1251                     // the term lexing code in parse_term().
1252                     newprev = *it++;
1253                     break;
1254                 }
1255                 if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
1256                     int token;
1257                     if (ch == '+') {
1258                         token = LOVE;
1259                     } else if (last_was_operator) {
1260                         token = HATE_AFTER_AND;
1261                     } else {
1262                         token = HATE;
1263                     }
1264                     Parse(&parser, token, NULL, &state);
1265                     goto just_had_operator_needing_term;
1266                 }
1267                 // Need to prevent the term after a LOVE or HATE starting a
1268                 // term group...
1269                 break;
1270
1271               case '(': // Bracketed subexpression.
1272                 // Skip whitespace.
1273                 it = find_if(it, end, is_not_whitespace);
1274                 // Ignore ( at the end of the query string.
1275                 if (it == end) goto done;
1276                 if (prev > ' ' && strchr("()+-", prev) == NULL) {
1277                     // Or if not after whitespace or a bracket or '+' or '-'.
1278                     break;
1279                 }
1280                 if (*it == ')') {
1281                     // Ignore empty ().
1282                     newprev = *it++;
1283                     break;
1284                 }
1285                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1286                     prefix_stack.push_back(prefix_stack.back());
1287                     Parse(&parser, BRA, NULL, &state);
1288                 }
1289                 break;
1290
1291               case ')': // End of bracketed subexpression.
1292                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1293                     // Remove the prefix we pushed for the corresponding BRA.
1294                     // If brackets are unmatched, it's a syntax error, but
1295                     // that's no excuse to SEGV!
1296                     if (prefix_stack.size() > 1) prefix_stack.pop_back();
1297                     Parse(&parser, KET, NULL, &state);
1298                 }
1299                 break;
1300
1301               case '~': // Synonym expansion.
1302                 // Ignore at the end of the query string.
1303                 if (it == end) goto done;
1304                 if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
1305                     if (prev > ' ' && strchr("+-(", prev) == NULL) {
1306                         // Or if not after whitespace, +, -, or an open bracket.
1307                         break;
1308                     }
1309                     if (!is_wordchar(*it) && !is_double_quote(*it)) {
1310                         // Ignore if not followed by a word character.
1311                         break;
1312                     }
1313                     Parse(&parser, SYNONYM, NULL, &state);
1314                     mode = EXPLICIT_SYNONYM;
1315                     if (!is_double_quote(*it))
1316                         goto just_had_operator_needing_term;
1317
1318                     // Support ~"foo bar" syntax to explicitly expand
1319                     // a multi-word synonym.
1320
1321                     // Skip whitespace.
1322                     ++it;
1323                     it = find_if(it, end, is_not_whitespace);
1324                     if (it == end) {
1325                         // Ignore an unmatched " at the end of the query to
1326                         // avoid generating an empty pair of QUOTEs which will
1327                         // cause a parse error.
1328                         goto done;
1329                     }
1330                     if (is_double_quote(*it)) {
1331                         // Ignore empty ~"".
1332                         newprev = *it++;
1333                         break;
1334                     }
1335                     Parse(&parser, QUOTE, NULL, &state);
1336                     mode = IN_QUOTES;
1337                 }
1338                 break;
1339               case '*':
1340                 if (flags & FLAG_WILDCARD_MULTI) {
1341                     it = p;
1342                     goto leading_wildcard;
1343                 }
1344                 break;
1345               case '?':
1346                 if (flags & FLAG_WILDCARD_SINGLE) {
1347                     it = p;
1348                     goto leading_wildcard;
1349                 }
1350                 break;
1351             }
1352             // Skip any other characters.
1353             continue;
1354         }
1355
1356         Assert(is_wordchar(*it));
1357
1358 leading_wildcard:
1359         size_t term_start_index = it.raw() - qs.data();
1360
1361         newprev = 'A'; // Any letter will do...
1362
1363         // A term, a prefix, or a boolean operator.
1364         const FieldInfo * field_info = NULL;
1365         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
1366             !field_map.empty()) {
1367             // Check for a fieldname prefix (e.g. title:historical).
1368             Utf8Iterator p = find_if(it, end, is_not_wordchar);
1369             if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
1370                 string field;
1371                 p = it;
1372                 while (*p != ':')
1373                     Unicode::append_utf8(field, *p++);
1374                 auto f = field_map.find(field);
1375                 if (f != field_map.end()) {
1376                     // Special handling for prefixed fields, depending on the
1377                     // type of the prefix.
1378                     unsigned ch = *++p;
1379                     field_info = &(f->second);
1380
1381                     if (field_info->type != NON_BOOLEAN) {
1382                         // Drop out of IN_GROUP if we're in it.
1383                         if (mode == IN_GROUP || mode == IN_GROUP2)
1384                             mode = DEFAULT;
1385                         it = p;
1386                         string name;
1387                         if (it != end && is_double_quote(*it)) {
1388                             // Quoted boolean term (can contain any character).
1389                             bool fancy = (*it != '"');
1390                             ++it;
1391                             while (it != end) {
1392                                 if (*it == '"') {
1393                                     // Interpret "" as an escaped ".
1394                                     if (++it == end || *it != '"')
1395                                         break;
1396                                 } else if (fancy && is_double_quote(*it)) {
1397                                     // If the opening quote was ASCII, then the
1398                                     // closing one must be too - otherwise
1399                                     // the user can't protect non-ASCII double
1400                                     // quote characters by quoting or escaping.
1401                                     ++it;
1402                                     break;
1403                                 }
1404                                 Unicode::append_utf8(name, *it++);
1405                             }
1406                         } else {
1407                             // Can't boolean filter prefix a subexpression, so
1408                             // just use anything following the prefix until the
1409                             // next space or ')' as part of the boolean filter
1410                             // term.
1411                             while (it != end && *it > ' ' && *it != ')')
1412                                 Unicode::append_utf8(name, *it++);
1413                         }
1414                         // Build the unstemmed form in field.
1415                         field += ':';
1416                         field += name;
1417                         // Clear any pending range error.
1418                         state.error = NULL;
1419                         Term * token = new Term(&state, name, field_info, field);
1420                         Parse(&parser, BOOLEAN_FILTER, token, &state);
1421                         continue;
1422                     }
1423
1424                     if ((flags & FLAG_PHRASE) && is_double_quote(ch)) {
1425                         // Prefixed phrase, e.g.: subject:"space flight"
1426                         mode = IN_PREFIXED_QUOTES;
1427                         Parse(&parser, QUOTE, NULL, &state);
1428                         it = p;
1429                         newprev = ch;
1430                         ++it;
1431                         prefix_stack.push_back(field_info);
1432                         continue;
1433                     }
1434
1435                     if (ch == '(' && (flags & FLAG_BOOLEAN)) {
1436                         // Prefixed subexpression, e.g.: title:(fast NEAR food)
1437                         mode = DEFAULT;
1438                         Parse(&parser, BRA, NULL, &state);
1439                         it = p;
1440                         newprev = ch;
1441                         ++it;
1442                         prefix_stack.push_back(field_info);
1443                         continue;
1444                     }
1445
1446                     if (ch != ':') {
1447                         // Allow 'path:/usr/local' but not 'foo::bar::baz'.
1448                         while (is_phrase_generator(ch)) {
1449                             if (++p == end)
1450                                 goto not_prefix;
1451                             ch = *p;
1452                         }
1453                     }
1454
1455                     if (is_wordchar(ch)) {
1456                         // Prefixed term.
1457                         it = p;
1458                     } else {
1459 not_prefix:
1460                         // It looks like a prefix but isn't, so parse it as
1461                         // text instead.
1462                         field_info = NULL;
1463                     }
1464                 }
1465             }
1466         }
1467
1468 phrased_term:
1469         bool was_acronym;
1470         bool needs_word_break = false;
1471         size_t first_wildcard = string::npos;
1472         size_t term_char_count;
1473         unsigned edit_distance = NO_EDIT_DISTANCE;
1474         string term = parse_term(it, end, try_word_break, flags,
1475                                  needs_word_break, was_acronym, first_wildcard,
1476                                  term_char_count, edit_distance);
1477
1478         if (first_wildcard == string::npos &&
1479             edit_distance == NO_EDIT_DISTANCE &&
1480             (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
1481             (flags & FLAG_BOOLEAN) &&
1482             // Don't want to interpret A.N.D. as an AND operator.
1483             !was_acronym &&
1484             !field_info &&
1485             term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {
1486             // Boolean operators.
1487             string op = term;
1488             if (flags & FLAG_BOOLEAN_ANY_CASE) {
1489                 for (string::iterator i = op.begin(); i != op.end(); ++i) {
1490                     *i = C_toupper(*i);
1491                 }
1492             }
1493             if (op.size() == 3) {
1494                 if (op == "AND") {
1495                     Parse(&parser, AND, NULL, &state);
1496                     goto just_had_operator;
1497                 }
1498                 if (op == "NOT") {
1499                     Parse(&parser, NOT, NULL, &state);
1500                     goto just_had_operator;
1501                 }
1502                 if (op == "XOR") {
1503                     Parse(&parser, XOR, NULL, &state);
1504                     goto just_had_operator;
1505                 }
1506                 if (op == "ADJ") {
1507                     if (it != end && *it == '/') {
1508                         size_t width = 0;
1509                         Utf8Iterator p = it;
1510                         while (++p != end && U_isdigit(*p)) {
1511                             width = (width * 10) + (*p - '0');
1512                         }
1513                         if (width && (p == end || is_whitespace(*p))) {
1514                             it = p;
1515                             Parse(&parser, ADJ, new Term(width), &state);
1516                             goto just_had_operator;
1517                         }
1518                     } else {
1519                         Parse(&parser, ADJ, NULL, &state);
1520                         goto just_had_operator;
1521                     }
1522                 }
1523                 if (op == "SYN") {
1524                     Parse(&parser, SYN, NULL, &state);
1525                     goto just_had_operator;
1526                 }
1527             } else if (op.size() == 2) {
1528                 if (op == "OR") {
1529                     Parse(&parser, OR, NULL, &state);
1530                     goto just_had_operator;
1531                 }
1532             } else if (op.size() == 4) {
1533                 if (op == "NEAR") {
1534                     if (it != end && *it == '/') {
1535                         size_t width = 0;
1536                         Utf8Iterator p = it;
1537                         while (++p != end && U_isdigit(*p)) {
1538                             width = (width * 10) + (*p - '0');
1539                         }
1540                         if (width && (p == end || is_whitespace(*p))) {
1541                             it = p;
1542                             Parse(&parser, NEAR, new Term(width), &state);
1543                             goto just_had_operator;
1544                         }
1545                     } else {
1546                         Parse(&parser, NEAR, NULL, &state);
1547                         goto just_had_operator;
1548                     }
1549                 }
1550             }
1551         }
1552
1553         // If no prefix is set, use the default one.
1554         if (!field_info) field_info = prefix_stack.back();
1555
1556         Assert(field_info->type == NON_BOOLEAN);
1557
1558         {
1559             string unstemmed_term(term);
1560             term = Unicode::tolower(term);
1561
1562             // Reuse stem_strategy - STEM_SOME here means "stem terms except
1563             // when used with positional operators".
1564             stem_strategy stem_term = stem_action;
1565             if (stem_term != STEM_NONE) {
1566                 if (stemmer.is_none()) {
1567                     stem_term = STEM_NONE;
1568                 } else if (first_wildcard != string::npos ||
1569                            edit_distance != NO_EDIT_DISTANCE) {
1570                     stem_term = STEM_NONE;
1571                 } else if (stem_term == STEM_SOME ||
1572                            stem_term == STEM_SOME_FULL_POS) {
1573                     if (!should_stem(unstemmed_term) ||
1574                         (it != end && is_stem_preventer(*it))) {
1575                         // Don't stem this particular term.
1576                         stem_term = STEM_NONE;
1577                     }
1578                 }
1579             }
1580
1581             if (first_wildcard != string::npos) {
1582                 if (first_wildcard < state.get_min_wildcard_prefix_len()) {
1583                     errmsg = "Too few characters before wildcard";
1584                     return state.query;
1585                 }
1586             }
1587
1588             Term * term_obj = new Term(&state, term, field_info,
1589                                        unstemmed_term, stem_term, term_pos++,
1590                                        edit_distance);
1591
1592             if (first_wildcard != string::npos ||
1593                 edit_distance != NO_EDIT_DISTANCE) {
1594                 if (mode == IN_GROUP || mode == IN_GROUP2) {
1595                     // Drop out of IN_GROUP and flag that the group
1596                     // can be empty if all members are stopwords.
1597                     if (mode == IN_GROUP2)
1598                         Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1599                     mode = DEFAULT;
1600                 }
1601                 Parse(&parser,
1602                       first_wildcard != string::npos ? WILD_TERM : EDIT_TERM,
1603                       term_obj,
1604                       &state);
1605                 continue;
1606             }
1607
1608             if (needs_word_break) {
1609                 Parse(&parser, UNBROKEN_WORDS, term_obj, &state);
1610                 // Drop out of IN_GROUP mode.
1611                 if (mode == IN_GROUP || mode == IN_GROUP2)
1612                     mode = DEFAULT;
1613                 if (it == end) break;
1614                 continue;
1615             }
1616
1617             if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1618                 if (it == end && (flags & FLAG_PARTIAL)) {
1619                     auto min_len = state.get_min_partial_prefix_len();
1620                     if (term_char_count >= min_len) {
1621                         if (mode == IN_GROUP || mode == IN_GROUP2) {
1622                             // Drop out of IN_GROUP and flag that the group
1623                             // can be empty if all members are stopwords.
1624                             if (mode == IN_GROUP2)
1625                                 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1626                             mode = DEFAULT;
1627                         }
1628                         // Final term of a partial match query, with no
1629                         // following characters - treat as a wildcard.
1630                         Parse(&parser, PARTIAL_TERM, term_obj, &state);
1631                         continue;
1632                     }
1633                 }
1634             }
1635
1636             // Check spelling, if we're a normal term, and any of the prefixes
1637             // are empty.
1638             if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
1639                 const auto& prefixes = field_info->prefixes;
1640                 for (const string& prefix : prefixes) {
1641                     if (!prefix.empty())
1642                         continue;
1643                     const string & suggest = db.get_spelling_suggestion(term);
1644                     if (!suggest.empty()) {
1645                         if (corrected_query.empty()) corrected_query = qs;
1646                         size_t term_end_index = it.raw() - qs.data();
1647                         size_t n = term_end_index - term_start_index;
1648                         size_t pos = UNSIGNED_OVERFLOW_OK(term_start_index + correction_offset);
1649                         corrected_query.replace(pos, n, suggest);
1650                         UNSIGNED_OVERFLOW_OK(correction_offset += suggest.size());
1651                         UNSIGNED_OVERFLOW_OK(correction_offset -= n);
1652                     }
1653                     break;
1654                 }
1655             }
1656
1657             if (mode == IN_PHRASED_TERM) {
1658                 Parse(&parser, PHR_TERM, term_obj, &state);
1659             } else {
1660                 // See if the next token will be PHR_TERM - if so, this one
1661                 // needs to be TERM not GROUP_TERM.
1662                 if ((mode == IN_GROUP || mode == IN_GROUP2) &&
1663                     is_phrase_generator(*it)) {
1664                     // FIXME: can we clean this up?
1665                     Utf8Iterator p = it;
1666                     do {
1667                         ++p;
1668                     } while (p != end && is_phrase_generator(*p));
1669                     // Don't generate a phrase unless the phrase generators are
1670                     // immediately followed by another term.
1671                     if (p != end && is_wordchar(*p)) {
1672                         mode = DEFAULT;
1673                     }
1674                 }
1675
1676                 int token = TERM;
1677                 if (mode == IN_GROUP || mode == IN_GROUP2) {
1678                     mode = IN_GROUP2;
1679                     token = GROUP_TERM;
1680                 }
1681                 Parse(&parser, token, term_obj, &state);
1682                 if (token == TERM && mode != DEFAULT)
1683                     continue;
1684             }
1685         }
1686
1687         if (it == end) break;
1688
1689         if (is_phrase_generator(*it)) {
1690             // Skip multiple phrase generators.
1691             do {
1692                 ++it;
1693             } while (it != end && is_phrase_generator(*it));
1694             // Don't generate a phrase unless the phrase generators are
1695             // immediately followed by another term.
1696             if (it != end && is_wordchar(*it)) {
1697                 mode = IN_PHRASED_TERM;
1698                 term_start_index = it.raw() - qs.data();
1699                 goto phrased_term;
1700             }
1701         } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1702             int old_mode = mode;
1703             mode = DEFAULT;
1704             if (!last_was_operator_needing_term && is_whitespace(*it)) {
1705                 newprev = ' ';
1706                 // Skip multiple whitespace.
1707                 do {
1708                     ++it;
1709                 } while (it != end && is_whitespace(*it));
1710                 // Don't generate a group unless the terms are only separated
1711                 // by whitespace.
1712                 if (it != end && is_wordchar(*it)) {
1713                     if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
1714                         mode = IN_GROUP2;
1715                     } else {
1716                         mode = IN_GROUP;
1717                     }
1718                 }
1719             }
1720         }
1721     }
1722 done:
1723     if (!state.error) {
1724         // Implicitly close any unclosed quotes.
1725         if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
1726             Parse(&parser, QUOTE, NULL, &state);
1727
1728         // Implicitly close all unclosed brackets.
1729         while (prefix_stack.size() > 1) {
1730             Parse(&parser, KET, NULL, &state);
1731             prefix_stack.pop_back();
1732         }
1733         Parse(&parser, 0, NULL, &state);
1734     }
1735
1736     errmsg = state.error;
1737     return state.query;
1738 }
1739
1740 }
1741 %include {
1742
1743 struct ProbQuery {
1744     Query* query = NULL;
1745     Query* love = NULL;
1746     Query* hate = NULL;
1747     // filter is a map from prefix to a query for that prefix.  Queries with
1748     // the same prefix are combined with OR, and the results of this are
1749     // combined with AND to get the full filter.
1750     map<string, Query> filter;
1751
1752     ProbQuery() {}
1753
1754     explicit
1755     ProbQuery(Query* query_) : query(query_) {}
1756
1757     ~ProbQuery() {
1758         delete query;
1759         delete love;
1760         delete hate;
1761     }
1762
1763     void add_filter(const string& grouping, const Query & q) {
1764         filter[grouping] = q;
1765     }
1766
1767     void append_filter(const string& grouping, const Query & qnew) {
1768         auto it = filter.find(grouping);
1769         if (it == filter.end()) {
1770             filter.insert(make_pair(grouping, qnew));
1771         } else {
1772             Query & q = it->second;
1773             // We OR multiple filters with the same prefix if they're
1774             // exclusive, otherwise we AND them.
1775             bool exclusive = !grouping.empty();
1776             if (exclusive) {
1777                 q |= qnew;
1778             } else {
1779                 q &= qnew;
1780             }
1781         }
1782     }
1783
1784     void add_filter_range(const string& grouping, const Query & range) {
1785         filter[grouping] = range;
1786     }
1787
1788     void append_filter_range(const string& grouping, const Query & range) {
1789         Query & q = filter[grouping];
1790         q |= range;
1791     }
1792
1793     Query merge_filters() const {
1794         auto i = filter.begin();
1795         Assert(i != filter.end());
1796         Query q = i->second;
1797         while (++i != filter.end()) {
1798             q &= i->second;
1799         }
1800         return q;
1801     }
1802 };
1803
1804 /// A group of terms separated only by whitespace.
1805 class TermGroup {
1806     vector<Term *> terms;
1807
1808     /** Controls how to handle a group where all terms are stopwords.
1809      *
1810      *  If true, then as_group() returns NULL.  If false, then the
1811      *  stopword status of the terms is ignored.
1812      */
1813     bool empty_ok;
1814
1815     TermGroup(Term* t1, Term* t2) : empty_ok(false) {
1816         add_term(t1);
1817         add_term(t2);
1818     }
1819
1820   public:
1821     /// Factory function - ensures heap allocation.
1822     static TermGroup* create(Term* t1, Term* t2) {
1823         return new TermGroup(t1, t2);
1824     }
1825
1826     ~TermGroup() {
1827         for (auto&& t : terms) {
1828             delete t;
1829         }
1830     }
1831
1832     /// Add a Term object to this TermGroup object.
1833     void add_term(Term * term) {
1834         terms.push_back(term);
1835     }
1836
1837     /// Set the empty_ok flag.
1838     void set_empty_ok() { empty_ok = true; }
1839
1840     /// Convert to a Xapian::Query * using default_op.
1841     Query * as_group(State *state) const;
1842 };
1843
1844 Query *
1845 TermGroup::as_group(State *state) const
1846 {
1847     const Xapian::Stopper * stopper = state->get_stopper();
1848     size_t stoplist_size = state->stoplist_size();
1849     bool default_op_is_positional = is_positional(state->default_op());
1850 reprocess:
1851     Query::op default_op = state->default_op();
1852     vector<Query> subqs;
1853     subqs.reserve(terms.size());
1854     if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
1855         // Check for multi-word synonyms.
1856         Database db = state->get_database();
1857
1858         string key;
1859         vector<Term*>::size_type begin = 0;
1860         vector<Term*>::size_type i = begin;
1861         while (terms.size() - i > 0) {
1862             size_t longest_match = 0;
1863             // This value is never used, but GCC 4.8 warns with
1864             // -Wmaybe-uninitialized (GCC 5.4 doesn't).
1865             vector<Term*>::size_type longest_match_end = 0;
1866             if (terms.size() - i >= 2) {
1867                 // Greedily try to match as many consecutive words as possible.
1868                 key = terms[i]->name;
1869                 key += ' ';
1870                 key += terms[i + 1]->name;
1871                 TermIterator synkey(db.synonym_keys_begin(key));
1872                 TermIterator synend(db.synonym_keys_end(key));
1873                 if (synkey != synend) {
1874                     longest_match = key.size();
1875                     longest_match_end = i + 2;
1876                     for (auto j = i + 2; j < terms.size(); ++j) {
1877                         key += ' ';
1878                         key += terms[j]->name;
1879                         synkey.skip_to(key);
1880                         if (synkey == synend)
1881                             break;
1882                         const string& found = *synkey;
1883                         if (!startswith(found, key))
1884                             break;
1885                         if (found.size() == key.size()) {
1886                             longest_match = key.size();
1887                             longest_match_end = j + 1;
1888                         }
1889                     }
1890                 }
1891             }
1892             if (longest_match == 0) {
1893                 // No multi-synonym matches at position i.
1894                 if (stopper && (*stopper)(terms[i]->name)) {
1895                     state->add_to_stoplist(terms[i]);
1896                 } else {
1897                     if (default_op_is_positional)
1898                         terms[i]->need_positions();
1899                     subqs.push_back(terms[i]->get_query_with_auto_synonyms());
1900                 }
1901                 begin = ++i;
1902                 continue;
1903             }
1904             i = longest_match_end;
1905             key.resize(longest_match);
1906
1907             vector<Query> subqs2;
1908             for (auto j = begin; j != i; ++j) {
1909                 if (stopper && (*stopper)(terms[j]->name)) {
1910                     state->add_to_stoplist(terms[j]);
1911                 } else {
1912                     if (default_op_is_positional)
1913                         terms[i]->need_positions();
1914                     subqs2.push_back(terms[j]->get_query());
1915                 }
1916             }
1917             Query q_original_terms;
1918             if (default_op_is_positional) {
1919                 q_original_terms = Query(default_op,
1920                                          subqs2.begin(), subqs2.end(),
1921                                          subqs2.size() + 9);
1922             } else {
1923                 q_original_terms = Query(default_op,
1924                                          subqs2.begin(), subqs2.end());
1925             }
1926             subqs2.clear();
1927
1928             // Use the position of the first term for the synonyms.
1929             TermIterator syn = db.synonyms_begin(key);
1930             Query q(Query::OP_SYNONYM,
1931                     SynonymIterator(syn, terms[begin]->pos, &q_original_terms),
1932                     SynonymIterator(db.synonyms_end(key)));
1933             subqs.push_back(q);
1934
1935             begin = i;
1936         }
1937     } else {
1938         vector<Term*>::const_iterator i;
1939         for (i = terms.begin(); i != terms.end(); ++i) {
1940             if (stopper && (*stopper)((*i)->name)) {
1941                 state->add_to_stoplist(*i);
1942             } else {
1943                 if (default_op_is_positional)
1944                     (*i)->need_positions();
1945                 subqs.push_back((*i)->get_query_with_auto_synonyms());
1946             }
1947         }
1948     }
1949
1950     if (!empty_ok && stopper && subqs.empty() &&
1951         stoplist_size < state->stoplist_size()) {
1952         // This group is all stopwords, so roll-back, disable stopper
1953         // temporarily, and reprocess this group.
1954         state->stoplist_resize(stoplist_size);
1955         stopper = NULL;
1956         goto reprocess;
1957     }
1958
1959     Query * q = NULL;
1960     if (!subqs.empty()) {
1961         if (default_op_is_positional) {
1962             q = new Query(default_op, subqs.begin(), subqs.end(),
1963                              subqs.size() + 9);
1964         } else {
1965             q = new Query(default_op, subqs.begin(), subqs.end());
1966         }
1967     }
1968     delete this;
1969     return q;
1970 }
1971
1972 /// Some terms which form a positional sub-query.
1973 class Terms {
1974     vector<Term *> terms;
1975
1976     /** Window size.
1977      *
1978      *  size_t(-1) means don't use positional info (so an OP_AND query gets
1979      *  created).
1980      */
1981     size_t window;
1982
1983     /** Keep track of whether the terms added all have the same list of
1984      *  prefixes.  If so, we'll build a set of phrases, one using each prefix.
1985      *  This works around the limitation that a phrase cannot have multiple
1986      *  components which are "OR" combinations of terms, but is also probably
1987      *  what users expect: i.e., if a user specifies a phrase in a field, and
1988      *  that field maps to multiple prefixes, the user probably wants a phrase
1989      *  returned with all terms having one of those prefixes, rather than a
1990      *  phrase comprised of terms with differing prefixes.
1991      */
1992     bool uniform_prefixes;
1993
1994     /** The list of prefixes of the terms added.
1995      *  This will be NULL if the terms have different prefixes.
1996      */
1997     const vector<string>* prefixes;
1998
1999     Query opwindow_subq(Query::op op,
2000                         const vector<Query>& v,
2001                         Xapian::termcount w) const {
2002         if (op == Query::OP_AND) {
2003             return Query(op, v.begin(), v.end());
2004         }
2005         return Query(op, v.begin(), v.end(), w);
2006     }
2007
2008     /// Convert to a query using the given operator and window size.
2009     Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
2010         if (window == size_t(-1)) op = Query::OP_AND;
2011         Query * q = NULL;
2012         size_t n_terms = terms.size();
2013         Xapian::termcount w = w_delta + terms.size();
2014         if (uniform_prefixes) {
2015             if (prefixes) {
2016                 for (auto&& prefix : *prefixes) {
2017                     vector<Query> subqs;
2018                     subqs.reserve(n_terms);
2019                     for (Term* t : terms) {
2020                         subqs.push_back(Query(t->make_term(prefix), 1, t->pos));
2021                     }
2022                     add_to_query(q, Query::OP_OR, opwindow_subq(op, subqs, w));
2023                 }
2024             }
2025         } else {
2026             vector<Query> subqs;
2027             subqs.reserve(n_terms);
2028             for (Term* t : terms) {
2029                 subqs.push_back(t->get_query());
2030             }
2031             q = new Query(opwindow_subq(op, subqs, w));
2032         }
2033
2034         delete this;
2035         return q;
2036     }
2037
2038     explicit Terms(bool no_pos)
2039         : window(no_pos ? size_t(-1) : 0),
2040           uniform_prefixes(true),
2041           prefixes(NULL) { }
2042
2043   public:
2044     /// Factory function - ensures heap allocation.
2045     static Terms* create(State* state) {
2046         return new Terms(state->flags & QueryParser::FLAG_NO_POSITIONS);
2047     }
2048
2049     ~Terms() {
2050         for (auto&& t : terms) {
2051             delete t;
2052         }
2053     }
2054
2055     /// Add an unstemmed Term object to this Terms object.
2056     void add_positional_term(Term * term) {
2057         const auto& term_prefixes = term->field_info->prefixes;
2058         if (terms.empty()) {
2059             prefixes = &term_prefixes;
2060         } else if (uniform_prefixes && prefixes != &term_prefixes) {
2061             if (*prefixes != term_prefixes)  {
2062                 prefixes = NULL;
2063                 uniform_prefixes = false;
2064             }
2065         }
2066         term->need_positions();
2067         terms.push_back(term);
2068     }
2069
2070     void adjust_window(size_t alternative_window) {
2071         if (alternative_window > window) window = alternative_window;
2072     }
2073
2074     /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
2075     Query * as_phrase_query() const {
2076         return as_opwindow_query(Query::OP_PHRASE, 0);
2077     }
2078
2079     /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
2080     Query* as_synonym_phrase_query(State* state) const {
2081         string name;
2082         termpos pos = 0;
2083         for (Term* t : terms) {
2084             if (!name.empty()) {
2085                 name += ' ';
2086             } else {
2087                 pos = t->pos;
2088             }
2089             name += t->name;
2090         }
2091
2092         for (auto&& prefix : *prefixes) {
2093             // Only try unstemmed for multi-word.
2094             string term;
2095             if (!prefix.empty()) {
2096                 term += prefix;
2097                 if (prefix_needs_colon(prefix, name[0])) term += ':';
2098             }
2099             term += name;
2100
2101             Xapian::Database db = state->get_database();
2102             Xapian::TermIterator syn = db.synonyms_begin(term);
2103             Xapian::TermIterator end = db.synonyms_end(term);
2104
2105             // Caution: this does `delete this;`!
2106             Query* q = as_opwindow_query(Query::OP_PHRASE, 0);
2107             // FIXME: Is this right when there's more than one entry in
2108             // prefixes?
2109             Query* q2 = new Query(q->OP_SYNONYM,
2110                                   SynonymIterator(syn, pos, q),
2111                                   SynonymIterator(end));
2112             delete q;
2113             return q2;
2114             // FIXME: Handle multiple prefixes properly...
2115         }
2116         return new Query();
2117     }
2118
2119     /// Convert to a Xapian::Query * using OP_NEAR.
2120     Query * as_near_query() const {
2121         // The common meaning of 'a NEAR b' is "a within 10 terms of b", which
2122         // means a window size of 11.  For more than 2 terms, we just add one
2123         // to the window size for each extra term.
2124         size_t w = window;
2125         if (w == 0) w = 10;
2126         return as_opwindow_query(Query::OP_NEAR, w - 1);
2127     }
2128
2129     /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
2130     Query * as_adj_query() const {
2131         // The common meaning of 'a ADJ b' is "a at most 10 terms before b",
2132         // which means a window size of 11.  For more than 2 terms, we just add
2133         // one to the window size for each extra term.
2134         size_t w = window;
2135         if (w == 0) w = 10;
2136         return as_opwindow_query(Query::OP_PHRASE, w - 1);
2137     }
2138 };
2139
2140 void
2141 Term::as_positional_unbroken(Terms* terms) const
2142 {
2143 #ifdef USE_ICU
2144     if (state->flags & QueryParser::FLAG_WORD_BREAKS) {
2145         for (WordIterator tk(name); tk != WordIterator(); ++tk) {
2146             const string& t = *tk;
2147             Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
2148             terms->add_positional_term(c);
2149         }
2150         delete this;
2151         return;
2152     }
2153 #endif
2154     // Add each individual character to the phrase.
2155     string t;
2156     for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
2157         Unicode::append_utf8(t, *it);
2158         Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
2159         terms->add_positional_term(c);
2160         t.resize(0);
2161     }
2162
2163     // FIXME: we want to add the n-grams as filters too for efficiency.
2164
2165     delete this;
2166 }
2167
2168 // Helper macro to check for missing arguments to a boolean operator.
2169 #define VET_BOOL_ARGS(A, B, OP_TXT) \
2170     do {\
2171         if (!A || !B) {\
2172             state->error = "Syntax: <expression> " OP_TXT " <expression>";\
2173             yy_parse_failed(yypParser);\
2174             return;\
2175         }\
2176     } while (0)
2177
2178 }
2179
2180 %token_type {Term *}
2181 %token_destructor { delete $$; }
2182
2183 %extra_argument {State * state}
2184
2185 %parse_failure {
2186     // If we've not already set an error message, set a default one.
2187     if (!state->error) state->error = "parse error";
2188 }
2189
2190 %syntax_error {
2191     yy_parse_failed(yypParser);
2192 }
2193
2194 // Operators, grouped in order of increasing precedence:
2195 %nonassoc ERROR.
2196 %left OR.
2197 %left XOR.
2198 %left AND NOT.
2199 %left NEAR ADJ.
2200 %left LOVE HATE HATE_AFTER_AND SYNONYM.
2201 %left SYN.
2202
2203 // Destructors for terminal symbols:
2204
2205 // TERM is a query term, including prefix (if any).
2206 %destructor TERM { delete $$; }
2207
2208 // GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
2209 // is only separated by whitespace characters.
2210 %destructor GROUP_TERM { delete $$; }
2211
2212 // PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
2213 // separated only by one or more phrase generator characters (hyphen and
2214 // apostrophe are common examples - see is_phrase_generator() for the list
2215 // of all punctuation which does this).
2216 %destructor PHR_TERM { delete $$; }
2217
2218 // EDIT_TERM is like a TERM, but with an edit distance.
2219 %destructor EDIT_TERM { delete $$; }
2220
2221 // WILD_TERM is like a TERM, but with wildcards which needs to be expanded.
2222 %destructor WILD_TERM { delete $$; }
2223
2224 // PARTIAL_TERM is like a TERM, but it's at the end of the query string and
2225 // we're doing "search as you type".  It expands to something like:
2226 //     WILD_TERM($$+"*") OR stemmed_form
2227 %destructor PARTIAL_TERM { delete $$; }
2228
2229 // BOOLEAN_FILTER is a query term with a prefix registered using
2230 // add_boolean_prefix().  It's added to the query using an OP_FILTER operator,
2231 // (or OP_AND_NOT if it's negated) e.g. site:xapian.org or -site:xapian.org
2232 %destructor BOOLEAN_FILTER { delete $$; }
2233
2234 // Grammar rules:
2235
2236 // query - The whole query - just an expr or nothing.
2237
2238 // query non-terminal doesn't need a type, so just give a dummy one.
2239 %type query {int}
2240
2241 query ::= expr(E). {
2242     // Save the parsed query in the State structure so we can return it.
2243     if (E) {
2244         state->query = *E;
2245         delete E;
2246     } else {
2247         state->query = Query();
2248     }
2249 }
2250
2251 query ::= . {
2252     // Handle a query string with no terms in.
2253     state->query = Query();
2254 }
2255
2256 // expr - A query expression.
2257
2258 %type expr {Query *}
2259 %destructor expr { delete $$; }
2260
2261 expr(E) ::= prob_expr(E).
2262
2263 expr(A) ::= bool_arg(A) AND bool_arg(B). {
2264     VET_BOOL_ARGS(A, B, "AND");
2265     *A &= *B;
2266     delete B;
2267 }
2268
2269 expr(A) ::= bool_arg(A) NOT bool_arg(B). {
2270     if (!A && (state->flags & QueryParser::FLAG_PURE_NOT)) {
2271         // 'NOT foo' -> '(0 * <alldocuments>) NOT foo'
2272         //
2273         // We scale the <alldocuments> by 0 so it doesn't count towards the
2274         // number of matching subqueries since that allows the query optimiser
2275         // to eliminate it if other subqueries are combined in an AND-like
2276         // way (e.g. 'bar AND (NOT foo)').
2277         A = new Query(0.0, Query(string(), 1, 0));
2278     }
2279     VET_BOOL_ARGS(A, B, "NOT");
2280     *A &= ~*B;
2281     delete B;
2282 }
2283
2284 expr(A) ::= bool_arg(A) AND NOT bool_arg(B). [NOT] {
2285     VET_BOOL_ARGS(A, B, "AND NOT");
2286     *A &= ~*B;
2287     delete B;
2288 }
2289
2290 expr(A) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND] {
2291     VET_BOOL_ARGS(A, B, "AND");
2292     *A &= ~*B;
2293     delete B;
2294 }
2295
2296 expr(A) ::= bool_arg(A) OR bool_arg(B). {
2297     VET_BOOL_ARGS(A, B, "OR");
2298     *A |= *B;
2299     delete B;
2300 }
2301
2302 expr(A) ::= bool_arg(A) XOR bool_arg(B). {
2303     VET_BOOL_ARGS(A, B, "XOR");
2304     *A ^= *B;
2305     delete B;
2306 }
2307
2308 expr(A) ::= bool_arg(A) SYN bool_arg(B). {
2309     VET_BOOL_ARGS(A, B, "SYN");
2310     *A = Query(Query::OP_SYNONYM, *A, *B);
2311     delete B;
2312 }
2313
2314 // bool_arg - an argument to a boolean operator such as AND or OR.
2315
2316 %type bool_arg {Query *}
2317 %destructor bool_arg { delete $$; }
2318
2319 bool_arg(A) ::= expr(A).
2320
2321 bool_arg(A) ::= . [ERROR] {
2322     // Set the argument to NULL, which enables the bool_arg-using rules in
2323     // expr above to report uses of AND, OR, etc which don't have two
2324     // arguments.
2325     A = NULL;
2326 }
2327
2328 // prob_expr - a single compound term, or a prob.
2329
2330 %type prob_expr {Query *}
2331 %destructor prob_expr { delete $$; }
2332
2333 prob_expr(E) ::= prob(P). {
2334     E = P->query;
2335     P->query = NULL;
2336     // Handle any "+ terms".
2337     if (P->love) {
2338         if (P->love->empty()) {
2339             // +<nothing>.
2340             delete E;
2341             E = P->love;
2342         } else if (E) {
2343             swap(E, P->love);
2344             add_to_query(E, Query::OP_AND_MAYBE, P->love);
2345         } else {
2346             E = P->love;
2347         }
2348         P->love = NULL;
2349     }
2350     // Handle any boolean filters.
2351     if (!P->filter.empty()) {
2352         if (E) {
2353             add_to_query(E, Query::OP_FILTER, P->merge_filters());
2354         } else {
2355             // Make the query a boolean one.
2356             E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
2357         }
2358     }
2359     // Handle any "- terms".
2360     if (P->hate && !P->hate->empty()) {
2361         if (!E) {
2362             // Can't just hate!
2363             yy_parse_failed(yypParser);
2364             return;
2365         }
2366         *E = Query(Query::OP_AND_NOT, *E, *P->hate);
2367     }
2368     delete P;
2369 }
2370
2371 prob_expr(E) ::= term(E).
2372
2373 // prob - a sub-expression consisting of stop_terms, "+" terms, "-" terms,
2374 // boolean filters, and/or ranges.
2375 //
2376 // Note: stop_term can also be several other things other than a simple term!
2377
2378 %type prob {ProbQuery *}
2379 %destructor prob { delete $$; }
2380
2381 prob(P) ::= RANGE(R). {
2382     string grouping = R->name;
2383     const Query & range = R->as_range_query();
2384     P = new ProbQuery; /*P-overwrites-R*/
2385     P->add_filter_range(grouping, range);
2386 }
2387
2388 prob(P) ::= stop_prob(P) RANGE(R). {
2389     string grouping = R->name;
2390     const Query & range = R->as_range_query();
2391     P->append_filter_range(grouping, range);
2392 }
2393
2394 prob(P) ::= stop_term(T) stop_term(U). {
2395     P = new ProbQuery(T); /*P-overwrites-T*/
2396     if (U) {
2397         Query::op op = state->default_op();
2398         if (P->query && is_positional(op)) {
2399             // If default_op is OP_NEAR or OP_PHRASE, set the window size to
2400             // 11 for the first pair of terms and it will automatically grow
2401             // by one for each subsequent term.
2402             Query * subqs[2] = { P->query, U };
2403             *(P->query) = Query(op, subqs, subqs + 2, 11);
2404             delete U;
2405         } else {
2406             add_to_query(P->query, op, U);
2407         }
2408     }
2409 }
2410
2411 prob(P) ::= prob(P) stop_term(T). {
2412     // If T is a stopword, there's nothing to do here.
2413     if (T) add_to_query(P->query, state->default_op(), T);
2414 }
2415
2416 prob(P) ::= LOVE term(T). {
2417     P = new ProbQuery;
2418     if (state->default_op() == Query::OP_AND) {
2419         P->query = T;
2420     } else {
2421         P->love = T;
2422     }
2423 }
2424
2425 prob(P) ::= stop_prob(P) LOVE term(T). {
2426     if (state->default_op() == Query::OP_AND) {
2427         /* The default op is AND, so we just put loved terms into the query
2428          * (in this case the only effect of love is to ignore the stopword
2429          * list). */
2430         add_to_query(P->query, Query::OP_AND, T);
2431     } else {
2432         add_to_query(P->love, Query::OP_AND, T);
2433     }
2434 }
2435
2436 prob(P) ::= HATE term(T). {
2437     P = new ProbQuery;
2438     P->hate = T;
2439 }
2440
2441 prob(P) ::= stop_prob(P) HATE term(T). {
2442     add_to_query(P->hate, Query::OP_OR, T);
2443 }
2444
2445 prob(P) ::= HATE BOOLEAN_FILTER(T). {
2446     P = new ProbQuery;
2447     P->hate = new Query(T->get_query());
2448     delete T;
2449 }
2450
2451 prob(P) ::= stop_prob(P) HATE BOOLEAN_FILTER(T). {
2452     add_to_query(P->hate, Query::OP_OR, T->get_query());
2453     delete T;
2454 }
2455
2456 prob(P) ::= BOOLEAN_FILTER(T). {
2457     P = new ProbQuery;
2458     P->add_filter(T->get_grouping(), T->get_query());
2459     delete T;
2460 }
2461
2462 prob(P) ::= stop_prob(P) BOOLEAN_FILTER(T). {
2463     P->append_filter(T->get_grouping(), T->get_query());
2464     delete T;
2465 }
2466
2467 prob(P) ::= LOVE BOOLEAN_FILTER(T). {
2468     // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
2469     P = new ProbQuery;
2470     P->filter[T->get_grouping()] = T->get_query();
2471     delete T;
2472 }
2473
2474 prob(P) ::= stop_prob(P) LOVE BOOLEAN_FILTER(T). {
2475     // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
2476     // We OR filters with the same prefix...
2477     Query & q = P->filter[T->get_grouping()];
2478     q |= T->get_query();
2479     delete T;
2480 }
2481
2482 // stop_prob - A prob or a stop_term.
2483
2484 %type stop_prob {ProbQuery *}
2485 %destructor stop_prob { delete $$; }
2486
2487 stop_prob(P) ::= prob(P).
2488
2489 stop_prob(P) ::= stop_term(T). {
2490     P = new ProbQuery(T); /*P-overwrites-T*/
2491 }
2492
2493 // stop_term - A term which should be checked against the stopword list,
2494 // or a compound_term.
2495 //
2496 // If a term is loved, hated, or in a phrase, we don't want to consult the
2497 // stopword list, so stop_term isn't used there (instead term is).
2498
2499 %type stop_term {Query *}
2500 %destructor stop_term { delete $$; }
2501
2502 stop_term(T) ::= TERM(U). {
2503     if (state->is_stopword(U)) {
2504         T = NULL;
2505         state->add_to_stoplist(U);
2506     } else {
2507         T = new Query(U->get_query_with_auto_synonyms());
2508     }
2509     delete U;
2510 }
2511
2512 stop_term(T) ::= compound_term(T).
2513
2514 // term - A term or a compound_term.
2515
2516 %type term {Query *}
2517 %destructor term { delete $$; }
2518
2519 term(T) ::= TERM(U). {
2520     T = new Query(U->get_query_with_auto_synonyms());
2521     delete U;
2522 }
2523
2524 term(T) ::= compound_term(T).
2525
2526 // compound_term - A WILD_TERM, a quoted phrase (with or without prefix), a
2527 // phrased_term, group, near_expr, adj_expr, or a bracketed subexpression (with
2528 // or without prefix).
2529
2530 %type compound_term {Query *}
2531 %destructor compound_term { delete $$; }
2532
2533 compound_term(T) ::= EDIT_TERM(U).
2534         { T = U->as_fuzzy_query(state); /*T-overwrites-U*/ }
2535
2536 compound_term(T) ::= WILD_TERM(U).
2537         { T = U->as_wildcarded_query(state); /*T-overwrites-U*/ }
2538
2539 compound_term(T) ::= PARTIAL_TERM(U).
2540         { T = U->as_partial_query(state); /*T-overwrites-U*/ }
2541
2542 compound_term(T) ::= QUOTE phrase(P) QUOTE.
2543         { T = P->as_phrase_query(); }
2544
2545 compound_term(T) ::= phrased_term(P).
2546         { T = P->as_phrase_query(); /*T-overwrites-P*/ }
2547
2548 compound_term(T) ::= group(P).
2549         { T = P->as_group(state); /*T-overwrites-P*/ }
2550
2551 compound_term(T) ::= near_expr(P).
2552         { T = P->as_near_query(); /*T-overwrites-P*/ }
2553
2554 compound_term(T) ::= adj_expr(P).
2555         { T = P->as_adj_query(); /*T-overwrites-P*/ }
2556
2557 compound_term(T) ::= BRA expr(E) KET.
2558         { T = E; }
2559
2560 compound_term(T) ::= SYNONYM TERM(U). {
2561     T = new Query(U->get_query_with_synonyms());
2562     delete U;
2563 }
2564
2565 compound_term(T) ::= SYNONYM QUOTE phrase(P) QUOTE.
2566     { T = P->as_synonym_phrase_query(state); }
2567
2568 compound_term(T) ::= UNBROKEN_WORDS(U). {
2569     { T = U->as_unbroken_query(); /*T-overwrites-U*/ }
2570 }
2571
2572 // phrase - The "inside the quotes" part of a double-quoted phrase.
2573
2574 %type phrase {Terms *}
2575
2576 %destructor phrase { delete $$; }
2577
2578 phrase(P) ::= TERM(T). {
2579     P = Terms::create(state);
2580     P->add_positional_term(T);
2581 }
2582
2583 phrase(P) ::= UNBROKEN_WORDS(T). {
2584     P = Terms::create(state);
2585     T->as_positional_unbroken(P);
2586 }
2587
2588 phrase(P) ::= phrase(P) TERM(T). {
2589     P->add_positional_term(T);
2590 }
2591
2592 phrase(P) ::= phrase(P) UNBROKEN_WORDS(T). {
2593     T->as_positional_unbroken(P);
2594 }
2595
2596 // phrased_term - A phrased term works like a single term, but is actually
2597 // 2 or more terms linked together into a phrase by punctuation.  There must be
2598 // at least 2 terms in order to be able to have punctuation between the terms!
2599
2600 %type phrased_term {Terms *}
2601 %destructor phrased_term { delete $$; }
2602
2603 phrased_term(P) ::= TERM(T) PHR_TERM(U). {
2604     P = Terms::create(state);
2605     P->add_positional_term(T);
2606     P->add_positional_term(U);
2607 }
2608
2609 phrased_term(P) ::= phrased_term(P) PHR_TERM(T). {
2610     P->add_positional_term(T);
2611 }
2612
2613 // group - A group of terms separated only by whitespace - candidates for
2614 // multi-term synonyms.
2615
2616 %type group {TermGroup *}
2617 %destructor group { delete $$; }
2618
2619 group(P) ::= TERM(T) GROUP_TERM(U). {
2620     P = TermGroup::create(T, U); /*P-overwrites-T*/
2621 }
2622
2623 group(P) ::= group(P) GROUP_TERM(T). {
2624     P->add_term(T);
2625 }
2626
2627 group(P) ::= group(P) EMPTY_GROUP_OK. {
2628     P->set_empty_ok();
2629 }
2630
2631 // near_expr - 2 or more terms with NEAR in between.  There must be at least 2
2632 // terms in order for there to be any NEAR operators!
2633
2634 %type near_expr {Terms *}
2635 %destructor near_expr { delete $$; }
2636
2637 near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
2638     P = Terms::create(state);
2639     P->add_positional_term(T);
2640     P->add_positional_term(U);
2641     if (N) {
2642         P->adjust_window(N->get_termpos());
2643         delete N;
2644     }
2645 }
2646
2647 near_expr(P) ::= near_expr(P) NEAR(N) TERM(T). {
2648     P->add_positional_term(T);
2649     if (N) {
2650         P->adjust_window(N->get_termpos());
2651         delete N;
2652     }
2653 }
2654
2655 // adj_expr - 2 or more terms with ADJ in between.  There must be at least 2
2656 // terms in order for there to be any ADJ operators!
2657
2658 %type adj_expr {Terms *}
2659 %destructor adj_expr { delete $$; }
2660
2661 adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
2662     P = Terms::create(state);
2663     P->add_positional_term(T);
2664     P->add_positional_term(U);
2665     if (N) {
2666         P->adjust_window(N->get_termpos());
2667         delete N;
2668     }
2669 }
2670
2671 adj_expr(P) ::= adj_expr(P) ADJ(N) TERM(T). {
2672     P->add_positional_term(T);
2673     if (N) {
2674         P->adjust_window(N->get_termpos());
2675         delete N;
2676     }
2677 }
2678
2679 // Select lemon syntax highlighting in vim editor: vim: syntax=lemon