xapian-core/queryparser/queryparser.lemony

   1 %include {
   2 /* queryparser.lemony: build a Xapian::Query object from a user query string.
   3  *
   4  * Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015,2016 Olly Betts
   5  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
   6  * Copyright (C) 2010 Adam Sjøgren
   7  *
   8  * This program is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  21  * USA
  22  */
  23
  24 #include <config.h>
  25
  26 #include "queryparser_internal.h"
  27
  28 #include "api/queryinternal.h"
  29 #include "omassert.h"
  30 #include "str.h"
  31 #include "stringutils.h"
  32 #include "xapian/error.h"
  33 #include "xapian/unicode.h"
  34
  35 // Include the list of token values lemon generates.
  36 #include "queryparser_token.h"
  37
  38 #include "cjk-tokenizer.h"
  39
  40 #include <algorithm>
  41 #include <cstring>
  42 #include <limits>
  43 #include <list>
  44 #include <string>
  45 #include <vector>
  46
  47 using namespace std;
  48
  49 using namespace Xapian;
  50
  51 inline bool
  52 U_isupper(unsigned ch) {
  53     return (ch < 128 && C_isupper(static_cast<unsigned char>(ch)));
  54 }
  55
  56 inline bool
  57 U_isdigit(unsigned ch) {
  58     return (ch < 128 && C_isdigit(static_cast<unsigned char>(ch)));
  59 }
  60
  61 inline bool
  62 U_isalpha(unsigned ch) {
  63     return (ch < 128 && C_isalpha(static_cast<unsigned char>(ch)));
  64 }
  65
  66 using Xapian::Unicode::is_whitespace;
  67
  68 inline bool
  69 is_not_whitespace(unsigned ch) {
  70     return !is_whitespace(ch);
  71 }
  72
  73 using Xapian::Unicode::is_wordchar;
  74
  75 inline bool
  76 is_not_wordchar(unsigned ch) {
  77     return !is_wordchar(ch);
  78 }
  79
  80 inline bool
  81 is_digit(unsigned ch) {
  82     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
  83 }
  84
  85 // FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
  86 // and there's the risk of hyphens getting stuck onto the end of terms...
  87 inline bool
  88 is_suffix(unsigned ch) {
  89     return ch == '+' || ch == '#';
  90 }
  91
  92 inline bool
  93 is_double_quote(unsigned ch) {
  94     // We simply treat all double quotes as equivalent, which is a bit crude,
  95     // but it isn't clear that it would actually better to require them to
  96     // match up exactly.
  97     //
  98     // 0x201c is Unicode opening double quote.
  99     // 0x201d is Unicode closing double quote.
 100     return ch == '"' || ch == 0x201c || ch == 0x201d;
 101 }
 102
 103 inline bool
 104 prefix_needs_colon(const string & prefix, unsigned ch)
 105 {
 106     if (!U_isupper(ch) && ch != ':') return false;
 107     string::size_type len = prefix.length();
 108     return (len > 1 && prefix[len - 1] != ':');
 109 }
 110
 111 using Unicode::is_currency;
 112
 113 inline bool
 114 is_positional(Xapian::Query::op op)
 115 {
 116     return (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR);
 117 }
 118
 119 class Terms;
 120
 121 /** Class used to pass information about a token from lexer to parser.
 122  *
 123  *  Generally an instance of this class carries term information, but it can be
 124  *  used for a range query, and with some operators (e.g. the distance in
 125  *  NEAR/3 or ADJ/3, etc).
 126  */
 127 class Term {
 128     State * state;
 129
 130   public:
 131     string name;
 132     const FieldInfo * field_info;
 133     string unstemmed;
 134     QueryParser::stem_strategy stem;
 135     termpos pos;
 136     Query query;
 137
 138     Term(const string &name_, termpos pos_)
 139         : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
 140     explicit Term(const string &name_)
 141         : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
 142     Term(const string &name_, const FieldInfo * field_info_)
 143         : name(name_), field_info(field_info_),
 144           stem(QueryParser::STEM_NONE), pos(0) { }
 145     explicit Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
 146     Term(State * state_, const string &name_, const FieldInfo * field_info_,
 147          const string &unstemmed_,
 148          QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
 149          termpos pos_ = 0)
 150         : state(state_), name(name_), field_info(field_info_),
 151           unstemmed(unstemmed_), stem(stem_), pos(pos_) { }
 152     // For RANGE tokens.
 153     Term(const Xapian::Query & q, const string & grouping)
 154         : name(grouping), query(q) { }
 155
 156     string make_term(const string & prefix) const;
 157
 158     void need_positions() {
 159         if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
 160     }
 161
 162     termpos get_termpos() const { return pos; }
 163
 164     string get_grouping() const {
 165         return field_info->grouping;
 166     }
 167
 168     Query * as_wildcarded_query(State * state) const;
 169
 170     /** Build a query for a term at the very end of the query string when
 171      *  FLAG_PARTIAL is in use.
 172      *
 173      *  This query should match documents containing any terms which start with
 174      *  the characters specified, but should give a higher score to exact
 175      *  matches (since the user might have finished typing - we simply don't
 176      *  know).
 177      */
 178     Query * as_partial_query(State * state_) const;
 179
 180     /** Build a query for a string of CJK characters. */
 181     Query * as_cjk_query() const;
 182
 183     /** Handle a CJK character string in a positional context. */
 184     void as_positional_cjk_term(Terms * terms) const;
 185
 186     /// Range query.
 187     Query as_range_query() const;
 188
 189     Query get_query() const;
 190
 191     Query get_query_with_synonyms() const;
 192
 193     Query get_query_with_auto_synonyms() const;
 194 };
 195
 196 /// Parser State shared between the lexer and the parser.
 197 class State {
 198     QueryParser::Internal * qpi;
 199
 200   public:
 201     Query query;
 202     const char * error;
 203     unsigned flags;
 204
 205     State(QueryParser::Internal * qpi_, unsigned flags_)
 206         : qpi(qpi_), error(NULL), flags(flags_) { }
 207
 208     string stem_term(const string &term) {
 209         return qpi->stemmer(term);
 210     }
 211
 212     void add_to_stoplist(const Term * term) {
 213         qpi->stoplist.push_back(term->name);
 214     }
 215
 216     void add_to_unstem(const string & term, const string & unstemmed) {
 217         qpi->unstem.insert(make_pair(term, unstemmed));
 218     }
 219
 220     Term * range(const string &a, const string &b) {
 221         for (auto i : qpi->rangeprocs) {
 222             Xapian::Query range_query = (i.proc)->check_range(a, b);
 223             Xapian::Query::op op = range_query.get_type();
 224             switch (op) {
 225                 case Xapian::Query::OP_INVALID:
 226                     break;
 227                 case Xapian::Query::OP_VALUE_RANGE:
 228                 case Xapian::Query::OP_VALUE_GE:
 229                 case Xapian::Query::OP_VALUE_LE:
 230                     if (i.default_grouping) {
 231                         Xapian::Internal::QueryValueBase * base =
 232                             static_cast<Xapian::Internal::QueryValueBase*>(
 233                                 range_query.internal.get());
 234                         Xapian::valueno slot = base->get_slot();
 235                         return new Term(range_query, str(slot));
 236                     }
 237                     // FALLTHRU
 238                 case Xapian::Query::LEAF_TERM:
 239                     return new Term(range_query, i.grouping);
 240                 default:
 241                     return new Term(range_query, string());
 242             }
 243         }
 244         return NULL;
 245     }
 246
 247     Query::op default_op() const { return qpi->default_op; }
 248
 249     bool is_stopword(const Term *term) const {
 250         return qpi->stopper.get() && (*qpi->stopper)(term->name);
 251     }
 252
 253     Database get_database() const {
 254         return qpi->db;
 255     }
 256
 257     const Stopper * get_stopper() const {
 258         return qpi->stopper.get();
 259     }
 260
 261     size_t stoplist_size() const {
 262         return qpi->stoplist.size();
 263     }
 264
 265     void stoplist_resize(size_t s) {
 266         qpi->stoplist.resize(s);
 267     }
 268
 269     Xapian::termcount get_max_wildcard_expansion() const {
 270         return qpi->max_wildcard_expansion;
 271     }
 272
 273     int get_max_wildcard_type() const {
 274         return qpi->max_wildcard_type;
 275     }
 276
 277     Xapian::termcount get_max_partial_expansion() const {
 278         return qpi->max_partial_expansion;
 279     }
 280
 281     int get_max_partial_type() const {
 282         return qpi->max_partial_type;
 283     }
 284 };
 285
 286 string
 287 Term::make_term(const string & prefix) const
 288 {
 289     string term;
 290     if (stem == QueryParser::STEM_SOME || stem == QueryParser::STEM_ALL_Z)
 291         term += 'Z';
 292     if (!prefix.empty()) {
 293         term += prefix;
 294         if (prefix_needs_colon(prefix, name[0])) term += ':';
 295     }
 296     if (stem != QueryParser::STEM_NONE) {
 297         term += state->stem_term(name);
 298     } else {
 299         term += name;
 300     }
 301
 302     if (!unstemmed.empty())
 303         state->add_to_unstem(term, unstemmed);
 304     return term;
 305 }
 306
 307 // Iterator shim to allow building a synonym query from a TermIterator pair.
 308 class SynonymIterator {
 309     Xapian::TermIterator i;
 310
 311     Xapian::termpos pos;
 312
 313     const Xapian::Query * first;
 314
 315   public:
 316     SynonymIterator(const Xapian::TermIterator & i_,
 317                     Xapian::termpos pos_ = 0,
 318                     const Xapian::Query * first_ = NULL)
 319         : i(i_), pos(pos_), first(first_) { }
 320
 321     SynonymIterator & operator++() {
 322         if (first)
 323             first = NULL;
 324         else
 325             ++i;
 326         return *this;
 327     }
 328
 329     const Xapian::Query operator*() const {
 330         if (first) return *first;
 331         return Xapian::Query(*i, 1, pos);
 332     }
 333
 334     bool operator==(const SynonymIterator & o) const {
 335         return i == o.i && first == o.first;
 336     }
 337
 338     bool operator!=(const SynonymIterator & o) const {
 339         return !(*this == o);
 340     }
 341
 342     typedef std::input_iterator_tag iterator_category;
 343     typedef Xapian::Query value_type;
 344     typedef Xapian::termcount_diff difference_type;
 345     typedef Xapian::Query * pointer;
 346     typedef Xapian::Query & reference;
 347 };
 348
 349 Query
 350 Term::get_query_with_synonyms() const
 351 {
 352     // Handle single-word synonyms with each prefix.
 353     const list<string> & prefixes = field_info->prefixes;
 354     if (prefixes.empty()) {
 355         // FIXME: handle multiple here
 356         Assert(!field_info->procs.empty());
 357         return (**field_info->procs.begin())(name);
 358     }
 359
 360     Query q = get_query();
 361
 362     list<string>::const_iterator piter;
 363     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
 364         // First try the unstemmed term:
 365         string term;
 366         if (!piter->empty()) {
 367             term += *piter;
 368             if (prefix_needs_colon(*piter, name[0])) term += ':';
 369         }
 370         term += name;
 371
 372         Xapian::Database db = state->get_database();
 373         Xapian::TermIterator syn = db.synonyms_begin(term);
 374         Xapian::TermIterator end = db.synonyms_end(term);
 375         if (syn == end && stem != QueryParser::STEM_NONE) {
 376             // If that has no synonyms, try the stemmed form:
 377             term = 'Z';
 378             if (!piter->empty()) {
 379                 term += *piter;
 380                 if (prefix_needs_colon(*piter, name[0])) term += ':';
 381             }
 382             term += state->stem_term(name);
 383             syn = db.synonyms_begin(term);
 384             end = db.synonyms_end(term);
 385         }
 386         q = Query(q.OP_SYNONYM,
 387                   SynonymIterator(syn, pos, &q),
 388                   SynonymIterator(end));
 389     }
 390     return q;
 391 }
 392
 393 Query
 394 Term::get_query_with_auto_synonyms() const
 395 {
 396     const unsigned MASK_ENABLE_AUTO_SYNONYMS =
 397         QueryParser::FLAG_AUTO_SYNONYMS |
 398         QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 399     if (state->flags & MASK_ENABLE_AUTO_SYNONYMS)
 400         return get_query_with_synonyms();
 401
 402     return get_query();
 403 }
 404
 405 static void
 406 add_to_query(Query *& q, Query::op op, Query * term)
 407 {
 408     Assert(term);
 409     if (q) {
 410         *q = Query(op, *q, *term);
 411         delete term;
 412     } else {
 413         q = term;
 414     }
 415 }
 416
 417 static void
 418 add_to_query(Query *& q, Query::op op, const Query & term)
 419 {
 420     if (q) {
 421         *q = Query(op, *q, term);
 422     } else {
 423         q = new Query(term);
 424     }
 425 }
 426
 427 Query
 428 Term::get_query() const
 429 {
 430     const list<string> & prefixes = field_info->prefixes;
 431     if (prefixes.empty()) {
 432         // FIXME: handle multiple here
 433         Assert(!field_info->procs.empty());
 434         return (**field_info->procs.begin())(name);
 435     }
 436     list<string>::const_iterator piter = prefixes.begin();
 437     Query q(make_term(*piter), 1, pos);
 438     while (++piter != prefixes.end()) {
 439         q = Query(Query::OP_OR, q, Query(make_term(*piter), 1, pos));
 440     }
 441     return q;
 442 }
 443
 444 Query *
 445 Term::as_wildcarded_query(State * state_) const
 446 {
 447     const list<string> & prefixes = field_info->prefixes;
 448     list<string>::const_iterator piter;
 449     Xapian::termcount max = state_->get_max_wildcard_expansion();
 450     int max_type = state_->get_max_wildcard_type();
 451     vector<Query> subqs;
 452     subqs.reserve(prefixes.size());
 453     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
 454         string root = *piter;
 455         root += name;
 456         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 457         subqs.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
 458                               Query::OP_OR));
 459     }
 460     Query * q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
 461     delete this;
 462     return q;
 463 }
 464
 465 Query *
 466 Term::as_partial_query(State * state_) const
 467 {
 468     Xapian::termcount max = state_->get_max_partial_expansion();
 469     int max_type = state_->get_max_partial_type();
 470     vector<Query> subqs_partial; // A synonym of all the partial terms.
 471     vector<Query> subqs_full; // A synonym of all the full terms.
 472
 473     const list<string> & prefixes = field_info->prefixes;
 474     list<string>::const_iterator piter;
 475     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
 476         string root = *piter;
 477         root += name;
 478         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 479         subqs_partial.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
 480                                       Query::OP_OR));
 481         // Add the term, as it would normally be handled, as an alternative.
 482         subqs_full.push_back(Query(make_term(*piter), 1, pos));
 483     }
 484     Query * q = new Query(Query::OP_OR,
 485                           Query(Query::OP_SYNONYM,
 486                                 subqs_partial.begin(), subqs_partial.end()),
 487                           Query(Query::OP_SYNONYM,
 488                                 subqs_full.begin(), subqs_full.end()));
 489     delete this;
 490     return q;
 491 }
 492
 493 Query *
 494 Term::as_cjk_query() const
 495 {
 496     vector<Query> prefix_subqs;
 497     vector<Query> cjk_subqs;
 498     const list<string> & prefixes = field_info->prefixes;
 499     list<string>::const_iterator piter;
 500     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
 501         const string& prefix = *piter;
 502         for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
 503             cjk_subqs.push_back(Query(prefix + *tk, 1, pos));
 504         }
 505         prefix_subqs.push_back(Query(Query::OP_AND,
 506                                      cjk_subqs.begin(), cjk_subqs.end()));
 507         cjk_subqs.clear();
 508     }
 509     Query * q = new Query(Query::OP_OR,
 510                           prefix_subqs.begin(), prefix_subqs.end());
 511     delete this;
 512     return q;
 513 }
 514
 515 Query
 516 Term::as_range_query() const
 517 {
 518     Query q = query;
 519     delete this;
 520     return q;
 521 }
 522
 523 inline bool
 524 is_phrase_generator(unsigned ch)
 525 {
 526     // These characters generate a phrase search.
 527     // Ordered mostly by frequency of calls to this function done when
 528     // running the testcases in api_queryparser.cc.
 529     return (ch && ch < 128 && strchr(".-/:\\@", ch) != NULL);
 530 }
 531
 532 inline bool
 533 is_stem_preventer(unsigned ch)
 534 {
 535     return (ch && ch < 128 && strchr("(/\\@<>=*[{\"", ch) != NULL);
 536 }
 537
 538 inline bool
 539 should_stem(const string & term)
 540 {
 541     const unsigned int SHOULD_STEM_MASK =
 542         (1 << Unicode::LOWERCASE_LETTER) |
 543         (1 << Unicode::TITLECASE_LETTER) |
 544         (1 << Unicode::MODIFIER_LETTER) |
 545         (1 << Unicode::OTHER_LETTER);
 546     Utf8Iterator u(term);
 547     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
 548 }
 549
 550 /** Value representing "ignore this" when returned by check_infix() or
 551  *  check_infix_digit().
 552  */
 553 const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
 554
 555 inline unsigned check_infix(unsigned ch) {
 556     if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
 557         // Unicode includes all these except '&' in its word boundary rules,
 558         // as well as 0x2019 (which we handle below) and ':' (for Swedish
 559         // apparently, but we ignore this for now as it's problematic in
 560         // real world cases).
 561         return ch;
 562     }
 563     if (ch >= 0x200b) {
 564         // 0x2019 is Unicode apostrophe and single closing quote.
 565         // 0x201b is Unicode single opening quote with the tail rising.
 566         if (ch == 0x2019 || ch == 0x201b)
 567             return '\'';
 568         if (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff)
 569             return UNICODE_IGNORE;
 570     }
 571     return 0;
 572 }
 573
 574 inline unsigned check_infix_digit(unsigned ch) {
 575     // This list of characters comes from Unicode's word identifying algorithm.
 576     switch (ch) {
 577         case ',':
 578         case '.':
 579         case ';':
 580         case 0x037e: // GREEK QUESTION MARK
 581         case 0x0589: // ARMENIAN FULL STOP
 582         case 0x060D: // ARABIC DATE SEPARATOR
 583         case 0x07F8: // NKO COMMA
 584         case 0x2044: // FRACTION SLASH
 585         case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
 586         case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
 587         case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
 588             return ch;
 589     }
 590     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
 591         return UNICODE_IGNORE;
 592     return 0;
 593 }
 594
 595 struct yyParser;
 596
 597 // Prototype the functions lemon generates.
 598 static yyParser *ParseAlloc();
 599 static void ParseFree(yyParser *);
 600 static void Parse(yyParser *, int, Term *, State *);
 601 static void yy_parse_failed(yyParser *);
 602
 603 void
 604 QueryParser::Internal::add_prefix(const string &field, const string &prefix)
 605 {
 606     map<string, FieldInfo>::iterator p = field_map.find(field);
 607     if (p == field_map.end()) {
 608         field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, prefix)));
 609     } else {
 610         // Check that this is the same type of filter as the existing one(s).
 611         if (p->second.type != NON_BOOLEAN) {
 612             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
 613         }
 614         if (!p->second.procs.empty())
 615             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 616         p->second.prefixes.push_back(prefix);
 617    }
 618 }
 619
 620 void
 621 QueryParser::Internal::add_prefix(const string &field, FieldProcessor *proc)
 622 {
 623     map<string, FieldInfo>::iterator p = field_map.find(field);
 624     if (p == field_map.end()) {
 625         field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, proc)));
 626     } else {
 627         // Check that this is the same type of filter as the existing one(s).
 628         if (p->second.type != NON_BOOLEAN) {
 629             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
 630         }
 631         if (!p->second.prefixes.empty())
 632             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 633         throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
 634         // p->second.procs.push_back(proc);
 635    }
 636 }
 637
 638 void
 639 QueryParser::Internal::add_boolean_prefix(const string &field,
 640                                           const string &prefix,
 641                                           const string* grouping)
 642 {
 643     // Don't allow the empty prefix to be set as boolean as it doesn't
 644     // really make sense.
 645     if (field.empty())
 646         throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
 647     if (!grouping) grouping = &field;
 648     filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
 649     map<string, FieldInfo>::iterator p = field_map.find(field);
 650     if (p == field_map.end()) {
 651         field_map.insert(make_pair(field, FieldInfo(type, prefix, *grouping)));
 652     } else {
 653         // Check that this is the same type of filter as the existing one(s).
 654         if (p->second.type != type) {
 655             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
 656         }
 657         if (!p->second.procs.empty())
 658             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 659         p->second.prefixes.push_back(prefix); // FIXME grouping
 660    }
 661 }
 662
 663 void
 664 QueryParser::Internal::add_boolean_prefix(const string &field,
 665                                           FieldProcessor *proc,
 666                                           const string* grouping)
 667 {
 668     // Don't allow the empty prefix to be set as boolean as it doesn't
 669     // really make sense.
 670     if (field.empty())
 671         throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
 672     if (!grouping) grouping = &field;
 673     filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
 674     map<string, FieldInfo>::iterator p = field_map.find(field);
 675     if (p == field_map.end()) {
 676         field_map.insert(make_pair(field, FieldInfo(type, proc, *grouping)));
 677     } else {
 678         // Check that this is the same type of filter as the existing one(s).
 679         if (p->second.type != type) {
 680             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
 681         }
 682         if (!p->second.prefixes.empty())
 683             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 684         throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
 685         // p->second.procs.push_back(proc);
 686    }
 687 }
 688
/** Lex a single term starting at @a it.
 *
 *  Three forms are tried in order: an acronym of dot-separated ASCII capitals,
 *  a run of CJK characters (only if @a cjk_ngram is set), and finally an
 *  ordinary word, which may contain certain infix characters and may gain a
 *  trailing '+' / '#' suffix.
 *
 *  @param it           Current position; must not be at @a end.  Advanced
 *                      past the characters consumed.
 *  @param end          End of the query string.
 *  @param cjk_ngram    Whether runs of CJK characters are split off as their
 *                      own terms.
 *  @param is_cjk_term  Set to true if a CJK term was read; left untouched
 *                      otherwise.
 *  @param was_acronym  Set to whether the term was read as an acronym.
 *
 *  @return  The term as UTF-8 (case is not changed here).
 */
string
QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
                                  bool cjk_ngram, bool & is_cjk_term,
                                  bool &was_acronym)
{
    string term;
    // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
    // Don't worry if there's a trailing '.' or not.
    if (U_isupper(*it)) {
        string t;
        // Scan ahead with a copy so that `it` only moves if we accept.
        Utf8Iterator p = it;
        do {
            Unicode::append_utf8(t, *p++);
        } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
        // One letter does not make an acronym!  If we handled a single
        // uppercase letter here, we wouldn't catch M&S below.
        if (t.length() > 1) {
            // Check there's not a (lower case) letter or digit
            // immediately after it.
            // FIXME: should I.B.M..P.T.O be a range search?
            if (p == end || !is_wordchar(*p)) {
                it = p;
                swap(term, t);
            }
        }
    }
    was_acronym = !term.empty();

    if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
        term = CJK::get_cjk(it);
        is_cjk_term = true;
    }

    if (term.empty()) {
        // Ordinary word: the first character is always taken as-is.
        unsigned prevch = *it;
        Unicode::append_utf8(term, prevch);
        while (++it != end) {
            // A CJK character ends the word when CJK n-grams are enabled.
            if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
            unsigned ch = *it;
            if (!is_wordchar(ch)) {
                // Treat a single embedded '&' or "'" or similar as a word
                // character (e.g. AT&T, Fred's).  Also, normalise
                // apostrophes to ASCII apostrophe.
                Utf8Iterator p = it;
                ++p;
                // An infix must be followed directly by a word character.
                if (p == end || !is_wordchar(*p)) break;
                unsigned nextch = *p;
                // Between two digits a different set of infixes applies.
                if (is_digit(prevch) && is_digit(nextch)) {
                    ch = check_infix_digit(ch);
                } else {
                    ch = check_infix(ch);
                }
                if (!ch) break;
                // Skip ignorable characters without adding them to the term
                // (and without updating prevch).
                if (ch == UNICODE_IGNORE)
                    continue;
            }
            Unicode::append_utf8(term, ch);
            prevch = ch;
        }
        if (it != end && is_suffix(*it)) {
            string suff_term = term;
            Utf8Iterator p = it;
            // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
            do {
                // Give up on the suffix if there are more than 3 suffix
                // characters; an empty suff_term signals this below.
                if (suff_term.size() - term.size() == 3) {
                    suff_term.resize(0);
                    break;
                }
                suff_term += *p;
            } while (is_suffix(*++p));
            // The suffix is only kept if it isn't followed by a word
            // character.
            if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
                // If the suffixed term doesn't exist, check that the
                // non-suffixed term does.  This also takes care of
                // the case when QueryParser::set_database() hasn't
                // been called.
                bool use_suff_term = false;
                string lc = Unicode::tolower(suff_term);
                if (db.term_exists(lc)) {
                    use_suff_term = true;
                } else {
                    lc = Unicode::tolower(term);
                    if (!db.term_exists(lc)) use_suff_term = true;
                }
                if (use_suff_term) {
                    term = suff_term;
                    it = p;
                }
            }
        }
    }
    return term;
}
 781
 782 class ParserHandler {
 783     yyParser * parser;
 784
 785   public:
 786     explicit ParserHandler(yyParser * parser_) : parser(parser_) { }
 787     operator yyParser*() const { return parser; }
 788     ~ParserHandler() { ParseFree(parser); }
 789 };
 790
 791 Query
 792 QueryParser::Internal::parse_query(const string &qs, unsigned flags,
 793                                    const string &default_prefix)
 794 {
 795     bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();
 796
 797     // Set ranges if we may have to handle ranges in the query.
 798     bool ranges = !rangeprocs.empty() && (qs.find("..") != string::npos);
 799
 800     termpos term_pos = 1;
 801     Utf8Iterator it(qs), end;
 802
 803     State state(this, flags);
 804
 805     // To successfully apply more than one spelling correction to a query
 806     // string, we must keep track of the offset due to previous corrections.
 807     int correction_offset = 0;
 808     corrected_query.resize(0);
 809
 810     // Stack of prefixes, used for phrases and subexpressions.
 811     list<const FieldInfo *> prefix_stack;
 812
 813     // If default_prefix is specified, use it.  Otherwise, use any list
 814     // that has been set for the empty prefix.
 815     const FieldInfo def_pfx(NON_BOOLEAN, default_prefix);
 816     {
 817         const FieldInfo * default_field_info = &def_pfx;
 818         if (default_prefix.empty()) {
 819             auto f = field_map.find(string());
 820             if (f != field_map.end()) default_field_info = &(f->second);
 821         }
 822
 823         // We always have the current prefix on the top of the stack.
 824         prefix_stack.push_back(default_field_info);
 825     }
 826
 827     ParserHandler pParser(ParseAlloc());
 828
 829     unsigned newprev = ' ';
 830 main_lex_loop:
 831     enum {
 832         DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
 833         IN_GROUP2, EXPLICIT_SYNONYM
 834     } mode = DEFAULT;
 835     while (it != end && !state.error) {
 836         bool last_was_operator = false;
 837         bool last_was_operator_needing_term = false;
 838         if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
 839         if (false) {
 840 just_had_operator:
 841             if (it == end) break;
 842             mode = DEFAULT;
 843             last_was_operator_needing_term = false;
 844             last_was_operator = true;
 845         }
 846         if (false) {
 847 just_had_operator_needing_term:
 848             last_was_operator_needing_term = true;
 849             last_was_operator = true;
 850         }
 851         if (mode == IN_PHRASED_TERM) mode = DEFAULT;
 852         if (is_whitespace(*it)) {
 853             newprev = ' ';
 854             ++it;
 855             it = find_if(it, end, is_not_whitespace);
 856             if (it == end) break;
 857         }
 858
 859         if (ranges &&
 860             (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
 861             // Scan forward to see if this could be the "start of range"
 862             // token.  Sadly this has O(n^2) tendencies, though at least
 863             // "n" is the number of words in a query which is likely to
 864             // remain fairly small.  FIXME: can we tokenise more elegantly?
 865             Utf8Iterator it_initial = it;
 866             Utf8Iterator p = it;
 867             unsigned ch = 0;
 868             while (p != end) {
 869                 if (ch == '.' && *p == '.') {
 870                     string a;
 871                     while (it != p) {
 872                         Unicode::append_utf8(a, *it++);
 873                     }
 874                     // Trim off the trailing ".".
 875                     a.resize(a.size() - 1);
 876                     ++p;
 877                     // Either end of the range can be empty (for an open-ended
 878                     // range) but both can't be empty.
 879                     if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
 880                         string b;
 881                         // Allow any character except whitespace and ')' in the
 882                         // upper bound.
 883                         while (p != end && *p > ' ' && *p != ')') {
 884                             Unicode::append_utf8(b, *p++);
 885                         }
 886                         Term * range = state.range(a, b);
 887                         if (!range) {
 888                             state.error = "Unknown range operation";
 889                             if (a.find(':', 1) == string::npos) {
 890                                 goto done;
 891                             }
 892                             // Might be a boolean filter with ".." in.  Leave
 893                             // state.error in case it isn't.
 894                             it = it_initial;
 895                             break;
 896                         }
 897                         Parse(pParser, RANGE, range, &state);
 898                     }
 899                     it = p;
 900                     goto main_lex_loop;
 901                 }
 902                 ch = *p;
 903                 // Allow any character except whitespace and '(' in the lower
 904                 // bound.
 905                 if (ch <= ' ' || ch == '(') break;
 906                 ++p;
 907             }
 908         }
 909
 910         if (!is_wordchar(*it)) {
 911             unsigned prev = newprev;
 912             unsigned ch = *it++;
 913             newprev = ch;
 914             // Drop out of IN_GROUP mode.
 915             if (mode == IN_GROUP || mode == IN_GROUP2)
 916                 mode = DEFAULT;
 917             switch (ch) {
 918               case '"':
 919               case 0x201c: // Left curly double quote.
 920               case 0x201d: // Right curly double quote.
 921                 // Quoted phrase.
 922                 if (mode == DEFAULT) {
 923                     // Skip whitespace.
 924                     it = find_if(it, end, is_not_whitespace);
 925                     if (it == end) {
 926                         // Ignore an unmatched " at the end of the query to
 927                         // avoid generating an empty pair of QUOTEs which will
 928                         // cause a parse error.
 929                         goto done;
 930                     }
 931                     if (is_double_quote(*it)) {
 932                         // Ignore empty "" (but only if we're not already
 933                         // IN_QUOTES as we don't merge two adjacent quoted
 934                         // phrases!)
 935                         newprev = *it++;
 936                         break;
 937                     }
 938                 }
 939                 if (flags & QueryParser::FLAG_PHRASE) {
 940                     Parse(pParser, QUOTE, NULL, &state);
 941                     if (mode == DEFAULT) {
 942                         mode = IN_QUOTES;
 943                     } else {
 944                         // Remove the prefix we pushed for this phrase.
 945                         if (mode == IN_PREFIXED_QUOTES)
 946                             prefix_stack.pop_back();
 947                         mode = DEFAULT;
 948                     }
 949                 }
 950                 break;
 951
 952               case '+': case '-': // Loved or hated term/phrase/subexpression.
 953                 // Ignore + or - at the end of the query string.
 954                 if (it == end) goto done;
 955                 if (prev > ' ' && prev != '(') {
 956                     // Or if not after whitespace or an open bracket.
 957                     break;
 958                 }
 959                 if (is_whitespace(*it) || *it == '+' || *it == '-') {
 960                     // Ignore + or - followed by a space, or further + or -.
 961                     // Postfix + (such as in C++ and H+) is handled as part of
 962                     // the term lexing code in parse_term().
 963                     newprev = *it++;
 964                     break;
 965                 }
 966                 if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
 967                     int token;
 968                     if (ch == '+') {
 969                         token = LOVE;
 970                     } else if (last_was_operator) {
 971                         token = HATE_AFTER_AND;
 972                     } else {
 973                         token = HATE;
 974                     }
 975                     Parse(pParser, token, NULL, &state);
 976                     goto just_had_operator_needing_term;
 977                 }
 978                 // Need to prevent the term after a LOVE or HATE starting a
 979                 // term group...
 980                 break;
 981
 982               case '(': // Bracketed subexpression.
 983                 // Skip whitespace.
 984                 it = find_if(it, end, is_not_whitespace);
 985                 // Ignore ( at the end of the query string.
 986                 if (it == end) goto done;
 987                 if (prev > ' ' && strchr("()+-", prev) == NULL) {
 988                     // Or if not after whitespace or a bracket or '+' or '-'.
 989                     break;
 990                 }
 991                 if (*it == ')') {
 992                     // Ignore empty ().
 993                     newprev = *it++;
 994                     break;
 995                 }
 996                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
 997                     prefix_stack.push_back(prefix_stack.back());
 998                     Parse(pParser, BRA, NULL, &state);
 999                 }
1000                 break;
1001
1002               case ')': // End of bracketed subexpression.
1003                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1004                     // Remove the prefix we pushed for the corresponding BRA.
1005                     // If brackets are unmatched, it's a syntax error, but
1006                     // that's no excuse to SEGV!
1007                     if (prefix_stack.size() > 1) prefix_stack.pop_back();
1008                     Parse(pParser, KET, NULL, &state);
1009                 }
1010                 break;
1011
1012               case '~': // Synonym expansion.
1013                 // Ignore at the end of the query string.
1014                 if (it == end) goto done;
1015                 if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
1016                     if (prev > ' ' && strchr("+-(", prev) == NULL) {
1017                         // Or if not after whitespace, +, -, or an open bracket.
1018                         break;
1019                     }
1020                     if (!is_wordchar(*it)) {
1021                         // Ignore if not followed by a word character.
1022                         break;
1023                     }
1024                     Parse(pParser, SYNONYM, NULL, &state);
1025                     mode = EXPLICIT_SYNONYM;
1026                     goto just_had_operator_needing_term;
1027                 }
1028                 break;
1029             }
1030             // Skip any other characters.
1031             continue;
1032         }
1033
1034         Assert(is_wordchar(*it));
1035
1036         size_t term_start_index = it.raw() - qs.data();
1037
1038         newprev = 'A'; // Any letter will do...
1039
1040         // A term, a prefix, or a boolean operator.
1041         const FieldInfo * field_info = NULL;
1042         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
1043             !field_map.empty()) {
1044             // Check for a fieldname prefix (e.g. title:historical).
1045             Utf8Iterator p = find_if(it, end, is_not_wordchar);
1046             if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
1047                 string field;
1048                 p = it;
1049                 while (*p != ':')
1050                     Unicode::append_utf8(field, *p++);
1051                 map<string, FieldInfo>::const_iterator f;
1052                 f = field_map.find(field);
1053                 if (f != field_map.end()) {
1054                     // Special handling for prefixed fields, depending on the
1055                     // type of the prefix.
1056                     unsigned ch = *++p;
1057                     field_info = &(f->second);
1058
1059                     if (field_info->type != NON_BOOLEAN) {
1060                         // Drop out of IN_GROUP if we're in it.
1061                         if (mode == IN_GROUP || mode == IN_GROUP2)
1062                             mode = DEFAULT;
1063                         it = p;
1064                         string name;
1065                         if (it != end && is_double_quote(*it)) {
1066                             // Quoted boolean term (can contain any character).
1067                             bool fancy = (*it != '"');
1068                             ++it;
1069                             while (it != end) {
1070                                 if (*it == '"') {
1071                                     // Interpret "" as an escaped ".
1072                                     if (++it == end || *it != '"')
1073                                         break;
1074                                 } else if (fancy && is_double_quote(*it)) {
1075                                     // If the opening quote was ASCII, then the
1076                                     // closing one must be too - otherwise
1077                                     // the user can't protect non-ASCII double
1078                                     // quote characters by quoting or escaping.
1079                                     ++it;
1080                                     break;
1081                                 }
1082                                 Unicode::append_utf8(name, *it++);
1083                             }
1084                         } else {
1085                             // Can't boolean filter prefix a subexpression, so
1086                             // just use anything following the prefix until the
1087                             // next space or ')' as part of the boolean filter
1088                             // term.
1089                             while (it != end && *it > ' ' && *it != ')')
1090                                 Unicode::append_utf8(name, *it++);
1091                         }
1092                         // Build the unstemmed form in field.
1093                         field += ':';
1094                         field += name;
1095                         // Clear any pending range error.
1096                         state.error = NULL;
1097                         Term * token = new Term(&state, name, field_info, field);
1098                         Parse(pParser, BOOLEAN_FILTER, token, &state);
1099                         continue;
1100                     }
1101
1102                     if ((flags & FLAG_PHRASE) && is_double_quote(ch)) {
1103                         // Prefixed phrase, e.g.: subject:"space flight"
1104                         mode = IN_PREFIXED_QUOTES;
1105                         Parse(pParser, QUOTE, NULL, &state);
1106                         it = p;
1107                         newprev = ch;
1108                         ++it;
1109                         prefix_stack.push_back(field_info);
1110                         continue;
1111                     }
1112
1113                     if (ch == '(' && (flags & FLAG_BOOLEAN)) {
1114                         // Prefixed subexpression, e.g.: title:(fast NEAR food)
1115                         mode = DEFAULT;
1116                         Parse(pParser, BRA, NULL, &state);
1117                         it = p;
1118                         newprev = ch;
1119                         ++it;
1120                         prefix_stack.push_back(field_info);
1121                         continue;
1122                     }
1123
1124                     if (ch != ':') {
1125                         // Allow 'path:/usr/local' but not 'foo::bar::baz'.
1126                         while (is_phrase_generator(ch)) {
1127                             if (++p == end)
1128                                 goto not_prefix;
1129                             ch = *p;
1130                         }
1131                     }
1132
1133                     if (is_wordchar(ch)) {
1134                         // Prefixed term.
1135                         it = p;
1136                     } else {
1137 not_prefix:
1138                         // It looks like a prefix but isn't, so parse it as
1139                         // text instead.
1140                         field_info = NULL;
1141                     }
1142                 }
1143             }
1144         }
1145
1146 phrased_term:
1147         bool was_acronym;
1148         bool is_cjk_term = false;
1149         string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
1150
1151         // Boolean operators.
1152         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
1153             (flags & FLAG_BOOLEAN) &&
1154             // Don't want to interpret A.N.D. as an AND operator.
1155             !was_acronym &&
1156             !field_info &&
1157             term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {
1158
1159             string op = term;
1160             if (flags & FLAG_BOOLEAN_ANY_CASE) {
1161                 for (string::iterator i = op.begin(); i != op.end(); ++i) {
1162                     *i = C_toupper(*i);
1163                 }
1164             }
1165             if (op.size() == 3) {
1166                 if (op == "AND") {
1167                     Parse(pParser, AND, NULL, &state);
1168                     goto just_had_operator;
1169                 }
1170                 if (op == "NOT") {
1171                     Parse(pParser, NOT, NULL, &state);
1172                     goto just_had_operator;
1173                 }
1174                 if (op == "XOR") {
1175                     Parse(pParser, XOR, NULL, &state);
1176                     goto just_had_operator;
1177                 }
1178                 if (op == "ADJ") {
1179                     if (it != end && *it == '/') {
1180                         size_t width = 0;
1181                         Utf8Iterator p = it;
1182                         while (++p != end && U_isdigit(*p)) {
1183                             width = (width * 10) + (*p - '0');
1184                         }
1185                         if (width && (p == end || is_whitespace(*p))) {
1186                             it = p;
1187                             Parse(pParser, ADJ, new Term(width), &state);
1188                             goto just_had_operator;
1189                         }
1190                     } else {
1191                         Parse(pParser, ADJ, NULL, &state);
1192                         goto just_had_operator;
1193                     }
1194                 }
1195             } else if (op.size() == 2) {
1196                 if (op == "OR") {
1197                     Parse(pParser, OR, NULL, &state);
1198                     goto just_had_operator;
1199                 }
1200             } else if (op.size() == 4) {
1201                 if (op == "NEAR") {
1202                     if (it != end && *it == '/') {
1203                         size_t width = 0;
1204                         Utf8Iterator p = it;
1205                         while (++p != end && U_isdigit(*p)) {
1206                             width = (width * 10) + (*p - '0');
1207                         }
1208                         if (width && (p == end || is_whitespace(*p))) {
1209                             it = p;
1210                             Parse(pParser, NEAR, new Term(width), &state);
1211                             goto just_had_operator;
1212                         }
1213                     } else {
1214                         Parse(pParser, NEAR, NULL, &state);
1215                         goto just_had_operator;
1216                     }
1217                 }
1218             }
1219         }
1220
1221         // If no prefix is set, use the default one.
1222         if (!field_info) field_info = prefix_stack.back();
1223
1224         Assert(field_info->type == NON_BOOLEAN);
1225
1226         {
1227             string unstemmed_term(term);
1228             term = Unicode::tolower(term);
1229
1230             // Reuse stem_strategy - STEM_SOME here means "stem terms except
1231             // when used with positional operators".
1232             stem_strategy stem_term = stem_action;
1233             if (stem_term != STEM_NONE) {
1234                 if (!stemmer.internal.get()) {
1235                     // No stemmer is set.
1236                     stem_term = STEM_NONE;
1237                 } else if (stem_term == STEM_SOME) {
1238                     if (!should_stem(unstemmed_term) ||
1239                         (it != end && is_stem_preventer(*it))) {
1240                         // Don't stem this particular term.
1241                         stem_term = STEM_NONE;
1242                     }
1243                 }
1244             }
1245
1246             Term * term_obj = new Term(&state, term, field_info,
1247                                        unstemmed_term, stem_term, term_pos++);
1248
1249             if (is_cjk_term) {
1250                 Parse(pParser, CJKTERM, term_obj, &state);
1251                 if (it == end) break;
1252                 continue;
1253             }
1254
1255             if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1256                 if (it != end) {
1257                     if ((flags & FLAG_WILDCARD) && *it == '*') {
1258                         Utf8Iterator p(it);
1259                         ++p;
1260                         if (p == end || !is_wordchar(*p)) {
1261                             it = p;
1262                             if (mode == IN_GROUP || mode == IN_GROUP2) {
1263                                 // Drop out of IN_GROUP and flag that the group
1264                                 // can be empty if all members are stopwords.
1265                                 if (mode == IN_GROUP2)
1266                                     Parse(pParser, EMPTY_GROUP_OK, NULL, &state);
1267                                 mode = DEFAULT;
1268                             }
1269                             // Wildcard at end of term (also known as
1270                             // "right truncation").
1271                             Parse(pParser, WILD_TERM, term_obj, &state);
1272                             continue;
1273                         }
1274                     }
1275                 } else {
1276                     if (flags & FLAG_PARTIAL) {
1277                         if (mode == IN_GROUP || mode == IN_GROUP2) {
1278                             // Drop out of IN_GROUP and flag that the group
1279                             // can be empty if all members are stopwords.
1280                             if (mode == IN_GROUP2)
1281                                 Parse(pParser, EMPTY_GROUP_OK, NULL, &state);
1282                             mode = DEFAULT;
1283                         }
1284                         // Final term of a partial match query, with no
1285                         // following characters - treat as a wildcard.
1286                         Parse(pParser, PARTIAL_TERM, term_obj, &state);
1287                         continue;
1288                     }
1289                 }
1290             }
1291
1292             // Check spelling, if we're a normal term, and any of the prefixes
1293             // are empty.
1294             if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
1295                 const list<string> & pfxes = field_info->prefixes;
1296                 list<string>::const_iterator pfx_it;
1297                 for (pfx_it = pfxes.begin(); pfx_it != pfxes.end(); ++pfx_it) {
1298                     if (!pfx_it->empty())
1299                         continue;
1300                     const string & suggest = db.get_spelling_suggestion(term);
1301                     if (!suggest.empty()) {
1302                         if (corrected_query.empty()) corrected_query = qs;
1303                         size_t term_end_index = it.raw() - qs.data();
1304                         size_t n = term_end_index - term_start_index;
1305                         size_t pos = term_start_index + correction_offset;
1306                         corrected_query.replace(pos, n, suggest);
1307                         correction_offset += suggest.size();
1308                         correction_offset -= n;
1309                     }
1310                     break;
1311                 }
1312             }
1313
1314             if (mode == IN_PHRASED_TERM) {
1315                 Parse(pParser, PHR_TERM, term_obj, &state);
1316             } else {
1317                 // See if the next token will be PHR_TERM - if so, this one
1318                 // needs to be TERM not GROUP_TERM.
1319                 if ((mode == IN_GROUP || mode == IN_GROUP2) &&
1320                     is_phrase_generator(*it)) {
1321                     // FIXME: can we clean this up?
1322                     Utf8Iterator p = it;
1323                     do {
1324                         ++p;
1325                     } while (p != end && is_phrase_generator(*p));
1326                     // Don't generate a phrase unless the phrase generators are
1327                     // immediately followed by another term.
1328                     if (p != end && is_wordchar(*p)) {
1329                         mode = DEFAULT;
1330                     }
1331                 }
1332
1333                 int token = TERM;
1334                 if (mode == IN_GROUP || mode == IN_GROUP2) {
1335                     mode = IN_GROUP2;
1336                     token = GROUP_TERM;
1337                 }
1338                 Parse(pParser, token, term_obj, &state);
1339                 if (token == TERM && mode != DEFAULT)
1340                     continue;
1341             }
1342         }
1343
1344         if (it == end) break;
1345
1346         if (is_phrase_generator(*it)) {
1347             // Skip multiple phrase generators.
1348             do {
1349                 ++it;
1350             } while (it != end && is_phrase_generator(*it));
1351             // Don't generate a phrase unless the phrase generators are
1352             // immediately followed by another term.
1353             if (it != end && is_wordchar(*it)) {
1354                 mode = IN_PHRASED_TERM;
1355                 term_start_index = it.raw() - qs.data();
1356                 goto phrased_term;
1357             }
1358         } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1359             int old_mode = mode;
1360             mode = DEFAULT;
1361             if (!last_was_operator_needing_term && is_whitespace(*it)) {
1362                 newprev = ' ';
1363                 // Skip multiple whitespace.
1364                 do {
1365                     ++it;
1366                 } while (it != end && is_whitespace(*it));
1367                 // Don't generate a group unless the terms are only separated
1368                 // by whitespace.
1369                 if (it != end && is_wordchar(*it)) {
1370                     if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
1371                         mode = IN_GROUP2;
1372                     } else {
1373                         mode = IN_GROUP;
1374                     }
1375                 }
1376             }
1377         }
1378     }
1379 done:
1380     if (!state.error) {
1381         // Implicitly close any unclosed quotes.
1382         if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
1383             Parse(pParser, QUOTE, NULL, &state);
1384
1385         // Implicitly close all unclosed brackets.
1386         while (prefix_stack.size() > 1) {
1387             Parse(pParser, KET, NULL, &state);
1388             prefix_stack.pop_back();
1389         }
1390         Parse(pParser, 0, NULL, &state);
1391     }
1392
1393     errmsg = state.error;
1394     return state.query;
1395 }
1396
1397 struct ProbQuery {
1398     Query * query;
1399     Query * love;
1400     Query * hate;
1401     // filter is a map from prefix to a query for that prefix.  Queries with
1402     // the same prefix are combined with OR, and the results of this are
1403     // combined with AND to get the full filter.
1404     map<string, Query> filter;
1405
1406     ProbQuery() : query(0), love(0), hate(0) { }
1407     ~ProbQuery() {
1408         delete query;
1409         delete love;
1410         delete hate;
1411     }
1412
1413     void add_filter(const string& grouping, const Query & q) {
1414         filter[grouping] = q;
1415     }
1416
1417     void append_filter(const string& grouping, const Query & qnew) {
1418         auto it = filter.find(grouping);
1419         if (it == filter.end()) {
1420             filter.insert(make_pair(grouping, qnew));
1421         } else {
1422             Query & q = it->second;
1423             // We OR multiple filters with the same prefix if they're
1424             // exclusive, otherwise we AND them.
1425             bool exclusive = !grouping.empty();
1426             Query::op op = exclusive ? Query::OP_OR : Query::OP_AND;
1427             q = Query(op, q, qnew);
1428         }
1429     }
1430
1431     void add_filter_range(const string& grouping, const Query & range) {
1432         filter[grouping] = range;
1433     }
1434
1435     void append_filter_range(const string& grouping, const Query & range) {
1436         Query & q = filter[grouping];
1437         q = Query(Query::OP_OR, q, range);
1438     }
1439
1440     Query merge_filters() const {
1441         auto i = filter.begin();
1442         Assert(i != filter.end());
1443         Query q = i->second;
1444         while (++i != filter.end()) {
1445             q = Query(Query::OP_AND, q, i->second);
1446         }
1447         return q;
1448     }
1449 };
1450
1451 /// A group of terms separated only by whitespace.
1452 class TermGroup {
1453     vector<Term *> terms;
1454
1455     /** Controls how to handle a group where all terms are stopwords.
1456      *
1457      *  If true, then as_group() returns NULL.  If false, then the
1458      *  stopword status of the terms is ignored.
1459      */
1460     bool empty_ok;
1461
1462     TermGroup(Term* t1, Term* t2) : empty_ok(false) {
1463         add_term(t1);
1464         add_term(t2);
1465     }
1466
1467   public:
1468     /// Factory function - ensures heap allocation.
1469     static TermGroup* create(Term* t1, Term* t2) {
1470         return new TermGroup(t1, t2);
1471     }
1472
1473     ~TermGroup() {
1474         for (auto&& t : terms) {
1475             delete t;
1476         }
1477     }
1478
1479     /// Add a Term object to this TermGroup object.
1480     void add_term(Term * term) {
1481         terms.push_back(term);
1482     }
1483
1484     /// Set the empty_ok flag.
1485     void set_empty_ok() { empty_ok = true; }
1486
1487     /// Convert to a Xapian::Query * using default_op.
1488     Query * as_group(State *state) const;
1489 };
1490
1491 Query *
1492 TermGroup::as_group(State *state) const
1493 {
1494     const Xapian::Stopper * stopper = state->get_stopper();
1495     size_t stoplist_size = state->stoplist_size();
1496     bool default_op_is_positional = is_positional(state->default_op());
1497 reprocess:
1498     Query::op default_op = state->default_op();
1499     vector<Query> subqs;
1500     subqs.reserve(terms.size());
1501     if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
1502         // Check for multi-word synonyms.
1503         Database db = state->get_database();
1504
1505         string key;
1506         vector<Term*>::const_iterator begin = terms.begin();
1507         vector<Term*>::const_iterator i = begin;
1508         while (i != terms.end()) {
1509             TermIterator synkey(db.synonym_keys_begin((*i)->name));
1510             TermIterator synend(db.synonym_keys_end((*i)->name));
1511             if (synkey == synend) {
1512                 // No multi-synonym matches.
1513                 if (stopper && (*stopper)((*i)->name)) {
1514                     state->add_to_stoplist(*i);
1515                 } else {
1516                     if (default_op_is_positional)
1517                         (*i)->need_positions();
1518                     subqs.push_back((*i)->get_query_with_auto_synonyms());
1519                 }
1520                 begin = ++i;
1521                 continue;
1522             }
1523             key.resize(0);
1524             while (i != terms.end()) {
1525                 if (!key.empty()) key += ' ';
1526                 key += (*i)->name;
1527                 ++i;
1528                 synkey.skip_to(key);
1529                 if (synkey == synend || !startswith(*synkey, key)) break;
1530             }
1531             // Greedily try to match as many consecutive words as possible.
1532             TermIterator syn, end;
1533             while (true) {
1534                 syn = db.synonyms_begin(key);
1535                 end = db.synonyms_end(key);
1536                 if (syn != end) break;
1537                 if (--i == begin) break;
1538                 key.resize(key.size() - (*i)->name.size() - 1);
1539             }
1540             if (i == begin) {
1541                 // No multi-synonym matches.
1542                 if (stopper && (*stopper)((*i)->name)) {
1543                     state->add_to_stoplist(*i);
1544                 } else {
1545                     if (default_op_is_positional)
1546                         (*i)->need_positions();
1547                     subqs.push_back((*i)->get_query_with_auto_synonyms());
1548                 }
1549                 begin = ++i;
1550                 continue;
1551             }
1552
1553             vector<Query> subqs2;
1554             vector<Term*>::const_iterator j;
1555             for (j = begin; j != i; ++j) {
1556                 if (stopper && (*stopper)((*j)->name)) {
1557                     state->add_to_stoplist(*j);
1558                 } else {
1559                     if (default_op_is_positional)
1560                         (*i)->need_positions();
1561                     subqs2.push_back((*j)->get_query());
1562                 }
1563             }
1564             Query q_original_terms;
1565             if (default_op_is_positional) {
1566                 q_original_terms = Query(default_op,
1567                                          subqs2.begin(), subqs2.end(),
1568                                          subqs2.size() + 9);
1569             } else {
1570                 q_original_terms = Query(default_op,
1571                                          subqs2.begin(), subqs2.end());
1572             }
1573             subqs2.clear();
1574
1575             // Use the position of the first term for the synonyms.
1576             Query q(Query::OP_SYNONYM,
1577                     SynonymIterator(syn, (*begin)->pos, &q_original_terms),
1578                     SynonymIterator(end));
1579             subqs.push_back(q);
1580
1581             begin = i;
1582         }
1583     } else {
1584         vector<Term*>::const_iterator i;
1585         for (i = terms.begin(); i != terms.end(); ++i) {
1586             if (stopper && (*stopper)((*i)->name)) {
1587                 state->add_to_stoplist(*i);
1588             } else {
1589                 if (default_op_is_positional)
1590                     (*i)->need_positions();
1591                 subqs.push_back((*i)->get_query_with_auto_synonyms());
1592             }
1593         }
1594     }
1595
1596     if (!empty_ok && stopper && subqs.empty() &&
1597         stoplist_size < state->stoplist_size()) {
1598         // This group is all stopwords, so roll-back, disable stopper
1599         // temporarily, and reprocess this group.
1600         state->stoplist_resize(stoplist_size);
1601         stopper = NULL;
1602         goto reprocess;
1603     }
1604
1605     Query * q = NULL;
1606     if (!subqs.empty()) {
1607         if (default_op_is_positional) {
1608             q = new Query(default_op, subqs.begin(), subqs.end(),
1609                              subqs.size() + 9);
1610         } else {
1611             q = new Query(default_op, subqs.begin(), subqs.end());
1612         }
1613     }
1614     delete this;
1615     return q;
1616 }
1617
1618 /// Some terms which form a positional sub-query.
1619 class Terms {
1620     vector<Term *> terms;
1621     size_t window;
1622
1623     /** Keep track of whether the terms added all have the same list of
1624      *  prefixes.  If so, we'll build a set of phrases, one using each prefix.
1625      *  This works around the limitation that a phrase cannot have multiple
1626      *  components which are "OR" combinations of terms, but is also probably
1627      *  what users expect: i.e., if a user specifies a phrase in a field, and
1628      *  that field maps to multiple prefixes, the user probably wants a phrase
1629      *  returned with all terms having one of those prefixes, rather than a
1630      *  phrase comprised of terms with differing prefixes.
1631      */
1632     bool uniform_prefixes;
1633
1634     /** The list of prefixes of the terms added.
1635      *  This will be NULL if the terms have different prefixes.
1636      */
1637     const list<string> * prefixes;
1638
1639     /// Convert to a query using the given operator and window size.
1640     Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
1641         Query * q = NULL;
1642         size_t n_terms = terms.size();
1643         Xapian::termcount w = w_delta + terms.size();
1644         if (uniform_prefixes) {
1645             if (prefixes) {
1646                 list<string>::const_iterator piter;
1647                 for (piter = prefixes->begin(); piter != prefixes->end(); ++piter) {
1648                     vector<Query> subqs;
1649                     subqs.reserve(n_terms);
1650                     vector<Term *>::const_iterator titer;
1651                     for (titer = terms.begin(); titer != terms.end(); ++titer) {
1652                         Term * t = *titer;
1653                         subqs.push_back(Query(t->make_term(*piter), 1, t->pos));
1654                     }
1655                     add_to_query(q, Query::OP_OR,
1656                                  Query(op, subqs.begin(), subqs.end(), w));
1657                 }
1658             }
1659         } else {
1660             vector<Query> subqs;
1661             subqs.reserve(n_terms);
1662             vector<Term *>::const_iterator titer;
1663             for (titer = terms.begin(); titer != terms.end(); ++titer) {
1664                 subqs.push_back((*titer)->get_query());
1665             }
1666             q = new Query(op, subqs.begin(), subqs.end(), w);
1667         }
1668
1669         delete this;
1670         return q;
1671     }
1672
1673     Terms() : window(0), uniform_prefixes(true), prefixes(NULL) { }
1674
1675   public:
1676     /// Factory function - ensures heap allocation.
1677     static Terms* create() {
1678         return new Terms();
1679     }
1680
1681     ~Terms() {
1682         for (auto&& t : terms) {
1683             delete t;
1684         }
1685     }
1686
1687     /// Add an unstemmed Term object to this Terms object.
1688     void add_positional_term(Term * term) {
1689         const list<string> & term_prefixes = term->field_info->prefixes;
1690         if (terms.empty()) {
1691             prefixes = &term_prefixes;
1692         } else if (uniform_prefixes && prefixes != &term_prefixes) {
1693             if (*prefixes != term_prefixes)  {
1694                 prefixes = NULL;
1695                 uniform_prefixes = false;
1696             }
1697         }
1698         term->need_positions();
1699         terms.push_back(term);
1700     }
1701
1702     void adjust_window(size_t alternative_window) {
1703         if (alternative_window > window) window = alternative_window;
1704     }
1705
1706     /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
1707     Query * as_phrase_query() const {
1708         return as_opwindow_query(Query::OP_PHRASE, 0);
1709     }
1710
1711     /// Convert to a Xapian::Query * using OP_NEAR.
1712     Query * as_near_query() const {
1713         // The common meaning of 'a NEAR b' is "a within 10 terms of b", which
1714         // means a window size of 11.  For more than 2 terms, we just add one
1715         // to the window size for each extra term.
1716         size_t w = window;
1717         if (w == 0) w = 10;
1718         return as_opwindow_query(Query::OP_NEAR, w - 1);
1719     }
1720
1721     /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
1722     Query * as_adj_query() const {
1723         // The common meaning of 'a ADJ b' is "a at most 10 terms before b",
1724         // which means a window size of 11.  For more than 2 terms, we just add
1725         // one to the window size for each extra term.
1726         size_t w = window;
1727         if (w == 0) w = 10;
1728         return as_opwindow_query(Query::OP_PHRASE, w - 1);
1729     }
1730 };
1731
1732 void
1733 Term::as_positional_cjk_term(Terms * terms) const
1734 {
1735     // Add each individual CJK character to the phrase.
1736     string t;
1737     for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
1738         Unicode::append_utf8(t, *it);
1739         Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
1740         terms->add_positional_term(c);
1741         t.resize(0);
1742     }
1743
1744     // FIXME: we want to add the n-grams as filters too for efficiency.
1745
1746     delete this;
1747 }
1748
// Helper macro for converting a boolean operation into a Xapian::Query.
//
// It expects `state` and `yypParser` to be in scope where it is expanded, and
// it `return`s from the enclosing function on failure.
//
// On success E is assigned a new Query combining *A and *B with operator OP,
// and A and B are then deleted.  If either A or B is NULL, state->error is
// set to a syntax message built from OP_TXT, yy_parse_failed() is called and
// nothing is assigned to E (A and B are not deleted here in that case).
//
// The arguments are parenthesised in the expansion so that passing an
// expression rather than a plain identifier can't change the meaning.  OP_TXT
// must be a string literal, since it is pasted between two other literals.
#define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
    do {\
        if (!(A) || !(B)) {\
            state->error = "Syntax: <expression> " OP_TXT " <expression>";\
            yy_parse_failed(yypParser);\
            return;\
        }\
        (E) = new Query((OP), *(A), *(B));\
        delete (A);\
        delete (B);\
    } while (0)
1761
1762 }
1763
// The value carried by every terminal symbol is a pointer to a Term.
%token_type {Term *}
// Default clean-up for a terminal's value: delete the Term.
%token_destructor {delete $$;}

// Extra parameter, available to the actions below as `state`.
%extra_argument {State * state}

// Code run when the parse fails.
%parse_failure {
    // If we've not already set an error message, set a default one.
    if (!state->error) state->error = "parse error";
}

// No error recovery is attempted: any syntax error fails the whole parse.
%syntax_error {
    yy_parse_failed(yypParser);
}
1777
// Operators, grouped in order of increasing precedence (tokens declared on
// the same line share a precedence level).
//
// A rule can be given the precedence of one of these tokens explicitly by
// putting [TOKEN] after its closing "." - the grammar below does this with
// [NOT], [AND] and [ERROR].
%nonassoc ERROR.
%left OR.
%left XOR.
%left AND NOT.
%left NEAR ADJ.
%left LOVE HATE HATE_AFTER_AND SYNONYM.
1785
// Destructors for terminal symbols:
//
// Each of these deletes the Term which the token carries, which is the same
// action as the %token_destructor default.

// TERM is a query term, including prefix (if any).
%destructor TERM {delete $$;}

// GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
// is only separated by whitespace characters.
%destructor GROUP_TERM {delete $$;}

// PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
// separated only by one or more phrase generator characters (hyphen and
// apostrophe are common examples - see is_phrase_generator() for the list
// of all punctuation which does this).
%destructor PHR_TERM {delete $$;}

// WILD_TERM is like a TERM, but has a trailing wildcard which needs to be
// expanded.
%destructor WILD_TERM {delete $$;}

// PARTIAL_TERM is like a TERM, but it's at the end of the query string and
// we're doing "search as you type".  It expands to something like WILD_TERM
// OR stemmed_form.
%destructor PARTIAL_TERM {delete $$;}

// BOOLEAN_FILTER is a query term with a prefix registered using
// add_boolean_prefix().  It's added to the query using an OP_FILTER operator,
// (or OP_AND_NOT if it's negated) e.g. site:xapian.org or -site:xapian.org
%destructor BOOLEAN_FILTER {delete $$;}
1814
1815 // Grammar rules:
1816
// query - The whole query: either a single expr, or nothing at all.

// The query non-terminal doesn't carry a value (the result is stored in the
// State structure instead), so it's just given a dummy type.
%type query {int}

query ::= expr(E). {
    // Hand the parsed query back via the State structure.  A NULL expr gives
    // a default-constructed Query.
    if (!E) {
        state->query = Query();
    } else {
        state->query = *E;
        delete E;
    }
}

query ::= . {
    // A query string with no terms in also gives a default-constructed Query.
    state->query = Query();
}
1836
// expr - A query expression: either a prob_expr, or two bool_args joined by
// a boolean operator.

%type expr {Query *}
%destructor expr {delete $$;}

expr(E) ::= prob_expr(P). {
    E = P;
}

expr(E) ::= bool_arg(A) AND bool_arg(B). {
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND, B, "AND");
}

expr(E) ::= bool_arg(A) NOT bool_arg(B). {
    // When FLAG_PURE_NOT is set, a missing left-hand argument is replaced by
    // a query for the empty term:
    // 'NOT foo' -> '<alldocuments> NOT foo'
    const bool pure_not = (state->flags & QueryParser::FLAG_PURE_NOT) != 0;
    if (A == NULL && pure_not) {
        A = new Query("", 1, 0);
    }
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "NOT");
}

// "AND NOT" is treated like a plain NOT (and takes NOT's precedence).
expr(E) ::= bool_arg(A) AND NOT bool_arg(B). [NOT] {
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND NOT");
}

// AND followed by HATE_AFTER_AND also builds an OP_AND_NOT, but is reported
// as "AND" if an argument is missing.
expr(E) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND] {
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND");
}

expr(E) ::= bool_arg(A) OR bool_arg(B). {
    BOOL_OP_TO_QUERY(E, A, Query::OP_OR, B, "OR");
}

expr(E) ::= bool_arg(A) XOR bool_arg(B). {
    BOOL_OP_TO_QUERY(E, A, Query::OP_XOR, B, "XOR");
}
1867
// bool_arg - an argument to a boolean operator such as AND or OR, which is
// allowed to be missing.

%type bool_arg {Query *}
%destructor bool_arg {delete $$;}

bool_arg(A) ::= expr(E). {
    A = E;
}

bool_arg(A) ::= . [ERROR] {
    // A missing argument is represented by NULL.  That lets the
    // bool_arg-using rules in expr above report uses of AND, OR, etc which
    // don't have two arguments.
    A = NULL;
}
1881
// prob_expr - a single compound term, or a prob.

%type prob_expr {Query *}
%destructor prob_expr {delete $$;}

prob_expr(E) ::= prob(P). {
    // Start from the plain part of the ProbQuery, taking ownership of it.
    E = P->query;
    P->query = NULL;

    // Fold in any "+ terms".
    if (P->love) {
        if (E && !P->love->empty()) {
            // Swap so that the loved query ends up as the left-hand side of
            // the OP_AND_MAYBE and what we had so far as the right-hand side.
            swap(E, P->love);
            add_to_query(E, Query::OP_AND_MAYBE, P->love);
        } else {
            // Either this is +<nothing>, in which case whatever we had is
            // dropped, or there was nothing else, in which case E is NULL
            // and the delete does nothing.  Both ways the loved query
            // becomes the result.
            delete E;
            E = P->love;
        }
        P->love = NULL;
    }

    // Apply any boolean filters.
    if (!P->filter.empty()) {
        if (!E) {
            // Nothing to filter, so make the query a boolean one.
            E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
        } else {
            add_to_query(E, Query::OP_FILTER, P->merge_filters());
        }
    }

    // Subtract any "- terms".
    if (P->hate && !P->hate->empty()) {
        if (!E) {
            // Can't just hate!
            yy_parse_failed(yypParser);
            return;
        }
        *E = Query(Query::OP_AND_NOT, *E, *P->hate);
    }
    delete P;
}

prob_expr(E) ::= term(T). {
    E = T;
}
1928
// prob - a probabilistic sub-expression, built up from stop_terms, "+" terms,
// "-" terms, boolean filters, and/or ranges.
//
// Note: stop_term can also be several other things other than a simple term!

%type prob {ProbQuery *}
%destructor prob {delete $$;}

prob(P) ::= RANGE(R). {
    // Copy the name before asking R for its range query (R isn't used again
    // after that call).
    const string grouping(R->name);
    const Query range = R->as_range_query();
    P = new ProbQuery;
    P->add_filter_range(grouping, range);
}

prob(P) ::= stop_prob(Q) RANGE(R). {
    // As above, but the range is appended to an existing ProbQuery.
    const string grouping(R->name);
    const Query range = R->as_range_query();
    P = Q;
    P->append_filter_range(grouping, range);
}
1950
prob(P) ::= stop_term(T) stop_term(U). {
    P = new ProbQuery;
    P->query = T;
    // If U is a stopword (NULL), T alone is the query.
    if (U) {
        const Query::op op = state->default_op();
        if (!P->query || !is_positional(op)) {
            add_to_query(P->query, op, U);
        } else {
            // If default_op is OP_NEAR or OP_PHRASE, set the window size to
            // 11 for the first pair of terms and it will automatically grow
            // by one for each subsequent term.
            Query * pair[2] = { P->query, U };
            *(P->query) = Query(op, pair, pair + 2, 11);
            delete U;
        }
    }
}

prob(P) ::= prob(Q) stop_term(T). {
    P = Q;
    // If T is a stopword (NULL), there's nothing to add.
    if (T) {
        add_to_query(P->query, state->default_op(), T);
    }
}
1974
prob(P) ::= LOVE term(T). {
    P = new ProbQuery;
    // Only track the term as "loved" when the default op isn't already AND;
    // otherwise it simply becomes the query.
    if (state->default_op() != Query::OP_AND) {
        P->love = T;
    } else {
        P->query = T;
    }
}

prob(P) ::= stop_prob(Q) LOVE term(T). {
    P = Q;
    /* If the default op is AND, we just put loved terms into the query (in
     * this case the only effect of love is to ignore the stopword list).
     * Otherwise they're collected in the love query. */
    const bool default_is_and = (state->default_op() == Query::OP_AND);
    if (!default_is_and) {
        add_to_query(P->love, Query::OP_AND, T);
    } else {
        add_to_query(P->query, Query::OP_AND, T);
    }
}
1995
prob(P) ::= HATE term(T). {
    // A hated term with nothing before it starts a fresh ProbQuery.
    P = new ProbQuery;
    P->hate = T;
}

prob(P) ::= stop_prob(Q) HATE term(T). {
    // Further hated terms are ORed into the hate query.
    P = Q;
    add_to_query(P->hate, Query::OP_OR, T);
}

prob(P) ::= HATE BOOLEAN_FILTER(T). {
    // A hated boolean filter goes into the hate query too, rather than into
    // the filter map.
    P = new ProbQuery;
    const Query hated(T->get_query());
    P->hate = new Query(hated);
    delete T;
}

prob(P) ::= stop_prob(Q) HATE BOOLEAN_FILTER(T). {
    P = Q;
    const Query hated(T->get_query());
    add_to_query(P->hate, Query::OP_OR, hated);
    delete T;
}
2017
prob(P) ::= BOOLEAN_FILTER(T). {
    // First filter of a new ProbQuery.
    const Query filter_query(T->get_query());
    P = new ProbQuery;
    P->add_filter(T->get_grouping(), filter_query);
    delete T;
}

prob(P) ::= stop_prob(Q) BOOLEAN_FILTER(T). {
    // Another filter for an existing ProbQuery; append_filter() decides how
    // it combines with any filter already stored for the same grouping.
    const Query filter_query(T->get_query());
    P = Q;
    P->append_filter(T->get_grouping(), filter_query);
    delete T;
}
2029
prob(P) ::= LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER, so go
    // through the same ProbQuery helper as the plain BOOLEAN_FILTER rule
    // instead of writing to the filter map directly.
    P = new ProbQuery;
    P->add_filter(T->get_grouping(), T->get_query());
    delete T;
}
2036
prob(P) ::= stop_prob(Q) LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER, so let
    // append_filter() decide how the new filter combines with any existing
    // one for the same grouping, exactly as the "stop_prob BOOLEAN_FILTER"
    // rule does.  (ORing into the filter map by hand here meant the two
    // rules could disagree.)
    P = Q;
    P->append_filter(T->get_grouping(), T->get_query());
    delete T;
}
2045
// stop_prob - Either a prob, or a lone stop_term wrapped up as one.

%type stop_prob {ProbQuery *}
%destructor stop_prob {delete $$;}

stop_prob(P) ::= prob(Q). {
    P = Q;
}

stop_prob(P) ::= stop_term(T). {
    // T may be NULL here (a stopword), giving a ProbQuery with no query yet.
    P = new ProbQuery;
    P->query = T;
}
2058
// stop_term - A term which should be checked against the stopword list,
// or a compound_term.
//
// If a term is loved, hated, or in a phrase, we don't want to consult the
// stopword list, so stop_term isn't used there (instead term is).

%type stop_term {Query *}
%destructor stop_term {delete $$;}

stop_term(T) ::= TERM(U). {
    if (!state->is_stopword(U)) {
        T = new Query(U->get_query_with_auto_synonyms());
    } else {
        // A stopword is recorded in the stoplist and yields NULL.
        state->add_to_stoplist(U);
        T = NULL;
    }
    delete U;
}

stop_term(T) ::= compound_term(U). {
    T = U;
}
2081
// term - A TERM or a compound_term, with no stopword check (compare
// stop_term).

%type term {Query *}
%destructor term {delete $$;}

term(T) ::= TERM(U). {
    const Query with_synonyms(U->get_query_with_auto_synonyms());
    T = new Query(with_synonyms);
    delete U;
}

term(T) ::= compound_term(U). {
    T = U;
}
2095
// compound_term - A WILD_TERM, a PARTIAL_TERM, a quoted phrase (with or
// without prefix), a phrased_term, group, near_expr, adj_expr, a bracketed
// subexpression (with or without prefix), an explicit synonym expansion of a
// TERM, or a CJKTERM.
//
// Most of these just call a conversion method on the symbol's value and
// don't delete it afterwards.  NOTE(review): the Terms conversions are known
// to delete their object themselves; presumably the other as_*() methods do
// likewise - confirm before relying on it.

%type compound_term {Query *}
%destructor compound_term {delete $$;}

compound_term(T) ::= WILD_TERM(U). {
    T = U->as_wildcarded_query(state);
}

compound_term(T) ::= PARTIAL_TERM(U). {
    T = U->as_partial_query(state);
}

compound_term(T) ::= QUOTE phrase(P) QUOTE. {
    T = P->as_phrase_query();
}

compound_term(T) ::= phrased_term(P). {
    T = P->as_phrase_query();
}

compound_term(T) ::= group(P). {
    T = P->as_group(state);
}

compound_term(T) ::= near_expr(P). {
    T = P->as_near_query();
}

compound_term(T) ::= adj_expr(P). {
    T = P->as_adj_query();
}

compound_term(T) ::= BRA expr(E) KET. {
    T = E;
}

compound_term(T) ::= SYNONYM TERM(U). {
    const Query expanded(U->get_query_with_synonyms());
    T = new Query(expanded);
    delete U;
}

compound_term(T) ::= CJKTERM(U). {
    T = U->as_cjk_query();
}
2135
// phrase - The "inside the quotes" part of a double-quoted phrase: one or
// more TERMs and/or CJKTERMs, collected into a Terms object.

%type phrase {Terms *}

%destructor phrase {delete $$;}

phrase(P) ::= TERM(T). {
    P = Terms::create();
    P->add_positional_term(T);
}

phrase(P) ::= CJKTERM(T). {
    // A CJK term adds one positional term per character.
    P = Terms::create();
    T->as_positional_cjk_term(P);
}

phrase(P) ::= phrase(Q) TERM(T). {
    P = Q;
    P->add_positional_term(T);
}

phrase(P) ::= phrase(Q) CJKTERM(T). {
    // A CJK term adds one positional term per character.
    P = Q;
    T->as_positional_cjk_term(P);
}
2161
// phrased_term - A phrased term works like a single term, but is actually
// 2 or more terms linked together into a phrase by punctuation.  There must be
// at least 2 terms in order to be able to have punctuation between the terms,
// which is why the first rule takes a TERM and a PHR_TERM together.

%type phrased_term {Terms *}
%destructor phrased_term {delete $$;}

phrased_term(P) ::= TERM(T) PHR_TERM(U). {
    // Start a new Terms holding the first two terms, in order.
    P = Terms::create();
    P->add_positional_term(T);
    P->add_positional_term(U);
}

phrased_term(P) ::= phrased_term(Q) PHR_TERM(T). {
    // Each further PHR_TERM extends the same Terms.
    P = Q;
    P->add_positional_term(T);
}
2179
// group - A group of terms separated only by whitespace - candidates for
// multi-term synonyms.  As with phrased_term, a group always starts with two
// terms.

%type group {TermGroup *}
%destructor group {delete $$;}

group(P) ::= TERM(T) GROUP_TERM(U). {
    // Start a new TermGroup from the first two terms.
    P = TermGroup::create(T, U);
}

group(P) ::= group(Q) GROUP_TERM(T). {
    // Each further GROUP_TERM is added to the same TermGroup.
    P = Q;
    P->add_term(T);
}

group(P) ::= group(Q) EMPTY_GROUP_OK. {
    // The EMPTY_GROUP_OK token just sets the group's "empty ok" flag.
    P = Q;
    P->set_empty_ok();
}
2199
// near_expr - 2 or more terms with NEAR in between.  There must be at least 2
// terms in order for there to be any NEAR operators!
//
// A NEAR token's value may be NULL.  When it isn't, its term position is
// offered to the Terms object as a window size, and the token is deleted.

%type near_expr {Terms *}
%destructor near_expr {delete $$;}

near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
    P = Terms::create();
    P->add_positional_term(T);
    P->add_positional_term(U);
    if (N != NULL) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}

near_expr(P) ::= near_expr(Q) NEAR(N) TERM(T). {
    P = Q;
    P->add_positional_term(T);
    if (N != NULL) {
        // adjust_window() only ever widens, so the largest window given by
        // any NEAR in the chain is the one kept.
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
2224
// adj_expr - 2 or more terms with ADJ in between.  There must be at least 2
// terms in order for there to be any ADJ operators!
//
// An ADJ token's value may be NULL.  When it isn't, its term position is
// offered to the Terms object as a window size, and the token is deleted.

%type adj_expr {Terms *}
%destructor adj_expr {delete $$;}

adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
    P = Terms::create();
    P->add_positional_term(T);
    P->add_positional_term(U);
    if (N != NULL) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}

adj_expr(P) ::= adj_expr(Q) ADJ(N) TERM(T). {
    P = Q;
    P->add_positional_term(T);
    if (N != NULL) {
        // adjust_window() only ever widens, so the largest window given by
        // any ADJ in the chain is the one kept.
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
2249
2250 // Select yacc syntax highlighting in vim editor: vim: syntax=yacc
2251 // (lemon syntax colouring isn't supplied by default; yacc does an OK job).