xapian-core/queryparser/queryparser.lemony

   1 %include {
   2 /* queryparser.lemony: build a Xapian::Query object from a user query string.
   3  *
   4  * Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015,2016,2018 Olly Betts
   5  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
   6  * Copyright (C) 2010 Adam Sjøgren
   7  *
   8  * This program is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  21  * USA
  22  */
  23
  24 #include <config.h>
  25
  26 #include "queryparser_internal.h"
  27
  28 #include "api/queryinternal.h"
  29 #include "omassert.h"
  30 #include "str.h"
  31 #include "stringutils.h"
  32 #include "xapian/error.h"
  33 #include "xapian/unicode.h"
  34
  35 // Include the list of token values lemon generates.
  36 #include "queryparser_token.h"
  37
  38 #include "cjk-tokenizer.h"
  39
  40 #include <algorithm>
  41 #include <cstring>
  42 #include <limits>
  43 #include <list>
  44 #include <string>
  45 #include <vector>
  46
  47 // We create the yyParser on the stack.
  48 #define Parse_ENGINEALWAYSONSTACK
  49
  50 using namespace std;
  51
  52 using namespace Xapian;
  53
  54 inline bool
  55 U_isupper(unsigned ch) {
  56     return (ch < 128 && C_isupper(static_cast<unsigned char>(ch)));
  57 }
  58
  59 inline bool
  60 U_isdigit(unsigned ch) {
  61     return (ch < 128 && C_isdigit(static_cast<unsigned char>(ch)));
  62 }
  63
  64 inline bool
  65 U_isalpha(unsigned ch) {
  66     return (ch < 128 && C_isalpha(static_cast<unsigned char>(ch)));
  67 }
  68
  69 using Xapian::Unicode::is_whitespace;
  70
  71 inline bool
  72 is_not_whitespace(unsigned ch) {
  73     return !is_whitespace(ch);
  74 }
  75
  76 using Xapian::Unicode::is_wordchar;
  77
  78 inline bool
  79 is_not_wordchar(unsigned ch) {
  80     return !is_wordchar(ch);
  81 }
  82
  83 inline bool
  84 is_digit(unsigned ch) {
  85     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
  86 }
  87
  88 // FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
  89 // and there's the risk of hyphens getting stuck onto the end of terms...
  90 inline bool
  91 is_suffix(unsigned ch) {
  92     return ch == '+' || ch == '#';
  93 }
  94
  95 inline bool
  96 is_double_quote(unsigned ch) {
  97     // We simply treat all double quotes as equivalent, which is a bit crude,
  98     // but it isn't clear that it would actually better to require them to
  99     // match up exactly.
 100     //
 101     // 0x201c is Unicode opening double quote.
 102     // 0x201d is Unicode closing double quote.
 103     return ch == '"' || ch == 0x201c || ch == 0x201d;
 104 }
 105
 106 inline bool
 107 prefix_needs_colon(const string & prefix, unsigned ch)
 108 {
 109     if (!U_isupper(ch) && ch != ':') return false;
 110     string::size_type len = prefix.length();
 111     return (len > 1 && prefix[len - 1] != ':');
 112 }
 113
 114 using Unicode::is_currency;
 115
 116 inline bool
 117 is_positional(Xapian::Query::op op)
 118 {
 119     return (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR);
 120 }
 121
 122 class Terms;
 123
 124 /** Class used to pass information about a token from lexer to parser.
 125  *
 126  *  Generally an instance of this class carries term information, but it can be
 127  *  used for a range query, and with some operators (e.g. the distance in
 128  *  NEAR/3 or ADJ/3, etc).
 129  */
 130 class Term {
 131     State * state;
 132
 133   public:
 134     string name;
 135     const FieldInfo * field_info;
 136     string unstemmed;
 137     QueryParser::stem_strategy stem;
 138     termpos pos;
 139     Query query;
 140
 141     Term(const string &name_, termpos pos_)
 142         : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
 143     explicit Term(const string &name_)
 144         : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
 145     Term(const string &name_, const FieldInfo * field_info_)
 146         : name(name_), field_info(field_info_),
 147           stem(QueryParser::STEM_NONE), pos(0) { }
 148     explicit Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
 149     Term(State * state_, const string &name_, const FieldInfo * field_info_,
 150          const string &unstemmed_,
 151          QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
 152          termpos pos_ = 0)
 153         : state(state_), name(name_), field_info(field_info_),
 154           unstemmed(unstemmed_), stem(stem_), pos(pos_) { }
 155     // For RANGE tokens.
 156     Term(const Xapian::Query & q, const string & grouping)
 157         : name(grouping), query(q) { }
 158
 159     string make_term(const string & prefix) const;
 160
 161     void need_positions() {
 162         if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
 163     }
 164
 165     termpos get_termpos() const { return pos; }
 166
 167     string get_grouping() const {
 168         return field_info->grouping;
 169     }
 170
 171     Query * as_wildcarded_query(State * state) const;
 172
 173     /** Build a query for a term at the very end of the query string when
 174      *  FLAG_PARTIAL is in use.
 175      *
 176      *  This query should match documents containing any terms which start with
 177      *  the characters specified, but should give a higher score to exact
 178      *  matches (since the user might have finished typing - we simply don't
 179      *  know).
 180      */
 181     Query * as_partial_query(State * state_) const;
 182
 183     /** Build a query for a string of CJK characters. */
 184     Query * as_cjk_query() const;
 185
 186     /** Handle a CJK character string in a positional context. */
 187     void as_positional_cjk_term(Terms * terms) const;
 188
 189     /// Range query.
 190     Query as_range_query() const;
 191
 192     Query get_query() const;
 193
 194     Query get_query_with_synonyms() const;
 195
 196     Query get_query_with_auto_synonyms() const;
 197 };
 198
 199 /// Parser State shared between the lexer and the parser.
 200 class State {
 201     QueryParser::Internal * qpi;
 202
 203   public:
 204     Query query;
 205     const char * error;
 206     unsigned flags;
 207
 208     State(QueryParser::Internal * qpi_, unsigned flags_)
 209         : qpi(qpi_), error(NULL), flags(flags_) { }
 210
 211     string stem_term(const string &term) {
 212         return qpi->stemmer(term);
 213     }
 214
 215     void add_to_stoplist(const Term * term) {
 216         qpi->stoplist.push_back(term->name);
 217     }
 218
 219     void add_to_unstem(const string & term, const string & unstemmed) {
 220         qpi->unstem.insert(make_pair(term, unstemmed));
 221     }
 222
 223     Term * range(const string &a, const string &b) {
 224         for (auto i : qpi->rangeprocs) {
 225             Xapian::Query range_query = (i.proc)->check_range(a, b);
 226             Xapian::Query::op op = range_query.get_type();
 227             switch (op) {
 228                 case Xapian::Query::OP_INVALID:
 229                     break;
 230                 case Xapian::Query::OP_VALUE_RANGE:
 231                 case Xapian::Query::OP_VALUE_GE:
 232                 case Xapian::Query::OP_VALUE_LE:
 233                     if (i.default_grouping) {
 234                         Xapian::Internal::QueryValueBase * base =
 235                             static_cast<Xapian::Internal::QueryValueBase*>(
 236                                 range_query.internal.get());
 237                         Xapian::valueno slot = base->get_slot();
 238                         return new Term(range_query, str(slot));
 239                     }
 240                     // FALLTHRU
 241                 case Xapian::Query::LEAF_TERM:
 242                     return new Term(range_query, i.grouping);
 243                 default:
 244                     return new Term(range_query, string());
 245             }
 246         }
 247         return NULL;
 248     }
 249
 250     Query::op default_op() const { return qpi->default_op; }
 251
 252     bool is_stopword(const Term *term) const {
 253         return qpi->stopper.get() && (*qpi->stopper)(term->name);
 254     }
 255
 256     Database get_database() const {
 257         return qpi->db;
 258     }
 259
 260     const Stopper * get_stopper() const {
 261         return qpi->stopper.get();
 262     }
 263
 264     size_t stoplist_size() const {
 265         return qpi->stoplist.size();
 266     }
 267
 268     void stoplist_resize(size_t s) {
 269         qpi->stoplist.resize(s);
 270     }
 271
 272     Xapian::termcount get_max_wildcard_expansion() const {
 273         return qpi->max_wildcard_expansion;
 274     }
 275
 276     int get_max_wildcard_type() const {
 277         return qpi->max_wildcard_type;
 278     }
 279
 280     Xapian::termcount get_max_partial_expansion() const {
 281         return qpi->max_partial_expansion;
 282     }
 283
 284     int get_max_partial_type() const {
 285         return qpi->max_partial_type;
 286     }
 287 };
 288
 289 string
 290 Term::make_term(const string & prefix) const
 291 {
 292     string term;
 293     if (stem != QueryParser::STEM_NONE && stem != QueryParser::STEM_ALL)
 294         term += 'Z';
 295     if (!prefix.empty()) {
 296         term += prefix;
 297         if (prefix_needs_colon(prefix, name[0])) term += ':';
 298     }
 299     if (stem != QueryParser::STEM_NONE) {
 300         term += state->stem_term(name);
 301     } else {
 302         term += name;
 303     }
 304
 305     if (!unstemmed.empty())
 306         state->add_to_unstem(term, unstemmed);
 307     return term;
 308 }
 309
 310 // Iterator shim to allow building a synonym query from a TermIterator pair.
 311 class SynonymIterator {
 312     Xapian::TermIterator i;
 313
 314     Xapian::termpos pos;
 315
 316     const Xapian::Query * first;
 317
 318   public:
 319     SynonymIterator(const Xapian::TermIterator & i_,
 320                     Xapian::termpos pos_ = 0,
 321                     const Xapian::Query * first_ = NULL)
 322         : i(i_), pos(pos_), first(first_) { }
 323
 324     SynonymIterator & operator++() {
 325         if (first)
 326             first = NULL;
 327         else
 328             ++i;
 329         return *this;
 330     }
 331
 332     const Xapian::Query operator*() const {
 333         if (first) return *first;
 334         return Xapian::Query(*i, 1, pos);
 335     }
 336
 337     bool operator==(const SynonymIterator & o) const {
 338         return i == o.i && first == o.first;
 339     }
 340
 341     bool operator!=(const SynonymIterator & o) const {
 342         return !(*this == o);
 343     }
 344
 345     typedef std::input_iterator_tag iterator_category;
 346     typedef Xapian::Query value_type;
 347     typedef Xapian::termcount_diff difference_type;
 348     typedef Xapian::Query * pointer;
 349     typedef Xapian::Query & reference;
 350 };
 351
 352 Query
 353 Term::get_query_with_synonyms() const
 354 {
 355     // Handle single-word synonyms with each prefix.
 356     const list<string> & prefixes = field_info->prefixes;
 357     if (prefixes.empty()) {
 358         // FIXME: handle multiple here
 359         Assert(!field_info->procs.empty());
 360         return (**field_info->procs.begin())(name);
 361     }
 362
 363     Query q = get_query();
 364
 365     list<string>::const_iterator piter;
 366     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
 367         // First try the unstemmed term:
 368         string term;
 369         if (!piter->empty()) {
 370             term += *piter;
 371             if (prefix_needs_colon(*piter, name[0])) term += ':';
 372         }
 373         term += name;
 374
 375         Xapian::Database db = state->get_database();
 376         Xapian::TermIterator syn = db.synonyms_begin(term);
 377         Xapian::TermIterator end = db.synonyms_end(term);
 378         if (syn == end && stem != QueryParser::STEM_NONE) {
 379             // If that has no synonyms, try the stemmed form:
 380             term = 'Z';
 381             if (!piter->empty()) {
 382                 term += *piter;
 383                 if (prefix_needs_colon(*piter, name[0])) term += ':';
 384             }
 385             term += state->stem_term(name);
 386             syn = db.synonyms_begin(term);
 387             end = db.synonyms_end(term);
 388         }
 389         q = Query(q.OP_SYNONYM,
 390                   SynonymIterator(syn, pos, &q),
 391                   SynonymIterator(end));
 392     }
 393     return q;
 394 }
 395
 396 Query
 397 Term::get_query_with_auto_synonyms() const
 398 {
 399     const unsigned MASK_ENABLE_AUTO_SYNONYMS =
 400         QueryParser::FLAG_AUTO_SYNONYMS |
 401         QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 402     if (state->flags & MASK_ENABLE_AUTO_SYNONYMS)
 403         return get_query_with_synonyms();
 404
 405     return get_query();
 406 }
 407
 408 static void
 409 add_to_query(Query *& q, Query::op op, Query * term)
 410 {
 411     Assert(term);
 412     if (q) {
 413         *q = Query(op, *q, *term);
 414         delete term;
 415     } else {
 416         q = term;
 417     }
 418 }
 419
 420 static void
 421 add_to_query(Query *& q, Query::op op, const Query & term)
 422 {
 423     if (q) {
 424         *q = Query(op, *q, term);
 425     } else {
 426         q = new Query(term);
 427     }
 428 }
 429
 430 Query
 431 Term::get_query() const
 432 {
 433     const list<string> & prefixes = field_info->prefixes;
 434     if (prefixes.empty()) {
 435         // FIXME: handle multiple here
 436         Assert(!field_info->procs.empty());
 437         return (**field_info->procs.begin())(name);
 438     }
 439     list<string>::const_iterator piter = prefixes.begin();
 440     Query q(make_term(*piter), 1, pos);
 441     while (++piter != prefixes.end()) {
 442         q = Query(Query::OP_OR, q, Query(make_term(*piter), 1, pos));
 443     }
 444     return q;
 445 }
 446
 447 Query *
 448 Term::as_wildcarded_query(State * state_) const
 449 {
 450     const list<string> & prefixes = field_info->prefixes;
 451     list<string>::const_iterator piter;
 452     Xapian::termcount max = state_->get_max_wildcard_expansion();
 453     int max_type = state_->get_max_wildcard_type();
 454     vector<Query> subqs;
 455     subqs.reserve(prefixes.size());
 456     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
 457         string root = *piter;
 458         root += name;
 459         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 460         subqs.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
 461                               Query::OP_OR));
 462     }
 463     Query * q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
 464     delete this;
 465     return q;
 466 }
 467
 468 Query *
 469 Term::as_partial_query(State * state_) const
 470 {
 471     Xapian::termcount max = state_->get_max_partial_expansion();
 472     int max_type = state_->get_max_partial_type();
 473     vector<Query> subqs_partial; // A synonym of all the partial terms.
 474     vector<Query> subqs_full; // A synonym of all the full terms.
 475
 476     const list<string> & prefixes = field_info->prefixes;
 477     list<string>::const_iterator piter;
 478     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
 479         string root = *piter;
 480         root += name;
 481         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 482         subqs_partial.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
 483                                       Query::OP_OR));
 484         // Add the term, as it would normally be handled, as an alternative.
 485         subqs_full.push_back(Query(make_term(*piter), 1, pos));
 486     }
 487     Query * q = new Query(Query::OP_OR,
 488                           Query(Query::OP_SYNONYM,
 489                                 subqs_partial.begin(), subqs_partial.end()),
 490                           Query(Query::OP_SYNONYM,
 491                                 subqs_full.begin(), subqs_full.end()));
 492     delete this;
 493     return q;
 494 }
 495
 496 Query *
 497 Term::as_cjk_query() const
 498 {
 499     vector<Query> prefix_subqs;
 500     vector<Query> cjk_subqs;
 501     const list<string> & prefixes = field_info->prefixes;
 502     list<string>::const_iterator piter;
 503     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
 504         const string& prefix = *piter;
 505         for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
 506             cjk_subqs.push_back(Query(prefix + *tk, 1, pos));
 507         }
 508         prefix_subqs.push_back(Query(Query::OP_AND,
 509                                      cjk_subqs.begin(), cjk_subqs.end()));
 510         cjk_subqs.clear();
 511     }
 512     Query * q = new Query(Query::OP_OR,
 513                           prefix_subqs.begin(), prefix_subqs.end());
 514     delete this;
 515     return q;
 516 }
 517
 518 Query
 519 Term::as_range_query() const
 520 {
 521     Query q = query;
 522     delete this;
 523     return q;
 524 }
 525
 526 inline bool
 527 is_phrase_generator(unsigned ch)
 528 {
 529     // These characters generate a phrase search.
 530     // Ordered mostly by frequency of calls to this function done when
 531     // running the testcases in api_queryparser.cc.
 532     return (ch && ch < 128 && strchr(".-/:\\@", ch) != NULL);
 533 }
 534
 535 inline bool
 536 is_stem_preventer(unsigned ch)
 537 {
 538     return (ch && ch < 128 && strchr("(/\\@<>=*[{\"", ch) != NULL);
 539 }
 540
 541 inline bool
 542 should_stem(const string & term)
 543 {
 544     const unsigned int SHOULD_STEM_MASK =
 545         (1 << Unicode::LOWERCASE_LETTER) |
 546         (1 << Unicode::TITLECASE_LETTER) |
 547         (1 << Unicode::MODIFIER_LETTER) |
 548         (1 << Unicode::OTHER_LETTER);
 549     Utf8Iterator u(term);
 550     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
 551 }
 552
 553 /** Value representing "ignore this" when returned by check_infix() or
 554  *  check_infix_digit().
 555  */
 556 const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
 557
 558 inline unsigned check_infix(unsigned ch) {
 559     if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
 560         // Unicode includes all these except '&' in its word boundary rules,
 561         // as well as 0x2019 (which we handle below) and ':' (for Swedish
 562         // apparently, but we ignore this for now as it's problematic in
 563         // real world cases).
 564         return ch;
 565     }
 566     if (ch >= 0x200b) {
 567         // 0x2019 is Unicode apostrophe and single closing quote.
 568         // 0x201b is Unicode single opening quote with the tail rising.
 569         if (ch == 0x2019 || ch == 0x201b)
 570             return '\'';
 571         if (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff)
 572             return UNICODE_IGNORE;
 573     }
 574     return 0;
 575 }
 576
 577 inline unsigned check_infix_digit(unsigned ch) {
 578     // This list of characters comes from Unicode's word identifying algorithm.
 579     switch (ch) {
 580         case ',':
 581         case '.':
 582         case ';':
 583         case 0x037e: // GREEK QUESTION MARK
 584         case 0x0589: // ARMENIAN FULL STOP
 585         case 0x060D: // ARABIC DATE SEPARATOR
 586         case 0x07F8: // NKO COMMA
 587         case 0x2044: // FRACTION SLASH
 588         case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
 589         case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
 590         case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
 591             return ch;
 592     }
 593     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
 594         return UNICODE_IGNORE;
 595     return 0;
 596 }
 597
 598 // Prototype a function lemon generates, but which we want to call before that
 599 // in the generated source code file.
 600 struct yyParser;
 601 static void yy_parse_failed(yyParser *);
 602
 603 void
 604 QueryParser::Internal::add_prefix(const string &field, const string &prefix)
 605 {
 606     map<string, FieldInfo>::iterator p = field_map.find(field);
 607     if (p == field_map.end()) {
 608         field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, prefix)));
 609     } else {
 610         // Check that this is the same type of filter as the existing one(s).
 611         if (p->second.type != NON_BOOLEAN) {
 612             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
 613         }
 614         if (!p->second.procs.empty())
 615             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 616         p->second.prefixes.push_back(prefix);
 617    }
 618 }
 619
 620 void
 621 QueryParser::Internal::add_prefix(const string &field, FieldProcessor *proc)
 622 {
 623     map<string, FieldInfo>::iterator p = field_map.find(field);
 624     if (p == field_map.end()) {
 625         field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, proc)));
 626     } else {
 627         // Check that this is the same type of filter as the existing one(s).
 628         if (p->second.type != NON_BOOLEAN) {
 629             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
 630         }
 631         if (!p->second.prefixes.empty())
 632             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 633         throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
 634         // p->second.procs.push_back(proc);
 635    }
 636 }
 637
 638 void
 639 QueryParser::Internal::add_boolean_prefix(const string &field,
 640                                           const string &prefix,
 641                                           const string* grouping)
 642 {
 643     // Don't allow the empty prefix to be set as boolean as it doesn't
 644     // really make sense.
 645     if (field.empty())
 646         throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
 647     if (!grouping) grouping = &field;
 648     filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
 649     map<string, FieldInfo>::iterator p = field_map.find(field);
 650     if (p == field_map.end()) {
 651         field_map.insert(make_pair(field, FieldInfo(type, prefix, *grouping)));
 652     } else {
 653         // Check that this is the same type of filter as the existing one(s).
 654         if (p->second.type != type) {
 655             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
 656         }
 657         if (!p->second.procs.empty())
 658             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 659         p->second.prefixes.push_back(prefix); // FIXME grouping
 660    }
 661 }
 662
 663 void
 664 QueryParser::Internal::add_boolean_prefix(const string &field,
 665                                           FieldProcessor *proc,
 666                                           const string* grouping)
 667 {
 668     // Don't allow the empty prefix to be set as boolean as it doesn't
 669     // really make sense.
 670     if (field.empty())
 671         throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
 672     if (!grouping) grouping = &field;
 673     filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
 674     map<string, FieldInfo>::iterator p = field_map.find(field);
 675     if (p == field_map.end()) {
 676         field_map.insert(make_pair(field, FieldInfo(type, proc, *grouping)));
 677     } else {
 678         // Check that this is the same type of filter as the existing one(s).
 679         if (p->second.type != type) {
 680             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
 681         }
 682         if (!p->second.prefixes.empty())
 683             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 684         throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
 685         // p->second.procs.push_back(proc);
 686    }
 687 }
 688
/** Extract the next term from the query string.
 *
 *  Three forms are tried in order: an acronym written as upper case initials
 *  separated by '.', a run of CJK characters (only if @a cjk_ngram is true),
 *  and finally an ordinary word, which may contain certain infix characters
 *  and may be followed by up to three suffix characters.
 *
 *  @param it           Iterator at the start of the term; advanced past
 *                      whatever is consumed.
 *  @param end          End iterator for the query string.
 *  @param cjk_ngram    True if CJK text should be split off and handled
 *                      separately.
 *  @param is_cjk_term  Set to true if the term was obtained from
 *                      CJK::get_cjk().  It is not modified otherwise, so
 *                      the caller needs to initialise it.
 *  @param was_acronym  Set to true if an acronym was recognised, false
 *                      otherwise.
 *
 *  @return The text of the term.
 */
string
QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
                                  bool cjk_ngram, bool & is_cjk_term,
                                  bool &was_acronym)
{
    string term;
    // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
    // Don't worry if there's a trailing '.' or not.
    if (U_isupper(*it)) {
        string t;
        Utf8Iterator p = it;
        // Collect just the letters into t - the '.' between them are stepped
        // over without being appended.
        do {
            Unicode::append_utf8(t, *p++);
        } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
        // One letter does not make an acronym!  If we handled a single
        // uppercase letter here, we wouldn't catch M&S below.
        if (t.length() > 1) {
            // Check there's not a (lower case) letter or digit
            // immediately after it.
            // FIXME: should I.B.M..P.T.O be a range search?
            if (p == end || !is_wordchar(*p)) {
                // Accept the acronym and consume it from the input.
                it = p;
                swap(term, t);
            }
        }
    }
    was_acronym = !term.empty();

    // Not an acronym, so see if this is the start of a run of CJK text.
    if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
        term = CJK::get_cjk(it);
        is_cjk_term = true;
    }

    // Otherwise parse an ordinary word.
    if (term.empty()) {
        // The first character is appended without any check here.
        // NOTE(review): presumably the caller only calls this when *it
        // starts a term - confirm.
        unsigned prevch = *it;
        Unicode::append_utf8(term, prevch);
        while (++it != end) {
            // A CJK character ends the word so it can be handled separately.
            if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
            unsigned ch = *it;
            if (!is_wordchar(ch)) {
                // Treat a single embedded '&' or "'" or similar as a word
                // character (e.g. AT&T, Fred's).  Also, normalise
                // apostrophes to ASCII apostrophe.
                Utf8Iterator p = it;
                ++p;
                // It's only embedded if a word character follows it.
                if (p == end || !is_wordchar(*p)) break;
                unsigned nextch = *p;
                // Between two digits a different set of infix characters
                // is allowed.
                if (is_digit(prevch) && is_digit(nextch)) {
                    ch = check_infix_digit(ch);
                } else {
                    ch = check_infix(ch);
                }
                // 0 means this isn't a valid infix character, so the word
                // ends here.
                if (!ch) break;
                // Skip this character without adding it to the term (and
                // without updating prevch).
                if (ch == UNICODE_IGNORE)
                    continue;
            }
            Unicode::append_utf8(term, ch);
            prevch = ch;
        }
        if (it != end && is_suffix(*it)) {
            string suff_term = term;
            Utf8Iterator p = it;
            // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
            // At most three suffix characters are kept - if there are more
            // then suff_term is emptied and the suffix is abandoned.
            do {
                if (suff_term.size() - term.size() == 3) {
                    suff_term.resize(0);
                    break;
                }
                suff_term += *p;
            } while (is_suffix(*++p));
            // The suffix must not be immediately followed by a word
            // character.
            if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
                // If the suffixed term doesn't exist, check that the
                // non-suffixed term does.  This also takes care of
                // the case when QueryParser::set_database() hasn't
                // been called.
                bool use_suff_term = false;
                string lc = Unicode::tolower(suff_term);
                if (db.term_exists(lc)) {
                    use_suff_term = true;
                } else {
                    lc = Unicode::tolower(term);
                    // Neither form exists, so keep the suffix.
                    if (!db.term_exists(lc)) use_suff_term = true;
                }
                if (use_suff_term) {
                    // Consume the suffix characters from the input too.
                    term = suff_term;
                    it = p;
                }
            }
        }
    }
    return term;
}
 781
 782 }
 783 // Switch to %code to insert at the end of the file so struct yyParser has been
 784 // defined.
 785 %code {
 786
 787 Query
 788 QueryParser::Internal::parse_query(const string &qs, unsigned flags,
 789                                    const string &default_prefix)
 790 {
 791     bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();
 792
 793     // Set ranges if we may have to handle ranges in the query.
 794     bool ranges = !rangeprocs.empty() && (qs.find("..") != string::npos);
 795
 796     termpos term_pos = 1;
 797     Utf8Iterator it(qs), end;
 798
 799     State state(this, flags);
 800
 801     // To successfully apply more than one spelling correction to a query
 802     // string, we must keep track of the offset due to previous corrections.
 803     int correction_offset = 0;
 804     corrected_query.resize(0);
 805
 806     // Stack of prefixes, used for phrases and subexpressions.
 807     list<const FieldInfo *> prefix_stack;
 808
 809     // If default_prefix is specified, use it.  Otherwise, use any list
 810     // that has been set for the empty prefix.
 811     const FieldInfo def_pfx(NON_BOOLEAN, default_prefix);
 812     {
 813         const FieldInfo * default_field_info = &def_pfx;
 814         if (default_prefix.empty()) {
 815             auto f = field_map.find(string());
 816             if (f != field_map.end()) default_field_info = &(f->second);
 817         }
 818
 819         // We always have the current prefix on the top of the stack.
 820         prefix_stack.push_back(default_field_info);
 821     }
 822
 823     yyParser parser;
 824
 825     unsigned newprev = ' ';
 826 main_lex_loop:
 827     enum {
 828         DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
 829         IN_GROUP2, EXPLICIT_SYNONYM
 830     } mode = DEFAULT;
 831     while (it != end && !state.error) {
 832         bool last_was_operator = false;
 833         bool last_was_operator_needing_term = false;
 834         if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
 835         if (false) {
 836 just_had_operator:
 837             if (it == end) break;
 838             mode = DEFAULT;
 839             last_was_operator_needing_term = false;
 840             last_was_operator = true;
 841         }
 842         if (false) {
 843 just_had_operator_needing_term:
 844             last_was_operator_needing_term = true;
 845             last_was_operator = true;
 846         }
 847         if (mode == IN_PHRASED_TERM) mode = DEFAULT;
 848         if (is_whitespace(*it)) {
 849             newprev = ' ';
 850             ++it;
 851             it = find_if(it, end, is_not_whitespace);
 852             if (it == end) break;
 853         }
 854
 855         if (ranges &&
 856             (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
 857             // Scan forward to see if this could be the "start of range"
 858             // token.  Sadly this has O(n^2) tendencies, though at least
 859             // "n" is the number of words in a query which is likely to
 860             // remain fairly small.  FIXME: can we tokenise more elegantly?
 861             Utf8Iterator it_initial = it;
 862             Utf8Iterator p = it;
 863             unsigned ch = 0;
 864             while (p != end) {
 865                 if (ch == '.' && *p == '.') {
 866                     string a;
 867                     while (it != p) {
 868                         Unicode::append_utf8(a, *it++);
 869                     }
 870                     // Trim off the trailing ".".
 871                     a.resize(a.size() - 1);
 872                     ++p;
 873                     // Either end of the range can be empty (for an open-ended
 874                     // range) but both can't be empty.
 875                     if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
 876                         string b;
 877                         // Allow any character except whitespace and ')' in the
 878                         // upper bound.
 879                         while (p != end && *p > ' ' && *p != ')') {
 880                             Unicode::append_utf8(b, *p++);
 881                         }
 882                         Term * range = state.range(a, b);
 883                         if (!range) {
 884                             state.error = "Unknown range operation";
 885                             if (a.find(':', 1) == string::npos) {
 886                                 goto done;
 887                             }
 888                             // Might be a boolean filter with ".." in.  Leave
 889                             // state.error in case it isn't.
 890                             it = it_initial;
 891                             break;
 892                         }
 893                         Parse(&parser, RANGE, range, &state);
 894                     }
 895                     it = p;
 896                     goto main_lex_loop;
 897                 }
 898                 ch = *p;
 899                 // Allow any character except whitespace and '(' in the lower
 900                 // bound.
 901                 if (ch <= ' ' || ch == '(') break;
 902                 ++p;
 903             }
 904         }
 905
 906         if (!is_wordchar(*it)) {
 907             unsigned prev = newprev;
 908             unsigned ch = *it++;
 909             newprev = ch;
 910             // Drop out of IN_GROUP mode.
 911             if (mode == IN_GROUP || mode == IN_GROUP2)
 912                 mode = DEFAULT;
 913             switch (ch) {
 914               case '"':
 915               case 0x201c: // Left curly double quote.
 916               case 0x201d: // Right curly double quote.
 917                 // Quoted phrase.
 918                 if (mode == DEFAULT) {
 919                     // Skip whitespace.
 920                     it = find_if(it, end, is_not_whitespace);
 921                     if (it == end) {
 922                         // Ignore an unmatched " at the end of the query to
 923                         // avoid generating an empty pair of QUOTEs which will
 924                         // cause a parse error.
 925                         goto done;
 926                     }
 927                     if (is_double_quote(*it)) {
 928                         // Ignore empty "" (but only if we're not already
 929                         // IN_QUOTES as we don't merge two adjacent quoted
 930                         // phrases!)
 931                         newprev = *it++;
 932                         break;
 933                     }
 934                 }
 935                 if (flags & QueryParser::FLAG_PHRASE) {
 936                     Parse(&parser, QUOTE, NULL, &state);
 937                     if (mode == DEFAULT) {
 938                         mode = IN_QUOTES;
 939                     } else {
 940                         // Remove the prefix we pushed for this phrase.
 941                         if (mode == IN_PREFIXED_QUOTES)
 942                             prefix_stack.pop_back();
 943                         mode = DEFAULT;
 944                     }
 945                 }
 946                 break;
 947
 948               case '+': case '-': // Loved or hated term/phrase/subexpression.
 949                 // Ignore + or - at the end of the query string.
 950                 if (it == end) goto done;
 951                 if (prev > ' ' && prev != '(') {
 952                     // Or if not after whitespace or an open bracket.
 953                     break;
 954                 }
 955                 if (is_whitespace(*it) || *it == '+' || *it == '-') {
 956                     // Ignore + or - followed by a space, or further + or -.
 957                     // Postfix + (such as in C++ and H+) is handled as part of
 958                     // the term lexing code in parse_term().
 959                     newprev = *it++;
 960                     break;
 961                 }
 962                 if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
 963                     int token;
 964                     if (ch == '+') {
 965                         token = LOVE;
 966                     } else if (last_was_operator) {
 967                         token = HATE_AFTER_AND;
 968                     } else {
 969                         token = HATE;
 970                     }
 971                     Parse(&parser, token, NULL, &state);
 972                     goto just_had_operator_needing_term;
 973                 }
 974                 // Need to prevent the term after a LOVE or HATE starting a
 975                 // term group...
 976                 break;
 977
 978               case '(': // Bracketed subexpression.
 979                 // Skip whitespace.
 980                 it = find_if(it, end, is_not_whitespace);
 981                 // Ignore ( at the end of the query string.
 982                 if (it == end) goto done;
 983                 if (prev > ' ' && strchr("()+-", prev) == NULL) {
 984                     // Or if not after whitespace or a bracket or '+' or '-'.
 985                     break;
 986                 }
 987                 if (*it == ')') {
 988                     // Ignore empty ().
 989                     newprev = *it++;
 990                     break;
 991                 }
 992                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
 993                     prefix_stack.push_back(prefix_stack.back());
 994                     Parse(&parser, BRA, NULL, &state);
 995                 }
 996                 break;
 997
 998               case ')': // End of bracketed subexpression.
 999                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1000                     // Remove the prefix we pushed for the corresponding BRA.
1001                     // If brackets are unmatched, it's a syntax error, but
1002                     // that's no excuse to SEGV!
1003                     if (prefix_stack.size() > 1) prefix_stack.pop_back();
1004                     Parse(&parser, KET, NULL, &state);
1005                 }
1006                 break;
1007
1008               case '~': // Synonym expansion.
1009                 // Ignore at the end of the query string.
1010                 if (it == end) goto done;
1011                 if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
1012                     if (prev > ' ' && strchr("+-(", prev) == NULL) {
1013                         // Or if not after whitespace, +, -, or an open bracket.
1014                         break;
1015                     }
1016                     if (!is_wordchar(*it)) {
1017                         // Ignore if not followed by a word character.
1018                         break;
1019                     }
1020                     Parse(&parser, SYNONYM, NULL, &state);
1021                     mode = EXPLICIT_SYNONYM;
1022                     goto just_had_operator_needing_term;
1023                 }
1024                 break;
1025             }
1026             // Skip any other characters.
1027             continue;
1028         }
1029
1030         Assert(is_wordchar(*it));
1031
1032         size_t term_start_index = it.raw() - qs.data();
1033
1034         newprev = 'A'; // Any letter will do...
1035
1036         // A term, a prefix, or a boolean operator.
1037         const FieldInfo * field_info = NULL;
1038         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
1039             !field_map.empty()) {
1040             // Check for a fieldname prefix (e.g. title:historical).
1041             Utf8Iterator p = find_if(it, end, is_not_wordchar);
1042             if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
1043                 string field;
1044                 p = it;
1045                 while (*p != ':')
1046                     Unicode::append_utf8(field, *p++);
1047                 map<string, FieldInfo>::const_iterator f;
1048                 f = field_map.find(field);
1049                 if (f != field_map.end()) {
1050                     // Special handling for prefixed fields, depending on the
1051                     // type of the prefix.
1052                     unsigned ch = *++p;
1053                     field_info = &(f->second);
1054
1055                     if (field_info->type != NON_BOOLEAN) {
1056                         // Drop out of IN_GROUP if we're in it.
1057                         if (mode == IN_GROUP || mode == IN_GROUP2)
1058                             mode = DEFAULT;
1059                         it = p;
1060                         string name;
1061                         if (it != end && is_double_quote(*it)) {
1062                             // Quoted boolean term (can contain any character).
1063                             bool fancy = (*it != '"');
1064                             ++it;
1065                             while (it != end) {
1066                                 if (*it == '"') {
1067                                     // Interpret "" as an escaped ".
1068                                     if (++it == end || *it != '"')
1069                                         break;
1070                                 } else if (fancy && is_double_quote(*it)) {
1071                                     // If the opening quote was ASCII, then the
1072                                     // closing one must be too - otherwise
1073                                     // the user can't protect non-ASCII double
1074                                     // quote characters by quoting or escaping.
1075                                     ++it;
1076                                     break;
1077                                 }
1078                                 Unicode::append_utf8(name, *it++);
1079                             }
1080                         } else {
1081                             // Can't boolean filter prefix a subexpression, so
1082                             // just use anything following the prefix until the
1083                             // next space or ')' as part of the boolean filter
1084                             // term.
1085                             while (it != end && *it > ' ' && *it != ')')
1086                                 Unicode::append_utf8(name, *it++);
1087                         }
1088                         // Build the unstemmed form in field.
1089                         field += ':';
1090                         field += name;
1091                         // Clear any pending range error.
1092                         state.error = NULL;
1093                         Term * token = new Term(&state, name, field_info, field);
1094                         Parse(&parser, BOOLEAN_FILTER, token, &state);
1095                         continue;
1096                     }
1097
1098                     if ((flags & FLAG_PHRASE) && is_double_quote(ch)) {
1099                         // Prefixed phrase, e.g.: subject:"space flight"
1100                         mode = IN_PREFIXED_QUOTES;
1101                         Parse(&parser, QUOTE, NULL, &state);
1102                         it = p;
1103                         newprev = ch;
1104                         ++it;
1105                         prefix_stack.push_back(field_info);
1106                         continue;
1107                     }
1108
1109                     if (ch == '(' && (flags & FLAG_BOOLEAN)) {
1110                         // Prefixed subexpression, e.g.: title:(fast NEAR food)
1111                         mode = DEFAULT;
1112                         Parse(&parser, BRA, NULL, &state);
1113                         it = p;
1114                         newprev = ch;
1115                         ++it;
1116                         prefix_stack.push_back(field_info);
1117                         continue;
1118                     }
1119
1120                     if (ch != ':') {
1121                         // Allow 'path:/usr/local' but not 'foo::bar::baz'.
1122                         while (is_phrase_generator(ch)) {
1123                             if (++p == end)
1124                                 goto not_prefix;
1125                             ch = *p;
1126                         }
1127                     }
1128
1129                     if (is_wordchar(ch)) {
1130                         // Prefixed term.
1131                         it = p;
1132                     } else {
1133 not_prefix:
1134                         // It looks like a prefix but isn't, so parse it as
1135                         // text instead.
1136                         field_info = NULL;
1137                     }
1138                 }
1139             }
1140         }
1141
1142 phrased_term:
1143         bool was_acronym;
1144         bool is_cjk_term = false;
1145         string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
1146
1147         // Boolean operators.
1148         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
1149             (flags & FLAG_BOOLEAN) &&
1150             // Don't want to interpret A.N.D. as an AND operator.
1151             !was_acronym &&
1152             !field_info &&
1153             term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {
1154
1155             string op = term;
1156             if (flags & FLAG_BOOLEAN_ANY_CASE) {
1157                 for (string::iterator i = op.begin(); i != op.end(); ++i) {
1158                     *i = C_toupper(*i);
1159                 }
1160             }
1161             if (op.size() == 3) {
1162                 if (op == "AND") {
1163                     Parse(&parser, AND, NULL, &state);
1164                     goto just_had_operator;
1165                 }
1166                 if (op == "NOT") {
1167                     Parse(&parser, NOT, NULL, &state);
1168                     goto just_had_operator;
1169                 }
1170                 if (op == "XOR") {
1171                     Parse(&parser, XOR, NULL, &state);
1172                     goto just_had_operator;
1173                 }
1174                 if (op == "ADJ") {
1175                     if (it != end && *it == '/') {
1176                         size_t width = 0;
1177                         Utf8Iterator p = it;
1178                         while (++p != end && U_isdigit(*p)) {
1179                             width = (width * 10) + (*p - '0');
1180                         }
1181                         if (width && (p == end || is_whitespace(*p))) {
1182                             it = p;
1183                             Parse(&parser, ADJ, new Term(width), &state);
1184                             goto just_had_operator;
1185                         }
1186                     } else {
1187                         Parse(&parser, ADJ, NULL, &state);
1188                         goto just_had_operator;
1189                     }
1190                 }
1191             } else if (op.size() == 2) {
1192                 if (op == "OR") {
1193                     Parse(&parser, OR, NULL, &state);
1194                     goto just_had_operator;
1195                 }
1196             } else if (op.size() == 4) {
1197                 if (op == "NEAR") {
1198                     if (it != end && *it == '/') {
1199                         size_t width = 0;
1200                         Utf8Iterator p = it;
1201                         while (++p != end && U_isdigit(*p)) {
1202                             width = (width * 10) + (*p - '0');
1203                         }
1204                         if (width && (p == end || is_whitespace(*p))) {
1205                             it = p;
1206                             Parse(&parser, NEAR, new Term(width), &state);
1207                             goto just_had_operator;
1208                         }
1209                     } else {
1210                         Parse(&parser, NEAR, NULL, &state);
1211                         goto just_had_operator;
1212                     }
1213                 }
1214             }
1215         }
1216
1217         // If no prefix is set, use the default one.
1218         if (!field_info) field_info = prefix_stack.back();
1219
1220         Assert(field_info->type == NON_BOOLEAN);
1221
1222         {
1223             string unstemmed_term(term);
1224             term = Unicode::tolower(term);
1225
1226             // Reuse stem_strategy - STEM_SOME here means "stem terms except
1227             // when used with positional operators".
1228             stem_strategy stem_term = stem_action;
1229             if (stem_term != STEM_NONE) {
1230                 if (!stemmer.internal.get()) {
1231                     // No stemmer is set.
1232                     stem_term = STEM_NONE;
1233                 } else if (stem_term == STEM_SOME ||
1234                            stem_term == STEM_SOME_FULL_POS) {
1235                     if (!should_stem(unstemmed_term) ||
1236                         (it != end && is_stem_preventer(*it))) {
1237                         // Don't stem this particular term.
1238                         stem_term = STEM_NONE;
1239                     }
1240                 }
1241             }
1242
1243             Term * term_obj = new Term(&state, term, field_info,
1244                                        unstemmed_term, stem_term, term_pos++);
1245
1246             if (is_cjk_term) {
1247                 Parse(&parser, CJKTERM, term_obj, &state);
1248                 if (it == end) break;
1249                 continue;
1250             }
1251
1252             if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1253                 if (it != end) {
1254                     if ((flags & FLAG_WILDCARD) && *it == '*') {
1255                         Utf8Iterator p(it);
1256                         ++p;
1257                         if (p == end || !is_wordchar(*p)) {
1258                             it = p;
1259                             if (mode == IN_GROUP || mode == IN_GROUP2) {
1260                                 // Drop out of IN_GROUP and flag that the group
1261                                 // can be empty if all members are stopwords.
1262                                 if (mode == IN_GROUP2)
1263                                     Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1264                                 mode = DEFAULT;
1265                             }
1266                             // Wildcard at end of term (also known as
1267                             // "right truncation").
1268                             Parse(&parser, WILD_TERM, term_obj, &state);
1269                             continue;
1270                         }
1271                     }
1272                 } else {
1273                     if (flags & FLAG_PARTIAL) {
1274                         if (mode == IN_GROUP || mode == IN_GROUP2) {
1275                             // Drop out of IN_GROUP and flag that the group
1276                             // can be empty if all members are stopwords.
1277                             if (mode == IN_GROUP2)
1278                                 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1279                             mode = DEFAULT;
1280                         }
1281                         // Final term of a partial match query, with no
1282                         // following characters - treat as a wildcard.
1283                         Parse(&parser, PARTIAL_TERM, term_obj, &state);
1284                         continue;
1285                     }
1286                 }
1287             }
1288
1289             // Check spelling, if we're a normal term, and any of the prefixes
1290             // are empty.
1291             if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
1292                 const list<string> & pfxes = field_info->prefixes;
1293                 list<string>::const_iterator pfx_it;
1294                 for (pfx_it = pfxes.begin(); pfx_it != pfxes.end(); ++pfx_it) {
1295                     if (!pfx_it->empty())
1296                         continue;
1297                     const string & suggest = db.get_spelling_suggestion(term);
1298                     if (!suggest.empty()) {
1299                         if (corrected_query.empty()) corrected_query = qs;
1300                         size_t term_end_index = it.raw() - qs.data();
1301                         size_t n = term_end_index - term_start_index;
1302                         size_t pos = term_start_index + correction_offset;
1303                         corrected_query.replace(pos, n, suggest);
1304                         correction_offset += suggest.size();
1305                         correction_offset -= n;
1306                     }
1307                     break;
1308                 }
1309             }
1310
1311             if (mode == IN_PHRASED_TERM) {
1312                 Parse(&parser, PHR_TERM, term_obj, &state);
1313             } else {
1314                 // See if the next token will be PHR_TERM - if so, this one
1315                 // needs to be TERM not GROUP_TERM.
1316                 if ((mode == IN_GROUP || mode == IN_GROUP2) &&
1317                     is_phrase_generator(*it)) {
1318                     // FIXME: can we clean this up?
1319                     Utf8Iterator p = it;
1320                     do {
1321                         ++p;
1322                     } while (p != end && is_phrase_generator(*p));
1323                     // Don't generate a phrase unless the phrase generators are
1324                     // immediately followed by another term.
1325                     if (p != end && is_wordchar(*p)) {
1326                         mode = DEFAULT;
1327                     }
1328                 }
1329
1330                 int token = TERM;
1331                 if (mode == IN_GROUP || mode == IN_GROUP2) {
1332                     mode = IN_GROUP2;
1333                     token = GROUP_TERM;
1334                 }
1335                 Parse(&parser, token, term_obj, &state);
1336                 if (token == TERM && mode != DEFAULT)
1337                     continue;
1338             }
1339         }
1340
1341         if (it == end) break;
1342
1343         if (is_phrase_generator(*it)) {
1344             // Skip multiple phrase generators.
1345             do {
1346                 ++it;
1347             } while (it != end && is_phrase_generator(*it));
1348             // Don't generate a phrase unless the phrase generators are
1349             // immediately followed by another term.
1350             if (it != end && is_wordchar(*it)) {
1351                 mode = IN_PHRASED_TERM;
1352                 term_start_index = it.raw() - qs.data();
1353                 goto phrased_term;
1354             }
1355         } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1356             int old_mode = mode;
1357             mode = DEFAULT;
1358             if (!last_was_operator_needing_term && is_whitespace(*it)) {
1359                 newprev = ' ';
1360                 // Skip multiple whitespace.
1361                 do {
1362                     ++it;
1363                 } while (it != end && is_whitespace(*it));
1364                 // Don't generate a group unless the terms are only separated
1365                 // by whitespace.
1366                 if (it != end && is_wordchar(*it)) {
1367                     if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
1368                         mode = IN_GROUP2;
1369                     } else {
1370                         mode = IN_GROUP;
1371                     }
1372                 }
1373             }
1374         }
1375     }
1376 done:
1377     if (!state.error) {
1378         // Implicitly close any unclosed quotes.
1379         if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
1380             Parse(&parser, QUOTE, NULL, &state);
1381
1382         // Implicitly close all unclosed brackets.
1383         while (prefix_stack.size() > 1) {
1384             Parse(&parser, KET, NULL, &state);
1385             prefix_stack.pop_back();
1386         }
1387         Parse(&parser, 0, NULL, &state);
1388     }
1389
1390     errmsg = state.error;
1391     return state.query;
1392 }
1393
1394 }
1395 %include {
1396
1397 struct ProbQuery {
1398     Query* query = NULL;
1399     Query* love = NULL;
1400     Query* hate = NULL;
1401     // filter is a map from prefix to a query for that prefix.  Queries with
1402     // the same prefix are combined with OR, and the results of this are
1403     // combined with AND to get the full filter.
1404     map<string, Query> filter;
1405
1406     ProbQuery() {}
1407
1408     explicit
1409     ProbQuery(Query* query_) : query(query_) {}
1410
1411     ~ProbQuery() {
1412         delete query;
1413         delete love;
1414         delete hate;
1415     }
1416
1417     void add_filter(const string& grouping, const Query & q) {
1418         filter[grouping] = q;
1419     }
1420
1421     void append_filter(const string& grouping, const Query & qnew) {
1422         auto it = filter.find(grouping);
1423         if (it == filter.end()) {
1424             filter.insert(make_pair(grouping, qnew));
1425         } else {
1426             Query & q = it->second;
1427             // We OR multiple filters with the same prefix if they're
1428             // exclusive, otherwise we AND them.
1429             bool exclusive = !grouping.empty();
1430             Query::op op = exclusive ? Query::OP_OR : Query::OP_AND;
1431             q = Query(op, q, qnew);
1432         }
1433     }
1434
1435     void add_filter_range(const string& grouping, const Query & range) {
1436         filter[grouping] = range;
1437     }
1438
1439     void append_filter_range(const string& grouping, const Query & range) {
1440         Query & q = filter[grouping];
1441         q = Query(Query::OP_OR, q, range);
1442     }
1443
1444     Query merge_filters() const {
1445         auto i = filter.begin();
1446         Assert(i != filter.end());
1447         Query q = i->second;
1448         while (++i != filter.end()) {
1449             q = Query(Query::OP_AND, q, i->second);
1450         }
1451         return q;
1452     }
1453 };
1454
1455 /// A group of terms separated only by whitespace.
1456 class TermGroup {
1457     vector<Term *> terms;
1458
1459     /** Controls how to handle a group where all terms are stopwords.
1460      *
1461      *  If true, then as_group() returns NULL.  If false, then the
1462      *  stopword status of the terms is ignored.
1463      */
1464     bool empty_ok;
1465
1466     TermGroup(Term* t1, Term* t2) : empty_ok(false) {
1467         add_term(t1);
1468         add_term(t2);
1469     }
1470
1471   public:
1472     /// Factory function - ensures heap allocation.
1473     static TermGroup* create(Term* t1, Term* t2) {
1474         return new TermGroup(t1, t2);
1475     }
1476
1477     ~TermGroup() {
1478         for (auto&& t : terms) {
1479             delete t;
1480         }
1481     }
1482
1483     /// Add a Term object to this TermGroup object.
1484     void add_term(Term * term) {
1485         terms.push_back(term);
1486     }
1487
1488     /// Set the empty_ok flag.
1489     void set_empty_ok() { empty_ok = true; }
1490
1491     /// Convert to a Xapian::Query * using default_op.
1492     Query * as_group(State *state) const;
1493 };
1494
1495 Query *
1496 TermGroup::as_group(State *state) const
1497 {
1498     const Xapian::Stopper * stopper = state->get_stopper();
1499     size_t stoplist_size = state->stoplist_size();
1500     bool default_op_is_positional = is_positional(state->default_op());
1501 reprocess:
1502     Query::op default_op = state->default_op();
1503     vector<Query> subqs;
1504     subqs.reserve(terms.size());
1505     if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
1506         // Check for multi-word synonyms.
1507         Database db = state->get_database();
1508
1509         string key;
1510         vector<Term*>::const_iterator begin = terms.begin();
1511         vector<Term*>::const_iterator i = begin;
1512         while (i != terms.end()) {
1513             TermIterator synkey(db.synonym_keys_begin((*i)->name));
1514             TermIterator synend(db.synonym_keys_end((*i)->name));
1515             if (synkey == synend) {
1516                 // No multi-synonym matches.
1517                 if (stopper && (*stopper)((*i)->name)) {
1518                     state->add_to_stoplist(*i);
1519                 } else {
1520                     if (default_op_is_positional)
1521                         (*i)->need_positions();
1522                     subqs.push_back((*i)->get_query_with_auto_synonyms());
1523                 }
1524                 begin = ++i;
1525                 continue;
1526             }
1527             key.resize(0);
1528             while (i != terms.end()) {
1529                 if (!key.empty()) key += ' ';
1530                 key += (*i)->name;
1531                 ++i;
1532                 synkey.skip_to(key);
1533                 if (synkey == synend || !startswith(*synkey, key)) break;
1534             }
1535             // Greedily try to match as many consecutive words as possible.
1536             TermIterator syn, end;
1537             while (true) {
1538                 syn = db.synonyms_begin(key);
1539                 end = db.synonyms_end(key);
1540                 if (syn != end) break;
1541                 if (--i == begin) break;
1542                 key.resize(key.size() - (*i)->name.size() - 1);
1543             }
1544             if (i == begin) {
1545                 // No multi-synonym matches.
1546                 if (stopper && (*stopper)((*i)->name)) {
1547                     state->add_to_stoplist(*i);
1548                 } else {
1549                     if (default_op_is_positional)
1550                         (*i)->need_positions();
1551                     subqs.push_back((*i)->get_query_with_auto_synonyms());
1552                 }
1553                 begin = ++i;
1554                 continue;
1555             }
1556
1557             vector<Query> subqs2;
1558             vector<Term*>::const_iterator j;
1559             for (j = begin; j != i; ++j) {
1560                 if (stopper && (*stopper)((*j)->name)) {
1561                     state->add_to_stoplist(*j);
1562                 } else {
1563                     if (default_op_is_positional)
1564                         (*i)->need_positions();
1565                     subqs2.push_back((*j)->get_query());
1566                 }
1567             }
1568             Query q_original_terms;
1569             if (default_op_is_positional) {
1570                 q_original_terms = Query(default_op,
1571                                          subqs2.begin(), subqs2.end(),
1572                                          subqs2.size() + 9);
1573             } else {
1574                 q_original_terms = Query(default_op,
1575                                          subqs2.begin(), subqs2.end());
1576             }
1577             subqs2.clear();
1578
1579             // Use the position of the first term for the synonyms.
1580             Query q(Query::OP_SYNONYM,
1581                     SynonymIterator(syn, (*begin)->pos, &q_original_terms),
1582                     SynonymIterator(end));
1583             subqs.push_back(q);
1584
1585             begin = i;
1586         }
1587     } else {
1588         vector<Term*>::const_iterator i;
1589         for (i = terms.begin(); i != terms.end(); ++i) {
1590             if (stopper && (*stopper)((*i)->name)) {
1591                 state->add_to_stoplist(*i);
1592             } else {
1593                 if (default_op_is_positional)
1594                     (*i)->need_positions();
1595                 subqs.push_back((*i)->get_query_with_auto_synonyms());
1596             }
1597         }
1598     }
1599
1600     if (!empty_ok && stopper && subqs.empty() &&
1601         stoplist_size < state->stoplist_size()) {
1602         // This group is all stopwords, so roll-back, disable stopper
1603         // temporarily, and reprocess this group.
1604         state->stoplist_resize(stoplist_size);
1605         stopper = NULL;
1606         goto reprocess;
1607     }
1608
1609     Query * q = NULL;
1610     if (!subqs.empty()) {
1611         if (default_op_is_positional) {
1612             q = new Query(default_op, subqs.begin(), subqs.end(),
1613                              subqs.size() + 9);
1614         } else {
1615             q = new Query(default_op, subqs.begin(), subqs.end());
1616         }
1617     }
1618     delete this;
1619     return q;
1620 }
1621
1622 /// Some terms which form a positional sub-query.
1623 class Terms {
1624     vector<Term *> terms;
1625     size_t window;
1626
1627     /** Keep track of whether the terms added all have the same list of
1628      *  prefixes.  If so, we'll build a set of phrases, one using each prefix.
1629      *  This works around the limitation that a phrase cannot have multiple
1630      *  components which are "OR" combinations of terms, but is also probably
1631      *  what users expect: i.e., if a user specifies a phrase in a field, and
1632      *  that field maps to multiple prefixes, the user probably wants a phrase
1633      *  returned with all terms having one of those prefixes, rather than a
1634      *  phrase comprised of terms with differing prefixes.
1635      */
1636     bool uniform_prefixes;
1637
1638     /** The list of prefixes of the terms added.
1639      *  This will be NULL if the terms have different prefixes.
1640      */
1641     const list<string> * prefixes;
1642
1643     /// Convert to a query using the given operator and window size.
1644     Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
1645         Query * q = NULL;
1646         size_t n_terms = terms.size();
1647         Xapian::termcount w = w_delta + terms.size();
1648         if (uniform_prefixes) {
1649             if (prefixes) {
1650                 list<string>::const_iterator piter;
1651                 for (piter = prefixes->begin(); piter != prefixes->end(); ++piter) {
1652                     vector<Query> subqs;
1653                     subqs.reserve(n_terms);
1654                     vector<Term *>::const_iterator titer;
1655                     for (titer = terms.begin(); titer != terms.end(); ++titer) {
1656                         Term * t = *titer;
1657                         subqs.push_back(Query(t->make_term(*piter), 1, t->pos));
1658                     }
1659                     add_to_query(q, Query::OP_OR,
1660                                  Query(op, subqs.begin(), subqs.end(), w));
1661                 }
1662             }
1663         } else {
1664             vector<Query> subqs;
1665             subqs.reserve(n_terms);
1666             vector<Term *>::const_iterator titer;
1667             for (titer = terms.begin(); titer != terms.end(); ++titer) {
1668                 subqs.push_back((*titer)->get_query());
1669             }
1670             q = new Query(op, subqs.begin(), subqs.end(), w);
1671         }
1672
1673         delete this;
1674         return q;
1675     }
1676
1677     Terms() : window(0), uniform_prefixes(true), prefixes(NULL) { }
1678
1679   public:
1680     /// Factory function - ensures heap allocation.
1681     static Terms* create() {
1682         return new Terms();
1683     }
1684
1685     ~Terms() {
1686         for (auto&& t : terms) {
1687             delete t;
1688         }
1689     }
1690
1691     /// Add an unstemmed Term object to this Terms object.
1692     void add_positional_term(Term * term) {
1693         const list<string> & term_prefixes = term->field_info->prefixes;
1694         if (terms.empty()) {
1695             prefixes = &term_prefixes;
1696         } else if (uniform_prefixes && prefixes != &term_prefixes) {
1697             if (*prefixes != term_prefixes)  {
1698                 prefixes = NULL;
1699                 uniform_prefixes = false;
1700             }
1701         }
1702         term->need_positions();
1703         terms.push_back(term);
1704     }
1705
1706     void adjust_window(size_t alternative_window) {
1707         if (alternative_window > window) window = alternative_window;
1708     }
1709
1710     /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
1711     Query * as_phrase_query() const {
1712         return as_opwindow_query(Query::OP_PHRASE, 0);
1713     }
1714
1715     /// Convert to a Xapian::Query * using OP_NEAR.
1716     Query * as_near_query() const {
1717         // The common meaning of 'a NEAR b' is "a within 10 terms of b", which
1718         // means a window size of 11.  For more than 2 terms, we just add one
1719         // to the window size for each extra term.
1720         size_t w = window;
1721         if (w == 0) w = 10;
1722         return as_opwindow_query(Query::OP_NEAR, w - 1);
1723     }
1724
1725     /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
1726     Query * as_adj_query() const {
1727         // The common meaning of 'a ADJ b' is "a at most 10 terms before b",
1728         // which means a window size of 11.  For more than 2 terms, we just add
1729         // one to the window size for each extra term.
1730         size_t w = window;
1731         if (w == 0) w = 10;
1732         return as_opwindow_query(Query::OP_PHRASE, w - 1);
1733     }
1734 };
1735
1736 void
1737 Term::as_positional_cjk_term(Terms * terms) const
1738 {
1739     // Add each individual CJK character to the phrase.
1740     string t;
1741     for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
1742         Unicode::append_utf8(t, *it);
1743         Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
1744         terms->add_positional_term(c);
1745         t.resize(0);
1746     }
1747
1748     // FIXME: we want to add the n-grams as filters too for efficiency.
1749
1750     delete this;
1751 }
1752
// Helper macro for the binary boolean operator rules.  If both operands are
// non-NULL, E is set to a new Query combining them with OP and both operands
// are deleted.  Otherwise a syntax error naming OP_TXT is recorded, the parse
// is failed and the enclosing action returns.
#define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
    do {\
        if (A && B) {\
            E = new Query(OP, *A, *B);\
            delete A;\
            delete B;\
        } else {\
            state->error = "Syntax: <expression> " OP_TXT " <expression>";\
            yy_parse_failed(yypParser);\
            return;\
        }\
    } while (0)
1765
1766 }
1767
// Every terminal carries a Term * as its value.
%token_type {Term *}
// Default destructor for the value of a terminal.
%token_destructor {delete $$;}

// Makes the parser State available to the actions below as "state".
%extra_argument {State * state}

// Code to run when the parse fails.
%parse_failure {
    // If we've not already set an error message, set a default one.
    if (!state->error) state->error = "parse error";
}

// Code to run on a syntax error: fail the whole parse straight away.
%syntax_error {
    yy_parse_failed(yypParser);
}
1781
// Operators, grouped in order of increasing precedence (operators declared
// on the same line share a precedence level).  Rules below can also take
// their precedence from one of these via a [TOKEN] marker after the rule.
%nonassoc ERROR.
%left OR.
%left XOR.
%left AND NOT.
%left NEAR ADJ.
%left LOVE HATE HATE_AFTER_AND SYNONYM.
1789
// Destructors for terminal symbols.  Each one deletes the Term object which
// the token carries.

// TERM is a query term, including prefix (if any).
%destructor TERM {delete $$;}

// GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
// is only separated by whitespace characters.
%destructor GROUP_TERM {delete $$;}

// PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
// separated only by one or more phrase generator characters (hyphen and
// apostrophe are common examples - see is_phrase_generator() for the list
// of all punctuation which does this).
%destructor PHR_TERM {delete $$;}

// WILD_TERM is like a TERM, but has a trailing wildcard which needs to be
// expanded.
%destructor WILD_TERM {delete $$;}

// PARTIAL_TERM is like a TERM, but it's at the end of the query string and
// we're doing "search as you type".  It expands to something like WILD_TERM
// OR stemmed_form.
%destructor PARTIAL_TERM {delete $$;}

// BOOLEAN_FILTER is a query term with a prefix registered using
// add_boolean_prefix().  It's added to the query using an OP_FILTER operator
// (or OP_AND_NOT if it's negated), e.g. site:xapian.org or -site:xapian.org
%destructor BOOLEAN_FILTER {delete $$;}
1818
1819 // Grammar rules:
1820
// query - The whole query: either a single expr, or nothing at all.

// The query non-terminal carries no value of its own, so it just gets a
// dummy type.
%type query {int}

query ::= expr(E). {
    // Hand the result back to the caller via the State structure; a NULL
    // expression is stored as an empty Query.
    if (!E) {
        state->query = Query();
    } else {
        state->query = *E;
        delete E;
    }
}

query ::= . {
    // The query string contained no terms at all.
    state->query = Query();
}
1840
// expr - A query expression: either a prob_expr, or two bool_arg operands
// joined by a boolean operator.  BOOL_OP_TO_QUERY reports a syntax error if
// either operand is missing.

%type expr {Query *}
%destructor expr {delete $$;}

expr(E) ::= prob_expr(E).

expr(E) ::= bool_arg(A) AND bool_arg(B). {
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND, B, "AND");
}

expr(E) ::= bool_arg(A) NOT bool_arg(B). {
    const bool pure_not = (state->flags & QueryParser::FLAG_PURE_NOT) != 0;
    if (pure_not && !A) {
        // 'NOT foo' -> '<alldocuments> NOT foo'
        A = new Query("", 1, 0);
    }
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "NOT");
}

expr(E) ::= bool_arg(A) AND NOT bool_arg(B). [NOT] {
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND NOT");
}

expr(E) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND] {
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND");
}

expr(E) ::= bool_arg(A) OR bool_arg(B). {
    BOOL_OP_TO_QUERY(E, A, Query::OP_OR, B, "OR");
}

expr(E) ::= bool_arg(A) XOR bool_arg(B). {
    BOOL_OP_TO_QUERY(E, A, Query::OP_XOR, B, "XOR");
}
1870
// bool_arg - an argument to a boolean operator such as AND or OR.  This is
// either an expr or nothing at all; the latter gives a NULL value so that
// the operator rules can detect a missing operand.

%type bool_arg {Query *}
%destructor bool_arg {delete $$;}

bool_arg(A) ::= expr(A).

bool_arg(A) ::= . [ERROR] {
    // Set the argument to NULL, which enables the bool_arg-using rules in
    // expr above to report uses of AND, OR, etc which don't have two
    // arguments.
    A = NULL;
}
1884
// prob_expr - a single compound term, or a prob.

%type prob_expr {Query *}
%destructor prob_expr {delete $$;}

// Flatten a ProbQuery into a single Query: start from its main query, then
// apply the loved ("+") terms, the boolean filters and finally the hated
// ("-") terms.  The result may be NULL if the ProbQuery held nothing usable.
prob_expr(E) ::= prob(P). {
    // Detach the main query from the ProbQuery before working on it.
    E = P->query;
    P->query = NULL;
    // Handle any "+ terms".
    if (P->love) {
        if (P->love->empty()) {
            // +<nothing>.
            delete E;
            E = P->love;
        } else if (E) {
            // After the swap the loved terms come first, so they are the
            // first operand handed to add_to_query() for OP_AND_MAYBE and
            // the rest of the query is the second.
            swap(E, P->love);
            add_to_query(E, Query::OP_AND_MAYBE, P->love);
        } else {
            E = P->love;
        }
        // The loved query has been used in every branch above, so detach it
        // from the ProbQuery too.
        P->love = NULL;
    }
    // Handle any boolean filters.
    if (!P->filter.empty()) {
        if (E) {
            add_to_query(E, Query::OP_FILTER, P->merge_filters());
        } else {
            // Make the query a boolean one.
            E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
        }
    }
    // Handle any "- terms".
    if (P->hate && !P->hate->empty()) {
        if (!E) {
            // Can't just hate!
            yy_parse_failed(yypParser);
            return;
        }
        *E = Query(Query::OP_AND_NOT, *E, *P->hate);
    }
    delete P;
}

prob_expr(E) ::= term(E).
1929
// prob - a sub-expression consisting of stop_terms, "+" terms, "-" terms,
// boolean filters, and/or ranges.
//
// Note: a stop_term isn't necessarily a simple term - it can also be any
// compound_term.

%type prob {ProbQuery *}
%destructor prob {delete $$;}

// A range on its own: start a new ProbQuery holding just the range filter.
prob(P) ::= RANGE(R). {
    // The name is copied before calling as_range_query().
    // NOTE(review): presumably that call consumes the token - confirm.
    string grouping = R->name;
    const Query & range = R->as_range_query();
    P = new ProbQuery; /*P-overwrites-R*/
    P->add_filter_range(grouping, range);
}

// A range following other items: append it to the existing filters.
prob(P) ::= stop_prob(P) RANGE(R). {
    string grouping = R->name;
    const Query & range = R->as_range_query();
    P->append_filter_range(grouping, range);
}

// Two stop_terms in a row.  Either may be NULL (see the stop_term rules).
prob(P) ::= stop_term(T) stop_term(U). {
    P = new ProbQuery(T); /*P-overwrites-T*/
    if (U) {
        Query::op op = state->default_op();
        if (P->query && is_positional(op)) {
            // If default_op is OP_NEAR or OP_PHRASE, set the window size to
            // 11 for the first pair of terms and it will automatically grow
            // by one for each subsequent term.
            Query * subqs[2] = { P->query, U };
            *(P->query) = Query(op, subqs, subqs + 2, 11);
            delete U;
        } else {
            add_to_query(P->query, op, U);
        }
    }
}

// A further stop_term: combine it with the main query using the default
// operator.
prob(P) ::= prob(P) stop_term(T). {
    // If T is a stopword, there's nothing to do here.
    if (T) add_to_query(P->query, state->default_op(), T);
}

// A loved term on its own.  When the default operator is OP_AND the term
// simply becomes the main query, otherwise it is recorded as loved.
prob(P) ::= LOVE term(T). {
    P = new ProbQuery;
    if (state->default_op() == Query::OP_AND) {
        P->query = T;
    } else {
        P->love = T;
    }
}

// A loved term following other items.
prob(P) ::= stop_prob(P) LOVE term(T). {
    if (state->default_op() == Query::OP_AND) {
        /* The default op is AND, so we just put loved terms into the query
         * (in this case the only effect of love is to ignore the stopword
         * list). */
        add_to_query(P->query, Query::OP_AND, T);
    } else {
        add_to_query(P->love, Query::OP_AND, T);
    }
}

// A hated term on its own.
prob(P) ::= HATE term(T). {
    P = new ProbQuery;
    P->hate = T;
}

// A hated term following other items: hated terms are ORed together.
prob(P) ::= stop_prob(P) HATE term(T). {
    add_to_query(P->hate, Query::OP_OR, T);
}

// A hated boolean filter on its own.
prob(P) ::= HATE BOOLEAN_FILTER(T). {
    P = new ProbQuery;
    P->hate = new Query(T->get_query());
    delete T;
}

// A hated boolean filter following other items.
prob(P) ::= stop_prob(P) HATE BOOLEAN_FILTER(T). {
    add_to_query(P->hate, Query::OP_OR, T->get_query());
    delete T;
}

// A boolean filter on its own.
prob(P) ::= BOOLEAN_FILTER(T). {
    P = new ProbQuery;
    P->add_filter(T->get_grouping(), T->get_query());
    delete T;
}

// A boolean filter following other items.
prob(P) ::= stop_prob(P) BOOLEAN_FILTER(T). {
    P->append_filter(T->get_grouping(), T->get_query());
    delete T;
}

// NOTE(review): this rule and the next one write to the filter map directly
// rather than calling add_filter()/append_filter() as the plain
// BOOLEAN_FILTER rules above do - confirm the two paths are really meant to
// behave the same, as the comments in the actions say.
prob(P) ::= LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    P = new ProbQuery;
    P->filter[T->get_grouping()] = T->get_query();
    delete T;
}

prob(P) ::= stop_prob(P) LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    // We OR filters with the same prefix...
    Query & q = P->filter[T->get_grouping()];
    q = Query(Query::OP_OR, q, T->get_query());
    delete T;
}
2038
// stop_prob - A prob or a stop_term.  A lone stop_term is wrapped in a new
// ProbQuery so that both alternatives give the same type.

%type stop_prob {ProbQuery *}
%destructor stop_prob {delete $$;}

stop_prob(P) ::= prob(P).

stop_prob(P) ::= stop_term(T). {
    P = new ProbQuery(T); /*P-overwrites-T*/
}
2049
// stop_term - A term which should be checked against the stopword list,
// or a compound_term.
//
// Loved and hated terms and terms inside a phrase must not be checked
// against the stopword list, so those contexts use term rather than
// stop_term.
//
// The value is NULL when the term turned out to be a stopword.

%type stop_term {Query *}
%destructor stop_term {delete $$;}

stop_term(T) ::= TERM(U). {
    if (!state->is_stopword(U)) {
        T = new Query(U->get_query_with_auto_synonyms());
    } else {
        // Drop the term from the query, but record it in the stoplist.
        T = NULL;
        state->add_to_stoplist(U);
    }
    delete U;
}

stop_term(T) ::= compound_term(T).
2070
// term - A term or a compound_term.  Unlike stop_term, the stopword list is
// not consulted here.

%type term {Query *}
%destructor term {delete $$;}

term(T) ::= TERM(U). {
    T = new Query(U->get_query_with_auto_synonyms());
    delete U;
}

term(T) ::= compound_term(T).
2082
// compound_term - A WILD_TERM, a PARTIAL_TERM, a quoted phrase (with or
// without prefix), a phrased_term, group, near_expr, adj_expr, a bracketed
// subexpression (with or without prefix), an explicit synonym expansion, or
// a CJKTERM.
//
// The "overwrites" marker comments inside the actions are directives to
// lemon and must be kept exactly as they are.

%type compound_term {Query *}
%destructor compound_term {delete $$;}

compound_term(T) ::= WILD_TERM(U). {
    T = U->as_wildcarded_query(state); /*T-overwrites-U*/
}

compound_term(T) ::= PARTIAL_TERM(U). {
    T = U->as_partial_query(state); /*T-overwrites-U*/
}

compound_term(T) ::= QUOTE phrase(P) QUOTE. {
    T = P->as_phrase_query();
}

compound_term(T) ::= phrased_term(P). {
    T = P->as_phrase_query(); /*T-overwrites-P*/
}

compound_term(T) ::= group(P). {
    T = P->as_group(state); /*T-overwrites-P*/
}

compound_term(T) ::= near_expr(P). {
    T = P->as_near_query(); /*T-overwrites-P*/
}

compound_term(T) ::= adj_expr(P). {
    T = P->as_adj_query(); /*T-overwrites-P*/
}

compound_term(T) ::= BRA expr(E) KET. {
    T = E;
}

compound_term(T) ::= SYNONYM TERM(U). {
    T = new Query(U->get_query_with_synonyms());
    delete U;
}

compound_term(T) ::= CJKTERM(U). {
    T = U->as_cjk_query(); /*T-overwrites-U*/
}
2122
// phrase - The "inside the quotes" part of a double-quoted phrase.
//
// Terms are collected into a Terms object.  A TERM is added as it is, while
// a CJKTERM is handed to as_positional_cjk_term(), which adds each of its
// characters as a separate term.

%type phrase {Terms *}

%destructor phrase {delete $$;}

phrase(P) ::= TERM(T). {
    P = Terms::create();
    P->add_positional_term(T);
}

phrase(P) ::= CJKTERM(T). {
    P = Terms::create();
    T->as_positional_cjk_term(P);
}

phrase(P) ::= phrase(P) TERM(T). {
    P->add_positional_term(T);
}

phrase(P) ::= phrase(P) CJKTERM(T). {
    T->as_positional_cjk_term(P);
}
2146
// phrased_term - A phrased term works like a single term, but is actually
// 2 or more terms linked together into a phrase by punctuation.  There must be
// at least 2 terms in order to be able to have punctuation between the terms!
//
// The Terms object built here takes over the Term objects added to it.

%type phrased_term {Terms *}
%destructor phrased_term {delete $$;}

phrased_term(P) ::= TERM(T) PHR_TERM(U). {
    P = Terms::create();
    P->add_positional_term(T);
    P->add_positional_term(U);
}

phrased_term(P) ::= phrased_term(P) PHR_TERM(T). {
    P->add_positional_term(T);
}
2163
// group - A group of terms separated only by whitespace - candidates for
// multi-term synonyms.
//
// An EMPTY_GROUP_OK token after the group calls set_empty_ok() on it.

%type group {TermGroup *}
%destructor group {delete $$;}

group(P) ::= TERM(T) GROUP_TERM(U). {
    P = TermGroup::create(T, U); /*P-overwrites-T*/
}

group(P) ::= group(P) GROUP_TERM(T). {
    P->add_term(T);
}

group(P) ::= group(P) EMPTY_GROUP_OK. {
    P->set_empty_ok();
}
2181
// near_expr - 2 or more terms with NEAR in between.  There must be at least 2
// terms in order for there to be any NEAR operators!
//
// The NEAR token's value may be NULL.  When it isn't, its term position is
// passed to adjust_window(), which keeps the largest value seen, and the
// token is then deleted.

%type near_expr {Terms *}
%destructor near_expr {delete $$;}

near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
    P = Terms::create();
    P->add_positional_term(T);
    P->add_positional_term(U);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}

near_expr(P) ::= near_expr(P) NEAR(N) TERM(T). {
    P->add_positional_term(T);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
2205
// adj_expr - 2 or more terms with ADJ in between.  There must be at least 2
// terms in order for there to be any ADJ operators!
//
// The ADJ token's value may be NULL.  When it isn't, its term position is
// passed to adjust_window(), which keeps the largest value seen, and the
// token is then deleted.

%type adj_expr {Terms *}
%destructor adj_expr {delete $$;}

adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
    P = Terms::create();
    P->add_positional_term(T);
    P->add_positional_term(U);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}

adj_expr(P) ::= adj_expr(P) ADJ(N) TERM(T). {
    P->add_positional_term(T);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
2229
2230 // Select yacc syntax highlighting in vim editor: vim: syntax=yacc
2231 // (lemon syntax colouring isn't supplied by default; yacc does an OK job).