xapian-applications/omega/query.cc

   1 /* query.cc: query executor for omega
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 James Aylett
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002 Intercede 1749 Ltd
   7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017 Olly Betts
   8  * Copyright 2008 Thomas Viehmann
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU General Public License as
  12  * published by the Free Software Foundation; either version 2 of the
  13  * License, or (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  23  * USA
  24  */
  25
  26 #include <config.h>
  27
  28 #include <algorithm>
  29 #include <iostream>
  30 #include <map>
  31 #include <set>
  32 #include <unordered_map>
  33 #include <vector>
  34
  35 #include <cassert>
  36 #include <cctype>
  37 #include "safeerrno.h"
  38 #include <stdio.h>
  39 #include <cstdlib>
  40 #include <cstring>
  41 #include "strcasecmp.h"
  42 #include <ctime>
  43
  44 #include "safeunistd.h"
  45 #include <sys/types.h>
  46 #include "safesysstat.h"
  47 #include "safefcntl.h"
  48
  49 #include "realtime.h"
  50
  51 #include <cdb.h>
  52
  53 #include "csvescape.h"
  54 #include "date.h"
  55 #include "datevalue.h"
  56 #include "jsonescape.h"
  57 #include "utils.h"
  58 #include "omega.h"
  59 #include "query.h"
  60 #include "cgiparam.h"
  61 #include "loadfile.h"
  62 #include "sample.h"
  63 #include "str.h"
  64 #include "stringutils.h"
  65 #include "transform.h"
  66 #include "urldecode.h"
  67 #include "urlencode.h"
  68 #include "unixperm.h"
  69 #include "values.h"
  70 #include "weight.h"
  71 #include "expand.h"
  72 #include "md5wrap.h"
  73
  74 #include <xapian.h>
  75
  76 using namespace std;
  77
  78 using Xapian::Utf8Iterator;
  79
  80 using Xapian::Unicode::is_wordchar;
  81
  82 #ifndef SNPRINTF
  83 #include <cstdarg>
  84
  85 static int my_snprintf(char *str, size_t size, const char *format, ...)
  86 {
  87     int res;
  88     va_list ap;
  89     va_start(ap, format);
  90     str[size - 1] = '\0';
  91     res = vsprintf(str, format, ap);
  92     if (str[size - 1] || res < 0 || size_t(res) >= size)
  93         abort(); /* Overflowed! */
  94     va_end(ap);
  95     return res;
  96 }
  97 #else
  98 #define my_snprintf SNPRINTF
  99 #endif
 100
 101 static bool query_parsed = false;
 102 static bool done_query = false;
 103 static Xapian::docid last = 0;
 104
 105 static Xapian::MSet mset;
 106
 107 static map<Xapian::docid, bool> ticked;
 108
 109 static void ensure_query_parsed();
 110 static void ensure_match();
 111
 112 static Xapian::Query query;
 113 //static string url_query_string;
 114 Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
 115
 116 static Xapian::QueryParser qp;
 117 static Xapian::NumberRangeProcessor * size_rp = NULL;
 118 static Xapian::Stem *stemmer = NULL;
 119
 120 static string eval_file(const string &fmtfile);
 121
 122 static set<string> termset;
 123
 124 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
 125 static map<string, string> termprefix_to_userprefix;
 126
 127 static string queryterms;
 128
 129 static string error_msg;
 130
 131 static double secs = -1;
 132
 133 static const char DEFAULT_LOG_ENTRY[] =
 134         "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
 135         "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
 136         "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
 137         "$dbname\t"
 138         "$query\t"
 139         "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
 140
 141 class MyStopper : public Xapian::Stopper {
 142   public:
 143     bool operator()(const string &t) const {
 144         switch (t[0]) {
 145             case 'a':
 146                 return (t == "a" || t == "about" || t == "an" || t == "and" ||
 147                         t == "are" || t == "as" || t == "at");
 148             case 'b':
 149                 return (t == "be" || t == "by");
 150             case 'e':
 151                 return (t == "en");
 152             case 'f':
 153                 return (t == "for" || t == "from");
 154             case 'h':
 155                 return (t == "how");
 156             case 'i':
 157                 return (t == "i" || t == "in" || t == "is" || t == "it");
 158             case 'o':
 159                 return (t == "of" || t == "on" || t == "or");
 160             case 't':
 161                 return (t == "that" || t == "the" || t == "this" || t == "to");
 162             case 'w':
 163                 return (t == "was" || t == "what" || t == "when" ||
 164                         t == "where" || t == "which" || t == "who" ||
 165                         t == "why" || t == "will" || t == "with");
 166             case 'y':
 167                 return (t == "you" || t == "your");
 168             default:
 169                 return false;
 170         }
 171     }
 172 };
 173
 174 static size_t
 175 prefix_from_term(string* prefix, const string& term)
 176 {
 177     if (!term.empty()) {
 178         if (term[0] == 'X') {
 179             const string::const_iterator begin = term.begin();
 180             string::const_iterator i = begin + 1;
 181             while (i != term.end() && C_isupper(*i))
 182                 ++i;
 183             if (prefix)
 184                 prefix->assign(begin, i);
 185             if (i != term.end() && *i == ':')
 186                 ++i;
 187             return i - begin;
 188         }
 189
 190         if (C_isupper(term[0])) {
 191             if (prefix)
 192                 *prefix = term[0];
 193             return 1;
 194         }
 195     }
 196
 197     if (prefix)
 198         prefix->resize(0);
 199     return 0;
 200 }
 201
 202 // Don't allow ".." in format names, log file names, etc as this would allow
 203 // people to open a format "../../etc/passwd" or similar.
 204 // FIXME: make this check more exact ("foo..bar" is safe)
 205 // FIXME: log when this check fails
 206 static bool
 207 vet_filename(const string &filename)
 208 {
 209     string::size_type i = filename.find("..");
 210     return (i == string::npos);
 211 }
 212
 213 // Heuristics:
 214 // * If any terms have been removed, it's a "fresh query" so we discard any
 215 //   relevance judgements
 216 // * If all previous terms are there but more have been added then we keep
 217 //   the relevance judgements, but return the first page of hits
 218 //
 219 // NEW_QUERY entirely new query
 220 // SAME_QUERY unchanged query
 221 // EXTENDED_QUERY new query, but based on the old one
 222 // BAD_QUERY parse error (message in error_msg)
 223 typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
 224
 225 static multimap<string, string> probabilistic_query;
 226
 227 void
 228 set_probabilistic_query(const string & prefix, const string & s)
 229 {
 230     string query_string = s;
 231     // Strip leading and trailing whitespace from query_string.
 232     trim(query_string);
 233     if (!query_string.empty())
 234         probabilistic_query.insert(make_pair(prefix, query_string));
 235 }
 236
 237 static unsigned
 238 read_qp_flags(const string & opt_pfx, unsigned f)
 239 {
 240     map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
 241     for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
 242         unsigned mask = 0;
 243         const char * s = i->first.c_str() + opt_pfx.size();
 244         switch (s[0]) {
 245             case 'a':
 246                 if (strcmp(s, "auto_multiword_synonyms") == 0) {
 247                     mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 248                     break;
 249                 }
 250                 if (strcmp(s, "auto_synonyms") == 0) {
 251                     mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
 252                     break;
 253                 }
 254                 break;
 255             case 'b':
 256                 if (strcmp(s, "boolean") == 0) {
 257                     mask = Xapian::QueryParser::FLAG_BOOLEAN;
 258                     break;
 259                 }
 260                 if (strcmp(s, "boolean_any_case") == 0) {
 261                     mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
 262                     break;
 263                 }
 264                 break;
 265             case 'c':
 266                 if (strcmp(s, "cjk_ngram") == 0) {
 267                     mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
 268                     break;
 269                 }
 270                 break;
 271             case 'd':
 272                 if (strcmp(s, "default") == 0) {
 273                     mask = Xapian::QueryParser::FLAG_DEFAULT;
 274                     break;
 275                 }
 276                 break;
 277             case 'l':
 278                 if (strcmp(s, "lovehate") == 0) {
 279                     mask = Xapian::QueryParser::FLAG_LOVEHATE;
 280                     break;
 281                 }
 282                 break;
 283             case 'p':
 284                 if (strcmp(s, "partial") == 0) {
 285                     mask = Xapian::QueryParser::FLAG_PARTIAL;
 286                     break;
 287                 }
 288                 if (strcmp(s, "phrase") == 0) {
 289                     mask = Xapian::QueryParser::FLAG_PHRASE;
 290                     break;
 291                 }
 292                 if (strcmp(s, "pure_not") == 0) {
 293                     mask = Xapian::QueryParser::FLAG_PURE_NOT;
 294                     break;
 295                 }
 296                 break;
 297             case 's':
 298                 if (strcmp(s, "spelling_correction") == 0) {
 299                     mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
 300                     break;
 301                 }
 302                 if (strcmp(s, "synonym") == 0) {
 303                     mask = Xapian::QueryParser::FLAG_SYNONYM;
 304                     break;
 305                 }
 306                 break;
 307             case 'w':
 308                 if (strcmp(s, "wildcard") == 0) {
 309                     mask = Xapian::QueryParser::FLAG_WILDCARD;
 310                     break;
 311                 }
 312                 break;
 313         }
 314
 315         if (i->second.empty()) {
 316             f &= ~mask;
 317         } else {
 318             f |= mask;
 319         }
 320     }
 321     return f;
 322 }
 323
 324 static querytype
 325 set_probabilistic(const string &oldp)
 326 {
 327     // Parse the query string.
 328     qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
 329     qp.set_stopper(new MyStopper());
 330     qp.set_default_op(default_op);
 331     qp.set_database(db);
 332     // FIXME: provide a custom RP which handles size:10..20K, etc.
 333     if (!size_rp)
 334         size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
 335     qp.add_rangeprocessor(size_rp);
 336     map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
 337     for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
 338         string user_prefix(pfx->first, 7);
 339         const string & term_pfx_list = pfx->second;
 340         string::size_type i = 0;
 341         do {
 342             string::size_type i0 = i;
 343             i = term_pfx_list.find('\t', i);
 344             const string & term_pfx = term_pfx_list.substr(i0, i - i0);
 345             qp.add_prefix(user_prefix, term_pfx);
 346             // std::map::insert() won't overwrite an existing entry, so we'll
 347             // prefer the first user_prefix for which a particular term prefix
 348             // is specified.
 349             termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
 350         } while (++i);
 351     }
 352     pfx = option.lower_bound("boolprefix,");
 353     for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
 354         string user_prefix(pfx->first, 11, string::npos);
 355         auto it = option.find("nonexclusiveprefix," + pfx->second);
 356         bool exclusive = (it == option.end() || it->second.empty());
 357         qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
 358         termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
 359     }
 360
 361     try {
 362         unsigned default_flags = read_qp_flags("flag_", 0);
 363
 364         vector<Xapian::Query> queries;
 365         queries.reserve(probabilistic_query.size());
 366
 367         multimap<string, string>::const_iterator j;
 368         for (j = probabilistic_query.begin();
 369              j != probabilistic_query.end();
 370              ++j) {
 371             const string & prefix = j->first;
 372
 373             // Choose the stemmer to use for this input.
 374             string stemlang = option[prefix + ":stemmer"];
 375             if (stemlang.empty())
 376                 stemlang = option["stemmer"];
 377             qp.set_stemmer(Xapian::Stem(stemlang));
 378
 379             // Work out the flags to use for this input.
 380             unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
 381
 382             const string & query_string = j->second;
 383             Xapian::Query q = qp.parse_query(query_string, f, prefix);
 384             if (!q.empty())
 385                 queries.push_back(q);
 386         }
 387         query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
 388     } catch (Xapian::QueryParserError &e) {
 389         error_msg = e.get_msg();
 390         return BAD_QUERY;
 391     }
 392
 393     Xapian::termcount n_new_terms = 0;
 394     for (Xapian::TermIterator i = query.get_terms_begin();
 395          i != query.get_terms_end(); ++i) {
 396         if (termset.find(*i) == termset.end()) {
 397             termset.insert(*i);
 398             if (!queryterms.empty()) queryterms += '\t';
 399             queryterms += *i;
 400         }
 401         n_new_terms++;
 402     }
 403
 404     // Check new query against the previous one
 405     if (oldp.empty()) {
 406         // If oldp was empty that means there were no probabilistic terms
 407         // before, so if there are now this is a new query.
 408         return n_new_terms ? NEW_QUERY : SAME_QUERY;
 409     }
 410
 411     // The terms in oldp are separated by tabs.
 412     const char oldp_separator = '\t';
 413     size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
 414
 415     // short-cut: if the new query has fewer terms, it must be a new one
 416     if (n_new_terms < n_old_terms) return NEW_QUERY;
 417
 418     const char *term = oldp.c_str();
 419     const char *pend;
 420     while ((pend = strchr(term, oldp_separator)) != NULL) {
 421         if (termset.find(string(term, pend - term)) == termset.end())
 422             return NEW_QUERY;
 423         term = pend + 1;
 424     }
 425     if (*term) {
 426         if (termset.find(string(term)) == termset.end())
 427             return NEW_QUERY;
 428     }
 429
 430     // Use termset.size() rather than n_new_terms so we correctly handle
 431     // the case when the query has repeated terms.
 432     // This works wrongly in the case when the user extends the query
 433     // by adding a term already in it, but that's unlikely and the behaviour
 434     // isn't too bad (we just don't reset page 1).  We also mishandle a few
 435     // other obscure cases e.g. adding quotes to turn a query into a phrase.
 436     if (termset.size() > n_old_terms) return EXTENDED_QUERY;
 437     return SAME_QUERY;
 438 }
 439
 440 static multimap<string, string> filter_map;
 441 static set<string> neg_filters;
 442
 443 typedef multimap<string, string>::const_iterator FMCI;
 444
 445 void add_bterm(const string &term) {
 446     string prefix;
 447     if (prefix_from_term(&prefix, term) > 0)
 448         filter_map.insert(multimap<string, string>::value_type(prefix, term));
 449 }
 450
 451 void add_nterm(const string &term) {
 452     if (!term.empty())
 453         neg_filters.insert(term);
 454 }
 455
 456 static void
 457 run_query()
 458 {
 459     string scheme;
 460     bool force_boolean = false;
 461     if (!filter_map.empty()) {
 462         // OR together filters with the same prefix (or AND for non-exclusive
 463         // prefixes), then AND together the resultant groups.
 464         vector<Xapian::Query> filter_vec;
 465         vector<string> same_vec;
 466         string current;
 467         for (FMCI i = filter_map.begin(); ; ++i) {
 468             bool over = (i == filter_map.end());
 469             if (over || i->first != current) {
 470                 switch (same_vec.size()) {
 471                     case 0:
 472                         break;
 473                     case 1:
 474                         filter_vec.push_back(Xapian::Query(same_vec[0]));
 475                         break;
 476                     default: {
 477                         Xapian::Query::op op = Xapian::Query::OP_OR;
 478                         auto it = option.find("nonexclusiveprefix," + current);
 479                         if (it != option.end() && !it->second.empty()) {
 480                             op = Xapian::Query::OP_AND;
 481                         }
 482                         filter_vec.push_back(Xapian::Query(op,
 483                                                      same_vec.begin(),
 484                                                      same_vec.end()));
 485                         break;
 486                     }
 487                 }
 488                 same_vec.clear();
 489                 if (over) break;
 490                 current = i->first;
 491             }
 492             same_vec.push_back(i->second);
 493         }
 494
 495         Xapian::Query filter(Xapian::Query::OP_AND,
 496                              filter_vec.begin(), filter_vec.end());
 497
 498         if (query.empty()) {
 499             // If no probabilistic query is provided then promote the filters
 500             // to be THE query - filtering an empty query will give no
 501             // matches.
 502             std::swap(query, filter);
 503             auto&& it = option.find("weightingpurefilter");
 504             if (it != option.end() && !it->second.empty()) {
 505                 scheme = it->second;
 506             } else {
 507                 force_boolean = true;
 508             }
 509         } else {
 510             query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
 511         }
 512     }
 513
 514     if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
 515         Xapian::Query date_filter;
 516         if (date_value_slot != Xapian::BAD_VALUENO) {
 517             // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
 518             // latter the sort order just works correctly between different
 519             // precisions).
 520             bool as_time_t =
 521                 db.get_value_lower_bound(date_value_slot).size() == 4 &&
 522                 db.get_value_upper_bound(date_value_slot).size() == 4;
 523             date_filter = date_value_range(as_time_t, date_value_slot,
 524                                            date_start, date_end,
 525                                            date_span);
 526         } else {
 527             date_filter = date_range_filter(date_start, date_end, date_span);
 528             date_filter = Xapian::Query(Xapian::Query::OP_OR,
 529                                         date_filter,
 530                                         Xapian::Query("Dlatest"));
 531         }
 532
 533         // If no probabilistic query is provided then promote the daterange
 534         // filter to be THE query instead of filtering an empty query.
 535         if (query.empty()) {
 536             query = date_filter;
 537             force_boolean = true;
 538         } else {
 539             query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
 540         }
 541     }
 542
 543     if (!neg_filters.empty()) {
 544         // OR together all negated filters.
 545         Xapian::Query filter(Xapian::Query::OP_OR,
 546                              neg_filters.begin(), neg_filters.end());
 547
 548         if (query.empty()) {
 549             // If we only have a negative filter for the query, use MatchAll as
 550             // the query to apply the filters to.
 551             query = Xapian::Query::MatchAll;
 552             force_boolean = true;
 553         }
 554         query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
 555     }
 556
 557     if (!enquire || !error_msg.empty()) return;
 558
 559     if (!force_boolean && scheme.empty()) {
 560         auto&& it = option.find("weighting");
 561         if (it != option.end()) scheme = it->second;
 562     }
 563     set_weighting_scheme(*enquire, scheme, force_boolean);
 564
 565     enquire->set_cutoff(threshold);
 566
 567     if (sort_keymaker) {
 568         if (sort_after) {
 569             enquire->set_sort_by_relevance_then_key(sort_keymaker,
 570                                                     reverse_sort);
 571         } else {
 572             enquire->set_sort_by_key_then_relevance(sort_keymaker,
 573                                                     reverse_sort);
 574         }
 575     } else if (sort_key != Xapian::BAD_VALUENO) {
 576         if (sort_after) {
 577             enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
 578         } else {
 579             enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
 580         }
 581     }
 582
 583     enquire->set_docid_order(docid_order);
 584
 585     if (collapse) {
 586         enquire->set_collapse_key(collapse_key);
 587     }
 588
 589     if (!query.empty()) {
 590 #if 0
 591         // FIXME: If we start doing permissions checks based on $REMOTE_USER
 592         // we're going to break some existing setups if users upgrade.  We
 593         // probably want a way to set this from OmegaScript.
 594         const char * remote_user = getenv("REMOTE_USER");
 595         if (remote_user)
 596             apply_unix_permissions(query, remote_user);
 597 #endif
 598
 599         enquire->set_query(query);
 600         // We could use the value of topdoc as first parameter, but we
 601         // need to know the first few items in the mset to fake a
 602         // relevance set for topterms.
 603         //
 604         // If min_hits isn't set, check at least one extra result so we
 605         // know if we've reached the end of the matches or not - then we
 606         // can avoid offering a "next" button which leads to an empty page.
 607         mset = enquire->get_mset(0, topdoc + hits_per_page,
 608                                  topdoc + max(hits_per_page + 1, min_hits),
 609                                  &rset);
 610     }
 611 }
 612
 613 string
 614 html_escape(const string &str)
 615 {
 616     string res;
 617     string::size_type p = 0;
 618     while (p < str.size()) {
 619         char ch = str[p++];
 620         switch (ch) {
 621             case '<':
 622                 res += "&lt;";
 623                 continue;
 624             case '>':
 625                 res += "&gt;";
 626                 continue;
 627             case '&':
 628                 res += "&amp;";
 629                 continue;
 630             case '"':
 631                 res += "&quot;";
 632                 continue;
 633             default:
 634                 res += ch;
 635         }
 636     }
 637     return res;
 638 }
 639
 640 static string
 641 html_strip(const string &str)
 642 {
 643     string res;
 644     string::size_type p = 0;
 645     bool skip = false;
 646     while (p < str.size()) {
 647         char ch = str[p++];
 648         switch (ch) {
 649             case '<':
 650                 skip = true;
 651                 continue;
 652             case '>':
 653                 skip = false;
 654                 continue;
 655             default:
 656                 if (! skip) res += ch;
 657         }
 658     }
 659     return res;
 660 }
 661
 662 class WordList {
 663     static string prev_list;
 664     static unordered_map<string, int> word_to_occurrence;
 665   public:
 666     void build_word_map(const string& list) {
 667         // Don't build map again if passed list of terms is same as before.
 668         if (prev_list == list) return;
 669         word_to_occurrence.clear();
 670         string::size_type split = 0, split2;
 671         int word_index = 0;
 672         string word;
 673         while ((split2 = list.find('\t', split)) != string::npos) {
 674             word = list.substr(split, split2 - split);
 675             if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 676                 ++word_index;
 677             split = split2 + 1;
 678         }
 679         word = list.substr(split, list.size() - split);
 680         if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 681             ++word_index;
 682         prev_list = list;
 683     }
 684
 685     int word_in_list(const string& word) {
 686         auto it = word_to_occurrence.find(word);
 687         if (it == word_to_occurrence.end()) return -1;
 688         return it->second;
 689     }
 690 };
 691
 692 string WordList::prev_list;
 693 unordered_map<string, int> WordList::word_to_occurrence;
 694
 695 // Not a character in an identifier
 696 inline static bool
 697 p_notid(unsigned int c)
 698 {
 699     return !C_isalnum(c) && c != '_';
 700 }
 701
 702 // Not a character in an HTML tag name
 703 inline static bool
 704 p_nottag(unsigned int c)
 705 {
 706     return !C_isalnum(c) && c != '.' && c != '-';
 707 }
 708
 709 // FIXME: shares algorithm with indextext.cc!
 710 static string
 711 html_highlight(const string &s, const string &list,
 712                const string &bra, const string &ket)
 713 {
 714     if (!stemmer) {
 715         stemmer = new Xapian::Stem(option["stemmer"]);
 716     }
 717
 718     string res;
 719
 720     Utf8Iterator j(s);
 721     const Utf8Iterator s_end;
 722     while (true) {
 723         Utf8Iterator first = j;
 724         while (first != s_end && !is_wordchar(*first)) ++first;
 725         if (first == s_end) break;
 726         Utf8Iterator term_end;
 727         string term;
 728         string word;
 729         const char *l = j.raw();
 730         if (*first < 128 && C_isupper(*first)) {
 731             j = first;
 732             Xapian::Unicode::append_utf8(term, *j);
 733             while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
 734                 Xapian::Unicode::append_utf8(term, *j);
 735             }
 736             if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
 737                 term.resize(0);
 738             }
 739             term_end = j;
 740         }
 741         if (term.empty()) {
 742             j = first;
 743             while (is_wordchar(*j)) {
 744                 Xapian::Unicode::append_utf8(term, *j);
 745                 ++j;
 746                 if (j == s_end) break;
 747                 if (*j == '&' || *j == '\'') {
 748                     Utf8Iterator next = j;
 749                     ++next;
 750                     if (next == s_end || !is_wordchar(*next)) break;
 751                     term += *j;
 752                     j = next;
 753                 }
 754             }
 755             term_end = j;
 756             if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
 757                 string::size_type len = term.length();
 758                 if (*j == '#') {
 759                     term += '#';
 760                     do { ++j; } while (j != s_end && *j == '#');
 761                 } else {
 762                     while (j != s_end && (*j == '+' || *j == '-')) {
 763                         Xapian::Unicode::append_utf8(term, *j);
 764                         ++j;
 765                     }
 766                 }
 767                 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
 768                     term.resize(len);
 769                 } else {
 770                     term_end = j;
 771                 }
 772             }
 773         }
 774         j = term_end;
 775         term = Xapian::Unicode::tolower(term);
 776         WordList w;
 777         w.build_word_map(list);
 778         int match = w.word_in_list(term);
 779         if (match == -1) {
 780             string stem = "Z";
 781             stem += (*stemmer)(term);
 782             match = w.word_in_list(stem);
 783         }
 784         if (match >= 0) {
 785             res += html_escape(string(l, first.raw() - l));
 786             if (!bra.empty()) {
 787                 res += bra;
 788             } else {
 789                 static const char * colours[] = {
 790                     "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
 791                     "990000", "009900", "996600", "006699", "990099"
 792                 };
 793                 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
 794                 const char * bg = colours[idx];
 795                 if (strchr(bg, 'f')) {
 796                     res += "<b style=\"color:black;background-color:#";
 797                 } else {
 798                     res += "<b style=\"color:white;background-color:#";
 799                 }
 800                 res += bg;
 801                 res += "\">";
 802             }
 803             word.assign(first.raw(), j.raw() - first.raw());
 804             res += html_escape(word);
 805             if (!bra.empty()) {
 806                 res += ket;
 807             } else {
 808                 res += "</b>";
 809             }
 810         } else {
 811             res += html_escape(string(l, j.raw() - l));
 812         }
 813     }
 814     if (j != s_end) res += html_escape(string(j.raw(), j.left()));
 815     return res;
 816 }
 817
#if 0
// Disabled code (it refers to url_query_string, whose declaration is also
// commented out).
//
// Writes url_query_string to cout.  If after starts with "&B=", the
// character which follows is taken as a prefix and the "B=" parameters whose
// value starts with that prefix are left out of what's written.
static void
print_query_string(const char *after)
{
    if (after && strncmp(after, "&B=", 3) == 0) {
        char prefix = after[3];
        string::size_type start = 0, amp = 0;
        while (true) {
            amp = url_query_string.find('&', amp);
            if (amp == string::npos) {
                cout << url_query_string.substr(start);
                return;
            }
            amp++;
            while (url_query_string[amp] == 'B' &&
                   url_query_string[amp + 1] == '=' &&
                   url_query_string[amp + 2] == prefix) {
                cout << url_query_string.substr(start, amp - start - 1);
                start = url_query_string.find('&', amp + 3);
                if (start == string::npos) return;
                amp = start + 1;
            }
        }
    }
    cout << url_query_string;
}
#endif
 845
 846 class Fields {
         // Cache of the named fields parsed from one document's data.
         //
         // Only a single document is cached at a time: asking for a field of a
         // different docid re-parses and replaces the cache.  Both members are
         // `mutable` so that the const accessors can refill the cache.

         // Document id whose fields are currently held in `fields` (starts at 0).
 847     mutable Xapian::docid did_cached;
         // Field name -> field value for document did_cached.
 848     mutable map<string, string> fields;
 849
         // Clear the cache and repopulate it from document `did`.
 850     void read_fields(Xapian::docid did) const;
 851
 852   public:
 853     Fields() : did_cached(0) { }
 854
         // Return the value of `field` in document `did`, parsing the document
         // first if it isn't the cached one.
         //
         // The lookup uses operator[], so a field which isn't present is added
         // to the cache with an empty value and that empty string is returned.
         // The reference points into the cache, which read_fields() clears when
         // a different document is requested.
 855     const string & get_field(Xapian::docid did, const string & field) const {
 856         if (did != did_cached) read_fields(did);
 857         return fields[field];
 858     }
 859 };
 860
 861 void
 862 Fields::read_fields(Xapian::docid did) const
 863 {
         // Rebuild the field cache from the data of document `did`: whatever was
         // cached before is discarded and `did` is recorded in did_cached.
 864     fields.clear();
 865     did_cached = did;
 866     const string & data = db.get_document(did).get_data();
 867
 868     // Parse document data.
 869     string::size_type i = 0;
 870     const string & names = option["fieldnames"];
 871     if (!names.empty()) {
 872         // Each line is a field, with fieldnames taken from corresponding
 873         // entries in the tab-separated list specified by $opt{fieldnames}.
             //
             // `n` walks the list of names and `i` walks the document data in
             // step.  insert() leaves an existing entry untouched, so if a name
             // is repeated the first line paired with it is the one kept.
 874         string::size_type n = 0;
 875         do {
 876             string::size_type n0 = n;
 877             n = names.find('\t', n);
 878             string::size_type i0 = i;
 879             i = data.find('\n', i);
 880             fields.insert(make_pair(names.substr(n0, n - n0),
 881                                     data.substr(i0, i - i0)));
                 // find() gives npos once the last name (or last line) has been
                 // consumed; incrementing npos wraps to 0, which ends the loop.
                 // So parsing stops as soon as either the names or the lines
                 // run out.
 882         } while (++n && ++i);
 883     } else {
 884         // Each line is a field, in the format NAME=VALUE.  We assume the field
 885         // name doesn't contain an "=".  Lines without an "=" are currently
 886         // just ignored.
             //
             // When a NAME occurs on several lines the values are joined with
             // tabs (no tab is added while the value gathered so far is empty).
 887         do {
 888             string::size_type i0 = i;
 889             i = data.find('\n', i);
 890             string line(data, i0, i - i0);
 891             string::size_type j = line.find('=');
 892             if (j != string::npos) {
 893                 string & value = fields[line.substr(0, j)];
 894                 if (!value.empty()) value += '\t';
 895                 value.append(line, j + 1, string::npos);
 896             }
                 // After the last line `i` is npos, and ++i wraps it to 0.
 897         } while (++i);
 898     }
 899 }
 900
 901 static Fields fields; // per-document field cache used by $field
 902 static Xapian::docid q0; // docid $field and $allterms use when no id argument is given
 903 static Xapian::doccount hit_no; // 0-based mset index reported by $hit; stepped by $hitlist
 904 static int percent; // NOTE(review): presumably the value behind $percentage - confirm
 905 static double weight; // NOTE(review): presumably the value behind $weight - confirm
 906 static Xapian::doccount collapsed; // value reported by $collapsed
 907
     // Forward declaration: eval() calls this once for each hit while expanding
     // $hitlist, passing the hitlist's format argument as `fmt`.
 908 static string print_caption(const string &fmt, const vector<string> &param);
 909
 910 enum tagval {
     // One tag per OmegaScript command.  Each T(...) row of func_tab stores the
     // matching CMD_ value in its func_attrib, and eval() switches on that tag.
 911 CMD_, // tag of the entry with the empty name ("commented out code")
 912 CMD_add,
 913 CMD_addfilter,
 914 CMD_allterms,
 915 CMD_and,
 916 CMD_cgi,
 917 CMD_cgilist,
 918 CMD_cgiparams,
 919 CMD_chr,
 920 CMD_collapsed,
 921 CMD_contains,
 922 CMD_csv,
 923 CMD_date,
 924 CMD_dbname,
 925 CMD_dbsize,
 926 CMD_def,
 927 CMD_defaultop,
 928 CMD_div,
 929 CMD_eq,
 930 CMD_emptydocs,
 931 CMD_env,
 932 CMD_error,
 933 CMD_field,
 934 CMD_filesize,
 935 CMD_filters,
 936 CMD_filterterms,
 937 CMD_find,
 938 CMD_fmt,
 939 CMD_freq,
 940 CMD_ge,
 941 CMD_gt,
 942 CMD_hash,
 943 CMD_highlight,
 944 CMD_hit,
 945 CMD_hitlist,
 946 CMD_hitsperpage,
 947 CMD_hostname,
 948 CMD_html,
 949 CMD_htmlstrip,
 950 CMD_httpheader,
 951 CMD_id,
 952 CMD_if,
 953 CMD_include,
 954 CMD_json,
 955 CMD_jsonarray,
 956 CMD_last,
 957 CMD_lastpage,
 958 CMD_le,
 959 CMD_length,
 960 CMD_list,
 961 CMD_log,
 962 CMD_lookup,
 963 CMD_lower,
 964 CMD_lt,
 965 CMD_map,
 966 CMD_match,
 967 CMD_max,
 968 CMD_min,
 969 CMD_mod,
 970 CMD_msize,
 971 CMD_msizeexact,
 972 CMD_msizelower,
 973 CMD_msizeupper,
 974 CMD_mul,
 975 CMD_muldiv,
 976 CMD_ne,
 977 CMD_nice,
 978 CMD_not,
 979 CMD_now,
 980 CMD_opt,
 981 CMD_or,
 982 CMD_ord,
 983 CMD_pack,
 984 CMD_percentage,
 985 CMD_prettyterm,
 986 CMD_prettyurl,
 987 CMD_query,
 988 CMD_querydescription,
 989 CMD_queryterms,
 990 CMD_range,
 991 CMD_record,
 992 CMD_relevant,
 993 CMD_relevants,
 994 CMD_score,
 995 CMD_set,
 996 CMD_seterror,
 997 CMD_setmap,
 998 CMD_setrelevant,
 999 CMD_slice,
1000 CMD_snippet,
1001 CMD_split,
1002 CMD_stoplist,
1003 CMD_sub,
1004 CMD_substr,
1005 CMD_suggestion,
1006 CMD_termprefix,
1007 CMD_terms,
1008 CMD_thispage,
1009 CMD_time,
1010 CMD_topdoc,
1011 CMD_topterms,
1012 CMD_transform,
1013 CMD_truncate,
1014 CMD_uniq,
1015 CMD_unpack,
1016 CMD_unprefix,
1017 CMD_unstem,
1018 CMD_upper,
1019 CMD_url,
1020 CMD_value,
1021 CMD_version,
1022 CMD_weight,
     // Keep CMD_MACRO as the last enumerator: $def gives the n-th user-defined
     // macro the tag CMD_MACRO + n, so every value from here up is a macro.
1023 CMD_MACRO // special tag for macro evaluation
1024 };
1025
1026 struct func_attrib {
         // Description of one OmegaScript command, as consulted by eval() before
         // it dispatches the command.

         // A tagval value for a built-in command, or CMD_MACRO + n for the n-th
         // macro created by $def.
1027     int tag;
         // minargs / maxargs: bounds on the number of arguments.  A maxargs of -1
         // means no upper limit.  A minargs of -1 means the text between the
         // braces is passed through as one unsplit argument, with no count
         // checks and no argument evaluation.
         // evalargs: how many leading arguments eval() evaluates before
         // dispatching; -1 means all of them.
1028     int minargs, maxargs, evalargs;
         // 'Q': eval() calls ensure_query_parsed() first.
         // 'M': eval() calls ensure_query_parsed() and then ensure_match() first.
         // 0: neither.
1029     char ensure;
1030 };
1031
1032 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
     // T(F, minargs, maxargs, evalargs, ensure), defined above, expands to the
     // initializer for one func_desc: the command name F as a string, plus a
     // func_attrib whose tag is CMD_F.  It is #undef'd again after func_tab.

     // One row of func_tab: a command name and that command's attributes.
1033 struct func_desc {
1034     const char *name;
1035     struct func_attrib a;
1036 };
1037
1038 #define N -1 // "unlimited" / "all" marker for the argument-count columns
1039 #define M 'M' // ensure: parse the query and run the match before the command
1040 #define Q 'Q' // ensure: parse the query before the command
     // Table of the built-in commands, terminated by an entry whose name is NULL.
     // eval() copies it into a name -> func_attrib map the first time it runs.
     // The columns after the name fill func_attrib's minargs, maxargs, evalargs
     // and ensure members, in that order.
     //
1041 // NB when adding a new command which ensures M or Q, update the list in
1042 // docs/omegascript.rst
1043 static struct func_desc func_tab[] = {
1044 //name minargs maxargs evalargs ensure
1045 {"",{CMD_,         N, N, 0, 0}},// commented out code
1046 T(add,             0, N, N, 0), // add a list of numbers
1047 T(addfilter,       1, 1, N, 0), // add filter term
1048 T(allterms,        0, 1, N, 0), // list of all terms matching document
1049 T(and,             1, N, 0, 0), // logical shortcutting and of a list of values
1050 T(cgi,             1, 1, N, 0), // return cgi parameter value
1051 T(cgilist,         1, 1, N, 0), // return list of values for cgi parameter
1052 T(cgiparams,       0, 0, N, 0), // return list of cgi parameter names
1053 T(chr,             1, 1, N, 0), // return UTF-8 for given Unicode codepoint
1054 T(collapsed,       0, 0, N, 0), // return number of hits collapsed into this
1055 T(contains,        2, 2, N, 0), // return position of substring, or empty string
1056 T(csv,             1, 2, N, 0), // CSV string escaping
1057 T(date,            1, 2, N, 0), // convert time_t to strftime format
1058                                 // (default: YYYY-MM-DD)
1059 T(dbname,          0, 0, N, 0), // database name
1060 T(dbsize,          0, 0, N, 0), // database size (# of documents)
1061 T(def,             2, 2, 1, 0), // define a macro
1062 T(defaultop,       0, 0, N, 0), // default operator: "and" or "or"
1063 T(div,             2, 2, N, 0), // integer divide
1064 T(emptydocs,       0, 1, N, 0), // list of empty documents
1065 T(env,             1, 1, N, 0), // environment variable
1066 T(error,           0, 0, N, 0), // error message
1067 T(eq,              2, 2, N, 0), // test equality
1068 T(field,           1, 2, N, 0), // lookup field in record
1069 T(filesize,        1, 1, N, 0), // pretty printed filesize
1070 T(filters,         0, 0, N, 0), // serialisation of current filters
1071 T(filterterms,     1, 1, N, 0), // list of terms with a given prefix
1072 T(find,            2, 2, N, 0), // find entry in list
1073 T(fmt,             0, 0, N, 0), // name of current format
1074 T(freq,            1, 1, N, 0), // frequency of a term
1075 T(ge,              2, 2, N, 0), // test >=
1076 T(gt,              2, 2, N, 0), // test >
1077 T(hash,            2, 2, N, 0), // hash a string using the specified hash function
1078 T(highlight,       2, 4, N, 0), // html escape and highlight words from list
1079 T(hit,             0, 0, N, 0), // hit number of current mset entry (0-based)
1080 T(hitlist,         1, 1, 0, M), // display hitlist using format in argument
1081 T(hitsperpage,     0, 0, N, 0), // hits per page
1082 T(hostname,        1, 1, N, 0), // extract hostname from URL
1083 T(html,            1, 1, N, 0), // html escape string (<>&")
1084 T(htmlstrip,       1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
1085 T(httpheader,      2, 2, N, 0), // arbitrary HTTP header
1086 T(id,              0, 0, N, 0), // docid of current doc
1087 T(if,              2, 3, 1, 0), // conditional
1088 T(include,         1, 1, 1, 0), // include another file
1089 T(json,            1, 1, N, 0), // JSON string escaping
1090 T(jsonarray,       1, 1, N, 0), // Format list as a JSON array of strings
1091 T(last,            0, 0, N, M), // hit number one beyond end of current page
1092 T(lastpage,        0, 0, N, M), // number of last hit page
1093 T(le,              2, 2, N, 0), // test <=
1094 T(length,          1, 1, N, 0), // length of list
1095 T(list,            2, 5, N, 0), // pretty print list
1096 T(log,             1, 2, 1, 0), // create a log entry
1097 T(lookup,          2, 2, N, 0), // lookup in named cdb file
1098 T(lower,           1, 1, N, 0), // convert string to lower case
1099 T(lt,              2, 2, N, 0), // test <
1100 T(map,             1, 2, 1, 0), // map a list into another list
1101 T(match,           2, 3, N, 0), // regex match
1102 T(max,             1, N, N, 0), // maximum of a list of values
1103 T(min,             1, N, N, 0), // minimum of a list of values
1104 T(mod,             2, 2, N, 0), // integer modulus
1105 T(msize,           0, 0, N, M), // number of matches (estimated)
1106 T(msizeexact,      0, 0, N, M), // is $msize exact?
1107 T(msizelower,      0, 0, N, M), // number of matches (lower bound)
1108 T(msizeupper,      0, 0, N, M), // number of matches (upper bound)
1109 T(mul,             2, N, N, 0), // multiply a list of numbers
1110 T(muldiv,          3, 3, N, 0), // calculate A*B/C
1111 T(ne,              2, 2, N, 0), // test not equal
1112 T(nice,            1, 1, N, 0), // pretty print integer (with thousands sep)
1113 T(not,             1, 1, N, 0), // logical not
1114 T(now,             0, 0, N, 0), // current date/time as a time_t
1115 T(opt,             1, 2, N, 0), // lookup an option value
1116 T(or,              1, N, 0, 0), // logical shortcutting or of a list of values
1117 T(ord,             1, 1, N, 0), // return codepoint for first character of UTF-8 string
1118 T(pack,            1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1119 T(percentage,      0, 0, N, 0), // percentage score of current hit
1120 T(prettyterm,      1, 1, N, Q), // pretty print term name
1121 T(prettyurl,       1, 1, N, 0), // pretty version of URL
1122 T(query,           0, 1, N, Q), // query
1123 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
1124 T(queryterms,      0, 0, N, Q), // list of query terms
1125 T(range,           2, 2, N, 0), // return list of values between start and end
1126 T(record,          0, 1, N, 0), // record contents of document
1127 T(relevant,        0, 1, N, Q), // is document relevant?
1128 T(relevants,       0, 0, N, Q), // return list of relevant documents
1129 T(score,           0, 0, N, 0), // score (0-10) of current hit
1130 T(set,             2, 2, N, 0), // set option value
1131 T(seterror,        1, 1, N, 0), // set error_msg, setting it early stops query execution
1132 T(setmap,          1, N, N, 0), // set map of option values
1133 T(setrelevant,     0, 1, N, Q), // set rset
1134 T(slice,           2, 2, N, 0), // slice a list using a second list
1135 T(snippet,         1, 2, N, M), // generate snippet from text
1136 T(split,           1, 2, N, 0), // split a string to give a list
1137 T(stoplist,        0, 0, N, Q), // return list of stopped terms
1138 T(sub,             2, 2, N, 0), // subtract
1139 T(substr,          2, 3, N, 0), // substring
1140 T(suggestion,      0, 0, N, Q), // misspelled word correction suggestion
1141 T(termprefix,      1, 1, N, 0), // get any prefix from a term
1142 T(terms,           0, 1, N, M), // list of matching terms
1143 T(thispage,        0, 0, N, M), // page number of current page
1144 T(time,            0, 0, N, M), // how long the match took (in seconds)
1145 T(topdoc,          0, 0, N, M), // first document on current page of hit list
1146                                 // (counting from 0)
1147 T(topterms,        0, 1, N, M), // list of up to N top relevance feedback terms
1148                                 // (default 16)
1149 T(transform,       3, 4, N, 0), // transform with a regexp
1150 T(truncate,        2, 4, N, 0), // truncate after a word
1151 T(uniq,            1, 1, N, 0), // remove duplicates from a sorted list
1152 T(unpack,          1, 1, N, 0), // convert 4 byte big endian binary string to a number
1153 T(unprefix,        1, 1, N, 0), // remove any prefix from a term
1154 T(unstem,          1, 1, N, Q), // return list of probabilistic terms from
1155                                 // the query which stemmed to this term
1156 T(upper,           1, 1, N, 0), // convert string to upper case
1157 T(url,             1, 1, N, 0), // url encode argument
1158 T(value,           1, 2, N, 0), // return document value
1159 T(version,         0, 0, N, 0), // omega version string
1160 T(weight,          0, 0, N, 0), // weight of the current hit
     // Sentinel: the NULL name marks the end of the table.
1161 { NULL,{0,         0, 0, 0, 0}}
1162 };
1163
1164 #undef T // Leaving T defined screws up Sun's C++ compiler!
1165
1166 static vector<string> macros; // bodies of $def macros; macro n has tag CMD_MACRO + n
1167
1168 // Call write() repeatedly until all data is written or we get a
1169 // non-recoverable error.
     //
     // Returns 0 once all `count` bytes starting at `buf` have been written to
     // `fd`.  If write() fails with an errno other than EINTR, the negative
     // value it returned is passed back; a failure with EINTR is retried.
1170 static ssize_t
1171 write_all(int fd, const char * buf, size_t count)
1172 {
1173     while (count) {
1174         ssize_t r = write(fd, buf, count);
1175         if (rare(r < 0)) {
1176             if (errno == EINTR) continue;
1177             return r;
1178         }
             // Partial write: step past the bytes that were accepted and go
             // round again for the rest.
             // NOTE(review): a write() that returns 0 leaves buf and count
             // unchanged, so the loop would simply call write() again.
1179         buf += r;
1180         count -= r;
1181     }
1182     return 0;
1183 }
1184
1185 static string
1186 eval(const string &fmt, const vector<string> &param)
1187 {
1188     static map<string, const struct func_attrib *> func_map;
1189     if (func_map.empty()) {
1190         struct func_desc *p;
1191         for (p = func_tab; p->name != NULL; ++p) {
1192             func_map[string(p->name)] = &(p->a);
1193         }
1194     }
1195     string res;
1196     string::size_type p = 0, q;
1197     while ((q = fmt.find('$', p)) != string::npos) try {
1198         res.append(fmt, p, q - p);
1199         string::size_type code_start = q; // note down for error reporting
1200         q++;
1201         if (q >= fmt.size()) break;
1202         unsigned char ch = fmt[q];
1203         switch (ch) {
1204             // Magic sequences:
1205             // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1206             case '$':
1207                 res += '$';
1208                 p = q + 1;
1209                 continue;
1210             case '(':
1211                 res += '{';
1212                 p = q + 1;
1213                 continue;
1214             case ')':
1215                 res += '}';
1216                 p = q + 1;
1217                 continue;
1218             case '.':
1219                 res += ',';
1220                 p = q + 1;
1221                 continue;
1222             case '_':
1223                 ch = '0';
1224                 // FALL THRU
1225             case '1': case '2': case '3': case '4': case '5':
1226             case '6': case '7': case '8': case '9':
1227                 ch -= '0';
1228                 if (ch < param.size()) res += param[ch];
1229                 p = q + 1;
1230                 continue;
1231             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1232             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1233             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1234             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1235             case 'y': case 'z':
1236             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1237             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1238             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1239             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1240             case 'Y': case 'Z':
1241             case '{':
1242                 break;
1243             default:
1244                 string msg = "Unknown $ code in: $";
1245                 msg.append(fmt, q, string::npos);
1246                 throw msg;
1247         }
1248         p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1249         string var(fmt, q, p - q);
1250         map<string, const struct func_attrib *>::const_iterator func;
1251         func = func_map.find(var);
1252         if (func == func_map.end()) {
1253             throw "Unknown function '" + var + "'";
1254         }
1255         vector<string> args;
1256         if (fmt[p] == '{') {
1257             q = p + 1;
1258             int nest = 1;
1259             while (true) {
1260                 p = fmt.find_first_of(",{}", p + 1);
1261                 if (p == string::npos)
1262                     throw "missing } in " + fmt.substr(code_start);
1263                 if (fmt[p] == '{') {
1264                     ++nest;
1265                 } else {
1266                     if (nest == 1) {
1267                         // should we split the args
1268                         if (func->second->minargs != N) {
1269                             args.push_back(fmt.substr(q, p - q));
1270                             q = p + 1;
1271                         }
1272                     }
1273                     if (fmt[p] == '}' && --nest == 0) break;
1274                 }
1275             }
1276             if (func->second->minargs == N)
1277                 args.push_back(fmt.substr(q, p - q));
1278             ++p;
1279         }
1280
1281         if (func->second->minargs != N) {
1282             if (int(args.size()) < func->second->minargs)
1283                 throw "too few arguments to $" + var;
1284             if (func->second->maxargs != N &&
1285                 int(args.size()) > func->second->maxargs)
1286                 throw "too many arguments to $" + var;
1287
1288             vector<string>::size_type n;
1289             if (func->second->evalargs != N)
1290                 n = func->second->evalargs;
1291             else
1292                 n = args.size();
1293
1294             for (vector<string>::size_type j = 0; j < n; ++j)
1295                 args[j] = eval(args[j], param);
1296         }
1297         if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1298             ensure_query_parsed();
1299         if (func->second->ensure == 'M') ensure_match();
1300         string value;
1301         switch (func->second->tag) {
1302             case CMD_:
1303                 break;
1304             case CMD_add: {
1305                 int total = 0;
1306                 vector<string>::const_iterator i;
1307                 for (auto&& arg : args)
1308                     total += string_to_int(arg);
1309                 value = str(total);
1310                 break;
1311             }
1312             case CMD_addfilter:
1313                 add_bterm(args[0]);
1314                 break;
1315             case CMD_allterms: {
1316                 // list of all terms indexing document
1317                 int id = q0;
1318                 if (!args.empty()) id = string_to_int(args[0]);
1319                 for (Xapian::TermIterator term = db.termlist_begin(id);
1320                      term != db.termlist_end(id); ++term) {
1321                     value += *term;
1322                     value += '\t';
1323                 }
1324
1325                 if (!value.empty()) value.erase(value.size() - 1);
1326                 break;
1327             }
1328             case CMD_and: {
1329                 value = "true";
1330                 for (auto&& arg : args) {
1331                     if (eval(arg, param).empty()) {
1332                         value.resize(0);
1333                         break;
1334                     }
1335                 }
1336                 break;
1337             }
1338             case CMD_cgi: {
1339                 MCI i = cgi_params.find(args[0]);
1340                 if (i != cgi_params.end()) value = i->second;
1341                 break;
1342             }
1343             case CMD_cgilist: {
1344                 pair<MCI, MCI> g;
1345                 g = cgi_params.equal_range(args[0]);
1346                 for (MCI i = g.first; i != g.second; ++i) {
1347                     value += i->second;
1348                     value += '\t';
1349                 }
1350                 if (!value.empty()) value.erase(value.size() - 1);
1351                 break;
1352             }
1353             case CMD_cgiparams: {
1354                 const string* prev = NULL;
1355                 for (auto&& i : cgi_params) {
1356                     if (prev && i.first == *prev) continue;
1357                     value += i.first;
1358                     value += '\t';
1359                     prev = &i.first;
1360                 }
1361                 if (!value.empty()) value.erase(value.size() - 1);
1362                 break;
1363             }
1364             case CMD_chr:
1365                 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1366                 break;
1367             case CMD_collapsed: {
1368                 value = str(collapsed);
1369                 break;
1370             }
1371             case CMD_contains: {
1372                 size_t pos = args[1].find(args[0]);
1373                 if (pos != string::npos) {
1374                     value = str(pos);
1375                 }
1376                 break;
1377             }
1378             case CMD_csv:
1379                 value = args[0];
1380                 if (args.size() > 1 && !args[1].empty()) {
1381                     csv_escape_always(value);
1382                 } else {
1383                     csv_escape(value);
1384                 }
1385                 break;
1386             case CMD_date:
1387                 value = args[0];
1388                 if (!value.empty()) {
1389                     char buf[64] = "";
1390                     time_t date = string_to_int(value);
1391                     if (date != static_cast<time_t>(-1)) {
1392                         struct tm *then;
1393                         then = gmtime(&date);
1394                         string date_fmt = "%Y-%m-%d";
1395                         if (args.size() > 1) date_fmt = eval(args[1], param);
1396                         strftime(buf, sizeof buf, date_fmt.c_str(), then);
1397                     }
1398                     value = buf;
1399                 }
1400                 break;
1401             case CMD_dbname:
1402                 value = dbname;
1403                 break;
1404             case CMD_dbsize: {
1405                 static Xapian::doccount dbsize;
1406                 if (!dbsize) dbsize = db.get_doccount();
1407                 value = str(dbsize);
1408                 break;
1409             }
1410             case CMD_def: {
1411                 func_attrib *fa = new func_attrib;
1412                 fa->tag = CMD_MACRO + macros.size();
1413                 fa->minargs = 0;
1414                 fa->maxargs = 9;
1415                 fa->evalargs = N; // FIXME: or 0?
1416                 fa->ensure = 0;
1417
1418                 macros.push_back(args[1]);
1419                 func_map[args[0]] = fa;
1420                 break;
1421             }
1422             case CMD_defaultop:
1423                 if (default_op == Xapian::Query::OP_AND) {
1424                     value = "and";
1425                 } else {
1426                     value = "or";
1427                 }
1428                 break;
1429             case CMD_div: {
1430                 int denom = string_to_int(args[1]);
1431                 if (denom == 0) {
1432                     value = "divide by 0";
1433                 } else {
1434                     value = str(string_to_int(args[0]) /
1435                                 string_to_int(args[1]));
1436                 }
1437                 break;
1438             }
1439             case CMD_eq:
1440                 if (args[0] == args[1]) value = "true";
1441                 break;
1442             case CMD_emptydocs: {
1443                 string t;
1444                 if (!args.empty())
1445                     t = args[0];
1446                 Xapian::PostingIterator i;
1447                 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1448                     if (i.get_doclength() != 0) continue;
1449                     if (!value.empty()) value += '\t';
1450                     value += str(*i);
1451                 }
1452                 break;
1453             }
1454             case CMD_env: {
1455                 char *env = getenv(args[0].c_str());
1456                 if (env != NULL) value = env;
1457                 break;
1458             }
1459             case CMD_error:
1460                 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1461                     error_msg = "Database '" + dbname + "' couldn't be opened";
1462                 }
1463                 value = error_msg;
1464                 break;
1465             case CMD_field: {
1466                 Xapian::docid did = q0;
1467                 if (args.size() > 1) did = string_to_int(args[1]);
1468                 value = fields.get_field(did, args[0]);
1469                 break;
1470             }
1471             case CMD_filesize: {
1472                 // FIXME: rounding?  i18n?
1473                 int size = string_to_int(args[0]);
1474                 int intpart = size;
1475                 int fraction = -1;
1476                 const char * format = 0;
1477                 if (size < 0) {
1478                     // Negative size -> empty result.
1479                 } else if (size == 1) {
1480                     format = "%d byte";
1481                 } else if (size < 1024) {
1482                     format = "%d bytes";
1483                 } else {
1484                     if (size < 1024 * 1024) {
1485                         format = "%d.%cK";
1486                     } else {
1487                         size /= 1024;
1488                         if (size < 1024 * 1024) {
1489                             format = "%d.%cM";
1490                         } else {
1491                             size /= 1024;
1492                             format = "%d.%cG";
1493                         }
1494                     }
1495                     intpart = unsigned(size) / 1024;
1496                     fraction = unsigned(size) % 1024;
1497                 }
1498                 if (format) {
1499                     char buf[200];
1500                     int len;
1501                     if (fraction == -1) {
1502                         len = my_snprintf(buf, sizeof(buf), format, intpart);
1503                     } else {
1504                         fraction = (fraction * 10 / 1024) + '0';
1505                         len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1506                     }
1507                     if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1508                     value.assign(buf, len);
1509                 }
1510                 break;
1511             }
1512             case CMD_filters:
1513                 value = filters;
1514                 break;
1515             case CMD_filterterms: {
1516                 Xapian::TermIterator term = db.allterms_begin();
1517                 term.skip_to(args[0]);
1518                 while (term != db.allterms_end()) {
1519                     string t = *term;
1520                     if (!startswith(t, args[0])) break;
1521                     value += t;
1522                     value += '\t';
1523                     ++term;
1524                 }
1525
1526                 if (!value.empty()) value.erase(value.size() - 1);
1527                 break;
1528             }
1529             case CMD_find: {
1530                 string l = args[0], s = args[1];
1531                 string::size_type i = 0, j = 0;
1532                 size_t count = 0;
1533                 while (j != l.size()) {
1534                     j = l.find('\t', i);
1535                     if (j == string::npos) j = l.size();
1536                     if (j - i == s.length()) {
1537                         if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1538                             value = str(count);
1539                             break;
1540                         }
1541                     }
1542                     ++count;
1543                     i = j + 1;
1544                 }
1545                 break;
1546             }
1547             case CMD_fmt:
1548                 value = fmtname;
1549                 break;
1550             case CMD_freq: {
1551                 const string& term = args[0];
1552                 Xapian::doccount termfreq = 0;
1553                 if (done_query) {
1554                     termfreq = mset.get_termfreq(term);
1555                 }
1556                 if (termfreq == 0) {
1557                     // We want $freq to work before the match is run, and we
1558                     // don't want using it to force the match to run.
1559                     termfreq = db.get_termfreq(term);
1560                 }
1561                 value = str(termfreq);
1562                 break;
1563             }
1564             case CMD_ge:
1565                 if (string_to_int(args[0]) >= string_to_int(args[1]))
1566                     value = "true";
1567                 break;
1568             case CMD_gt:
1569                 if (string_to_int(args[0]) > string_to_int(args[1]))
1570                     value = "true";
1571                 break;
1572             case CMD_hash: {
1573                 const string& data = args[0];
1574                 const string& hash = args[1];
1575                 if (hash == "md5") {
1576                     string md5;
1577                     md5_string(data, md5);
1578                     value.reserve(md5.size() * 2);
1579                     for (unsigned char byte : md5) {
1580                         value += "0123456789abcdef"[byte >> 4];
1581                         value += "0123456789abcdef"[byte & 0x0f];
1582                     }
1583                 } else {
1584                     throw "Unknown hash function: " + hash;
1585                 }
1586                 break;
1587             }
1588             case CMD_highlight: {
1589                 string bra, ket;
1590                 if (args.size() > 2) {
1591                     bra = args[2];
1592                     if (args.size() > 3) {
1593                         ket = args[3];
1594                     } else {
1595                         string::const_iterator i;
1596                         i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1597                         ket = "</";
1598                         ket.append(bra, 1, i - bra.begin() - 1);
1599                         ket += '>';
1600                     }
1601                 }
1602
1603                 value = html_highlight(args[0], args[1], bra, ket);
1604                 break;
1605             }
1606             case CMD_hit:
1607                 // 0-based mset index
1608                 value = str(hit_no);
1609                 break;
1610             case CMD_hitlist:
1611 #if 0
1612                 url_query_string = "?DB=";
1613                 url_query_string += dbname;
1614                 multimap<string, string>::const_iterator j;
1615                 for (j = probabilistic_query.begin();
1616                      j != probabilistic_query.end();
1617                      ++j) {
1618                     if (j->first.empty()) {
1619                         url_query_string += "&P=";
1620                     } else {
1621                         url_query_string += "&P."
1622                         url_query_string += j->first;
1623                         url_query_string += '=';
1624                     }
1625                     const char *q = j->second.c_str();
1626                     int ch;
1627                     while ((ch = *q++) != '\0') {
1628                         switch (ch) {
1629                          case '+':
1630                             url_query_string += "%2b";
1631                             break;
1632                          case '"':
1633                             url_query_string += "%22";
1634                             break;
1635                          case '%':
1636                             url_query_string += "%25";
1637                             break;
1638                          case '&':
1639                             url_query_string += "%26";
1640                             break;
1641                          case ' ':
1642                             ch = '+';
1643                             /* fall through */
1644                          default:
1645                             url_query_string += ch;
1646                         }
1647                     }
1648                 }
1649                 // add any boolean terms
1650                 for (FMCI i = filter_map.begin(); i != filter_map.end(); ++i) {
1651                     url_query_string += "&B=";
1652                     url_query_string += i->second;
1653                 }
1654 #endif
1655                 for (hit_no = topdoc; hit_no < last; ++hit_no)
1656                     value += print_caption(args[0], param);
1657                 hit_no = 0;
1658                 break;
1659             case CMD_hitsperpage:
1660                 value = str(hits_per_page);
1661                 break;
1662             case CMD_hostname: {
1663                 value = args[0];
1664                 // remove URL scheme and/or path
1665                 string::size_type i = value.find("://");
1666                 if (i == string::npos) i = 0; else i += 3;
1667                 value = value.substr(i, value.find('/', i) - i);
1668                 // remove user@ or user:password@
1669                 i = value.find('@');
1670                 if (i != string::npos) value.erase(0, i + 1);
1671                 // remove :port
1672                 i = value.find(':');
1673                 if (i != string::npos) value.resize(i);
1674                 break;
1675             }
1676             case CMD_html:
1677                 value = html_escape(args[0]);
1678                 break;
1679             case CMD_htmlstrip:
1680                 value = html_strip(args[0]);
1681                 break;
1682             case CMD_httpheader:
1683                 if (!suppress_http_headers) {
1684                     cout << args[0] << ": " << args[1] << endl;
1685                     if (!set_content_type && args[0].length() == 12 &&
1686                             strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1687                         set_content_type = true;
1688                     }
1689                 }
1690                 break;
1691             case CMD_id:
1692                 // document id
1693                 value = str(q0);
1694                 break;
1695             case CMD_if:
1696                 if (!args[0].empty())
1697                     value = eval(args[1], param);
1698                 else if (args.size() > 2)
1699                     value = eval(args[2], param);
1700                 break;
1701             case CMD_include:
1702                 value = eval_file(args[0]);
1703                 break;
1704             case CMD_json:
1705                 value = args[0];
1706                 json_escape(value);
1707                 break;
1708             case CMD_jsonarray: {
1709                 const string & l = args[0];
1710                 string::size_type i = 0, j;
1711                 if (l.empty()) {
1712                     value = "[]";
1713                     break;
1714                 }
1715                 value = "[\"";
1716                 while (true) {
1717                     j = l.find('\t', i);
1718                     string elt(l, i, j - i);
1719                     json_escape(elt);
1720                     value += elt;
1721                     if (j == string::npos) break;
1722                     value += "\",\"";
1723                     i = j + 1;
1724                 }
1725                 value += "\"]";
1726                 break;
1727             }
1728             case CMD_last:
1729                 value = str(last);
1730                 break;
1731             case CMD_lastpage: {
1732                 int l = mset.get_matches_estimated();
1733                 if (l > 0) l = (l - 1) / hits_per_page + 1;
1734                 value = str(l);
1735                 break;
1736             }
1737             case CMD_le:
1738                 if (string_to_int(args[0]) <= string_to_int(args[1]))
1739                     value = "true";
1740                 break;
1741             case CMD_length:
1742                 if (args[0].empty()) {
1743                     value = "0";
1744                 } else {
1745                     size_t length = count(args[0].begin(), args[0].end(), '\t');
1746                     value = str(length + 1);
1747                 }
1748                 break;
1749             case CMD_list: {
1750                 if (!args[0].empty()) {
1751                     string pre, inter, interlast, post;
1752                     switch (args.size()) {
1753                      case 2:
1754                         inter = interlast = args[1];
1755                         break;
1756                      case 3:
1757                         inter = args[1];
1758                         interlast = args[2];
1759                         break;
1760                      case 4:
1761                         pre = args[1];
1762                         inter = interlast = args[2];
1763                         post = args[3];
1764                         break;
1765                      case 5:
1766                         pre = args[1];
1767                         inter = args[2];
1768                         interlast = args[3];
1769                         post = args[4];
1770                         break;
1771                     }
1772                     value += pre;
1773                     string list = args[0];
1774                     string::size_type split = 0, split2;
1775                     while ((split2 = list.find('\t', split)) != string::npos) {
1776                         if (split) value += inter;
1777                         value.append(list, split, split2 - split);
1778                         split = split2 + 1;
1779                     }
1780                     if (split) value += interlast;
1781                     value.append(list, split, string::npos);
1782                     value += post;
1783                 }
1784                 break;
1785             }
1786             case CMD_log: {
1787                 if (!vet_filename(args[0])) break;
1788                 string logfile = log_dir + args[0];
1789                 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1790                 if (fd == -1) break;
1791                 vector<string> noargs;
1792                 noargs.resize(1);
1793                 string line;
1794                 if (args.size() > 1) {
1795                     line = args[1];
1796                 } else {
1797                     line = DEFAULT_LOG_ENTRY;
1798                 }
1799                 line = eval(line, noargs);
1800                 line += '\n';
1801                 (void)write_all(fd, line.data(), line.length());
1802                 close(fd);
1803                 break;
1804             }
1805             case CMD_lookup: {
1806                 if (!vet_filename(args[0])) break;
1807                 string cdbfile = cdb_dir + args[0];
1808                 int fd = open(cdbfile.c_str(), O_RDONLY);
1809                 if (fd == -1) break;
1810
1811                 struct cdb cdb;
1812                 cdb_init(&cdb, fd);
1813
1814                 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1815                     size_t datalen = cdb_datalen(&cdb);
1816                     const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1817                     if (q) {
1818                         value.assign(static_cast<const char *>(dat), datalen);
1819                     }
1820                 }
1821
1822                 cdb_free(&cdb);
1823                 close(fd); // FIXME: cache fds?
1824                 break;
1825             }
1826             case CMD_lower:
1827                 value = Xapian::Unicode::tolower(args[0]);
1828                 break;
1829             case CMD_lt:
1830                 if (string_to_int(args[0]) < string_to_int(args[1]))
1831                     value = "true";
1832                 break;
1833             case CMD_map:
1834                 if (!args[0].empty()) {
1835                     string l = args[0], pat = args[1];
1836                     vector<string> new_args(param);
1837                     string::size_type i = 0, j;
1838                     while (true) {
1839                         j = l.find('\t', i);
1840                         new_args[0] = l.substr(i, j - i);
1841                         value += eval(pat, new_args);
1842                         if (j == string::npos) break;
1843                         value += '\t';
1844                         i = j + 1;
1845                     }
1846                 }
1847                 break;
1848             case CMD_match:
1849                 omegascript_match(value, args);
1850                 break;
1851             case CMD_max: {
1852                 vector<string>::const_iterator i = args.begin();
1853                 int val = string_to_int(*i++);
1854                 for (; i != args.end(); ++i) {
1855                     int x = string_to_int(*i);
1856                     if (x > val) val = x;
1857                 }
1858                 value = str(val);
1859                 break;
1860             }
1861             case CMD_min: {
1862                 vector<string>::const_iterator i = args.begin();
1863                 int val = string_to_int(*i++);
1864                 for (; i != args.end(); ++i) {
1865                     int x = string_to_int(*i);
1866                     if (x < val) val = x;
1867                 }
1868                 value = str(val);
1869                 break;
1870             }
1871             case CMD_msize:
1872                 // Estimated number of matches.
1873                 value = str(mset.get_matches_estimated());
1874                 break;
1875             case CMD_msizeexact:
1876                 // Is msize exact?
1877                 if (mset.get_matches_lower_bound()
1878                     == mset.get_matches_upper_bound())
1879                     value = "true";
1880                 break;
1881             case CMD_msizelower:
1882                 // Lower bound on number of matches.
1883                 value = str(mset.get_matches_lower_bound());
1884                 break;
1885             case CMD_msizeupper:
1886                 // Upper bound on number of matches.
1887                 value = str(mset.get_matches_upper_bound());
1888                 break;
1889             case CMD_mod: {
1890                 int denom = string_to_int(args[1]);
1891                 if (denom == 0) {
1892                     value = "divide by 0";
1893                 } else {
1894                     value = str(string_to_int(args[0]) %
1895                                 string_to_int(args[1]));
1896                 }
1897                 break;
1898             }
1899             case CMD_mul: {
1900                 vector<string>::const_iterator i = args.begin();
1901                 int total = string_to_int(*i++);
1902                 while (i != args.end())
1903                     total *= string_to_int(*i++);
1904                 value = str(total);
1905                 break;
1906             }
1907             case CMD_muldiv: {
1908                 int denom = string_to_int(args[2]);
1909                 if (denom == 0) {
1910                     value = "divide by 0";
1911                 } else {
1912                     int num = string_to_int(args[0]) * string_to_int(args[1]);
1913                     value = str(num / denom);
1914                 }
1915                 break;
1916             }
1917             case CMD_ne:
1918                 if (args[0] != args[1]) value = "true";
1919                 break;
1920             case CMD_nice: {
1921                 string::const_iterator i = args[0].begin();
1922                 int len = args[0].length();
1923                 while (len) {
1924                     value += *i++;
1925                     if (--len && len % 3 == 0) value += option["thousand"];
1926                 }
1927                 break;
1928             }
1929             case CMD_not:
1930                 if (args[0].empty()) value = "true";
1931                 break;
1932             case CMD_now: {
1933                 char buf[64];
1934                 my_snprintf(buf, sizeof(buf), "%lu",
1935                             static_cast<unsigned long>(time(NULL)));
1936                 // MSVC's snprintf omits the zero byte if the string is
1937                 // sizeof(buf) long.
1938                 buf[sizeof(buf) - 1] = '\0';
1939                 value = buf;
1940                 break;
1941             }
1942             case CMD_opt:
1943                 if (args.size() == 2) {
1944                     value = option[args[0] + "," + args[1]];
1945                 } else {
1946                     value = option[args[0]];
1947                 }
1948                 break;
1949             case CMD_or: {
1950                 for (auto&& arg : args) {
1951                     value = eval(arg, param);
1952                     if (!value.empty()) break;
1953                 }
1954                 break;
1955             }
1956             case CMD_ord: {
1957                 if (!args[0].empty()) {
1958                     Utf8Iterator it(args[0]);
1959                     value = str(*it);
1960                 }
1961                 break;
1962             }
1963             case CMD_pack:
1964                 value = int_to_binary_string(string_to_int(args[0]));
1965                 break;
1966             case CMD_percentage:
1967                 // percentage score
1968                 value = str(percent);
1969                 break;
1970             case CMD_prettyterm:
1971                 value = pretty_term(args[0]);
1972                 break;
1973             case CMD_prettyurl:
1974                 value = args[0];
1975                 url_prettify(value);
1976                 break;
1977             case CMD_query: {
1978                 pair<multimap<string, string>::const_iterator,
1979                      multimap<string, string>::const_iterator> r;
1980                 r = probabilistic_query.equal_range(args.empty() ?
1981                                                     string() : args[0]);
1982                 multimap<string, string>::const_iterator j;
1983                 for (j = r.first; j != r.second; ++j) {
1984                     if (!value.empty()) value += '\t';
1985                     const string & s = j->second;
1986                     size_t start = 0, tab;
1987                     while ((tab = s.find('\t', start)) != string::npos) {
1988                         value.append(s, start, tab - start);
1989                         value += ' ';
1990                         start = tab + 1;
1991                     }
1992                     value.append(s, start, string::npos);
1993                 }
1994                 break;
1995             }
1996             case CMD_querydescription:
1997                 value = query.get_description();
1998                 break;
1999             case CMD_queryterms:
2000                 value = queryterms;
2001                 break;
2002             case CMD_range: {
2003                 int start = string_to_int(args[0]);
2004                 int end = string_to_int(args[1]);
2005                 while (start <= end) {
2006                     value += str(start);
2007                     if (start < end) value += '\t';
2008                     start++;
2009                 }
2010                 break;
2011             }
2012             case CMD_record: {
2013                 int id = q0;
2014                 if (!args.empty()) id = string_to_int(args[0]);
2015                 value = db.get_document(id).get_data();
2016                 break;
2017             }
2018             case CMD_relevant: {
2019                 // document id if relevant; empty otherwise
2020                 int id = q0;
2021                 if (!args.empty()) id = string_to_int(args[0]);
2022                 map<Xapian::docid, bool>::iterator i = ticked.find(id);
2023                 if (i != ticked.end()) {
2024                     i->second = false; // icky side-effect
2025                     value = str(id);
2026                 }
2027                 break;
2028             }
2029             case CMD_relevants: {
2030                 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
2031                      i != ticked.end(); ++i) {
2032                     if (i->second) {
2033                         value += str(i->first);
2034                         value += '\t';
2035                     }
2036                 }
2037                 if (!value.empty()) value.erase(value.size() - 1);
2038                 break;
2039             }
2040             case CMD_score:
2041                 // Score (0 to 10)
2042                 value = str(percent / 10);
2043                 break;
2044             case CMD_set:
2045                 option[args[0]] = args[1];
2046                 break;
2047             case CMD_seterror:
2048                 error_msg = args[0];
2049                 break;
2050             case CMD_setmap: {
2051                 string base = args[0] + ',';
2052                 if (args.size() % 2 != 1)
2053                     throw string("$setmap requires an odd number of arguments");
2054                 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2055                     option[base + args[i]] = args[i + 1];
2056                 }
2057                 break;
2058             }
2059             case CMD_setrelevant: {
2060                 string::size_type i = 0, j;
2061                 while (true) {
2062                     j = args[0].find_first_not_of("0123456789", i);
2063                     Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2064                     if (id) {
2065                         rset.add_document(id);
2066                         ticked[id] = true;
2067                     }
2068                     if (j == string::npos) break;
2069                     i = j + 1;
2070                 }
2071                 break;
2072             }
2073             case CMD_slice: {
2074                 string list = args[0], pos = args[1];
2075                 vector<string> items;
2076                 string::size_type i = 0, j;
2077                 while (true) {
2078                     j = list.find('\t', i);
2079                     items.push_back(list.substr(i, j - i));
2080                     if (j == string::npos) break;
2081                     i = j + 1;
2082                 }
2083                 i = 0;
2084                 bool have_added = false;
2085                 while (true) {
2086                     j = pos.find('\t', i);
2087                     int item = string_to_int(pos.substr(i, j - i));
2088                     if (item >= 0 && size_t(item) < items.size()) {
2089                         if (have_added) value += '\t';
2090                         value += items[item];
2091                         have_added = true;
2092                     }
2093                     if (j == string::npos) break;
2094                     i = j + 1;
2095                 }
2096                 break;
2097             }
2098             case CMD_snippet: {
2099                 size_t length = 200;
2100                 if (args.size() > 1) {
2101                     length = string_to_int(args[1]);
2102                 }
2103                 if (!stemmer)
2104                     stemmer = new Xapian::Stem(option["stemmer"]);
2105                 // FIXME: Allow start and end highlight and omit to be specified.
2106                 value = mset.snippet(args[0], length, *stemmer,
2107                                      mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2108                                      "<strong>", "</strong>", "...");
2109                 break;
2110             }
2111             case CMD_split: {
2112                 string split;
2113                 if (args.size() == 1) {
2114                     split = " ";
2115                     value = args[0];
2116                 } else {
2117                     split = args[0];
2118                     value = args[1];
2119                 }
2120                 string::size_type i = 0;
2121                 while (true) {
2122                     if (split.empty()) {
2123                         ++i;
2124                         if (i >= value.size()) break;
2125                     } else {
2126                         i = value.find(split, i);
2127                         if (i == string::npos) break;
2128                     }
2129                     value.replace(i, split.size(), 1, '\t');
2130                     ++i;
2131                 }
2132                 break;
2133             }
2134             case CMD_stoplist: {
2135                 Xapian::TermIterator i = qp.stoplist_begin();
2136                 Xapian::TermIterator end = qp.stoplist_end();
2137                 while (i != end) {
2138                     if (!value.empty()) value += '\t';
2139                     value += *i;
2140                     ++i;
2141                 }
2142                 break;
2143             }
2144             case CMD_sub:
2145                 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2146                 break;
2147             case CMD_substr: {
2148                 int start = string_to_int(args[1]);
2149                 if (start < 0) {
2150                     if (static_cast<size_t>(-start) >= args[0].size()) {
2151                         start = 0;
2152                     } else {
2153                         start = static_cast<int>(args[0].size()) + start;
2154                     }
2155                 } else {
2156                     if (static_cast<size_t>(start) >= args[0].size()) break;
2157                 }
2158                 size_t len = string::npos;
2159                 if (args.size() > 2) {
2160                     int int_len = string_to_int(args[2]);
2161                     if (int_len >= 0) {
2162                         len = size_t(int_len);
2163                     } else {
2164                         len = args[0].size() - start;
2165                         if (static_cast<size_t>(-int_len) >= len) {
2166                             len = 0;
2167                         } else {
2168                             len -= static_cast<size_t>(-int_len);
2169                         }
2170                     }
2171                 }
2172                 value.assign(args[0], start, len);
2173                 break;
2174             }
2175             case CMD_suggestion:
2176                 value = qp.get_corrected_query_string();
2177                 break;
2178             case CMD_termprefix:
2179                 (void)prefix_from_term(&value, args[0]);
2180                 break;
2181             case CMD_terms: {
2182                 // list of matching terms
2183                 if (!enquire) break;
2184                 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2185                 if (args.empty()) {
2186                     while (term != enquire->get_matching_terms_end(q0)) {
2187                         // check term was in the typed query so we ignore
2188                         // boolean filter terms
2189                         const string & t = *term;
2190                         if (termset.find(t) != termset.end()) {
2191                             value += t;
2192                             value += '\t';
2193                         }
2194                         ++term;
2195                     }
2196                 } else {
2197                     // Return matching terms with specified prefix.  We can't
2198                     // use skip_to() as the terms aren't ordered by termname.
2199                     const string & pfx = args[0];
2200                     while (term != enquire->get_matching_terms_end(q0)) {
2201                         const string & t = *term;
2202                         if (startswith(t, pfx)) {
2203                             value += t;
2204                             value += '\t';
2205                         }
2206                         ++term;
2207                     }
2208                 }
2209
2210                 if (!value.empty()) value.erase(value.size() - 1);
2211                 break;
2212             }
2213             case CMD_thispage:
2214                 value = str(topdoc / hits_per_page + 1);
2215                 break;
2216             case CMD_time:
2217                 if (secs >= 0) {
2218                     char buf[64];
2219                     my_snprintf(buf, sizeof(buf), "%.6f", secs);
2220                     // MSVC's snprintf omits the zero byte if the string is
2221                     // sizeof(buf) long.
2222                     buf[sizeof(buf) - 1] = '\0';
2223                     value = buf;
2224                 }
2225                 break;
2226             case CMD_topdoc:
2227                 // first document on current page of hit list (counting from 0)
2228                 value = str(topdoc);
2229                 break;
2230             case CMD_topterms:
2231                 if (enquire) {
2232                     int howmany = 16;
2233                     if (!args.empty()) howmany = string_to_int(args[0]);
2234                     if (howmany < 0) howmany = 0;
2235
2236                     // List of expand terms
2237                     Xapian::ESet eset;
2238                     OmegaExpandDecider decider(db, &termset);
2239
2240                     if (!rset.empty()) {
2241                         set_expansion_scheme(*enquire, option);
2242                         eset = enquire->get_eset(howmany * 2, rset, &decider);
2243                     } else if (mset.size()) {
2244                         // invent an rset
2245                         Xapian::RSet tmp;
2246
2247                         int c = 5;
2248                         // FIXME: what if mset does not start at first match?
2249                         for (Xapian::docid did : mset) {
2250                             tmp.add_document(did);
2251                             if (--c == 0) break;
2252                         }
2253
2254                         set_expansion_scheme(*enquire, option);
2255                         eset = enquire->get_eset(howmany * 2, tmp, &decider);
2256                     }
2257
2258                     // Don't show more than one word with the same stem.
2259                     set<string> stems;
2260                     Xapian::ESetIterator i;
2261                     for (i = eset.begin(); i != eset.end(); ++i) {
2262                         string term(*i);
2263                         string stem = (*stemmer)(term);
2264                         if (stems.find(stem) != stems.end()) continue;
2265                         stems.insert(stem);
2266                         value += term;
2267                         value += '\t';
2268                         if (--howmany == 0) break;
2269                     }
2270                     if (!value.empty()) value.erase(value.size() - 1);
2271                 }
2272                 break;
2273             case CMD_transform:
2274                 omegascript_transform(value, args);
2275                 break;
2276             case CMD_truncate:
2277                 value = generate_sample(args[0],
2278                                         string_to_int(args[1]),
2279                                         args.size() > 2 ? args[2] : string(),
2280                                         args.size() > 3 ? args[3] : string());
2281                 break;
2282             case CMD_uniq: {
2283                 const string &list = args[0];
2284                 if (list.empty()) break;
2285                 string::size_type split = 0, split2;
2286                 string prev;
2287                 do {
2288                     split2 = list.find('\t', split);
2289                     string item(list, split, split2 - split);
2290                     if (split == 0) {
2291                         value = item;
2292                     } else if (item != prev) {
2293                         value += '\t';
2294                         value += item;
2295                     }
2296                     prev = item;
2297                     split = split2 + 1;
2298                 } while (split2 != string::npos);
2299                 break;
2300             }
2301             case CMD_unpack:
2302                 value = str(binary_string_to_int(args[0]));
2303                 break;
2304             case CMD_unprefix: {
2305                 size_t prefix_len = prefix_from_term(NULL, args[0]);
2306                 value.assign(args[0], prefix_len, string::npos);
2307                 break;
2308             }
2309             case CMD_unstem: {
2310                 const string &term = args[0];
2311                 Xapian::TermIterator i = qp.unstem_begin(term);
2312                 Xapian::TermIterator end = qp.unstem_end(term);
2313                 while (i != end) {
2314                     if (!value.empty()) value += '\t';
2315                     value += *i;
2316                     ++i;
2317                 }
2318                 break;
2319             }
2320             case CMD_upper:
2321                 value = Xapian::Unicode::toupper(args[0]);
2322                 break;
2323             case CMD_url:
2324                 url_encode(value, args[0]);
2325                 break;
2326             case CMD_value: {
2327                 Xapian::docid id = q0;
2328                 Xapian::valueno value_no = string_to_int(args[0]);
2329                 if (args.size() > 1) id = string_to_int(args[1]);
2330                 value = db.get_document(id).get_value(value_no);
2331                 break;
2332             }
2333             case CMD_version:
2334                 value = PACKAGE_STRING;
2335                 break;
2336             case CMD_weight:
2337                 value = double_to_string(weight);
2338                 break;
2339             default: {
2340                 args.insert(args.begin(), param[0]);
2341                 int macro_no = func->second->tag - CMD_MACRO;
2342                 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2343                 // throw "Unknown function '" + var + "'";
2344                 value = eval(macros[macro_no], args);
2345                 break;
2346             }
2347         }
2348         res += value;
2349     } catch (const Xapian::Error & e) {
2350         // FIXME: this means we only see the most recent error in $error
2351         // - is that the best approach?
2352         error_msg = e.get_msg();
2353     }
2354
2355     res.append(fmt, p, string::npos);
2356     return res;
2357 }
2358
2359 static string
2360 eval_file(const string &fmtfile)
2361 {
2362     string err;
2363     if (vet_filename(fmtfile)) {
2364         string file = template_dir + fmtfile;
2365         string fmt;
2366         if (load_file(file, fmt)) {
2367             vector<string> noargs;
2368             noargs.resize(1);
2369             return eval(fmt, noargs);
2370         }
2371         err = strerror(errno);
2372     } else {
2373         err = "name contains '..'";
2374     }
2375
2376     // FIXME: report why!
2377     string msg = string("Couldn't read format template '") + fmtfile + '\'';
2378     if (!err.empty()) msg += " (" + err + ')';
2379     throw msg;
2380 }
2381
2382 extern string
2383 pretty_term(string term)
2384 {
2385     // Just leave empty strings and single characters alone.
2386     if (term.length() <= 1) return term;
2387
2388     // Assume unprefixed terms are unstemmed.
2389     if (!C_isupper(term[0])) return term;
2390
2391     // Handle stemmed terms.
2392     bool stemmed = (term[0] == 'Z');
2393     if (stemmed) {
2394         // First of all, check if a term in the query stemmed to this one.
2395         Xapian::TermIterator u = qp.unstem_begin(term);
2396         // There might be multiple words with the same stem, but we only want
2397         // one so just take the first.
2398         if (u != qp.unstem_end(term)) return *u;
2399
2400         // Remove the 'Z'.
2401         term.erase(0, 1);
2402     }
2403
2404     bool add_quotes = false;
2405
2406     // Check if the term has a prefix.
2407     if (C_isupper(term[0])) {
2408         // See if we have this prefix in the termprefix_to_userprefix map.  If
2409         // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2410         string prefix;
2411         size_t prefix_len = prefix_from_term(&prefix, term);
2412
2413         map<string, string>::const_iterator i;
2414         i = termprefix_to_userprefix.find(prefix);
2415         if (i != termprefix_to_userprefix.end()) {
2416             string user_prefix = i->second;
2417             user_prefix += ':';
2418             term.replace(0, prefix_len, user_prefix);
2419         } else {
2420             // We don't have a prefix mapping for this, so just set a flag to
2421             // add quotes around the term.
2422             add_quotes = true;
2423         }
2424     }
2425
2426     if (stemmed) term += '.';
2427
2428     if (add_quotes) {
2429         term.insert(0, "\"");
2430         term.append("\"");
2431     }
2432
2433     return term;
2434 }
2435
2436 static string
2437 print_caption(const string &fmt, const vector<string> &param)
2438 {
2439     q0 = *(mset[hit_no]);
2440
2441     weight = mset[hit_no].get_weight();
2442     percent = mset.convert_to_percent(mset[hit_no]);
2443     collapsed = mset[hit_no].get_collapse_count();
2444
2445     return eval(fmt, param);
2446 }
2447
2448 void
2449 parse_omegascript()
2450 {
2451     try {
2452         const char * p = getenv("SERVER_PROTOCOL");
2453         if (p && strcmp(p, "INCLUDED") == 0) {
2454             // We're being included in another page, so suppress headers.
2455             suppress_http_headers = true;
2456         }
2457
2458         string output = eval_file(fmtname);
2459         if (!set_content_type && !suppress_http_headers) {
2460             cout << "Content-Type: text/html" << endl;
2461             set_content_type = true;
2462         }
2463         if (!suppress_http_headers) cout << endl;
2464         cout << output;
2465     } catch (...) {
2466         // Ensure the headers have been output so that any exception gets
2467         // reported rather than giving a server error.
2468         if (!set_content_type && !suppress_http_headers) {
2469             cout << "Content-Type: text/html" << endl;
2470             set_content_type = true;
2471         }
2472         if (!suppress_http_headers) cout << endl;
2473         throw;
2474     }
2475 }
2476
2477 static void
2478 ensure_query_parsed()
2479 {
2480     if (query_parsed) return;
2481     query_parsed = true;
2482
2483     MCI val;
2484     pair<MCI, MCI> g;
2485
2486     // Should we discard the existing R-set recorded in R CGI parameters?
2487     bool discard_rset = false;
2488
2489     // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2490     // CGI parameters)?
2491     bool force_first_page = false;
2492
2493     string v;
2494     // get list of terms from previous iteration of query
2495     val = cgi_params.find("xP");
2496     if (val != cgi_params.end()) {
2497         v = val->second;
2498         // If xP given, default to discarding any RSet and forcing the first
2499         // page of results.  If the query is the same, or an extension of
2500         // the previous query, we adjust these again below.
2501         discard_rset = true;
2502         force_first_page = true;
2503     }
2504     querytype result = set_probabilistic(v);
2505     switch (result) {
2506         case BAD_QUERY:
2507             break;
2508         case NEW_QUERY:
2509             break;
2510         case SAME_QUERY:
2511         case EXTENDED_QUERY:
2512             // If we've changed database, force the first page of hits
2513             // and discard the R-set (since the docids will have changed)
2514             val = cgi_params.find("xDB");
2515             if (val != cgi_params.end() && val->second != dbname) break;
2516             if (result == SAME_QUERY && force_first_page) {
2517                 val = cgi_params.find("xFILTERS");
2518                 if (val != cgi_params.end() && val->second != filters &&
2519                     val->second != old_filters) {
2520                     // Filters have changed since last query.
2521                 } else {
2522                     force_first_page = false;
2523                 }
2524             }
2525             discard_rset = false;
2526             break;
2527     }
2528
2529     if (!force_first_page) {
2530         // Work out which mset element is the first hit we want
2531         // to display
2532         val = cgi_params.find("TOPDOC");
2533         if (val != cgi_params.end()) {
2534             topdoc = atol(val->second.c_str());
2535         }
2536
2537         // Handle next, previous, and page links
2538         if (cgi_params.find(">") != cgi_params.end()) {
2539             topdoc += hits_per_page;
2540         } else if (cgi_params.find("<") != cgi_params.end()) {
2541             if (topdoc >= hits_per_page)
2542                 topdoc -= hits_per_page;
2543             else
2544                 topdoc = 0;
2545         } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2546                    (val = cgi_params.find("#")) != cgi_params.end()) {
2547             long page = atol(val->second.c_str());
2548             // Do something sensible for page 0 (we count pages from 1).
2549             if (page == 0) page = 1;
2550             topdoc = (page - 1) * hits_per_page;
2551         }
2552
2553         // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2554         // Normally we snap TOPDOC like this so that things work nicely if
2555         // HITSPERPAGE is in a <select> or on radio buttons.  If we're
2556         // postprocessing the output of omega and want variable sized pages,
2557         // this is unhelpful.
2558         bool raw_search = false;
2559         val = cgi_params.find("RAWSEARCH");
2560         if (val != cgi_params.end()) {
2561             raw_search = bool(atol(val->second.c_str()));
2562         }
2563
2564         if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2565     }
2566
2567     if (!discard_rset) {
2568         // put documents marked as relevant into the rset
2569         g = cgi_params.equal_range("R");
2570         for (MCI i = g.first; i != g.second; ++i) {
2571             const string & value = i->second;
2572             for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2573                 while (value[j] == '.') ++j;
2574                 Xapian::docid d = atoi(value.c_str() + j);
2575                 if (d) {
2576                     rset.add_document(d);
2577                     ticked[d] = true;
2578                 }
2579             }
2580         }
2581     }
2582 }
2583
2584 // run query if we haven't already
2585 static void
2586 ensure_match()
2587 {
2588     if (done_query) return;
2589
2590     secs = RealTime::now();
2591     run_query();
2592     if (secs != -1)
2593         secs = RealTime::now() - secs;
2594
2595     done_query = true;
2596     last = mset.get_matches_lower_bound();
2597     if (last == 0) {
2598         // Otherwise topdoc ends up being -6 if it's non-zero!
2599         topdoc = 0;
2600     } else {
2601         if (topdoc >= last)
2602             topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2603         // last is the count of documents up to the end of the current page
2604         // (as returned by $last)
2605         if (topdoc + hits_per_page < last)
2606             last = topdoc + hits_per_page;
2607     }
2608 }
2609
2610 // OmegaExpandDecider methods.
2611
2612 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2613                                        set<string> * querytermset)
2614     : db(db_)
2615 {
2616     // We'll want the stemmer for testing matches anyway.
2617     if (!stemmer)
2618         stemmer = new Xapian::Stem(option["stemmer"]);
2619     if (querytermset) {
2620         set<string>::const_iterator i;
2621         for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2622             string term(*i);
2623             if (term.empty()) continue;
2624
2625             unsigned char ch = term[0];
2626             bool stemmed = (ch == 'Z');
2627             if (stemmed) {
2628                 term.erase(0, 1);
2629                 if (term.empty()) continue;
2630                 ch = term[0];
2631             }
2632
2633             if (C_isupper(ch)) {
2634                 size_t prefix_len = prefix_from_term(NULL, term);
2635                 term.erase(0, prefix_len);
2636             }
2637
2638             if (!stemmed) term = (*stemmer)(term);
2639
2640             exclude_stems.insert(term);
2641         }
2642     }
2643 }
2644
2645 bool
2646 OmegaExpandDecider::operator()(const string & term) const
2647 {
2648     unsigned char ch = term[0];
2649
2650     // Reject terms with a prefix.
2651     if (C_isupper(ch)) return false;
2652
2653     {
2654         MyStopper stopper;
2655         // Don't suggest stopwords.
2656         if (stopper(term)) return false;
2657     }
2658
2659     // Reject small numbers.
2660     if (term.size() < 4 && C_isdigit(ch)) return false;
2661
2662     // Reject terms containing a space.
2663     if (term.find(' ') != string::npos) return false;
2664
2665     // Skip terms with stems in the exclude_stems set, to avoid suggesting
2666     // terms which are already in the query in some form.
2667     string stem = (*stemmer)(term);
2668     if (exclude_stems.find(stem) != exclude_stems.end())
2669         return false;
2670
2671     // Ignore terms that only occur once (hapaxes) since they aren't
2672     // useful for finding related documents - they only occur in a
2673     // document that's already been marked as relevant.
2674     // FIXME: add an expand option to ignore terms where
2675     // termfreq == rtermfreq.
2676     if (db.get_termfreq(term) <= 1) return false;
2677
2678     return true;
2679 }