xapian-applications/omega/query.cc

   1 /* query.cc: query executor for omega
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 James Aylett
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002 Intercede 1749 Ltd
   7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018 Olly Betts
   8  * Copyright 2008 Thomas Viehmann
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU General Public License as
  12  * published by the Free Software Foundation; either version 2 of the
  13  * License, or (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  23  * USA
  24  */
  25
  26 #include <config.h>
  27
  28 #include <algorithm>
  29 #include <iostream>
  30 #include <map>
  31 #include <set>
  32 #include <unordered_map>
  33 #include <unordered_set>
  34 #include <vector>
  35
  36 #include <cassert>
  37 #include <cctype>
  38 #include "safeerrno.h"
  39 #include <stdio.h>
  40 #include <cstdlib>
  41 #include <cstring>
  42 #include "strcasecmp.h"
  43 #include <ctime>
  44
  45 #include "safeunistd.h"
  46 #include <sys/types.h>
  47 #include "safesysstat.h"
  48 #include "safefcntl.h"
  49
  50 #include "realtime.h"
  51
  52 #include <cdb.h>
  53
  54 #include "csvescape.h"
  55 #include "date.h"
  56 #include "datevalue.h"
  57 #include "jsonescape.h"
  58 #include "utils.h"
  59 #include "omega.h"
  60 #include "query.h"
  61 #include "cgiparam.h"
  62 #include "loadfile.h"
  63 #include "sample.h"
  64 #include "sort.h"
  65 #include "str.h"
  66 #include "stringutils.h"
  67 #include "transform.h"
  68 #include "urldecode.h"
  69 #include "urlencode.h"
  70 #include "unixperm.h"
  71 #include "values.h"
  72 #include "weight.h"
  73 #include "expand.h"
  74 #include "md5wrap.h"
  75
  76 #include <xapian.h>
  77
  78 using namespace std;
  79
  80 using Xapian::Utf8Iterator;
  81
  82 using Xapian::Unicode::is_wordchar;
  83
  84 #ifndef SNPRINTF
  85 #include <cstdarg>
  86
  87 static int my_snprintf(char *str, size_t size, const char *format, ...)
  88 {
  89     int res;
  90     va_list ap;
  91     va_start(ap, format);
  92     str[size - 1] = '\0';
  93     res = vsprintf(str, format, ap);
  94     if (str[size - 1] || res < 0 || size_t(res) >= size)
  95         abort(); /* Overflowed! */
  96     va_end(ap);
  97     return res;
  98 }
  99 #else
 100 #define my_snprintf SNPRINTF
 101 #endif
 102
// Flags recording progress through the query pipeline.
// NOTE(review): presumably maintained by ensure_query_parsed() and
// ensure_match(), which are defined outside this section - confirm.
static bool query_parsed = false;
static bool done_query = false;
static Xapian::docid last = 0;

// Match results; assigned by run_query().
static Xapian::MSet mset;

static map<Xapian::docid, bool> ticked;

static void ensure_query_parsed();
static void ensure_match();

// The query being built: parse_queries() sets it from the query strings and
// run_query() then combines it with any filters.
static Xapian::Query query;
//static string url_query_string;
Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode

// Maintain an explicit date_filter_set flag - date_filter.empty() will also
// be true if a date filter is specified which simplifies to
// Query::MatchNothing at construction time.
static bool date_filter_set = false;
static Xapian::Query date_filter;

static Xapian::QueryParser qp;
// Created on first use by parse_queries().
static Xapian::NumberRangeProcessor * size_rp = NULL;
// Created on first use by html_highlight().
static Xapian::Stem *stemmer = NULL;

static string eval_file(const string &fmtfile);

// Distinct terms seen in the parsed query (filled in by parse_queries()).
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// Tab-separated list of the terms in termset, in the order they were first
// seen by parse_queries().
static string queryterms;

// Message from the query parser if parsing failed (see parse_queries());
// run_query() doesn't run the match while this is non-empty.
static string error_msg;

static double secs = -1;

// Default format for a log line.
// NOTE(review): looks like an OmegaScript template producing tab-separated
// fields - confirm against the code which uses it.
static const char DEFAULT_LOG_ENTRY[] =
        "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
        "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
        "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
        "$dbname\t"
        "$query\t"
        "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
 148
 149 class MyStopper : public Xapian::Stopper {
 150   public:
 151     bool operator()(const string &t) const {
 152         switch (t[0]) {
 153             case 'a':
 154                 return (t == "a" || t == "about" || t == "an" || t == "and" ||
 155                         t == "are" || t == "as" || t == "at");
 156             case 'b':
 157                 return (t == "be" || t == "by");
 158             case 'e':
 159                 return (t == "en");
 160             case 'f':
 161                 return (t == "for" || t == "from");
 162             case 'h':
 163                 return (t == "how");
 164             case 'i':
 165                 return (t == "i" || t == "in" || t == "is" || t == "it");
 166             case 'o':
 167                 return (t == "of" || t == "on" || t == "or");
 168             case 't':
 169                 return (t == "that" || t == "the" || t == "this" || t == "to");
 170             case 'w':
 171                 return (t == "was" || t == "what" || t == "when" ||
 172                         t == "where" || t == "which" || t == "who" ||
 173                         t == "why" || t == "will" || t == "with");
 174             case 'y':
 175                 return (t == "you" || t == "your");
 176             default:
 177                 return false;
 178         }
 179     }
 180 };
 181
 182 static size_t
 183 prefix_from_term(string* prefix, const string& term)
 184 {
 185     if (!term.empty()) {
 186         if (term[0] == 'X') {
 187             const string::const_iterator begin = term.begin();
 188             string::const_iterator i = begin + 1;
 189             while (i != term.end() && C_isupper(*i))
 190                 ++i;
 191             if (prefix)
 192                 prefix->assign(begin, i);
 193             if (i != term.end() && *i == ':')
 194                 ++i;
 195             return i - begin;
 196         }
 197
 198         if (C_isupper(term[0])) {
 199             if (prefix)
 200                 *prefix = term[0];
 201             return 1;
 202         }
 203     }
 204
 205     if (prefix)
 206         prefix->resize(0);
 207     return 0;
 208 }
 209
 210 // Don't allow ".." in format names, log file names, etc as this would allow
 211 // people to open a format "../../etc/passwd" or similar.
 212 // FIXME: make this check more exact ("foo..bar" is safe)
 213 // FIXME: log when this check fails
 214 static bool
 215 vet_filename(const string &filename)
 216 {
 217     string::size_type i = filename.find("..");
 218     return (i == string::npos);
 219 }
 220
// Classification of the current query relative to the previous one, as
// returned by parse_queries().
//
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
//
// NEW_QUERY entirely new query
// SAME_QUERY unchanged query
// EXTENDED_QUERY new query, but based on the old one
// BAD_QUERY parse error (message in error_msg)
typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;

// Query strings waiting to be parsed, keyed by the prefix they were added
// with (see add_query_string()); parse_queries() passes that prefix to the
// query parser as the default prefix for the string.
static multimap<string, string> query_strings;
 234
 235 void
 236 add_query_string(const string& prefix, const string& s)
 237 {
 238     string query_string = s;
 239     // Strip leading and trailing whitespace from query_string.
 240     trim(query_string);
 241     if (!query_string.empty())
 242         query_strings.insert(make_pair(prefix, query_string));
 243 }
 244
 245 static unsigned
 246 read_qp_flags(const string & opt_pfx, unsigned f)
 247 {
 248     map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
 249     for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
 250         unsigned mask = 0;
 251         const char * s = i->first.c_str() + opt_pfx.size();
 252         switch (s[0]) {
 253             case 'a':
 254                 if (strcmp(s, "auto_multiword_synonyms") == 0) {
 255                     mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 256                     break;
 257                 }
 258                 if (strcmp(s, "auto_synonyms") == 0) {
 259                     mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
 260                     break;
 261                 }
 262                 break;
 263             case 'b':
 264                 if (strcmp(s, "boolean") == 0) {
 265                     mask = Xapian::QueryParser::FLAG_BOOLEAN;
 266                     break;
 267                 }
 268                 if (strcmp(s, "boolean_any_case") == 0) {
 269                     mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
 270                     break;
 271                 }
 272                 break;
 273             case 'c':
 274                 if (strcmp(s, "cjk_ngram") == 0) {
 275                     mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
 276                     break;
 277                 }
 278                 break;
 279             case 'd':
 280                 if (strcmp(s, "default") == 0) {
 281                     mask = Xapian::QueryParser::FLAG_DEFAULT;
 282                     break;
 283                 }
 284                 break;
 285             case 'l':
 286                 if (strcmp(s, "lovehate") == 0) {
 287                     mask = Xapian::QueryParser::FLAG_LOVEHATE;
 288                     break;
 289                 }
 290                 break;
 291             case 'p':
 292                 if (strcmp(s, "partial") == 0) {
 293                     mask = Xapian::QueryParser::FLAG_PARTIAL;
 294                     break;
 295                 }
 296                 if (strcmp(s, "phrase") == 0) {
 297                     mask = Xapian::QueryParser::FLAG_PHRASE;
 298                     break;
 299                 }
 300                 if (strcmp(s, "pure_not") == 0) {
 301                     mask = Xapian::QueryParser::FLAG_PURE_NOT;
 302                     break;
 303                 }
 304                 break;
 305             case 's':
 306                 if (strcmp(s, "spelling_correction") == 0) {
 307                     mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
 308                     break;
 309                 }
 310                 if (strcmp(s, "synonym") == 0) {
 311                     mask = Xapian::QueryParser::FLAG_SYNONYM;
 312                     break;
 313                 }
 314                 break;
 315             case 'w':
 316                 if (strcmp(s, "wildcard") == 0) {
 317                     mask = Xapian::QueryParser::FLAG_WILDCARD;
 318                     break;
 319                 }
 320                 break;
 321         }
 322
 323         if (i->second.empty()) {
 324             f &= ~mask;
 325         } else {
 326             f |= mask;
 327         }
 328     }
 329     return f;
 330 }
 331
/** Configure the query parser from the options and parse the query strings.
 *
 *  Each entry in query_strings is parsed (with its own stemmer and flags if
 *  options specific to its prefix are set) and the non-empty results are
 *  combined with OP_AND to give the global `query`.  The terms of the
 *  resulting query are recorded in termset and queryterms.
 *
 *  @param oldp  Tab-separated list of the terms from the previous query
 *               (empty if there wasn't one).
 *
 *  @return BAD_QUERY if parsing failed (error_msg is set), otherwise how
 *          the new query relates to the one described by @a oldp.
 */
static querytype
parse_queries(const string& oldp)
{
    // Parse the query string.
    // "stem_strategy" takes precedence; "stem_all" is only consulted if it
    // isn't set.  Unrecognised values leave the parser's strategy unchanged.
    auto opt_it = option.find("stem_strategy");
    if (opt_it != option.end()) {
        if (opt_it->second == "all") {
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
        } else if (opt_it->second == "all_z") {
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL_Z);
        } else if (opt_it->second == "none") {
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
        } else if (opt_it->second == "some") {
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
        } else if (opt_it->second == "some_full_pos") {
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME_FULL_POS);
        }
    } else {
        opt_it = option.find("stem_all");
        if (opt_it != option.end() && opt_it->second == "true") {
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
        }
    }
    qp.set_stopper(new MyStopper());
    qp.set_default_op(default_op);
    qp.set_database(db);
    // FIXME: provide a custom RP which handles size:10..20K, etc.
    if (!size_rp)
        size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
    qp.add_rangeprocessor(size_rp);
    // Options named "prefix,<user prefix>" map a user prefix to a
    // tab-separated list of term prefixes.
    map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
    for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
        string user_prefix(pfx->first, 7);
        const string & term_pfx_list = pfx->second;
        string::size_type i = 0;
        do {
            string::size_type i0 = i;
            i = term_pfx_list.find('\t', i);
            // For the last entry i is npos, and substr() just takes the rest
            // of the string.
            const string & term_pfx = term_pfx_list.substr(i0, i - i0);
            qp.add_prefix(user_prefix, term_pfx);
            // std::map::insert() won't overwrite an existing entry, so we'll
            // prefer the first user_prefix for which a particular term prefix
            // is specified.
            termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
            // ++i steps past the tab, or wraps npos round to 0 which ends
            // the loop.
        } while (++i);
    }
    // Options named "boolprefix,<user prefix>" map a user prefix to a single
    // term prefix for boolean filters.
    pfx = option.lower_bound("boolprefix,");
    for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
        string user_prefix(pfx->first, 11, string::npos);
        // The prefix is exclusive unless "nonexclusiveprefix,<term prefix>"
        // is set to a non-empty value.
        auto it = option.find("nonexclusiveprefix," + pfx->second);
        bool exclusive = (it == option.end() || it->second.empty());
        qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
        termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
    }

    try {
        unsigned default_flags = read_qp_flags("flag_", 0);

        vector<Xapian::Query> queries;
        queries.reserve(query_strings.size());

        for (auto& j : query_strings) {
            const string& prefix = j.first;
            const string& query_string = j.second;

            // Choose the stemmer to use for this input.
            string stemlang = option[prefix + ":stemmer"];
            if (stemlang.empty())
                stemlang = option["stemmer"];
            qp.set_stemmer(Xapian::Stem(stemlang));

            // Work out the flags to use for this input.
            unsigned f = read_qp_flags(prefix + ":flag_", default_flags);

            Xapian::Query q = qp.parse_query(query_string, f, prefix);
            if (!q.empty())
                queries.push_back(q);
        }
        query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
    } catch (Xapian::QueryParserError &e) {
        error_msg = e.get_msg();
        return BAD_QUERY;
    }

    // Record each distinct term, and count every term the iterator yields
    // (n_new_terms is incremented whether or not the term was already in
    // termset).
    Xapian::termcount n_new_terms = 0;
    for (Xapian::TermIterator i = query.get_terms_begin();
         i != query.get_terms_end(); ++i) {
        if (termset.find(*i) == termset.end()) {
            termset.insert(*i);
            if (!queryterms.empty()) queryterms += '\t';
            queryterms += *i;
        }
        n_new_terms++;
    }

    // Check new query against the previous one
    if (oldp.empty()) {
        // If oldp was empty that means there were no parsed query terms
        // before, so if there are now this is a new query.
        return n_new_terms ? NEW_QUERY : SAME_QUERY;
    }

    // The terms in oldp are separated by tabs.
    const char oldp_separator = '\t';
    size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;

    // short-cut: if the new query has fewer terms, it must be a new one
    if (n_new_terms < n_old_terms) return NEW_QUERY;

    // If any of the old terms is missing from the new query, it's a new
    // query.
    const char *term = oldp.c_str();
    const char *pend;
    while ((pend = strchr(term, oldp_separator)) != NULL) {
        if (termset.find(string(term, pend - term)) == termset.end())
            return NEW_QUERY;
        term = pend + 1;
    }
    // Check the final term (which has no separator after it).
    if (*term) {
        if (termset.find(string(term)) == termset.end())
            return NEW_QUERY;
    }

    // Use termset.size() rather than n_new_terms so we correctly handle
    // the case when the query has repeated terms.
    // This works wrongly in the case when the user extends the query
    // by adding a term already in it, but that's unlikely and the behaviour
    // isn't too bad (we just don't reset page 1).  We also mishandle a few
    // other obscure cases e.g. adding quotes to turn a query into a phrase.
    if (termset.size() > n_old_terms) return EXTENDED_QUERY;
    return SAME_QUERY;
}
 462
// Boolean filter terms keyed by their term prefix (see add_bterm()), so that
// run_query() can group filters which share a prefix.
static multimap<string, string> filter_map;
// Terms which matching documents must NOT contain (see add_nterm()).
static set<string> neg_filters;
 465
 466 void add_bterm(const string &term) {
 467     string prefix;
 468     if (prefix_from_term(&prefix, term) > 0)
 469         filter_map.insert(multimap<string, string>::value_type(prefix, term));
 470 }
 471
 472 void add_nterm(const string &term) {
 473     if (!term.empty())
 474         neg_filters.insert(term);
 475 }
 476
 477 void
 478 add_date_filter(const string& date_start,
 479                 const string& date_end,
 480                 const string& date_span,
 481                 Xapian::valueno date_value_slot)
 482 {
 483     if (date_start.empty() && date_end.empty() && date_span.empty())
 484         return;
 485
 486     Xapian::Query q;
 487     if (date_value_slot != Xapian::BAD_VALUENO) {
 488         // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
 489         // latter the sort order just works correctly between different
 490         // precisions).
 491         bool as_time_t =
 492             db.get_value_lower_bound(date_value_slot).size() == 4 &&
 493             db.get_value_upper_bound(date_value_slot).size() == 4;
 494         q = date_value_range(as_time_t, date_value_slot,
 495                              date_start, date_end,
 496                              date_span);
 497     } else {
 498         q = date_range_filter(date_start, date_end, date_span);
 499         q |= Xapian::Query("Dlatest");
 500     }
 501
 502     if (date_filter_set) {
 503         date_filter &= q;
 504     } else {
 505         date_filter_set = true;
 506         date_filter = q;
 507     }
 508 }
 509
/** Combine the parsed query with the filters and run the match.
 *
 *  The boolean filters (filter_map), the date filter and the negated filters
 *  (neg_filters) are applied to the global `query` in that order.  Then, if
 *  there's an Enquire object and no error has been recorded, the weighting
 *  scheme, cutoff, sort order, docid order and collapsing are set up and the
 *  results are stored in mset (provided the query isn't empty).
 */
static void
run_query()
{
    // Weighting scheme name; left empty unless an option supplies one.
    string scheme;
    // Set when the query consists only of filters.
    bool force_boolean = false;
    if (!filter_map.empty()) {
        // OR together filters with the same prefix (or AND for non-exclusive
        // prefixes), then AND together the resultant groups.
        vector<Xapian::Query> filter_vec;
        vector<string> same_vec;
        string current;
        // Entries with the same prefix are adjacent in the multimap.  The
        // loop deliberately runs one step past the last entry so that the
        // final group is flushed too.
        for (auto i = filter_map.begin(); ; ++i) {
            bool over = (i == filter_map.end());
            if (over || i->first != current) {
                switch (same_vec.size()) {
                    case 0:
                        break;
                    case 1:
                        filter_vec.push_back(Xapian::Query(same_vec[0]));
                        break;
                    default: {
                        Xapian::Query::op op = Xapian::Query::OP_OR;
                        auto it = option.find("nonexclusiveprefix," + current);
                        if (it != option.end() && !it->second.empty()) {
                            op = Xapian::Query::OP_AND;
                        }
                        filter_vec.push_back(Xapian::Query(op,
                                                           same_vec.begin(),
                                                           same_vec.end()));
                        break;
                    }
                }
                same_vec.clear();
                if (over) break;
                current = i->first;
            }
            same_vec.push_back(i->second);
        }

        Xapian::Query filter(Xapian::Query::OP_AND,
                             filter_vec.begin(), filter_vec.end());

        if (query.empty()) {
            // If no query strings were provided then promote the filters
            // to be THE query - filtering an empty query will give no
            // matches.
            std::swap(query, filter);
            // The "weightingpurefilter" option can name a weighting scheme
            // to use in this case; otherwise the match is forced boolean.
            auto&& it = option.find("weightingpurefilter");
            if (it != option.end() && !it->second.empty()) {
                scheme = it->second;
            } else {
                force_boolean = true;
            }
        } else {
            query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
        }
    }

    if (date_filter_set) {
        // If no query strings were provided then promote the daterange
        // filter to be THE query instead of filtering an empty query.
        if (query.empty()) {
            query = date_filter;
            force_boolean = true;
        } else {
            query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
        }
    }

    if (!neg_filters.empty()) {
        // OR together all negated filters.
        Xapian::Query filter(Xapian::Query::OP_OR,
                             neg_filters.begin(), neg_filters.end());

        if (query.empty() && !date_filter_set) {
            // If we only have a negative filter for the query, use MatchAll as
            // the query to apply the filters to.
            query = Xapian::Query::MatchAll;
            force_boolean = true;
        }
        query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
    }

    // Nothing to run without an Enquire object, or if an error has already
    // been recorded.
    if (!enquire || !error_msg.empty()) return;

    // The "weighting" option is only consulted if the match isn't forced
    // boolean and no scheme has been chosen above.
    if (!force_boolean && scheme.empty()) {
        auto&& it = option.find("weighting");
        if (it != option.end()) scheme = it->second;
    }
    set_weighting_scheme(*enquire, scheme, force_boolean);

    enquire->set_cutoff(threshold);

    // A key maker takes precedence over sorting by a value slot; sort_after
    // selects whether relevance is the primary or the secondary ordering.
    if (sort_keymaker) {
        if (sort_after) {
            enquire->set_sort_by_relevance_then_key(sort_keymaker,
                                                    reverse_sort);
        } else {
            enquire->set_sort_by_key_then_relevance(sort_keymaker,
                                                    reverse_sort);
        }
    } else if (sort_key != Xapian::BAD_VALUENO) {
        if (sort_after) {
            enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
        } else {
            enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
        }
    }

    enquire->set_docid_order(docid_order);

    if (collapse) {
        enquire->set_collapse_key(collapse_key);
    }

    if (!query.empty()) {
#if 0
        // FIXME: If we start doing permissions checks based on $REMOTE_USER
        // we're going to break some existing setups if users upgrade.  We
        // probably want a way to set this from OmegaScript.
        const char * remote_user = getenv("REMOTE_USER");
        if (remote_user)
            apply_unix_permissions(query, remote_user);
#endif

        enquire->set_query(query);
        // We could use the value of topdoc as first parameter, but we
        // need to know the first few items in the mset to fake a
        // relevance set for topterms.
        //
        // If min_hits isn't set, check at least one extra result so we
        // know if we've reached the end of the matches or not - then we
        // can avoid offering a "next" button which leads to an empty page.
        mset = enquire->get_mset(0, topdoc + hits_per_page,
                                 topdoc + max(hits_per_page + 1, min_hits),
                                 &rset);
    }
}
 648
 649 string
 650 html_escape(const string &str)
 651 {
 652     string res;
 653     string::size_type p = 0;
 654     while (p < str.size()) {
 655         char ch = str[p++];
 656         switch (ch) {
 657             case '<':
 658                 res += "&lt;";
 659                 continue;
 660             case '>':
 661                 res += "&gt;";
 662                 continue;
 663             case '&':
 664                 res += "&amp;";
 665                 continue;
 666             case '"':
 667                 res += "&quot;";
 668                 continue;
 669             default:
 670                 res += ch;
 671         }
 672     }
 673     return res;
 674 }
 675
 676 static string
 677 html_strip(const string &str)
 678 {
 679     string res;
 680     string::size_type p = 0;
 681     bool skip = false;
 682     while (p < str.size()) {
 683         char ch = str[p++];
 684         switch (ch) {
 685             case '<':
 686                 skip = true;
 687                 continue;
 688             case '>':
 689                 skip = false;
 690                 continue;
 691             default:
 692                 if (! skip) res += ch;
 693         }
 694     }
 695     return res;
 696 }
 697
 698 class WordList {
 699     static string prev_list;
 700     static unordered_map<string, int> word_to_occurrence;
 701   public:
 702     void build_word_map(const string& list) {
 703         // Don't build map again if passed list of terms is same as before.
 704         if (prev_list == list) return;
 705         word_to_occurrence.clear();
 706         string::size_type split = 0, split2;
 707         int word_index = 0;
 708         string word;
 709         while ((split2 = list.find('\t', split)) != string::npos) {
 710             word = list.substr(split, split2 - split);
 711             if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 712                 ++word_index;
 713             split = split2 + 1;
 714         }
 715         word = list.substr(split, list.size() - split);
 716         if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 717             ++word_index;
 718         prev_list = list;
 719     }
 720
 721     int word_in_list(const string& word) {
 722         auto it = word_to_occurrence.find(word);
 723         if (it == word_to_occurrence.end()) return -1;
 724         return it->second;
 725     }
 726 };
 727
 728 string WordList::prev_list;
 729 unordered_map<string, int> WordList::word_to_occurrence;
 730
 731 // Not a character in an identifier
 732 inline static bool
 733 p_notid(unsigned int c)
 734 {
 735     return !C_isalnum(c) && c != '_';
 736 }
 737
 738 // Not a character in an HTML tag name
 739 inline static bool
 740 p_nottag(unsigned int c)
 741 {
 742     return !C_isalnum(c) && c != '.' && c != '-';
 743 }
 744
// Highlight occurrences of listed terms in a piece of text.
//
// s is split into words; each word is lowercased and looked up in the
// tab-separated term list `list`, first as-is and then as "Z" followed by its
// stem.  Words which match are wrapped in bra and ket - or, if bra is empty,
// in a <b> tag with a colour chosen by the term's position in the list (ket
// is ignored in that case).  All text from s is passed through html_escape().
//
// Returns the resulting HTML.
//
// FIXME: shares algorithm with indextext.cc!
static string
html_highlight(const string &s, const string &list,
               const string &bra, const string &ket)
{
    // The stemmer is created on first use from the "stemmer" option.
    if (!stemmer) {
        stemmer = new Xapian::Stem(option["stemmer"]);
    }

    string res;

    Utf8Iterator j(s);
    const Utf8Iterator s_end;
    while (true) {
        // Skip to the start of the next word.
        Utf8Iterator first = j;
        while (first != s_end && !is_wordchar(*first)) ++first;
        if (first == s_end) break;
        Utf8Iterator term_end;
        string term;
        string word;
        // Start of the text which hasn't been output yet.
        const char *l = j.raw();
        if (*first < 128 && C_isupper(*first)) {
            // Try to read an acronym written as single ASCII capitals
            // separated by dots (e.g. "U.S.A"); the dots aren't included in
            // the term.
            j = first;
            Xapian::Unicode::append_utf8(term, *j);
            while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
            }
            // Give up on the acronym if it's a single letter or runs
            // straight into another word character.
            if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
                term.resize(0);
            }
            term_end = j;
        }
        if (term.empty()) {
            // An ordinary word: a run of word characters, which may contain
            // '&' or '\'' provided a word character follows it.
            j = first;
            while (is_wordchar(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
                ++j;
                if (j == s_end) break;
                if (*j == '&' || *j == '\'') {
                    Utf8Iterator next = j;
                    ++next;
                    if (next == s_end || !is_wordchar(*next)) break;
                    term += *j;
                    j = next;
                }
            }
            term_end = j;
            // Allow a suffix of '+' and '-' characters, or of '#' characters
            // (which add just one '#' to the term however many there are).
            if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
                string::size_type len = term.length();
                if (*j == '#') {
                    term += '#';
                    do { ++j; } while (j != s_end && *j == '#');
                } else {
                    while (j != s_end && (*j == '+' || *j == '-')) {
                        Xapian::Unicode::append_utf8(term, *j);
                        ++j;
                    }
                }
                // Drop the suffix if it added more than 3 characters or is
                // followed directly by a word character.
                if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
                    term.resize(len);
                } else {
                    term_end = j;
                }
            }
        }
        j = term_end;
        term = Xapian::Unicode::tolower(term);
        // WordList only rebuilds its map when the list changes, so calling
        // this for every word is OK.
        WordList w;
        w.build_word_map(list);
        int match = w.word_in_list(term);
        if (match == -1) {
            // No match for the word itself, so try its stem with a "Z"
            // prefix.
            string stem = "Z";
            stem += (*stemmer)(term);
            match = w.word_in_list(stem);
        }
        if (match >= 0) {
            // Output the text before the word, then the word with the
            // highlighting around it.
            res += html_escape(string(l, first.raw() - l));
            if (!bra.empty()) {
                res += bra;
            } else {
                static const char * colours[] = {
                    "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
                    "990000", "009900", "996600", "006699", "990099"
                };
                size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
                const char * bg = colours[idx];
                // Backgrounds with an 'f' in get black text, the others get
                // white text.
                if (strchr(bg, 'f')) {
                    res += "<b style=\"color:black;background-color:#";
                } else {
                    res += "<b style=\"color:white;background-color:#";
                }
                res += bg;
                res += "\">";
            }
            word.assign(first.raw(), j.raw() - first.raw());
            res += html_escape(word);
            if (!bra.empty()) {
                res += ket;
            } else {
                res += "</b>";
            }
        } else {
            // Not a match, so output everything up to the end of the word.
            res += html_escape(string(l, j.raw() - l));
        }
    }
    // Output any text after the last word.
    if (j != s_end) res += html_escape(string(j.raw(), j.left()));
    return res;
}
 853
 854 #if 0
// NB: this function is currently compiled out (it sits inside "#if 0").
//
// Write url_query_string to cout.
//
// If `after` begins with "&B=", the character following it (after[3]) is
// taken as a filter prefix, and every "&B=<prefix>..." parameter found in
// url_query_string is omitted from the output.  Otherwise (including when
// `after` is NULL) url_query_string is written unchanged.
static void
print_query_string(const char *after)
{
    if (after && strncmp(after, "&B=", 3) == 0) {
        char prefix = after[3];
        // `start` is where the next chunk to output begins; `amp` is the
        // current scan position.
        string::size_type start = 0, amp = 0;
        while (true) {
            amp = url_query_string.find('&', amp);
            if (amp == string::npos) {
                // No more parameters: output whatever is left.
                cout << url_query_string.substr(start);
                return;
            }
            // Step past the '&' so amp indexes the parameter name.
            amp++;
            while (url_query_string[amp] == 'B' &&
                   url_query_string[amp + 1] == '=' &&
                   url_query_string[amp + 2] == prefix) {
                // Output everything up to, but not including, the '&' which
                // introduces the parameter being dropped.
                cout << url_query_string.substr(start, amp - start - 1);
                // Resume at the '&' of the following parameter (if any), so
                // that '&' is included in the next chunk output.
                start = url_query_string.find('&', amp + 3);
                if (start == string::npos) return;
                amp = start + 1;
            }
        }
    }
    cout << url_query_string;
}
 880 #endif
 881
 882 class Fields {
 883     mutable Xapian::docid did_cached;
 884     mutable map<string, string> fields;
 885
 886     void read_fields(Xapian::docid did) const;
 887
 888   public:
 889     Fields() : did_cached(0) { }
 890
 891     const string & get_field(Xapian::docid did, const string & field) const {
 892         if (did != did_cached) read_fields(did);
 893         return fields[field];
 894     }
 895 };
 896
 897 void
 898 Fields::read_fields(Xapian::docid did) const
 899 {
 900     fields.clear();
 901     did_cached = did;
 902     const string & data = db.get_document(did).get_data();
 903
 904     // Parse document data.
 905     string::size_type i = 0;
 906     const string & names = option["fieldnames"];
 907     if (!names.empty()) {
 908         // Each line is a field, with fieldnames taken from corresponding
 909         // entries in the tab-separated list specified by $opt{fieldnames}.
 910         string::size_type n = 0;
 911         do {
 912             string::size_type n0 = n;
 913             n = names.find('\t', n);
 914             string::size_type i0 = i;
 915             i = data.find('\n', i);
 916             fields.insert(make_pair(names.substr(n0, n - n0),
 917                                     data.substr(i0, i - i0)));
 918         } while (++n && ++i);
 919     } else {
 920         // Each line is a field, in the format NAME=VALUE.  We assume the field
 921         // name doesn't contain an "=".  Lines without an "=" are currently
 922         // just ignored.
 923         do {
 924             string::size_type i0 = i;
 925             i = data.find('\n', i);
 926             string line(data, i0, i - i0);
 927             string::size_type j = line.find('=');
 928             if (j != string::npos) {
 929                 string & value = fields[line.substr(0, j)];
 930                 if (!value.empty()) value += '\t';
 931                 value.append(line, j + 1, string::npos);
 932             }
 933         } while (++i);
 934     }
 935 }
 936
// Per-document field cache used to implement $field.
static Fields fields;
// Docid used by $allterms and $field when no explicit docid is given.
static Xapian::docid q0;
// 0-based mset index of the current hit (the value of $hit).
static Xapian::doccount hit_no;
// NOTE(review): presumably the percentage score of the current hit
// ($percentage) - set outside this part of the file, confirm there.
static int percent;
// NOTE(review): presumably the weight of the current hit ($weight) - set
// outside this part of the file, confirm there.
static double weight;
// The value reported by $collapsed.
static Xapian::doccount collapsed;
 943
 944 static string print_caption(const string &fmt, const vector<string> &param);
 945
 946 enum tagval {
 947 CMD_,
 948 CMD_add,
 949 CMD_addfilter,
 950 CMD_allterms,
 951 CMD_and,
 952 CMD_cgi,
 953 CMD_cgilist,
 954 CMD_cgiparams,
 955 CMD_chr,
 956 CMD_collapsed,
 957 CMD_cond,
 958 CMD_contains,
 959 CMD_csv,
 960 CMD_date,
 961 CMD_dbname,
 962 CMD_dbsize,
 963 CMD_def,
 964 CMD_defaultop,
 965 CMD_div,
 966 CMD_emptydocs,
 967 CMD_env,
 968 CMD_eq,
 969 CMD_error,
 970 CMD_field,
 971 CMD_filesize,
 972 CMD_filters,
 973 CMD_filterterms,
 974 CMD_find,
 975 CMD_fmt,
 976 CMD_freq,
 977 CMD_ge,
 978 CMD_gt,
 979 CMD_hash,
 980 CMD_highlight,
 981 CMD_hit,
 982 CMD_hitlist,
 983 CMD_hitsperpage,
 984 CMD_hostname,
 985 CMD_html,
 986 CMD_htmlstrip,
 987 CMD_httpheader,
 988 CMD_id,
 989 CMD_if,
 990 CMD_include,
 991 CMD_json,
 992 CMD_jsonarray,
 993 CMD_last,
 994 CMD_lastpage,
 995 CMD_le,
 996 CMD_length,
 997 CMD_list,
 998 CMD_log,
 999 CMD_lookup,
1000 CMD_lower,
1001 CMD_lt,
1002 CMD_map,
1003 CMD_match,
1004 CMD_max,
1005 CMD_min,
1006 CMD_mod,
1007 CMD_msize,
1008 CMD_msizeexact,
1009 CMD_msizelower,
1010 CMD_msizeupper,
1011 CMD_mul,
1012 CMD_muldiv,
1013 CMD_ne,
1014 CMD_nice,
1015 CMD_not,
1016 CMD_now,
1017 CMD_opt,
1018 CMD_or,
1019 CMD_ord,
1020 CMD_pack,
1021 CMD_percentage,
1022 CMD_prettyterm,
1023 CMD_prettyurl,
1024 CMD_query,
1025 CMD_querydescription,
1026 CMD_queryterms,
1027 CMD_range,
1028 CMD_record,
1029 CMD_relevant,
1030 CMD_relevants,
1031 CMD_score,
1032 CMD_set,
1033 CMD_seterror,
1034 CMD_setmap,
1035 CMD_setrelevant,
1036 CMD_slice,
1037 CMD_snippet,
1038 CMD_sort,
1039 CMD_split,
1040 CMD_stoplist,
1041 CMD_sub,
1042 CMD_subdb,
1043 CMD_subid,
1044 CMD_substr,
1045 CMD_suggestion,
1046 CMD_switch,
1047 CMD_termprefix,
1048 CMD_terms,
1049 CMD_thispage,
1050 CMD_time,
1051 CMD_topdoc,
1052 CMD_topterms,
1053 CMD_transform,
1054 CMD_truncate,
1055 CMD_uniq,
1056 CMD_unique,
1057 CMD_unpack,
1058 CMD_unprefix,
1059 CMD_unstem,
1060 CMD_upper,
1061 CMD_url,
1062 CMD_value,
1063 CMD_version,
1064 CMD_weight,
1065 CMD_MACRO // special tag for macro evaluation
1066 };
1067
// Attributes of one OmegaScript command (or user-defined macro).
//
// This is initialised positionally by the T() macro, so the order of the
// members must not change.
struct func_attrib {
    // CMD_xxx value for a built-in command; CMD_MACRO + index for a macro.
    int tag;
    // Minimum number of arguments; -1 means the argument text isn't split
    // at commas and no argument count checks or evaluation are done.
    int minargs;
    // Maximum number of arguments; -1 means no upper limit.
    int maxargs;
    // Number of leading arguments to evaluate before the command runs; -1
    // means evaluate them all.
    int evalargs;
    // 'Q' to make sure the query has been parsed first, 'M' to also make
    // sure the match has been run, or 0 for neither.
    char ensure;
};
1073
1074 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
1075 struct func_desc {
1076     const char *name;
1077     struct func_attrib a;
1078 };
1079
// Shorthands used in the table below:
//   N - "no limit" (maxargs), "evaluate all arguments" (evalargs), or, as
//       minargs, "don't split or evaluate the arguments at all"
//   M - the query must have been parsed and the match run first
//   Q - the query must have been parsed first
#define N -1
#define M 'M'
#define Q 'Q'
// NB when adding a new command which ensures M or Q, update the list in
// docs/omegascript.rst
//
// Table of the built-in OmegaScript commands.  eval() loads this into a map
// keyed on the command name the first time it is called.  The table is
// terminated by an entry with a NULL name.
static struct func_desc func_tab[] = {
//name minargs maxargs evalargs ensure
{"",{CMD_,         N, N, 0, 0}},// commented out code
T(add,             0, N, N, 0), // add a list of numbers
T(addfilter,       1, 1, N, 0), // add filter term
T(allterms,        0, 1, N, 0), // list of all terms matching document
T(and,             1, N, 0, 0), // logical shortcutting and of a list of values
T(cgi,             1, 1, N, 0), // return cgi parameter value
T(cgilist,         1, 1, N, 0), // return list of values for cgi parameter
T(cgiparams,       0, 0, N, 0), // return list of cgi parameter names
T(chr,             1, 1, N, 0), // return UTF-8 for given Unicode codepoint
T(collapsed,       0, 0, N, 0), // return number of hits collapsed into this
T(cond,            2, N, 0, 0), // cascaded conditionals: the first condition
                                // which is non-empty selects its value, with
                                // an optional trailing "else" value
T(contains,        2, 2, N, 0), // return position of substring, or empty string
T(csv,             1, 2, N, 0), // CSV string escaping
T(date,            1, 2, N, 0), // convert time_t to strftime format
                                // (default: YYYY-MM-DD)
T(dbname,          0, 0, N, 0), // database name
T(dbsize,          0, 0, N, 0), // database size (# of documents)
T(def,             2, 2, 1, 0), // define a macro
T(defaultop,       0, 0, N, 0), // default operator: "and" or "or"
T(div,             2, 2, N, 0), // integer divide
T(emptydocs,       0, 1, N, 0), // list of empty documents
T(env,             1, 1, N, 0), // environment variable
T(eq,              2, 2, N, 0), // test equality
T(error,           0, 0, N, 0), // error message
T(field,           1, 2, N, 0), // lookup field in record
T(filesize,        1, 1, N, 0), // pretty printed filesize
T(filters,         0, 0, N, 0), // serialisation of current filters
T(filterterms,     1, 1, N, 0), // list of terms with a given prefix
T(find,            2, 2, N, 0), // find entry in list
T(fmt,             0, 0, N, 0), // name of current format
T(freq,            1, 1, N, 0), // frequency of a term
T(ge,              2, 2, N, 0), // test >=
T(gt,              2, 2, N, 0), // test >
T(hash,            2, 2, N, 0), // hash a string using the specified hash function
T(highlight,       2, 4, N, 0), // html escape and highlight words from list
T(hit,             0, 0, N, 0), // hit number of current mset entry (0-based)
T(hitlist,         1, 1, 0, M), // display hitlist using format in argument
T(hitsperpage,     0, 0, N, 0), // hits per page
T(hostname,        1, 1, N, 0), // extract hostname from URL
T(html,            1, 1, N, 0), // html escape string (<>&")
T(htmlstrip,       1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
T(httpheader,      2, 2, N, 0), // arbitrary HTTP header
T(id,              0, 0, N, 0), // docid of current doc
T(if,              2, 3, 1, 0), // conditional
T(include,         1, 1, 1, 0), // include another file
T(json,            1, 1, N, 0), // JSON string escaping
T(jsonarray,       1, 1, N, 0), // Format list as a JSON array of strings
T(last,            0, 0, N, M), // hit number one beyond end of current page
T(lastpage,        0, 0, N, M), // number of last hit page
T(le,              2, 2, N, 0), // test <=
T(length,          1, 1, N, 0), // length of list
T(list,            2, 5, N, 0), // pretty print list
T(log,             1, 2, 1, 0), // create a log entry
T(lookup,          2, 2, N, 0), // lookup in named cdb file
T(lower,           1, 1, N, 0), // convert string to lower case
T(lt,              2, 2, N, 0), // test <
T(map,             2, 2, 1, 0), // map a list into another list
T(match,           2, 3, N, 0), // regex match
T(max,             1, N, N, 0), // maximum of a list of values
T(min,             1, N, N, 0), // minimum of a list of values
T(mod,             2, 2, N, 0), // integer modulus
T(msize,           0, 0, N, M), // number of matches (estimated)
T(msizeexact,      0, 0, N, M), // is $msize exact?
T(msizelower,      0, 0, N, M), // number of matches (lower bound)
T(msizeupper,      0, 0, N, M), // number of matches (upper bound)
T(mul,             2, N, N, 0), // multiply a list of numbers
T(muldiv,          3, 3, N, 0), // calculate A*B/C
T(ne,              2, 2, N, 0), // test not equal
T(nice,            1, 1, N, 0), // pretty print integer (with thousands sep)
T(not,             1, 1, N, 0), // logical not
T(now,             0, 0, N, 0), // current date/time as a time_t
T(opt,             1, 2, N, 0), // lookup an option value
T(or,              1, N, 0, 0), // logical shortcutting or of a list of values
T(ord,             1, 1, N, 0), // return codepoint for first character of UTF-8 string
T(pack,            1, 1, N, 0), // convert a number to a 4 byte big endian binary string
T(percentage,      0, 0, N, 0), // percentage score of current hit
T(prettyterm,      1, 1, N, Q), // pretty print term name
T(prettyurl,       1, 1, N, 0), // pretty version of URL
T(query,           0, 1, N, Q), // query
T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
T(queryterms,      0, 0, N, Q), // list of query terms
T(range,           2, 2, N, 0), // return list of values between start and end
T(record,          0, 1, N, 0), // record contents of document
T(relevant,        0, 1, N, Q), // is document relevant?
T(relevants,       0, 0, N, Q), // return list of relevant documents
T(score,           0, 0, N, 0), // score (0-10) of current hit
T(set,             2, 2, N, 0), // set option value
T(seterror,        1, 1, N, 0), // set error_msg, setting it early stops query execution
T(setmap,          1, N, N, 0), // set map of option values
T(setrelevant,     0, 1, N, Q), // set rset
T(slice,           2, 2, N, 0), // slice a list using a second list
T(snippet,         1, 2, N, M), // generate snippet from text
T(sort,            1, 2, N, M), // alpha sort a list
T(split,           1, 2, N, 0), // split a string to give a list
T(stoplist,        0, 0, N, Q), // return list of stopped terms
T(sub,             2, 2, N, 0), // subtract
T(subdb,           0, 1, N, 0), // name of subdb docid is in
T(subid,           0, 1, N, 0), // docid in the subdb#
T(substr,          2, 3, N, 0), // substring
T(suggestion,      0, 0, N, Q), // misspelled word correction suggestion
T(switch,          3, N, 1, 0), // switch on the first argument (see
                                // docs/omegascript.rst)
T(termprefix,      1, 1, N, 0), // get any prefix from a term
T(terms,           0, 1, N, M), // list of matching terms
T(thispage,        0, 0, N, M), // page number of current page
T(time,            0, 0, N, M), // how long the match took (in seconds)
T(topdoc,          0, 0, N, M), // first document on current page of hit list
                                // (counting from 0)
T(topterms,        0, 1, N, M), // list of up to N top relevance feedback terms
                                // (default 16)
T(transform,       3, 4, N, 0), // transform with a regexp
T(truncate,        2, 4, N, 0), // truncate after a word
T(uniq,            1, 1, N, 0), // remove duplicates from a sorted list
T(unique,          1, 1, N, 0), // remove duplicates from any list
T(unpack,          1, 1, N, 0), // convert 4 byte big endian binary string to a number
T(unprefix,        1, 1, N, 0), // remove any prefix from a term
T(unstem,          1, 1, N, Q), // return list of terms from the parsed query
                                // which stemmed to this term
T(upper,           1, 1, N, 0), // convert string to upper case
T(url,             1, 1, N, 0), // url encode argument
T(value,           1, 2, N, 0), // return document value
T(version,         0, 0, N, 0), // omega version string
T(weight,          0, 0, N, 0), // weight of the current hit
{ NULL,{0,         0, 0, 0, 0}}
};
1211
1212 #undef T // Leaving T defined screws up Sun's C++ compiler!
1213
// Bodies of the macros defined with $def, in definition order.  The macro
// stored at index i is given the tag CMD_MACRO + i.
static vector<string> macros;
1215
1216 // Call write() repeatedly until all data is written or we get a
1217 // non-recoverable error.
1218 static ssize_t
1219 write_all(int fd, const char * buf, size_t count)
1220 {
1221     while (count) {
1222         ssize_t r = write(fd, buf, count);
1223         if (rare(r < 0)) {
1224             if (errno == EINTR) continue;
1225             return r;
1226         }
1227         buf += r;
1228         count -= r;
1229     }
1230     return 0;
1231 }
1232
1233 static const vector<string>&
1234 get_subdbs()
1235 {
1236     static vector<string> subdbs;
1237     if (subdbs.empty()) {
1238         size_t p = 0, q;
1239         while (true) {
1240             q = dbname.find('/', p);
1241             subdbs.emplace_back(dbname, p, q - p);
1242             if (q == string::npos) break;
1243             p = q + 1;
1244         }
1245     }
1246     return subdbs;
1247 }
1248
1249 static string
1250 eval(const string &fmt, const vector<string> &param)
1251 {
1252     static map<string, const struct func_attrib *> func_map;
1253     if (func_map.empty()) {
1254         struct func_desc *p;
1255         for (p = func_tab; p->name != NULL; ++p) {
1256             func_map[string(p->name)] = &(p->a);
1257         }
1258     }
1259     string res;
1260     string::size_type p = 0, q;
1261     while ((q = fmt.find('$', p)) != string::npos) try {
1262         res.append(fmt, p, q - p);
1263         string::size_type code_start = q; // note down for error reporting
1264         q++;
1265         if (q >= fmt.size()) break;
1266         unsigned char ch = fmt[q];
1267         switch (ch) {
1268             // Magic sequences:
1269             // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1270             case '$':
1271                 res += '$';
1272                 p = q + 1;
1273                 continue;
1274             case '(':
1275                 res += '{';
1276                 p = q + 1;
1277                 continue;
1278             case ')':
1279                 res += '}';
1280                 p = q + 1;
1281                 continue;
1282             case '.':
1283                 res += ',';
1284                 p = q + 1;
1285                 continue;
1286             case '_':
1287                 ch = '0';
1288                 // FALL THRU
1289             case '1': case '2': case '3': case '4': case '5':
1290             case '6': case '7': case '8': case '9':
1291                 ch -= '0';
1292                 if (ch < param.size()) res += param[ch];
1293                 p = q + 1;
1294                 continue;
1295             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1296             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1297             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1298             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1299             case 'y': case 'z':
1300             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1301             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1302             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1303             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1304             case 'Y': case 'Z':
1305             case '{':
1306                 break;
1307             default:
1308                 string msg = "Unknown $ code in: $";
1309                 msg.append(fmt, q, string::npos);
1310                 throw msg;
1311         }
1312         p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1313         string var(fmt, q, p - q);
1314         map<string, const struct func_attrib *>::const_iterator func;
1315         func = func_map.find(var);
1316         if (func == func_map.end()) {
1317             throw "Unknown function '" + var + "'";
1318         }
1319         vector<string> args;
1320         if (fmt[p] == '{') {
1321             q = p + 1;
1322             int nest = 1;
1323             while (true) {
1324                 p = fmt.find_first_of(",{}", p + 1);
1325                 if (p == string::npos)
1326                     throw "missing } in " + fmt.substr(code_start);
1327                 if (fmt[p] == '{') {
1328                     ++nest;
1329                 } else {
1330                     if (nest == 1) {
1331                         // should we split the args
1332                         if (func->second->minargs != N) {
1333                             args.push_back(fmt.substr(q, p - q));
1334                             q = p + 1;
1335                         }
1336                     }
1337                     if (fmt[p] == '}' && --nest == 0) break;
1338                 }
1339             }
1340             if (func->second->minargs == N)
1341                 args.push_back(fmt.substr(q, p - q));
1342             ++p;
1343         }
1344
1345         if (func->second->minargs != N) {
1346             if (int(args.size()) < func->second->minargs)
1347                 throw "too few arguments to $" + var;
1348             if (func->second->maxargs != N &&
1349                 int(args.size()) > func->second->maxargs)
1350                 throw "too many arguments to $" + var;
1351
1352             vector<string>::size_type n;
1353             if (func->second->evalargs != N)
1354                 n = func->second->evalargs;
1355             else
1356                 n = args.size();
1357
1358             for (vector<string>::size_type j = 0; j < n; ++j)
1359                 args[j] = eval(args[j], param);
1360         }
1361         if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1362             ensure_query_parsed();
1363         if (func->second->ensure == 'M') ensure_match();
1364         string value;
1365         switch (func->second->tag) {
1366             case CMD_:
1367                 break;
1368             case CMD_add: {
1369                 int total = 0;
1370                 for (auto&& arg : args)
1371                     total += string_to_int(arg);
1372                 value = str(total);
1373                 break;
1374             }
1375             case CMD_addfilter:
1376                 add_bterm(args[0]);
1377                 break;
1378             case CMD_allterms: {
1379                 // list of all terms indexing document
1380                 Xapian::docid id = q0;
1381                 if (!args.empty()) id = string_to_int(args[0]);
1382                 for (Xapian::TermIterator term = db.termlist_begin(id);
1383                      term != db.termlist_end(id); ++term) {
1384                     value += *term;
1385                     value += '\t';
1386                 }
1387
1388                 if (!value.empty()) value.erase(value.size() - 1);
1389                 break;
1390             }
1391             case CMD_and: {
1392                 value = "true";
1393                 for (auto&& arg : args) {
1394                     if (eval(arg, param).empty()) {
1395                         value.resize(0);
1396                         break;
1397                     }
1398                 }
1399                 break;
1400             }
1401             case CMD_cgi: {
1402                 auto i = cgi_params.find(args[0]);
1403                 if (i != cgi_params.end()) value = i->second;
1404                 break;
1405             }
1406             case CMD_cgilist: {
1407                 auto g = cgi_params.equal_range(args[0]);
1408                 for (auto i = g.first; i != g.second; ++i) {
1409                     value += i->second;
1410                     value += '\t';
1411                 }
1412                 if (!value.empty()) value.erase(value.size() - 1);
1413                 break;
1414             }
1415             case CMD_cgiparams: {
1416                 const string* prev = NULL;
1417                 for (auto&& i : cgi_params) {
1418                     if (prev && i.first == *prev) continue;
1419                     value += i.first;
1420                     value += '\t';
1421                     prev = &i.first;
1422                 }
1423                 if (!value.empty()) value.erase(value.size() - 1);
1424                 break;
1425             }
1426             case CMD_chr:
1427                 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1428                 break;
1429             case CMD_collapsed: {
1430                 value = str(collapsed);
1431                 break;
1432             }
1433             case CMD_cond:
1434                 for (size_t i = 0; i < args.size(); i += 2) {
1435                     if (i == args.size() - 1) {
1436                         // Handle optional "else" value.
1437                         value = eval(args[i], param);
1438                         break;
1439                     }
1440                     if (!eval(args[i], param).empty()) {
1441                         value = eval(args[i + 1], param);
1442                         break;
1443                     }
1444                 }
1445                 break;
1446             case CMD_contains: {
1447                 size_t pos = args[1].find(args[0]);
1448                 if (pos != string::npos) {
1449                     value = str(pos);
1450                 }
1451                 break;
1452             }
1453             case CMD_csv:
1454                 value = args[0];
1455                 if (args.size() > 1 && !args[1].empty()) {
1456                     csv_escape_always(value);
1457                 } else {
1458                     csv_escape(value);
1459                 }
1460                 break;
1461             case CMD_date:
1462                 value = args[0];
1463                 if (!value.empty()) {
1464                     char buf[64] = "";
1465                     time_t date = string_to_int(value);
1466                     if (date != static_cast<time_t>(-1)) {
1467                         struct tm *then;
1468                         then = gmtime(&date);
1469                         string date_fmt = "%Y-%m-%d";
1470                         if (args.size() > 1) date_fmt = eval(args[1], param);
1471                         strftime(buf, sizeof buf, date_fmt.c_str(), then);
1472                     }
1473                     value = buf;
1474                 }
1475                 break;
1476             case CMD_dbname:
1477                 value = dbname;
1478                 break;
1479             case CMD_dbsize: {
1480                 static Xapian::doccount dbsize;
1481                 if (!dbsize) dbsize = db.get_doccount();
1482                 value = str(dbsize);
1483                 break;
1484             }
1485             case CMD_def: {
1486                 func_attrib *fa = new func_attrib;
1487                 fa->tag = CMD_MACRO + macros.size();
1488                 fa->minargs = 0;
1489                 fa->maxargs = 9;
1490                 fa->evalargs = N; // FIXME: or 0?
1491                 fa->ensure = 0;
1492
1493                 macros.push_back(args[1]);
1494                 func_map[args[0]] = fa;
1495                 break;
1496             }
1497             case CMD_defaultop:
1498                 if (default_op == Xapian::Query::OP_AND) {
1499                     value = "and";
1500                 } else {
1501                     value = "or";
1502                 }
1503                 break;
1504             case CMD_div: {
1505                 int denom = string_to_int(args[1]);
1506                 if (denom == 0) {
1507                     value = "divide by 0";
1508                 } else {
1509                     value = str(string_to_int(args[0]) /
1510                                 string_to_int(args[1]));
1511                 }
1512                 break;
1513             }
1514             case CMD_emptydocs: {
1515                 string t;
1516                 if (!args.empty())
1517                     t = args[0];
1518                 Xapian::PostingIterator i;
1519                 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1520                     if (i.get_doclength() != 0) continue;
1521                     if (!value.empty()) value += '\t';
1522                     value += str(*i);
1523                 }
1524                 break;
1525             }
1526             case CMD_env: {
1527                 char *env = getenv(args[0].c_str());
1528                 if (env != NULL) value = env;
1529                 break;
1530             }
1531             case CMD_eq:
1532                 if (args[0] == args[1]) value = "true";
1533                 break;
1534             case CMD_error:
1535                 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1536                     error_msg = "Database '" + dbname + "' couldn't be opened";
1537                 }
1538                 value = error_msg;
1539                 break;
1540             case CMD_field: {
1541                 Xapian::docid did = q0;
1542                 if (args.size() > 1) did = string_to_int(args[1]);
1543                 value = fields.get_field(did, args[0]);
1544                 break;
1545             }
1546             case CMD_filesize: {
1547                 // FIXME: rounding?  i18n?
1548                 int size = string_to_int(args[0]);
1549                 int intpart = size;
1550                 int fraction = -1;
1551                 const char * format = 0;
1552                 if (size < 0) {
1553                     // Negative size -> empty result.
1554                 } else if (size == 1) {
1555                     format = "%d byte";
1556                 } else if (size < 1024) {
1557                     format = "%d bytes";
1558                 } else {
1559                     if (size < 1024 * 1024) {
1560                         format = "%d.%cK";
1561                     } else {
1562                         size /= 1024;
1563                         if (size < 1024 * 1024) {
1564                             format = "%d.%cM";
1565                         } else {
1566                             size /= 1024;
1567                             format = "%d.%cG";
1568                         }
1569                     }
1570                     intpart = unsigned(size) / 1024;
1571                     fraction = unsigned(size) % 1024;
1572                 }
1573                 if (format) {
1574                     char buf[200];
1575                     int len;
1576                     if (fraction == -1) {
1577                         len = my_snprintf(buf, sizeof(buf), format, intpart);
1578                     } else {
1579                         fraction = (fraction * 10 / 1024) + '0';
1580                         len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1581                     }
1582                     if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1583                     value.assign(buf, len);
1584                 }
1585                 break;
1586             }
1587             case CMD_filters:
1588                 value = filters;
1589                 break;
1590             case CMD_filterterms: {
1591                 Xapian::TermIterator term = db.allterms_begin();
1592                 term.skip_to(args[0]);
1593                 while (term != db.allterms_end()) {
1594                     string t = *term;
1595                     if (!startswith(t, args[0])) break;
1596                     value += t;
1597                     value += '\t';
1598                     ++term;
1599                 }
1600
1601                 if (!value.empty()) value.erase(value.size() - 1);
1602                 break;
1603             }
1604             case CMD_find: {
1605                 string l = args[0], s = args[1];
1606                 string::size_type i = 0, j = 0;
1607                 size_t count = 0;
1608                 while (j != l.size()) {
1609                     j = l.find('\t', i);
1610                     if (j == string::npos) j = l.size();
1611                     if (j - i == s.length()) {
1612                         if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1613                             value = str(count);
1614                             break;
1615                         }
1616                     }
1617                     ++count;
1618                     i = j + 1;
1619                 }
1620                 break;
1621             }
1622             case CMD_fmt:
1623                 value = fmtname;
1624                 break;
1625             case CMD_freq: {
1626                 const string& term = args[0];
1627                 Xapian::doccount termfreq = 0;
1628                 if (done_query) {
1629                     termfreq = mset.get_termfreq(term);
1630                 }
1631                 if (termfreq == 0) {
1632                     // We want $freq to work before the match is run, and we
1633                     // don't want using it to force the match to run.
1634                     termfreq = db.get_termfreq(term);
1635                 }
1636                 value = str(termfreq);
1637                 break;
1638             }
1639             case CMD_ge:
1640                 if (string_to_int(args[0]) >= string_to_int(args[1]))
1641                     value = "true";
1642                 break;
1643             case CMD_gt:
1644                 if (string_to_int(args[0]) > string_to_int(args[1]))
1645                     value = "true";
1646                 break;
1647             case CMD_hash: {
1648                 const string& data = args[0];
1649                 const string& hash = args[1];
1650                 if (hash == "md5") {
1651                     string md5;
1652                     md5_string(data, md5);
1653                     value.reserve(md5.size() * 2);
1654                     for (unsigned char byte : md5) {
1655                         value += "0123456789abcdef"[byte >> 4];
1656                         value += "0123456789abcdef"[byte & 0x0f];
1657                     }
1658                 } else {
1659                     throw "Unknown hash function: " + hash;
1660                 }
1661                 break;
1662             }
1663             case CMD_highlight: {
1664                 string bra, ket;
1665                 if (args.size() > 2) {
1666                     bra = args[2];
1667                     if (args.size() > 3) {
1668                         ket = args[3];
1669                     } else {
1670                         string::const_iterator i;
1671                         i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1672                         ket = "</";
1673                         ket.append(bra, 1, i - bra.begin() - 1);
1674                         ket += '>';
1675                     }
1676                 }
1677
1678                 value = html_highlight(args[0], args[1], bra, ket);
1679                 break;
1680             }
1681             case CMD_hit:
1682                 // 0-based mset index
1683                 value = str(hit_no);
1684                 break;
1685             case CMD_hitlist:
1686 #if 0
1687                 url_query_string = "?DB=";
1688                 url_query_string += dbname;
1689                 for (auto& j : query_strings) {
1690                     if (j.first.empty()) {
1691                         url_query_string += "&P=";
1692                     } else {
1693                         url_query_string += "&P."
1694                         url_query_string += j.first;
1695                         url_query_string += '=';
1696                     }
1697                     const char *q = j.second.c_str();
1698                     int ch;
1699                     while ((ch = *q++) != '\0') {
1700                         switch (ch) {
1701                          case '+':
1702                             url_query_string += "%2b";
1703                             break;
1704                          case '"':
1705                             url_query_string += "%22";
1706                             break;
1707                          case '%':
1708                             url_query_string += "%25";
1709                             break;
1710                          case '&':
1711                             url_query_string += "%26";
1712                             break;
1713                          case ' ':
1714                             ch = '+';
1715                             /* fall through */
1716                          default:
1717                             url_query_string += ch;
1718                         }
1719                     }
1720                 }
1721                 // add any boolean terms
1722                 for (auto i = filter_map.begin(); i != filter_map.end(); ++i) {
1723                     url_query_string += "&B=";
1724                     url_query_string += i->second;
1725                 }
1726 #endif
1727                 for (hit_no = topdoc; hit_no < last; ++hit_no)
1728                     value += print_caption(args[0], param);
1729                 hit_no = 0;
1730                 break;
1731             case CMD_hitsperpage:
1732                 value = str(hits_per_page);
1733                 break;
1734             case CMD_hostname: {
1735                 value = args[0];
1736                 // remove URL scheme and/or path
1737                 string::size_type i = value.find("://");
1738                 if (i == string::npos) i = 0; else i += 3;
1739                 value = value.substr(i, value.find('/', i) - i);
1740                 // remove user@ or user:password@
1741                 i = value.find('@');
1742                 if (i != string::npos) value.erase(0, i + 1);
1743                 // remove :port
1744                 i = value.find(':');
1745                 if (i != string::npos) value.resize(i);
1746                 break;
1747             }
1748             case CMD_html:
1749                 value = html_escape(args[0]);
1750                 break;
1751             case CMD_htmlstrip:
1752                 value = html_strip(args[0]);
1753                 break;
1754             case CMD_httpheader:
1755                 if (!suppress_http_headers) {
1756                     cout << args[0] << ": " << args[1] << endl;
1757                     if (!set_content_type && args[0].length() == 12 &&
1758                             strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1759                         set_content_type = true;
1760                     }
1761                 }
1762                 break;
1763             case CMD_id:
1764                 // document id
1765                 value = str(q0);
1766                 break;
1767             case CMD_if:
1768                 if (!args[0].empty())
1769                     value = eval(args[1], param);
1770                 else if (args.size() > 2)
1771                     value = eval(args[2], param);
1772                 break;
1773             case CMD_include:
1774                 value = eval_file(args[0]);
1775                 break;
1776             case CMD_json:
1777                 value = args[0];
1778                 json_escape(value);
1779                 break;
1780             case CMD_jsonarray: {
1781                 const string & l = args[0];
1782                 string::size_type i = 0, j;
1783                 if (l.empty()) {
1784                     value = "[]";
1785                     break;
1786                 }
1787                 value = "[\"";
1788                 while (true) {
1789                     j = l.find('\t', i);
1790                     string elt(l, i, j - i);
1791                     json_escape(elt);
1792                     value += elt;
1793                     if (j == string::npos) break;
1794                     value += "\",\"";
1795                     i = j + 1;
1796                 }
1797                 value += "\"]";
1798                 break;
1799             }
1800             case CMD_last:
1801                 value = str(last);
1802                 break;
1803             case CMD_lastpage: {
1804                 int l = mset.get_matches_estimated();
1805                 if (l > 0) l = (l - 1) / hits_per_page + 1;
1806                 value = str(l);
1807                 break;
1808             }
1809             case CMD_le:
1810                 if (string_to_int(args[0]) <= string_to_int(args[1]))
1811                     value = "true";
1812                 break;
1813             case CMD_length:
1814                 if (args[0].empty()) {
1815                     value = "0";
1816                 } else {
1817                     size_t length = count(args[0].begin(), args[0].end(), '\t');
1818                     value = str(length + 1);
1819                 }
1820                 break;
1821             case CMD_list: {
1822                 if (!args[0].empty()) {
1823                     string pre, inter, interlast, post;
1824                     switch (args.size()) {
1825                      case 2:
1826                         inter = interlast = args[1];
1827                         break;
1828                      case 3:
1829                         inter = args[1];
1830                         interlast = args[2];
1831                         break;
1832                      case 4:
1833                         pre = args[1];
1834                         inter = interlast = args[2];
1835                         post = args[3];
1836                         break;
1837                      case 5:
1838                         pre = args[1];
1839                         inter = args[2];
1840                         interlast = args[3];
1841                         post = args[4];
1842                         break;
1843                     }
1844                     value += pre;
1845                     string list = args[0];
1846                     string::size_type split = 0, split2;
1847                     while ((split2 = list.find('\t', split)) != string::npos) {
1848                         if (split) value += inter;
1849                         value.append(list, split, split2 - split);
1850                         split = split2 + 1;
1851                     }
1852                     if (split) value += interlast;
1853                     value.append(list, split, string::npos);
1854                     value += post;
1855                 }
1856                 break;
1857             }
1858             case CMD_log: {
1859                 if (!vet_filename(args[0])) break;
1860                 string logfile = log_dir + args[0];
1861                 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1862                 if (fd == -1) break;
1863                 vector<string> noargs;
1864                 noargs.resize(1);
1865                 string line;
1866                 if (args.size() > 1) {
1867                     line = args[1];
1868                 } else {
1869                     line = DEFAULT_LOG_ENTRY;
1870                 }
1871                 line = eval(line, noargs);
1872                 line += '\n';
1873                 (void)write_all(fd, line.data(), line.length());
1874                 close(fd);
1875                 break;
1876             }
1877             case CMD_lookup: {
1878                 if (!vet_filename(args[0])) break;
1879                 string cdbfile = cdb_dir + args[0];
1880                 int fd = open(cdbfile.c_str(), O_RDONLY);
1881                 if (fd == -1) break;
1882
1883                 struct cdb cdb;
1884                 cdb_init(&cdb, fd);
1885
1886                 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1887                     size_t datalen = cdb_datalen(&cdb);
1888                     const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1889                     if (q) {
1890                         value.assign(static_cast<const char *>(dat), datalen);
1891                     }
1892                 }
1893
1894                 cdb_free(&cdb);
1895                 close(fd); // FIXME: cache fds?
1896                 break;
1897             }
1898             case CMD_lower:
1899                 value = Xapian::Unicode::tolower(args[0]);
1900                 break;
1901             case CMD_lt:
1902                 if (string_to_int(args[0]) < string_to_int(args[1]))
1903                     value = "true";
1904                 break;
1905             case CMD_map:
1906                 if (!args[0].empty()) {
1907                     string l = args[0], pat = args[1];
1908                     vector<string> new_args(param);
1909                     string::size_type i = 0, j;
1910                     while (true) {
1911                         j = l.find('\t', i);
1912                         new_args[0] = l.substr(i, j - i);
1913                         value += eval(pat, new_args);
1914                         if (j == string::npos) break;
1915                         value += '\t';
1916                         i = j + 1;
1917                     }
1918                 }
1919                 break;
1920             case CMD_match:
1921                 omegascript_match(value, args);
1922                 break;
1923             case CMD_max: {
1924                 vector<string>::const_iterator i = args.begin();
1925                 int val = string_to_int(*i++);
1926                 for (; i != args.end(); ++i) {
1927                     int x = string_to_int(*i);
1928                     if (x > val) val = x;
1929                 }
1930                 value = str(val);
1931                 break;
1932             }
1933             case CMD_min: {
1934                 vector<string>::const_iterator i = args.begin();
1935                 int val = string_to_int(*i++);
1936                 for (; i != args.end(); ++i) {
1937                     int x = string_to_int(*i);
1938                     if (x < val) val = x;
1939                 }
1940                 value = str(val);
1941                 break;
1942             }
1943             case CMD_msize:
1944                 // Estimated number of matches.
1945                 value = str(mset.get_matches_estimated());
1946                 break;
1947             case CMD_msizeexact:
1948                 // Is msize exact?
1949                 if (mset.get_matches_lower_bound()
1950                     == mset.get_matches_upper_bound())
1951                     value = "true";
1952                 break;
1953             case CMD_msizelower:
1954                 // Lower bound on number of matches.
1955                 value = str(mset.get_matches_lower_bound());
1956                 break;
1957             case CMD_msizeupper:
1958                 // Upper bound on number of matches.
1959                 value = str(mset.get_matches_upper_bound());
1960                 break;
1961             case CMD_mod: {
1962                 int denom = string_to_int(args[1]);
1963                 if (denom == 0) {
1964                     value = "divide by 0";
1965                 } else {
1966                     value = str(string_to_int(args[0]) %
1967                                 string_to_int(args[1]));
1968                 }
1969                 break;
1970             }
1971             case CMD_mul: {
1972                 vector<string>::const_iterator i = args.begin();
1973                 int total = string_to_int(*i++);
1974                 while (i != args.end())
1975                     total *= string_to_int(*i++);
1976                 value = str(total);
1977                 break;
1978             }
1979             case CMD_muldiv: {
1980                 int denom = string_to_int(args[2]);
1981                 if (denom == 0) {
1982                     value = "divide by 0";
1983                 } else {
1984                     int num = string_to_int(args[0]) * string_to_int(args[1]);
1985                     value = str(num / denom);
1986                 }
1987                 break;
1988             }
1989             case CMD_ne:
1990                 if (args[0] != args[1]) value = "true";
1991                 break;
1992             case CMD_nice: {
1993                 string::const_iterator i = args[0].begin();
1994                 int len = args[0].length();
1995                 while (len) {
1996                     value += *i++;
1997                     if (--len && len % 3 == 0) value += option["thousand"];
1998                 }
1999                 break;
2000             }
2001             case CMD_not:
2002                 if (args[0].empty()) value = "true";
2003                 break;
2004             case CMD_now:
2005                 value = str(static_cast<unsigned long>(time(NULL)));
2006                 break;
2007             case CMD_opt:
2008                 if (args.size() == 2) {
2009                     value = option[args[0] + "," + args[1]];
2010                 } else {
2011                     value = option[args[0]];
2012                 }
2013                 break;
2014             case CMD_or: {
2015                 for (auto&& arg : args) {
2016                     value = eval(arg, param);
2017                     if (!value.empty()) break;
2018                 }
2019                 break;
2020             }
2021             case CMD_ord: {
2022                 if (!args[0].empty()) {
2023                     Utf8Iterator it(args[0]);
2024                     value = str(*it);
2025                 }
2026                 break;
2027             }
2028             case CMD_pack:
2029                 value = int_to_binary_string(string_to_int(args[0]));
2030                 break;
2031             case CMD_percentage:
2032                 // percentage score
2033                 value = str(percent);
2034                 break;
2035             case CMD_prettyterm:
2036                 value = pretty_term(args[0]);
2037                 break;
2038             case CMD_prettyurl:
2039                 value = args[0];
2040                 url_prettify(value);
2041                 break;
2042             case CMD_query: {
2043                 auto r = query_strings.equal_range(args.empty() ?
2044                                                    string() : args[0]);
2045                 for (auto j = r.first; j != r.second; ++j) {
2046                     if (!value.empty()) value += '\t';
2047                     const string & s = j->second;
2048                     size_t start = 0, tab;
2049                     while ((tab = s.find('\t', start)) != string::npos) {
2050                         value.append(s, start, tab - start);
2051                         value += ' ';
2052                         start = tab + 1;
2053                     }
2054                     value.append(s, start, string::npos);
2055                 }
2056                 break;
2057             }
2058             case CMD_querydescription:
2059                 value = query.get_description();
2060                 break;
2061             case CMD_queryterms:
2062                 value = queryterms;
2063                 break;
2064             case CMD_range: {
2065                 int start = string_to_int(args[0]);
2066                 int end = string_to_int(args[1]);
2067                 while (start <= end) {
2068                     value += str(start);
2069                     if (start < end) value += '\t';
2070                     start++;
2071                 }
2072                 break;
2073             }
2074             case CMD_record: {
2075                 Xapian::docid id = q0;
2076                 if (!args.empty()) id = string_to_int(args[0]);
2077                 value = db.get_document(id).get_data();
2078                 break;
2079             }
2080             case CMD_relevant: {
2081                 // document id if relevant; empty otherwise
2082                 Xapian::docid id = q0;
2083                 if (!args.empty()) id = string_to_int(args[0]);
2084                 auto i = ticked.find(id);
2085                 if (i != ticked.end()) {
2086                     i->second = false; // icky side-effect
2087                     value = str(id);
2088                 }
2089                 break;
2090             }
2091             case CMD_relevants: {
2092                 for (auto i : ticked) {
2093                     if (i.second) {
2094                         value += str(i.first);
2095                         value += '\t';
2096                     }
2097                 }
2098                 if (!value.empty()) value.erase(value.size() - 1);
2099                 break;
2100             }
2101             case CMD_score:
2102                 // Score (0 to 10)
2103                 value = str(percent / 10);
2104                 break;
2105             case CMD_set:
2106                 option[args[0]] = args[1];
2107                 break;
2108             case CMD_seterror:
2109                 error_msg = args[0];
2110                 break;
2111             case CMD_setmap: {
2112                 string base = args[0] + ',';
2113                 if (args.size() % 2 != 1)
2114                     throw string("$setmap requires an odd number of arguments");
2115                 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2116                     option[base + args[i]] = args[i + 1];
2117                 }
2118                 break;
2119             }
2120             case CMD_setrelevant: {
2121                 string::size_type i = 0, j;
2122                 while (true) {
2123                     j = args[0].find_first_not_of("0123456789", i);
2124                     Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2125                     if (id) {
2126                         rset.add_document(id);
2127                         ticked[id] = true;
2128                     }
2129                     if (j == string::npos) break;
2130                     i = j + 1;
2131                 }
2132                 break;
2133             }
2134             case CMD_slice: {
2135                 string list = args[0], pos = args[1];
2136                 vector<string> items;
2137                 string::size_type i = 0, j;
2138                 while (true) {
2139                     j = list.find('\t', i);
2140                     items.push_back(list.substr(i, j - i));
2141                     if (j == string::npos) break;
2142                     i = j + 1;
2143                 }
2144                 i = 0;
2145                 bool have_added = false;
2146                 while (true) {
2147                     j = pos.find('\t', i);
2148                     int item = string_to_int(pos.substr(i, j - i));
2149                     if (item >= 0 && size_t(item) < items.size()) {
2150                         if (have_added) value += '\t';
2151                         value += items[item];
2152                         have_added = true;
2153                     }
2154                     if (j == string::npos) break;
2155                     i = j + 1;
2156                 }
2157                 break;
2158             }
2159             case CMD_snippet: {
2160                 size_t length = 200;
2161                 if (args.size() > 1) {
2162                     length = string_to_int(args[1]);
2163                 }
2164                 if (!stemmer)
2165                     stemmer = new Xapian::Stem(option["stemmer"]);
2166                 // FIXME: Allow start and end highlight and omit to be specified.
2167                 value = mset.snippet(args[0], length, *stemmer,
2168                                      mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2169                                      "<strong>", "</strong>", "...");
2170                 break;
2171             }
2172             case CMD_sort:
2173                 omegascript_sort(args, value);
2174                 break;
2175             case CMD_split: {
2176                 string split;
2177                 if (args.size() == 1) {
2178                     split = " ";
2179                     value = args[0];
2180                 } else {
2181                     split = args[0];
2182                     value = args[1];
2183                 }
2184                 string::size_type i = 0;
2185                 while (true) {
2186                     if (split.empty()) {
2187                         ++i;
2188                         if (i >= value.size()) break;
2189                     } else {
2190                         i = value.find(split, i);
2191                         if (i == string::npos) break;
2192                     }
2193                     value.replace(i, split.size(), 1, '\t');
2194                     ++i;
2195                 }
2196                 break;
2197             }
2198             case CMD_stoplist: {
2199                 Xapian::TermIterator i = qp.stoplist_begin();
2200                 Xapian::TermIterator end = qp.stoplist_end();
2201                 while (i != end) {
2202                     if (!value.empty()) value += '\t';
2203                     value += *i;
2204                     ++i;
2205                 }
2206                 break;
2207             }
2208             case CMD_sub:
2209                 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2210                 break;
2211             case CMD_subdb: {
2212                 Xapian::docid id = q0;
2213                 if (args.size() > 0) id = string_to_int(args[0]);
2214                 auto subdbs = get_subdbs();
2215                 value = subdbs[(id - 1) % subdbs.size()];
2216                 break;
2217             }
2218             case CMD_subid: {
2219                 Xapian::docid id = q0;
2220                 if (args.size() > 0) id = string_to_int(args[0]);
2221                 value = str(((id - 1) / get_subdbs().size()) + 1);
2222                 break;
2223             }
2224             case CMD_substr: {
2225                 int start = string_to_int(args[1]);
2226                 if (start < 0) {
2227                     if (static_cast<size_t>(-start) >= args[0].size()) {
2228                         start = 0;
2229                     } else {
2230                         start = static_cast<int>(args[0].size()) + start;
2231                     }
2232                 } else {
2233                     if (static_cast<size_t>(start) >= args[0].size()) break;
2234                 }
2235                 size_t len = string::npos;
2236                 if (args.size() > 2) {
2237                     int int_len = string_to_int(args[2]);
2238                     if (int_len >= 0) {
2239                         len = size_t(int_len);
2240                     } else {
2241                         len = args[0].size() - start;
2242                         if (static_cast<size_t>(-int_len) >= len) {
2243                             len = 0;
2244                         } else {
2245                             len -= static_cast<size_t>(-int_len);
2246                         }
2247                     }
2248                 }
2249                 value.assign(args[0], start, len);
2250                 break;
2251             }
2252             case CMD_suggestion:
2253                 value = qp.get_corrected_query_string();
2254                 break;
2255             case CMD_switch: {
2256                 const string& val = args[0];
2257                 for (size_t i = 1; i < args.size(); i += 2) {
2258                     if (i == args.size() - 1) {
2259                         // Handle optional "else" value.
2260                         value = eval(args[i], param);
2261                         break;
2262                     }
2263                     if (val == eval(args[i], param)) {
2264                         value = eval(args[i + 1], param);
2265                         break;
2266                     }
2267                 }
2268                 break;
2269             }
2270             case CMD_termprefix:
2271                 (void)prefix_from_term(&value, args[0]);
2272                 break;
2273             case CMD_terms: {
2274                 // list of matching terms
2275                 if (!enquire) break;
2276                 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2277                 if (args.empty()) {
2278                     while (term != enquire->get_matching_terms_end(q0)) {
2279                         // check term was in the typed query so we ignore
2280                         // boolean filter terms
2281                         const string & t = *term;
2282                         if (termset.find(t) != termset.end()) {
2283                             value += t;
2284                             value += '\t';
2285                         }
2286                         ++term;
2287                     }
2288                 } else {
2289                     // Return matching terms with specified prefix.  We can't
2290                     // use skip_to() as the terms aren't ordered by termname.
2291                     const string & pfx = args[0];
2292                     while (term != enquire->get_matching_terms_end(q0)) {
2293                         const string & t = *term;
2294                         if (startswith(t, pfx)) {
2295                             value += t;
2296                             value += '\t';
2297                         }
2298                         ++term;
2299                     }
2300                 }
2301
2302                 if (!value.empty()) value.erase(value.size() - 1);
2303                 break;
2304             }
2305             case CMD_thispage:
2306                 value = str(topdoc / hits_per_page + 1);
2307                 break;
2308             case CMD_time:
2309                 if (secs >= 0) {
2310                     char buf[64];
2311                     my_snprintf(buf, sizeof(buf), "%.6f", secs);
2312                     // MSVC's snprintf omits the zero byte if the string is
2313                     // sizeof(buf) long.
2314                     buf[sizeof(buf) - 1] = '\0';
2315                     value = buf;
2316                 }
2317                 break;
2318             case CMD_topdoc:
2319                 // first document on current page of hit list (counting from 0)
2320                 value = str(topdoc);
2321                 break;
2322             case CMD_topterms:
2323                 if (enquire) {
2324                     int howmany = 16;
2325                     if (!args.empty()) howmany = string_to_int(args[0]);
2326                     if (howmany < 0) howmany = 0;
2327
2328                     // List of expand terms
2329                     Xapian::ESet eset;
2330                     OmegaExpandDecider decider(db, &termset);
2331
2332                     if (!rset.empty()) {
2333                         set_expansion_scheme(*enquire, option);
2334                         eset = enquire->get_eset(howmany * 2, rset, &decider);
2335                     } else if (mset.size()) {
2336                         // invent an rset
2337                         Xapian::RSet tmp;
2338
2339                         int c = 5;
2340                         // FIXME: what if mset does not start at first match?
2341                         for (Xapian::docid did : mset) {
2342                             tmp.add_document(did);
2343                             if (--c == 0) break;
2344                         }
2345
2346                         set_expansion_scheme(*enquire, option);
2347                         eset = enquire->get_eset(howmany * 2, tmp, &decider);
2348                     }
2349
2350                     // Don't show more than one word with the same stem.
2351                     set<string> stems;
2352                     Xapian::ESetIterator i;
2353                     for (i = eset.begin(); i != eset.end(); ++i) {
2354                         string term(*i);
2355                         string stem = (*stemmer)(term);
2356                         if (stems.find(stem) != stems.end()) continue;
2357                         stems.insert(stem);
2358                         value += term;
2359                         value += '\t';
2360                         if (--howmany == 0) break;
2361                     }
2362                     if (!value.empty()) value.erase(value.size() - 1);
2363                 }
2364                 break;
2365             case CMD_transform:
2366                 omegascript_transform(value, args);
2367                 break;
2368             case CMD_truncate:
2369                 value = generate_sample(args[0],
2370                                         string_to_int(args[1]),
2371                                         args.size() > 2 ? args[2] : string(),
2372                                         args.size() > 3 ? args[3] : string());
2373                 break;
2374             case CMD_uniq: {
2375                 const string &list = args[0];
2376                 if (list.empty()) break;
2377                 string::size_type split = 0, split2;
2378                 string prev;
2379                 do {
2380                     split2 = list.find('\t', split);
2381                     string item(list, split, split2 - split);
2382                     if (split == 0) {
2383                         value = item;
2384                     } else if (item != prev) {
2385                         value += '\t';
2386                         value += item;
2387                     }
2388                     prev = item;
2389                     split = split2 + 1;
2390                 } while (split2 != string::npos);
2391                 break;
2392             }
2393             case CMD_unique: {
2394                 unordered_set<string> seen;
2395                 const string &list = args[0];
2396                 if (list.empty()) break;
2397                 string::size_type split = 0, split2;
2398                 do {
2399                     split2 = list.find('\t', split);
2400                     string item(list, split, split2 - split);
2401                     if (seen.insert(item).second) {
2402                         if (split != 0)
2403                             value += '\t';
2404                         value += item;
2405                     }
2406                     split = split2 + 1;
2407                 } while (split2 != string::npos);
2408                 break;
2409             }
2410             case CMD_unpack:
2411                 value = str(binary_string_to_int(args[0]));
2412                 break;
2413             case CMD_unprefix: {
2414                 size_t prefix_len = prefix_from_term(NULL, args[0]);
2415                 value.assign(args[0], prefix_len, string::npos);
2416                 break;
2417             }
2418             case CMD_unstem: {
2419                 const string &term = args[0];
2420                 Xapian::TermIterator i = qp.unstem_begin(term);
2421                 Xapian::TermIterator end = qp.unstem_end(term);
2422                 while (i != end) {
2423                     if (!value.empty()) value += '\t';
2424                     value += *i;
2425                     ++i;
2426                 }
2427                 break;
2428             }
2429             case CMD_upper:
2430                 value = Xapian::Unicode::toupper(args[0]);
2431                 break;
2432             case CMD_url:
2433                 url_encode(value, args[0]);
2434                 break;
2435             case CMD_value: {
2436                 Xapian::docid id = q0;
2437                 Xapian::valueno value_no = string_to_int(args[0]);
2438                 if (args.size() > 1) id = string_to_int(args[1]);
2439                 value = db.get_document(id).get_value(value_no);
2440                 break;
2441             }
2442             case CMD_version:
2443                 value = PACKAGE_STRING;
2444                 break;
2445             case CMD_weight:
2446                 value = double_to_string(weight);
2447                 break;
2448             default: {
2449                 args.insert(args.begin(), param[0]);
2450                 int macro_no = func->second->tag - CMD_MACRO;
2451                 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2452                 // throw "Unknown function '" + var + "'";
2453                 value = eval(macros[macro_no], args);
2454                 break;
2455             }
2456         }
2457         res += value;
2458     } catch (const Xapian::Error & e) {
2459         // FIXME: this means we only see the most recent error in $error
2460         // - is that the best approach?
2461         error_msg = e.get_msg();
2462     }
2463
2464     res.append(fmt, p, string::npos);
2465     return res;
2466 }
2467
2468 static string
2469 eval_file(const string &fmtfile)
2470 {
2471     string err;
2472     if (vet_filename(fmtfile)) {
2473         string file = template_dir + fmtfile;
2474         string fmt;
2475         if (load_file(file, fmt)) {
2476             vector<string> noargs;
2477             noargs.resize(1);
2478             return eval(fmt, noargs);
2479         }
2480         err = strerror(errno);
2481     } else {
2482         err = "name contains '..'";
2483     }
2484
2485     // FIXME: report why!
2486     string msg = string("Couldn't read format template '") + fmtfile + '\'';
2487     if (!err.empty()) msg += " (" + err + ')';
2488     throw msg;
2489 }
2490
2491 extern string
2492 pretty_term(string term)
2493 {
2494     // Just leave empty strings and single characters alone.
2495     if (term.length() <= 1) return term;
2496
2497     // Assume unprefixed terms are unstemmed.
2498     if (!C_isupper(term[0])) return term;
2499
2500     // Handle stemmed terms.
2501     bool stemmed = (term[0] == 'Z');
2502     if (stemmed) {
2503         // First of all, check if a term in the query stemmed to this one.
2504         Xapian::TermIterator u = qp.unstem_begin(term);
2505         // There might be multiple words with the same stem, but we only want
2506         // one so just take the first.
2507         if (u != qp.unstem_end(term)) return *u;
2508
2509         // Remove the 'Z'.
2510         term.erase(0, 1);
2511     }
2512
2513     bool add_quotes = false;
2514
2515     // Check if the term has a prefix.
2516     if (C_isupper(term[0])) {
2517         // See if we have this prefix in the termprefix_to_userprefix map.  If
2518         // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2519         string prefix;
2520         size_t prefix_len = prefix_from_term(&prefix, term);
2521
2522         map<string, string>::const_iterator i;
2523         i = termprefix_to_userprefix.find(prefix);
2524         if (i != termprefix_to_userprefix.end()) {
2525             string user_prefix = i->second;
2526             user_prefix += ':';
2527             term.replace(0, prefix_len, user_prefix);
2528         } else {
2529             // We don't have a prefix mapping for this, so just set a flag to
2530             // add quotes around the term.
2531             add_quotes = true;
2532         }
2533     }
2534
2535     if (stemmed) term += '.';
2536
2537     if (add_quotes) {
2538         term.insert(0, "\"");
2539         term.append("\"");
2540     }
2541
2542     return term;
2543 }
2544
2545 static string
2546 print_caption(const string &fmt, const vector<string> &param)
2547 {
2548     q0 = *(mset[hit_no]);
2549
2550     weight = mset[hit_no].get_weight();
2551     percent = mset.convert_to_percent(mset[hit_no]);
2552     collapsed = mset[hit_no].get_collapse_count();
2553
2554     return eval(fmt, param);
2555 }
2556
2557 void
2558 parse_omegascript()
2559 {
2560     try {
2561         const char * p = getenv("SERVER_PROTOCOL");
2562         if (p && strcmp(p, "INCLUDED") == 0) {
2563             // We're being included in another page, so suppress headers.
2564             suppress_http_headers = true;
2565         }
2566
2567         string output = eval_file(fmtname);
2568         if (!set_content_type && !suppress_http_headers) {
2569             cout << "Content-Type: text/html" << endl;
2570             set_content_type = true;
2571         }
2572         if (!suppress_http_headers) cout << endl;
2573         cout << output;
2574     } catch (...) {
2575         // Ensure the headers have been output so that any exception gets
2576         // reported rather than giving a server error.
2577         if (!set_content_type && !suppress_http_headers) {
2578             cout << "Content-Type: text/html" << endl;
2579             set_content_type = true;
2580         }
2581         if (!suppress_http_headers) cout << endl;
2582         throw;
2583     }
2584 }
2585
2586 static void
2587 ensure_query_parsed()
2588 {
2589     if (query_parsed) return;
2590     query_parsed = true;
2591
2592     // Should we discard the existing R-set recorded in R CGI parameters?
2593     bool discard_rset = false;
2594
2595     // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2596     // CGI parameters)?
2597     bool force_first_page = false;
2598
2599     string v;
2600     // get list of terms from previous iteration of query
2601     auto val = cgi_params.find("xP");
2602     if (val != cgi_params.end()) {
2603         v = val->second;
2604         // If xP given, default to discarding any RSet and forcing the first
2605         // page of results.  If the query is the same, or an extension of
2606         // the previous query, we adjust these again below.
2607         discard_rset = true;
2608         force_first_page = true;
2609     }
2610     querytype result = parse_queries(v);
2611     switch (result) {
2612         case BAD_QUERY:
2613             break;
2614         case NEW_QUERY:
2615             break;
2616         case SAME_QUERY:
2617         case EXTENDED_QUERY:
2618             // If we've changed database, force the first page of hits
2619             // and discard the R-set (since the docids will have changed)
2620             val = cgi_params.find("xDB");
2621             if (val != cgi_params.end() && val->second != dbname) break;
2622             if (result == SAME_QUERY && force_first_page) {
2623                 val = cgi_params.find("xFILTERS");
2624                 if (val != cgi_params.end() && val->second != filters &&
2625                     val->second != old_filters) {
2626                     // Filters have changed since last query.
2627                 } else {
2628                     force_first_page = false;
2629                 }
2630             }
2631             discard_rset = false;
2632             break;
2633     }
2634
2635     if (!force_first_page) {
2636         // Work out which mset element is the first hit we want
2637         // to display
2638         val = cgi_params.find("TOPDOC");
2639         if (val != cgi_params.end()) {
2640             topdoc = atol(val->second.c_str());
2641         }
2642
2643         // Handle next, previous, and page links
2644         if (cgi_params.find(">") != cgi_params.end()) {
2645             topdoc += hits_per_page;
2646         } else if (cgi_params.find("<") != cgi_params.end()) {
2647             if (topdoc >= hits_per_page)
2648                 topdoc -= hits_per_page;
2649             else
2650                 topdoc = 0;
2651         } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2652                    (val = cgi_params.find("#")) != cgi_params.end()) {
2653             long page = atol(val->second.c_str());
2654             // Do something sensible for page 0 (we count pages from 1).
2655             if (page == 0) page = 1;
2656             topdoc = (page - 1) * hits_per_page;
2657         }
2658
2659         // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2660         // Normally we snap TOPDOC like this so that things work nicely if
2661         // HITSPERPAGE is in a <select> or on radio buttons.  If we're
2662         // postprocessing the output of omega and want variable sized pages,
2663         // this is unhelpful.
2664         bool raw_search = false;
2665         val = cgi_params.find("RAWSEARCH");
2666         if (val != cgi_params.end()) {
2667             raw_search = bool(atol(val->second.c_str()));
2668         }
2669
2670         if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2671     }
2672
2673     if (!discard_rset) {
2674         // put documents marked as relevant into the rset
2675         auto g = cgi_params.equal_range("R");
2676         for (auto i = g.first; i != g.second; ++i) {
2677             const string & value = i->second;
2678             for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2679                 while (value[j] == '.') ++j;
2680                 Xapian::docid d = atoi(value.c_str() + j);
2681                 if (d) {
2682                     rset.add_document(d);
2683                     ticked[d] = true;
2684                 }
2685             }
2686         }
2687     }
2688 }
2689
2690 // run query if we haven't already
2691 static void
2692 ensure_match()
2693 {
2694     if (done_query) return;
2695
2696     secs = RealTime::now();
2697     run_query();
2698     if (secs != -1)
2699         secs = RealTime::now() - secs;
2700
2701     done_query = true;
2702     last = mset.get_matches_lower_bound();
2703     if (last == 0) {
2704         // Otherwise topdoc ends up being -6 if it's non-zero!
2705         topdoc = 0;
2706     } else {
2707         if (topdoc >= last)
2708             topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2709         // last is the count of documents up to the end of the current page
2710         // (as returned by $last)
2711         if (topdoc + hits_per_page < last)
2712             last = topdoc + hits_per_page;
2713     }
2714 }
2715
2716 // OmegaExpandDecider methods.
2717
2718 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2719                                        set<string> * querytermset)
2720     : db(db_)
2721 {
2722     // We'll want the stemmer for testing matches anyway.
2723     if (!stemmer)
2724         stemmer = new Xapian::Stem(option["stemmer"]);
2725     if (querytermset) {
2726         set<string>::const_iterator i;
2727         for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2728             string term(*i);
2729             if (term.empty()) continue;
2730
2731             unsigned char ch = term[0];
2732             bool stemmed = (ch == 'Z');
2733             if (stemmed) {
2734                 term.erase(0, 1);
2735                 if (term.empty()) continue;
2736                 ch = term[0];
2737             }
2738
2739             if (C_isupper(ch)) {
2740                 size_t prefix_len = prefix_from_term(NULL, term);
2741                 term.erase(0, prefix_len);
2742             }
2743
2744             if (!stemmed) term = (*stemmer)(term);
2745
2746             exclude_stems.insert(term);
2747         }
2748     }
2749 }
2750
2751 bool
2752 OmegaExpandDecider::operator()(const string & term) const
2753 {
2754     unsigned char ch = term[0];
2755
2756     // Reject terms with a prefix.
2757     if (C_isupper(ch)) return false;
2758
2759     {
2760         MyStopper stopper;
2761         // Don't suggest stopwords.
2762         if (stopper(term)) return false;
2763     }
2764
2765     // Reject small numbers.
2766     if (term.size() < 4 && C_isdigit(ch)) return false;
2767
2768     // Reject terms containing a space.
2769     if (term.find(' ') != string::npos) return false;
2770
2771     // Skip terms with stems in the exclude_stems set, to avoid suggesting
2772     // terms which are already in the query in some form.
2773     string stem = (*stemmer)(term);
2774     if (exclude_stems.find(stem) != exclude_stems.end())
2775         return false;
2776
2777     // Ignore terms that only occur once (hapaxes) since they aren't
2778     // useful for finding related documents - they only occur in a
2779     // document that's already been marked as relevant.
2780     // FIXME: add an expand option to ignore terms where
2781     // termfreq == rtermfreq.
2782     if (db.get_termfreq(term) <= 1) return false;
2783
2784     return true;
2785 }