xapian-applications/omega/query.cc

   1 /* query.cc: query executor for omega
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 James Aylett
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002 Intercede 1749 Ltd
   7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018 Olly Betts
   8  * Copyright 2008 Thomas Viehmann
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU General Public License as
  12  * published by the Free Software Foundation; either version 2 of the
  13  * License, or (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  23  * USA
  24  */
  25
  26 #include <config.h>
  27
  28 #include <algorithm>
  29 #include <iostream>
  30 #include <map>
  31 #include <set>
  32 #include <unordered_map>
  33 #include <unordered_set>
  34 #include <vector>
  35
  36 #include <cassert>
  37 #include <cctype>
  38 #include "safeerrno.h"
  39 #include <stdio.h>
  40 #include <cstdlib>
  41 #include <cstring>
  42 #include "strcasecmp.h"
  43 #include <ctime>
  44
  45 #include "safeunistd.h"
  46 #include <sys/types.h>
  47 #include "safesysstat.h"
  48 #include "safefcntl.h"
  49
  50 #include "realtime.h"
  51
  52 #include <cdb.h>
  53
  54 #include "csvescape.h"
  55 #include "date.h"
  56 #include "datevalue.h"
  57 #include "jsonescape.h"
  58 #include "utils.h"
  59 #include "omega.h"
  60 #include "query.h"
  61 #include "cgiparam.h"
  62 #include "loadfile.h"
  63 #include "sample.h"
  64 #include "str.h"
  65 #include "stringutils.h"
  66 #include "transform.h"
  67 #include "urldecode.h"
  68 #include "urlencode.h"
  69 #include "unixperm.h"
  70 #include "values.h"
  71 #include "weight.h"
  72 #include "expand.h"
  73 #include "md5wrap.h"
  74
  75 #include <xapian.h>
  76
  77 using namespace std;
  78
  79 using Xapian::Utf8Iterator;
  80
  81 using Xapian::Unicode::is_wordchar;
  82
  83 #ifndef SNPRINTF
  84 #include <cstdarg>
  85
  86 static int my_snprintf(char *str, size_t size, const char *format, ...)
  87 {
  88     int res;
  89     va_list ap;
  90     va_start(ap, format);
  91     str[size - 1] = '\0';
  92     res = vsprintf(str, format, ap);
  93     if (str[size - 1] || res < 0 || size_t(res) >= size)
  94         abort(); /* Overflowed! */
  95     va_end(ap);
  96     return res;
  97 }
  98 #else
  99 #define my_snprintf SNPRINTF
 100 #endif
 101
// State flags for the lazily-performed query steps.
// NOTE(review): presumably set by ensure_query_parsed() / ensure_match(),
// which are defined outside this section - confirm.
static bool query_parsed = false;
static bool done_query = false;
// NOTE(review): looks like the docid of the last hit processed - confirm
// against its users, which are outside this section.
static Xapian::docid last = 0;

// Results of the most recent match; assigned by run_query().
static Xapian::MSet mset;

// Per-document boolean keyed by docid.
// NOTE(review): presumably records which hits were ticked as relevant -
// confirm against its users.
static map<Xapian::docid, bool> ticked;

static void ensure_query_parsed();
static void ensure_match();

// The query to run: built by parse_queries() and then combined with any
// filters by run_query().
static Xapian::Query query;
//static string url_query_string;
Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode

// Shared query parser, configured by parse_queries().
static Xapian::QueryParser qp;
// Range processor for "size:" ranges; created on first use by
// parse_queries().
static Xapian::NumberRangeProcessor * size_rp = NULL;
// Stemmer used when highlighting; created on first use by html_highlight().
static Xapian::Stem *stemmer = NULL;

static string eval_file(const string &fmtfile);

// Terms of the parsed query; filled in by parse_queries().
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// The distinct terms of the parsed query, tab-separated, in the order
// parse_queries() first encountered them.
static string queryterms;

// Message from a failed query parse (set by parse_queries()).
static string error_msg;

// NOTE(review): presumably the time taken by the match in seconds, with -1
// meaning "not measured" - confirm against its users.
static double secs = -1;

// Template for a log line, written as $-commands separated by tabs: the
// remote host (or address, or "-"), the date, the kind of request
// (add/morelike/query), the database name, the query, the match size and,
// if set, the HTTP referer.
// NOTE(review): presumably used when no custom log format is supplied -
// confirm at the point of use.
static const char DEFAULT_LOG_ENTRY[] =
        "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
        "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
        "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
        "$dbname\t"
        "$query\t"
        "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
 141
 142 class MyStopper : public Xapian::Stopper {
 143   public:
 144     bool operator()(const string &t) const {
 145         switch (t[0]) {
 146             case 'a':
 147                 return (t == "a" || t == "about" || t == "an" || t == "and" ||
 148                         t == "are" || t == "as" || t == "at");
 149             case 'b':
 150                 return (t == "be" || t == "by");
 151             case 'e':
 152                 return (t == "en");
 153             case 'f':
 154                 return (t == "for" || t == "from");
 155             case 'h':
 156                 return (t == "how");
 157             case 'i':
 158                 return (t == "i" || t == "in" || t == "is" || t == "it");
 159             case 'o':
 160                 return (t == "of" || t == "on" || t == "or");
 161             case 't':
 162                 return (t == "that" || t == "the" || t == "this" || t == "to");
 163             case 'w':
 164                 return (t == "was" || t == "what" || t == "when" ||
 165                         t == "where" || t == "which" || t == "who" ||
 166                         t == "why" || t == "will" || t == "with");
 167             case 'y':
 168                 return (t == "you" || t == "your");
 169             default:
 170                 return false;
 171         }
 172     }
 173 };
 174
 175 static size_t
 176 prefix_from_term(string* prefix, const string& term)
 177 {
 178     if (!term.empty()) {
 179         if (term[0] == 'X') {
 180             const string::const_iterator begin = term.begin();
 181             string::const_iterator i = begin + 1;
 182             while (i != term.end() && C_isupper(*i))
 183                 ++i;
 184             if (prefix)
 185                 prefix->assign(begin, i);
 186             if (i != term.end() && *i == ':')
 187                 ++i;
 188             return i - begin;
 189         }
 190
 191         if (C_isupper(term[0])) {
 192             if (prefix)
 193                 *prefix = term[0];
 194             return 1;
 195         }
 196     }
 197
 198     if (prefix)
 199         prefix->resize(0);
 200     return 0;
 201 }
 202
 203 // Don't allow ".." in format names, log file names, etc as this would allow
 204 // people to open a format "../../etc/passwd" or similar.
 205 // FIXME: make this check more exact ("foo..bar" is safe)
 206 // FIXME: log when this check fails
 207 static bool
 208 vet_filename(const string &filename)
 209 {
 210     string::size_type i = filename.find("..");
 211     return (i == string::npos);
 212 }
 213
// Classification of a newly parsed query relative to the previous one, as
// returned by parse_queries().
//
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
//
// NEW_QUERY entirely new query
// SAME_QUERY unchanged query
// EXTENDED_QUERY new query, but based on the old one
// BAD_QUERY parse error (message in error_msg)
typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;

// The query strings to parse, keyed by the prefix they were added with.
// Filled in by add_query_string() and read by parse_queries().
static multimap<string, string> query_strings;
 227
 228 void
 229 add_query_string(const string& prefix, const string& s)
 230 {
 231     string query_string = s;
 232     // Strip leading and trailing whitespace from query_string.
 233     trim(query_string);
 234     if (!query_string.empty())
 235         query_strings.insert(make_pair(prefix, query_string));
 236 }
 237
 238 static unsigned
 239 read_qp_flags(const string & opt_pfx, unsigned f)
 240 {
 241     map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
 242     for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
 243         unsigned mask = 0;
 244         const char * s = i->first.c_str() + opt_pfx.size();
 245         switch (s[0]) {
 246             case 'a':
 247                 if (strcmp(s, "auto_multiword_synonyms") == 0) {
 248                     mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 249                     break;
 250                 }
 251                 if (strcmp(s, "auto_synonyms") == 0) {
 252                     mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
 253                     break;
 254                 }
 255                 break;
 256             case 'b':
 257                 if (strcmp(s, "boolean") == 0) {
 258                     mask = Xapian::QueryParser::FLAG_BOOLEAN;
 259                     break;
 260                 }
 261                 if (strcmp(s, "boolean_any_case") == 0) {
 262                     mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
 263                     break;
 264                 }
 265                 break;
 266             case 'c':
 267                 if (strcmp(s, "cjk_ngram") == 0) {
 268                     mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
 269                     break;
 270                 }
 271                 break;
 272             case 'd':
 273                 if (strcmp(s, "default") == 0) {
 274                     mask = Xapian::QueryParser::FLAG_DEFAULT;
 275                     break;
 276                 }
 277                 break;
 278             case 'l':
 279                 if (strcmp(s, "lovehate") == 0) {
 280                     mask = Xapian::QueryParser::FLAG_LOVEHATE;
 281                     break;
 282                 }
 283                 break;
 284             case 'p':
 285                 if (strcmp(s, "partial") == 0) {
 286                     mask = Xapian::QueryParser::FLAG_PARTIAL;
 287                     break;
 288                 }
 289                 if (strcmp(s, "phrase") == 0) {
 290                     mask = Xapian::QueryParser::FLAG_PHRASE;
 291                     break;
 292                 }
 293                 if (strcmp(s, "pure_not") == 0) {
 294                     mask = Xapian::QueryParser::FLAG_PURE_NOT;
 295                     break;
 296                 }
 297                 break;
 298             case 's':
 299                 if (strcmp(s, "spelling_correction") == 0) {
 300                     mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
 301                     break;
 302                 }
 303                 if (strcmp(s, "synonym") == 0) {
 304                     mask = Xapian::QueryParser::FLAG_SYNONYM;
 305                     break;
 306                 }
 307                 break;
 308             case 'w':
 309                 if (strcmp(s, "wildcard") == 0) {
 310                     mask = Xapian::QueryParser::FLAG_WILDCARD;
 311                     break;
 312                 }
 313                 break;
 314         }
 315
 316         if (i->second.empty()) {
 317             f &= ~mask;
 318         } else {
 319             f |= mask;
 320         }
 321     }
 322     return f;
 323 }
 324
// Parse every string registered with add_query_string() and store the
// combined result in the file-scope `query`.
//
// The shared QueryParser `qp` is first configured from the current options
// (stemming strategy, stopper, default operator, database, the "size:"
// range processor, and the prefixes given by the "prefix,..." and
// "boolprefix,..." options).  Each query string is then parsed with the
// stemmer and flags selected for its prefix, and the non-empty results are
// combined with OP_AND.  The terms of the resulting query are added to
// `termset` and `queryterms`.
//
// oldp is the tab-separated list of terms from the previous query (empty if
// there wasn't one).
//
// Returns BAD_QUERY if parsing threw a QueryParserError (the message is
// stored in error_msg), otherwise NEW_QUERY, SAME_QUERY or EXTENDED_QUERY
// depending on how the terms now in termset compare with those in oldp.
static querytype
parse_queries(const string& oldp)
{
    // Parse the query string.
    qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
    qp.set_stopper(new MyStopper());
    qp.set_default_op(default_op);
    qp.set_database(db);
    // FIXME: provide a custom RP which handles size:10..20K, etc.
    if (!size_rp)
        size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
    qp.add_rangeprocessor(size_rp);
    // Register the free-text prefixes: option "prefix,<user prefix>" holds a
    // tab-separated list of term prefixes.
    map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
    for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
        // Skip over the 7 characters of "prefix,".
        string user_prefix(pfx->first, 7);
        const string & term_pfx_list = pfx->second;
        string::size_type i = 0;
        do {
            string::size_type i0 = i;
            i = term_pfx_list.find('\t', i);
            const string & term_pfx = term_pfx_list.substr(i0, i - i0);
            qp.add_prefix(user_prefix, term_pfx);
            // std::map::insert() won't overwrite an existing entry, so we'll
            // prefer the first user_prefix for which a particular term prefix
            // is specified.
            termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
            // After the last entry i is npos, so ++i wraps to 0 and the loop
            // ends; otherwise ++i steps past the tab.
        } while (++i);
    }
    // Register the boolean prefixes: option "boolprefix,<user prefix>" holds
    // the term prefix.
    pfx = option.lower_bound("boolprefix,");
    for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
        // Skip over the 11 characters of "boolprefix,".
        string user_prefix(pfx->first, 11, string::npos);
        // The prefix is exclusive unless option "nonexclusiveprefix,<term
        // prefix>" is set to a non-empty value.
        auto it = option.find("nonexclusiveprefix," + pfx->second);
        bool exclusive = (it == option.end() || it->second.empty());
        qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
        termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
    }

    try {
        // Flags from "flag_..." options apply to every query string unless
        // overridden by "<prefix>:flag_..." options.
        unsigned default_flags = read_qp_flags("flag_", 0);

        vector<Xapian::Query> queries;
        queries.reserve(query_strings.size());

        for (auto& j : query_strings) {
            const string& prefix = j.first;
            const string& query_string = j.second;

            // Choose the stemmer to use for this input.
            string stemlang = option[prefix + ":stemmer"];
            if (stemlang.empty())
                stemlang = option["stemmer"];
            qp.set_stemmer(Xapian::Stem(stemlang));

            // Work out the flags to use for this input.
            unsigned f = read_qp_flags(prefix + ":flag_", default_flags);

            Xapian::Query q = qp.parse_query(query_string, f, prefix);
            if (!q.empty())
                queries.push_back(q);
        }
        query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
    } catch (Xapian::QueryParserError &e) {
        error_msg = e.get_msg();
        return BAD_QUERY;
    }

    // Record the query's terms.  Note that n_new_terms counts every term
    // the iterator yields, not just those which weren't already in termset.
    Xapian::termcount n_new_terms = 0;
    for (Xapian::TermIterator i = query.get_terms_begin();
         i != query.get_terms_end(); ++i) {
        if (termset.find(*i) == termset.end()) {
            termset.insert(*i);
            if (!queryterms.empty()) queryterms += '\t';
            queryterms += *i;
        }
        n_new_terms++;
    }

    // Check new query against the previous one
    if (oldp.empty()) {
        // If oldp was empty that means there were no parsed query terms
        // before, so if there are now this is a new query.
        return n_new_terms ? NEW_QUERY : SAME_QUERY;
    }

    // The terms in oldp are separated by tabs.
    const char oldp_separator = '\t';
    size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;

    // short-cut: if the new query has fewer terms, it must be a new one
    if (n_new_terms < n_old_terms) return NEW_QUERY;

    // Every old term must still be present, otherwise this is a new query.
    const char *term = oldp.c_str();
    const char *pend;
    while ((pend = strchr(term, oldp_separator)) != NULL) {
        if (termset.find(string(term, pend - term)) == termset.end())
            return NEW_QUERY;
        term = pend + 1;
    }
    // Check the final term (which has no separator after it), if any.
    if (*term) {
        if (termset.find(string(term)) == termset.end())
            return NEW_QUERY;
    }

    // Use termset.size() rather than n_new_terms so we correctly handle
    // the case when the query has repeated terms.
    // This works wrongly in the case when the user extends the query
    // by adding a term already in it, but that's unlikely and the behaviour
    // isn't too bad (we just don't reset page 1).  We also mishandle a few
    // other obscure cases e.g. adding quotes to turn a query into a phrase.
    if (termset.size() > n_old_terms) return EXTENDED_QUERY;
    return SAME_QUERY;
}
 437
// Boolean filter terms keyed by their term prefix; filled in by add_bterm()
// and turned into a filter query by run_query().
static multimap<string, string> filter_map;
// Terms which matching documents must not contain; filled in by add_nterm()
// and applied with OP_AND_NOT by run_query().
static set<string> neg_filters;

// Iterator type used to walk filter_map.
typedef multimap<string, string>::const_iterator FMCI;
 442
 443 void add_bterm(const string &term) {
 444     string prefix;
 445     if (prefix_from_term(&prefix, term) > 0)
 446         filter_map.insert(multimap<string, string>::value_type(prefix, term));
 447 }
 448
 449 void add_nterm(const string &term) {
 450     if (!term.empty())
 451         neg_filters.insert(term);
 452 }
 453
// Apply the filters to the parsed query and run the match.
//
// In order, this combines the file-scope `query` with:
//  * the boolean filters in filter_map,
//  * a date restriction if any of date_start, date_end or date_span is set,
//  * the negated filters in neg_filters.
// Whenever there's no query to filter, the restriction itself is used as
// the query instead.
//
// If there's no Enquire object or error_msg is set, nothing further is
// done.  Otherwise the Enquire object is configured (weighting scheme,
// cutoff, sort order, docid order, collapsing) and, if the final query
// isn't empty, the match is run and the results are stored in `mset`.
static void
run_query()
{
    // Weighting scheme name to use, and whether weighting is to be forced
    // to boolean (both are passed to set_weighting_scheme() below).
    string scheme;
    bool force_boolean = false;
    if (!filter_map.empty()) {
        // OR together filters with the same prefix (or AND for non-exclusive
        // prefixes), then AND together the resultant groups.
        vector<Xapian::Query> filter_vec;
        vector<string> same_vec;
        string current;
        // filter_map is ordered by prefix, so terms with the same prefix
        // are adjacent.  The loop runs one step past the end so that the
        // final group gets flushed too.
        for (FMCI i = filter_map.begin(); ; ++i) {
            bool over = (i == filter_map.end());
            if (over || i->first != current) {
                // The prefix changed (or we hit the end): emit a query for
                // the group collected so far.
                switch (same_vec.size()) {
                    case 0:
                        break;
                    case 1:
                        filter_vec.push_back(Xapian::Query(same_vec[0]));
                        break;
                    default: {
                        Xapian::Query::op op = Xapian::Query::OP_OR;
                        auto it = option.find("nonexclusiveprefix," + current);
                        if (it != option.end() && !it->second.empty()) {
                            op = Xapian::Query::OP_AND;
                        }
                        filter_vec.push_back(Xapian::Query(op,
                                                           same_vec.begin(),
                                                           same_vec.end()));
                        break;
                    }
                }
                same_vec.clear();
                if (over) break;
                current = i->first;
            }
            same_vec.push_back(i->second);
        }

        Xapian::Query filter(Xapian::Query::OP_AND,
                             filter_vec.begin(), filter_vec.end());

        if (query.empty()) {
            // If no query strings were provided then promote the filters
            // to be THE query - filtering an empty query will give no
            // matches.
            std::swap(query, filter);
            // A pure filter query uses the scheme named by option
            // "weightingpurefilter" if that's set to a non-empty value, and
            // is forced to boolean otherwise.
            auto&& it = option.find("weightingpurefilter");
            if (it != option.end() && !it->second.empty()) {
                scheme = it->second;
            } else {
                force_boolean = true;
            }
        } else {
            query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
        }
    }

    if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
        Xapian::Query date_filter;
        if (date_value_slot != Xapian::BAD_VALUENO) {
            // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
            // latter the sort order just works correctly between different
            // precisions).
            bool as_time_t =
                db.get_value_lower_bound(date_value_slot).size() == 4 &&
                db.get_value_upper_bound(date_value_slot).size() == 4;
            date_filter = date_value_range(as_time_t, date_value_slot,
                                           date_start, date_end,
                                           date_span);
        } else {
            // No value slot is configured, so filter using terms instead;
            // documents indexed by the term "Dlatest" always pass.
            date_filter = date_range_filter(date_start, date_end, date_span);
            date_filter = Xapian::Query(Xapian::Query::OP_OR,
                                        date_filter,
                                        Xapian::Query("Dlatest"));
        }

        // If no query strings were provided then promote the daterange
        // filter to be THE query instead of filtering an empty query.
        if (query.empty()) {
            query = date_filter;
            force_boolean = true;
        } else {
            query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
        }
    }

    if (!neg_filters.empty()) {
        // OR together all negated filters.
        Xapian::Query filter(Xapian::Query::OP_OR,
                             neg_filters.begin(), neg_filters.end());

        if (query.empty()) {
            // If we only have a negative filter for the query, use MatchAll as
            // the query to apply the filters to.
            query = Xapian::Query::MatchAll;
            force_boolean = true;
        }
        query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
    }

    // Don't run a match if there's no Enquire object or an error has been
    // recorded.
    if (!enquire || !error_msg.empty()) return;

    // Fall back to the scheme named by option "weighting" unless one was
    // chosen above or boolean weighting is being forced.
    if (!force_boolean && scheme.empty()) {
        auto&& it = option.find("weighting");
        if (it != option.end()) scheme = it->second;
    }
    set_weighting_scheme(*enquire, scheme, force_boolean);

    enquire->set_cutoff(threshold);

    // A key maker takes precedence over a sort value slot; sort_after
    // selects whether relevance is the primary or the secondary ordering.
    if (sort_keymaker) {
        if (sort_after) {
            enquire->set_sort_by_relevance_then_key(sort_keymaker,
                                                    reverse_sort);
        } else {
            enquire->set_sort_by_key_then_relevance(sort_keymaker,
                                                    reverse_sort);
        }
    } else if (sort_key != Xapian::BAD_VALUENO) {
        if (sort_after) {
            enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
        } else {
            enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
        }
    }

    enquire->set_docid_order(docid_order);

    if (collapse) {
        enquire->set_collapse_key(collapse_key);
    }

    if (!query.empty()) {
#if 0
        // FIXME: If we start doing permissions checks based on $REMOTE_USER
        // we're going to break some existing setups if users upgrade.  We
        // probably want a way to set this from OmegaScript.
        const char * remote_user = getenv("REMOTE_USER");
        if (remote_user)
            apply_unix_permissions(query, remote_user);
#endif

        enquire->set_query(query);
        // We could use the value of topdoc as first parameter, but we
        // need to know the first few items in the mset to fake a
        // relevance set for topterms.
        //
        // If min_hits isn't set, check at least one extra result so we
        // know if we've reached the end of the matches or not - then we
        // can avoid offering a "next" button which leads to an empty page.
        mset = enquire->get_mset(0, topdoc + hits_per_page,
                                 topdoc + max(hits_per_page + 1, min_hits),
                                 &rset);
    }
}
 610
 611 string
 612 html_escape(const string &str)
 613 {
 614     string res;
 615     string::size_type p = 0;
 616     while (p < str.size()) {
 617         char ch = str[p++];
 618         switch (ch) {
 619             case '<':
 620                 res += "&lt;";
 621                 continue;
 622             case '>':
 623                 res += "&gt;";
 624                 continue;
 625             case '&':
 626                 res += "&amp;";
 627                 continue;
 628             case '"':
 629                 res += "&quot;";
 630                 continue;
 631             default:
 632                 res += ch;
 633         }
 634     }
 635     return res;
 636 }
 637
 638 static string
 639 html_strip(const string &str)
 640 {
 641     string res;
 642     string::size_type p = 0;
 643     bool skip = false;
 644     while (p < str.size()) {
 645         char ch = str[p++];
 646         switch (ch) {
 647             case '<':
 648                 skip = true;
 649                 continue;
 650             case '>':
 651                 skip = false;
 652                 continue;
 653             default:
 654                 if (! skip) res += ch;
 655         }
 656     }
 657     return res;
 658 }
 659
 660 class WordList {
 661     static string prev_list;
 662     static unordered_map<string, int> word_to_occurrence;
 663   public:
 664     void build_word_map(const string& list) {
 665         // Don't build map again if passed list of terms is same as before.
 666         if (prev_list == list) return;
 667         word_to_occurrence.clear();
 668         string::size_type split = 0, split2;
 669         int word_index = 0;
 670         string word;
 671         while ((split2 = list.find('\t', split)) != string::npos) {
 672             word = list.substr(split, split2 - split);
 673             if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 674                 ++word_index;
 675             split = split2 + 1;
 676         }
 677         word = list.substr(split, list.size() - split);
 678         if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 679             ++word_index;
 680         prev_list = list;
 681     }
 682
 683     int word_in_list(const string& word) {
 684         auto it = word_to_occurrence.find(word);
 685         if (it == word_to_occurrence.end()) return -1;
 686         return it->second;
 687     }
 688 };
 689
 690 string WordList::prev_list;
 691 unordered_map<string, int> WordList::word_to_occurrence;
 692
 693 // Not a character in an identifier
 694 inline static bool
 695 p_notid(unsigned int c)
 696 {
 697     return !C_isalnum(c) && c != '_';
 698 }
 699
 700 // Not a character in an HTML tag name
 701 inline static bool
 702 p_nottag(unsigned int c)
 703 {
 704     return !C_isalnum(c) && c != '.' && c != '-';
 705 }
 706
// Return s escaped for HTML, with the words which match terms in list
// highlighted.
//
// s is split into words; each word is lower-cased and looked up in list (a
// tab-separated list of terms), first as it is and then as "Z" followed by
// its stem.  A matching word is wrapped in bra and ket if bra is non-empty;
// otherwise it is wrapped in a <b> element with a background colour picked
// according to the position of the matching term in list.  All the text
// from s, matching or not, is passed through html_escape().
//
// The stemmer is created on first use from option "stemmer".
//
// FIXME: shares algorithm with indextext.cc!
static string
html_highlight(const string &s, const string &list,
               const string &bra, const string &ket)
{
    if (!stemmer) {
        stemmer = new Xapian::Stem(option["stemmer"]);
    }

    string res;

    Utf8Iterator j(s);
    const Utf8Iterator s_end;
    while (true) {
        // Skip to the start of the next word; stop if there isn't one.
        Utf8Iterator first = j;
        while (first != s_end && !is_wordchar(*first)) ++first;
        if (first == s_end) break;
        Utf8Iterator term_end;
        string term;
        string word;
        // Start of the text which hasn't been added to res yet.
        const char *l = j.raw();
        if (*first < 128 && C_isupper(*first)) {
            // Try to read capitals separated by single dots (e.g. "U.S.A")
            // as one term with the dots removed.
            j = first;
            Xapian::Unicode::append_utf8(term, *j);
            while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
            }
            // Give up on this form if we only got one letter, or a word
            // character follows directly.
            if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
                term.resize(0);
            }
            term_end = j;
        }
        if (term.empty()) {
            // Read an ordinary word.  A '&' or '\'' is treated as part of
            // the word when a word character follows it.
            j = first;
            while (is_wordchar(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
                ++j;
                if (j == s_end) break;
                if (*j == '&' || *j == '\'') {
                    Utf8Iterator next = j;
                    ++next;
                    if (next == s_end || !is_wordchar(*next)) break;
                    term += *j;
                    j = next;
                }
            }
            term_end = j;
            // Allow a short suffix of '+'/'-' characters or of '#'
            // characters (e.g. "C++", "C#").  A run of '#' only adds a
            // single '#' to the term.
            if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
                string::size_type len = term.length();
                if (*j == '#') {
                    term += '#';
                    do { ++j; } while (j != s_end && *j == '#');
                } else {
                    while (j != s_end && (*j == '+' || *j == '-')) {
                        Xapian::Unicode::append_utf8(term, *j);
                        ++j;
                    }
                }
                // Drop the suffix again if it added more than 3 characters
                // or is followed directly by a word character.
                if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
                    term.resize(len);
                } else {
                    term_end = j;
                }
            }
        }
        j = term_end;
        term = Xapian::Unicode::tolower(term);
        // build_word_map() does nothing when list is the same as for the
        // previous call, so the map is only rebuilt when list changes.
        WordList w;
        w.build_word_map(list);
        int match = w.word_in_list(term);
        if (match == -1) {
            // No match for the word itself, so try its stemmed form.
            string stem = "Z";
            stem += (*stemmer)(term);
            match = w.word_in_list(stem);
        }
        if (match >= 0) {
            // Add the text before the word, then the highlighted word.
            res += html_escape(string(l, first.raw() - l));
            if (!bra.empty()) {
                res += bra;
            } else {
                // Cycle through a fixed palette of background colours.
                static const char * colours[] = {
                    "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
                    "990000", "009900", "996600", "006699", "990099"
                };
                size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
                const char * bg = colours[idx];
                // The colours containing an 'f' are the light ones, so they
                // get black text; the others get white text.
                if (strchr(bg, 'f')) {
                    res += "<b style=\"color:black;background-color:#";
                } else {
                    res += "<b style=\"color:white;background-color:#";
                }
                res += bg;
                res += "\">";
            }
            word.assign(first.raw(), j.raw() - first.raw());
            res += html_escape(word);
            // Note that ket is only used when bra is non-empty.
            if (!bra.empty()) {
                res += ket;
            } else {
                res += "</b>";
            }
        } else {
            // No match: add everything up to the end of the word as is.
            res += html_escape(string(l, j.raw() - l));
        }
    }
    // Add any text left after the last word.
    if (j != s_end) res += html_escape(string(j.raw(), j.left()));
    return res;
}
 815
#if 0
// Disabled code: it relies on url_query_string, whose declaration is
// commented out earlier in this file.
//
// Writes url_query_string to cout.  If after starts with "&B=" then the
// character which follows is taken as a prefix, and the "&B=" parameters in
// url_query_string whose value starts with that prefix are meant to be left
// out of the output.
static void
print_query_string(const char *after)
{
    if (after && strncmp(after, "&B=", 3) == 0) {
        char prefix = after[3];
        string::size_type start = 0, amp = 0;
        while (true) {
            amp = url_query_string.find('&', amp);
            if (amp == string::npos) {
                cout << url_query_string.substr(start);
                return;
            }
            amp++;
            while (url_query_string[amp] == 'B' &&
                   url_query_string[amp + 1] == '=' &&
                   url_query_string[amp + 2] == prefix) {
                cout << url_query_string.substr(start, amp - start - 1);
                start = url_query_string.find('&', amp + 3);
                if (start == string::npos) return;
                amp = start + 1;
            }
        }
    }
    cout << url_query_string;
}
#endif
 843
 844 class Fields {
 845     mutable Xapian::docid did_cached;
 846     mutable map<string, string> fields;
 847
 848     void read_fields(Xapian::docid did) const;
 849
 850   public:
 851     Fields() : did_cached(0) { }
 852
 853     const string & get_field(Xapian::docid did, const string & field) const {
 854         if (did != did_cached) read_fields(did);
 855         return fields[field];
 856     }
 857 };
 858
 859 void
 860 Fields::read_fields(Xapian::docid did) const
 861 {
 862     fields.clear();
 863     did_cached = did;
 864     const string & data = db.get_document(did).get_data();
 865
 866     // Parse document data.
 867     string::size_type i = 0;
 868     const string & names = option["fieldnames"];
 869     if (!names.empty()) {
 870         // Each line is a field, with fieldnames taken from corresponding
 871         // entries in the tab-separated list specified by $opt{fieldnames}.
 872         string::size_type n = 0;
 873         do {
 874             string::size_type n0 = n;
 875             n = names.find('\t', n);
 876             string::size_type i0 = i;
 877             i = data.find('\n', i);
 878             fields.insert(make_pair(names.substr(n0, n - n0),
 879                                     data.substr(i0, i - i0)));
 880         } while (++n && ++i);
 881     } else {
 882         // Each line is a field, in the format NAME=VALUE.  We assume the field
 883         // name doesn't contain an "=".  Lines without an "=" are currently
 884         // just ignored.
 885         do {
 886             string::size_type i0 = i;
 887             i = data.find('\n', i);
 888             string line(data, i0, i - i0);
 889             string::size_type j = line.find('=');
 890             if (j != string::npos) {
 891                 string & value = fields[line.substr(0, j)];
 892                 if (!value.empty()) value += '\t';
 893                 value.append(line, j + 1, string::npos);
 894             }
 895         } while (++i);
 896     }
 897 }
 898
// Shared cache used by $field to look up fields of a document.
static Fields fields;
// Docid used by $allterms and $field when no explicit docid argument is
// given.
static Xapian::docid q0;
// 0-based mset index reported by $hit.
static Xapian::doccount hit_no;
// NOTE(review): presumably the value behind $percentage (percentage score of
// the current hit) - its use isn't visible in this part of the file.
static int percent;
// NOTE(review): presumably the value behind $weight (weight of the current
// hit) - its use isn't visible in this part of the file.
static double weight;
// Count reported by $collapsed.
static Xapian::doccount collapsed;

// Forward declaration; the definition is not in this part of the file.
static string print_caption(const string &fmt, const vector<string> &param);
 907
// Tags identifying each OmegaScript command.  The tag is stored in
// func_attrib::tag (func_tab's T() macro pastes the command name onto "CMD_")
// and eval() switches on it to run the command.
//
// CMD_ is the tag of the entry with the empty name in func_tab.
//
// CMD_MACRO must remain the last enumerator: a macro created by $def is given
// the tag CMD_MACRO + <its index in macros>, so every value from CMD_MACRO
// upwards denotes a user-defined macro.
enum tagval {
CMD_,
CMD_add,
CMD_addfilter,
CMD_allterms,
CMD_and,
CMD_cgi,
CMD_cgilist,
CMD_cgiparams,
CMD_chr,
CMD_collapsed,
CMD_cond,
CMD_contains,
CMD_csv,
CMD_date,
CMD_dbname,
CMD_dbsize,
CMD_def,
CMD_defaultop,
CMD_div,
CMD_eq,
CMD_emptydocs,
CMD_env,
CMD_error,
CMD_field,
CMD_filesize,
CMD_filters,
CMD_filterterms,
CMD_find,
CMD_fmt,
CMD_freq,
CMD_ge,
CMD_gt,
CMD_hash,
CMD_highlight,
CMD_hit,
CMD_hitlist,
CMD_hitsperpage,
CMD_hostname,
CMD_html,
CMD_htmlstrip,
CMD_httpheader,
CMD_id,
CMD_if,
CMD_include,
CMD_json,
CMD_jsonarray,
CMD_last,
CMD_lastpage,
CMD_le,
CMD_length,
CMD_list,
CMD_log,
CMD_lookup,
CMD_lower,
CMD_lt,
CMD_map,
CMD_match,
CMD_max,
CMD_min,
CMD_mod,
CMD_msize,
CMD_msizeexact,
CMD_msizelower,
CMD_msizeupper,
CMD_mul,
CMD_muldiv,
CMD_ne,
CMD_nice,
CMD_not,
CMD_now,
CMD_opt,
CMD_or,
CMD_ord,
CMD_pack,
CMD_percentage,
CMD_prettyterm,
CMD_prettyurl,
CMD_query,
CMD_querydescription,
CMD_queryterms,
CMD_range,
CMD_record,
CMD_relevant,
CMD_relevants,
CMD_score,
CMD_set,
CMD_seterror,
CMD_setmap,
CMD_setrelevant,
CMD_slice,
CMD_snippet,
CMD_sort,
CMD_split,
CMD_stoplist,
CMD_sub,
CMD_subdb,
CMD_subid,
CMD_substr,
CMD_suggestion,
CMD_switch,
CMD_termprefix,
CMD_terms,
CMD_thispage,
CMD_time,
CMD_topdoc,
CMD_topterms,
CMD_transform,
CMD_truncate,
CMD_uniq,
CMD_unique,
CMD_unpack,
CMD_unprefix,
CMD_unstem,
CMD_upper,
CMD_url,
CMD_value,
CMD_version,
CMD_weight,
CMD_MACRO // special tag for macro evaluation
};
1029
// How eval() should treat one command.  Members are initialised positionally
// (see func_tab), so their order must not change.
struct func_attrib {
    // CMD_* value which eval() dispatches on (CMD_MACRO + n for a macro).
    int tag;

    // Fewest arguments accepted.  The special value N (-1) means the text
    // between the braces is not split at commas and is not evaluated: it is
    // passed through as a single argument.
    int minargs;

    // Most arguments accepted, or N (-1) for no upper limit.
    int maxargs;

    // How many leading arguments eval() evaluates before running the
    // command, or N (-1) to evaluate all of them.
    int evalargs;

    // 'Q' if the query must have been parsed before the command runs, 'M' if
    // additionally the match must have been run, otherwise 0.
    char ensure;
};
1035
1036 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
1037 struct func_desc {
1038     const char *name;
1039     struct func_attrib a;
1040 };
1041
// Shorthands used in the table below:
//   N - "no limit"/"all" in the maxargs and evalargs columns; in the minargs
//       column it means the argument text is passed through unsplit and
//       unevaluated
//   M - the command needs the query parsed and the match run first
//   Q - the command needs the query parsed first
#define N -1
#define M 'M'
#define Q 'Q'
// Table of the built-in OmegaScript commands, terminated by an entry with a
// NULL name.  eval() loads it into a name -> attributes map on first use, so
// lookup doesn't depend on the order of the rows.
//
// NB when adding a new command which ensures M or Q, update the list in
// docs/omegascript.rst
static struct func_desc func_tab[] = {
//name minargs maxargs evalargs ensure
{"",{CMD_,         N, N, 0, 0}},// commented out code
T(add,             0, N, N, 0), // add a list of numbers
T(addfilter,       1, 1, N, 0), // add filter term
T(allterms,        0, 1, N, 0), // list of all terms matching document
T(and,             1, N, 0, 0), // logical shortcutting and of a list of values
T(cgi,             1, 1, N, 0), // return cgi parameter value
T(cgilist,         1, 1, N, 0), // return list of values for cgi parameter
T(cgiparams,       0, 0, N, 0), // return list of cgi parameter names
T(chr,             1, 1, N, 0), // return UTF-8 for given Unicode codepoint
T(collapsed,       0, 0, N, 0), // return number of hits collapsed into this
T(cond,            2, N, 0, 0), // cascaded conditionals: value paired with the
                                // first non-empty condition (optional "else")
T(contains,        2, 2, N, 0), // return position of substring, or empty string
T(csv,             1, 2, N, 0), // CSV string escaping
T(date,            1, 2, N, 0), // convert time_t to strftime format
                                // (default: YYYY-MM-DD)
T(dbname,          0, 0, N, 0), // database name
T(dbsize,          0, 0, N, 0), // database size (# of documents)
T(def,             2, 2, 1, 0), // define a macro
T(defaultop,       0, 0, N, 0), // default operator: "and" or "or"
T(div,             2, 2, N, 0), // integer divide
T(emptydocs,       0, 1, N, 0), // list of empty documents
T(env,             1, 1, N, 0), // environment variable
T(error,           0, 0, N, 0), // error message
T(eq,              2, 2, N, 0), // test equality
T(field,           1, 2, N, 0), // lookup field in record
T(filesize,        1, 1, N, 0), // pretty printed filesize
T(filters,         0, 0, N, 0), // serialisation of current filters
T(filterterms,     1, 1, N, 0), // list of terms with a given prefix
T(find,            2, 2, N, 0), // find entry in list
T(fmt,             0, 0, N, 0), // name of current format
T(freq,            1, 1, N, 0), // frequency of a term
T(ge,              2, 2, N, 0), // test >=
T(gt,              2, 2, N, 0), // test >
T(hash,            2, 2, N, 0), // hash a string using the specified hash function
T(highlight,       2, 4, N, 0), // html escape and highlight words from list
T(hit,             0, 0, N, 0), // hit number of current mset entry (0-based)
T(hitlist,         1, 1, 0, M), // display hitlist using format in argument
T(hitsperpage,     0, 0, N, 0), // hits per page
T(hostname,        1, 1, N, 0), // extract hostname from URL
T(html,            1, 1, N, 0), // html escape string (<>&")
T(htmlstrip,       1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
T(httpheader,      2, 2, N, 0), // arbitrary HTTP header
T(id,              0, 0, N, 0), // docid of current doc
T(if,              2, 3, 1, 0), // conditional
T(include,         1, 1, 1, 0), // include another file
T(json,            1, 1, N, 0), // JSON string escaping
T(jsonarray,       1, 1, N, 0), // Format list as a JSON array of strings
T(last,            0, 0, N, M), // hit number one beyond end of current page
T(lastpage,        0, 0, N, M), // number of last hit page
T(le,              2, 2, N, 0), // test <=
T(length,          1, 1, N, 0), // length of list
T(list,            2, 5, N, 0), // pretty print list
T(log,             1, 2, 1, 0), // create a log entry
T(lookup,          2, 2, N, 0), // lookup in named cdb file
T(lower,           1, 1, N, 0), // convert string to lower case
T(lt,              2, 2, N, 0), // test <
T(map,             2, 2, 1, 0), // map a list into another list
T(match,           2, 3, N, 0), // regex match
T(max,             1, N, N, 0), // maximum of a list of values
T(min,             1, N, N, 0), // minimum of a list of values
T(mod,             2, 2, N, 0), // integer modulus
T(msize,           0, 0, N, M), // number of matches (estimated)
T(msizeexact,      0, 0, N, M), // is $msize exact?
T(msizelower,      0, 0, N, M), // number of matches (lower bound)
T(msizeupper,      0, 0, N, M), // number of matches (upper bound)
T(mul,             2, N, N, 0), // multiply a list of numbers
T(muldiv,          3, 3, N, 0), // calculate A*B/C
T(ne,              2, 2, N, 0), // test not equal
T(nice,            1, 1, N, 0), // pretty print integer (with thousands sep)
T(not,             1, 1, N, 0), // logical not
T(now,             0, 0, N, 0), // current date/time as a time_t
T(opt,             1, 2, N, 0), // lookup an option value
T(or,              1, N, 0, 0), // logical shortcutting or of a list of values
T(ord,             1, 1, N, 0), // return codepoint for first character of UTF-8 string
T(pack,            1, 1, N, 0), // convert a number to a 4 byte big endian binary string
T(percentage,      0, 0, N, 0), // percentage score of current hit
T(prettyterm,      1, 1, N, Q), // pretty print term name
T(prettyurl,       1, 1, N, 0), // pretty version of URL
T(query,           0, 1, N, Q), // query
T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
T(queryterms,      0, 0, N, Q), // list of query terms
T(range,           2, 2, N, 0), // return list of values between start and end
T(record,          0, 1, N, 0), // record contents of document
T(relevant,        0, 1, N, Q), // is document relevant?
T(relevants,       0, 0, N, Q), // return list of relevant documents
T(score,           0, 0, N, 0), // score (0-10) of current hit
T(set,             2, 2, N, 0), // set option value
T(seterror,        1, 1, N, 0), // set error_msg, setting it early stops query execution
T(setmap,          1, N, N, 0), // set map of option values
T(setrelevant,     0, 1, N, Q), // set rset
T(slice,           2, 2, N, 0), // slice a list using a second list
T(snippet,         1, 2, N, M), // generate snippet from text
T(sort,            1, 2, N, M), // alpha sort a list
T(split,           1, 2, N, 0), // split a string to give a list
T(stoplist,        0, 0, N, Q), // return list of stopped terms
T(sub,             2, 2, N, 0), // subtract
T(subdb,           0, 1, N, 0), // name of subdb docid is in
T(subid,           0, 1, N, 0), // docid in the subdb#
T(substr,          2, 3, N, 0), // substring
T(suggestion,      0, 0, N, Q), // misspelled word correction suggestion
T(switch,          3, N, 1, 0), // multi-way conditional; only the first
                                // argument is evaluated up front (presumably
                                // the value the cases are compared against -
                                // see docs/omegascript.rst)
T(termprefix,      1, 1, N, 0), // get any prefix from a term
T(terms,           0, 1, N, M), // list of matching terms
T(thispage,        0, 0, N, M), // page number of current page
T(time,            0, 0, N, M), // how long the match took (in seconds)
T(topdoc,          0, 0, N, M), // first document on current page of hit list
                                // (counting from 0)
T(topterms,        0, 1, N, M), // list of up to N top relevance feedback terms
                                // (default 16)
T(transform,       3, 4, N, 0), // transform with a regexp
T(truncate,        2, 4, N, 0), // truncate after a word
T(uniq,            1, 1, N, 0), // remove duplicates from a sorted list
T(unique,          1, 1, N, 0), // remove duplicates from any list
T(unpack,          1, 1, N, 0), // convert 4 byte big endian binary string to a number
T(unprefix,        1, 1, N, 0), // remove any prefix from a term
T(unstem,          1, 1, N, Q), // return list of terms from the parsed query
                                // which stemmed to this term
T(upper,           1, 1, N, 0), // convert string to upper case
T(url,             1, 1, N, 0), // url encode argument
T(value,           1, 2, N, 0), // return document value
T(version,         0, 0, N, 0), // omega version string
T(weight,          0, 0, N, 0), // weight of the current hit
{ NULL,{0,         0, 0, 0, 0}}
};
1173
1174 #undef T // Leaving T defined screws up Sun's C++ compiler!
1175
// Bodies of the macros defined with $def, in definition order.  The macro
// stored at index k is given the tag CMD_MACRO + k.
static vector<string> macros;
1177
1178 // Call write() repeatedly until all data is written or we get a
1179 // non-recoverable error.
1180 static ssize_t
1181 write_all(int fd, const char * buf, size_t count)
1182 {
1183     while (count) {
1184         ssize_t r = write(fd, buf, count);
1185         if (rare(r < 0)) {
1186             if (errno == EINTR) continue;
1187             return r;
1188         }
1189         buf += r;
1190         count -= r;
1191     }
1192     return 0;
1193 }
1194
1195 static const vector<string>&
1196 get_subdbs()
1197 {
1198     static vector<string> subdbs;
1199     if (subdbs.empty()) {
1200         size_t p = 0, q;
1201         while (true) {
1202             q = dbname.find('/', p);
1203             subdbs.emplace_back(dbname, p, q - p);
1204             if (q == string::npos) break;
1205             p = q + 1;
1206         }
1207     }
1208     return subdbs;
1209 }
1210
1211 static string
1212 eval(const string &fmt, const vector<string> &param)
1213 {
1214     static map<string, const struct func_attrib *> func_map;
1215     if (func_map.empty()) {
1216         struct func_desc *p;
1217         for (p = func_tab; p->name != NULL; ++p) {
1218             func_map[string(p->name)] = &(p->a);
1219         }
1220     }
1221     string res;
1222     string::size_type p = 0, q;
1223     while ((q = fmt.find('$', p)) != string::npos) try {
1224         res.append(fmt, p, q - p);
1225         string::size_type code_start = q; // note down for error reporting
1226         q++;
1227         if (q >= fmt.size()) break;
1228         unsigned char ch = fmt[q];
1229         switch (ch) {
1230             // Magic sequences:
1231             // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1232             case '$':
1233                 res += '$';
1234                 p = q + 1;
1235                 continue;
1236             case '(':
1237                 res += '{';
1238                 p = q + 1;
1239                 continue;
1240             case ')':
1241                 res += '}';
1242                 p = q + 1;
1243                 continue;
1244             case '.':
1245                 res += ',';
1246                 p = q + 1;
1247                 continue;
1248             case '_':
1249                 ch = '0';
1250                 // FALL THRU
1251             case '1': case '2': case '3': case '4': case '5':
1252             case '6': case '7': case '8': case '9':
1253                 ch -= '0';
1254                 if (ch < param.size()) res += param[ch];
1255                 p = q + 1;
1256                 continue;
1257             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1258             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1259             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1260             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1261             case 'y': case 'z':
1262             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1263             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1264             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1265             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1266             case 'Y': case 'Z':
1267             case '{':
1268                 break;
1269             default:
1270                 string msg = "Unknown $ code in: $";
1271                 msg.append(fmt, q, string::npos);
1272                 throw msg;
1273         }
1274         p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1275         string var(fmt, q, p - q);
1276         map<string, const struct func_attrib *>::const_iterator func;
1277         func = func_map.find(var);
1278         if (func == func_map.end()) {
1279             throw "Unknown function '" + var + "'";
1280         }
1281         vector<string> args;
1282         if (fmt[p] == '{') {
1283             q = p + 1;
1284             int nest = 1;
1285             while (true) {
1286                 p = fmt.find_first_of(",{}", p + 1);
1287                 if (p == string::npos)
1288                     throw "missing } in " + fmt.substr(code_start);
1289                 if (fmt[p] == '{') {
1290                     ++nest;
1291                 } else {
1292                     if (nest == 1) {
1293                         // should we split the args
1294                         if (func->second->minargs != N) {
1295                             args.push_back(fmt.substr(q, p - q));
1296                             q = p + 1;
1297                         }
1298                     }
1299                     if (fmt[p] == '}' && --nest == 0) break;
1300                 }
1301             }
1302             if (func->second->minargs == N)
1303                 args.push_back(fmt.substr(q, p - q));
1304             ++p;
1305         }
1306
1307         if (func->second->minargs != N) {
1308             if (int(args.size()) < func->second->minargs)
1309                 throw "too few arguments to $" + var;
1310             if (func->second->maxargs != N &&
1311                 int(args.size()) > func->second->maxargs)
1312                 throw "too many arguments to $" + var;
1313
1314             vector<string>::size_type n;
1315             if (func->second->evalargs != N)
1316                 n = func->second->evalargs;
1317             else
1318                 n = args.size();
1319
1320             for (vector<string>::size_type j = 0; j < n; ++j)
1321                 args[j] = eval(args[j], param);
1322         }
1323         if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1324             ensure_query_parsed();
1325         if (func->second->ensure == 'M') ensure_match();
1326         string value;
1327         switch (func->second->tag) {
1328             case CMD_:
1329                 break;
1330             case CMD_add: {
1331                 int total = 0;
1332                 for (auto&& arg : args)
1333                     total += string_to_int(arg);
1334                 value = str(total);
1335                 break;
1336             }
1337             case CMD_addfilter:
1338                 add_bterm(args[0]);
1339                 break;
1340             case CMD_allterms: {
1341                 // list of all terms indexing document
1342                 Xapian::docid id = q0;
1343                 if (!args.empty()) id = string_to_int(args[0]);
1344                 for (Xapian::TermIterator term = db.termlist_begin(id);
1345                      term != db.termlist_end(id); ++term) {
1346                     value += *term;
1347                     value += '\t';
1348                 }
1349
1350                 if (!value.empty()) value.erase(value.size() - 1);
1351                 break;
1352             }
1353             case CMD_and: {
1354                 value = "true";
1355                 for (auto&& arg : args) {
1356                     if (eval(arg, param).empty()) {
1357                         value.resize(0);
1358                         break;
1359                     }
1360                 }
1361                 break;
1362             }
1363             case CMD_cgi: {
1364                 MCI i = cgi_params.find(args[0]);
1365                 if (i != cgi_params.end()) value = i->second;
1366                 break;
1367             }
1368             case CMD_cgilist: {
1369                 pair<MCI, MCI> g;
1370                 g = cgi_params.equal_range(args[0]);
1371                 for (MCI i = g.first; i != g.second; ++i) {
1372                     value += i->second;
1373                     value += '\t';
1374                 }
1375                 if (!value.empty()) value.erase(value.size() - 1);
1376                 break;
1377             }
1378             case CMD_cgiparams: {
1379                 const string* prev = NULL;
1380                 for (auto&& i : cgi_params) {
1381                     if (prev && i.first == *prev) continue;
1382                     value += i.first;
1383                     value += '\t';
1384                     prev = &i.first;
1385                 }
1386                 if (!value.empty()) value.erase(value.size() - 1);
1387                 break;
1388             }
1389             case CMD_chr:
1390                 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1391                 break;
1392             case CMD_collapsed: {
1393                 value = str(collapsed);
1394                 break;
1395             }
1396             case CMD_cond:
1397                 for (size_t i = 0; i < args.size(); i += 2) {
1398                     if (i == args.size() - 1) {
1399                         // Handle optional "else" value.
1400                         value = eval(args[i], param);
1401                         break;
1402                     }
1403                     if (!eval(args[i], param).empty()) {
1404                         value = eval(args[i + 1], param);
1405                         break;
1406                     }
1407                 }
1408                 break;
1409             case CMD_contains: {
1410                 size_t pos = args[1].find(args[0]);
1411                 if (pos != string::npos) {
1412                     value = str(pos);
1413                 }
1414                 break;
1415             }
1416             case CMD_csv:
1417                 value = args[0];
1418                 if (args.size() > 1 && !args[1].empty()) {
1419                     csv_escape_always(value);
1420                 } else {
1421                     csv_escape(value);
1422                 }
1423                 break;
1424             case CMD_date:
1425                 value = args[0];
1426                 if (!value.empty()) {
1427                     char buf[64] = "";
1428                     time_t date = string_to_int(value);
1429                     if (date != static_cast<time_t>(-1)) {
1430                         struct tm *then;
1431                         then = gmtime(&date);
1432                         string date_fmt = "%Y-%m-%d";
1433                         if (args.size() > 1) date_fmt = eval(args[1], param);
1434                         strftime(buf, sizeof buf, date_fmt.c_str(), then);
1435                     }
1436                     value = buf;
1437                 }
1438                 break;
1439             case CMD_dbname:
1440                 value = dbname;
1441                 break;
1442             case CMD_dbsize: {
1443                 static Xapian::doccount dbsize;
1444                 if (!dbsize) dbsize = db.get_doccount();
1445                 value = str(dbsize);
1446                 break;
1447             }
1448             case CMD_def: {
1449                 func_attrib *fa = new func_attrib;
1450                 fa->tag = CMD_MACRO + macros.size();
1451                 fa->minargs = 0;
1452                 fa->maxargs = 9;
1453                 fa->evalargs = N; // FIXME: or 0?
1454                 fa->ensure = 0;
1455
1456                 macros.push_back(args[1]);
1457                 func_map[args[0]] = fa;
1458                 break;
1459             }
1460             case CMD_defaultop:
1461                 if (default_op == Xapian::Query::OP_AND) {
1462                     value = "and";
1463                 } else {
1464                     value = "or";
1465                 }
1466                 break;
1467             case CMD_div: {
1468                 int denom = string_to_int(args[1]);
1469                 if (denom == 0) {
1470                     value = "divide by 0";
1471                 } else {
1472                     value = str(string_to_int(args[0]) /
1473                                 string_to_int(args[1]));
1474                 }
1475                 break;
1476             }
1477             case CMD_eq:
1478                 if (args[0] == args[1]) value = "true";
1479                 break;
1480             case CMD_emptydocs: {
1481                 string t;
1482                 if (!args.empty())
1483                     t = args[0];
1484                 Xapian::PostingIterator i;
1485                 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1486                     if (i.get_doclength() != 0) continue;
1487                     if (!value.empty()) value += '\t';
1488                     value += str(*i);
1489                 }
1490                 break;
1491             }
1492             case CMD_env: {
1493                 char *env = getenv(args[0].c_str());
1494                 if (env != NULL) value = env;
1495                 break;
1496             }
1497             case CMD_error:
1498                 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1499                     error_msg = "Database '" + dbname + "' couldn't be opened";
1500                 }
1501                 value = error_msg;
1502                 break;
1503             case CMD_field: {
1504                 Xapian::docid did = q0;
1505                 if (args.size() > 1) did = string_to_int(args[1]);
1506                 value = fields.get_field(did, args[0]);
1507                 break;
1508             }
1509             case CMD_filesize: {
1510                 // FIXME: rounding?  i18n?
1511                 int size = string_to_int(args[0]);
1512                 int intpart = size;
1513                 int fraction = -1;
1514                 const char * format = 0;
1515                 if (size < 0) {
1516                     // Negative size -> empty result.
1517                 } else if (size == 1) {
1518                     format = "%d byte";
1519                 } else if (size < 1024) {
1520                     format = "%d bytes";
1521                 } else {
1522                     if (size < 1024 * 1024) {
1523                         format = "%d.%cK";
1524                     } else {
1525                         size /= 1024;
1526                         if (size < 1024 * 1024) {
1527                             format = "%d.%cM";
1528                         } else {
1529                             size /= 1024;
1530                             format = "%d.%cG";
1531                         }
1532                     }
1533                     intpart = unsigned(size) / 1024;
1534                     fraction = unsigned(size) % 1024;
1535                 }
1536                 if (format) {
1537                     char buf[200];
1538                     int len;
1539                     if (fraction == -1) {
1540                         len = my_snprintf(buf, sizeof(buf), format, intpart);
1541                     } else {
1542                         fraction = (fraction * 10 / 1024) + '0';
1543                         len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1544                     }
1545                     if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1546                     value.assign(buf, len);
1547                 }
1548                 break;
1549             }
1550             case CMD_filters:
1551                 value = filters;
1552                 break;
1553             case CMD_filterterms: {
1554                 Xapian::TermIterator term = db.allterms_begin();
1555                 term.skip_to(args[0]);
1556                 while (term != db.allterms_end()) {
1557                     string t = *term;
1558                     if (!startswith(t, args[0])) break;
1559                     value += t;
1560                     value += '\t';
1561                     ++term;
1562                 }
1563
1564                 if (!value.empty()) value.erase(value.size() - 1);
1565                 break;
1566             }
1567             case CMD_find: {
1568                 string l = args[0], s = args[1];
1569                 string::size_type i = 0, j = 0;
1570                 size_t count = 0;
1571                 while (j != l.size()) {
1572                     j = l.find('\t', i);
1573                     if (j == string::npos) j = l.size();
1574                     if (j - i == s.length()) {
1575                         if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1576                             value = str(count);
1577                             break;
1578                         }
1579                     }
1580                     ++count;
1581                     i = j + 1;
1582                 }
1583                 break;
1584             }
1585             case CMD_fmt:
1586                 value = fmtname;
1587                 break;
1588             case CMD_freq: {
1589                 const string& term = args[0];
1590                 Xapian::doccount termfreq = 0;
1591                 if (done_query) {
1592                     termfreq = mset.get_termfreq(term);
1593                 }
1594                 if (termfreq == 0) {
1595                     // We want $freq to work before the match is run, and we
1596                     // don't want using it to force the match to run.
1597                     termfreq = db.get_termfreq(term);
1598                 }
1599                 value = str(termfreq);
1600                 break;
1601             }
1602             case CMD_ge:
1603                 if (string_to_int(args[0]) >= string_to_int(args[1]))
1604                     value = "true";
1605                 break;
1606             case CMD_gt:
1607                 if (string_to_int(args[0]) > string_to_int(args[1]))
1608                     value = "true";
1609                 break;
1610             case CMD_hash: {
1611                 const string& data = args[0];
1612                 const string& hash = args[1];
1613                 if (hash == "md5") {
1614                     string md5;
1615                     md5_string(data, md5);
1616                     value.reserve(md5.size() * 2);
1617                     for (unsigned char byte : md5) {
1618                         value += "0123456789abcdef"[byte >> 4];
1619                         value += "0123456789abcdef"[byte & 0x0f];
1620                     }
1621                 } else {
1622                     throw "Unknown hash function: " + hash;
1623                 }
1624                 break;
1625             }
1626             case CMD_highlight: {
1627                 string bra, ket;
1628                 if (args.size() > 2) {
1629                     bra = args[2];
1630                     if (args.size() > 3) {
1631                         ket = args[3];
1632                     } else {
1633                         string::const_iterator i;
1634                         i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1635                         ket = "</";
1636                         ket.append(bra, 1, i - bra.begin() - 1);
1637                         ket += '>';
1638                     }
1639                 }
1640
1641                 value = html_highlight(args[0], args[1], bra, ket);
1642                 break;
1643             }
1644             case CMD_hit:
1645                 // 0-based mset index
1646                 value = str(hit_no);
1647                 break;
1648             case CMD_hitlist:
1649 #if 0
1650                 url_query_string = "?DB=";
1651                 url_query_string += dbname;
1652                 for (auto& j : query_strings) {
1653                     if (j.first.empty()) {
1654                         url_query_string += "&P=";
1655                     } else {
1656                         url_query_string += "&P."
1657                         url_query_string += j.first;
1658                         url_query_string += '=';
1659                     }
1660                     const char *q = j.second.c_str();
1661                     int ch;
1662                     while ((ch = *q++) != '\0') {
1663                         switch (ch) {
1664                          case '+':
1665                             url_query_string += "%2b";
1666                             break;
1667                          case '"':
1668                             url_query_string += "%22";
1669                             break;
1670                          case '%':
1671                             url_query_string += "%25";
1672                             break;
1673                          case '&':
1674                             url_query_string += "%26";
1675                             break;
1676                          case ' ':
1677                             ch = '+';
1678                             /* fall through */
1679                          default:
1680                             url_query_string += ch;
1681                         }
1682                     }
1683                 }
1684                 // add any boolean terms
1685                 for (FMCI i = filter_map.begin(); i != filter_map.end(); ++i) {
1686                     url_query_string += "&B=";
1687                     url_query_string += i->second;
1688                 }
1689 #endif
1690                 for (hit_no = topdoc; hit_no < last; ++hit_no)
1691                     value += print_caption(args[0], param);
1692                 hit_no = 0;
1693                 break;
1694             case CMD_hitsperpage:
1695                 value = str(hits_per_page);
1696                 break;
1697             case CMD_hostname: {
1698                 value = args[0];
1699                 // remove URL scheme and/or path
1700                 string::size_type i = value.find("://");
1701                 if (i == string::npos) i = 0; else i += 3;
1702                 value = value.substr(i, value.find('/', i) - i);
1703                 // remove user@ or user:password@
1704                 i = value.find('@');
1705                 if (i != string::npos) value.erase(0, i + 1);
1706                 // remove :port
1707                 i = value.find(':');
1708                 if (i != string::npos) value.resize(i);
1709                 break;
1710             }
1711             case CMD_html:
1712                 value = html_escape(args[0]);
1713                 break;
1714             case CMD_htmlstrip:
1715                 value = html_strip(args[0]);
1716                 break;
1717             case CMD_httpheader:
1718                 if (!suppress_http_headers) {
1719                     cout << args[0] << ": " << args[1] << endl;
1720                     if (!set_content_type && args[0].length() == 12 &&
1721                             strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1722                         set_content_type = true;
1723                     }
1724                 }
1725                 break;
1726             case CMD_id:
1727                 // document id
1728                 value = str(q0);
1729                 break;
1730             case CMD_if:
1731                 if (!args[0].empty())
1732                     value = eval(args[1], param);
1733                 else if (args.size() > 2)
1734                     value = eval(args[2], param);
1735                 break;
1736             case CMD_include:
1737                 value = eval_file(args[0]);
1738                 break;
1739             case CMD_json:
1740                 value = args[0];
1741                 json_escape(value);
1742                 break;
1743             case CMD_jsonarray: {
1744                 const string & l = args[0];
1745                 string::size_type i = 0, j;
1746                 if (l.empty()) {
1747                     value = "[]";
1748                     break;
1749                 }
1750                 value = "[\"";
1751                 while (true) {
1752                     j = l.find('\t', i);
1753                     string elt(l, i, j - i);
1754                     json_escape(elt);
1755                     value += elt;
1756                     if (j == string::npos) break;
1757                     value += "\",\"";
1758                     i = j + 1;
1759                 }
1760                 value += "\"]";
1761                 break;
1762             }
1763             case CMD_last:
1764                 value = str(last);
1765                 break;
1766             case CMD_lastpage: {
1767                 int l = mset.get_matches_estimated();
1768                 if (l > 0) l = (l - 1) / hits_per_page + 1;
1769                 value = str(l);
1770                 break;
1771             }
1772             case CMD_le:
1773                 if (string_to_int(args[0]) <= string_to_int(args[1]))
1774                     value = "true";
1775                 break;
1776             case CMD_length:
1777                 if (args[0].empty()) {
1778                     value = "0";
1779                 } else {
1780                     size_t length = count(args[0].begin(), args[0].end(), '\t');
1781                     value = str(length + 1);
1782                 }
1783                 break;
1784             case CMD_list: {
1785                 if (!args[0].empty()) {
1786                     string pre, inter, interlast, post;
1787                     switch (args.size()) {
1788                      case 2:
1789                         inter = interlast = args[1];
1790                         break;
1791                      case 3:
1792                         inter = args[1];
1793                         interlast = args[2];
1794                         break;
1795                      case 4:
1796                         pre = args[1];
1797                         inter = interlast = args[2];
1798                         post = args[3];
1799                         break;
1800                      case 5:
1801                         pre = args[1];
1802                         inter = args[2];
1803                         interlast = args[3];
1804                         post = args[4];
1805                         break;
1806                     }
1807                     value += pre;
1808                     string list = args[0];
1809                     string::size_type split = 0, split2;
1810                     while ((split2 = list.find('\t', split)) != string::npos) {
1811                         if (split) value += inter;
1812                         value.append(list, split, split2 - split);
1813                         split = split2 + 1;
1814                     }
1815                     if (split) value += interlast;
1816                     value.append(list, split, string::npos);
1817                     value += post;
1818                 }
1819                 break;
1820             }
1821             case CMD_log: {
1822                 if (!vet_filename(args[0])) break;
1823                 string logfile = log_dir + args[0];
1824                 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1825                 if (fd == -1) break;
1826                 vector<string> noargs;
1827                 noargs.resize(1);
1828                 string line;
1829                 if (args.size() > 1) {
1830                     line = args[1];
1831                 } else {
1832                     line = DEFAULT_LOG_ENTRY;
1833                 }
1834                 line = eval(line, noargs);
1835                 line += '\n';
1836                 (void)write_all(fd, line.data(), line.length());
1837                 close(fd);
1838                 break;
1839             }
1840             case CMD_lookup: {
1841                 if (!vet_filename(args[0])) break;
1842                 string cdbfile = cdb_dir + args[0];
1843                 int fd = open(cdbfile.c_str(), O_RDONLY);
1844                 if (fd == -1) break;
1845
1846                 struct cdb cdb;
1847                 cdb_init(&cdb, fd);
1848
1849                 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1850                     size_t datalen = cdb_datalen(&cdb);
1851                     const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1852                     if (q) {
1853                         value.assign(static_cast<const char *>(dat), datalen);
1854                     }
1855                 }
1856
1857                 cdb_free(&cdb);
1858                 close(fd); // FIXME: cache fds?
1859                 break;
1860             }
1861             case CMD_lower:
1862                 value = Xapian::Unicode::tolower(args[0]);
1863                 break;
1864             case CMD_lt:
1865                 if (string_to_int(args[0]) < string_to_int(args[1]))
1866                     value = "true";
1867                 break;
1868             case CMD_map:
1869                 if (!args[0].empty()) {
1870                     string l = args[0], pat = args[1];
1871                     vector<string> new_args(param);
1872                     string::size_type i = 0, j;
1873                     while (true) {
1874                         j = l.find('\t', i);
1875                         new_args[0] = l.substr(i, j - i);
1876                         value += eval(pat, new_args);
1877                         if (j == string::npos) break;
1878                         value += '\t';
1879                         i = j + 1;
1880                     }
1881                 }
1882                 break;
1883             case CMD_match:
1884                 omegascript_match(value, args);
1885                 break;
1886             case CMD_max: {
1887                 vector<string>::const_iterator i = args.begin();
1888                 int val = string_to_int(*i++);
1889                 for (; i != args.end(); ++i) {
1890                     int x = string_to_int(*i);
1891                     if (x > val) val = x;
1892                 }
1893                 value = str(val);
1894                 break;
1895             }
1896             case CMD_min: {
1897                 vector<string>::const_iterator i = args.begin();
1898                 int val = string_to_int(*i++);
1899                 for (; i != args.end(); ++i) {
1900                     int x = string_to_int(*i);
1901                     if (x < val) val = x;
1902                 }
1903                 value = str(val);
1904                 break;
1905             }
1906             case CMD_msize:
1907                 // Estimated number of matches.
1908                 value = str(mset.get_matches_estimated());
1909                 break;
1910             case CMD_msizeexact:
1911                 // Is msize exact?
1912                 if (mset.get_matches_lower_bound()
1913                     == mset.get_matches_upper_bound())
1914                     value = "true";
1915                 break;
1916             case CMD_msizelower:
1917                 // Lower bound on number of matches.
1918                 value = str(mset.get_matches_lower_bound());
1919                 break;
1920             case CMD_msizeupper:
1921                 // Upper bound on number of matches.
1922                 value = str(mset.get_matches_upper_bound());
1923                 break;
1924             case CMD_mod: {
1925                 int denom = string_to_int(args[1]);
1926                 if (denom == 0) {
1927                     value = "divide by 0";
1928                 } else {
1929                     value = str(string_to_int(args[0]) %
1930                                 string_to_int(args[1]));
1931                 }
1932                 break;
1933             }
1934             case CMD_mul: {
1935                 vector<string>::const_iterator i = args.begin();
1936                 int total = string_to_int(*i++);
1937                 while (i != args.end())
1938                     total *= string_to_int(*i++);
1939                 value = str(total);
1940                 break;
1941             }
1942             case CMD_muldiv: {
1943                 int denom = string_to_int(args[2]);
1944                 if (denom == 0) {
1945                     value = "divide by 0";
1946                 } else {
1947                     int num = string_to_int(args[0]) * string_to_int(args[1]);
1948                     value = str(num / denom);
1949                 }
1950                 break;
1951             }
1952             case CMD_ne:
1953                 if (args[0] != args[1]) value = "true";
1954                 break;
1955             case CMD_nice: {
1956                 string::const_iterator i = args[0].begin();
1957                 int len = args[0].length();
1958                 while (len) {
1959                     value += *i++;
1960                     if (--len && len % 3 == 0) value += option["thousand"];
1961                 }
1962                 break;
1963             }
1964             case CMD_not:
1965                 if (args[0].empty()) value = "true";
1966                 break;
1967             case CMD_now:
1968                 value = str(static_cast<unsigned long>(time(NULL)));
1969                 break;
1970             case CMD_opt:
1971                 if (args.size() == 2) {
1972                     value = option[args[0] + "," + args[1]];
1973                 } else {
1974                     value = option[args[0]];
1975                 }
1976                 break;
1977             case CMD_or: {
1978                 for (auto&& arg : args) {
1979                     value = eval(arg, param);
1980                     if (!value.empty()) break;
1981                 }
1982                 break;
1983             }
1984             case CMD_ord: {
1985                 if (!args[0].empty()) {
1986                     Utf8Iterator it(args[0]);
1987                     value = str(*it);
1988                 }
1989                 break;
1990             }
1991             case CMD_pack:
1992                 value = int_to_binary_string(string_to_int(args[0]));
1993                 break;
1994             case CMD_percentage:
1995                 // percentage score
1996                 value = str(percent);
1997                 break;
1998             case CMD_prettyterm:
1999                 value = pretty_term(args[0]);
2000                 break;
2001             case CMD_prettyurl:
2002                 value = args[0];
2003                 url_prettify(value);
2004                 break;
2005             case CMD_query: {
2006                 auto r = query_strings.equal_range(args.empty() ?
2007                                                    string() : args[0]);
2008                 for (auto j = r.first; j != r.second; ++j) {
2009                     if (!value.empty()) value += '\t';
2010                     const string & s = j->second;
2011                     size_t start = 0, tab;
2012                     while ((tab = s.find('\t', start)) != string::npos) {
2013                         value.append(s, start, tab - start);
2014                         value += ' ';
2015                         start = tab + 1;
2016                     }
2017                     value.append(s, start, string::npos);
2018                 }
2019                 break;
2020             }
2021             case CMD_querydescription:
2022                 value = query.get_description();
2023                 break;
2024             case CMD_queryterms:
2025                 value = queryterms;
2026                 break;
2027             case CMD_range: {
2028                 int start = string_to_int(args[0]);
2029                 int end = string_to_int(args[1]);
2030                 while (start <= end) {
2031                     value += str(start);
2032                     if (start < end) value += '\t';
2033                     start++;
2034                 }
2035                 break;
2036             }
2037             case CMD_record: {
2038                 Xapian::docid id = q0;
2039                 if (!args.empty()) id = string_to_int(args[0]);
2040                 value = db.get_document(id).get_data();
2041                 break;
2042             }
2043             case CMD_relevant: {
2044                 // document id if relevant; empty otherwise
2045                 Xapian::docid id = q0;
2046                 if (!args.empty()) id = string_to_int(args[0]);
2047                 map<Xapian::docid, bool>::iterator i = ticked.find(id);
2048                 if (i != ticked.end()) {
2049                     i->second = false; // icky side-effect
2050                     value = str(id);
2051                 }
2052                 break;
2053             }
2054             case CMD_relevants: {
2055                 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
2056                      i != ticked.end(); ++i) {
2057                     if (i->second) {
2058                         value += str(i->first);
2059                         value += '\t';
2060                     }
2061                 }
2062                 if (!value.empty()) value.erase(value.size() - 1);
2063                 break;
2064             }
2065             case CMD_score:
2066                 // Score (0 to 10)
2067                 value = str(percent / 10);
2068                 break;
2069             case CMD_set:
2070                 option[args[0]] = args[1];
2071                 break;
2072             case CMD_seterror:
2073                 error_msg = args[0];
2074                 break;
2075             case CMD_setmap: {
2076                 string base = args[0] + ',';
2077                 if (args.size() % 2 != 1)
2078                     throw string("$setmap requires an odd number of arguments");
2079                 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2080                     option[base + args[i]] = args[i + 1];
2081                 }
2082                 break;
2083             }
2084             case CMD_setrelevant: {
2085                 string::size_type i = 0, j;
2086                 while (true) {
2087                     j = args[0].find_first_not_of("0123456789", i);
2088                     Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2089                     if (id) {
2090                         rset.add_document(id);
2091                         ticked[id] = true;
2092                     }
2093                     if (j == string::npos) break;
2094                     i = j + 1;
2095                 }
2096                 break;
2097             }
2098             case CMD_slice: {
2099                 string list = args[0], pos = args[1];
2100                 vector<string> items;
2101                 string::size_type i = 0, j;
2102                 while (true) {
2103                     j = list.find('\t', i);
2104                     items.push_back(list.substr(i, j - i));
2105                     if (j == string::npos) break;
2106                     i = j + 1;
2107                 }
2108                 i = 0;
2109                 bool have_added = false;
2110                 while (true) {
2111                     j = pos.find('\t', i);
2112                     int item = string_to_int(pos.substr(i, j - i));
2113                     if (item >= 0 && size_t(item) < items.size()) {
2114                         if (have_added) value += '\t';
2115                         value += items[item];
2116                         have_added = true;
2117                     }
2118                     if (j == string::npos) break;
2119                     i = j + 1;
2120                 }
2121                 break;
2122             }
2123             case CMD_snippet: {
2124                 size_t length = 200;
2125                 if (args.size() > 1) {
2126                     length = string_to_int(args[1]);
2127                 }
2128                 if (!stemmer)
2129                     stemmer = new Xapian::Stem(option["stemmer"]);
2130                 // FIXME: Allow start and end highlight and omit to be specified.
2131                 value = mset.snippet(args[0], length, *stemmer,
2132                                      mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2133                                      "<strong>", "</strong>", "...");
2134                 break;
2135             }
2136             case CMD_sort: {
2137                 const string &list = args[0];
2138                 if (list.empty()) break;
2139                 bool uniq = false;
2140                 bool rev = false;
2141                 if (args.size() > 1) {
2142                     for (auto opt_ch : args[1]) {
2143                         switch (opt_ch) {
2144                             case 'r':
2145                                 rev = true;
2146                                 break;
2147                             case 'u':
2148                                 uniq = true;
2149                                 break;
2150                             default:
2151                                 throw string("Unknown $sort option: ") + opt_ch;
2152                         }
2153                     }
2154                 }
2155                 vector<string> items;
2156                 string::size_type split = 0, split2;
2157                 do {
2158                     split2 = list.find('\t', split);
2159                     items.emplace_back(list, split, split2 - split);
2160                     split = split2 + 1;
2161                 } while (split2 != string::npos);
2162
2163                 if (!rev) {
2164                     sort(items.begin(), items.end());
2165                 } else {
2166                     sort(items.begin(), items.end(),
2167                          [](const string& a, const string& b) {
2168                              return a > b;
2169                          });
2170                 }
2171
2172                 value.reserve(list.size());
2173                 bool tab = false;
2174                 const string* prev = nullptr;
2175                 for (auto&& item : items) {
2176                     // Skip duplicates if "u" flag specified.
2177                     if (prev && *prev == item) {
2178                         continue;
2179                     }
2180                     if (uniq) {
2181                         prev = &item;
2182                     }
2183
2184                     if (tab) {
2185                         value += '\t';
2186                     } else {
2187                         tab = true;
2188                     }
2189                     value += item;
2190                 }
2191                 break;
2192             }
2193             case CMD_split: {
2194                 string split;
2195                 if (args.size() == 1) {
2196                     split = " ";
2197                     value = args[0];
2198                 } else {
2199                     split = args[0];
2200                     value = args[1];
2201                 }
2202                 string::size_type i = 0;
2203                 while (true) {
2204                     if (split.empty()) {
2205                         ++i;
2206                         if (i >= value.size()) break;
2207                     } else {
2208                         i = value.find(split, i);
2209                         if (i == string::npos) break;
2210                     }
2211                     value.replace(i, split.size(), 1, '\t');
2212                     ++i;
2213                 }
2214                 break;
2215             }
2216             case CMD_stoplist: {
2217                 Xapian::TermIterator i = qp.stoplist_begin();
2218                 Xapian::TermIterator end = qp.stoplist_end();
2219                 while (i != end) {
2220                     if (!value.empty()) value += '\t';
2221                     value += *i;
2222                     ++i;
2223                 }
2224                 break;
2225             }
2226             case CMD_sub:
2227                 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2228                 break;
2229             case CMD_subdb: {
2230                 Xapian::docid id = q0;
2231                 if (args.size() > 0) id = string_to_int(args[0]);
2232                 auto subdbs = get_subdbs();
2233                 value = subdbs[(id - 1) % subdbs.size()];
2234                 break;
2235             }
2236             case CMD_subid: {
2237                 Xapian::docid id = q0;
2238                 if (args.size() > 0) id = string_to_int(args[0]);
2239                 value = str(((id - 1) / get_subdbs().size()) + 1);
2240                 break;
2241             }
2242             case CMD_substr: {
2243                 int start = string_to_int(args[1]);
2244                 if (start < 0) {
2245                     if (static_cast<size_t>(-start) >= args[0].size()) {
2246                         start = 0;
2247                     } else {
2248                         start = static_cast<int>(args[0].size()) + start;
2249                     }
2250                 } else {
2251                     if (static_cast<size_t>(start) >= args[0].size()) break;
2252                 }
2253                 size_t len = string::npos;
2254                 if (args.size() > 2) {
2255                     int int_len = string_to_int(args[2]);
2256                     if (int_len >= 0) {
2257                         len = size_t(int_len);
2258                     } else {
2259                         len = args[0].size() - start;
2260                         if (static_cast<size_t>(-int_len) >= len) {
2261                             len = 0;
2262                         } else {
2263                             len -= static_cast<size_t>(-int_len);
2264                         }
2265                     }
2266                 }
2267                 value.assign(args[0], start, len);
2268                 break;
2269             }
2270             case CMD_suggestion:
2271                 value = qp.get_corrected_query_string();
2272                 break;
2273             case CMD_switch: {
2274                 const string& val = args[0];
2275                 for (size_t i = 1; i < args.size(); i += 2) {
2276                     if (i == args.size() - 1) {
2277                         // Handle optional "else" value.
2278                         value = eval(args[i], param);
2279                         break;
2280                     }
2281                     if (val == eval(args[i], param)) {
2282                         value = eval(args[i + 1], param);
2283                         break;
2284                     }
2285                 }
2286                 break;
2287             }
2288             case CMD_termprefix:
2289                 (void)prefix_from_term(&value, args[0]);
2290                 break;
2291             case CMD_terms: {
2292                 // list of matching terms
2293                 if (!enquire) break;
2294                 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2295                 if (args.empty()) {
2296                     while (term != enquire->get_matching_terms_end(q0)) {
2297                         // check term was in the typed query so we ignore
2298                         // boolean filter terms
2299                         const string & t = *term;
2300                         if (termset.find(t) != termset.end()) {
2301                             value += t;
2302                             value += '\t';
2303                         }
2304                         ++term;
2305                     }
2306                 } else {
2307                     // Return matching terms with specified prefix.  We can't
2308                     // use skip_to() as the terms aren't ordered by termname.
2309                     const string & pfx = args[0];
2310                     while (term != enquire->get_matching_terms_end(q0)) {
2311                         const string & t = *term;
2312                         if (startswith(t, pfx)) {
2313                             value += t;
2314                             value += '\t';
2315                         }
2316                         ++term;
2317                     }
2318                 }
2319
2320                 if (!value.empty()) value.erase(value.size() - 1);
2321                 break;
2322             }
2323             case CMD_thispage:
2324                 value = str(topdoc / hits_per_page + 1);
2325                 break;
2326             case CMD_time:
2327                 if (secs >= 0) {
2328                     char buf[64];
2329                     my_snprintf(buf, sizeof(buf), "%.6f", secs);
2330                     // MSVC's snprintf omits the zero byte if the string is
2331                     // sizeof(buf) long.
2332                     buf[sizeof(buf) - 1] = '\0';
2333                     value = buf;
2334                 }
2335                 break;
2336             case CMD_topdoc:
2337                 // first document on current page of hit list (counting from 0)
2338                 value = str(topdoc);
2339                 break;
2340             case CMD_topterms:
2341                 if (enquire) {
2342                     int howmany = 16;
2343                     if (!args.empty()) howmany = string_to_int(args[0]);
2344                     if (howmany < 0) howmany = 0;
2345
2346                     // List of expand terms
2347                     Xapian::ESet eset;
2348                     OmegaExpandDecider decider(db, &termset);
2349
2350                     if (!rset.empty()) {
2351                         set_expansion_scheme(*enquire, option);
2352                         eset = enquire->get_eset(howmany * 2, rset, &decider);
2353                     } else if (mset.size()) {
2354                         // invent an rset
2355                         Xapian::RSet tmp;
2356
2357                         int c = 5;
2358                         // FIXME: what if mset does not start at first match?
2359                         for (Xapian::docid did : mset) {
2360                             tmp.add_document(did);
2361                             if (--c == 0) break;
2362                         }
2363
2364                         set_expansion_scheme(*enquire, option);
2365                         eset = enquire->get_eset(howmany * 2, tmp, &decider);
2366                     }
2367
2368                     // Don't show more than one word with the same stem.
2369                     set<string> stems;
2370                     Xapian::ESetIterator i;
2371                     for (i = eset.begin(); i != eset.end(); ++i) {
2372                         string term(*i);
2373                         string stem = (*stemmer)(term);
2374                         if (stems.find(stem) != stems.end()) continue;
2375                         stems.insert(stem);
2376                         value += term;
2377                         value += '\t';
2378                         if (--howmany == 0) break;
2379                     }
2380                     if (!value.empty()) value.erase(value.size() - 1);
2381                 }
2382                 break;
2383             case CMD_transform:
2384                 omegascript_transform(value, args);
2385                 break;
2386             case CMD_truncate:
2387                 value = generate_sample(args[0],
2388                                         string_to_int(args[1]),
2389                                         args.size() > 2 ? args[2] : string(),
2390                                         args.size() > 3 ? args[3] : string());
2391                 break;
2392             case CMD_uniq: {
2393                 const string &list = args[0];
2394                 if (list.empty()) break;
2395                 string::size_type split = 0, split2;
2396                 string prev;
2397                 do {
2398                     split2 = list.find('\t', split);
2399                     string item(list, split, split2 - split);
2400                     if (split == 0) {
2401                         value = item;
2402                     } else if (item != prev) {
2403                         value += '\t';
2404                         value += item;
2405                     }
2406                     prev = item;
2407                     split = split2 + 1;
2408                 } while (split2 != string::npos);
2409                 break;
2410             }
2411             case CMD_unique: {
2412                 unordered_set<string> seen;
2413                 const string &list = args[0];
2414                 if (list.empty()) break;
2415                 string::size_type split = 0, split2;
2416                 do {
2417                     split2 = list.find('\t', split);
2418                     string item(list, split, split2 - split);
2419                     if (seen.insert(item).second) {
2420                         if (split != 0)
2421                             value += '\t';
2422                         value += item;
2423                     }
2424                     split = split2 + 1;
2425                 } while (split2 != string::npos);
2426                 break;
2427             }
2428             case CMD_unpack:
2429                 value = str(binary_string_to_int(args[0]));
2430                 break;
2431             case CMD_unprefix: {
2432                 size_t prefix_len = prefix_from_term(NULL, args[0]);
2433                 value.assign(args[0], prefix_len, string::npos);
2434                 break;
2435             }
2436             case CMD_unstem: {
2437                 const string &term = args[0];
2438                 Xapian::TermIterator i = qp.unstem_begin(term);
2439                 Xapian::TermIterator end = qp.unstem_end(term);
2440                 while (i != end) {
2441                     if (!value.empty()) value += '\t';
2442                     value += *i;
2443                     ++i;
2444                 }
2445                 break;
2446             }
2447             case CMD_upper:
2448                 value = Xapian::Unicode::toupper(args[0]);
2449                 break;
2450             case CMD_url:
2451                 url_encode(value, args[0]);
2452                 break;
2453             case CMD_value: {
2454                 Xapian::docid id = q0;
2455                 Xapian::valueno value_no = string_to_int(args[0]);
2456                 if (args.size() > 1) id = string_to_int(args[1]);
2457                 value = db.get_document(id).get_value(value_no);
2458                 break;
2459             }
2460             case CMD_version:
2461                 value = PACKAGE_STRING;
2462                 break;
2463             case CMD_weight:
2464                 value = double_to_string(weight);
2465                 break;
2466             default: {
2467                 args.insert(args.begin(), param[0]);
2468                 int macro_no = func->second->tag - CMD_MACRO;
2469                 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2470                 // throw "Unknown function '" + var + "'";
2471                 value = eval(macros[macro_no], args);
2472                 break;
2473             }
2474         }
2475         res += value;
2476     } catch (const Xapian::Error & e) {
2477         // FIXME: this means we only see the most recent error in $error
2478         // - is that the best approach?
2479         error_msg = e.get_msg();
2480     }
2481
2482     res.append(fmt, p, string::npos);
2483     return res;
2484 }
2485
2486 static string
2487 eval_file(const string &fmtfile)
2488 {
2489     string err;
2490     if (vet_filename(fmtfile)) {
2491         string file = template_dir + fmtfile;
2492         string fmt;
2493         if (load_file(file, fmt)) {
2494             vector<string> noargs;
2495             noargs.resize(1);
2496             return eval(fmt, noargs);
2497         }
2498         err = strerror(errno);
2499     } else {
2500         err = "name contains '..'";
2501     }
2502
2503     // FIXME: report why!
2504     string msg = string("Couldn't read format template '") + fmtfile + '\'';
2505     if (!err.empty()) msg += " (" + err + ')';
2506     throw msg;
2507 }
2508
2509 extern string
2510 pretty_term(string term)
2511 {
2512     // Just leave empty strings and single characters alone.
2513     if (term.length() <= 1) return term;
2514
2515     // Assume unprefixed terms are unstemmed.
2516     if (!C_isupper(term[0])) return term;
2517
2518     // Handle stemmed terms.
2519     bool stemmed = (term[0] == 'Z');
2520     if (stemmed) {
2521         // First of all, check if a term in the query stemmed to this one.
2522         Xapian::TermIterator u = qp.unstem_begin(term);
2523         // There might be multiple words with the same stem, but we only want
2524         // one so just take the first.
2525         if (u != qp.unstem_end(term)) return *u;
2526
2527         // Remove the 'Z'.
2528         term.erase(0, 1);
2529     }
2530
2531     bool add_quotes = false;
2532
2533     // Check if the term has a prefix.
2534     if (C_isupper(term[0])) {
2535         // See if we have this prefix in the termprefix_to_userprefix map.  If
2536         // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2537         string prefix;
2538         size_t prefix_len = prefix_from_term(&prefix, term);
2539
2540         map<string, string>::const_iterator i;
2541         i = termprefix_to_userprefix.find(prefix);
2542         if (i != termprefix_to_userprefix.end()) {
2543             string user_prefix = i->second;
2544             user_prefix += ':';
2545             term.replace(0, prefix_len, user_prefix);
2546         } else {
2547             // We don't have a prefix mapping for this, so just set a flag to
2548             // add quotes around the term.
2549             add_quotes = true;
2550         }
2551     }
2552
2553     if (stemmed) term += '.';
2554
2555     if (add_quotes) {
2556         term.insert(0, "\"");
2557         term.append("\"");
2558     }
2559
2560     return term;
2561 }
2562
2563 static string
2564 print_caption(const string &fmt, const vector<string> &param)
2565 {
2566     q0 = *(mset[hit_no]);
2567
2568     weight = mset[hit_no].get_weight();
2569     percent = mset.convert_to_percent(mset[hit_no]);
2570     collapsed = mset[hit_no].get_collapse_count();
2571
2572     return eval(fmt, param);
2573 }
2574
2575 void
2576 parse_omegascript()
2577 {
2578     try {
2579         const char * p = getenv("SERVER_PROTOCOL");
2580         if (p && strcmp(p, "INCLUDED") == 0) {
2581             // We're being included in another page, so suppress headers.
2582             suppress_http_headers = true;
2583         }
2584
2585         string output = eval_file(fmtname);
2586         if (!set_content_type && !suppress_http_headers) {
2587             cout << "Content-Type: text/html" << endl;
2588             set_content_type = true;
2589         }
2590         if (!suppress_http_headers) cout << endl;
2591         cout << output;
2592     } catch (...) {
2593         // Ensure the headers have been output so that any exception gets
2594         // reported rather than giving a server error.
2595         if (!set_content_type && !suppress_http_headers) {
2596             cout << "Content-Type: text/html" << endl;
2597             set_content_type = true;
2598         }
2599         if (!suppress_http_headers) cout << endl;
2600         throw;
2601     }
2602 }
2603
2604 static void
2605 ensure_query_parsed()
2606 {
2607     if (query_parsed) return;
2608     query_parsed = true;
2609
2610     MCI val;
2611     pair<MCI, MCI> g;
2612
2613     // Should we discard the existing R-set recorded in R CGI parameters?
2614     bool discard_rset = false;
2615
2616     // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2617     // CGI parameters)?
2618     bool force_first_page = false;
2619
2620     string v;
2621     // get list of terms from previous iteration of query
2622     val = cgi_params.find("xP");
2623     if (val != cgi_params.end()) {
2624         v = val->second;
2625         // If xP given, default to discarding any RSet and forcing the first
2626         // page of results.  If the query is the same, or an extension of
2627         // the previous query, we adjust these again below.
2628         discard_rset = true;
2629         force_first_page = true;
2630     }
2631     querytype result = parse_queries(v);
2632     switch (result) {
2633         case BAD_QUERY:
2634             break;
2635         case NEW_QUERY:
2636             break;
2637         case SAME_QUERY:
2638         case EXTENDED_QUERY:
2639             // If we've changed database, force the first page of hits
2640             // and discard the R-set (since the docids will have changed)
2641             val = cgi_params.find("xDB");
2642             if (val != cgi_params.end() && val->second != dbname) break;
2643             if (result == SAME_QUERY && force_first_page) {
2644                 val = cgi_params.find("xFILTERS");
2645                 if (val != cgi_params.end() && val->second != filters &&
2646                     val->second != old_filters) {
2647                     // Filters have changed since last query.
2648                 } else {
2649                     force_first_page = false;
2650                 }
2651             }
2652             discard_rset = false;
2653             break;
2654     }
2655
2656     if (!force_first_page) {
2657         // Work out which mset element is the first hit we want
2658         // to display
2659         val = cgi_params.find("TOPDOC");
2660         if (val != cgi_params.end()) {
2661             topdoc = atol(val->second.c_str());
2662         }
2663
2664         // Handle next, previous, and page links
2665         if (cgi_params.find(">") != cgi_params.end()) {
2666             topdoc += hits_per_page;
2667         } else if (cgi_params.find("<") != cgi_params.end()) {
2668             if (topdoc >= hits_per_page)
2669                 topdoc -= hits_per_page;
2670             else
2671                 topdoc = 0;
2672         } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2673                    (val = cgi_params.find("#")) != cgi_params.end()) {
2674             long page = atol(val->second.c_str());
2675             // Do something sensible for page 0 (we count pages from 1).
2676             if (page == 0) page = 1;
2677             topdoc = (page - 1) * hits_per_page;
2678         }
2679
2680         // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2681         // Normally we snap TOPDOC like this so that things work nicely if
2682         // HITSPERPAGE is in a <select> or on radio buttons.  If we're
2683         // postprocessing the output of omega and want variable sized pages,
2684         // this is unhelpful.
2685         bool raw_search = false;
2686         val = cgi_params.find("RAWSEARCH");
2687         if (val != cgi_params.end()) {
2688             raw_search = bool(atol(val->second.c_str()));
2689         }
2690
2691         if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2692     }
2693
2694     if (!discard_rset) {
2695         // put documents marked as relevant into the rset
2696         g = cgi_params.equal_range("R");
2697         for (MCI i = g.first; i != g.second; ++i) {
2698             const string & value = i->second;
2699             for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2700                 while (value[j] == '.') ++j;
2701                 Xapian::docid d = atoi(value.c_str() + j);
2702                 if (d) {
2703                     rset.add_document(d);
2704                     ticked[d] = true;
2705                 }
2706             }
2707         }
2708     }
2709 }
2710
2711 // run query if we haven't already
2712 static void
2713 ensure_match()
2714 {
2715     if (done_query) return;
2716
2717     secs = RealTime::now();
2718     run_query();
2719     if (secs != -1)
2720         secs = RealTime::now() - secs;
2721
2722     done_query = true;
2723     last = mset.get_matches_lower_bound();
2724     if (last == 0) {
2725         // Otherwise topdoc ends up being -6 if it's non-zero!
2726         topdoc = 0;
2727     } else {
2728         if (topdoc >= last)
2729             topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2730         // last is the count of documents up to the end of the current page
2731         // (as returned by $last)
2732         if (topdoc + hits_per_page < last)
2733             last = topdoc + hits_per_page;
2734     }
2735 }
2736
2737 // OmegaExpandDecider methods.
2738
2739 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2740                                        set<string> * querytermset)
2741     : db(db_)
2742 {
2743     // We'll want the stemmer for testing matches anyway.
2744     if (!stemmer)
2745         stemmer = new Xapian::Stem(option["stemmer"]);
2746     if (querytermset) {
2747         set<string>::const_iterator i;
2748         for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2749             string term(*i);
2750             if (term.empty()) continue;
2751
2752             unsigned char ch = term[0];
2753             bool stemmed = (ch == 'Z');
2754             if (stemmed) {
2755                 term.erase(0, 1);
2756                 if (term.empty()) continue;
2757                 ch = term[0];
2758             }
2759
2760             if (C_isupper(ch)) {
2761                 size_t prefix_len = prefix_from_term(NULL, term);
2762                 term.erase(0, prefix_len);
2763             }
2764
2765             if (!stemmed) term = (*stemmer)(term);
2766
2767             exclude_stems.insert(term);
2768         }
2769     }
2770 }
2771
2772 bool
2773 OmegaExpandDecider::operator()(const string & term) const
2774 {
2775     unsigned char ch = term[0];
2776
2777     // Reject terms with a prefix.
2778     if (C_isupper(ch)) return false;
2779
2780     {
2781         MyStopper stopper;
2782         // Don't suggest stopwords.
2783         if (stopper(term)) return false;
2784     }
2785
2786     // Reject small numbers.
2787     if (term.size() < 4 && C_isdigit(ch)) return false;
2788
2789     // Reject terms containing a space.
2790     if (term.find(' ') != string::npos) return false;
2791
2792     // Skip terms with stems in the exclude_stems set, to avoid suggesting
2793     // terms which are already in the query in some form.
2794     string stem = (*stemmer)(term);
2795     if (exclude_stems.find(stem) != exclude_stems.end())
2796         return false;
2797
2798     // Ignore terms that only occur once (hapaxes) since they aren't
2799     // useful for finding related documents - they only occur in a
2800     // document that's already been marked as relevant.
2801     // FIXME: add an expand option to ignore terms where
2802     // termfreq == rtermfreq.
2803     if (db.get_termfreq(term) <= 1) return false;
2804
2805     return true;
2806 }