xapian-applications/omega/query.cc

   1 /* query.cc: query executor for omega
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 James Aylett
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002 Intercede 1749 Ltd
   7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
   8  * Copyright 2008 Thomas Viehmann
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU General Public License as
  12  * published by the Free Software Foundation; either version 2 of the
  13  * License, or (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  23  * USA
  24  */
  25
  26 #include <config.h>
  27
  28 #include <algorithm>
  29 #include <iostream>
  30 #include <map>
  31 #include <set>
  32 #include <vector>
  33
  34 #include <cassert>
  35 #include <cctype>
  36 #include "safeerrno.h"
  37 #include <stdio.h>
  38 #include <cstdlib>
  39 #include <cstring>
  40 #include "strcasecmp.h"
  41 #include <ctime>
  42
  43 #include "safeunistd.h"
  44 #include <sys/types.h>
  45 #include "safesysstat.h"
  46 #include "safefcntl.h"
  47
  48 #include "realtime.h"
  49
  50 #include <cdb.h>
  51
  52 #include "csvescape.h"
  53 #include "date.h"
  54 #include "datevalue.h"
  55 #include "jsonescape.h"
  56 #include "utils.h"
  57 #include "omega.h"
  58 #include "query.h"
  59 #include "cgiparam.h"
  60 #include "loadfile.h"
  61 #include "sample.h"
  62 #include "str.h"
  63 #include "stringutils.h"
  64 #include "transform.h"
  65 #include "urldecode.h"
  66 #include "urlencode.h"
  67 #include "unixperm.h"
  68 #include "values.h"
  69 #include "weight.h"
  70 #include "expand.h"
  71
  72 #include <xapian.h>
  73
  74 using namespace std;
  75
  76 using Xapian::Utf8Iterator;
  77
  78 using Xapian::Unicode::is_wordchar;
  79
  80 #ifndef SNPRINTF
  81 #include <cstdarg>
  82
  83 static int my_snprintf(char *str, size_t size, const char *format, ...)
  84 {
  85     int res;
  86     va_list ap;
  87     va_start(ap, format);
  88     str[size - 1] = '\0';
  89     res = vsprintf(str, format, ap);
  90     if (str[size - 1] || res < 0 || size_t(res) >= size)
  91         abort(); /* Overflowed! */
  92     va_end(ap);
  93     return res;
  94 }
  95 #else
  96 #define my_snprintf SNPRINTF
  97 #endif
  98
// NOTE(review): these two flags aren't touched in this part of the file;
// presumably they record that ensure_query_parsed() / ensure_match() have
// already done their work - confirm against those functions.
static bool query_parsed = false;
static bool done_query = false;
// NOTE(review): not used in this part of the file - meaning to be confirmed.
static Xapian::docid last = 0;

// Results of the match; assigned by run_query().
static Xapian::MSet mset;

// NOTE(review): not used in this part of the file - meaning to be confirmed.
static map<Xapian::docid, bool> ticked;

// Defined later in the file.
static void ensure_query_parsed();
static void ensure_match();

// The query to run: set_probabilistic() builds it from the parsed query
// strings and run_query() then combines it with the filters.
static Xapian::Query query;
//static string url_query_string;
Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode

// Query parser; configured by set_probabilistic().
static Xapian::QueryParser qp;
// Range processor for "size:" ranges; allocated on first use by
// set_probabilistic().
static Xapian::NumberRangeProcessor * size_rp = NULL;
// Stemmer used by html_highlight(); allocated on first use there.
static Xapian::Stem *stemmer = NULL;

// Defined later in the file.
static string eval_file(const string &fmtfile);

// The distinct terms seen in the parsed query (filled in by
// set_probabilistic()).
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// Tab-separated list of the terms in termset, in the order in which
// set_probabilistic() first saw them.
static string queryterms;

// Message from a query parse error (set by set_probabilistic()).  run_query()
// doesn't run the match while this is non-empty.
static string error_msg;

// NOTE(review): not used in this part of the file; presumably a time in
// seconds with -1 meaning "not measured" - confirm.
static double secs = -1;
 130
// Template for a log line, written using $command{...} syntax.  The fields
// are tab-separated and are, in order:
//
// * REMOTE_HOST, else REMOTE_ADDR, else "-"
// * the current date and time in square brackets
// * "add" if CGI parameter X is set, else "morelike" if MORELIKE is set,
//   else "query"
// * $dbname
// * $query
// * $msize, followed (only if HTTP_REFERER is set) by a tab and the referer
//
// NOTE(review): presumably the format used when no log format is configured -
// the code which uses this isn't in this part of the file.
static const char DEFAULT_LOG_ENTRY[] =
        "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
        "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
        "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
        "$dbname\t"
        "$query\t"
        "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
 138
 139 class MyStopper : public Xapian::Stopper {
 140   public:
 141     bool operator()(const string &t) const {
 142         switch (t[0]) {
 143             case 'a':
 144                 return (t == "a" || t == "about" || t == "an" || t == "and" ||
 145                         t == "are" || t == "as" || t == "at");
 146             case 'b':
 147                 return (t == "be" || t == "by");
 148             case 'e':
 149                 return (t == "en");
 150             case 'f':
 151                 return (t == "for" || t == "from");
 152             case 'h':
 153                 return (t == "how");
 154             case 'i':
 155                 return (t == "i" || t == "in" || t == "is" || t == "it");
 156             case 'o':
 157                 return (t == "of" || t == "on" || t == "or");
 158             case 't':
 159                 return (t == "that" || t == "the" || t == "this" || t == "to");
 160             case 'w':
 161                 return (t == "was" || t == "what" || t == "when" ||
 162                         t == "where" || t == "which" || t == "who" ||
 163                         t == "why" || t == "will" || t == "with");
 164             case 'y':
 165                 return (t == "you" || t == "your");
 166             default:
 167                 return false;
 168         }
 169     }
 170 };
 171
 172 static size_t
 173 prefix_from_term(string &prefix, const string &term)
 174 {
 175     if (term.empty()) {
 176         prefix.resize(0);
 177         return 0;
 178     }
 179     if (term[0] == 'X') {
 180         const string::const_iterator begin = term.begin();
 181         string::const_iterator i = begin + 1;
 182         while (i != term.end() && C_isupper(*i)) ++i;
 183         prefix.assign(begin, i);
 184         if (i != term.end() && *i == ':') ++i;
 185         return i - begin;
 186     }
 187
 188     prefix = term[0];
 189     return 1;
 190 }
 191
 192 // Don't allow ".." in format names, log file names, etc as this would allow
 193 // people to open a format "../../etc/passwd" or similar.
 194 // FIXME: make this check more exact ("foo..bar" is safe)
 195 // FIXME: log when this check fails
 196 static bool
 197 vet_filename(const string &filename)
 198 {
 199     string::size_type i = filename.find("..");
 200     return (i == string::npos);
 201 }
 202
// How the current query relates to the previous one (this is what
// set_probabilistic() returns).
//
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
//
// NEW_QUERY entirely new query
// SAME_QUERY unchanged query
// EXTENDED_QUERY new query, but based on the old one
// BAD_QUERY parse error (message in error_msg)
typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;

// Query strings waiting to be parsed.  The key is the prefix passed to
// set_probabilistic_query(); set_probabilistic() uses it both to look up
// per-input options and as the default prefix when parsing the string.
static multimap<string, string> probabilistic_query;
 216
 217 void
 218 set_probabilistic_query(const string & prefix, const string & s)
 219 {
 220     string query_string = s;
 221     // Strip leading and trailing whitespace from query_string.
 222     trim(query_string);
 223     if (!query_string.empty())
 224         probabilistic_query.insert(make_pair(prefix, query_string));
 225 }
 226
 227 static unsigned
 228 read_qp_flags(const string & opt_pfx, unsigned f)
 229 {
 230     map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
 231     for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
 232         unsigned mask = 0;
 233         const char * s = i->first.c_str() + opt_pfx.size();
 234         switch (s[0]) {
 235             case 'a':
 236                 if (strcmp(s, "auto_multiword_synonyms") == 0) {
 237                     mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 238                     break;
 239                 }
 240                 if (strcmp(s, "auto_synonyms") == 0) {
 241                     mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
 242                     break;
 243                 }
 244                 break;
 245             case 'b':
 246                 if (strcmp(s, "boolean") == 0) {
 247                     mask = Xapian::QueryParser::FLAG_BOOLEAN;
 248                     break;
 249                 }
 250                 if (strcmp(s, "boolean_any_case") == 0) {
 251                     mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
 252                     break;
 253                 }
 254                 break;
 255             case 'c':
 256                 if (strcmp(s, "cjk_ngram") == 0) {
 257                     mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
 258                     break;
 259                 }
 260                 break;
 261             case 'd':
 262                 if (strcmp(s, "default") == 0) {
 263                     mask = Xapian::QueryParser::FLAG_DEFAULT;
 264                     break;
 265                 }
 266                 break;
 267             case 'l':
 268                 if (strcmp(s, "lovehate") == 0) {
 269                     mask = Xapian::QueryParser::FLAG_LOVEHATE;
 270                     break;
 271                 }
 272                 break;
 273             case 'p':
 274                 if (strcmp(s, "partial") == 0) {
 275                     mask = Xapian::QueryParser::FLAG_PARTIAL;
 276                     break;
 277                 }
 278                 if (strcmp(s, "phrase") == 0) {
 279                     mask = Xapian::QueryParser::FLAG_PHRASE;
 280                     break;
 281                 }
 282                 if (strcmp(s, "pure_not") == 0) {
 283                     mask = Xapian::QueryParser::FLAG_PURE_NOT;
 284                     break;
 285                 }
 286                 break;
 287             case 's':
 288                 if (strcmp(s, "spelling_correction") == 0) {
 289                     mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
 290                     break;
 291                 }
 292                 if (strcmp(s, "synonym") == 0) {
 293                     mask = Xapian::QueryParser::FLAG_SYNONYM;
 294                     break;
 295                 }
 296                 break;
 297             case 'w':
 298                 if (strcmp(s, "wildcard") == 0) {
 299                     mask = Xapian::QueryParser::FLAG_WILDCARD;
 300                     break;
 301                 }
 302                 break;
 303         }
 304
 305         if (i->second.empty()) {
 306             f &= ~mask;
 307         } else {
 308             f |= mask;
 309         }
 310     }
 311     return f;
 312 }
 313
// Parse the probabilistic query and classify it against the previous query.
//
// Configures the query parser qp from the options (stemming strategy,
// stopper, default operator, database, "size:" ranges, "prefix,..." and
// "boolprefix,..." settings), parses every query string registered with
// set_probabilistic_query(), and sets query to the AND of the non-empty
// results.  The terms of the resulting query are added to termset and
// queryterms.
//
// oldp is a tab-separated list of the terms in the previous query (empty if
// there were none).
//
// Returns BAD_QUERY (with error_msg set) if the query parser reports an
// error, otherwise NEW_QUERY, SAME_QUERY or EXTENDED_QUERY according to how
// the terms now in termset compare with those in oldp.
static querytype
set_probabilistic(const string &oldp)
{
    // Parse the query string.
    qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
    qp.set_stopper(new MyStopper());
    qp.set_default_op(default_op);
    qp.set_database(db);
    // FIXME: provide a custom RP which handles size:10..20K, etc.
    if (!size_rp)
        size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
    qp.add_rangeprocessor(size_rp);
    // Options named "prefix,<user prefix>" have as their value a
    // tab-separated list of term prefixes to map that user prefix to.
    map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
    for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
        // 7 is the length of "prefix,".
        string user_prefix(pfx->first, 7);
        const string & term_pfx_list = pfx->second;
        string::size_type i = 0;
        do {
            string::size_type i0 = i;
            i = term_pfx_list.find('\t', i);
            // For the last entry i is npos here, so substr() takes the rest
            // of the list.
            const string & term_pfx = term_pfx_list.substr(i0, i - i0);
            qp.add_prefix(user_prefix, term_pfx);
            // std::map::insert() won't overwrite an existing entry, so we'll
            // prefer the first user_prefix for which a particular term prefix
            // is specified.
            termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
            // ++i steps past the tab; after the last entry it wraps npos
            // round to 0, which ends the loop.
        } while (++i);
    }
    // Options named "boolprefix,<user prefix>" have a single term prefix as
    // their value.
    pfx = option.lower_bound("boolprefix,");
    for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
        // 11 is the length of "boolprefix,".
        string user_prefix(pfx->first, 11, string::npos);
        // The prefix is exclusive unless there's a non-empty
        // "nonexclusiveprefix,<term prefix>" option.
        auto it = option.find("nonexclusiveprefix," + pfx->second);
        bool exclusive = (it == option.end() || it->second.empty());
        qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
        termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
    }

    try {
        // Flags from "flag_..." options apply to every input unless
        // overridden by that input's own "<prefix>:flag_..." options below.
        unsigned default_flags = read_qp_flags("flag_", 0);
        if (option["spelling"] == "true")
            default_flags |= qp.FLAG_SPELLING_CORRECTION;

        vector<Xapian::Query> queries;
        queries.reserve(probabilistic_query.size());

        multimap<string, string>::const_iterator j;
        for (j = probabilistic_query.begin();
             j != probabilistic_query.end();
             ++j) {
            const string & prefix = j->first;

            // Choose the stemmer to use for this input.
            string stemlang = option[prefix + ":stemmer"];
            if (stemlang.empty())
                stemlang = option["stemmer"];
            qp.set_stemmer(Xapian::Stem(stemlang));

            // Work out the flags to use for this input.
            unsigned f = read_qp_flags(prefix + ":flag_", default_flags);

            const string & query_string = j->second;
            Xapian::Query q = qp.parse_query(query_string, f, prefix);
            // Inputs which parse to an empty query are left out.
            if (!q.empty())
                queries.push_back(q);
        }
        query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
    } catch (Xapian::QueryParserError &e) {
        error_msg = e.get_msg();
        return BAD_QUERY;
    }

    // Record any terms we haven't seen before in termset and queryterms.
    // Note that, despite its name, n_new_terms counts every term the
    // iterator yields for the new query, not just those added to termset.
    Xapian::termcount n_new_terms = 0;
    for (Xapian::TermIterator i = query.get_terms_begin();
         i != query.get_terms_end(); ++i) {
        if (termset.find(*i) == termset.end()) {
            termset.insert(*i);
            if (!queryterms.empty()) queryterms += '\t';
            queryterms += *i;
        }
        n_new_terms++;
    }

    // Check new query against the previous one
    if (oldp.empty()) {
        // If oldp was empty that means there were no probabilistic terms
        // before, so if there are now this is a new query.
        return n_new_terms ? NEW_QUERY : SAME_QUERY;
    }

    // The terms in oldp are separated by tabs.
    const char oldp_separator = '\t';
    size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;

    // short-cut: if the new query has fewer terms, it must be a new one
    if (n_new_terms < n_old_terms) return NEW_QUERY;

    // If any of the old terms is missing from termset, a term has been
    // removed so this is a new query.
    const char *term = oldp.c_str();
    const char *pend;
    while ((pend = strchr(term, oldp_separator)) != NULL) {
        if (termset.find(string(term, pend - term)) == termset.end())
            return NEW_QUERY;
        term = pend + 1;
    }
    // Check the last term, which has no tab after it.
    if (*term) {
        if (termset.find(string(term)) == termset.end())
            return NEW_QUERY;
    }

    // Use termset.size() rather than n_new_terms so we correctly handle
    // the case when the query has repeated terms.
    // This works wrongly in the case when the user extends the query
    // by adding a term already in it, but that's unlikely and the behaviour
    // isn't too bad (we just don't reset page 1).  We also mishandle a few
    // other obscure cases e.g. adding quotes to turn a query into a phrase.
    if (termset.size() > n_old_terms) return EXTENDED_QUERY;
    return SAME_QUERY;
}
 431
// Boolean filter terms, keyed by their term prefix (added by add_bterm()).
static multimap<string, string> filter_map;
// Terms which matching documents must not contain (added by add_nterm()).
static set<string> neg_filters;

// Iterator type used to walk filter_map.
typedef multimap<string, string>::const_iterator FMCI;
 436
 437 void add_bterm(const string &term) {
 438     string prefix;
 439     if (prefix_from_term(prefix, term) > 0)
 440         filter_map.insert(multimap<string, string>::value_type(prefix, term));
 441 }
 442
 443 void add_nterm(const string &term) {
 444     if (!term.empty())
 445         neg_filters.insert(term);
 446 }
 447
// Build the final query and run the match.
//
// Starting from the parsed probabilistic query in query, this applies (in
// this order) the boolean filters from filter_map, the date range
// restriction, and the negated filters from neg_filters.  If enquire is set
// and there's no error message, it then configures enquire (weighting
// scheme, cutoff, sort order, docid order, collapsing) and, provided the
// final query isn't empty, stores the results of the match in mset.
static void
run_query()
{
    // Name of the weighting scheme to use (empty if none has been chosen).
    string scheme;
    // Set when the query consists only of filters.  It's passed on to
    // set_weighting_scheme() (NOTE(review): presumably to select boolean
    // weighting - confirm there).
    bool force_boolean = false;
    if (!filter_map.empty()) {
        // OR together filters with the same prefix (or AND for non-exclusive
        // prefixes), then AND together the resultant groups.
        vector<Xapian::Query> filter_vec;
        vector<string> same_vec;
        string current;
        // filter_map is ordered by prefix, so terms with the same prefix are
        // adjacent.  The loop deliberately runs one step past the last entry
        // (over is true then) so that the final group gets flushed too.
        for (FMCI i = filter_map.begin(); ; i++) {
            bool over = (i == filter_map.end());
            if (over || i->first != current) {
                // Start of a new group (or the end): turn the terms
                // collected for the previous prefix into a query.
                switch (same_vec.size()) {
                    case 0:
                        break;
                    case 1:
                        filter_vec.push_back(Xapian::Query(same_vec[0]));
                        break;
                    default: {
                        Xapian::Query::op op = Xapian::Query::OP_OR;
                        auto it = option.find("nonexclusiveprefix," + current);
                        if (it != option.end() && !it->second.empty()) {
                            op = Xapian::Query::OP_AND;
                        }
                        filter_vec.push_back(Xapian::Query(op,
                                                     same_vec.begin(),
                                                     same_vec.end()));
                        break;
                    }
                }
                same_vec.clear();
                if (over) break;
                current = i->first;
            }
            same_vec.push_back(i->second);
        }

        Xapian::Query filter(Xapian::Query::OP_AND,
                             filter_vec.begin(), filter_vec.end());

        if (query.empty()) {
            // If no probabilistic query is provided then promote the filters
            // to be THE query - filtering an empty query will give no
            // matches.
            std::swap(query, filter);
            // A non-empty "weightingpurefilter" option names the weighting
            // scheme to use for such a query; otherwise force_boolean is set.
            auto&& it = option.find("weightingpurefilter");
            if (it != option.end() && !it->second.empty()) {
                scheme = it->second;
            } else {
                force_boolean = true;
            }
        } else {
            query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
        }
    }

    if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
        Xapian::Query date_filter;
        if (date_value_slot != Xapian::BAD_VALUENO) {
            // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
            // latter the sort order just works correctly between different
            // precisions).
            bool as_time_t =
                db.get_value_lower_bound(date_value_slot).size() == 4 &&
                db.get_value_upper_bound(date_value_slot).size() == 4;
            date_filter = date_value_range(as_time_t, date_value_slot,
                                           date_start, date_end,
                                           date_span);
        } else {
            // No value slot to use, so filter on date terms instead, also
            // allowing any document indexed by the term "Dlatest".
            date_filter = date_range_filter(date_start, date_end, date_span);
            date_filter = Xapian::Query(Xapian::Query::OP_OR,
                                        date_filter,
                                        Xapian::Query("Dlatest"));
        }

        // If no probabilistic query is provided then promote the daterange
        // filter to be THE query instead of filtering an empty query.
        if (query.empty()) {
            query = date_filter;
            force_boolean = true;
        } else {
            query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
        }
    }

    if (!neg_filters.empty()) {
        // OR together all negated filters.
        Xapian::Query filter(Xapian::Query::OP_OR,
                             neg_filters.begin(), neg_filters.end());

        if (query.empty()) {
            // If we only have a negative filter for the query, use MatchAll as
            // the query to apply the filters to.
            query = Xapian::Query::MatchAll;
            force_boolean = true;
        }
        query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
    }

    // Don't run the match if there's no enquire object or an error has
    // already been reported.
    if (!enquire || !error_msg.empty()) return;

    // Unless a scheme was already chosen above (or force_boolean is set),
    // use the one named by the "weighting" option, if any.
    if (!force_boolean && scheme.empty()) {
        auto&& it = option.find("weighting");
        if (it != option.end()) scheme = it->second;
    }
    set_weighting_scheme(*enquire, scheme, force_boolean);

    enquire->set_cutoff(threshold);

    // A key maker takes precedence over a sort value slot.  In either case
    // sort_after selects ordering by relevance first, with the key or value
    // only deciding between documents otherwise ranked the same.
    if (sort_keymaker) {
        if (sort_after) {
            enquire->set_sort_by_relevance_then_key(sort_keymaker,
                                                    reverse_sort);
        } else {
            enquire->set_sort_by_key_then_relevance(sort_keymaker,
                                                    reverse_sort);
        }
    } else if (sort_key != Xapian::BAD_VALUENO) {
        if (sort_after) {
            enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
        } else {
            enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
        }
    }

    enquire->set_docid_order(docid_order);

    if (collapse) {
        enquire->set_collapse_key(collapse_key);
    }

    if (!query.empty()) {
#if 0
        // FIXME: If we start doing permissions checks based on $REMOTE_USER
        // we're going to break some existing setups if users upgrade.  We
        // probably want a way to set this from OmegaScript.
        const char * remote_user = getenv("REMOTE_USER");
        if (remote_user)
            apply_unix_permissions(query, remote_user);
#endif

        enquire->set_query(query);
        // We could use the value of topdoc as first parameter, but we
        // need to know the first few items in the mset to fake a
        // relevance set for topterms.
        //
        // If min_hits isn't set, check at least one extra result so we
        // know if we've reached the end of the matches or not - then we
        // can avoid offering a "next" button which leads to an empty page.
        mset = enquire->get_mset(0, topdoc + hits_per_page,
                                 topdoc + max(hits_per_page + 1, min_hits),
                                 &rset);
    }
}
 604
 605 string
 606 html_escape(const string &str)
 607 {
 608     string res;
 609     string::size_type p = 0;
 610     while (p < str.size()) {
 611         char ch = str[p++];
 612         switch (ch) {
 613             case '<':
 614                 res += "&lt;";
 615                 continue;
 616             case '>':
 617                 res += "&gt;";
 618                 continue;
 619             case '&':
 620                 res += "&amp;";
 621                 continue;
 622             case '"':
 623                 res += "&quot;";
 624                 continue;
 625             default:
 626                 res += ch;
 627         }
 628     }
 629     return res;
 630 }
 631
 632 static string
 633 html_strip(const string &str)
 634 {
 635     string res;
 636     string::size_type p = 0;
 637     bool skip = false;
 638     while (p < str.size()) {
 639         char ch = str[p++];
 640         switch (ch) {
 641             case '<':
 642                 skip = true;
 643                 continue;
 644             case '>':
 645                 skip = false;
 646                 continue;
 647             default:
 648                 if (! skip) res += ch;
 649         }
 650     }
 651     return res;
 652 }
 653
 654 // FIXME split list into hash or map and use that rather than linear lookup?
 655 static int word_in_list(const string& word, const string& list)
 656 {
 657     string::size_type split = 0, split2;
 658     int count = 0;
 659     while ((split2 = list.find('\t', split)) != string::npos) {
 660         if (word.size() == split2 - split) {
 661             if (memcmp(word.data(), list.data() + split, word.size()) == 0)
 662                 return count;
 663         }
 664         split = split2 + 1;
 665         ++count;
 666     }
 667     if (word.size() == list.size() - split) {
 668         if (memcmp(word.data(), list.data() + split, word.size()) == 0)
 669             return count;
 670     }
 671     return -1;
 672 }
 673
 674 // Not a character in an identifier
 675 inline static bool
 676 p_notid(unsigned int c)
 677 {
 678     return !C_isalnum(c) && c != '_';
 679 }
 680
 681 // Not a character in an HTML tag name
 682 inline static bool
 683 p_nottag(unsigned int c)
 684 {
 685     return !C_isalnum(c) && c != '.' && c != '-';
 686 }
 687
// Return an HTML-escaped copy of s in which words matching entries in list
// are wrapped in highlighting markup.
//
// list is a tab-separated list of terms.  A word in s matches if its lower
// cased form is in list or, failing that, if "Z" followed by its stemmed form
// is.
//
// If bra is non-empty, each matching word is preceded by bra and followed by
// ket.  If bra is empty, ket is ignored and the word is wrapped in a <b>
// element with a background colour chosen by the position of the matching
// entry in list.
//
// FIXME: shares algorithm with indextext.cc!
static string
html_highlight(const string &s, const string &list,
               const string &bra, const string &ket)
{
    // Create the stemmer the first time it's needed.
    if (!stemmer) {
        stemmer = new Xapian::Stem(option["stemmer"]);
    }

    string res;

    // j marks how far through s we've got - everything before it has
    // already been added to res.
    Utf8Iterator j(s);
    const Utf8Iterator s_end;
    while (true) {
        // Find the start of the next word.
        Utf8Iterator first = j;
        while (first != s_end && !is_wordchar(*first)) ++first;
        if (first == s_end) break;
        Utf8Iterator term_end;
        string term;
        string word;
        // Start of the text not yet added to res.
        const char *l = j.raw();
        if (*first < 128 && C_isupper(*first)) {
            // Try to read this as ASCII capital letters separated by single
            // '.' characters (e.g. "U.S.A"), which gives a term without the
            // dots.
            j = first;
            Xapian::Unicode::append_utf8(term, *j);
            while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
            }
            // Give up on that if we only got one letter or we stopped in
            // the middle of a word - the code below then handles it as an
            // ordinary word.
            if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
                term.resize(0);
            }
            term_end = j;
        }
        if (term.empty()) {
            // An ordinary word: a run of word characters, which may contain
            // '&' or '\'' provided there's a word character on both sides.
            j = first;
            while (is_wordchar(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
                ++j;
                if (j == s_end) break;
                if (*j == '&' || *j == '\'') {
                    Utf8Iterator next = j;
                    ++next;
                    if (next == s_end || !is_wordchar(*next)) break;
                    term += *j;
                    j = next;
                }
            }
            term_end = j;
            // Allow a short suffix of '+' and '-' characters, or of '#'
            // characters (such as in "C++" or "C#").
            if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
                string::size_type len = term.length();
                if (*j == '#') {
                    // Any number of '#' add just one '#' to the term.
                    term += '#';
                    do { ++j; } while (j != s_end && *j == '#');
                } else {
                    while (j != s_end && (*j == '+' || *j == '-')) {
                        Xapian::Unicode::append_utf8(term, *j);
                        ++j;
                    }
                }
                // Drop the suffix again if it added more than 3 characters
                // or is directly followed by a word character.
                if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
                    term.resize(len);
                } else {
                    term_end = j;
                }
            }
        }
        j = term_end;
        term = Xapian::Unicode::tolower(term);
        int match = word_in_list(term, list);
        if (match == -1) {
            // Not in the list as written, so try the stemmed form.
            string stem = "Z";
            stem += (*stemmer)(term);
            match = word_in_list(stem, list);
        }
        if (match >= 0) {
            // Add the text before the word, then the word with markup
            // around it.
            res += html_escape(string(l, first.raw() - l));
            if (!bra.empty()) {
                res += bra;
            } else {
                static const char * colours[] = {
                    "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
                    "990000", "009900", "996600", "006699", "990099"
                };
                // Cycle through the colours if the list has more entries
                // than there are colours.
                size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
                const char * bg = colours[idx];
                // The backgrounds with an 'f' in them get black text, the
                // others get white.
                if (strchr(bg, 'f')) {
                    res += "<b style=\"color:black;background-color:#";
                } else {
                    res += "<b style=\"color:white;background-color:#";
                }
                res += bg;
                res += "\">";
            }
            // Output the word as it appears in s, not the normalised term.
            word.assign(first.raw(), j.raw() - first.raw());
            res += html_escape(word);
            if (!bra.empty()) {
                res += ket;
            } else {
                res += "</b>";
            }
        } else {
            // No match, so add everything up to the end of the word as it
            // is.
            res += html_escape(string(l, j.raw() - l));
        }
    }
    // Add anything left after the last word.
    if (j != s_end) res += html_escape(string(j.raw(), j.left()));
    return res;
}
 794
#if 0
// Disabled code: it uses url_query_string, the declaration of which is
// commented out earlier in this file.
//
// Writes url_query_string to cout.  If after starts with "&B=" then any
// "&B=" parameters in url_query_string whose value starts with the same
// character as the value in after (i.e. after[3]) are left out.
static void
print_query_string(const char *after)
{
    if (after && strncmp(after, "&B=", 3) == 0) {
        char prefix = after[3];
        // start is the beginning of the text not yet output; amp is where to
        // search for the next '&' from.
        string::size_type start = 0, amp = 0;
        while (true) {
            amp = url_query_string.find('&', amp);
            if (amp == string::npos) {
                cout << url_query_string.substr(start);
                return;
            }
            amp++;
            // Skip each consecutive "B=" parameter with a matching prefix,
            // outputting the text before it first.
            while (url_query_string[amp] == 'B' &&
                   url_query_string[amp + 1] == '=' &&
                   url_query_string[amp + 2] == prefix) {
                cout << url_query_string.substr(start, amp - start - 1);
                start = url_query_string.find('&', amp + 3);
                if (start == string::npos) return;
                amp = start + 1;
            }
        }
    }
    cout << url_query_string;
}
#endif
 822
 823 class Fields {
 824     mutable Xapian::docid did_cached;
 825     mutable map<string, string> fields;
 826
 827     void read_fields(Xapian::docid did) const;
 828
 829   public:
 830     Fields() : did_cached(0) { }
 831
 832     const string & get_field(Xapian::docid did, const string & field) const {
 833         if (did != did_cached) read_fields(did);
 834         return fields[field];
 835     }
 836 };
 837
 838 void
 839 Fields::read_fields(Xapian::docid did) const
 840 {
         // Rebuild the cache for document `did`: discard the previous
         // document's fields, fetch the new document's data from db and split
         // it into name -> value entries.  The layout of the data depends on
         // whether $opt{fieldnames} is set.
         //
         // NOTE(review): did_cached is updated before the document is fetched,
         // so if get_document() throws, later lookups for this docid would see
         // an empty cache rather than retrying - confirm this is intended.
 841     fields.clear();
 842     did_cached = did;
 843     const string & data = db.get_document(did).get_data();
 844
 845     // Parse document data.
 846     string::size_type i = 0;
 847     const string & names = option["fieldnames"];
 848     if (!names.empty()) {
 849         // Each line is a field, with fieldnames taken from corresponding
 850         // entries in the tab-separated list specified by $opt{fieldnames}.
             // insert() doesn't overwrite, so if a name is listed more than once
             // the value from its first occurrence is kept.
 851         string::size_type n = 0;
 852         do {
 853             string::size_type n0 = n;
 854             n = names.find('\t', n);
 855             string::size_type i0 = i;
 856             i = data.find('\n', i);
 857             fields.insert(make_pair(names.substr(n0, n - n0),
 858                                     data.substr(i0, i - i0)));
                 // find() returns npos once the last name or the last line has
                 // been consumed; incrementing npos wraps it to 0, which ends
                 // the loop.  Otherwise the increment steps over the separator.
 859         } while (++n && ++i);
 860     } else {
 861         // Each line is a field, in the format NAME=VALUE.  We assume the field
 862         // name doesn't contain an "=".  Lines without an "=" are currently
 863         // just ignored.
 864         do {
 865             string::size_type i0 = i;
 866             i = data.find('\n', i);
 867             string line(data, i0, i - i0);
 868             string::size_type j = line.find('=');
 869             if (j != string::npos) {
 870                 string & value = fields[line.substr(0, j)];
                     // A repeated field name accumulates its values, separated
                     // by tabs (no tab is added while the value is still empty).
 871                 if (!value.empty()) value += '\t';
 872                 value.append(line, j + 1, string::npos);
 873             }
                 // After the last line i is npos, and ++i wraps to 0 to end the
                 // loop.
 874         } while (++i);
 875     }
 876 }
 877
 878 static Fields fields; // field cache consulted by $field
 879 static Xapian::docid q0; // docid reported by $id; default document for $allterms and $field
 880 static Xapian::doccount hit_no; // 0-based hit index: stepped by $hitlist, reported by $hit
 881 static int percent; // NOTE(review): presumably backs $percentage - set outside this view, confirm
 882 static double weight; // NOTE(review): presumably backs $weight - set outside this view, confirm
 883 static Xapian::doccount collapsed; // value reported by $collapsed
 884
     // Forward declaration: eval() calls this for each hit when handling
     // $hitlist.
 885 static string print_caption(const string &fmt, const vector<string> &param);
 886
 887 enum tagval {
     // One tag per built-in OmegaScript command; eval() switches on the tag
     // stored in the command's func_attrib.  CMD_ is the tag of the entry with
     // the empty name.  CMD_MACRO must remain the last enumerator: $def gives
     // each user-defined macro the tag CMD_MACRO + (its index in `macros`).
 888 CMD_,
 889 CMD_add,
 890 CMD_addfilter,
 891 CMD_allterms,
 892 CMD_and,
 893 CMD_cgi,
 894 CMD_cgilist,
 895 CMD_chr,
 896 CMD_collapsed,
 897 CMD_contains,
 898 CMD_csv,
 899 CMD_date,
 900 CMD_dbname,
 901 CMD_dbsize,
 902 CMD_def,
 903 CMD_defaultop,
 904 CMD_div,
 905 CMD_eq,
 906 CMD_emptydocs,
 907 CMD_env,
 908 CMD_error,
 909 CMD_field,
 910 CMD_filesize,
 911 CMD_filters,
 912 CMD_filterterms,
 913 CMD_find,
 914 CMD_fmt,
 915 CMD_freq,
 916 CMD_ge,
 917 CMD_gt,
 918 CMD_highlight,
 919 CMD_hit,
 920 CMD_hitlist,
 921 CMD_hitsperpage,
 922 CMD_hostname,
 923 CMD_html,
 924 CMD_htmlstrip,
 925 CMD_httpheader,
 926 CMD_id,
 927 CMD_if,
 928 CMD_include,
 929 CMD_json,
 930 CMD_jsonarray,
 931 CMD_last,
 932 CMD_lastpage,
 933 CMD_le,
 934 CMD_length,
 935 CMD_list,
 936 CMD_log,
 937 CMD_lookup,
 938 CMD_lower,
 939 CMD_lt,
 940 CMD_map,
 941 CMD_match,
 942 CMD_max,
 943 CMD_min,
 944 CMD_mod,
 945 CMD_msize,
 946 CMD_msizeexact,
 947 CMD_msizelower,
 948 CMD_msizeupper,
 949 CMD_mul,
 950 CMD_muldiv,
 951 CMD_ne,
 952 CMD_nice,
 953 CMD_not,
 954 CMD_now,
 955 CMD_opt,
 956 CMD_or,
 957 CMD_ord,
 958 CMD_pack,
 959 CMD_percentage,
 960 CMD_prettyterm,
 961 CMD_prettyurl,
 962 CMD_query,
 963 CMD_querydescription,
 964 CMD_queryterms,
 965 CMD_range,
 966 CMD_record,
 967 CMD_relevant,
 968 CMD_relevants,
 969 CMD_score,
 970 CMD_set,
 971 CMD_setmap,
 972 CMD_setrelevant,
 973 CMD_slice,
 974 CMD_snippet,
 975 CMD_split,
 976 CMD_stoplist,
 977 CMD_sub,
 978 CMD_substr,
 979 CMD_suggestion,
 980 CMD_terms,
 981 CMD_thispage,
 982 CMD_time,
 983 CMD_topdoc,
 984 CMD_topterms,
 985 CMD_transform,
 986 CMD_truncate,
 987 CMD_uniq,
 988 CMD_unpack,
 989 CMD_unstem,
 990 CMD_upper,
 991 CMD_url,
 992 CMD_value,
 993 CMD_version,
 994 CMD_weight,
 995 CMD_MACRO // special tag for macro evaluation; keep last (see $def)
 996 };
 997
 998 struct func_attrib {
         // How eval() must treat a command.  -1 (spelt N in func_tab) is a
         // special value in the three argument-count fields.
         //
         // tag: tagval of the command; user macros get CMD_MACRO + index.
 999     int tag;
         // minargs/maxargs: allowed argument count.  minargs == -1 means the
         // text between the braces is passed as one argument without being
         // split at commas, counted or evaluated; maxargs == -1 means no upper
         // limit.
         // evalargs: how many leading arguments eval() evaluates before running
         // the command (-1 = all of them); the rest are passed unevaluated.
1000     int minargs, maxargs, evalargs;
         // ensure: 'Q' makes eval() call ensure_query_parsed() first; 'M' makes
         // it call ensure_query_parsed() and then ensure_match(); 0 neither.
1001     char ensure;
1002 };
1003
     // T(name, minargs, maxargs, evalargs, ensure) expands to the func_desc
     // initialiser for a built-in command: the name comes from STRINGIZE(F) and
     // the tag is CMD_<name>.
1004 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
     // One row of func_tab: a command name and its attributes.
1005 struct func_desc {
1006     const char *name;
1007     struct func_attrib a;
1008 };
1009
1010 #define N -1
1011 #define M 'M'
1012 #define Q 'Q'
     // Shorthands for the table below.  N (-1) in an argument-count column
     // means "no upper limit" for maxargs, "evaluate every argument" for
     // evalargs and, for minargs, "pass the argument text through unsplit and
     // unchecked".  M and Q are values for the ensure column: eval() calls
     // ensure_query_parsed() for either, and additionally ensure_match() for M.
     //
     // The table ends with an entry whose name is NULL.  eval() copies the
     // entries into a name -> attributes map the first time it is called.
     //
1013 // NB when adding a new command which ensures M or Q, update the list in
1014 // docs/omegascript.rst
1015 static struct func_desc func_tab[] = {
1016 //name minargs maxargs evalargs ensure
1017 {"",{CMD_,         N, N, 0, 0}},// empty command name: commented out code
1018 T(add,             0, N, N, 0), // add a list of numbers
1019 T(addfilter,       1, 1, N, 0), // add filter term
1020 T(allterms,        0, 1, N, 0), // list of all terms matching document
1021 T(and,             1, N, 0, 0), // logical shortcutting and of a list of values
1022 T(cgi,             1, 1, N, 0), // return cgi parameter value
1023 T(cgilist,         1, 1, N, 0), // return list of values for cgi parameter
1024 T(chr,             1, 1, N, 0), // return UTF-8 for given Unicode codepoint
1025 T(collapsed,       0, 0, N, 0), // return number of hits collapsed into this
1026 T(contains,        2, 2, N, 0), // return position of substring, or empty string
1027 T(csv,             1, 2, N, 0), // CSV string escaping
1028 T(date,            1, 2, N, 0), // convert time_t to strftime format
1029                                 // (default: YYYY-MM-DD)
1030 T(dbname,          0, 0, N, 0), // database name
1031 T(dbsize,          0, 0, N, 0), // database size (# of documents)
1032 T(def,             2, 2, 1, 0), // define a macro
1033 T(defaultop,       0, 0, N, 0), // default operator: "and" or "or"
1034 T(div,             2, 2, N, 0), // integer divide
1035 T(emptydocs,       0, 1, N, 0), // list of empty documents
1036 T(env,             1, 1, N, 0), // environment variable
1037 T(error,           0, 0, N, 0), // error message
1038 T(eq,              2, 2, N, 0), // test equality
1039 T(field,           1, 2, N, 0), // lookup field in record
1040 T(filesize,        1, 1, N, 0), // pretty printed filesize
1041 T(filters,         0, 0, N, 0), // serialisation of current filters
1042 T(filterterms,     1, 1, N, 0), // list of terms with a given prefix
1043 T(find,            2, 2, N, 0), // find entry in list
1044 T(fmt,             0, 0, N, 0), // name of current format
1045 T(freq,            1, 1, N, 0), // frequency of a term
1046 T(ge,              2, 2, N, 0), // test >=
1047 T(gt,              2, 2, N, 0), // test >
1048 T(highlight,       2, 4, N, 0), // html escape and highlight words from list
1049 T(hit,             0, 0, N, 0), // hit number of current mset entry (0-based)
1050 T(hitlist,         1, 1, 0, M), // display hitlist using format in argument
1051 T(hitsperpage,     0, 0, N, 0), // hits per page
1052 T(hostname,        1, 1, N, 0), // extract hostname from URL
1053 T(html,            1, 1, N, 0), // html escape string (<>&")
1054 T(htmlstrip,       1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
1055 T(httpheader,      2, 2, N, 0), // arbitrary HTTP header
1056 T(id,              0, 0, N, 0), // docid of current doc
1057 T(if,              2, 3, 1, 0), // conditional
1058 T(include,         1, 1, 1, 0), // include another file
1059 T(json,            1, 1, N, 0), // JSON string escaping
1060 T(jsonarray,       1, 1, N, 0), // Format list as a JSON array of strings
1061 T(last,            0, 0, N, M), // hit number one beyond end of current page
1062 T(lastpage,        0, 0, N, M), // number of last hit page
1063 T(le,              2, 2, N, 0), // test <=
1064 T(length,          1, 1, N, 0), // length of list
1065 T(list,            2, 5, N, 0), // pretty print list
1066 T(log,             1, 2, 1, 0), // create a log entry
1067 T(lookup,          2, 2, N, 0), // lookup in named cdb file
1068 T(lower,           1, 1, N, 0), // convert string to lower case
1069 T(lt,              2, 2, N, 0), // test <
1070 T(map,             1, 2, 1, 0), // map a list into another list
1071 T(match,           2, 3, N, 0), // regex match
1072 T(max,             1, N, N, 0), // maximum of a list of values
1073 T(min,             1, N, N, 0), // minimum of a list of values
1074 T(mod,             2, 2, N, 0), // integer modulus
1075 T(msize,           0, 0, N, M), // number of matches (estimated)
1076 T(msizeexact,      0, 0, N, M), // is $msize exact?
1077 T(msizelower,      0, 0, N, M), // number of matches (lower bound)
1078 T(msizeupper,      0, 0, N, M), // number of matches (upper bound)
1079 T(mul,             2, N, N, 0), // multiply a list of numbers
1080 T(muldiv,          3, 3, N, 0), // calculate A*B/C
1081 T(ne,              2, 2, N, 0), // test not equal
1082 T(nice,            1, 1, N, 0), // pretty print integer (with thousands sep)
1083 T(not,             1, 1, N, 0), // logical not
1084 T(now,             0, 0, N, 0), // current date/time as a time_t
1085 T(opt,             1, 2, N, 0), // lookup an option value
1086 T(or,              1, N, 0, 0), // logical shortcutting or of a list of values
1087 T(ord,             1, 1, N, 0), // return codepoint for first character of UTF-8 string
1088 T(pack,            1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1089 T(percentage,      0, 0, N, 0), // percentage score of current hit
1090 T(prettyterm,      1, 1, N, Q), // pretty print term name
1091 T(prettyurl,       1, 1, N, 0), // pretty version of URL
1092 T(query,           0, 1, N, Q), // query
1093 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
1094 T(queryterms,      0, 0, N, Q), // list of query terms
1095 T(range,           2, 2, N, 0), // return list of values between start and end
1096 T(record,          0, 1, N, 0), // record contents of document
1097 T(relevant,        0, 1, N, Q), // is document relevant?
1098 T(relevants,       0, 0, N, Q), // return list of relevant documents
1099 T(score,           0, 0, N, 0), // score (0-10) of current hit
1100 T(set,             2, 2, N, 0), // set option value
1101 T(setmap,          1, N, N, 0), // set map of option values
1102 T(setrelevant,     0, 1, N, Q), // set rset
1103 T(slice,           2, 2, N, 0), // slice a list using a second list
1104 T(snippet,         1, 2, N, M), // generate snippet from text
1105 T(split,           1, 2, N, 0), // split a string to give a list
1106 T(stoplist,        0, 0, N, Q), // return list of stopped terms
1107 T(sub,             2, 2, N, 0), // subtract
1108 T(substr,          2, 3, N, 0), // substring
1109 T(suggestion,      0, 0, N, Q), // misspelled word correction suggestion
1110 T(terms,           0, 1, N, M), // list of matching terms
1111 T(thispage,        0, 0, N, M), // page number of current page
1112 T(time,            0, 0, N, M), // how long the match took (in seconds)
1113 T(topdoc,          0, 0, N, M), // first document on current page of hit list
1114                                 // (counting from 0)
1115 T(topterms,        0, 1, N, M), // list of up to N top relevance feedback terms
1116                                 // (default 16)
1117 T(transform,       3, 4, N, 0), // transform with a regexp
1118 T(truncate,        2, 4, N, 0), // truncate after a word
1119 T(uniq,            1, 1, N, 0), // remove duplicates from a sorted list
1120 T(unpack,          1, 1, N, 0), // convert 4 byte big endian binary string to a number
1121 T(unstem,          1, 1, N, Q), // return list of probabilistic terms from
1122                                 // the query which stemmed to this term
1123 T(upper,           1, 1, N, 0), // convert string to upper case
1124 T(url,             1, 1, N, 0), // url encode argument
1125 T(value,           1, 2, N, 0), // return document value
1126 T(version,         0, 0, N, 0), // omega version string
1127 T(weight,          0, 0, N, 0), // weight of the current hit
1128 { NULL,{0,         0, 0, 0, 0}}
1129 };
1130
1131 #undef T // Leaving T defined screws up Sun's C++ compiler!
1132
     // Bodies of the macros defined with $def, in definition order; a macro's
     // tag is CMD_MACRO plus its index here.
1133 static vector<string> macros;
1134
1135 // Call write() repeatedly until all data is written or we get a
1136 // non-recoverable error.
     //
     // Returns 0 once all `count` bytes starting at `buf` have been written to
     // `fd`, or the negative result of the write() call which failed (nothing
     // here touches errno after that call).  A write() which fails with EINTR
     // is simply retried.
1137 static ssize_t
1138 write_all(int fd, const char * buf, size_t count)
1139 {
1140     while (count) {
1141         ssize_t r = write(fd, buf, count);
1142         if (rare(r < 0)) {
1143             if (errno == EINTR) continue;
1144             return r;
1145         }
             // Possibly a partial write: move past the r bytes just written.
1146         buf += r;
1147         count -= r;
1148     }
1149     return 0;
1150 }
1151
1152 static string
1153 eval(const string &fmt, const vector<string> &param)
1154 {
1155     static map<string, const struct func_attrib *> func_map;
1156     if (func_map.empty()) {
1157         struct func_desc *p;
1158         for (p = func_tab; p->name != NULL; p++) {
1159             func_map[string(p->name)] = &(p->a);
1160         }
1161     }
1162     string res;
1163     string::size_type p = 0, q;
1164     while ((q = fmt.find('$', p)) != string::npos) try {
1165         res.append(fmt, p, q - p);
1166         string::size_type code_start = q; // note down for error reporting
1167         q++;
1168         if (q >= fmt.size()) break;
1169         unsigned char ch = fmt[q];
1170         switch (ch) {
1171             // Magic sequences:
1172             // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1173             case '$':
1174                 res += '$';
1175                 p = q + 1;
1176                 continue;
1177             case '(':
1178                 res += '{';
1179                 p = q + 1;
1180                 continue;
1181             case ')':
1182                 res += '}';
1183                 p = q + 1;
1184                 continue;
1185             case '.':
1186                 res += ',';
1187                 p = q + 1;
1188                 continue;
1189             case '_':
1190                 ch = '0';
1191                 // FALL THRU
1192             case '1': case '2': case '3': case '4': case '5':
1193             case '6': case '7': case '8': case '9':
1194                 ch -= '0';
1195                 if (ch < param.size()) res += param[ch];
1196                 p = q + 1;
1197                 continue;
1198             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1199             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1200             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1201             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1202             case 'y': case 'z':
1203             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1204             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1205             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1206             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1207             case 'Y': case 'Z':
1208             case '{':
1209                 break;
1210             default:
1211                 string msg = "Unknown $ code in: $";
1212                 msg.append(fmt, q, string::npos);
1213                 throw msg;
1214         }
1215         p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1216         string var(fmt, q, p - q);
1217         map<string, const struct func_attrib *>::const_iterator func;
1218         func = func_map.find(var);
1219         if (func == func_map.end()) {
1220             throw "Unknown function '" + var + "'";
1221         }
1222         vector<string> args;
1223         if (fmt[p] == '{') {
1224             q = p + 1;
1225             int nest = 1;
1226             while (true) {
1227                 p = fmt.find_first_of(",{}", p + 1);
1228                 if (p == string::npos)
1229                     throw "missing } in " + fmt.substr(code_start);
1230                 if (fmt[p] == '{') {
1231                     ++nest;
1232                 } else {
1233                     if (nest == 1) {
1234                         // should we split the args
1235                         if (func->second->minargs != N) {
1236                             args.push_back(fmt.substr(q, p - q));
1237                             q = p + 1;
1238                         }
1239                     }
1240                     if (fmt[p] == '}' && --nest == 0) break;
1241                 }
1242             }
1243             if (func->second->minargs == N)
1244                 args.push_back(fmt.substr(q, p - q));
1245             p++;
1246         }
1247
1248         if (func->second->minargs != N) {
1249             if (int(args.size()) < func->second->minargs)
1250                 throw "too few arguments to $" + var;
1251             if (func->second->maxargs != N &&
1252                 int(args.size()) > func->second->maxargs)
1253                 throw "too many arguments to $" + var;
1254
1255             vector<string>::size_type n;
1256             if (func->second->evalargs != N)
1257                 n = func->second->evalargs;
1258             else
1259                 n = args.size();
1260
1261             for (vector<string>::size_type j = 0; j < n; j++)
1262                 args[j] = eval(args[j], param);
1263         }
1264         if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1265             ensure_query_parsed();
1266         if (func->second->ensure == 'M') ensure_match();
1267         string value;
1268         switch (func->second->tag) {
1269             case CMD_:
1270                 break;
1271             case CMD_add: {
1272                 int total = 0;
1273                 vector<string>::const_iterator i;
1274                 for (i = args.begin(); i != args.end(); i++)
1275                     total += string_to_int(*i);
1276                 value = str(total);
1277                 break;
1278             }
1279             case CMD_addfilter:
1280                 add_bterm(args[0]);
1281                 break;
1282             case CMD_allterms: {
1283                 // list of all terms indexing document
1284                 int id = q0;
1285                 if (!args.empty()) id = string_to_int(args[0]);
1286                 for (Xapian::TermIterator term = db.termlist_begin(id);
1287                      term != db.termlist_end(id); term++) {
1288                     value += *term;
1289                     value += '\t';
1290                 }
1291
1292                 if (!value.empty()) value.erase(value.size() - 1);
1293                 break;
1294             }
1295             case CMD_and: {
1296                 value = "true";
1297                 for (vector<string>::const_iterator i = args.begin();
1298                      i != args.end(); i++) {
1299                     if (eval(*i, param).empty()) {
1300                         value.resize(0);
1301                         break;
1302                     }
1303                 }
1304                 break;
1305             }
1306             case CMD_cgi: {
1307                 MCI i = cgi_params.find(args[0]);
1308                 if (i != cgi_params.end()) value = i->second;
1309                 break;
1310             }
1311             case CMD_cgilist: {
1312                 pair<MCI, MCI> g;
1313                 g = cgi_params.equal_range(args[0]);
1314                 for (MCI i = g.first; i != g.second; i++) {
1315                     value += i->second;
1316                     value += '\t';
1317                 }
1318                 if (!value.empty()) value.erase(value.size() - 1);
1319                 break;
1320             }
1321             case CMD_chr:
1322                 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1323                 break;
1324             case CMD_collapsed: {
1325                 value = str(collapsed);
1326                 break;
1327             }
1328             case CMD_contains: {
1329                 size_t pos = args[1].find(args[0]);
1330                 if (pos != string::npos) {
1331                     value = str(pos);
1332                 }
1333                 break;
1334             }
1335             case CMD_csv:
1336                 value = args[0];
1337                 if (args.size() > 1 && !args[1].empty()) {
1338                     csv_escape_always(value);
1339                 } else {
1340                     csv_escape(value);
1341                 }
1342                 break;
1343             case CMD_date:
1344                 value = args[0];
1345                 if (!value.empty()) {
1346                     char buf[64] = "";
1347                     time_t date = string_to_int(value);
1348                     if (date != static_cast<time_t>(-1)) {
1349                         struct tm *then;
1350                         then = gmtime(&date);
1351                         string date_fmt = "%Y-%m-%d";
1352                         if (args.size() > 1) date_fmt = eval(args[1], param);
1353                         strftime(buf, sizeof buf, date_fmt.c_str(), then);
1354                     }
1355                     value = buf;
1356                 }
1357                 break;
1358             case CMD_dbname:
1359                 value = dbname;
1360                 break;
1361             case CMD_dbsize: {
1362                 static Xapian::doccount dbsize;
1363                 if (!dbsize) dbsize = db.get_doccount();
1364                 value = str(dbsize);
1365                 break;
1366             }
1367             case CMD_def: {
1368                 func_attrib *fa = new func_attrib;
1369                 fa->tag = CMD_MACRO + macros.size();
1370                 fa->minargs = 0;
1371                 fa->maxargs = 9;
1372                 fa->evalargs = N; // FIXME: or 0?
1373                 fa->ensure = 0;
1374
1375                 macros.push_back(args[1]);
1376                 func_map[args[0]] = fa;
1377                 break;
1378             }
1379             case CMD_defaultop:
1380                 if (default_op == Xapian::Query::OP_AND) {
1381                     value = "and";
1382                 } else {
1383                     value = "or";
1384                 }
1385                 break;
1386             case CMD_div: {
1387                 int denom = string_to_int(args[1]);
1388                 if (denom == 0) {
1389                     value = "divide by 0";
1390                 } else {
1391                     value = str(string_to_int(args[0]) /
1392                                 string_to_int(args[1]));
1393                 }
1394                 break;
1395             }
1396             case CMD_eq:
1397                 if (args[0] == args[1]) value = "true";
1398                 break;
1399             case CMD_emptydocs: {
1400                 string t;
1401                 if (!args.empty())
1402                     t = args[0];
1403                 Xapian::PostingIterator i;
1404                 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1405                     if (i.get_doclength() != 0) continue;
1406                     if (!value.empty()) value += '\t';
1407                     value += str(*i);
1408                 }
1409                 break;
1410             }
1411             case CMD_env: {
1412                 char *env = getenv(args[0].c_str());
1413                 if (env != NULL) value = env;
1414                 break;
1415             }
1416             case CMD_error:
1417                 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1418                     error_msg = "Database '" + dbname + "' couldn't be opened";
1419                 }
1420                 value = error_msg;
1421                 break;
1422             case CMD_field: {
1423                 Xapian::docid did = q0;
1424                 if (args.size() > 1) did = string_to_int(args[1]);
1425                 value = fields.get_field(did, args[0]);
1426                 break;
1427             }
1428             case CMD_filesize: {
1429                 // FIXME: rounding?  i18n?
1430                 int size = string_to_int(args[0]);
1431                 int intpart = size;
1432                 int fraction = -1;
1433                 const char * format = 0;
1434                 if (size < 0) {
1435                     // Negative size -> empty result.
1436                 } else if (size == 1) {
1437                     format = "%d byte";
1438                 } else if (size < 1024) {
1439                     format = "%d bytes";
1440                 } else {
1441                     if (size < 1024*1024) {
1442                         format = "%d.%cK";
1443                     } else {
1444                         size /= 1024;
1445                         if (size < 1024*1024) {
1446                             format = "%d.%cM";
1447                         } else {
1448                             size /= 1024;
1449                             format = "%d.%cG";
1450                         }
1451                     }
1452                     intpart = unsigned(size) / 1024;
1453                     fraction = unsigned(size) % 1024;
1454                 }
1455                 if (format) {
1456                     char buf[200];
1457                     int len;
1458                     if (fraction == -1) {
1459                         len = my_snprintf(buf, sizeof(buf), format, intpart);
1460                     } else {
1461                         fraction = (fraction * 10 / 1024) + '0';
1462                         len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1463                     }
1464                     if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1465                     value.assign(buf, len);
1466                 }
1467                 break;
1468             }
1469             case CMD_filters:
1470                 value = filters;
1471                 break;
1472             case CMD_filterterms: {
1473                 Xapian::TermIterator term = db.allterms_begin();
1474                 term.skip_to(args[0]);
1475                 while (term != db.allterms_end()) {
1476                     string t = *term;
1477                     if (!startswith(t, args[0])) break;
1478                     value += t;
1479                     value += '\t';
1480                     ++term;
1481                 }
1482
1483                 if (!value.empty()) value.erase(value.size() - 1);
1484                 break;
1485             }
1486             case CMD_find: {
1487                 string l = args[0], s = args[1];
1488                 string::size_type i = 0, j = 0;
1489                 size_t count = 0;
1490                 while (j != l.size()) {
1491                     j = l.find('\t', i);
1492                     if (j == string::npos) j = l.size();
1493                     if (j - i == s.length()) {
1494                         if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1495                             value = str(count);
1496                             break;
1497                         }
1498                     }
1499                     ++count;
1500                     i = j + 1;
1501                 }
1502                 break;
1503             }
1504             case CMD_fmt:
1505                 value = fmtname;
1506                 break;
1507             case CMD_freq:
1508                 try {
1509                     value = str(mset.get_termfreq(args[0]));
1510                 } catch (const Xapian::InvalidOperationError&) {
1511                     // An MSet will raise this error if it's empty and not
1512                     // associated with a search.
1513                     value = str(db.get_termfreq(args[0]));
1514                 }
1515                 break;
1516             case CMD_ge:
1517                 if (string_to_int(args[0]) >= string_to_int(args[1]))
1518                     value = "true";
1519                 break;
1520             case CMD_gt:
1521                 if (string_to_int(args[0]) > string_to_int(args[1]))
1522                     value = "true";
1523                 break;
1524             case CMD_highlight: {
1525                 string bra, ket;
1526                 if (args.size() > 2) {
1527                     bra = args[2];
1528                     if (args.size() > 3) {
1529                         ket = args[3];
1530                     } else {
1531                         string::const_iterator i;
1532                         i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1533                         ket = "</";
1534                         ket.append(bra, 1, i - bra.begin() - 1);
1535                         ket += '>';
1536                     }
1537                 }
1538
1539                 value = html_highlight(args[0], args[1], bra, ket);
1540                 break;
1541             }
1542             case CMD_hit:
1543                 // 0-based mset index
1544                 value = str(hit_no);
1545                 break;
1546             case CMD_hitlist:
1547 #if 0
1548                 url_query_string = "?DB=";
1549                 url_query_string += dbname;
1550                 multimap<string, string>::const_iterator j;
1551                 for (j = probabilistic_query.begin();
1552                      j != probabilistic_query.end();
1553                      ++j) {
1554                     if (j->first.empty()) {
1555                         url_query_string += "&P=";
1556                     } else {
1557                         url_query_string += "&P."
1558                         url_query_string += j->first;
1559                         url_query_string += '=';
1560                     }
1561                     const char *q = j->second.c_str();
1562                     int ch;
1563                     while ((ch = *q++) != '\0') {
1564                         switch (ch) {
1565                          case '+':
1566                             url_query_string += "%2b";
1567                             break;
1568                          case '"':
1569                             url_query_string += "%22";
1570                             break;
1571                          case '%':
1572                             url_query_string += "%25";
1573                             break;
1574                          case '&':
1575                             url_query_string += "%26";
1576                             break;
1577                          case ' ':
1578                             ch = '+';
1579                             /* fall through */
1580                          default:
1581                             url_query_string += ch;
1582                         }
1583                     }
1584                 }
1585                 // add any boolean terms
1586                 for (FMCI i = filter_map.begin(); i != filter_map.end(); i++) {
1587                     url_query_string += "&B=";
1588                     url_query_string += i->second;
1589                 }
1590 #endif
1591                 for (hit_no = topdoc; hit_no < last; hit_no++)
1592                     value += print_caption(args[0], param);
1593                 hit_no = 0;
1594                 break;
1595             case CMD_hitsperpage:
1596                 value = str(hits_per_page);
1597                 break;
1598             case CMD_hostname: {
1599                 value = args[0];
1600                 // remove URL scheme and/or path
1601                 string::size_type i = value.find("://");
1602                 if (i == string::npos) i = 0; else i += 3;
1603                 value = value.substr(i, value.find('/', i) - i);
1604                 // remove user@ or user:password@
1605                 i = value.find('@');
1606                 if (i != string::npos) value.erase(0, i + 1);
1607                 // remove :port
1608                 i = value.find(':');
1609                 if (i != string::npos) value.resize(i);
1610                 break;
1611             }
1612             case CMD_html:
1613                 value = html_escape(args[0]);
1614                 break;
1615             case CMD_htmlstrip:
1616                 value = html_strip(args[0]);
1617                 break;
1618             case CMD_httpheader:
1619                 if (!suppress_http_headers) {
1620                     cout << args[0] << ": " << args[1] << endl;
1621                     if (!set_content_type && args[0].length() == 12 &&
1622                             strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1623                         set_content_type = true;
1624                     }
1625                 }
1626                 break;
1627             case CMD_id:
1628                 // document id
1629                 value = str(q0);
1630                 break;
1631             case CMD_if:
1632                 if (!args[0].empty())
1633                     value = eval(args[1], param);
1634                 else if (args.size() > 2)
1635                     value = eval(args[2], param);
1636                 break;
1637             case CMD_include:
1638                 value = eval_file(args[0]);
1639                 break;
1640             case CMD_json:
1641                 value = args[0];
1642                 json_escape(value);
1643                 break;
1644             case CMD_jsonarray: {
1645                 const string & l = args[0];
1646                 string::size_type i = 0, j;
1647                 if (l.empty()) {
1648                     value = "[]";
1649                     break;
1650                 }
1651                 value = "[\"";
1652                 while (true) {
1653                     j = l.find('\t', i);
1654                     string elt(l, i, j - i);
1655                     json_escape(elt);
1656                     value += elt;
1657                     if (j == string::npos) break;
1658                     value += "\",\"";
1659                     i = j + 1;
1660                 }
1661                 value += "\"]";
1662                 break;
1663             }
1664             case CMD_last:
1665                 value = str(last);
1666                 break;
1667             case CMD_lastpage: {
1668                 int l = mset.get_matches_estimated();
1669                 if (l > 0) l = (l - 1) / hits_per_page + 1;
1670                 value = str(l);
1671                 break;
1672             }
1673             case CMD_le:
1674                 if (string_to_int(args[0]) <= string_to_int(args[1]))
1675                     value = "true";
1676                 break;
1677             case CMD_length:
1678                 if (args[0].empty()) {
1679                     value = "0";
1680                 } else {
1681                     size_t length = count(args[0].begin(), args[0].end(), '\t');
1682                     value = str(length + 1);
1683                 }
1684                 break;
1685             case CMD_list: {
1686                 if (!args[0].empty()) {
1687                     string pre, inter, interlast, post;
1688                     switch (args.size()) {
1689                      case 2:
1690                         inter = interlast = args[1];
1691                         break;
1692                      case 3:
1693                         inter = args[1];
1694                         interlast = args[2];
1695                         break;
1696                      case 4:
1697                         pre = args[1];
1698                         inter = interlast = args[2];
1699                         post = args[3];
1700                         break;
1701                      case 5:
1702                         pre = args[1];
1703                         inter = args[2];
1704                         interlast = args[3];
1705                         post = args[4];
1706                         break;
1707                     }
1708                     value += pre;
1709                     string list = args[0];
1710                     string::size_type split = 0, split2;
1711                     while ((split2 = list.find('\t', split)) != string::npos) {
1712                         if (split) value += inter;
1713                         value.append(list, split, split2 - split);
1714                         split = split2 + 1;
1715                     }
1716                     if (split) value += interlast;
1717                     value.append(list, split, string::npos);
1718                     value += post;
1719                 }
1720                 break;
1721             }
1722             case CMD_log: {
1723                 if (!vet_filename(args[0])) break;
1724                 string logfile = log_dir + args[0];
1725                 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1726                 if (fd == -1) break;
1727                 vector<string> noargs;
1728                 noargs.resize(1);
1729                 string line;
1730                 if (args.size() > 1) {
1731                     line = args[1];
1732                 } else {
1733                     line = DEFAULT_LOG_ENTRY;
1734                 }
1735                 line = eval(line, noargs);
1736                 line += '\n';
1737                 (void)write_all(fd, line.data(), line.length());
1738                 close(fd);
1739                 break;
1740             }
1741             case CMD_lookup: {
1742                 if (!vet_filename(args[0])) break;
1743                 string cdbfile = cdb_dir + args[0];
1744                 int fd = open(cdbfile.c_str(), O_RDONLY);
1745                 if (fd == -1) break;
1746
1747                 struct cdb cdb;
1748                 cdb_init(&cdb, fd);
1749
1750                 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1751                     size_t datalen = cdb_datalen(&cdb);
1752                     const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1753                     if (q) {
1754                         value.assign(static_cast<const char *>(dat), datalen);
1755                     }
1756                 }
1757
1758                 cdb_free(&cdb);
1759                 close(fd); // FIXME: cache fds?
1760                 break;
1761             }
1762             case CMD_lower:
1763                 value = Xapian::Unicode::tolower(args[0]);
1764                 break;
1765             case CMD_lt:
1766                 if (string_to_int(args[0]) < string_to_int(args[1]))
1767                     value = "true";
1768                 break;
1769             case CMD_map:
1770                 if (!args[0].empty()) {
1771                     string l = args[0], pat = args[1];
1772                     vector<string> new_args(param);
1773                     string::size_type i = 0, j;
1774                     while (true) {
1775                         j = l.find('\t', i);
1776                         new_args[0] = l.substr(i, j - i);
1777                         value += eval(pat, new_args);
1778                         if (j == string::npos) break;
1779                         value += '\t';
1780                         i = j + 1;
1781                     }
1782                 }
1783                 break;
1784             case CMD_match:
1785                 omegascript_match(value, args);
1786                 break;
1787             case CMD_max: {
1788                 vector<string>::const_iterator i = args.begin();
1789                 int val = string_to_int(*i++);
1790                 for (; i != args.end(); i++) {
1791                     int x = string_to_int(*i);
1792                     if (x > val) val = x;
1793                 }
1794                 value = str(val);
1795                 break;
1796             }
1797             case CMD_min: {
1798                 vector<string>::const_iterator i = args.begin();
1799                 int val = string_to_int(*i++);
1800                 for (; i != args.end(); i++) {
1801                     int x = string_to_int(*i);
1802                     if (x < val) val = x;
1803                 }
1804                 value = str(val);
1805                 break;
1806             }
1807             case CMD_msize:
1808                 // Estimated number of matches.
1809                 value = str(mset.get_matches_estimated());
1810                 break;
1811             case CMD_msizeexact:
1812                 // Is msize exact?
1813                 if (mset.get_matches_lower_bound()
1814                     == mset.get_matches_upper_bound())
1815                     value = "true";
1816                 break;
1817             case CMD_msizelower:
1818                 // Lower bound on number of matches.
1819                 value = str(mset.get_matches_lower_bound());
1820                 break;
1821             case CMD_msizeupper:
1822                 // Upper bound on number of matches.
1823                 value = str(mset.get_matches_upper_bound());
1824                 break;
1825             case CMD_mod: {
1826                 int denom = string_to_int(args[1]);
1827                 if (denom == 0) {
1828                     value = "divide by 0";
1829                 } else {
1830                     value = str(string_to_int(args[0]) %
1831                                 string_to_int(args[1]));
1832                 }
1833                 break;
1834             }
1835             case CMD_mul: {
1836                 vector<string>::const_iterator i = args.begin();
1837                 int total = string_to_int(*i++);
1838                 while (i != args.end())
1839                     total *= string_to_int(*i++);
1840                 value = str(total);
1841                 break;
1842             }
1843             case CMD_muldiv: {
1844                 int denom = string_to_int(args[2]);
1845                 if (denom == 0) {
1846                     value = "divide by 0";
1847                 } else {
1848                     int num = string_to_int(args[0]) * string_to_int(args[1]);
1849                     value = str(num / denom);
1850                 }
1851                 break;
1852             }
1853             case CMD_ne:
1854                 if (args[0] != args[1]) value = "true";
1855                 break;
1856             case CMD_nice: {
1857                 string::const_iterator i = args[0].begin();
1858                 int len = args[0].length();
1859                 while (len) {
1860                     value += *i++;
1861                     if (--len && len % 3 == 0) value += option["thousand"];
1862                 }
1863                 break;
1864             }
1865             case CMD_not:
1866                 if (args[0].empty()) value = "true";
1867                 break;
1868             case CMD_now: {
1869                 char buf[64];
1870                 my_snprintf(buf, sizeof(buf), "%lu",
1871                             static_cast<unsigned long>(time(NULL)));
1872                 // MSVC's snprintf omits the zero byte if the string is
1873                 // sizeof(buf) long.
1874                 buf[sizeof(buf) - 1] = '\0';
1875                 value = buf;
1876                 break;
1877             }
1878             case CMD_opt:
1879                 if (args.size() == 2) {
1880                     value = option[args[0] + "," + args[1]];
1881                 } else {
1882                     value = option[args[0]];
1883                 }
1884                 break;
1885             case CMD_or: {
1886                 for (vector<string>::const_iterator i = args.begin();
1887                      i != args.end(); i++) {
1888                     value = eval(*i, param);
1889                     if (!value.empty()) break;
1890                 }
1891                 break;
1892             }
1893             case CMD_ord: {
1894                 if (!args[0].empty()) {
1895                     Utf8Iterator it(args[0]);
1896                     value = str(*it);
1897                 }
1898                 break;
1899             }
1900             case CMD_pack:
1901                 value = int_to_binary_string(string_to_int(args[0]));
1902                 break;
1903             case CMD_percentage:
1904                 // percentage score
1905                 value = str(percent);
1906                 break;
1907             case CMD_prettyterm:
1908                 value = pretty_term(args[0]);
1909                 break;
1910             case CMD_prettyurl:
1911                 value = args[0];
1912                 url_prettify(value);
1913                 break;
1914             case CMD_query: {
1915                 pair<multimap<string, string>::const_iterator,
1916                      multimap<string, string>::const_iterator> r;
1917                 r = probabilistic_query.equal_range(args.empty() ?
1918                                                     string() : args[0]);
1919                 multimap<string, string>::const_iterator j;
1920                 for (j = r.first; j != r.second; ++j) {
1921                     if (!value.empty()) value += '\t';
1922                     const string & s = j->second;
1923                     size_t start = 0, tab;
1924                     while ((tab = s.find('\t', start)) != string::npos) {
1925                         value.append(s, start, tab - start);
1926                         value += ' ';
1927                         start = tab + 1;
1928                     }
1929                     value.append(s, start, string::npos);
1930                 }
1931                 break;
1932             }
1933             case CMD_querydescription:
1934                 value = query.get_description();
1935                 break;
1936             case CMD_queryterms:
1937                 value = queryterms;
1938                 break;
1939             case CMD_range: {
1940                 int start = string_to_int(args[0]);
1941                 int end = string_to_int(args[1]);
1942                 while (start <= end) {
1943                     value += str(start);
1944                     if (start < end) value += '\t';
1945                     start++;
1946                 }
1947                 break;
1948             }
1949             case CMD_record: {
1950                 int id = q0;
1951                 if (!args.empty()) id = string_to_int(args[0]);
1952                 value = db.get_document(id).get_data();
1953                 break;
1954             }
1955             case CMD_relevant: {
1956                 // document id if relevant; empty otherwise
1957                 int id = q0;
1958                 if (!args.empty()) id = string_to_int(args[0]);
1959                 map<Xapian::docid, bool>::iterator i = ticked.find(id);
1960                 if (i != ticked.end()) {
1961                     i->second = false; // icky side-effect
1962                     value = str(id);
1963                 }
1964                 break;
1965             }
1966             case CMD_relevants: {
1967                 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
1968                      i != ticked.end(); i++) {
1969                     if (i->second) {
1970                         value += str(i->first);
1971                         value += '\t';
1972                     }
1973                 }
1974                 if (!value.empty()) value.erase(value.size() - 1);
1975                 break;
1976             }
1977             case CMD_score:
1978                 // Score (0 to 10)
1979                 value = str(percent / 10);
1980                 break;
1981             case CMD_set:
1982                 option[args[0]] = args[1];
1983                 break;
1984             case CMD_setmap: {
1985                 string base = args[0] + ',';
1986                 if (args.size() % 2 != 1)
1987                     throw string("$setmap requires an odd number of arguments");
1988                 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
1989                     option[base + args[i]] = args[i + 1];
1990                 }
1991                 break;
1992             }
1993             case CMD_setrelevant: {
1994                 string::size_type i = 0, j;
1995                 while (true) {
1996                     j = args[0].find_first_not_of("0123456789", i);
1997                     Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
1998                     if (id) {
1999                         rset.add_document(id);
2000                         ticked[id] = true;
2001                     }
2002                     if (j == string::npos) break;
2003                     i = j + 1;
2004                 }
2005                 break;
2006             }
2007             case CMD_slice: {
2008                 string list = args[0], pos = args[1];
2009                 vector<string> items;
2010                 string::size_type i = 0, j;
2011                 while (true) {
2012                     j = list.find('\t', i);
2013                     items.push_back(list.substr(i, j - i));
2014                     if (j == string::npos) break;
2015                     i = j + 1;
2016                 }
2017                 i = 0;
2018                 bool have_added = false;
2019                 while (true) {
2020                     j = pos.find('\t', i);
2021                     int item = string_to_int(pos.substr(i, j - i));
2022                     if (item >= 0 && size_t(item) < items.size()) {
2023                         if (have_added) value += '\t';
2024                         value += items[item];
2025                         have_added = true;
2026                     }
2027                     if (j == string::npos) break;
2028                     i = j + 1;
2029                 }
2030                 break;
2031             }
2032             case CMD_snippet: {
2033                 size_t length = 200;
2034                 if (args.size() > 1) {
2035                     length = string_to_int(args[1]);
2036                 }
2037                 if (!stemmer)
2038                     stemmer = new Xapian::Stem(option["stemmer"]);
2039                 // FIXME: Allow start and end highlight and omit to be specified.
2040                 value = mset.snippet(args[0], length, *stemmer,
2041                                      mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2042                                      "<strong>", "</strong>", "...");
2043                 break;
2044             }
2045             case CMD_split: {
2046                 string split;
2047                 if (args.size() == 1) {
2048                     split = " ";
2049                     value = args[0];
2050                 } else {
2051                     split = args[0];
2052                     value = args[1];
2053                 }
2054                 string::size_type i = 0;
2055                 while (true) {
2056                     if (split.empty()) {
2057                         ++i;
2058                         if (i >= value.size()) break;
2059                     } else {
2060                         i = value.find(split, i);
2061                         if (i == string::npos) break;
2062                     }
2063                     value.replace(i, split.size(), 1, '\t');
2064                     ++i;
2065                 }
2066                 break;
2067             }
2068             case CMD_stoplist: {
2069                 Xapian::TermIterator i = qp.stoplist_begin();
2070                 Xapian::TermIterator end = qp.stoplist_end();
2071                 while (i != end) {
2072                     if (!value.empty()) value += '\t';
2073                     value += *i;
2074                     ++i;
2075                 }
2076                 break;
2077             }
2078             case CMD_sub:
2079                 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2080                 break;
2081             case CMD_substr: {
2082                 int start = string_to_int(args[1]);
2083                 if (start < 0) {
2084                     if (static_cast<size_t>(-start) >= args[0].size()) {
2085                         start = 0;
2086                     } else {
2087                         start = static_cast<int>(args[0].size()) + start;
2088                     }
2089                 } else {
2090                     if (static_cast<size_t>(start) >= args[0].size()) break;
2091                 }
2092                 size_t len = string::npos;
2093                 if (args.size() > 2) {
2094                     int int_len = string_to_int(args[2]);
2095                     if (int_len >= 0) {
2096                         len = size_t(int_len);
2097                     } else {
2098                         len = args[0].size() - start;
2099                         if (static_cast<size_t>(-int_len) >= len) {
2100                             len = 0;
2101                         } else {
2102                             len -= static_cast<size_t>(-int_len);
2103                         }
2104                     }
2105                 }
2106                 value.assign(args[0], start, len);
2107                 break;
2108             }
2109             case CMD_suggestion:
2110                 value = qp.get_corrected_query_string();
2111                 break;
2112             case CMD_terms: {
2113                 // list of matching terms
2114                 if (!enquire) break;
2115                 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2116                 if (args.empty()) {
2117                     while (term != enquire->get_matching_terms_end(q0)) {
2118                         // check term was in the typed query so we ignore
2119                         // boolean filter terms
2120                         const string & t = *term;
2121                         if (termset.find(t) != termset.end()) {
2122                             value += t;
2123                             value += '\t';
2124                         }
2125                         ++term;
2126                     }
2127                 } else {
2128                     // Return matching terms with specified prefix.  We can't
2129                     // use skip_to() as the terms aren't ordered by termname.
2130                     const string & pfx = args[0];
2131                     while (term != enquire->get_matching_terms_end(q0)) {
2132                         const string & t = *term;
2133                         if (startswith(t, pfx)) {
2134                             value += t;
2135                             value += '\t';
2136                         }
2137                         ++term;
2138                     }
2139                 }
2140
2141                 if (!value.empty()) value.erase(value.size() - 1);
2142                 break;
2143             }
2144             case CMD_thispage:
2145                 value = str(topdoc / hits_per_page + 1);
2146                 break;
2147             case CMD_time:
2148                 if (secs >= 0) {
2149                     char buf[64];
2150                     my_snprintf(buf, sizeof(buf), "%.6f", secs);
2151                     // MSVC's snprintf omits the zero byte if the string is
2152                     // sizeof(buf) long.
2153                     buf[sizeof(buf) - 1] = '\0';
2154                     value = buf;
2155                 }
2156                 break;
2157             case CMD_topdoc:
2158                 // first document on current page of hit list (counting from 0)
2159                 value = str(topdoc);
2160                 break;
2161             case CMD_topterms:
2162                 if (enquire) {
2163                     int howmany = 16;
2164                     if (!args.empty()) howmany = string_to_int(args[0]);
2165                     if (howmany < 0) howmany = 0;
2166
2167                     // List of expand terms
2168                     Xapian::ESet eset;
2169                     OmegaExpandDecider decider(db, &termset);
2170
2171                     if (!rset.empty()) {
2172                         set_expansion_scheme(*enquire, option);
2173 #if XAPIAN_AT_LEAST(1,3,2)
2174                         eset = enquire->get_eset(howmany * 2, rset, &decider);
2175 #else
2176                         eset = enquire->get_eset(howmany * 2, rset, 0,
2177                                                  expand_param_k, &decider);
2178 #endif
2179                     } else if (mset.size()) {
2180                         // invent an rset
2181                         Xapian::RSet tmp;
2182
2183                         int c = 5;
2184                         // FIXME: what if mset does not start at first match?
2185                         for (Xapian::docid did : mset) {
2186                             tmp.add_document(did);
2187                             if (--c == 0) break;
2188                         }
2189
2190                         set_expansion_scheme(*enquire, option);
2191 #if XAPIAN_AT_LEAST(1,3,2)
2192                         eset = enquire->get_eset(howmany * 2, tmp, &decider);
2193 #else
2194                         eset = enquire->get_eset(howmany * 2, tmp, 0,
2195                                                  expand_param_k, &decider);
2196 #endif
2197                     }
2198
2199                     // Don't show more than one word with the same stem.
2200                     set<string> stems;
2201                     Xapian::ESetIterator i;
2202                     for (i = eset.begin(); i != eset.end(); ++i) {
2203                         string term(*i);
2204                         string stem = (*stemmer)(term);
2205                         if (stems.find(stem) != stems.end()) continue;
2206                         stems.insert(stem);
2207                         value += term;
2208                         value += '\t';
2209                         if (--howmany == 0) break;
2210                     }
2211                     if (!value.empty()) value.erase(value.size() - 1);
2212                 }
2213                 break;
2214             case CMD_transform:
2215                 omegascript_transform(value, args);
2216                 break;
2217             case CMD_truncate:
2218                 value = generate_sample(args[0],
2219                                         string_to_int(args[1]),
2220                                         args.size() > 2 ? args[2] : string(),
2221                                         args.size() > 3 ? args[3] : string());
2222                 break;
2223             case CMD_uniq: {
2224                 const string &list = args[0];
2225                 if (list.empty()) break;
2226                 string::size_type split = 0, split2;
2227                 string prev;
2228                 do {
2229                     split2 = list.find('\t', split);
2230                     string item(list, split, split2 - split);
2231                     if (split == 0) {
2232                         value = item;
2233                     } else if (item != prev) {
2234                         value += '\t';
2235                         value += item;
2236                     }
2237                     prev = item;
2238                     split = split2 + 1;
2239                 } while (split2 != string::npos);
2240                 break;
2241             }
2242             case CMD_unpack:
2243                 value = str(binary_string_to_int(args[0]));
2244                 break;
2245             case CMD_unstem: {
2246                 const string &term = args[0];
2247                 Xapian::TermIterator i = qp.unstem_begin(term);
2248                 Xapian::TermIterator end = qp.unstem_end(term);
2249                 while (i != end) {
2250                     if (!value.empty()) value += '\t';
2251                     value += *i;
2252                     ++i;
2253                 }
2254                 break;
2255             }
2256             case CMD_upper:
2257                 value = Xapian::Unicode::toupper(args[0]);
2258                 break;
2259             case CMD_url:
2260                 url_encode(value, args[0]);
2261                 break;
2262             case CMD_value: {
2263                 Xapian::docid id = q0;
2264                 Xapian::valueno value_no = string_to_int(args[0]);
2265                 if (args.size() > 1) id = string_to_int(args[1]);
2266                 value = db.get_document(id).get_value(value_no);
2267                 break;
2268             }
2269             case CMD_version:
2270                 value = PACKAGE_STRING;
2271                 break;
2272             case CMD_weight:
2273                 value = double_to_string(weight);
2274                 break;
2275             default: {
2276                 args.insert(args.begin(), param[0]);
2277                 int macro_no = func->second->tag - CMD_MACRO;
2278                 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2279                 // throw "Unknown function '" + var + "'";
2280                 value = eval(macros[macro_no], args);
2281                 break;
2282             }
2283         }
2284         res += value;
2285     } catch (const Xapian::Error & e) {
2286         // FIXME: this means we only see the most recent error in $error
2287         // - is that the best approach?
2288         error_msg = e.get_msg();
2289     }
2290
2291     res.append(fmt, p, string::npos);
2292     return res;
2293 }
2294
2295 static string
2296 eval_file(const string &fmtfile)
2297 {
2298     string err;
2299     if (vet_filename(fmtfile)) {
2300         string file = template_dir + fmtfile;
2301         string fmt;
2302         if (load_file(file, fmt)) {
2303             vector<string> noargs;
2304             noargs.resize(1);
2305             return eval(fmt, noargs);
2306         }
2307         err = strerror(errno);
2308     } else {
2309         err = "name contains '..'";
2310     }
2311
2312     // FIXME: report why!
2313     string msg = string("Couldn't read format template '") + fmtfile + '\'';
2314     if (!err.empty()) msg += " (" + err + ')';
2315     throw msg;
2316 }
2317
2318 extern string
2319 pretty_term(string term)
2320 {
2321     // Just leave empty strings and single characters alone.
2322     if (term.length() <= 1) return term;
2323
2324     // Assume unprefixed terms are unstemmed.
2325     if (!C_isupper(term[0])) return term;
2326
2327     // Handle stemmed terms.
2328     bool stemmed = (term[0] == 'Z');
2329     if (stemmed) {
2330         // First of all, check if a term in the query stemmed to this one.
2331         Xapian::TermIterator u = qp.unstem_begin(term);
2332         // There might be multiple words with the same stem, but we only want
2333         // one so just take the first.
2334         if (u != qp.unstem_end(term)) return *u;
2335
2336         // Remove the 'Z'.
2337         term.erase(0, 1);
2338     }
2339
2340     bool add_quotes = false;
2341
2342     // Check if the term has a prefix.
2343     if (C_isupper(term[0])) {
2344         // See if we have this prefix in the termprefix_to_userprefix map.  If
2345         // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2346         string prefix;
2347         size_t prefix_len = prefix_from_term(prefix, term);
2348
2349         map<string, string>::const_iterator i;
2350         i = termprefix_to_userprefix.find(prefix);
2351         if (i != termprefix_to_userprefix.end()) {
2352             string user_prefix = i->second;
2353             user_prefix += ':';
2354             term.replace(0, prefix_len, user_prefix);
2355         } else {
2356             // We don't have a prefix mapping for this, so just set a flag to
2357             // add quotes around the term.
2358             add_quotes = true;
2359         }
2360     }
2361
2362     if (stemmed) term += '.';
2363
2364     if (add_quotes) {
2365         term.insert(0, "\"");
2366         term.append("\"");
2367     }
2368
2369     return term;
2370 }
2371
2372 static string
2373 print_caption(const string &fmt, const vector<string> &param)
2374 {
2375     q0 = *(mset[hit_no]);
2376
2377     weight = mset[hit_no].get_weight();
2378     percent = mset.convert_to_percent(mset[hit_no]);
2379     collapsed = mset[hit_no].get_collapse_count();
2380
2381     return eval(fmt, param);
2382 }
2383
2384 void
2385 parse_omegascript()
2386 {
2387     try {
2388         const char * p = getenv("SERVER_PROTOCOL");
2389         if (p && strcmp(p, "INCLUDED") == 0) {
2390             // We're being included in another page, so suppress headers.
2391             suppress_http_headers = true;
2392         }
2393
2394         string output = eval_file(fmtname);
2395         if (!set_content_type && !suppress_http_headers) {
2396             cout << "Content-Type: text/html" << endl;
2397             set_content_type = true;
2398         }
2399         if (!suppress_http_headers) cout << endl;
2400         cout << output;
2401     } catch (...) {
2402         // Ensure the headers have been output so that any exception gets
2403         // reported rather than giving a server error.
2404         if (!set_content_type && !suppress_http_headers) {
2405             cout << "Content-Type: text/html" << endl;
2406             set_content_type = true;
2407         }
2408         if (!suppress_http_headers) cout << endl;
2409         throw;
2410     }
2411 }
2412
2413 static void
2414 ensure_query_parsed()
2415 {
2416     if (query_parsed) return;
2417     query_parsed = true;
2418
2419     MCI val;
2420     pair<MCI, MCI> g;
2421
2422     // Should we discard the existing R-set recorded in R CGI parameters?
2423     bool discard_rset = false;
2424
2425     // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2426     // CGI parameters)?
2427     bool force_first_page = false;
2428
2429     string v;
2430     // get list of terms from previous iteration of query
2431     val = cgi_params.find("xP");
2432     if (val != cgi_params.end()) {
2433         v = val->second;
2434         // If xP given, default to discarding any RSet and forcing the first
2435         // page of results.  If the query is the same, or an extension of
2436         // the previous query, we adjust these again below.
2437         discard_rset = true;
2438         force_first_page = true;
2439     }
2440     querytype result = set_probabilistic(v);
2441     switch (result) {
2442         case BAD_QUERY:
2443             break;
2444         case NEW_QUERY:
2445             break;
2446         case SAME_QUERY:
2447         case EXTENDED_QUERY:
2448             // If we've changed database, force the first page of hits
2449             // and discard the R-set (since the docids will have changed)
2450             val = cgi_params.find("xDB");
2451             if (val != cgi_params.end() && val->second != dbname) break;
2452             if (result == SAME_QUERY && force_first_page) {
2453                 val = cgi_params.find("xFILTERS");
2454                 if (val != cgi_params.end() && val->second != filters &&
2455                     val->second != old_filters) {
2456                     // Filters have changed since last query.
2457                 } else {
2458                     force_first_page = false;
2459                 }
2460             }
2461             discard_rset = false;
2462             break;
2463     }
2464
2465     if (!force_first_page) {
2466         // Work out which mset element is the first hit we want
2467         // to display
2468         val = cgi_params.find("TOPDOC");
2469         if (val != cgi_params.end()) {
2470             topdoc = atol(val->second.c_str());
2471         }
2472
2473         // Handle next, previous, and page links
2474         if (cgi_params.find(">") != cgi_params.end()) {
2475             topdoc += hits_per_page;
2476         } else if (cgi_params.find("<") != cgi_params.end()) {
2477             if (topdoc >= hits_per_page)
2478                 topdoc -= hits_per_page;
2479             else
2480                 topdoc = 0;
2481         } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2482                    (val = cgi_params.find("#")) != cgi_params.end()) {
2483             long page = atol(val->second.c_str());
2484             // Do something sensible for page 0 (we count pages from 1).
2485             if (page == 0) page = 1;
2486             topdoc = (page - 1) * hits_per_page;
2487         }
2488
2489         // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2490         // Normally we snap TOPDOC like this so that things work nicely if
2491         // HITSPERPAGE is in a <select> or on radio buttons.  If we're
2492         // postprocessing the output of omega and want variable sized pages,
2493         // this is unhelpful.
2494         bool raw_search = false;
2495         val = cgi_params.find("RAWSEARCH");
2496         if (val != cgi_params.end()) {
2497             raw_search = bool(atol(val->second.c_str()));
2498         }
2499
2500         if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2501     }
2502
2503     if (!discard_rset) {
2504         // put documents marked as relevant into the rset
2505         g = cgi_params.equal_range("R");
2506         for (MCI i = g.first; i != g.second; i++) {
2507             const string & value = i->second;
2508             for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2509                 while (value[j] == '.') ++j;
2510                 Xapian::docid d = atoi(value.c_str() + j);
2511                 if (d) {
2512                     rset.add_document(d);
2513                     ticked[d] = true;
2514                 }
2515             }
2516         }
2517     }
2518 }
2519
2520 // run query if we haven't already
2521 static void
2522 ensure_match()
2523 {
2524     if (done_query) return;
2525
2526     secs = RealTime::now();
2527     run_query();
2528     if (secs != -1)
2529         secs = RealTime::now() - secs;
2530
2531     done_query = true;
2532     last = mset.get_matches_lower_bound();
2533     if (last == 0) {
2534         // Otherwise topdoc ends up being -6 if it's non-zero!
2535         topdoc = 0;
2536     } else {
2537         if (topdoc >= last)
2538             topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2539         // last is the count of documents up to the end of the current page
2540         // (as returned by $last)
2541         if (topdoc + hits_per_page < last)
2542             last = topdoc + hits_per_page;
2543     }
2544 }
2545
2546 // OmegaExpandDecider methods.
2547
2548 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2549                                        set<string> * querytermset)
2550     : db(db_)
2551 {
2552     // We'll want the stemmer for testing matches anyway.
2553     if (!stemmer)
2554         stemmer = new Xapian::Stem(option["stemmer"]);
2555     if (querytermset) {
2556         set<string>::const_iterator i;
2557         for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2558             string term(*i);
2559             if (term.empty()) continue;
2560
2561             unsigned char ch = term[0];
2562             bool stemmed = (ch == 'Z');
2563             if (stemmed) {
2564                term.erase(0, 1);
2565                if (term.empty()) continue;
2566                ch = term[0];
2567             }
2568
2569             if (C_isupper(ch)) {
2570                 string prefix;
2571                 size_t prefix_len = prefix_from_term(prefix, term);
2572                 term.erase(0, prefix_len);
2573             }
2574
2575             if (!stemmed) term = (*stemmer)(term);
2576
2577             exclude_stems.insert(term);
2578         }
2579     }
2580 }
2581
2582 bool
2583 OmegaExpandDecider::operator()(const string & term) const
2584 {
2585     unsigned char ch = term[0];
2586
2587     // Reject terms with a prefix.
2588     if (C_isupper(ch)) return false;
2589
2590     {
2591         MyStopper stopper;
2592         // Don't suggest stopwords.
2593         if (stopper(term)) return false;
2594     }
2595
2596     // Reject small numbers.
2597     if (term.size() < 4 && C_isdigit(ch)) return false;
2598
2599     // Reject terms containing a space.
2600     if (term.find(' ') != string::npos) return false;
2601
2602     // Skip terms with stems in the exclude_stems set, to avoid suggesting
2603     // terms which are already in the query in some form.
2604     string stem = (*stemmer)(term);
2605     if (exclude_stems.find(stem) != exclude_stems.end())
2606         return false;
2607
2608     // Ignore terms that only occur once (hapaxes) since they aren't
2609     // useful for finding related documents - they only occur in a
2610     // document that's already been marked as relevant.
2611     // FIXME: add an expand option to ignore terms where
2612     // termfreq == rtermfreq.
2613     if (db.get_termfreq(term) <= 1) return false;
2614
2615     return true;
2616 }