xapian-applications/omega/query.cc

   1 /* query.cc: query executor for omega
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 James Aylett
   5  * Copyright 2001,2002 Ananova Ltd
   6  * Copyright 2002 Intercede 1749 Ltd
   7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015 Olly Betts
   8  * Copyright 2008 Thomas Viehmann
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU General Public License as
  12  * published by the Free Software Foundation; either version 2 of the
  13  * License, or (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  23  * USA
  24  */
  25
  26 #include <config.h>
  27
  28 // If we're building against git after the expand API changed but before the
  29 // version gets bumped to 1.3.2, we'll get a deprecation warning from
  30 // get_eset() unless we suppress such warnings here.
  31 #define XAPIAN_DEPRECATED(D) D
  32
  33 #include <algorithm>
  34 #include <iostream>
  35 #include <map>
  36 #include <set>
  37 #include <vector>
  38
  39 #include <cassert>
  40 #include <cctype>
  41 #include "safeerrno.h"
  42 #include <stdio.h>
  43 #include <cstdlib>
  44 #include <cstring>
  45 #include "strcasecmp.h"
  46 #include <ctime>
  47
  48 #include "safeunistd.h"
  49 #include <sys/types.h>
  50 #include "safesysstat.h"
  51 #include "safefcntl.h"
  52
  53 #include "realtime.h"
  54
  55 #include <cdb.h>
  56
  57 #include "date.h"
  58 #include "datematchdecider.h"
  59 #include "jsonescape.h"
  60 #include "utils.h"
  61 #include "omega.h"
  62 #include "query.h"
  63 #include "cgiparam.h"
  64 #include "loadfile.h"
  65 #include "sample.h"
  66 #include "str.h"
  67 #include "stringutils.h"
  68 #include "transform.h"
  69 #include "urldecode.h"
  70 #include "urlencode.h"
  71 #include "unixperm.h"
  72 #include "values.h"
  73 #include "weight.h"
  74 #include "expand.h"
  75
  76 #include <xapian.h>
  77
  78 using namespace std;
  79
  80 using Xapian::Utf8Iterator;
  81
  82 using Xapian::Unicode::is_wordchar;
  83
  84 #ifndef SNPRINTF
  85 #include <cstdarg>
  86
  87 static int my_snprintf(char *str, size_t size, const char *format, ...)
  88 {
  89     int res;
  90     va_list ap;
  91     va_start(ap, format);
  92     str[size - 1] = '\0';
  93     res = vsprintf(str, format, ap);
  94     if (str[size - 1] || res < 0 || size_t(res) >= size)
  95         abort(); /* Overflowed! */
  96     va_end(ap);
  97     return res;
  98 }
  99 #else
 100 #define my_snprintf SNPRINTF
 101 #endif
 102
  103 static bool query_parsed = false; // NOTE(review): presumably set by ensure_query_parsed() (defined outside this section) - confirm
  104 static bool done_query = false;
  105 static Xapian::docid last = 0;
  106
      // The match results; assigned in run_query() from enquire->get_mset().
  107 static Xapian::MSet mset;
  108
  109 static map<Xapian::docid, bool> ticked;
  110
      // Forward declarations - the definitions aren't in this part of the file.
  111 static void ensure_query_parsed();
  112 static void ensure_match();
  113
      // The query to run.  set_probabilistic() assigns the parsed probabilistic
      // query to it, and run_query() then combines it with any filters.
  114 static Xapian::Query query;
  115 //static string url_query_string;
  116 Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
  117
      // Query parser used (and configured) by set_probabilistic().
  118 static Xapian::QueryParser qp;
      // Allocated on first use in set_probabilistic().
  119 static Xapian::NumberValueRangeProcessor * size_vrp = NULL;
      // Allocated on first use in html_highlight() from option["stemmer"].
  120 static Xapian::Stem *stemmer = NULL;
  121
  122 static string eval_file(const string &fmtfile);
  123
      // Terms from the parsed query; set_probabilistic() adds each term of the
      // query to this.
  124 static set<string> termset;
  125
  126 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
  127 static map<string, string> termprefix_to_userprefix;
  128
      // Tab-separated list of the terms in termset, in the order in which
      // set_probabilistic() first added them.
  129 static string queryterms;
  130
      // Set to the parser's message when set_probabilistic() catches a
      // Xapian::QueryParserError; run_query() doesn't run the match if this is
      // non-empty.
  131 static string error_msg;
  132
  133 static double secs = -1;
 134
  135 static const char DEFAULT_LOG_ENTRY[] = // Template for a log line.
      // The fields are separated by tabs.  NOTE(review): presumably this is
      // OmegaScript which is used when no log format is configured - confirm
      // against the code which uses it.
  136         "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
  137         "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
  138         "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
  139         "$dbname\t"
  140         "$query\t"
  141         "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
 142
 143 class MyStopper : public Xapian::Stopper {
 144   public:
 145     bool operator()(const string &t) const {
 146         switch (t[0]) {
 147             case 'a':
 148                 return (t == "a" || t == "about" || t == "an" || t == "and" ||
 149                         t == "are" || t == "as" || t == "at");
 150             case 'b':
 151                 return (t == "be" || t == "by");
 152             case 'e':
 153                 return (t == "en");
 154             case 'f':
 155                 return (t == "for" || t == "from");
 156             case 'h':
 157                 return (t == "how");
 158             case 'i':
 159                 return (t == "i" || t == "in" || t == "is" || t == "it");
 160             case 'o':
 161                 return (t == "of" || t == "on" || t == "or");
 162             case 't':
 163                 return (t == "that" || t == "the" || t == "this" || t == "to");
 164             case 'w':
 165                 return (t == "was" || t == "what" || t == "when" ||
 166                         t == "where" || t == "which" || t == "who" ||
 167                         t == "why" || t == "will" || t == "with");
 168             case 'y':
 169                 return (t == "you" || t == "your");
 170             default:
 171                 return false;
 172         }
 173     }
 174 };
 175
 176 static size_t
 177 prefix_from_term(string &prefix, const string &term)
 178 {
 179     if (term.empty()) {
 180         prefix.resize(0);
 181         return 0;
 182     }
 183     if (term[0] == 'X') {
 184         const string::const_iterator begin = term.begin();
 185         string::const_iterator i = begin + 1;
 186         while (i != term.end() && C_isupper(*i)) ++i;
 187         prefix.assign(begin, i);
 188         if (i != term.end() && *i == ':') ++i;
 189         return i - begin;
 190     }
 191
 192     prefix = term[0];
 193     return 1;
 194 }
 195
 196 // Don't allow ".." in format names, log file names, etc as this would allow
 197 // people to open a format "../../etc/passwd" or similar.
 198 // FIXME: make this check more exact ("foo..bar" is safe)
 199 // FIXME: log when this check fails
 200 static bool
 201 vet_filename(const string &filename)
 202 {
 203     string::size_type i = filename.find("..");
 204     return (i == string::npos);
 205 }
 206
 207 // Heuristics:
 208 // * If any terms have been removed, it's a "fresh query" so we discard any
 209 //   relevance judgements
 210 // * If all previous terms are there but more have been added then we keep
 211 //   the relevance judgements, but return the first page of hits
 212 //
 213 // NEW_QUERY entirely new query
 214 // SAME_QUERY unchanged query
 215 // EXTENDED_QUERY new query, but based on the old one
 216 // BAD_QUERY parse error (message in error_msg)
  217 typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
  218
      // The query strings to parse, keyed by the prefix they were supplied
      // with: set_probabilistic_query() adds entries and set_probabilistic()
      // parses each entry, passing its key to the query parser as the default
      // prefix.
  219 static multimap<string, string> probabilistic_query;
 220
 221 void
 222 set_probabilistic_query(const string & prefix, const string & s)
 223 {
 224     string query_string = s;
 225     // Strip leading and trailing whitespace from query_string.
 226     trim(query_string);
 227     if (!query_string.empty())
 228         probabilistic_query.insert(make_pair(prefix, query_string));
 229 }
 230
 231 static unsigned
 232 read_qp_flags(const string & opt_pfx, unsigned f)
 233 {
 234     map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
 235     for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
 236         unsigned mask = 0;
 237         const char * s = i->first.c_str() + opt_pfx.size();
 238         switch (s[0]) {
 239             case 'a':
 240                 if (strcmp(s, "auto_multiword_synonyms") == 0) {
 241                     mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 242                     break;
 243                 }
 244                 if (strcmp(s, "auto_synonyms") == 0) {
 245                     mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
 246                     break;
 247                 }
 248                 break;
 249             case 'b':
 250                 if (strcmp(s, "boolean") == 0) {
 251                     mask = Xapian::QueryParser::FLAG_BOOLEAN;
 252                     break;
 253                 }
 254                 if (strcmp(s, "boolean_any_case") == 0) {
 255                     mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
 256                     break;
 257                 }
 258                 break;
 259             case 'd':
 260                 if (strcmp(s, "default") == 0) {
 261                     mask = Xapian::QueryParser::FLAG_DEFAULT;
 262                     break;
 263                 }
 264                 break;
 265             case 'l':
 266                 if (strcmp(s, "lovehate") == 0) {
 267                     mask = Xapian::QueryParser::FLAG_LOVEHATE;
 268                     break;
 269                 }
 270                 break;
 271             case 'p':
 272                 if (strcmp(s, "partial") == 0) {
 273                     mask = Xapian::QueryParser::FLAG_PARTIAL;
 274                     break;
 275                 }
 276                 if (strcmp(s, "phrase") == 0) {
 277                     mask = Xapian::QueryParser::FLAG_PHRASE;
 278                     break;
 279                 }
 280                 if (strcmp(s, "pure_not") == 0) {
 281                     mask = Xapian::QueryParser::FLAG_PURE_NOT;
 282                     break;
 283                 }
 284                 break;
 285             case 's':
 286                 if (strcmp(s, "spelling_correction") == 0) {
 287                     mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
 288                     break;
 289                 }
 290                 if (strcmp(s, "synonym") == 0) {
 291                     mask = Xapian::QueryParser::FLAG_SYNONYM;
 292                     break;
 293                 }
 294                 break;
 295             case 'w':
 296                 if (strcmp(s, "wildcard") == 0) {
 297                     mask = Xapian::QueryParser::FLAG_WILDCARD;
 298                     break;
 299                 }
 300                 break;
 301         }
 302
 303         if (i->second.empty()) {
 304             f &= ~mask;
 305         } else {
 306             f |= mask;
 307         }
 308     }
 309     return f;
 310 }
 311
 312 static querytype
 313 set_probabilistic(const string &oldp)
 314 {
 315     // Parse the query string.
 316     qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
 317     qp.set_stopper(new MyStopper());
 318     qp.set_default_op(default_op);
 319     qp.set_database(db);
 320     // FIXME: provide a custom VRP which handles size:10..20K, etc.
 321     if (!size_vrp)
 322         size_vrp = new Xapian::NumberValueRangeProcessor(VALUE_SIZE, "size:",
 323                                                          true);
 324     qp.add_valuerangeprocessor(size_vrp);
 325     map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
 326     for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
 327         string user_prefix(pfx->first, 7);
 328         const string & term_pfx_list = pfx->second;
 329         string::size_type i = 0;
 330         do {
 331             string::size_type i0 = i;
 332             i = term_pfx_list.find('\t', i);
 333             const string & term_pfx = term_pfx_list.substr(i0, i - i0);
 334             qp.add_prefix(user_prefix, term_pfx);
 335             // std::map::insert() won't overwrite an existing entry, so we'll
 336             // prefer the first user_prefix for which a particular term prefix
 337             // is specified.
 338             termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
 339         } while (++i);
 340     }
 341     pfx = option.lower_bound("boolprefix,");
 342     for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
 343         string user_prefix = pfx->first.substr(11);
 344         qp.add_boolean_prefix(user_prefix, pfx->second);
 345         termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
 346     }
 347
 348     try {
 349         unsigned default_flags = read_qp_flags("flag_", 0);
 350         if (option["spelling"] == "true")
 351             default_flags |= qp.FLAG_SPELLING_CORRECTION;
 352
 353         vector<Xapian::Query> queries;
 354         queries.reserve(probabilistic_query.size());
 355
 356         multimap<string, string>::const_iterator j;
 357         for (j = probabilistic_query.begin();
 358              j != probabilistic_query.end();
 359              ++j) {
 360             const string & prefix = j->first;
 361
 362             // Choose the stemmer to use for this input.
 363             string stemlang = option[prefix + ":stemmer"];
 364             if (stemlang.empty())
 365                 stemlang = option["stemmer"];
 366             qp.set_stemmer(Xapian::Stem(stemlang));
 367
 368             // Work out the flags to use for this input.
 369             unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
 370
 371             const string & query_string = j->second;
 372             Xapian::Query q = qp.parse_query(query_string, f, prefix);
 373             if (!q.empty())
 374                 queries.push_back(q);
 375         }
 376         query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
 377     } catch (Xapian::QueryParserError &e) {
 378         error_msg = e.get_msg();
 379         return BAD_QUERY;
 380     }
 381
 382     Xapian::termcount n_new_terms = 0;
 383     for (Xapian::TermIterator i = query.get_terms_begin();
 384          i != query.get_terms_end(); ++i) {
 385         if (termset.find(*i) == termset.end()) {
 386             termset.insert(*i);
 387             if (!queryterms.empty()) queryterms += '\t';
 388             queryterms += *i;
 389         }
 390         n_new_terms++;
 391     }
 392
 393     // Check new query against the previous one
 394     if (oldp.empty()) {
 395         // If oldp was empty that means there were no probabilistic terms
 396         // before, so if there are now this is a new query.
 397         return n_new_terms ? NEW_QUERY : SAME_QUERY;
 398     }
 399
 400     // The terms in oldp are separated by tabs.
 401     const char oldp_separator = '\t';
 402     size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
 403
 404     // short-cut: if the new query has fewer terms, it must be a new one
 405     if (n_new_terms < n_old_terms) return NEW_QUERY;
 406
 407     const char *term = oldp.c_str();
 408     const char *pend;
 409     while ((pend = strchr(term, oldp_separator)) != NULL) {
 410         if (termset.find(string(term, pend - term)) == termset.end())
 411             return NEW_QUERY;
 412         term = pend + 1;
 413     }
 414     if (*term) {
 415         if (termset.find(string(term)) == termset.end())
 416             return NEW_QUERY;
 417     }
 418
 419     // Use termset.size() rather than n_new_terms so we correctly handle
 420     // the case when the query has repeated terms.
 421     // This works wrongly in the case when the user extends the query
 422     // by adding a term already in it, but that's unlikely and the behaviour
 423     // isn't too bad (we just don't reset page 1).  We also mishandle a few
 424     // other obscure cases e.g. adding quotes to turn a query into a phrase.
 425     if (termset.size() > n_old_terms) return EXTENDED_QUERY;
 426     return SAME_QUERY;
 427 }
 428
  429 static multimap<string, string> filter_map; // Filter terms keyed by their prefix: add_bterm() adds entries, run_query() reads them.
  430
      // Iterator type used by run_query() to walk filter_map.
  431 typedef multimap<string, string>::const_iterator FMCI;
 432
 433 void add_bterm(const string &term) {
 434     string prefix;
 435     if (prefix_from_term(prefix, term) > 0)
 436         filter_map.insert(multimap<string, string>::value_type(prefix, term));
 437 }
 438
 439 static void
 440 run_query()
 441 {
 442     bool force_boolean = false;
 443     if (!filter_map.empty()) {
 444         // OR together filters with the same prefix, then AND together
 445         vector<Xapian::Query> filter_vec;
 446         vector<string> or_vec;
 447         string current;
 448         for (FMCI i = filter_map.begin(); ; i++) {
 449             bool over = (i == filter_map.end());
 450             if (over || i->first != current) {
 451                 switch (or_vec.size()) {
 452                     case 0:
 453                         break;
 454                     case 1:
 455                         filter_vec.push_back(Xapian::Query(or_vec[0]));
 456                         break;
 457                     default:
 458                         filter_vec.push_back(Xapian::Query(Xapian::Query::OP_OR,
 459                                                      or_vec.begin(),
 460                                                      or_vec.end()));
 461                         break;
 462                 }
 463                 or_vec.clear();
 464                 if (over) break;
 465                 current = i->first;
 466             }
 467             or_vec.push_back(i->second);
 468         }
 469
 470         Xapian::Query filter(Xapian::Query::OP_AND,
 471                              filter_vec.begin(), filter_vec.end());
 472
 473         if (query.empty()) {
 474             // If no probabilistic query is provided then promote the filters
 475             // to be THE query - filtering an empty query will give no
 476             // matches.
 477             std::swap(query, filter);
 478             if (enquire) force_boolean = true;
 479         } else {
 480             query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
 481         }
 482     }
 483
 484     Xapian::MatchDecider * mdecider = NULL;
 485     if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
 486         MCI i = cgi_params.find("DATEVALUE");
 487         if (i != cgi_params.end()) {
 488             Xapian::valueno datevalue = string_to_int(i->second);
 489             mdecider = new DateMatchDecider(datevalue, date_start, date_end, date_span);
 490         } else {
 491             Xapian::Query date_filter(Xapian::Query::OP_OR,
 492                                       date_range_filter(date_start, date_end,
 493                                                         date_span),
 494                                       Xapian::Query("Dlatest"));
 495
 496             // If no probabilistic query is provided then promote the daterange
 497             // filter to be THE query instead of filtering an empty query.
 498             if (query.empty()) {
 499                 query = date_filter;
 500             } else {
 501                 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
 502             }
 503         }
 504     }
 505
 506     if (!enquire || !error_msg.empty()) return;
 507
 508     set_weighting_scheme(*enquire, option, force_boolean);
 509
 510     enquire->set_cutoff(threshold);
 511
 512     if (sort_key != Xapian::BAD_VALUENO) {
 513         if (sort_after) {
 514             enquire->set_sort_by_relevance_then_value(sort_key, sort_ascending);
 515         } else {
 516             enquire->set_sort_by_value_then_relevance(sort_key, sort_ascending);
 517         }
 518     }
 519
 520     enquire->set_docid_order(docid_order);
 521
 522     if (collapse) {
 523         enquire->set_collapse_key(collapse_key);
 524     }
 525
 526     if (!query.empty()) {
 527 #if 0
 528         // FIXME: If we start doing permissions checks based on $REMOTE_USER
 529         // we're going to break some existing setups if users upgrade.  We
 530         // probably want a way to set this from OmegaScript.
 531         const char * remote_user = getenv("REMOTE_USER");
 532         if (remote_user)
 533             apply_unix_permissions(query, remote_user);
 534 #endif
 535
 536         enquire->set_query(query);
 537         // We could use the value of topdoc as first parameter, but we
 538         // need to know the first few items in the mset to fake a
 539         // relevance set for topterms.
 540         //
 541         // If min_hits isn't set, check at least one extra result so we
 542         // know if we've reached the end of the matches or not - then we
 543         // can avoid offering a "next" button which leads to an empty page.
 544         mset = enquire->get_mset(0, topdoc + hits_per_page,
 545                                  topdoc + max(hits_per_page + 1, min_hits),
 546                                  &rset, mdecider);
 547     }
 548 }
 549
 550 string
 551 html_escape(const string &str)
 552 {
 553     string res;
 554     string::size_type p = 0;
 555     while (p < str.size()) {
 556         char ch = str[p++];
 557         switch (ch) {
 558             case '<':
 559                 res += "&lt;";
 560                 continue;
 561             case '>':
 562                 res += "&gt;";
 563                 continue;
 564             case '&':
 565                 res += "&amp;";
 566                 continue;
 567             case '"':
 568                 res += "&quot;";
 569                 continue;
 570             default:
 571                 res += ch;
 572         }
 573     }
 574     return res;
 575 }
 576
 577 static string
 578 html_strip(const string &str)
 579 {
 580     string res;
 581     string::size_type p = 0;
 582     bool skip = false;
 583     while (p < str.size()) {
 584         char ch = str[p++];
 585         switch (ch) {
 586             case '<':
 587                 skip = true;
 588                 continue;
 589             case '>':
 590                 skip = false;
 591                 continue;
 592             default:
 593                 if (! skip) res += ch;
 594         }
 595     }
 596     return res;
 597 }
 598
 599 // FIXME split list into hash or map and use that rather than linear lookup?
 600 static int word_in_list(const string& word, const string& list)
 601 {
 602     string::size_type split = 0, split2;
 603     int count = 0;
 604     while ((split2 = list.find('\t', split)) != string::npos) {
 605         if (word.size() == split2 - split) {
 606             if (memcmp(word.data(), list.data() + split, word.size()) == 0)
 607                 return count;
 608         }
 609         split = split2 + 1;
 610         ++count;
 611     }
 612     if (word.size() == list.size() - split) {
 613         if (memcmp(word.data(), list.data() + split, word.size()) == 0)
 614             return count;
 615     }
 616     return -1;
 617 }
 618
 619 // Not a character in an identifier
 620 inline static bool
 621 p_notid(unsigned int c)
 622 {
 623     return !C_isalnum(c) && c != '_';
 624 }
 625
 626 // Not a character in an HTML tag name
 627 inline static bool
 628 p_nottag(unsigned int c)
 629 {
 630     return !C_isalnum(c) && c != '.' && c != '-';
 631 }
 632
  633 // FIXME: shares algorithm with indextext.cc!
      //
      // Highlight the words in s which match an entry in list.
      //
      // s is split into words, and each word is lower-cased and looked up in
      // the tab-separated list using word_in_list(); if that fails then the
      // stemmed form of the word with "Z" prepended is looked up instead.  A
      // word which matches is output between bra and ket, or if bra is empty
      // in a <b> element with a background colour chosen by the index of the
      // matching entry (note that ket is only used if bra is non-empty).
      //
      // All the text from s is passed through html_escape() before being
      // added to the result (bra and ket are added as they are).
      //
      // Returns the highlighted text.
  634 static string
  635 html_highlight(const string &s, const string &list,
  636                const string &bra, const string &ket)
  637 {
      // Create the stemmer the first time it's needed.
  638     if (!stemmer) {
  639         stemmer = new Xapian::Stem(option["stemmer"]);
  640     }
  641
  642     string res;
  643
  644     Utf8Iterator j(s);
  645     const Utf8Iterator s_end;
  646     while (true) {
              // Find the start of the next word; we're done if there isn't
              // one.
  647         Utf8Iterator first = j;
  648         while (first != s_end && !is_wordchar(*first)) ++first;
  649         if (first == s_end) break;
  650         Utf8Iterator term_end;
  651         string term;
  652         string word;
              // l points to the start of the text not yet added to res.
  653         const char *l = j.raw();
  654         if (*first < 128 && C_isupper(*first)) {
                  // Try to read upper-case ASCII letters separated by single
                  // '.' characters (e.g. "U.S.A") as one term made up of just
                  // the letters.
  655             j = first;
  656             Xapian::Unicode::append_utf8(term, *j);
  657             while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
  658                 Xapian::Unicode::append_utf8(term, *j);
  659             }
                  // Give up on this if we only got one letter or a word
                  // character directly follows - the term is then read as an
                  // ordinary word below.
  660             if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
  661                 term.resize(0);
  662             }
  663             term_end = j;
  664         }
  665         if (term.empty()) {
                  // Ordinary word: a run of word characters, which may have
                  // '&' or '\'' inside it provided that's directly followed
                  // by another word character.
  666             j = first;
  667             while (is_wordchar(*j)) {
  668                 Xapian::Unicode::append_utf8(term, *j);
  669                 ++j;
  670                 if (j == s_end) break;
  671                 if (*j == '&' || *j == '\'') {
  672                     Utf8Iterator next = j;
  673                     ++next;
  674                     if (next == s_end || !is_wordchar(*next)) break;
  675                     term += *j;
  676                     j = next;
  677                 }
  678             }
  679             term_end = j;
                  // The word can have a suffix consisting of either a run of
                  // '#' (only one '#' gets added to the term) or a run of '+'
                  // and '-' characters.
  680             if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
  681                 string::size_type len = term.length();
  682                 if (*j == '#') {
  683                     term += '#';
  684                     do { ++j; } while (j != s_end && *j == '#');
  685                 } else {
  686                     while (j != s_end && (*j == '+' || *j == '-')) {
  687                         Xapian::Unicode::append_utf8(term, *j);
  688                         ++j;
  689                     }
  690                 }
                      // Drop the suffix if more than 3 characters were added
                      // for it or if a word character directly follows it, in
                      // which case term_end stays at the end of the word
                      // itself.
  691                 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
  692                     term.resize(len);
  693                 } else {
  694                     term_end = j;
  695                 }
  696             }
  697         }
  698         j = term_end;
  699         term = Xapian::Unicode::tolower(term);
  700         int match = word_in_list(term, list);
  701         if (match == -1) {
                  // Not in the list as it is, so try the stemmed form.
  702             string stem = "Z";
  703             stem += (*stemmer)(term);
  704             match = word_in_list(stem, list);
  705         }
  706         if (match >= 0) {
                  // Output the text before the word, then the word itself
                  // with highlighting around it.
  707             res += html_escape(string(l, first.raw() - l));
  708             if (!bra.empty()) {
  709                 res += bra;
  710             } else {
  711                 static const char * colours[] = {
  712                     "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
  713                     "990000", "009900", "996600", "006699", "990099"
  714                 };
  715                 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
  716                 const char * bg = colours[idx];
                      // Black text is used on the colours which contain an
                      // 'f', and white text on the others.
  717                 if (strchr(bg, 'f')) {
  718                     res += "<b style=\"color:black;background-color:#";
  719                 } else {
  720                     res += "<b style=\"color:white;background-color:#";
  721                 }
  722                 res += bg;
  723                 res += "\">";
  724             }
  725             word = string(first.raw(), j.raw() - first.raw());
  726             res += html_escape(word);
  727             if (!bra.empty()) {
  728                 res += ket;
  729             } else {
  730                 res += "</b>";
  731             }
  732         } else {
                  // No match, so output everything up to the end of the word
                  // unchanged (apart from escaping).
  733             res += html_escape(string(l, j.raw() - l));
  734         }
  735     }
          // Output any text left after the last word.
  736     if (j != s_end) res += html_escape(string(j.raw(), j.left()));
  737     return res;
  738 }
 739
  740 #if 0
      // This code is disabled.  It uses url_query_string, and the only
      // declaration of that in this part of the file is commented out too.
      //
      // Writes url_query_string to cout.  If after starts with "&B=" then the
      // character following that is taken as a prefix, and each "B=" parameter
      // which comes after an '&' in url_query_string and whose value starts
      // with that prefix character is left out.
  741 static void
  742 print_query_string(const char *after)
  743 {
  744     if (after && strncmp(after, "&B=", 3) == 0) {
  745         char prefix = after[3];
  746         string::size_type start = 0, amp = 0;
  747         while (true) {
  748             amp = url_query_string.find('&', amp);
  749             if (amp == string::npos) {
  750                 cout << url_query_string.substr(start);
  751                 return;
  752             }
  753             amp++;
  754             while (url_query_string[amp] == 'B' &&
  755                    url_query_string[amp + 1] == '=' &&
  756                    url_query_string[amp + 2] == prefix) {
  757                 cout << url_query_string.substr(start, amp - start - 1);
  758                 start = url_query_string.find('&', amp + 3);
  759                 if (start == string::npos) return;
  760                 amp = start + 1;
  761             }
  762         }
  763     }
  764     cout << url_query_string;
  765 }
  766 #endif
 767
 768 class Fields {
 769     mutable Xapian::docid did_cached;
 770     mutable map<string, string> fields;
 771
 772     void read_fields(Xapian::docid did) const;
 773
 774   public:
 775     Fields() : did_cached(0) { }
 776
 777     const string & get_field(Xapian::docid did, const string & field) const {
 778         if (did != did_cached) read_fields(did);
 779         return fields[field];
 780     }
 781 };
 782
 783 void
 784 Fields::read_fields(Xapian::docid did) const
 785 {
 786     fields.clear();
 787     did_cached = did;
 788     const string & data = db.get_document(did).get_data();
 789
 790     // Parse document data.
 791     string::size_type i = 0;
 792     const string & names = option["fieldnames"];
 793     if (!names.empty()) {
 794         // Each line is a field, with fieldnames taken from corresponding
 795         // entries in the tab-separated list specified by $opt{fieldnames}.
 796         string::size_type n = 0;
 797         do {
 798             string::size_type n0 = n;
 799             n = names.find('\t', n);
 800             string::size_type i0 = i;
 801             i = data.find('\n', i);
 802             fields.insert(make_pair(names.substr(n0, n  - n0),
 803                                     data.substr(i0, i - i0)));
 804         } while (++n && ++i);
 805     } else {
 806         // Each line is a field, in the format NAME=VALUE.  We assume the field
 807         // name doesn't contain an "=".  Lines without an "=" are currently
 808         // just ignored.
 809         do {
 810             string::size_type i0 = i;
 811             i = data.find('\n', i);
 812             string line = data.substr(i0, i - i0);
 813             string::size_type j = line.find('=');
 814             if (j != string::npos) {
 815                 string & value = fields[line.substr(0, j)];
 816                 if (!value.empty()) value += '\t';
 817                 value += line.substr(j + 1);
 818             }
 819         } while (++i);
 820     }
 821 }
 822
  823 static Fields fields; // Cache of the fields of the document most recently looked at.
      // NOTE(review): the variables below aren't used in this part of the
      // file; from their names they presumably describe the hit currently
      // being formatted - confirm against the code which sets them.
  824 static Xapian::docid q0;
  825 static Xapian::doccount hit_no;
  826 static int percent;
  827 static double weight;
  828 static Xapian::doccount collapsed;
  829
      // Forward declaration - the definition isn't in this part of the file.
  830 static string print_caption(const string &fmt, const vector<string> &param);
 831
  832 enum tagval {
      // Tags identifying the commands.  The T() macro used to build func_tab
      // forms the tag for the command named F as CMD_##F, so the part of each
      // enumerator's name after "CMD_" is the name of the command.
      //
      // CMD_ is the tag for the func_tab entry with an empty name.
  833 CMD_,
  834 CMD_add,
  835 CMD_addfilter,
  836 CMD_allterms,
  837 CMD_and,
  838 CMD_cgi,
  839 CMD_cgilist,
  840 CMD_collapsed,
  841 CMD_date,
  842 CMD_dbname,
  843 CMD_dbsize,
  844 CMD_def,
  845 CMD_defaultop,
  846 CMD_div,
  847 CMD_eq,
  848 CMD_emptydocs,
  849 CMD_env,
  850 CMD_error,
  851 CMD_field,
  852 CMD_filesize,
  853 CMD_filters,
  854 CMD_filterterms,
  855 CMD_find,
  856 CMD_fmt,
  857 CMD_freq,
  858 CMD_ge,
  859 CMD_gt,
  860 CMD_highlight,
  861 CMD_hit,
  862 CMD_hitlist,
  863 CMD_hitsperpage,
  864 CMD_hostname,
  865 CMD_html,
  866 CMD_htmlstrip,
  867 CMD_httpheader,
  868 CMD_id,
  869 CMD_if,
  870 CMD_include,
  871 CMD_json,
  872 CMD_jsonarray,
  873 CMD_last,
  874 CMD_lastpage,
  875 CMD_le,
  876 CMD_length,
  877 CMD_list,
  878 CMD_log,
  879 CMD_lookup,
  880 CMD_lower,
  881 CMD_lt,
  882 CMD_map,
  883 CMD_max,
  884 CMD_min,
  885 CMD_mod,
  886 CMD_msize,
  887 CMD_msizeexact,
  888 CMD_mul,
  889 CMD_muldiv,
  890 CMD_ne,
  891 CMD_nice,
  892 CMD_not,
  893 CMD_now,
  894 CMD_opt,
  895 CMD_or,
  896 CMD_pack,
  897 CMD_percentage,
  898 CMD_prettyterm,
  899 CMD_prettyurl,
  900 CMD_query,
  901 CMD_querydescription,
  902 CMD_queryterms,
  903 CMD_range,
  904 CMD_record,
  905 CMD_relevant,
  906 CMD_relevants,
  907 CMD_score,
  908 CMD_set,
  909 CMD_setmap,
  910 CMD_setrelevant,
  911 CMD_slice,
  912 CMD_snippet,
  913 CMD_split,
  914 CMD_stoplist,
  915 CMD_sub,
  916 CMD_substr,
  917 CMD_suggestion,
  918 CMD_terms,
  919 CMD_thispage,
  920 CMD_time,
  921 CMD_topdoc,
  922 CMD_topterms,
  923 CMD_transform,
  924 CMD_truncate,
  925 CMD_uniq,
  926 CMD_unpack,
  927 CMD_unstem,
  928 CMD_upper,
  929 CMD_url,
  930 CMD_value,
  931 CMD_version,
  932 CMD_weight,
  933 CMD_MACRO // special tag for macro evaluation
  934 };
 935
 936 struct func_attrib {
 937     int tag;
 938     int minargs, maxargs, evalargs;
 939     char ensure;
 940 };
 941
 942 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
 943 struct func_desc {
 944     const char *name;
 945     struct func_attrib a;
 946 };
 947
// Shorthand used in the func_tab columns below (eval() tests the same values):
//   N  as minargs: the argument text is passed through as one argument,
//      neither split at commas nor evaluated;
//      as maxargs: no upper limit on the number of arguments;
//      as evalargs: every argument is evaluated before the command runs
//      (any other evalargs value is the number of leading arguments to
//      evaluate).
//   Q  eval() calls ensure_query_parsed() before running the command.
//   M  eval() calls ensure_query_parsed() and then ensure_match() first.
#define N -1
#define M 'M'
#define Q 'Q'
// NB when adding a new command which ensures M or Q, update the list in
// docs/omegascript.rst
//
// Table of the built-in commands, terminated by an entry whose name is NULL.
// eval() builds its name -> attributes lookup map from this on first use.
static struct func_desc func_tab[] = {
//name minargs maxargs evalargs ensure
{"",{CMD_,         N, N, 0, 0}},// commented out code
T(add,             0, N, N, 0), // add a list of numbers
T(addfilter,       1, 1, N, 0), // add filter term
T(allterms,        0, 1, N, 0), // list of all terms matching document
T(and,             1, N, 0, 0), // logical shortcutting and of a list of values
T(cgi,             1, 1, N, 0), // return cgi parameter value
T(cgilist,         1, 1, N, 0), // return list of values for cgi parameter
T(collapsed,       0, 0, N, 0), // return number of hits collapsed into this
T(date,            1, 2, N, 0), // convert time_t to strftime format
                                // (default: YYYY-MM-DD)
T(dbname,          0, 0, N, 0), // database name
T(dbsize,          0, 0, N, 0), // database size (# of documents)
T(def,             2, 2, 1, 0), // define a macro
T(defaultop,       0, 0, N, 0), // default operator: "and" or "or"
T(div,             2, 2, N, 0), // integer divide
T(emptydocs,       0, 1, N, 0), // list of empty documents
T(env,             1, 1, N, 0), // environment variable
T(error,           0, 0, N, 0), // error message
T(eq,              2, 2, N, 0), // test equality
T(field,           1, 2, N, 0), // lookup field in record
T(filesize,        1, 1, N, 0), // pretty printed filesize
T(filters,         0, 0, N, 0), // serialisation of current filters
T(filterterms,     1, 1, N, 0), // list of terms with a given prefix
T(find,            2, 2, N, 0), // find entry in list
T(fmt,             0, 0, N, 0), // name of current format
T(freq,            1, 1, N, 0), // frequency of a term
T(ge,              2, 2, N, 0), // test >=
T(gt,              2, 2, N, 0), // test >
T(highlight,       2, 4, N, 0), // html escape and highlight words from list
T(hit,             0, 0, N, 0), // hit number of current mset entry (starting
                                // from 0)
T(hitlist,         1, 1, 0, M), // display hitlist using format in argument
T(hitsperpage,     0, 0, N, 0), // hits per page
T(hostname,        1, 1, N, 0), // extract hostname from URL
T(html,            1, 1, N, 0), // html escape string (<>&")
T(htmlstrip,       1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
T(httpheader,      2, 2, N, 0), // arbitrary HTTP header
T(id,              0, 0, N, 0), // docid of current doc
T(if,              2, 3, 1, 0), // conditional
T(include,         1, 1, 1, 0), // include another file
T(json,            1, 1, N, 0), // JSON string escaping
T(jsonarray,       1, 1, N, 0), // Format list as a JSON array of strings
T(last,            0, 0, N, M), // hit number one beyond end of current page
T(lastpage,        0, 0, N, M), // number of last hit page
T(le,              2, 2, N, 0), // test <=
T(length,          1, 1, N, 0), // length of list
T(list,            2, 5, N, 0), // pretty print list
T(log,             1, 2, 1, 0), // create a log entry
T(lookup,          2, 2, N, 0), // lookup in named cdb file
T(lower,           1, 1, N, 0), // convert string to lower case
T(lt,              2, 2, N, 0), // test <
T(map,             1, 2, 1, 0), // map a list into another list
T(max,             1, N, N, 0), // maximum of a list of values
T(min,             1, N, N, 0), // minimum of a list of values
T(mod,             2, 2, N, 0), // integer modulus
T(msize,           0, 0, N, M), // number of matches
T(msizeexact,      0, 0, N, M), // is $msize exact?
T(mul,             2, N, N, 0), // multiply a list of numbers
T(muldiv,          3, 3, N, 0), // calculate A*B/C
T(ne,              2, 2, N, 0), // test not equal
T(nice,            1, 1, N, 0), // pretty print integer (with thousands sep)
T(not,             1, 1, N, 0), // logical not
T(now,             0, 0, N, 0), // current date/time as a time_t
T(opt,             1, 2, N, 0), // lookup an option value
T(or,              1, N, 0, 0), // logical shortcutting or of a list of values
T(pack,            1, 1, N, 0), // convert a number to a 4 byte big endian binary string
T(percentage,      0, 0, N, 0), // percentage score of current hit
T(prettyterm,      1, 1, N, Q), // pretty print term name
T(prettyurl,       1, 1, N, 0), // pretty version of URL
T(query,           0, 1, N, Q), // query
T(querydescription,0, 0, N, Q), // query.get_description()
T(queryterms,      0, 0, N, Q), // list of query terms
T(range,           2, 2, N, 0), // return list of values between start and end
T(record,          0, 1, N, 0), // record contents of document
T(relevant,        0, 1, N, Q), // is document relevant?
T(relevants,       0, 0, N, Q), // return list of relevant documents
T(score,           0, 0, N, 0), // score (0-10) of current hit
T(set,             2, 2, N, 0), // set option value
T(setmap,          1, N, N, 0), // set map of option values
T(setrelevant,     0, 1, N, Q), // set rset
T(slice,           2, 2, N, 0), // slice a list using a second list
T(snippet,         1, 2, N, 0), // generate snippet from text
T(split,           1, 2, N, 0), // split a string to give a list
T(stoplist,        0, 0, N, Q), // return list of stopped terms
T(sub,             2, 2, N, 0), // subtract
T(substr,          2, 3, N, 0), // substring
T(suggestion,      0, 0, N, Q), // misspelled word correction suggestion
T(terms,           0, 0, N, M), // list of matching terms
T(thispage,        0, 0, N, M), // page number of current page
T(time,            0, 0, N, M), // how long the match took (in seconds)
T(topdoc,          0, 0, N, M), // first document on current page of hit list
                                // (counting from 0)
T(topterms,        0, 1, N, M), // list of up to N top relevance feedback terms
                                // (default 16)
T(transform,       3, 3, N, 0), // transform with a regexp
T(truncate,        2, 4, N, 0), // truncate after a word
T(uniq,            1, 1, N, 0), // remove duplicates from a sorted list
T(unpack,          1, 1, N, 0), // convert 4 byte big endian binary string to a number
T(unstem,          1, 1, N, Q), // return list of probabilistic terms from
                                // the query which stemmed to this term
T(upper,           1, 1, N, 0), // convert string to upper case
T(url,             1, 1, N, 0), // url encode argument
T(value,           1, 2, N, 0), // return document value
T(version,         0, 0, N, 0), // omega version string
T(weight,          0, 0, N, 0), // weight of the current hit
{ NULL,{0,         0, 0, 0, 0}}
};

#undef T // Leaving T defined screws up Sun's C++ compiler!
1064
// Bodies of the macros defined with $def, in definition order.  The macro
// stored at index i is given the tag CMD_MACRO + i (see CMD_def in eval()).
static vector<string> macros;
1066
1067 // Call write() repeatedly until all data is written or we get a
1068 // non-recoverable error.
1069 static ssize_t
1070 write_all(int fd, const char * buf, size_t count)
1071 {
1072     while (count) {
1073         ssize_t r = write(fd, buf, count);
1074         if (rare(r < 0)) {
1075             if (errno == EINTR) continue;
1076             return r;
1077         }
1078         buf += r;
1079         count -= r;
1080     }
1081     return 0;
1082 }
1083
1084 static string
1085 eval(const string &fmt, const vector<string> &param)
1086 {
1087     static map<string, const struct func_attrib *> func_map;
1088     if (func_map.empty()) {
1089         struct func_desc *p;
1090         for (p = func_tab; p->name != NULL; p++) {
1091             func_map[string(p->name)] = &(p->a);
1092         }
1093     }
1094     string res;
1095     string::size_type p = 0, q;
1096     while ((q = fmt.find('$', p)) != string::npos) try {
1097         res += fmt.substr(p, q - p);
1098         string::size_type code_start = q; // note down for error reporting
1099         q++;
1100         if (q >= fmt.size()) break;
1101         unsigned char ch = fmt[q];
1102         switch (ch) {
1103             // Magic sequences:
1104             // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1105             case '$':
1106                 res += '$';
1107                 p = q + 1;
1108                 continue;
1109             case '(':
1110                 res += '{';
1111                 p = q + 1;
1112                 continue;
1113             case ')':
1114                 res += '}';
1115                 p = q + 1;
1116                 continue;
1117             case '.':
1118                 res += ',';
1119                 p = q + 1;
1120                 continue;
1121             case '_':
1122                 ch = '0';
1123                 // FALL THRU
1124             case '1': case '2': case '3': case '4': case '5':
1125             case '6': case '7': case '8': case '9':
1126                 ch -= '0';
1127                 if (ch < param.size()) res += param[ch];
1128                 p = q + 1;
1129                 continue;
1130             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1131             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1132             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1133             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1134             case 'y': case 'z':
1135             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1136             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1137             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1138             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1139             case 'Y': case 'Z':
1140             case '{':
1141                 break;
1142             default:
1143                 string msg = "Unknown $ code in: $" + fmt.substr(q);
1144                 throw msg;
1145         }
1146         p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1147         string var = fmt.substr(q, p - q);
1148         map<string, const struct func_attrib *>::const_iterator func;
1149         func = func_map.find(var);
1150         if (func == func_map.end()) {
1151             throw "Unknown function '" + var + "'";
1152         }
1153         vector<string> args;
1154         if (fmt[p] == '{') {
1155             q = p + 1;
1156             int nest = 1;
1157             while (true) {
1158                 p = fmt.find_first_of(",{}", p + 1);
1159                 if (p == string::npos)
1160                     throw "missing } in " + fmt.substr(code_start);
1161                 if (fmt[p] == '{') {
1162                     ++nest;
1163                 } else {
1164                     if (nest == 1) {
1165                         // should we split the args
1166                         if (func->second->minargs != N) {
1167                             args.push_back(fmt.substr(q, p - q));
1168                             q = p + 1;
1169                         }
1170                     }
1171                     if (fmt[p] == '}' && --nest == 0) break;
1172                 }
1173             }
1174             if (func->second->minargs == N)
1175                 args.push_back(fmt.substr(q, p - q));
1176             p++;
1177         }
1178
1179         if (func->second->minargs != N) {
1180             if ((int)args.size() < func->second->minargs)
1181                 throw "too few arguments to $" + var;
1182             if (func->second->maxargs != N &&
1183                 (int)args.size() > func->second->maxargs)
1184                 throw "too many arguments to $" + var;
1185
1186             vector<string>::size_type n;
1187             if (func->second->evalargs != N)
1188                 n = func->second->evalargs;
1189             else
1190                 n = args.size();
1191
1192             for (vector<string>::size_type j = 0; j < n; j++)
1193                 args[j] = eval(args[j], param);
1194         }
1195         if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1196             ensure_query_parsed();
1197         if (func->second->ensure == 'M') ensure_match();
1198         string value;
1199         switch (func->second->tag) {
1200             case CMD_:
1201                 break;
1202             case CMD_add: {
1203                 int total = 0;
1204                 vector<string>::const_iterator i;
1205                 for (i = args.begin(); i != args.end(); i++)
1206                     total += string_to_int(*i);
1207                 value = str(total);
1208                 break;
1209             }
1210             case CMD_addfilter:
1211                 add_bterm(args[0]);
1212                 break;
1213             case CMD_allterms: {
1214                 // list of all terms indexing document
1215                 int id = q0;
1216                 if (!args.empty()) id = string_to_int(args[0]);
1217                 Xapian::TermIterator term = db.termlist_begin(id);
1218                 for ( ; term != db.termlist_end(id); term++) {
1219                     value += *term;
1220                     value += '\t';
1221                 }
1222
1223                 if (!value.empty()) value.erase(value.size() - 1);
1224                 break;
1225             }
1226             case CMD_and: {
1227                 value = "true";
1228                 for (vector<string>::const_iterator i = args.begin();
1229                      i != args.end(); i++) {
1230                     if (eval(*i, param).empty()) {
1231                         value.resize(0);
1232                         break;
1233                     }
1234                 }
1235                 break;
1236             }
1237             case CMD_cgi: {
1238                 MCI i = cgi_params.find(args[0]);
1239                 if (i != cgi_params.end()) value = i->second;
1240                 break;
1241             }
1242             case CMD_cgilist: {
1243                 pair<MCI, MCI> g;
1244                 g = cgi_params.equal_range(args[0]);
1245                 for (MCI i = g.first; i != g.second; i++) {
1246                     value += i->second;
1247                     value += '\t';
1248                 }
1249                 if (!value.empty()) value.erase(value.size() - 1);
1250                 break;
1251             }
1252             case CMD_collapsed: {
1253                 value = str(collapsed);
1254                 break;
1255             }
1256             case CMD_date:
1257                 value = args[0];
1258                 if (!value.empty()) {
1259                     char buf[64] = "";
1260                     time_t date = string_to_int(value);
1261                     if (date != (time_t)-1) {
1262                         struct tm *then;
1263                         then = gmtime(&date);
1264                         string date_fmt = "%Y-%m-%d";
1265                         if (args.size() > 1) date_fmt = eval(args[1], param);
1266                         strftime(buf, sizeof buf, date_fmt.c_str(), then);
1267                     }
1268                     value = buf;
1269                 }
1270                 break;
1271             case CMD_dbname:
1272                 value = dbname;
1273                 break;
1274             case CMD_dbsize: {
1275                 static Xapian::doccount dbsize;
1276                 if (!dbsize) dbsize = db.get_doccount();
1277                 value = str(dbsize);
1278                 break;
1279             }
1280             case CMD_def: {
1281                 func_attrib *fa = new func_attrib;
1282                 fa->tag = CMD_MACRO + macros.size();
1283                 fa->minargs = 0;
1284                 fa->maxargs = 9;
1285                 fa->evalargs = N; // FIXME: or 0?
1286                 fa->ensure = 0;
1287
1288                 macros.push_back(args[1]);
1289                 func_map[args[0]] = fa;
1290                 break;
1291             }
1292             case CMD_defaultop:
1293                 if (default_op == Xapian::Query::OP_AND) {
1294                     value = "and";
1295                 } else {
1296                     value = "or";
1297                 }
1298                 break;
1299             case CMD_div: {
1300                 int denom = string_to_int(args[1]);
1301                 if (denom == 0) {
1302                     value = "divide by 0";
1303                 } else {
1304                     value = str(string_to_int(args[0]) /
1305                                 string_to_int(args[1]));
1306                 }
1307                 break;
1308             }
1309             case CMD_eq:
1310                 if (args[0] == args[1]) value = "true";
1311                 break;
1312             case CMD_emptydocs: {
1313                 string t;
1314                 if (!args.empty())
1315                     t = args[0];
1316                 Xapian::PostingIterator i;
1317                 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1318                     if (i.get_doclength() != 0) continue;
1319                     if (!value.empty()) value += '\t';
1320                     value += str(*i);
1321                 }
1322                 break;
1323             }
1324             case CMD_env: {
1325                 char *env = getenv(args[0].c_str());
1326                 if (env != NULL) value = env;
1327                 break;
1328             }
1329             case CMD_error:
1330                 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1331                     error_msg = "Database '" + dbname + "' couldn't be opened";
1332                 }
1333                 value = error_msg;
1334                 break;
1335             case CMD_field: {
1336                 Xapian::docid did = q0;
1337                 if (args.size() > 1) did = string_to_int(args[1]);
1338                 value = fields.get_field(did, args[0]);
1339                 break;
1340             }
1341             case CMD_filesize: {
1342                 // FIXME: rounding?  i18n?
1343                 int size = string_to_int(args[0]);
1344                 int intpart = size;
1345                 int fraction = -1;
1346                 const char * format = 0;
1347                 if (size < 0) {
1348                     // Negative size -> empty result.
1349                 } else if (size == 1) {
1350                     format = "%d byte";
1351                 } else if (size < 1024) {
1352                     format = "%d bytes";
1353                 } else {
1354                     if (size < 1024*1024) {
1355                         format = "%d.%cK";
1356                     } else {
1357                         size /= 1024;
1358                         if (size < 1024*1024) {
1359                             format = "%d.%cM";
1360                         } else {
1361                             size /= 1024;
1362                             format = "%d.%cG";
1363                         }
1364                     }
1365                     intpart = unsigned(size) / 1024;
1366                     fraction = unsigned(size) % 1024;
1367                 }
1368                 if (format) {
1369                     char buf[200];
1370                     int len;
1371                     if (fraction == -1) {
1372                         len = my_snprintf(buf, sizeof(buf), format, intpart);
1373                     } else {
1374                         fraction = (fraction * 10 / 1024) + '0';
1375                         len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1376                     }
1377                     if (len < 0 || (unsigned)len > sizeof(buf)) len = sizeof(buf);
1378                     value.assign(buf, len);
1379                 }
1380                 break;
1381             }
1382             case CMD_filters:
1383                 value = filters;
1384                 break;
1385             case CMD_filterterms: {
1386                 Xapian::TermIterator term = db.allterms_begin();
1387                 term.skip_to(args[0]);
1388                 while (term != db.allterms_end()) {
1389                     string t = *term;
1390                     if (!startswith(t, args[0])) break;
1391                     value += t;
1392                     value += '\t';
1393                     ++term;
1394                 }
1395
1396                 if (!value.empty()) value.erase(value.size() - 1);
1397                 break;
1398             }
1399             case CMD_find: {
1400                 string l = args[0], s = args[1];
1401                 string::size_type i = 0, j = 0;
1402                 size_t count = 0;
1403                 while (j != l.size()) {
1404                     j = l.find('\t', i);
1405                     if (j == string::npos) j = l.size();
1406                     if (j - i == s.length()) {
1407                         if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1408                             value = str(count);
1409                             break;
1410                         }
1411                     }
1412                     ++count;
1413                     i = j + 1;
1414                 }
1415                 break;
1416             }
1417             case CMD_fmt:
1418                 value = fmtname;
1419                 break;
1420             case CMD_freq:
1421                 try {
1422                     value = str(mset.get_termfreq(args[0]));
1423                 } catch (const Xapian::InvalidOperationError&) {
1424                     // An MSet will raise this error if it's empty and not
1425                     // associated with a search.
1426                     value = str(db.get_termfreq(args[0]));
1427                 }
1428                 break;
1429             case CMD_ge:
1430                 if (string_to_int(args[0]) >= string_to_int(args[1]))
1431                     value = "true";
1432                 break;
1433             case CMD_gt:
1434                 if (string_to_int(args[0]) > string_to_int(args[1]))
1435                     value = "true";
1436                 break;
1437             case CMD_highlight: {
1438                 string bra, ket;
1439                 if (args.size() > 2) {
1440                     bra = args[2];
1441                     if (args.size() > 3) {
1442                         ket = args[3];
1443                     } else {
1444                         string::const_iterator i;
1445                         i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1446                         ket = "</";
1447                         ket += bra.substr(1, i - bra.begin() - 1);
1448                         ket += '>';
1449                     }
1450                 }
1451
1452                 value = html_highlight(args[0], args[1], bra, ket);
1453                 break;
1454             }
1455             case CMD_hit:
1456                 // 0-based mset index
1457                 value = str(hit_no);
1458                 break;
1459             case CMD_hitlist:
1460 #if 0
1461                 url_query_string = "?DB=";
1462                 url_query_string += dbname;
1463                 multimap<string, string>::const_iterator j;
1464                 for (j = probabilistic_query.begin();
1465                      j != probabilistic_query.end();
1466                      ++j) {
1467                     if (j->first.empty()) {
1468                         url_query_string += "&P=";
1469                     } else {
1470                         url_query_string += "&P."
1471                         url_query_string += j->first;
1472                         url_query_string += '=';
1473                     }
1474                     const char *q = j->second.c_str();
1475                     int ch;
1476                     while ((ch = *q++) != '\0') {
1477                         switch (ch) {
1478                          case '+':
1479                             url_query_string += "%2b";
1480                             break;
1481                          case '"':
1482                             url_query_string += "%22";
1483                             break;
1484                          case '%':
1485                             url_query_string += "%25";
1486                             break;
1487                          case '&':
1488                             url_query_string += "%26";
1489                             break;
1490                          case ' ':
1491                             ch = '+';
1492                             /* fall through */
1493                          default:
1494                             url_query_string += ch;
1495                         }
1496                     }
1497                 }
1498                 // add any boolean terms
1499                 for (FMCI i = filter_map.begin(); i != filter_map.end(); i++) {
1500                     url_query_string += "&B=";
1501                     url_query_string += i->second;
1502                 }
1503 #endif
1504                 for (hit_no = topdoc; hit_no < last; hit_no++)
1505                     value += print_caption(args[0], param);
1506                 hit_no = 0;
1507                 break;
1508             case CMD_hitsperpage:
1509                 value = str(hits_per_page);
1510                 break;
1511             case CMD_hostname: {
1512                 value = args[0];
1513                 // remove URL scheme and/or path
1514                 string::size_type i = value.find("://");
1515                 if (i == string::npos) i = 0; else i += 3;
1516                 value = value.substr(i, value.find('/', i) - i);
1517                 // remove user@ or user:password@
1518                 i = value.find('@');
1519                 if (i != string::npos) value.erase(0, i + 1);
1520                 // remove :port
1521                 i = value.find(':');
1522                 if (i != string::npos) value.resize(i);
1523                 break;
1524             }
1525             case CMD_html:
1526                 value = html_escape(args[0]);
1527                 break;
1528             case CMD_htmlstrip:
1529                 value = html_strip(args[0]);
1530                 break;
1531             case CMD_httpheader:
1532                 if (!suppress_http_headers) {
1533                     cout << args[0] << ": " << args[1] << endl;
1534                     if (!set_content_type && args[0].length() == 12 &&
1535                             strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1536                         set_content_type = true;
1537                     }
1538                 }
1539                 break;
1540             case CMD_id:
1541                 // document id
1542                 value = str(q0);
1543                 break;
1544             case CMD_if:
1545                 if (!args[0].empty())
1546                     value = eval(args[1], param);
1547                 else if (args.size() > 2)
1548                     value = eval(args[2], param);
1549                 break;
1550             case CMD_include:
1551                 value = eval_file(args[0]);
1552                 break;
1553             case CMD_json:
1554                 value = args[0];
1555                 json_escape(value);
1556                 break;
1557             case CMD_jsonarray: {
1558                 const string & l = args[0];
1559                 string::size_type i = 0, j;
1560                 if (l.empty()) {
1561                     value = "[]";
1562                     break;
1563                 }
1564                 value = "[\"";
1565                 while (true) {
1566                     j = l.find('\t', i);
1567                     string elt(l, i, j - i);
1568                     json_escape(elt);
1569                     value += elt;
1570                     if (j == string::npos) break;
1571                     value += "\",\"";
1572                     i = j + 1;
1573                 }
1574                 value += "\"]";
1575                 break;
1576             }
1577             case CMD_last:
1578                 value = str(last);
1579                 break;
1580             case CMD_lastpage: {
1581                 int l = mset.get_matches_estimated();
1582                 if (l > 0) l = (l - 1) / hits_per_page + 1;
1583                 value = str(l);
1584                 break;
1585             }
1586             case CMD_le:
1587                 if (string_to_int(args[0]) <= string_to_int(args[1]))
1588                     value = "true";
1589                 break;
1590             case CMD_length:
1591                 if (args[0].empty()) {
1592                     value = "0";
1593                 } else {
1594                     size_t length = count(args[0].begin(), args[0].end(), '\t');
1595                     value = str(length + 1);
1596                 }
1597                 break;
1598             case CMD_list: {
1599                 if (!args[0].empty()) {
1600                     string pre, inter, interlast, post;
1601                     switch (args.size()) {
1602                      case 2:
1603                         inter = interlast = args[1];
1604                         break;
1605                      case 3:
1606                         inter = args[1];
1607                         interlast = args[2];
1608                         break;
1609                      case 4:
1610                         pre = args[1];
1611                         inter = interlast = args[2];
1612                         post = args[3];
1613                         break;
1614                      case 5:
1615                         pre = args[1];
1616                         inter = args[2];
1617                         interlast = args[3];
1618                         post = args[4];
1619                         break;
1620                     }
1621                     value += pre;
1622                     string list = args[0];
1623                     string::size_type split = 0, split2;
1624                     while ((split2 = list.find('\t', split)) != string::npos) {
1625                         if (split) value += inter;
1626                         value += list.substr(split, split2 - split);
1627                         split = split2 + 1;
1628                     }
1629                     if (split) value += interlast;
1630                     value += list.substr(split);
1631                     value += post;
1632                 }
1633                 break;
1634             }
1635             case CMD_log: {
1636                 if (!vet_filename(args[0])) break;
1637                 string logfile = log_dir + args[0];
1638                 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1639                 if (fd == -1) break;
1640                 vector<string> noargs;
1641                 noargs.resize(1);
1642                 string line;
1643                 if (args.size() > 1) {
1644                     line = args[1];
1645                 } else {
1646                     line = DEFAULT_LOG_ENTRY;
1647                 }
1648                 line = eval(line, noargs);
1649                 line += '\n';
1650                 (void)write_all(fd, line.data(), line.length());
1651                 close(fd);
1652                 break;
1653             }
1654             case CMD_lookup: {
1655                 if (!vet_filename(args[0])) break;
1656                 string cdbfile = cdb_dir + args[0];
1657                 int fd = open(cdbfile.c_str(), O_RDONLY);
1658                 if (fd == -1) break;
1659
1660                 struct cdb cdb;
1661                 cdb_init(&cdb, fd);
1662
1663                 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1664                     size_t datalen = cdb_datalen(&cdb);
1665                     const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1666                     if (q) {
1667                         value = string(static_cast<const char *>(dat), datalen);
1668                     }
1669                 }
1670
1671                 cdb_free(&cdb);
1672                 close(fd); // FIXME: cache fds?
1673                 break;
1674             }
1675             case CMD_lower:
1676                 value = Xapian::Unicode::tolower(args[0]);
1677                 break;
1678             case CMD_lt:
1679                 if (string_to_int(args[0]) < string_to_int(args[1]))
1680                     value = "true";
1681                 break;
1682             case CMD_map:
1683                 if (!args[0].empty()) {
1684                     string l = args[0], pat = args[1];
1685                     vector<string> new_args(param);
1686                     string::size_type i = 0, j;
1687                     while (true) {
1688                         j = l.find('\t', i);
1689                         new_args[0] = l.substr(i, j - i);
1690                         value += eval(pat, new_args);
1691                         if (j == string::npos) break;
1692                         value += '\t';
1693                         i = j + 1;
1694                     }
1695                 }
1696                 break;
1697             case CMD_max: {
1698                 vector<string>::const_iterator i = args.begin();
1699                 int val = string_to_int(*i++);
1700                 for (; i != args.end(); i++) {
1701                     int x = string_to_int(*i);
1702                     if (x > val) val = x;
1703                 }
1704                 value = str(val);
1705                 break;
1706             }
1707             case CMD_min: {
1708                 vector<string>::const_iterator i = args.begin();
1709                 int val = string_to_int(*i++);
1710                 for (; i != args.end(); i++) {
1711                     int x = string_to_int(*i);
1712                     if (x < val) val = x;
1713                 }
1714                 value = str(val);
1715                 break;
1716             }
1717             case CMD_msize:
1718                 // number of matches
1719                 value = str(mset.get_matches_estimated());
1720                 break;
1721             case CMD_msizeexact:
1722                 // is msize exact?
1723                 if (mset.get_matches_lower_bound()
1724                     == mset.get_matches_upper_bound())
1725                     value = "true";
1726                 break;
1727             case CMD_mod: {
1728                 int denom = string_to_int(args[1]);
1729                 if (denom == 0) {
1730                     value = "divide by 0";
1731                 } else {
1732                     value = str(string_to_int(args[0]) %
1733                                 string_to_int(args[1]));
1734                 }
1735                 break;
1736             }
1737             case CMD_mul: {
1738                 vector<string>::const_iterator i = args.begin();
1739                 int total = string_to_int(*i++);
1740                 while (i != args.end())
1741                     total *= string_to_int(*i++);
1742                 value = str(total);
1743                 break;
1744             }
1745             case CMD_muldiv: {
1746                 int denom = string_to_int(args[2]);
1747                 if (denom == 0) {
1748                     value = "divide by 0";
1749                 } else {
1750                     int num = string_to_int(args[0]) * string_to_int(args[1]);
1751                     value = str(num / denom);
1752                 }
1753                 break;
1754             }
1755             case CMD_ne:
1756                 if (args[0] != args[1]) value = "true";
1757                 break;
1758             case CMD_nice: {
1759                 string::const_iterator i = args[0].begin();
1760                 int len = args[0].length();
1761                 while (len) {
1762                     value += *i++;
1763                     if (--len && len % 3 == 0) value += option["thousand"];
1764                 }
1765                 break;
1766             }
1767             case CMD_not:
1768                 if (args[0].empty()) value = "true";
1769                 break;
1770             case CMD_now: {
1771                 char buf[64];
1772                 my_snprintf(buf, sizeof(buf), "%lu", (unsigned long)time(NULL));
1773                 // MSVC's snprintf omits the zero byte if the string if
1774                 // sizeof(buf) long.
1775                 buf[sizeof(buf) - 1] = '\0';
1776                 value = buf;
1777                 break;
1778             }
1779             case CMD_opt:
1780                 if (args.size() == 2) {
1781                     value = option[args[0] + "," + args[1]];
1782                 } else {
1783                     value = option[args[0]];
1784                 }
1785                 break;
1786             case CMD_or: {
1787                 for (vector<string>::const_iterator i = args.begin();
1788                      i != args.end(); i++) {
1789                     value = eval(*i, param);
1790                     if (!value.empty()) break;
1791                 }
1792                 break;
1793             }
1794             case CMD_pack:
1795                 value = int_to_binary_string(string_to_int(args[0]));
1796                 break;
1797             case CMD_percentage:
1798                 // percentage score
1799                 value = str(percent);
1800                 break;
1801             case CMD_prettyterm:
1802                 value = pretty_term(args[0]);
1803                 break;
1804             case CMD_prettyurl:
1805                 value = args[0];
1806                 url_prettify(value);
1807                 break;
1808             case CMD_query: {
1809                 pair<multimap<string, string>::const_iterator,
1810                      multimap<string, string>::const_iterator> r;
1811                 r = probabilistic_query.equal_range(args.empty() ?
1812                                                     string() : args[0]);
1813                 multimap<string, string>::const_iterator j;
1814                 for (j = r.first; j != r.second; ++j) {
1815                     if (!value.empty()) value += '\t';
1816                     const string & s = j->second;
1817                     size_t start = 0, tab;
1818                     while ((tab = s.find('\t', start)) != string::npos) {
1819                         value.append(s, start, tab - start);
1820                         value += ' ';
1821                         start = tab + 1;
1822                     }
1823                     value.append(s, start, string::npos);
1824                 }
1825                 break;
1826             }
1827             case CMD_querydescription:
1828                 value = query.get_description();
1829                 break;
1830             case CMD_queryterms:
1831                 value = queryterms;
1832                 break;
1833             case CMD_range: {
1834                 int start = string_to_int(args[0]);
1835                 int end = string_to_int(args[1]);
1836                 while (start <= end) {
1837                     value += str(start);
1838                     if (start < end) value += '\t';
1839                     start++;
1840                 }
1841                 break;
1842             }
1843             case CMD_record: {
1844                 int id = q0;
1845                 if (!args.empty()) id = string_to_int(args[0]);
1846                 value = db.get_document(id).get_data();
1847                 break;
1848             }
1849             case CMD_relevant: {
1850                 // document id if relevant; empty otherwise
1851                 int id = q0;
1852                 if (!args.empty()) id = string_to_int(args[0]);
1853                 map<Xapian::docid, bool>::iterator i = ticked.find(id);
1854                 if (i != ticked.end()) {
1855                     i->second = false; // icky side-effect
1856                     value = str(id);
1857                 }
1858                 break;
1859             }
1860             case CMD_relevants: {
1861                 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
1862                      i != ticked.end(); i++) {
1863                     if (i->second) {
1864                         value += str(i->first);
1865                         value += '\t';
1866                     }
1867                 }
1868                 if (!value.empty()) value.erase(value.size() - 1);
1869                 break;
1870             }
1871             case CMD_score:
1872                 // Score (0 to 10)
1873                 value = str(percent / 10);
1874                 break;
1875             case CMD_set:
1876                 option[args[0]] = args[1];
1877                 break;
1878             case CMD_setmap: {
1879                 string base = args[0] + ',';
1880                 if (args.size() % 2 != 1)
1881                     throw string("$setmap requires an odd number of arguments");
1882                 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
1883                     option[base + args[i]] = args[i + 1];
1884                 }
1885                 break;
1886             }
1887             case CMD_setrelevant: {
1888                 string::size_type i = 0, j;
1889                 while (true) {
1890                     j = args[0].find_first_not_of("0123456789", i);
1891                     Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
1892                     if (id) {
1893                         rset.add_document(id);
1894                         ticked[id] = true;
1895                     }
1896                     if (j == string::npos) break;
1897                     i = j + 1;
1898                 }
1899                 break;
1900             }
1901             case CMD_slice: {
1902                 string list = args[0], pos = args[1];
1903                 vector<string> items;
1904                 string::size_type i = 0, j;
1905                 while (true) {
1906                     j = list.find('\t', i);
1907                     items.push_back(list.substr(i, j - i));
1908                     if (j == string::npos) break;
1909                     i = j + 1;
1910                 }
1911                 i = 0;
1912                 bool have_added = false;
1913                 while (true) {
1914                     j = pos.find('\t', i);
1915                     int item = string_to_int(pos.substr(i, j - i));
1916                     if (item >= 0 && size_t(item) < items.size()) {
1917                         if (have_added) value += '\t';
1918                         value += items[item];
1919                         have_added = true;
1920                     }
1921                     if (j == string::npos) break;
1922                     i = j + 1;
1923                 }
1924                 break;
1925             }
1926             case CMD_snippet: {
1927                 Xapian::Snipper snipper;
1928                 snipper.set_mset(mset);
1929                 snipper.set_stemmer(Xapian::Stem(option["stemmer"]));
1930                 size_t len = (args.size() == 1) ? 200 : string_to_int(args[1]);
1931                 value = snipper.generate_snippet(args[0], len);
1932                 break;
1933             }
1934             case CMD_split: {
1935                 string split;
1936                 if (args.size() == 1) {
1937                     split = " ";
1938                     value = args[0];
1939                 } else {
1940                     split = args[0];
1941                     value = args[1];
1942                 }
1943                 string::size_type i = 0;
1944                 while (true) {
1945                     if (split.empty()) {
1946                         ++i;
1947                         if (i >= value.size()) break;
1948                     } else {
1949                         i = value.find(split, i);
1950                         if (i == string::npos) break;
1951                     }
1952                     value.replace(i, split.size(), 1, '\t');
1953                     ++i;
1954                 }
1955                 break;
1956             }
1957             case CMD_stoplist: {
1958                 Xapian::TermIterator i = qp.stoplist_begin();
1959                 Xapian::TermIterator end = qp.stoplist_end();
1960                 while (i != end) {
1961                     if (!value.empty()) value += '\t';
1962                     value += *i;
1963                     ++i;
1964                 }
1965                 break;
1966             }
1967             case CMD_sub:
1968                 value = str(string_to_int(args[0]) - string_to_int(args[1]));
1969                 break;
1970             case CMD_substr: {
1971                 int start = string_to_int(args[1]);
1972                 if (start < 0) {
1973                     if (static_cast<size_t>(-start) >= args[0].size()) {
1974                         start = 0;
1975                     } else {
1976                         start = static_cast<int>(args[0].size()) + start;
1977                     }
1978                 } else {
1979                     if (static_cast<size_t>(start) >= args[0].size()) break;
1980                 }
1981                 size_t len = string::npos;
1982                 if (args.size() > 2) {
1983                     int int_len = string_to_int(args[2]);
1984                     if (int_len >= 0) {
1985                         len = size_t(int_len);
1986                     } else {
1987                         len = args[0].size() - start;
1988                         if (static_cast<size_t>(-int_len) >= len) {
1989                             len = 0;
1990                         } else {
1991                             len -= static_cast<size_t>(-int_len);
1992                         }
1993                     }
1994                 }
1995                 value = args[0].substr(start, len);
1996                 break;
1997             }
1998             case CMD_suggestion:
1999                 value = qp.get_corrected_query_string();
2000                 break;
2001             case CMD_terms:
2002                 if (enquire) {
2003                     // list of matching terms
2004                     Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2005                     while (term != enquire->get_matching_terms_end(q0)) {
2006                         // check term was in the typed query so we ignore
2007                         // boolean filter terms
2008                         if (termset.find(*term) != termset.end()) {
2009                             value += *term;
2010                             value += '\t';
2011                         }
2012                         ++term;
2013                     }
2014
2015                     if (!value.empty()) value.erase(value.size() - 1);
2016                 }
2017                 break;
2018             case CMD_thispage:
2019                 value = str(topdoc / hits_per_page + 1);
2020                 break;
2021             case CMD_time:
2022                 if (secs >= 0) {
2023                     char buf[64];
2024                     my_snprintf(buf, sizeof(buf), "%.6f", secs);
2025                     // MSVC's snprintf omits the zero byte if the string if
2026                     // sizeof(buf) long.
2027                     buf[sizeof(buf) - 1] = '\0';
2028                     value = buf;
2029                 }
2030                 break;
2031             case CMD_topdoc:
2032                 // first document on current page of hit list (counting from 0)
2033                 value = str(topdoc);
2034                 break;
2035             case CMD_topterms:
2036                 if (enquire) {
2037                     int howmany = 16;
2038                     if (!args.empty()) howmany = string_to_int(args[0]);
2039                     if (howmany < 0) howmany = 0;
2040
2041                     // List of expand terms
2042                     Xapian::ESet eset;
2043                     OmegaExpandDecider decider(db, &termset);
2044
2045                     if (!rset.empty()) {
2046                         set_expansion_scheme(*enquire, option);
2047 #if XAPIAN_AT_LEAST(1,3,2)
2048                         eset = enquire->get_eset(howmany * 2, rset, &decider);
2049 #else
2050                         eset = enquire->get_eset(howmany * 2, rset, 0,
2051                                                  expand_param_k, &decider);
2052 #endif
2053                     } else if (mset.size()) {
2054                         // invent an rset
2055                         Xapian::RSet tmp;
2056
2057                         int c = 5;
2058                         // FIXME: what if mset does not start at first match?
2059                         Xapian::MSetIterator m = mset.begin();
2060                         for ( ; m != mset.end(); ++m) {
2061                             tmp.add_document(*m);
2062                             if (--c == 0) break;
2063                         }
2064
2065                         set_expansion_scheme(*enquire, option);
2066 #if XAPIAN_AT_LEAST(1,3,2)
2067                         eset = enquire->get_eset(howmany * 2, tmp, &decider);
2068 #else
2069                         eset = enquire->get_eset(howmany * 2, tmp, 0,
2070                                                  expand_param_k, &decider);
2071 #endif
2072                     }
2073
2074                     // Don't show more than one word with the same stem.
2075                     set<string> stems;
2076                     Xapian::ESetIterator i;
2077                     for (i = eset.begin(); i != eset.end(); ++i) {
2078                         string term(*i);
2079                         string stem = (*stemmer)(term);
2080                         if (stems.find(stem) != stems.end()) continue;
2081                         stems.insert(stem);
2082                         value += term;
2083                         value += '\t';
2084                         if (--howmany == 0) break;
2085                     }
2086                     if (!value.empty()) value.erase(value.size() - 1);
2087                 }
2088                 break;
2089             case CMD_transform:
2090                 omegascript_transform(value, args);
2091                 break;
2092             case CMD_truncate:
2093                 value = generate_sample(args[0],
2094                                         string_to_int(args[1]),
2095                                         args.size() > 2 ? args[2] : string(),
2096                                         args.size() > 3 ? args[3] : string());
2097                 break;
2098             case CMD_uniq: {
2099                 const string &list = args[0];
2100                 if (list.empty()) break;
2101                 string::size_type split = 0, split2;
2102                 string prev;
2103                 do {
2104                     split2 = list.find('\t', split);
2105                     string item = list.substr(split, split2 - split);
2106                     if (split == 0) {
2107                         value = item;
2108                     } else if (item != prev) {
2109                         value += '\t';
2110                         value += item;
2111                     }
2112                     prev = item;
2113                     split = split2 + 1;
2114                 } while (split2 != string::npos);
2115                 break;
2116             }
2117             case CMD_unpack:
2118                 value = str(binary_string_to_int(args[0]));
2119                 break;
2120             case CMD_unstem: {
2121                 const string &term = args[0];
2122                 Xapian::TermIterator i = qp.unstem_begin(term);
2123                 Xapian::TermIterator end = qp.unstem_end(term);
2124                 while (i != end) {
2125                     if (!value.empty()) value += '\t';
2126                     value += *i;
2127                     ++i;
2128                 }
2129                 break;
2130             }
2131             case CMD_upper:
2132                 value = Xapian::Unicode::toupper(args[0]);
2133                 break;
2134             case CMD_url:
2135                 url_encode(value, args[0]);
2136                 break;
2137             case CMD_value: {
2138                 Xapian::docid id = q0;
2139                 Xapian::valueno value_no = string_to_int(args[0]);
2140                 if (args.size() > 1) id = string_to_int(args[1]);
2141                 value = db.get_document(id).get_value(value_no);
2142                 break;
2143             }
2144             case CMD_version:
2145                 value = PACKAGE_STRING;
2146                 break;
2147             case CMD_weight:
2148                 value = double_to_string(weight);
2149                 break;
2150             default: {
2151                 args.insert(args.begin(), param[0]);
2152                 int macro_no = func->second->tag - CMD_MACRO;
2153                 assert(macro_no >= 0 && (unsigned int)macro_no < macros.size());
2154                 // throw "Unknown function '" + var + "'";
2155                 value = eval(macros[macro_no], args);
2156                 break;
2157             }
2158         }
2159         res += value;
2160     } catch (const Xapian::Error & e) {
2161         // FIXME: this means we only see the most recent error in $error
2162         // - is that the best approach?
2163         error_msg = e.get_msg();
2164     }
2165
2166     res += fmt.substr(p);
2167     return res;
2168 }
2169
2170 static string
2171 eval_file(const string &fmtfile)
2172 {
2173     string err;
2174     if (vet_filename(fmtfile)) {
2175         string file = template_dir + fmtfile;
2176         string fmt;
2177         if (load_file(file, fmt)) {
2178             vector<string> noargs;
2179             noargs.resize(1);
2180             return eval(fmt, noargs);
2181         }
2182         err = strerror(errno);
2183     } else {
2184         err = "name contains '..'";
2185     }
2186
2187     // FIXME: report why!
2188     string msg = string("Couldn't read format template '") + fmtfile + '\'';
2189     if (!err.empty()) msg += " (" + err + ')';
2190     throw msg;
2191 }
2192
2193 extern string
2194 pretty_term(string term)
2195 {
2196     // Just leave empty strings and single characters alone.
2197     if (term.length() <= 1) return term;
2198
2199     // Assume unprefixed terms are unstemmed.
2200     if (!C_isupper(term[0])) return term;
2201
2202     // Handle stemmed terms.
2203     bool stemmed = (term[0] == 'Z');
2204     if (stemmed) {
2205         // First of all, check if a term in the query stemmed to this one.
2206         Xapian::TermIterator u = qp.unstem_begin(term);
2207         // There might be multiple words with the same stem, but we only want
2208         // one so just take the first.
2209         if (u != qp.unstem_end(term)) return *u;
2210
2211         // Remove the 'Z'.
2212         term.erase(0, 1);
2213     }
2214
2215     bool add_quotes = false;
2216
2217     // Check if the term has a prefix.
2218     if (C_isupper(term[0])) {
2219         // See if we have this prefix in the termprefix_to_userprefix map.  If
2220         // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2221         string prefix;
2222         size_t prefix_len = prefix_from_term(prefix, term);
2223
2224         map<string, string>::const_iterator i;
2225         i = termprefix_to_userprefix.find(prefix);
2226         if (i != termprefix_to_userprefix.end()) {
2227             string user_prefix = i->second;
2228             user_prefix += ':';
2229             term.replace(0, prefix_len, user_prefix);
2230         } else {
2231             // We don't have a prefix mapping for this, so just set a flag to
2232             // add quotes around the term.
2233             add_quotes = true;
2234         }
2235     }
2236
2237     if (stemmed) term += '.';
2238
2239     if (add_quotes) {
2240         term.insert(0, "\"");
2241         term.append("\"");
2242     }
2243
2244     return term;
2245 }
2246
2247 static string
2248 print_caption(const string &fmt, const vector<string> &param)
2249 {
2250     q0 = *(mset[hit_no]);
2251
2252     weight = mset[hit_no].get_weight();
2253     percent = mset.convert_to_percent(mset[hit_no]);
2254     collapsed = mset[hit_no].get_collapse_count();
2255
2256     return eval(fmt, param);
2257 }
2258
2259 void
2260 parse_omegascript()
2261 {
2262     try {
2263         const char * p = getenv("SERVER_PROTOCOL");
2264         if (p && strcmp(p, "INCLUDED") == 0) {
2265             // We're being included in another page, so suppress headers.
2266             suppress_http_headers = true;
2267         }
2268
2269         std::string output = eval_file(fmtname);
2270         if (!set_content_type && !suppress_http_headers) {
2271             cout << "Content-Type: text/html" << std::endl;
2272             set_content_type = true;
2273         }
2274         cout << std::endl;
2275         cout << output;
2276     } catch (...) {
2277         // Ensure the headers have been output so that any exception gets
2278         // reported rather than giving a server error.
2279         if (!set_content_type && !suppress_http_headers) {
2280             cout << "Content-Type: text/html" << std::endl;
2281             set_content_type = true;
2282         }
2283         cout << std::endl;
2284         throw;
2285     }
2286 }
2287
2288 static void
2289 ensure_query_parsed()
2290 {
2291     if (query_parsed) return;
2292     query_parsed = true;
2293
2294     MCI val;
2295     pair<MCI, MCI> g;
2296
2297     // Should we discard the existing R-set recorded in R CGI parameters?
2298     bool discard_rset = false;
2299
2300     // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2301     // CGI parameters)?
2302     bool force_first_page = false;
2303
2304     string v;
2305     // get list of terms from previous iteration of query
2306     val = cgi_params.find("xP");
2307     if (val != cgi_params.end()) {
2308         v = val->second;
2309         // If xP given, default to discarding any RSet and forcing the first
2310         // page of results.  If the query is the same, or an extension of
2311         // the previous query, we adjust these again below.
2312         discard_rset = true;
2313         force_first_page = true;
2314     }
2315     querytype result = set_probabilistic(v);
2316     switch (result) {
2317         case BAD_QUERY:
2318             break;
2319         case NEW_QUERY:
2320             break;
2321         case SAME_QUERY:
2322         case EXTENDED_QUERY:
2323             // If we've changed database, force the first page of hits
2324             // and discard the R-set (since the docids will have changed)
2325             val = cgi_params.find("xDB");
2326             if (val != cgi_params.end() && val->second != dbname) break;
2327             if (result == SAME_QUERY && force_first_page) {
2328                 val = cgi_params.find("xFILTERS");
2329                 if (val != cgi_params.end() && val->second != filters) {
2330                     // Filters have changed since last query.
2331                 } else {
2332                     force_first_page = false;
2333                 }
2334             }
2335             discard_rset = false;
2336             break;
2337     }
2338
2339     if (!force_first_page) {
2340         // Work out which mset element is the first hit we want
2341         // to display
2342         val = cgi_params.find("TOPDOC");
2343         if (val != cgi_params.end()) {
2344             topdoc = atol(val->second.c_str());
2345         }
2346
2347         // Handle next, previous, and page links
2348         if (cgi_params.find(">") != cgi_params.end()) {
2349             topdoc += hits_per_page;
2350         } else if (cgi_params.find("<") != cgi_params.end()) {
2351             if (topdoc >= hits_per_page)
2352                 topdoc -= hits_per_page;
2353             else
2354                 topdoc = 0;
2355         } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2356                    (val = cgi_params.find("#")) != cgi_params.end()) {
2357             long page = atol(val->second.c_str());
2358             // Do something sensible for page 0 (we count pages from 1).
2359             if (page == 0) page = 1;
2360             topdoc = (page - 1) * hits_per_page;
2361         }
2362
2363         // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2364         // Normally we snap TOPDOC like this so that things work nicely if
2365         // HITSPERPAGE is in a <select> or on radio buttons.  If we're
2366         // postprocessing the output of omega and want variable sized pages,
2367         // this is unhelpful.
2368         bool raw_search = false;
2369         val = cgi_params.find("RAWSEARCH");
2370         if (val != cgi_params.end()) {
2371             raw_search = bool(atol(val->second.c_str()));
2372         }
2373
2374         if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2375     }
2376
2377     if (!discard_rset) {
2378         // put documents marked as relevant into the rset
2379         g = cgi_params.equal_range("R");
2380         for (MCI i = g.first; i != g.second; i++) {
2381             const string & value = i->second;
2382             for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2383                 while (value[j] == '.') ++j;
2384                 Xapian::docid d = atoi(value.c_str() + j);
2385                 if (d) {
2386                     rset.add_document(d);
2387                     ticked[d] = true;
2388                 }
2389             }
2390         }
2391     }
2392 }
2393
2394 // run query if we haven't already
2395 static void
2396 ensure_match()
2397 {
2398     if (done_query) return;
2399
2400     secs = RealTime::now();
2401     run_query();
2402     if (secs != -1)
2403         secs = RealTime::now() - secs;
2404
2405     done_query = true;
2406     last = mset.get_matches_lower_bound();
2407     if (last == 0) {
2408         // Otherwise topdoc ends up being -6 if it's non-zero!
2409         topdoc = 0;
2410     } else {
2411         if (topdoc >= last)
2412             topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2413         // last is the count of documents up to the end of the current page
2414         // (as returned by $last)
2415         if (topdoc + hits_per_page < last)
2416             last = topdoc + hits_per_page;
2417     }
2418 }
2419
2420 // OmegaExpandDecider methods.
2421
2422 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2423                                        set<string> * querytermset)
2424     : db(db_)
2425 {
2426     // We'll want the stemmer for testing matches anyway.
2427     if (!stemmer)
2428         stemmer = new Xapian::Stem(option["stemmer"]);
2429     if (querytermset) {
2430         set<string>::const_iterator i;
2431         for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2432             string term(*i);
2433             if (term.empty()) continue;
2434
2435             unsigned char ch = term[0];
2436             bool stemmed = (ch == 'Z');
2437             if (stemmed) {
2438                term.erase(0, 1);
2439                if (term.empty()) continue;
2440                ch = term[0];
2441             }
2442
2443             if (C_isupper(ch)) {
2444                 string prefix;
2445                 size_t prefix_len = prefix_from_term(prefix, term);
2446                 term.erase(0, prefix_len);
2447             }
2448
2449             if (!stemmed) term = (*stemmer)(term);
2450
2451             exclude_stems.insert(term);
2452         }
2453     }
2454 }
2455
2456 bool
2457 OmegaExpandDecider::operator()(const string & term) const
2458 {
2459     unsigned char ch = term[0];
2460
2461     // Reject terms with a prefix.
2462     if (C_isupper(ch)) return false;
2463
2464     {
2465         MyStopper stopper;
2466         // Don't suggest stopwords.
2467         if (stopper(term)) return false;
2468     }
2469
2470     // Reject small numbers.
2471     if (term.size() < 4 && C_isdigit(ch)) return false;
2472
2473     // Reject terms containing a space.
2474     if (term.find(' ') != string::npos) return false;
2475
2476     // Skip terms with stems in the exclude_stems set, to avoid suggesting
2477     // terms which are already in the query in some form.
2478     string stem = (*stemmer)(term);
2479     if (exclude_stems.find(stem) != exclude_stems.end())
2480         return false;
2481
2482     // Ignore terms that only occur once (hapaxes) since they aren't
2483     // useful for finding related documents - they only occur in a
2484     // document that's already been marked as relevant.
2485     // FIXME: add an expand option to ignore terms where
2486     // termfreq == rtermfreq.
2487     if (db.get_termfreq(term) <= 1) return false;
2488
2489     return true;
2490 }