Explicitly #include <unordered_set>
[xapian.git] / xapian-applications / omega / query.cc
blobceb30804b4fcd38b46edb48a571db1bab2796207
1 /* query.cc: query executor for omega
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002 Intercede 1749 Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018 Olly Betts
8 * Copyright 2008 Thomas Viehmann
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 * USA
26 #include <config.h>
28 #include <algorithm>
29 #include <iostream>
30 #include <map>
31 #include <set>
32 #include <unordered_map>
33 #include <unordered_set>
34 #include <vector>
36 #include <cassert>
37 #include <cctype>
38 #include "safeerrno.h"
39 #include <stdio.h>
40 #include <cstdlib>
41 #include <cstring>
42 #include "strcasecmp.h"
43 #include <ctime>
45 #include "safeunistd.h"
46 #include <sys/types.h>
47 #include "safesysstat.h"
48 #include "safefcntl.h"
50 #include "realtime.h"
52 #include <cdb.h>
54 #include "csvescape.h"
55 #include "date.h"
56 #include "datevalue.h"
57 #include "jsonescape.h"
58 #include "utils.h"
59 #include "omega.h"
60 #include "query.h"
61 #include "cgiparam.h"
62 #include "loadfile.h"
63 #include "sample.h"
64 #include "str.h"
65 #include "stringutils.h"
66 #include "transform.h"
67 #include "urldecode.h"
68 #include "urlencode.h"
69 #include "unixperm.h"
70 #include "values.h"
71 #include "weight.h"
72 #include "expand.h"
73 #include "md5wrap.h"
75 #include <xapian.h>
77 using namespace std;
79 using Xapian::Utf8Iterator;
81 using Xapian::Unicode::is_wordchar;
83 #ifndef SNPRINTF
84 #include <cstdarg>
/** Fallback used when no SNPRINTF macro is available.
 *
 *  Formats into @a str, which must have room for @a size bytes including the
 *  terminating nul.
 *
 *  @param str     Destination buffer.
 *  @param size    Size of @a str in bytes.
 *  @param format  printf-style format string, followed by its arguments.
 *
 *  @return  The number of characters written, not counting the nul.
 *
 *  Truncation is treated as a fatal error: if the formatted output doesn't
 *  fit (or formatting fails) the process is aborted, so callers never see a
 *  truncated result.
 */
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    // A zero-sized buffer can't even hold the terminating nul.
    if (size == 0)
        abort();

    va_list ap;
    va_start(ap, format);
    // Use the bounded formatter so an over-long result can never write past
    // the end of str (vsprintf() would only let us notice after the damage
    // was done).
    int res = vsnprintf(str, size, format, ap);
    va_end(ap);

    if (res < 0 || size_t(res) >= size)
        abort(); /* Overflowed! */
    return res;
}
98 #else
99 #define my_snprintf SNPRINTF
100 #endif
// File-level state shared by the query handling code below.
// NOTE(review): query_parsed, done_query, last and ticked aren't used in the
// part of the file this was documented from - presumably they're maintained
// by ensure_query_parsed()/ensure_match() and the OmegaScript commands.
static bool query_parsed = false;
static bool done_query = false;
static Xapian::docid last = 0;

// Result of the match; assigned by run_query().
static Xapian::MSet mset;

static map<Xapian::docid, bool> ticked;

static void ensure_query_parsed();
static void ensure_match();

// The query: built from the query strings by parse_queries(), then combined
// with the filters by run_query().
static Xapian::Query query;
//static string url_query_string;
Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode

// Query parser, configured by parse_queries().
static Xapian::QueryParser qp;
// Range processor for "size:" ranges; created on first use in parse_queries().
static Xapian::NumberRangeProcessor * size_rp = NULL;
// Stemmer used for highlighting; created on first use in html_highlight().
static Xapian::Stem *stemmer = NULL;

static string eval_file(const string &fmtfile);

// Terms from the parsed query (filled in by parse_queries()).
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// Tab-separated list of the terms in termset, in the order parse_queries()
// first saw them.
static string queryterms;

// Set to the parser's message when parse_queries() hits a QueryParserError;
// run_query() doesn't run the match if this is non-empty.
static string error_msg;

static double secs = -1;

// OmegaScript for a log line.
// NOTE(review): presumably the default used by $log when no entry format is
// given - confirm against the $log implementation.
static const char DEFAULT_LOG_ENTRY[] =
    "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
    "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
    "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
    "$dbname\t"
    "$query\t"
    "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
142 class MyStopper : public Xapian::Stopper {
143 public:
144 bool operator()(const string &t) const {
145 switch (t[0]) {
146 case 'a':
147 return (t == "a" || t == "about" || t == "an" || t == "and" ||
148 t == "are" || t == "as" || t == "at");
149 case 'b':
150 return (t == "be" || t == "by");
151 case 'e':
152 return (t == "en");
153 case 'f':
154 return (t == "for" || t == "from");
155 case 'h':
156 return (t == "how");
157 case 'i':
158 return (t == "i" || t == "in" || t == "is" || t == "it");
159 case 'o':
160 return (t == "of" || t == "on" || t == "or");
161 case 't':
162 return (t == "that" || t == "the" || t == "this" || t == "to");
163 case 'w':
164 return (t == "was" || t == "what" || t == "when" ||
165 t == "where" || t == "which" || t == "who" ||
166 t == "why" || t == "will" || t == "with");
167 case 'y':
168 return (t == "you" || t == "your");
169 default:
170 return false;
175 static size_t
176 prefix_from_term(string* prefix, const string& term)
178 if (!term.empty()) {
179 if (term[0] == 'X') {
180 const string::const_iterator begin = term.begin();
181 string::const_iterator i = begin + 1;
182 while (i != term.end() && C_isupper(*i))
183 ++i;
184 if (prefix)
185 prefix->assign(begin, i);
186 if (i != term.end() && *i == ':')
187 ++i;
188 return i - begin;
191 if (C_isupper(term[0])) {
192 if (prefix)
193 *prefix = term[0];
194 return 1;
198 if (prefix)
199 prefix->resize(0);
200 return 0;
// Reject any name containing ".." so that format names, log file names, etc
// can't be used to reach files such as "../../etc/passwd".
//
// Returns true if the name is acceptable.
//
// FIXME: make this check more exact ("foo..bar" is safe)
// FIXME: log when this check fails
static bool
vet_filename(const string &filename)
{
    return filename.find("..") == string::npos;
}
// How a newly parsed query relates to the previous one.
//
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
enum querytype {
    NEW_QUERY,      // entirely new query
    SAME_QUERY,     // unchanged query
    EXTENDED_QUERY, // new query, but based on the old one
    BAD_QUERY       // parse error (message in error_msg)
};
226 static multimap<string, string> query_strings;
228 void
229 add_query_string(const string& prefix, const string& s)
231 string query_string = s;
232 // Strip leading and trailing whitespace from query_string.
233 trim(query_string);
234 if (!query_string.empty())
235 query_strings.insert(make_pair(prefix, query_string));
238 static unsigned
239 read_qp_flags(const string & opt_pfx, unsigned f)
241 map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
242 for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
243 unsigned mask = 0;
244 const char * s = i->first.c_str() + opt_pfx.size();
245 switch (s[0]) {
246 case 'a':
247 if (strcmp(s, "auto_multiword_synonyms") == 0) {
248 mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
249 break;
251 if (strcmp(s, "auto_synonyms") == 0) {
252 mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
253 break;
255 break;
256 case 'b':
257 if (strcmp(s, "boolean") == 0) {
258 mask = Xapian::QueryParser::FLAG_BOOLEAN;
259 break;
261 if (strcmp(s, "boolean_any_case") == 0) {
262 mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
263 break;
265 break;
266 case 'c':
267 if (strcmp(s, "cjk_ngram") == 0) {
268 mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
269 break;
271 break;
272 case 'd':
273 if (strcmp(s, "default") == 0) {
274 mask = Xapian::QueryParser::FLAG_DEFAULT;
275 break;
277 break;
278 case 'l':
279 if (strcmp(s, "lovehate") == 0) {
280 mask = Xapian::QueryParser::FLAG_LOVEHATE;
281 break;
283 break;
284 case 'p':
285 if (strcmp(s, "partial") == 0) {
286 mask = Xapian::QueryParser::FLAG_PARTIAL;
287 break;
289 if (strcmp(s, "phrase") == 0) {
290 mask = Xapian::QueryParser::FLAG_PHRASE;
291 break;
293 if (strcmp(s, "pure_not") == 0) {
294 mask = Xapian::QueryParser::FLAG_PURE_NOT;
295 break;
297 break;
298 case 's':
299 if (strcmp(s, "spelling_correction") == 0) {
300 mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
301 break;
303 if (strcmp(s, "synonym") == 0) {
304 mask = Xapian::QueryParser::FLAG_SYNONYM;
305 break;
307 break;
308 case 'w':
309 if (strcmp(s, "wildcard") == 0) {
310 mask = Xapian::QueryParser::FLAG_WILDCARD;
311 break;
313 break;
316 if (i->second.empty()) {
317 f &= ~mask;
318 } else {
319 f |= mask;
322 return f;
325 static querytype
326 parse_queries(const string& oldp)
328 // Parse the query string.
329 qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
330 qp.set_stopper(new MyStopper());
331 qp.set_default_op(default_op);
332 qp.set_database(db);
333 // FIXME: provide a custom RP which handles size:10..20K, etc.
334 if (!size_rp)
335 size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
336 qp.add_rangeprocessor(size_rp);
337 map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
338 for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
339 string user_prefix(pfx->first, 7);
340 const string & term_pfx_list = pfx->second;
341 string::size_type i = 0;
342 do {
343 string::size_type i0 = i;
344 i = term_pfx_list.find('\t', i);
345 const string & term_pfx = term_pfx_list.substr(i0, i - i0);
346 qp.add_prefix(user_prefix, term_pfx);
347 // std::map::insert() won't overwrite an existing entry, so we'll
348 // prefer the first user_prefix for which a particular term prefix
349 // is specified.
350 termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
351 } while (++i);
353 pfx = option.lower_bound("boolprefix,");
354 for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
355 string user_prefix(pfx->first, 11, string::npos);
356 auto it = option.find("nonexclusiveprefix," + pfx->second);
357 bool exclusive = (it == option.end() || it->second.empty());
358 qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
359 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
362 try {
363 unsigned default_flags = read_qp_flags("flag_", 0);
365 vector<Xapian::Query> queries;
366 queries.reserve(query_strings.size());
368 for (auto& j : query_strings) {
369 const string& prefix = j.first;
370 const string& query_string = j.second;
372 // Choose the stemmer to use for this input.
373 string stemlang = option[prefix + ":stemmer"];
374 if (stemlang.empty())
375 stemlang = option["stemmer"];
376 qp.set_stemmer(Xapian::Stem(stemlang));
378 // Work out the flags to use for this input.
379 unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
381 Xapian::Query q = qp.parse_query(query_string, f, prefix);
382 if (!q.empty())
383 queries.push_back(q);
385 query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
386 } catch (Xapian::QueryParserError &e) {
387 error_msg = e.get_msg();
388 return BAD_QUERY;
391 Xapian::termcount n_new_terms = 0;
392 for (Xapian::TermIterator i = query.get_terms_begin();
393 i != query.get_terms_end(); ++i) {
394 if (termset.find(*i) == termset.end()) {
395 termset.insert(*i);
396 if (!queryterms.empty()) queryterms += '\t';
397 queryterms += *i;
399 n_new_terms++;
402 // Check new query against the previous one
403 if (oldp.empty()) {
404 // If oldp was empty that means there were no parsed query terms
405 // before, so if there are now this is a new query.
406 return n_new_terms ? NEW_QUERY : SAME_QUERY;
409 // The terms in oldp are separated by tabs.
410 const char oldp_separator = '\t';
411 size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
413 // short-cut: if the new query has fewer terms, it must be a new one
414 if (n_new_terms < n_old_terms) return NEW_QUERY;
416 const char *term = oldp.c_str();
417 const char *pend;
418 while ((pend = strchr(term, oldp_separator)) != NULL) {
419 if (termset.find(string(term, pend - term)) == termset.end())
420 return NEW_QUERY;
421 term = pend + 1;
423 if (*term) {
424 if (termset.find(string(term)) == termset.end())
425 return NEW_QUERY;
428 // Use termset.size() rather than n_new_terms so we correctly handle
429 // the case when the query has repeated terms.
430 // This works wrongly in the case when the user extends the query
431 // by adding a term already in it, but that's unlikely and the behaviour
432 // isn't too bad (we just don't reset page 1). We also mishandle a few
433 // other obscure cases e.g. adding quotes to turn a query into a phrase.
434 if (termset.size() > n_old_terms) return EXTENDED_QUERY;
435 return SAME_QUERY;
438 static multimap<string, string> filter_map;
439 static set<string> neg_filters;
441 typedef multimap<string, string>::const_iterator FMCI;
443 void add_bterm(const string &term) {
444 string prefix;
445 if (prefix_from_term(&prefix, term) > 0)
446 filter_map.insert(multimap<string, string>::value_type(prefix, term));
449 void add_nterm(const string &term) {
450 if (!term.empty())
451 neg_filters.insert(term);
454 static void
455 run_query()
457 string scheme;
458 bool force_boolean = false;
459 if (!filter_map.empty()) {
460 // OR together filters with the same prefix (or AND for non-exclusive
461 // prefixes), then AND together the resultant groups.
462 vector<Xapian::Query> filter_vec;
463 vector<string> same_vec;
464 string current;
465 for (FMCI i = filter_map.begin(); ; ++i) {
466 bool over = (i == filter_map.end());
467 if (over || i->first != current) {
468 switch (same_vec.size()) {
469 case 0:
470 break;
471 case 1:
472 filter_vec.push_back(Xapian::Query(same_vec[0]));
473 break;
474 default: {
475 Xapian::Query::op op = Xapian::Query::OP_OR;
476 auto it = option.find("nonexclusiveprefix," + current);
477 if (it != option.end() && !it->second.empty()) {
478 op = Xapian::Query::OP_AND;
480 filter_vec.push_back(Xapian::Query(op,
481 same_vec.begin(),
482 same_vec.end()));
483 break;
486 same_vec.clear();
487 if (over) break;
488 current = i->first;
490 same_vec.push_back(i->second);
493 Xapian::Query filter(Xapian::Query::OP_AND,
494 filter_vec.begin(), filter_vec.end());
496 if (query.empty()) {
497 // If no query strings were provided then promote the filters
498 // to be THE query - filtering an empty query will give no
499 // matches.
500 std::swap(query, filter);
501 auto&& it = option.find("weightingpurefilter");
502 if (it != option.end() && !it->second.empty()) {
503 scheme = it->second;
504 } else {
505 force_boolean = true;
507 } else {
508 query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
512 if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
513 Xapian::Query date_filter;
514 if (date_value_slot != Xapian::BAD_VALUENO) {
515 // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
516 // latter the sort order just works correctly between different
517 // precisions).
518 bool as_time_t =
519 db.get_value_lower_bound(date_value_slot).size() == 4 &&
520 db.get_value_upper_bound(date_value_slot).size() == 4;
521 date_filter = date_value_range(as_time_t, date_value_slot,
522 date_start, date_end,
523 date_span);
524 } else {
525 date_filter = date_range_filter(date_start, date_end, date_span);
526 date_filter = Xapian::Query(Xapian::Query::OP_OR,
527 date_filter,
528 Xapian::Query("Dlatest"));
531 // If no query strings were provided then promote the daterange
532 // filter to be THE query instead of filtering an empty query.
533 if (query.empty()) {
534 query = date_filter;
535 force_boolean = true;
536 } else {
537 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
541 if (!neg_filters.empty()) {
542 // OR together all negated filters.
543 Xapian::Query filter(Xapian::Query::OP_OR,
544 neg_filters.begin(), neg_filters.end());
546 if (query.empty()) {
547 // If we only have a negative filter for the query, use MatchAll as
548 // the query to apply the filters to.
549 query = Xapian::Query::MatchAll;
550 force_boolean = true;
552 query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
555 if (!enquire || !error_msg.empty()) return;
557 if (!force_boolean && scheme.empty()) {
558 auto&& it = option.find("weighting");
559 if (it != option.end()) scheme = it->second;
561 set_weighting_scheme(*enquire, scheme, force_boolean);
563 enquire->set_cutoff(threshold);
565 if (sort_keymaker) {
566 if (sort_after) {
567 enquire->set_sort_by_relevance_then_key(sort_keymaker,
568 reverse_sort);
569 } else {
570 enquire->set_sort_by_key_then_relevance(sort_keymaker,
571 reverse_sort);
573 } else if (sort_key != Xapian::BAD_VALUENO) {
574 if (sort_after) {
575 enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
576 } else {
577 enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
581 enquire->set_docid_order(docid_order);
583 if (collapse) {
584 enquire->set_collapse_key(collapse_key);
587 if (!query.empty()) {
588 #if 0
589 // FIXME: If we start doing permissions checks based on $REMOTE_USER
590 // we're going to break some existing setups if users upgrade. We
591 // probably want a way to set this from OmegaScript.
592 const char * remote_user = getenv("REMOTE_USER");
593 if (remote_user)
594 apply_unix_permissions(query, remote_user);
595 #endif
597 enquire->set_query(query);
598 // We could use the value of topdoc as first parameter, but we
599 // need to know the first few items in the mset to fake a
600 // relevance set for topterms.
602 // If min_hits isn't set, check at least one extra result so we
603 // know if we've reached the end of the matches or not - then we
604 // can avoid offering a "next" button which leads to an empty page.
605 mset = enquire->get_mset(0, topdoc + hits_per_page,
606 topdoc + max(hits_per_page + 1, min_hits),
607 &rset);
/** Escape a string for inclusion in HTML.
 *
 *  The characters <, >, & and " are replaced by the corresponding entity;
 *  everything else is copied unchanged.
 */
string
html_escape(const string &str)
{
    string res;
    for (char ch : str) {
        if (ch == '<') {
            res += "&lt;";
        } else if (ch == '>') {
            res += "&gt;";
        } else if (ch == '&') {
            res += "&amp;";
        } else if (ch == '"') {
            res += "&quot;";
        } else {
            res += ch;
        }
    }
    return res;
}
/** Remove HTML tags from a string.
 *
 *  Everything from a '<' up to and including the next '>' is dropped (or up
 *  to the end of the string if there's no '>').  A '>' which isn't closing a
 *  tag is dropped too.
 */
static string
html_strip(const string &str)
{
    string res;
    bool in_tag = false;
    for (char ch : str) {
        if (ch == '<') {
            in_tag = true;
        } else if (ch == '>') {
            in_tag = false;
        } else if (!in_tag) {
            res += ch;
        }
    }
    return res;
}
/** Lookup from a word to its position in a tab-separated list of words.
 *
 *  The map is held in static members, so it is shared by all WordList
 *  objects and is only rebuilt when a different list is passed in.
 */
class WordList {
    /// The list the map was last built from.
    static string prev_list;
    /// Maps each distinct word to its index in order of first appearance.
    static unordered_map<string, int> word_to_occurrence;
  public:
    /// Build the map from tab-separated @a list (no-op if it's unchanged).
    void build_word_map(const string& list) {
        // Don't build map again if passed list of terms is same as before.
        if (list == prev_list) return;
        word_to_occurrence.clear();
        int next_index = 0;
        string::size_type start = 0;
        while (true) {
            const string::size_type tab = list.find('\t', start);
            // For the last word tab is npos and substr() clamps the count,
            // giving us the rest of the list.
            // Repeated words keep the index from their first appearance.
            if (word_to_occurrence.emplace(list.substr(start, tab - start),
                                           next_index).second)
                ++next_index;
            if (tab == string::npos) break;
            start = tab + 1;
        }
        prev_list = list;
    }

    /// Return the index of @a word, or -1 if it isn't in the list.
    int word_in_list(const string& word) {
        auto found = word_to_occurrence.find(word);
        return found != word_to_occurrence.end() ? found->second : -1;
    }
};

string WordList::prev_list;
unordered_map<string, int> WordList::word_to_occurrence;
693 // Not a character in an identifier
694 inline static bool
695 p_notid(unsigned int c)
697 return !C_isalnum(c) && c != '_';
700 // Not a character in an HTML tag name
701 inline static bool
702 p_nottag(unsigned int c)
704 return !C_isalnum(c) && c != '.' && c != '-';
707 // FIXME: shares algorithm with indextext.cc!
708 static string
709 html_highlight(const string &s, const string &list,
710 const string &bra, const string &ket)
712 if (!stemmer) {
713 stemmer = new Xapian::Stem(option["stemmer"]);
716 string res;
718 Utf8Iterator j(s);
719 const Utf8Iterator s_end;
720 while (true) {
721 Utf8Iterator first = j;
722 while (first != s_end && !is_wordchar(*first)) ++first;
723 if (first == s_end) break;
724 Utf8Iterator term_end;
725 string term;
726 string word;
727 const char *l = j.raw();
728 if (*first < 128 && C_isupper(*first)) {
729 j = first;
730 Xapian::Unicode::append_utf8(term, *j);
731 while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
732 Xapian::Unicode::append_utf8(term, *j);
734 if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
735 term.resize(0);
737 term_end = j;
739 if (term.empty()) {
740 j = first;
741 while (is_wordchar(*j)) {
742 Xapian::Unicode::append_utf8(term, *j);
743 ++j;
744 if (j == s_end) break;
745 if (*j == '&' || *j == '\'') {
746 Utf8Iterator next = j;
747 ++next;
748 if (next == s_end || !is_wordchar(*next)) break;
749 term += *j;
750 j = next;
753 term_end = j;
754 if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
755 string::size_type len = term.length();
756 if (*j == '#') {
757 term += '#';
758 do { ++j; } while (j != s_end && *j == '#');
759 } else {
760 while (j != s_end && (*j == '+' || *j == '-')) {
761 Xapian::Unicode::append_utf8(term, *j);
762 ++j;
765 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
766 term.resize(len);
767 } else {
768 term_end = j;
772 j = term_end;
773 term = Xapian::Unicode::tolower(term);
774 WordList w;
775 w.build_word_map(list);
776 int match = w.word_in_list(term);
777 if (match == -1) {
778 string stem = "Z";
779 stem += (*stemmer)(term);
780 match = w.word_in_list(stem);
782 if (match >= 0) {
783 res += html_escape(string(l, first.raw() - l));
784 if (!bra.empty()) {
785 res += bra;
786 } else {
787 static const char * colours[] = {
788 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
789 "990000", "009900", "996600", "006699", "990099"
791 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
792 const char * bg = colours[idx];
793 if (strchr(bg, 'f')) {
794 res += "<b style=\"color:black;background-color:#";
795 } else {
796 res += "<b style=\"color:white;background-color:#";
798 res += bg;
799 res += "\">";
801 word.assign(first.raw(), j.raw() - first.raw());
802 res += html_escape(word);
803 if (!bra.empty()) {
804 res += ket;
805 } else {
806 res += "</b>";
808 } else {
809 res += html_escape(string(l, j.raw() - l));
812 if (j != s_end) res += html_escape(string(j.raw(), j.left()));
813 return res;
#if 0
// Disabled code: output url_query_string, leaving out any "B=" parameters
// whose value starts with the character following "&B=" in after.
static void
print_query_string(const char *after)
{
    if (!after || strncmp(after, "&B=", 3) != 0) {
        cout << url_query_string;
        return;
    }

    const char prefix = after[3];
    string::size_type start = 0;
    string::size_type amp = 0;
    while (true) {
        amp = url_query_string.find('&', amp);
        if (amp == string::npos) {
            cout << url_query_string.substr(start);
            return;
        }
        ++amp;
        while (url_query_string[amp] == 'B' &&
               url_query_string[amp + 1] == '=' &&
               url_query_string[amp + 2] == prefix) {
            cout << url_query_string.substr(start, amp - start - 1);
            start = url_query_string.find('&', amp + 3);
            if (start == string::npos) return;
            amp = start + 1;
        }
    }
}
#endif
844 class Fields {
845 mutable Xapian::docid did_cached;
846 mutable map<string, string> fields;
848 void read_fields(Xapian::docid did) const;
850 public:
851 Fields() : did_cached(0) { }
853 const string & get_field(Xapian::docid did, const string & field) const {
854 if (did != did_cached) read_fields(did);
855 return fields[field];
859 void
860 Fields::read_fields(Xapian::docid did) const
862 fields.clear();
863 did_cached = did;
864 const string & data = db.get_document(did).get_data();
866 // Parse document data.
867 string::size_type i = 0;
868 const string & names = option["fieldnames"];
869 if (!names.empty()) {
870 // Each line is a field, with fieldnames taken from corresponding
871 // entries in the tab-separated list specified by $opt{fieldnames}.
872 string::size_type n = 0;
873 do {
874 string::size_type n0 = n;
875 n = names.find('\t', n);
876 string::size_type i0 = i;
877 i = data.find('\n', i);
878 fields.insert(make_pair(names.substr(n0, n - n0),
879 data.substr(i0, i - i0)));
880 } while (++n && ++i);
881 } else {
882 // Each line is a field, in the format NAME=VALUE. We assume the field
883 // name doesn't contain an "=". Lines without an "=" are currently
884 // just ignored.
885 do {
886 string::size_type i0 = i;
887 i = data.find('\n', i);
888 string line(data, i0, i - i0);
889 string::size_type j = line.find('=');
890 if (j != string::npos) {
891 string & value = fields[line.substr(0, j)];
892 if (!value.empty()) value += '\t';
893 value.append(line, j + 1, string::npos);
895 } while (++i);
// Cached access to the fields of the document being processed.
static Fields fields;
// NOTE(review): the variables below aren't assigned in the part of the file
// this was documented from - presumably they describe the hit currently
// being formatted (its docid, position in the results, percentage score,
// weight and collapse count); confirm against the code which sets them.
static Xapian::docid q0;
static Xapian::doccount hit_no;
static int percent;
static double weight;
static Xapian::doccount collapsed;

static string print_caption(const string &fmt, const vector<string> &param);
// Tags identifying the OmegaScript commands.  The T() macro used to build
// func_tab forms the tag for command F as CMD_##F, so each command in that
// table needs an enumerator here.
enum tagval {
    CMD_,
    // a
    CMD_add, CMD_addfilter, CMD_allterms, CMD_and,
    // c
    CMD_cgi, CMD_cgilist, CMD_cgiparams, CMD_chr, CMD_collapsed, CMD_cond,
    CMD_contains, CMD_csv,
    // d
    CMD_date, CMD_dbname, CMD_dbsize, CMD_def, CMD_defaultop, CMD_div,
    // e
    CMD_eq, CMD_emptydocs, CMD_env, CMD_error,
    // f
    CMD_field, CMD_filesize, CMD_filters, CMD_filterterms, CMD_find, CMD_fmt,
    CMD_freq,
    // g
    CMD_ge, CMD_gt,
    // h
    CMD_hash, CMD_highlight, CMD_hit, CMD_hitlist, CMD_hitsperpage,
    CMD_hostname, CMD_html, CMD_htmlstrip, CMD_httpheader,
    // i
    CMD_id, CMD_if, CMD_include,
    // j
    CMD_json, CMD_jsonarray,
    // l
    CMD_last, CMD_lastpage, CMD_le, CMD_length, CMD_list, CMD_log, CMD_lookup,
    CMD_lower, CMD_lt,
    // m
    CMD_map, CMD_match, CMD_max, CMD_min, CMD_mod, CMD_msize, CMD_msizeexact,
    CMD_msizelower, CMD_msizeupper, CMD_mul, CMD_muldiv,
    // n
    CMD_ne, CMD_nice, CMD_not, CMD_now,
    // o
    CMD_opt, CMD_or, CMD_ord,
    // p
    CMD_pack, CMD_percentage, CMD_prettyterm, CMD_prettyurl,
    // q
    CMD_query, CMD_querydescription, CMD_queryterms,
    // r
    CMD_range, CMD_record, CMD_relevant, CMD_relevants,
    // s
    CMD_score, CMD_set, CMD_seterror, CMD_setmap, CMD_setrelevant, CMD_slice,
    CMD_snippet, CMD_sort, CMD_split, CMD_stoplist, CMD_sub, CMD_subdb,
    CMD_subid, CMD_substr, CMD_suggestion, CMD_switch,
    // t
    CMD_termprefix, CMD_terms, CMD_thispage, CMD_time, CMD_topdoc,
    CMD_topterms, CMD_transform, CMD_truncate,
    // u
    CMD_uniq, CMD_unique, CMD_unpack, CMD_unprefix, CMD_unstem, CMD_upper,
    CMD_url,
    // v
    CMD_value, CMD_version,
    // w
    CMD_weight,
    CMD_MACRO // special tag for macro evaluation
};
/// Attributes of an OmegaScript command (see the table in func_tab).
struct func_attrib {
    /// Which command this is (a tagval value).
    int tag;
    /// Argument count limits; the table uses N (-1) where there's no maximum.
    int minargs;
    int maxargs;
    // NOTE(review): presumably how many leading arguments get evaluated
    // before the command is run, with N (-1) meaning all - confirm against
    // the evaluation code.
    int evalargs;
    // 0, 'Q' or 'M'.
    // NOTE(review): presumably 'Q' means the query must have been parsed and
    // 'M' that the match must have been run before the command - confirm.
    char ensure;
};

// Build the func_tab entry for command F: its name as a string, then its
// attributes with tag CMD_F.
#define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}

/// A command name together with its attributes.
struct func_desc {
    const char *name;
    func_attrib a;
};
1042 #define N -1
1043 #define M 'M'
1044 #define Q 'Q'
1045 // NB when adding a new command which ensures M or Q, update the list in
1046 // docs/omegascript.rst
1047 static struct func_desc func_tab[] = {
1048 //name minargs maxargs evalargs ensure
1049 {"",{CMD_, N, N, 0, 0}},// commented out code
1050 T(add, 0, N, N, 0), // add a list of numbers
1051 T(addfilter, 1, 1, N, 0), // add filter term
1052 T(allterms, 0, 1, N, 0), // list of all terms matching document
1053 T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
1054 T(cgi, 1, 1, N, 0), // return cgi parameter value
1055 T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
1056 T(cgiparams, 0, 0, N, 0), // return list of cgi parameter names
1057 T(chr, 1, 1, N, 0), // return UTF-8 for given Unicode codepoint
1058 T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
1059 T(cond, 2, N, 0, 0), // return position of substring, or empty string
1060 T(contains, 2, 2, N, 0), // return position of substring, or empty string
1061 T(csv, 1, 2, N, 0), // CSV string escaping
1062 T(date, 1, 2, N, 0), // convert time_t to strftime format
1063 // (default: YYYY-MM-DD)
1064 T(dbname, 0, 0, N, 0), // database name
1065 T(dbsize, 0, 0, N, 0), // database size (# of documents)
1066 T(def, 2, 2, 1, 0), // define a macro
1067 T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
1068 T(div, 2, 2, N, 0), // integer divide
1069 T(emptydocs, 0, 1, N, 0), // list of empty documents
1070 T(env, 1, 1, N, 0), // environment variable
1071 T(error, 0, 0, N, 0), // error message
1072 T(eq, 2, 2, N, 0), // test equality
1073 T(field, 1, 2, N, 0), // lookup field in record
1074 T(filesize, 1, 1, N, 0), // pretty printed filesize
1075 T(filters, 0, 0, N, 0), // serialisation of current filters
1076 T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
1077 T(find, 2, 2, N, 0), // find entry in list
1078 T(fmt, 0, 0, N, 0), // name of current format
1079 T(freq, 1, 1, N, 0), // frequency of a term
1080 T(ge, 2, 2, N, 0), // test >=
1081 T(gt, 2, 2, N, 0), // test >
1082 T(hash, 2, 2, N, 0), // hash a string using the specified hash function
1083 T(highlight, 2, 4, N, 0), // html escape and highlight words from list
1084 T(hit, 0, 0, N, 0), // hit number of current mset entry (0-based)
1085 T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
1086 T(hitsperpage, 0, 0, N, 0), // hits per page
1087 T(hostname, 1, 1, N, 0), // extract hostname from URL
1088 T(html, 1, 1, N, 0), // html escape string (<>&")
1089 T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
1090 T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
1091 T(id, 0, 0, N, 0), // docid of current doc
1092 T(if, 2, 3, 1, 0), // conditional
1093 T(include, 1, 1, 1, 0), // include another file
1094 T(json, 1, 1, N, 0), // JSON string escaping
1095 T(jsonarray, 1, 1, N, 0), // Format list as a JSON array of strings
1096 T(last, 0, 0, N, M), // hit number one beyond end of current page
1097 T(lastpage, 0, 0, N, M), // number of last hit page
1098 T(le, 2, 2, N, 0), // test <=
1099 T(length, 1, 1, N, 0), // length of list
1100 T(list, 2, 5, N, 0), // pretty print list
1101 T(log, 1, 2, 1, 0), // create a log entry
1102 T(lookup, 2, 2, N, 0), // lookup in named cdb file
1103 T(lower, 1, 1, N, 0), // convert string to lower case
1104 T(lt, 2, 2, N, 0), // test <
1105 T(map, 2, 2, 1, 0), // map a list into another list
1106 T(match, 2, 3, N, 0), // regex match
1107 T(max, 1, N, N, 0), // maximum of a list of values
1108 T(min, 1, N, N, 0), // minimum of a list of values
1109 T(mod, 2, 2, N, 0), // integer modulus
1110 T(msize, 0, 0, N, M), // number of matches (estimated)
1111 T(msizeexact, 0, 0, N, M), // is $msize exact?
1112 T(msizelower, 0, 0, N, M), // number of matches (lower bound)
1113 T(msizeupper, 0, 0, N, M), // number of matches (upper bound)
1114 T(mul, 2, N, N, 0), // multiply a list of numbers
1115 T(muldiv, 3, 3, N, 0), // calculate A*B/C
1116 T(ne, 2, 2, N, 0), // test not equal
1117 T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
1118 T(not, 1, 1, N, 0), // logical not
1119 T(now, 0, 0, N, 0), // current date/time as a time_t
1120 T(opt, 1, 2, N, 0), // lookup an option value
1121 T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
1122 T(ord, 1, 1, N, 0), // return codepoint for first character of UTF-8 string
1123 T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1124 T(percentage, 0, 0, N, 0), // percentage score of current hit
1125 T(prettyterm, 1, 1, N, Q), // pretty print term name
1126 T(prettyurl, 1, 1, N, 0), // pretty version of URL
1127 T(query, 0, 1, N, Q), // query
1128 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
1129 T(queryterms, 0, 0, N, Q), // list of query terms
1130 T(range, 2, 2, N, 0), // return list of values between start and end
1131 T(record, 0, 1, N, 0), // record contents of document
1132 T(relevant, 0, 1, N, Q), // is document relevant?
1133 T(relevants, 0, 0, N, Q), // return list of relevant documents
1134 T(score, 0, 0, N, 0), // score (0-10) of current hit
1135 T(set, 2, 2, N, 0), // set option value
1136 T(seterror, 1, 1, N, 0), // set error_msg, setting it early stops query execution
1137 T(setmap, 1, N, N, 0), // set map of option values
1138 T(setrelevant, 0, 1, N, Q), // set rset
1139 T(slice, 2, 2, N, 0), // slice a list using a second list
1140 T(snippet, 1, 2, N, M), // generate snippet from text
1141 T(sort, 1, 2, N, M), // alpha sort a list
1142 T(split, 1, 2, N, 0), // split a string to give a list
1143 T(stoplist, 0, 0, N, Q), // return list of stopped terms
1144 T(sub, 2, 2, N, 0), // subtract
1145 T(subdb, 0, 1, N, 0), // name of subdb docid is in
1146 T(subid, 0, 1, N, 0), // docid in the subdb#
1147 T(substr, 2, 3, N, 0), // substring
1148 T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
1149 T(switch, 3, N, 1, 0), // return position of substring, or empty string
1150 T(termprefix, 1, 1, N, 0), // get any prefix from a term
1151 T(terms, 0, 1, N, M), // list of matching terms
1152 T(thispage, 0, 0, N, M), // page number of current page
1153 T(time, 0, 0, N, M), // how long the match took (in seconds)
1154 T(topdoc, 0, 0, N, M), // first document on current page of hit list
1155 // (counting from 0)
1156 T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
1157 // (default 16)
1158 T(transform, 3, 4, N, 0), // transform with a regexp
1159 T(truncate, 2, 4, N, 0), // truncate after a word
1160 T(uniq, 1, 1, N, 0), // removed duplicates from a sorted list
1161 T(unique, 1, 1, N, 0), // removed duplicates from any list
1162 T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
1163 T(unprefix, 1, 1, N, 0), // remove any prefix from a term
1164 T(unstem, 1, 1, N, Q), // return list of terms from the parsed query
1165 // which stemmed to this term
1166 T(upper, 1, 1, N, 0), // convert string to upper case
1167 T(url, 1, 1, N, 0), // url encode argument
1168 T(value, 1, 2, N, 0), // return document value
1169 T(version, 0, 0, N, 0), // omega version string
1170 T(weight, 0, 0, N, 0), // weight of the current hit
1171 { NULL,{0, 0, 0, 0, 0}}
1174 #undef T // Leaving T defined screws up Sun's C++ compiler!
// Storage for macros created by $def: each $def appends its second argument
// (presumably the macro's body text - confirm against the CMD_MACRO handling)
// here, and the macro's tag is CMD_MACRO plus its index in this vector.
1176 static vector<string> macros;
1178 // Call write() repeatedly until all data is written or we get a
1179 // non-recoverable error.
1180 static ssize_t
1181 write_all(int fd, const char * buf, size_t count)
1183 while (count) {
1184 ssize_t r = write(fd, buf, count);
1185 if (rare(r < 0)) {
1186 if (errno == EINTR) continue;
1187 return r;
1189 buf += r;
1190 count -= r;
1192 return 0;
1195 static const vector<string>&
1196 get_subdbs()
1198 static vector<string> subdbs;
1199 if (subdbs.empty()) {
1200 size_t p = 0, q;
1201 while (true) {
1202 q = dbname.find('/', p);
1203 subdbs.emplace_back(dbname, p, q - p);
1204 if (q == string::npos) break;
1205 p = q + 1;
1208 return subdbs;
1211 static string
1212 eval(const string &fmt, const vector<string> &param)
1214 static map<string, const struct func_attrib *> func_map;
1215 if (func_map.empty()) {
1216 struct func_desc *p;
1217 for (p = func_tab; p->name != NULL; ++p) {
1218 func_map[string(p->name)] = &(p->a);
1221 string res;
1222 string::size_type p = 0, q;
1223 while ((q = fmt.find('$', p)) != string::npos) try {
1224 res.append(fmt, p, q - p);
1225 string::size_type code_start = q; // note down for error reporting
1226 q++;
1227 if (q >= fmt.size()) break;
1228 unsigned char ch = fmt[q];
1229 switch (ch) {
1230 // Magic sequences:
1231 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1232 case '$':
1233 res += '$';
1234 p = q + 1;
1235 continue;
1236 case '(':
1237 res += '{';
1238 p = q + 1;
1239 continue;
1240 case ')':
1241 res += '}';
1242 p = q + 1;
1243 continue;
1244 case '.':
1245 res += ',';
1246 p = q + 1;
1247 continue;
1248 case '_':
1249 ch = '0';
1250 // FALL THRU
1251 case '1': case '2': case '3': case '4': case '5':
1252 case '6': case '7': case '8': case '9':
1253 ch -= '0';
1254 if (ch < param.size()) res += param[ch];
1255 p = q + 1;
1256 continue;
1257 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1258 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1259 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1260 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1261 case 'y': case 'z':
1262 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1263 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1264 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1265 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1266 case 'Y': case 'Z':
1267 case '{':
1268 break;
1269 default:
1270 string msg = "Unknown $ code in: $";
1271 msg.append(fmt, q, string::npos);
1272 throw msg;
1274 p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1275 string var(fmt, q, p - q);
1276 map<string, const struct func_attrib *>::const_iterator func;
1277 func = func_map.find(var);
1278 if (func == func_map.end()) {
1279 throw "Unknown function '" + var + "'";
1281 vector<string> args;
1282 if (fmt[p] == '{') {
1283 q = p + 1;
1284 int nest = 1;
1285 while (true) {
1286 p = fmt.find_first_of(",{}", p + 1);
1287 if (p == string::npos)
1288 throw "missing } in " + fmt.substr(code_start);
1289 if (fmt[p] == '{') {
1290 ++nest;
1291 } else {
1292 if (nest == 1) {
1293 // should we split the args
1294 if (func->second->minargs != N) {
1295 args.push_back(fmt.substr(q, p - q));
1296 q = p + 1;
1299 if (fmt[p] == '}' && --nest == 0) break;
1302 if (func->second->minargs == N)
1303 args.push_back(fmt.substr(q, p - q));
1304 ++p;
1307 if (func->second->minargs != N) {
1308 if (int(args.size()) < func->second->minargs)
1309 throw "too few arguments to $" + var;
1310 if (func->second->maxargs != N &&
1311 int(args.size()) > func->second->maxargs)
1312 throw "too many arguments to $" + var;
1314 vector<string>::size_type n;
1315 if (func->second->evalargs != N)
1316 n = func->second->evalargs;
1317 else
1318 n = args.size();
1320 for (vector<string>::size_type j = 0; j < n; ++j)
1321 args[j] = eval(args[j], param);
1323 if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1324 ensure_query_parsed();
1325 if (func->second->ensure == 'M') ensure_match();
1326 string value;
1327 switch (func->second->tag) {
1328 case CMD_:
1329 break;
1330 case CMD_add: {
1331 int total = 0;
1332 for (auto&& arg : args)
1333 total += string_to_int(arg);
1334 value = str(total);
1335 break;
1337 case CMD_addfilter:
1338 add_bterm(args[0]);
1339 break;
1340 case CMD_allterms: {
1341 // list of all terms indexing document
1342 Xapian::docid id = q0;
1343 if (!args.empty()) id = string_to_int(args[0]);
1344 for (Xapian::TermIterator term = db.termlist_begin(id);
1345 term != db.termlist_end(id); ++term) {
1346 value += *term;
1347 value += '\t';
1350 if (!value.empty()) value.erase(value.size() - 1);
1351 break;
1353 case CMD_and: {
1354 value = "true";
1355 for (auto&& arg : args) {
1356 if (eval(arg, param).empty()) {
1357 value.resize(0);
1358 break;
1361 break;
1363 case CMD_cgi: {
1364 MCI i = cgi_params.find(args[0]);
1365 if (i != cgi_params.end()) value = i->second;
1366 break;
1368 case CMD_cgilist: {
1369 pair<MCI, MCI> g;
1370 g = cgi_params.equal_range(args[0]);
1371 for (MCI i = g.first; i != g.second; ++i) {
1372 value += i->second;
1373 value += '\t';
1375 if (!value.empty()) value.erase(value.size() - 1);
1376 break;
1378 case CMD_cgiparams: {
1379 const string* prev = NULL;
1380 for (auto&& i : cgi_params) {
1381 if (prev && i.first == *prev) continue;
1382 value += i.first;
1383 value += '\t';
1384 prev = &i.first;
1386 if (!value.empty()) value.erase(value.size() - 1);
1387 break;
1389 case CMD_chr:
1390 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1391 break;
1392 case CMD_collapsed: {
1393 value = str(collapsed);
1394 break;
1396 case CMD_cond:
1397 for (size_t i = 0; i < args.size(); i += 2) {
1398 if (i == args.size() - 1) {
1399 // Handle optional "else" value.
1400 value = eval(args[i], param);
1401 break;
1403 if (!eval(args[i], param).empty()) {
1404 value = eval(args[i + 1], param);
1405 break;
1408 break;
1409 case CMD_contains: {
1410 size_t pos = args[1].find(args[0]);
1411 if (pos != string::npos) {
1412 value = str(pos);
1414 break;
1416 case CMD_csv:
1417 value = args[0];
1418 if (args.size() > 1 && !args[1].empty()) {
1419 csv_escape_always(value);
1420 } else {
1421 csv_escape(value);
1423 break;
1424 case CMD_date:
1425 value = args[0];
1426 if (!value.empty()) {
1427 char buf[64] = "";
1428 time_t date = string_to_int(value);
1429 if (date != static_cast<time_t>(-1)) {
1430 struct tm *then;
1431 then = gmtime(&date);
1432 string date_fmt = "%Y-%m-%d";
1433 if (args.size() > 1) date_fmt = eval(args[1], param);
1434 strftime(buf, sizeof buf, date_fmt.c_str(), then);
1436 value = buf;
1438 break;
1439 case CMD_dbname:
1440 value = dbname;
1441 break;
1442 case CMD_dbsize: {
1443 static Xapian::doccount dbsize;
1444 if (!dbsize) dbsize = db.get_doccount();
1445 value = str(dbsize);
1446 break;
1448 case CMD_def: {
1449 func_attrib *fa = new func_attrib;
1450 fa->tag = CMD_MACRO + macros.size();
1451 fa->minargs = 0;
1452 fa->maxargs = 9;
1453 fa->evalargs = N; // FIXME: or 0?
1454 fa->ensure = 0;
1456 macros.push_back(args[1]);
1457 func_map[args[0]] = fa;
1458 break;
1460 case CMD_defaultop:
1461 if (default_op == Xapian::Query::OP_AND) {
1462 value = "and";
1463 } else {
1464 value = "or";
1466 break;
1467 case CMD_div: {
1468 int denom = string_to_int(args[1]);
1469 if (denom == 0) {
1470 value = "divide by 0";
1471 } else {
1472 value = str(string_to_int(args[0]) /
1473 string_to_int(args[1]));
1475 break;
1477 case CMD_eq:
1478 if (args[0] == args[1]) value = "true";
1479 break;
1480 case CMD_emptydocs: {
1481 string t;
1482 if (!args.empty())
1483 t = args[0];
1484 Xapian::PostingIterator i;
1485 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1486 if (i.get_doclength() != 0) continue;
1487 if (!value.empty()) value += '\t';
1488 value += str(*i);
1490 break;
1492 case CMD_env: {
1493 char *env = getenv(args[0].c_str());
1494 if (env != NULL) value = env;
1495 break;
1497 case CMD_error:
1498 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1499 error_msg = "Database '" + dbname + "' couldn't be opened";
1501 value = error_msg;
1502 break;
1503 case CMD_field: {
1504 Xapian::docid did = q0;
1505 if (args.size() > 1) did = string_to_int(args[1]);
1506 value = fields.get_field(did, args[0]);
1507 break;
1509 case CMD_filesize: {
1510 // FIXME: rounding? i18n?
1511 int size = string_to_int(args[0]);
1512 int intpart = size;
1513 int fraction = -1;
1514 const char * format = 0;
1515 if (size < 0) {
1516 // Negative size -> empty result.
1517 } else if (size == 1) {
1518 format = "%d byte";
1519 } else if (size < 1024) {
1520 format = "%d bytes";
1521 } else {
1522 if (size < 1024 * 1024) {
1523 format = "%d.%cK";
1524 } else {
1525 size /= 1024;
1526 if (size < 1024 * 1024) {
1527 format = "%d.%cM";
1528 } else {
1529 size /= 1024;
1530 format = "%d.%cG";
1533 intpart = unsigned(size) / 1024;
1534 fraction = unsigned(size) % 1024;
1536 if (format) {
1537 char buf[200];
1538 int len;
1539 if (fraction == -1) {
1540 len = my_snprintf(buf, sizeof(buf), format, intpart);
1541 } else {
1542 fraction = (fraction * 10 / 1024) + '0';
1543 len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1545 if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1546 value.assign(buf, len);
1548 break;
1550 case CMD_filters:
1551 value = filters;
1552 break;
1553 case CMD_filterterms: {
1554 Xapian::TermIterator term = db.allterms_begin();
1555 term.skip_to(args[0]);
1556 while (term != db.allterms_end()) {
1557 string t = *term;
1558 if (!startswith(t, args[0])) break;
1559 value += t;
1560 value += '\t';
1561 ++term;
1564 if (!value.empty()) value.erase(value.size() - 1);
1565 break;
1567 case CMD_find: {
1568 string l = args[0], s = args[1];
1569 string::size_type i = 0, j = 0;
1570 size_t count = 0;
1571 while (j != l.size()) {
1572 j = l.find('\t', i);
1573 if (j == string::npos) j = l.size();
1574 if (j - i == s.length()) {
1575 if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1576 value = str(count);
1577 break;
1580 ++count;
1581 i = j + 1;
1583 break;
1585 case CMD_fmt:
1586 value = fmtname;
1587 break;
1588 case CMD_freq: {
1589 const string& term = args[0];
1590 Xapian::doccount termfreq = 0;
1591 if (done_query) {
1592 termfreq = mset.get_termfreq(term);
1594 if (termfreq == 0) {
1595 // We want $freq to work before the match is run, and we
1596 // don't want using it to force the match to run.
1597 termfreq = db.get_termfreq(term);
1599 value = str(termfreq);
1600 break;
1602 case CMD_ge:
1603 if (string_to_int(args[0]) >= string_to_int(args[1]))
1604 value = "true";
1605 break;
1606 case CMD_gt:
1607 if (string_to_int(args[0]) > string_to_int(args[1]))
1608 value = "true";
1609 break;
1610 case CMD_hash: {
1611 const string& data = args[0];
1612 const string& hash = args[1];
1613 if (hash == "md5") {
1614 string md5;
1615 md5_string(data, md5);
1616 value.reserve(md5.size() * 2);
1617 for (unsigned char byte : md5) {
1618 value += "0123456789abcdef"[byte >> 4];
1619 value += "0123456789abcdef"[byte & 0x0f];
1621 } else {
1622 throw "Unknown hash function: " + hash;
1624 break;
1626 case CMD_highlight: {
1627 string bra, ket;
1628 if (args.size() > 2) {
1629 bra = args[2];
1630 if (args.size() > 3) {
1631 ket = args[3];
1632 } else {
1633 string::const_iterator i;
1634 i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1635 ket = "</";
1636 ket.append(bra, 1, i - bra.begin() - 1);
1637 ket += '>';
1641 value = html_highlight(args[0], args[1], bra, ket);
1642 break;
1644 case CMD_hit:
1645 // 0-based mset index
1646 value = str(hit_no);
1647 break;
1648 case CMD_hitlist:
1649 #if 0
1650 url_query_string = "?DB=";
1651 url_query_string += dbname;
1652 for (auto& j : query_strings) {
1653 if (j.first.empty()) {
1654 url_query_string += "&P=";
1655 } else {
1656 url_query_string += "&P."
1657 url_query_string += j.first;
1658 url_query_string += '=';
1660 const char *q = j.second.c_str();
1661 int ch;
1662 while ((ch = *q++) != '\0') {
1663 switch (ch) {
1664 case '+':
1665 url_query_string += "%2b";
1666 break;
1667 case '"':
1668 url_query_string += "%22";
1669 break;
1670 case '%':
1671 url_query_string += "%25";
1672 break;
1673 case '&':
1674 url_query_string += "%26";
1675 break;
1676 case ' ':
1677 ch = '+';
1678 /* fall through */
1679 default:
1680 url_query_string += ch;
1684 // add any boolean terms
1685 for (FMCI i = filter_map.begin(); i != filter_map.end(); ++i) {
1686 url_query_string += "&B=";
1687 url_query_string += i->second;
1689 #endif
1690 for (hit_no = topdoc; hit_no < last; ++hit_no)
1691 value += print_caption(args[0], param);
1692 hit_no = 0;
1693 break;
1694 case CMD_hitsperpage:
1695 value = str(hits_per_page);
1696 break;
1697 case CMD_hostname: {
1698 value = args[0];
1699 // remove URL scheme and/or path
1700 string::size_type i = value.find("://");
1701 if (i == string::npos) i = 0; else i += 3;
1702 value = value.substr(i, value.find('/', i) - i);
1703 // remove user@ or user:password@
1704 i = value.find('@');
1705 if (i != string::npos) value.erase(0, i + 1);
1706 // remove :port
1707 i = value.find(':');
1708 if (i != string::npos) value.resize(i);
1709 break;
1711 case CMD_html:
1712 value = html_escape(args[0]);
1713 break;
1714 case CMD_htmlstrip:
1715 value = html_strip(args[0]);
1716 break;
1717 case CMD_httpheader:
1718 if (!suppress_http_headers) {
1719 cout << args[0] << ": " << args[1] << endl;
1720 if (!set_content_type && args[0].length() == 12 &&
1721 strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1722 set_content_type = true;
1725 break;
1726 case CMD_id:
1727 // document id
1728 value = str(q0);
1729 break;
1730 case CMD_if:
1731 if (!args[0].empty())
1732 value = eval(args[1], param);
1733 else if (args.size() > 2)
1734 value = eval(args[2], param);
1735 break;
1736 case CMD_include:
1737 value = eval_file(args[0]);
1738 break;
1739 case CMD_json:
1740 value = args[0];
1741 json_escape(value);
1742 break;
1743 case CMD_jsonarray: {
1744 const string & l = args[0];
1745 string::size_type i = 0, j;
1746 if (l.empty()) {
1747 value = "[]";
1748 break;
1750 value = "[\"";
1751 while (true) {
1752 j = l.find('\t', i);
1753 string elt(l, i, j - i);
1754 json_escape(elt);
1755 value += elt;
1756 if (j == string::npos) break;
1757 value += "\",\"";
1758 i = j + 1;
1760 value += "\"]";
1761 break;
1763 case CMD_last:
1764 value = str(last);
1765 break;
1766 case CMD_lastpage: {
1767 int l = mset.get_matches_estimated();
1768 if (l > 0) l = (l - 1) / hits_per_page + 1;
1769 value = str(l);
1770 break;
1772 case CMD_le:
1773 if (string_to_int(args[0]) <= string_to_int(args[1]))
1774 value = "true";
1775 break;
1776 case CMD_length:
1777 if (args[0].empty()) {
1778 value = "0";
1779 } else {
1780 size_t length = count(args[0].begin(), args[0].end(), '\t');
1781 value = str(length + 1);
1783 break;
1784 case CMD_list: {
1785 if (!args[0].empty()) {
1786 string pre, inter, interlast, post;
1787 switch (args.size()) {
1788 case 2:
1789 inter = interlast = args[1];
1790 break;
1791 case 3:
1792 inter = args[1];
1793 interlast = args[2];
1794 break;
1795 case 4:
1796 pre = args[1];
1797 inter = interlast = args[2];
1798 post = args[3];
1799 break;
1800 case 5:
1801 pre = args[1];
1802 inter = args[2];
1803 interlast = args[3];
1804 post = args[4];
1805 break;
1807 value += pre;
1808 string list = args[0];
1809 string::size_type split = 0, split2;
1810 while ((split2 = list.find('\t', split)) != string::npos) {
1811 if (split) value += inter;
1812 value.append(list, split, split2 - split);
1813 split = split2 + 1;
1815 if (split) value += interlast;
1816 value.append(list, split, string::npos);
1817 value += post;
1819 break;
1821 case CMD_log: {
1822 if (!vet_filename(args[0])) break;
1823 string logfile = log_dir + args[0];
1824 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1825 if (fd == -1) break;
1826 vector<string> noargs;
1827 noargs.resize(1);
1828 string line;
1829 if (args.size() > 1) {
1830 line = args[1];
1831 } else {
1832 line = DEFAULT_LOG_ENTRY;
1834 line = eval(line, noargs);
1835 line += '\n';
1836 (void)write_all(fd, line.data(), line.length());
1837 close(fd);
1838 break;
1840 case CMD_lookup: {
1841 if (!vet_filename(args[0])) break;
1842 string cdbfile = cdb_dir + args[0];
1843 int fd = open(cdbfile.c_str(), O_RDONLY);
1844 if (fd == -1) break;
1846 struct cdb cdb;
1847 cdb_init(&cdb, fd);
1849 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1850 size_t datalen = cdb_datalen(&cdb);
1851 const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1852 if (q) {
1853 value.assign(static_cast<const char *>(dat), datalen);
1857 cdb_free(&cdb);
1858 close(fd); // FIXME: cache fds?
1859 break;
1861 case CMD_lower:
1862 value = Xapian::Unicode::tolower(args[0]);
1863 break;
1864 case CMD_lt:
1865 if (string_to_int(args[0]) < string_to_int(args[1]))
1866 value = "true";
1867 break;
1868 case CMD_map:
1869 if (!args[0].empty()) {
1870 string l = args[0], pat = args[1];
1871 vector<string> new_args(param);
1872 string::size_type i = 0, j;
1873 while (true) {
1874 j = l.find('\t', i);
1875 new_args[0] = l.substr(i, j - i);
1876 value += eval(pat, new_args);
1877 if (j == string::npos) break;
1878 value += '\t';
1879 i = j + 1;
1882 break;
1883 case CMD_match:
1884 omegascript_match(value, args);
1885 break;
1886 case CMD_max: {
1887 vector<string>::const_iterator i = args.begin();
1888 int val = string_to_int(*i++);
1889 for (; i != args.end(); ++i) {
1890 int x = string_to_int(*i);
1891 if (x > val) val = x;
1893 value = str(val);
1894 break;
1896 case CMD_min: {
1897 vector<string>::const_iterator i = args.begin();
1898 int val = string_to_int(*i++);
1899 for (; i != args.end(); ++i) {
1900 int x = string_to_int(*i);
1901 if (x < val) val = x;
1903 value = str(val);
1904 break;
1906 case CMD_msize:
1907 // Estimated number of matches.
1908 value = str(mset.get_matches_estimated());
1909 break;
1910 case CMD_msizeexact:
1911 // Is msize exact?
1912 if (mset.get_matches_lower_bound()
1913 == mset.get_matches_upper_bound())
1914 value = "true";
1915 break;
1916 case CMD_msizelower:
1917 // Lower bound on number of matches.
1918 value = str(mset.get_matches_lower_bound());
1919 break;
1920 case CMD_msizeupper:
1921 // Upper bound on number of matches.
1922 value = str(mset.get_matches_upper_bound());
1923 break;
1924 case CMD_mod: {
1925 int denom = string_to_int(args[1]);
1926 if (denom == 0) {
1927 value = "divide by 0";
1928 } else {
1929 value = str(string_to_int(args[0]) %
1930 string_to_int(args[1]));
1932 break;
1934 case CMD_mul: {
1935 vector<string>::const_iterator i = args.begin();
1936 int total = string_to_int(*i++);
1937 while (i != args.end())
1938 total *= string_to_int(*i++);
1939 value = str(total);
1940 break;
1942 case CMD_muldiv: {
1943 int denom = string_to_int(args[2]);
1944 if (denom == 0) {
1945 value = "divide by 0";
1946 } else {
1947 int num = string_to_int(args[0]) * string_to_int(args[1]);
1948 value = str(num / denom);
1950 break;
1952 case CMD_ne:
1953 if (args[0] != args[1]) value = "true";
1954 break;
1955 case CMD_nice: {
1956 string::const_iterator i = args[0].begin();
1957 int len = args[0].length();
1958 while (len) {
1959 value += *i++;
1960 if (--len && len % 3 == 0) value += option["thousand"];
1962 break;
1964 case CMD_not:
1965 if (args[0].empty()) value = "true";
1966 break;
1967 case CMD_now:
1968 value = str(static_cast<unsigned long>(time(NULL)));
1969 break;
1970 case CMD_opt:
1971 if (args.size() == 2) {
1972 value = option[args[0] + "," + args[1]];
1973 } else {
1974 value = option[args[0]];
1976 break;
1977 case CMD_or: {
1978 for (auto&& arg : args) {
1979 value = eval(arg, param);
1980 if (!value.empty()) break;
1982 break;
1984 case CMD_ord: {
1985 if (!args[0].empty()) {
1986 Utf8Iterator it(args[0]);
1987 value = str(*it);
1989 break;
1991 case CMD_pack:
1992 value = int_to_binary_string(string_to_int(args[0]));
1993 break;
1994 case CMD_percentage:
1995 // percentage score
1996 value = str(percent);
1997 break;
1998 case CMD_prettyterm:
1999 value = pretty_term(args[0]);
2000 break;
2001 case CMD_prettyurl:
2002 value = args[0];
2003 url_prettify(value);
2004 break;
2005 case CMD_query: {
2006 auto r = query_strings.equal_range(args.empty() ?
2007 string() : args[0]);
2008 for (auto j = r.first; j != r.second; ++j) {
2009 if (!value.empty()) value += '\t';
2010 const string & s = j->second;
2011 size_t start = 0, tab;
2012 while ((tab = s.find('\t', start)) != string::npos) {
2013 value.append(s, start, tab - start);
2014 value += ' ';
2015 start = tab + 1;
2017 value.append(s, start, string::npos);
2019 break;
2021 case CMD_querydescription:
2022 value = query.get_description();
2023 break;
2024 case CMD_queryterms:
2025 value = queryterms;
2026 break;
2027 case CMD_range: {
2028 int start = string_to_int(args[0]);
2029 int end = string_to_int(args[1]);
2030 while (start <= end) {
2031 value += str(start);
2032 if (start < end) value += '\t';
2033 start++;
2035 break;
2037 case CMD_record: {
2038 Xapian::docid id = q0;
2039 if (!args.empty()) id = string_to_int(args[0]);
2040 value = db.get_document(id).get_data();
2041 break;
2043 case CMD_relevant: {
2044 // document id if relevant; empty otherwise
2045 Xapian::docid id = q0;
2046 if (!args.empty()) id = string_to_int(args[0]);
2047 map<Xapian::docid, bool>::iterator i = ticked.find(id);
2048 if (i != ticked.end()) {
2049 i->second = false; // icky side-effect
2050 value = str(id);
2052 break;
2054 case CMD_relevants: {
2055 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
2056 i != ticked.end(); ++i) {
2057 if (i->second) {
2058 value += str(i->first);
2059 value += '\t';
2062 if (!value.empty()) value.erase(value.size() - 1);
2063 break;
2065 case CMD_score:
2066 // Score (0 to 10)
2067 value = str(percent / 10);
2068 break;
2069 case CMD_set:
2070 option[args[0]] = args[1];
2071 break;
2072 case CMD_seterror:
2073 error_msg = args[0];
2074 break;
2075 case CMD_setmap: {
2076 string base = args[0] + ',';
2077 if (args.size() % 2 != 1)
2078 throw string("$setmap requires an odd number of arguments");
2079 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2080 option[base + args[i]] = args[i + 1];
2082 break;
2084 case CMD_setrelevant: {
2085 string::size_type i = 0, j;
2086 while (true) {
2087 j = args[0].find_first_not_of("0123456789", i);
2088 Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2089 if (id) {
2090 rset.add_document(id);
2091 ticked[id] = true;
2093 if (j == string::npos) break;
2094 i = j + 1;
2096 break;
2098 case CMD_slice: {
2099 string list = args[0], pos = args[1];
2100 vector<string> items;
2101 string::size_type i = 0, j;
2102 while (true) {
2103 j = list.find('\t', i);
2104 items.push_back(list.substr(i, j - i));
2105 if (j == string::npos) break;
2106 i = j + 1;
2108 i = 0;
2109 bool have_added = false;
2110 while (true) {
2111 j = pos.find('\t', i);
2112 int item = string_to_int(pos.substr(i, j - i));
2113 if (item >= 0 && size_t(item) < items.size()) {
2114 if (have_added) value += '\t';
2115 value += items[item];
2116 have_added = true;
2118 if (j == string::npos) break;
2119 i = j + 1;
2121 break;
2123 case CMD_snippet: {
2124 size_t length = 200;
2125 if (args.size() > 1) {
2126 length = string_to_int(args[1]);
2128 if (!stemmer)
2129 stemmer = new Xapian::Stem(option["stemmer"]);
2130 // FIXME: Allow start and end highlight and omit to be specified.
2131 value = mset.snippet(args[0], length, *stemmer,
2132 mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2133 "<strong>", "</strong>", "...");
2134 break;
2136 case CMD_sort: {
2137 const string &list = args[0];
2138 if (list.empty()) break;
2139 bool uniq = false;
2140 bool rev = false;
2141 if (args.size() > 1) {
2142 for (auto opt_ch : args[1]) {
2143 switch (opt_ch) {
2144 case 'r':
2145 rev = true;
2146 break;
2147 case 'u':
2148 uniq = true;
2149 break;
2150 default:
2151 throw string("Unknown $sort option: ") + opt_ch;
2155 vector<string> items;
2156 string::size_type split = 0, split2;
2157 do {
2158 split2 = list.find('\t', split);
2159 items.emplace_back(list, split, split2 - split);
2160 split = split2 + 1;
2161 } while (split2 != string::npos);
2163 if (!rev) {
2164 sort(items.begin(), items.end());
2165 } else {
2166 sort(items.begin(), items.end(),
2167 [](const string& a, const string& b) {
2168 return a > b;
2172 value.reserve(list.size());
2173 bool tab = false;
2174 const string* prev = nullptr;
2175 for (auto&& item : items) {
2176 // Skip duplicates if "u" flag specified.
2177 if (prev && *prev == item) {
2178 continue;
2180 if (uniq) {
2181 prev = &item;
2184 if (tab) {
2185 value += '\t';
2186 } else {
2187 tab = true;
2189 value += item;
2191 break;
2193 case CMD_split: {
2194 string split;
2195 if (args.size() == 1) {
2196 split = " ";
2197 value = args[0];
2198 } else {
2199 split = args[0];
2200 value = args[1];
2202 string::size_type i = 0;
2203 while (true) {
2204 if (split.empty()) {
2205 ++i;
2206 if (i >= value.size()) break;
2207 } else {
2208 i = value.find(split, i);
2209 if (i == string::npos) break;
2211 value.replace(i, split.size(), 1, '\t');
2212 ++i;
2214 break;
2216 case CMD_stoplist: {
2217 Xapian::TermIterator i = qp.stoplist_begin();
2218 Xapian::TermIterator end = qp.stoplist_end();
2219 while (i != end) {
2220 if (!value.empty()) value += '\t';
2221 value += *i;
2222 ++i;
2224 break;
2226 case CMD_sub:
2227 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2228 break;
2229 case CMD_subdb: {
2230 Xapian::docid id = q0;
2231 if (args.size() > 0) id = string_to_int(args[0]);
2232 auto subdbs = get_subdbs();
2233 value = subdbs[(id - 1) % subdbs.size()];
2234 break;
2236 case CMD_subid: {
2237 Xapian::docid id = q0;
2238 if (args.size() > 0) id = string_to_int(args[0]);
2239 value = str(((id - 1) / get_subdbs().size()) + 1);
2240 break;
2242 case CMD_substr: {
2243 int start = string_to_int(args[1]);
2244 if (start < 0) {
2245 if (static_cast<size_t>(-start) >= args[0].size()) {
2246 start = 0;
2247 } else {
2248 start = static_cast<int>(args[0].size()) + start;
2250 } else {
2251 if (static_cast<size_t>(start) >= args[0].size()) break;
2253 size_t len = string::npos;
2254 if (args.size() > 2) {
2255 int int_len = string_to_int(args[2]);
2256 if (int_len >= 0) {
2257 len = size_t(int_len);
2258 } else {
2259 len = args[0].size() - start;
2260 if (static_cast<size_t>(-int_len) >= len) {
2261 len = 0;
2262 } else {
2263 len -= static_cast<size_t>(-int_len);
2267 value.assign(args[0], start, len);
2268 break;
2270 case CMD_suggestion:
2271 value = qp.get_corrected_query_string();
2272 break;
2273 case CMD_switch: {
2274 const string& val = args[0];
2275 for (size_t i = 1; i < args.size(); i += 2) {
2276 if (i == args.size() - 1) {
2277 // Handle optional "else" value.
2278 value = eval(args[i], param);
2279 break;
2281 if (val == eval(args[i], param)) {
2282 value = eval(args[i + 1], param);
2283 break;
2286 break;
2288 case CMD_termprefix:
2289 (void)prefix_from_term(&value, args[0]);
2290 break;
2291 case CMD_terms: {
2292 // list of matching terms
2293 if (!enquire) break;
2294 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2295 if (args.empty()) {
2296 while (term != enquire->get_matching_terms_end(q0)) {
2297 // check term was in the typed query so we ignore
2298 // boolean filter terms
2299 const string & t = *term;
2300 if (termset.find(t) != termset.end()) {
2301 value += t;
2302 value += '\t';
2304 ++term;
2306 } else {
2307 // Return matching terms with specified prefix. We can't
2308 // use skip_to() as the terms aren't ordered by termname.
2309 const string & pfx = args[0];
2310 while (term != enquire->get_matching_terms_end(q0)) {
2311 const string & t = *term;
2312 if (startswith(t, pfx)) {
2313 value += t;
2314 value += '\t';
2316 ++term;
2320 if (!value.empty()) value.erase(value.size() - 1);
2321 break;
2323 case CMD_thispage:
2324 value = str(topdoc / hits_per_page + 1);
2325 break;
2326 case CMD_time:
2327 if (secs >= 0) {
2328 char buf[64];
2329 my_snprintf(buf, sizeof(buf), "%.6f", secs);
2330 // MSVC's snprintf omits the zero byte if the string is
2331 // sizeof(buf) long.
2332 buf[sizeof(buf) - 1] = '\0';
2333 value = buf;
2335 break;
2336 case CMD_topdoc:
2337 // first document on current page of hit list (counting from 0)
2338 value = str(topdoc);
2339 break;
2340 case CMD_topterms:
2341 if (enquire) {
2342 int howmany = 16;
2343 if (!args.empty()) howmany = string_to_int(args[0]);
2344 if (howmany < 0) howmany = 0;
2346 // List of expand terms
2347 Xapian::ESet eset;
2348 OmegaExpandDecider decider(db, &termset);
2350 if (!rset.empty()) {
2351 set_expansion_scheme(*enquire, option);
2352 eset = enquire->get_eset(howmany * 2, rset, &decider);
2353 } else if (mset.size()) {
2354 // invent an rset
2355 Xapian::RSet tmp;
2357 int c = 5;
2358 // FIXME: what if mset does not start at first match?
2359 for (Xapian::docid did : mset) {
2360 tmp.add_document(did);
2361 if (--c == 0) break;
2364 set_expansion_scheme(*enquire, option);
2365 eset = enquire->get_eset(howmany * 2, tmp, &decider);
2368 // Don't show more than one word with the same stem.
2369 set<string> stems;
2370 Xapian::ESetIterator i;
2371 for (i = eset.begin(); i != eset.end(); ++i) {
2372 string term(*i);
2373 string stem = (*stemmer)(term);
2374 if (stems.find(stem) != stems.end()) continue;
2375 stems.insert(stem);
2376 value += term;
2377 value += '\t';
2378 if (--howmany == 0) break;
2380 if (!value.empty()) value.erase(value.size() - 1);
2382 break;
2383 case CMD_transform:
2384 omegascript_transform(value, args);
2385 break;
2386 case CMD_truncate:
2387 value = generate_sample(args[0],
2388 string_to_int(args[1]),
2389 args.size() > 2 ? args[2] : string(),
2390 args.size() > 3 ? args[3] : string());
2391 break;
2392 case CMD_uniq: {
2393 const string &list = args[0];
2394 if (list.empty()) break;
2395 string::size_type split = 0, split2;
2396 string prev;
2397 do {
2398 split2 = list.find('\t', split);
2399 string item(list, split, split2 - split);
2400 if (split == 0) {
2401 value = item;
2402 } else if (item != prev) {
2403 value += '\t';
2404 value += item;
2406 prev = item;
2407 split = split2 + 1;
2408 } while (split2 != string::npos);
2409 break;
2411 case CMD_unique: {
2412 unordered_set<string> seen;
2413 const string &list = args[0];
2414 if (list.empty()) break;
2415 string::size_type split = 0, split2;
2416 do {
2417 split2 = list.find('\t', split);
2418 string item(list, split, split2 - split);
2419 if (seen.insert(item).second) {
2420 if (split != 0)
2421 value += '\t';
2422 value += item;
2424 split = split2 + 1;
2425 } while (split2 != string::npos);
2426 break;
2428 case CMD_unpack:
2429 value = str(binary_string_to_int(args[0]));
2430 break;
2431 case CMD_unprefix: {
2432 size_t prefix_len = prefix_from_term(NULL, args[0]);
2433 value.assign(args[0], prefix_len, string::npos);
2434 break;
2436 case CMD_unstem: {
2437 const string &term = args[0];
2438 Xapian::TermIterator i = qp.unstem_begin(term);
2439 Xapian::TermIterator end = qp.unstem_end(term);
2440 while (i != end) {
2441 if (!value.empty()) value += '\t';
2442 value += *i;
2443 ++i;
2445 break;
2447 case CMD_upper:
2448 value = Xapian::Unicode::toupper(args[0]);
2449 break;
2450 case CMD_url:
2451 url_encode(value, args[0]);
2452 break;
2453 case CMD_value: {
2454 Xapian::docid id = q0;
2455 Xapian::valueno value_no = string_to_int(args[0]);
2456 if (args.size() > 1) id = string_to_int(args[1]);
2457 value = db.get_document(id).get_value(value_no);
2458 break;
2460 case CMD_version:
2461 value = PACKAGE_STRING;
2462 break;
2463 case CMD_weight:
2464 value = double_to_string(weight);
2465 break;
2466 default: {
2467 args.insert(args.begin(), param[0]);
2468 int macro_no = func->second->tag - CMD_MACRO;
2469 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2470 // throw "Unknown function '" + var + "'";
2471 value = eval(macros[macro_no], args);
2472 break;
2475 res += value;
2476 } catch (const Xapian::Error & e) {
2477 // FIXME: this means we only see the most recent error in $error
2478 // - is that the best approach?
2479 error_msg = e.get_msg();
2482 res.append(fmt, p, string::npos);
2483 return res;
2486 static string
2487 eval_file(const string &fmtfile)
2489 string err;
2490 if (vet_filename(fmtfile)) {
2491 string file = template_dir + fmtfile;
2492 string fmt;
2493 if (load_file(file, fmt)) {
2494 vector<string> noargs;
2495 noargs.resize(1);
2496 return eval(fmt, noargs);
2498 err = strerror(errno);
2499 } else {
2500 err = "name contains '..'";
2503 // FIXME: report why!
2504 string msg = string("Couldn't read format template '") + fmtfile + '\'';
2505 if (!err.empty()) msg += " (" + err + ')';
2506 throw msg;
2509 extern string
2510 pretty_term(string term)
2512 // Just leave empty strings and single characters alone.
2513 if (term.length() <= 1) return term;
2515 // Assume unprefixed terms are unstemmed.
2516 if (!C_isupper(term[0])) return term;
2518 // Handle stemmed terms.
2519 bool stemmed = (term[0] == 'Z');
2520 if (stemmed) {
2521 // First of all, check if a term in the query stemmed to this one.
2522 Xapian::TermIterator u = qp.unstem_begin(term);
2523 // There might be multiple words with the same stem, but we only want
2524 // one so just take the first.
2525 if (u != qp.unstem_end(term)) return *u;
2527 // Remove the 'Z'.
2528 term.erase(0, 1);
2531 bool add_quotes = false;
2533 // Check if the term has a prefix.
2534 if (C_isupper(term[0])) {
2535 // See if we have this prefix in the termprefix_to_userprefix map. If
2536 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2537 string prefix;
2538 size_t prefix_len = prefix_from_term(&prefix, term);
2540 map<string, string>::const_iterator i;
2541 i = termprefix_to_userprefix.find(prefix);
2542 if (i != termprefix_to_userprefix.end()) {
2543 string user_prefix = i->second;
2544 user_prefix += ':';
2545 term.replace(0, prefix_len, user_prefix);
2546 } else {
2547 // We don't have a prefix mapping for this, so just set a flag to
2548 // add quotes around the term.
2549 add_quotes = true;
2553 if (stemmed) term += '.';
2555 if (add_quotes) {
2556 term.insert(0, "\"");
2557 term.append("\"");
2560 return term;
2563 static string
2564 print_caption(const string &fmt, const vector<string> &param)
2566 q0 = *(mset[hit_no]);
2568 weight = mset[hit_no].get_weight();
2569 percent = mset.convert_to_percent(mset[hit_no]);
2570 collapsed = mset[hit_no].get_collapse_count();
2572 return eval(fmt, param);
2575 void
2576 parse_omegascript()
2578 try {
2579 const char * p = getenv("SERVER_PROTOCOL");
2580 if (p && strcmp(p, "INCLUDED") == 0) {
2581 // We're being included in another page, so suppress headers.
2582 suppress_http_headers = true;
2585 string output = eval_file(fmtname);
2586 if (!set_content_type && !suppress_http_headers) {
2587 cout << "Content-Type: text/html" << endl;
2588 set_content_type = true;
2590 if (!suppress_http_headers) cout << endl;
2591 cout << output;
2592 } catch (...) {
2593 // Ensure the headers have been output so that any exception gets
2594 // reported rather than giving a server error.
2595 if (!set_content_type && !suppress_http_headers) {
2596 cout << "Content-Type: text/html" << endl;
2597 set_content_type = true;
2599 if (!suppress_http_headers) cout << endl;
2600 throw;
2604 static void
2605 ensure_query_parsed()
2607 if (query_parsed) return;
2608 query_parsed = true;
2610 MCI val;
2611 pair<MCI, MCI> g;
2613 // Should we discard the existing R-set recorded in R CGI parameters?
2614 bool discard_rset = false;
2616 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2617 // CGI parameters)?
2618 bool force_first_page = false;
2620 string v;
2621 // get list of terms from previous iteration of query
2622 val = cgi_params.find("xP");
2623 if (val != cgi_params.end()) {
2624 v = val->second;
2625 // If xP given, default to discarding any RSet and forcing the first
2626 // page of results. If the query is the same, or an extension of
2627 // the previous query, we adjust these again below.
2628 discard_rset = true;
2629 force_first_page = true;
2631 querytype result = parse_queries(v);
2632 switch (result) {
2633 case BAD_QUERY:
2634 break;
2635 case NEW_QUERY:
2636 break;
2637 case SAME_QUERY:
2638 case EXTENDED_QUERY:
2639 // If we've changed database, force the first page of hits
2640 // and discard the R-set (since the docids will have changed)
2641 val = cgi_params.find("xDB");
2642 if (val != cgi_params.end() && val->second != dbname) break;
2643 if (result == SAME_QUERY && force_first_page) {
2644 val = cgi_params.find("xFILTERS");
2645 if (val != cgi_params.end() && val->second != filters &&
2646 val->second != old_filters) {
2647 // Filters have changed since last query.
2648 } else {
2649 force_first_page = false;
2652 discard_rset = false;
2653 break;
2656 if (!force_first_page) {
2657 // Work out which mset element is the first hit we want
2658 // to display
2659 val = cgi_params.find("TOPDOC");
2660 if (val != cgi_params.end()) {
2661 topdoc = atol(val->second.c_str());
2664 // Handle next, previous, and page links
2665 if (cgi_params.find(">") != cgi_params.end()) {
2666 topdoc += hits_per_page;
2667 } else if (cgi_params.find("<") != cgi_params.end()) {
2668 if (topdoc >= hits_per_page)
2669 topdoc -= hits_per_page;
2670 else
2671 topdoc = 0;
2672 } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2673 (val = cgi_params.find("#")) != cgi_params.end()) {
2674 long page = atol(val->second.c_str());
2675 // Do something sensible for page 0 (we count pages from 1).
2676 if (page == 0) page = 1;
2677 topdoc = (page - 1) * hits_per_page;
2680 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2681 // Normally we snap TOPDOC like this so that things work nicely if
2682 // HITSPERPAGE is in a <select> or on radio buttons. If we're
2683 // postprocessing the output of omega and want variable sized pages,
2684 // this is unhelpful.
2685 bool raw_search = false;
2686 val = cgi_params.find("RAWSEARCH");
2687 if (val != cgi_params.end()) {
2688 raw_search = bool(atol(val->second.c_str()));
2691 if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2694 if (!discard_rset) {
2695 // put documents marked as relevant into the rset
2696 g = cgi_params.equal_range("R");
2697 for (MCI i = g.first; i != g.second; ++i) {
2698 const string & value = i->second;
2699 for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2700 while (value[j] == '.') ++j;
2701 Xapian::docid d = atoi(value.c_str() + j);
2702 if (d) {
2703 rset.add_document(d);
2704 ticked[d] = true;
2711 // run query if we haven't already
2712 static void
2713 ensure_match()
2715 if (done_query) return;
2717 secs = RealTime::now();
2718 run_query();
2719 if (secs != -1)
2720 secs = RealTime::now() - secs;
2722 done_query = true;
2723 last = mset.get_matches_lower_bound();
2724 if (last == 0) {
2725 // Otherwise topdoc ends up being -6 if it's non-zero!
2726 topdoc = 0;
2727 } else {
2728 if (topdoc >= last)
2729 topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2730 // last is the count of documents up to the end of the current page
2731 // (as returned by $last)
2732 if (topdoc + hits_per_page < last)
2733 last = topdoc + hits_per_page;
2737 // OmegaExpandDecider methods.
2739 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2740 set<string> * querytermset)
2741 : db(db_)
2743 // We'll want the stemmer for testing matches anyway.
2744 if (!stemmer)
2745 stemmer = new Xapian::Stem(option["stemmer"]);
2746 if (querytermset) {
2747 set<string>::const_iterator i;
2748 for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2749 string term(*i);
2750 if (term.empty()) continue;
2752 unsigned char ch = term[0];
2753 bool stemmed = (ch == 'Z');
2754 if (stemmed) {
2755 term.erase(0, 1);
2756 if (term.empty()) continue;
2757 ch = term[0];
2760 if (C_isupper(ch)) {
2761 size_t prefix_len = prefix_from_term(NULL, term);
2762 term.erase(0, prefix_len);
2765 if (!stemmed) term = (*stemmer)(term);
2767 exclude_stems.insert(term);
2772 bool
2773 OmegaExpandDecider::operator()(const string & term) const
2775 unsigned char ch = term[0];
2777 // Reject terms with a prefix.
2778 if (C_isupper(ch)) return false;
2781 MyStopper stopper;
2782 // Don't suggest stopwords.
2783 if (stopper(term)) return false;
2786 // Reject small numbers.
2787 if (term.size() < 4 && C_isdigit(ch)) return false;
2789 // Reject terms containing a space.
2790 if (term.find(' ') != string::npos) return false;
2792 // Skip terms with stems in the exclude_stems set, to avoid suggesting
2793 // terms which are already in the query in some form.
2794 string stem = (*stemmer)(term);
2795 if (exclude_stems.find(stem) != exclude_stems.end())
2796 return false;
2798 // Ignore terms that only occur once (hapaxes) since they aren't
2799 // useful for finding related documents - they only occur in a
2800 // document that's already been marked as relevant.
2801 // FIXME: add an expand option to ignore terms where
2802 // termfreq == rtermfreq.
2803 if (db.get_termfreq(term) <= 1) return false;
2805 return true;