Make PostList subclasses return PostList* not Internal*
[xapian.git] / xapian-applications / omega / query.cc
blob907c5507347bd415d419d54beb033f6432a2488b
1 /* query.cc: query executor for omega
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002 Intercede 1749 Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017 Olly Betts
8 * Copyright 2008 Thomas Viehmann
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 * USA
26 #include <config.h>
28 #include <algorithm>
29 #include <iostream>
30 #include <map>
31 #include <set>
32 #include <unordered_map>
33 #include <vector>
35 #include <cassert>
36 #include <cctype>
37 #include "safeerrno.h"
38 #include <stdio.h>
39 #include <cstdlib>
40 #include <cstring>
41 #include "strcasecmp.h"
42 #include <ctime>
44 #include "safeunistd.h"
45 #include <sys/types.h>
46 #include "safesysstat.h"
47 #include "safefcntl.h"
49 #include "realtime.h"
51 #include <cdb.h>
53 #include "csvescape.h"
54 #include "date.h"
55 #include "datevalue.h"
56 #include "jsonescape.h"
57 #include "utils.h"
58 #include "omega.h"
59 #include "query.h"
60 #include "cgiparam.h"
61 #include "loadfile.h"
62 #include "sample.h"
63 #include "str.h"
64 #include "stringutils.h"
65 #include "transform.h"
66 #include "urldecode.h"
67 #include "urlencode.h"
68 #include "unixperm.h"
69 #include "values.h"
70 #include "weight.h"
71 #include "expand.h"
72 #include "md5wrap.h"
74 #include <xapian.h>
76 using namespace std;
78 using Xapian::Utf8Iterator;
80 using Xapian::Unicode::is_wordchar;
82 #ifndef SNPRINTF
83 #include <cstdarg>
/** Format into a fixed-size buffer, aborting instead of truncating.
 *
 *  Fallback used when SNPRINTF isn't defined by the build.
 *
 *  @param str     Destination buffer.
 *  @param size    Size of @a str in bytes (including room for the '\0').
 *  @param format  printf-style format string, followed by its arguments.
 *
 *  @return  Number of characters written, not counting the terminating '\0'.
 *
 *  Calls abort() if formatting fails or the result (plus terminator)
 *  wouldn't fit in @a size bytes.
 */
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    // A zero-sized buffer can't even hold the terminator, so nothing can be
    // formatted into it.
    if (size == 0)
        abort();

    va_list ap;
    va_start(ap, format);
    // vsnprintf() never writes more than size bytes, so an over-long result
    // is detected below without the buffer having been overrun first.
    int res = vsnprintf(str, size, format, ap);
    va_end(ap);
    if (res < 0 || size_t(res) >= size)
        abort(); /* Overflowed! */
    return res;
}
97 #else
98 #define my_snprintf SNPRINTF
99 #endif
// File-scope state for the query being processed.
101 static bool query_parsed = false;
102 static bool done_query = false;
103 static Xapian::docid last = 0;
// Match results; assigned by run_query() from enquire->get_mset().
105 static Xapian::MSet mset;
107 static map<Xapian::docid, bool> ticked;
109 static void ensure_query_parsed();
110 static void ensure_match();
// The query to run: built by set_probabilistic() from the parsed query
// strings, then combined with any filters by run_query().
112 static Xapian::Query query;
113 //static string url_query_string;
114 Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
// Query parser; (re)configured by set_probabilistic() before parsing.
116 static Xapian::QueryParser qp;
// Range processor for "size:" ranges; created on first use by
// set_probabilistic().
117 static Xapian::NumberRangeProcessor * size_rp = NULL;
// Stemmer used by html_highlight(), which creates it on first use.
118 static Xapian::Stem *stemmer = NULL;
120 static string eval_file(const string &fmtfile);
// Terms from the parsed query (filled in by set_probabilistic()).
122 static set<string> termset;
124 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
125 static map<string, string> termprefix_to_userprefix;
// Tab-separated list of the distinct query terms, in the order
// set_probabilistic() first saw them.
127 static string queryterms;
// Set to the parser's message when query parsing fails; run_query() won't
// run the match while this is non-empty.
129 static string error_msg;
131 static double secs = -1;
// Tab-separated log line template.
// NOTE(review): presumably the default format for $log when none is
// supplied - confirm against the $log implementation.
133 static const char DEFAULT_LOG_ENTRY[] =
134 "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
135 "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
136 "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
137 "$dbname\t"
138 "$query\t"
139 "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
141 class MyStopper : public Xapian::Stopper {
142 public:
143 bool operator()(const string &t) const {
144 switch (t[0]) {
145 case 'a':
146 return (t == "a" || t == "about" || t == "an" || t == "and" ||
147 t == "are" || t == "as" || t == "at");
148 case 'b':
149 return (t == "be" || t == "by");
150 case 'e':
151 return (t == "en");
152 case 'f':
153 return (t == "for" || t == "from");
154 case 'h':
155 return (t == "how");
156 case 'i':
157 return (t == "i" || t == "in" || t == "is" || t == "it");
158 case 'o':
159 return (t == "of" || t == "on" || t == "or");
160 case 't':
161 return (t == "that" || t == "the" || t == "this" || t == "to");
162 case 'w':
163 return (t == "was" || t == "what" || t == "when" ||
164 t == "where" || t == "which" || t == "who" ||
165 t == "why" || t == "will" || t == "with");
166 case 'y':
167 return (t == "you" || t == "your");
168 default:
169 return false;
174 static size_t
175 prefix_from_term(string* prefix, const string& term)
177 if (!term.empty()) {
178 if (term[0] == 'X') {
179 const string::const_iterator begin = term.begin();
180 string::const_iterator i = begin + 1;
181 while (i != term.end() && C_isupper(*i))
182 ++i;
183 if (prefix)
184 prefix->assign(begin, i);
185 if (i != term.end() && *i == ':')
186 ++i;
187 return i - begin;
190 if (C_isupper(term[0])) {
191 if (prefix)
192 *prefix = term[0];
193 return 1;
197 if (prefix)
198 prefix->resize(0);
199 return 0;
// Reject any name containing ".." so that format names, log file names, etc
// can't be used to reach files such as "../../etc/passwd".
//
// Returns true if the name is acceptable.
//
// FIXME: make this check more exact ("foo..bar" is safe)
// FIXME: log when this check fails
static bool
vet_filename(const std::string &filename)
{
    const bool has_dotdot = (filename.find("..") != std::string::npos);
    return !has_dotdot;
}
// How a newly parsed query relates to the previous one.
//
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
enum querytype {
    NEW_QUERY,      // entirely new query
    SAME_QUERY,     // unchanged query
    EXTENDED_QUERY, // new query, but based on the old one
    BAD_QUERY       // parse error (message in error_msg)
};
225 static multimap<string, string> probabilistic_query;
227 void
228 set_probabilistic_query(const string & prefix, const string & s)
230 string query_string = s;
231 // Strip leading and trailing whitespace from query_string.
232 trim(query_string);
233 if (!query_string.empty())
234 probabilistic_query.insert(make_pair(prefix, query_string));
237 static unsigned
238 read_qp_flags(const string & opt_pfx, unsigned f)
240 map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
241 for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
242 unsigned mask = 0;
243 const char * s = i->first.c_str() + opt_pfx.size();
244 switch (s[0]) {
245 case 'a':
246 if (strcmp(s, "auto_multiword_synonyms") == 0) {
247 mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
248 break;
250 if (strcmp(s, "auto_synonyms") == 0) {
251 mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
252 break;
254 break;
255 case 'b':
256 if (strcmp(s, "boolean") == 0) {
257 mask = Xapian::QueryParser::FLAG_BOOLEAN;
258 break;
260 if (strcmp(s, "boolean_any_case") == 0) {
261 mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
262 break;
264 break;
265 case 'c':
266 if (strcmp(s, "cjk_ngram") == 0) {
267 mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
268 break;
270 break;
271 case 'd':
272 if (strcmp(s, "default") == 0) {
273 mask = Xapian::QueryParser::FLAG_DEFAULT;
274 break;
276 break;
277 case 'l':
278 if (strcmp(s, "lovehate") == 0) {
279 mask = Xapian::QueryParser::FLAG_LOVEHATE;
280 break;
282 break;
283 case 'p':
284 if (strcmp(s, "partial") == 0) {
285 mask = Xapian::QueryParser::FLAG_PARTIAL;
286 break;
288 if (strcmp(s, "phrase") == 0) {
289 mask = Xapian::QueryParser::FLAG_PHRASE;
290 break;
292 if (strcmp(s, "pure_not") == 0) {
293 mask = Xapian::QueryParser::FLAG_PURE_NOT;
294 break;
296 break;
297 case 's':
298 if (strcmp(s, "spelling_correction") == 0) {
299 mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
300 break;
302 if (strcmp(s, "synonym") == 0) {
303 mask = Xapian::QueryParser::FLAG_SYNONYM;
304 break;
306 break;
307 case 'w':
308 if (strcmp(s, "wildcard") == 0) {
309 mask = Xapian::QueryParser::FLAG_WILDCARD;
310 break;
312 break;
315 if (i->second.empty()) {
316 f &= ~mask;
317 } else {
318 f |= mask;
321 return f;
// Parse the stored probabilistic query strings and classify the result
// relative to the previous query.
//
// Configures the global QueryParser (qp) from the options, parses every
// entry in probabilistic_query, and ANDs the non-empty results together
// into the global `query`.  The query's terms are added to termset and
// appended (tab-separated) to queryterms.
//
// oldp is the tab-separated list of terms from the previous query (empty
// if there wasn't one).
//
// Returns BAD_QUERY (with error_msg set) if parsing throws
// Xapian::QueryParserError, otherwise NEW_QUERY, SAME_QUERY or
// EXTENDED_QUERY.
324 static querytype
325 set_probabilistic(const string &oldp)
327 // Parse the query string.
328 qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
329 qp.set_stopper(new MyStopper());
330 qp.set_default_op(default_op);
331 qp.set_database(db);
332 // FIXME: provide a custom RP which handles size:10..20K, etc.
// The range processor is created once and then reused on later calls.
333 if (!size_rp)
334 size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
335 qp.add_rangeprocessor(size_rp);
// Options named "prefix,<user prefix>" hold a tab-separated list of term
// prefixes to map that user prefix to.
336 map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
337 for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
338 string user_prefix(pfx->first, 7);
339 const string & term_pfx_list = pfx->second;
340 string::size_type i = 0;
341 do {
342 string::size_type i0 = i;
343 i = term_pfx_list.find('\t', i);
344 const string & term_pfx = term_pfx_list.substr(i0, i - i0);
345 qp.add_prefix(user_prefix, term_pfx);
346 // std::map::insert() won't overwrite an existing entry, so we'll
347 // prefer the first user_prefix for which a particular term prefix
348 // is specified.
349 termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
// After the last entry find() returned npos, so ++i wraps round to 0 and
// ends the loop; otherwise ++i steps past the tab.
350 } while (++i);
// Options named "boolprefix,<user prefix>" map to a single term prefix;
// it's treated as exclusive unless a non-empty
// "nonexclusiveprefix,<term prefix>" option is set.
352 pfx = option.lower_bound("boolprefix,");
353 for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
354 string user_prefix(pfx->first, 11, string::npos);
355 auto it = option.find("nonexclusiveprefix," + pfx->second);
356 bool exclusive = (it == option.end() || it->second.empty());
357 qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
358 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
361 try {
362 unsigned default_flags = read_qp_flags("flag_", 0);
364 vector<Xapian::Query> queries;
365 queries.reserve(probabilistic_query.size());
367 multimap<string, string>::const_iterator j;
368 for (j = probabilistic_query.begin();
369 j != probabilistic_query.end();
370 ++j) {
371 const string & prefix = j->first;
373 // Choose the stemmer to use for this input.
374 string stemlang = option[prefix + ":stemmer"];
375 if (stemlang.empty())
376 stemlang = option["stemmer"];
377 qp.set_stemmer(Xapian::Stem(stemlang));
379 // Work out the flags to use for this input.
380 unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
382 const string & query_string = j->second;
383 Xapian::Query q = qp.parse_query(query_string, f, prefix);
384 if (!q.empty())
385 queries.push_back(q);
387 query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
388 } catch (Xapian::QueryParserError &e) {
389 error_msg = e.get_msg();
390 return BAD_QUERY;
// Record each term the first time it's seen.  n_new_terms counts every
// term the iterator yields, whether or not it was already in termset.
393 Xapian::termcount n_new_terms = 0;
394 for (Xapian::TermIterator i = query.get_terms_begin();
395 i != query.get_terms_end(); ++i) {
396 if (termset.find(*i) == termset.end()) {
397 termset.insert(*i);
398 if (!queryterms.empty()) queryterms += '\t';
399 queryterms += *i;
401 n_new_terms++;
404 // Check new query against the previous one
405 if (oldp.empty()) {
406 // If oldp was empty that means there were no probabilistic terms
407 // before, so if there are now this is a new query.
408 return n_new_terms ? NEW_QUERY : SAME_QUERY;
411 // The terms in oldp are separated by tabs.
412 const char oldp_separator = '\t';
413 size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
415 // short-cut: if the new query has fewer terms, it must be a new one
416 if (n_new_terms < n_old_terms) return NEW_QUERY;
// Every old term must still be present, otherwise it's a new query.
418 const char *term = oldp.c_str();
419 const char *pend;
420 while ((pend = strchr(term, oldp_separator)) != NULL) {
421 if (termset.find(string(term, pend - term)) == termset.end())
422 return NEW_QUERY;
423 term = pend + 1;
// Check the final term (the one after the last separator), if non-empty.
425 if (*term) {
426 if (termset.find(string(term)) == termset.end())
427 return NEW_QUERY;
430 // Use termset.size() rather than n_new_terms so we correctly handle
431 // the case when the query has repeated terms.
432 // This works wrongly in the case when the user extends the query
433 // by adding a term already in it, but that's unlikely and the behaviour
434 // isn't too bad (we just don't reset page 1). We also mishandle a few
435 // other obscure cases e.g. adding quotes to turn a query into a phrase.
436 if (termset.size() > n_old_terms) return EXTENDED_QUERY;
437 return SAME_QUERY;
440 static multimap<string, string> filter_map;
441 static set<string> neg_filters;
443 typedef multimap<string, string>::const_iterator FMCI;
445 void add_bterm(const string &term) {
446 string prefix;
447 if (prefix_from_term(&prefix, term) > 0)
448 filter_map.insert(multimap<string, string>::value_type(prefix, term));
451 void add_nterm(const string &term) {
452 if (!term.empty())
453 neg_filters.insert(term);
// Combine the parsed query with the boolean filters, date filter and
// negated filters, configure the Enquire object, and run the match.
//
// Updates the global `query` and, provided there's an Enquire object, no
// error message and a non-empty query, stores the results in `mset`.
456 static void
457 run_query()
// Weighting scheme name to pass to set_weighting_scheme().
459 string scheme;
// Set when the query ends up consisting only of filters.
460 bool force_boolean = false;
461 if (!filter_map.empty()) {
462 // OR together filters with the same prefix (or AND for non-exclusive
463 // prefixes), then AND together the resultant groups.
464 vector<Xapian::Query> filter_vec;
465 vector<string> same_vec;
466 string current;
// filter_map iterates in prefix order, so terms sharing a prefix are
// adjacent.  The loop runs one step beyond the last entry (`over`) so that
// the final group gets flushed too.
467 for (FMCI i = filter_map.begin(); ; ++i) {
468 bool over = (i == filter_map.end());
469 if (over || i->first != current) {
470 switch (same_vec.size()) {
471 case 0:
472 break;
473 case 1:
474 filter_vec.push_back(Xapian::Query(same_vec[0]));
475 break;
476 default: {
477 Xapian::Query::op op = Xapian::Query::OP_OR;
478 auto it = option.find("nonexclusiveprefix," + current);
479 if (it != option.end() && !it->second.empty()) {
480 op = Xapian::Query::OP_AND;
482 filter_vec.push_back(Xapian::Query(op,
483 same_vec.begin(),
484 same_vec.end()));
485 break;
488 same_vec.clear();
489 if (over) break;
490 current = i->first;
492 same_vec.push_back(i->second);
495 Xapian::Query filter(Xapian::Query::OP_AND,
496 filter_vec.begin(), filter_vec.end());
498 if (query.empty()) {
499 // If no probabilistic query is provided then promote the filters
500 // to be THE query - filtering an empty query will give no
501 // matches.
502 std::swap(query, filter);
// A non-empty "weightingpurefilter" option names the weighting scheme to
// use for a filter-only query; without it the match is forced boolean.
503 auto&& it = option.find("weightingpurefilter");
504 if (it != option.end() && !it->second.empty()) {
505 scheme = it->second;
506 } else {
507 force_boolean = true;
509 } else {
510 query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
514 if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
515 Xapian::Query date_filter;
516 if (date_value_slot != Xapian::BAD_VALUENO) {
517 // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
518 // latter the sort order just works correctly between different
519 // precisions).
520 bool as_time_t =
521 db.get_value_lower_bound(date_value_slot).size() == 4 &&
522 db.get_value_upper_bound(date_value_slot).size() == 4;
523 date_filter = date_value_range(as_time_t, date_value_slot,
524 date_start, date_end,
525 date_span);
526 } else {
// No value slot to use, so filter using terms instead, and also match
// anything indexed by the term "Dlatest".
527 date_filter = date_range_filter(date_start, date_end, date_span);
528 date_filter = Xapian::Query(Xapian::Query::OP_OR,
529 date_filter,
530 Xapian::Query("Dlatest"));
533 // If no probabilistic query is provided then promote the daterange
534 // filter to be THE query instead of filtering an empty query.
535 if (query.empty()) {
536 query = date_filter;
537 force_boolean = true;
538 } else {
539 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
543 if (!neg_filters.empty()) {
544 // OR together all negated filters.
545 Xapian::Query filter(Xapian::Query::OP_OR,
546 neg_filters.begin(), neg_filters.end());
548 if (query.empty()) {
549 // If we only have a negative filter for the query, use MatchAll as
550 // the query to apply the filters to.
551 query = Xapian::Query::MatchAll;
552 force_boolean = true;
554 query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
// Nothing to run if there's no Enquire object or an error was reported.
557 if (!enquire || !error_msg.empty()) return;
// Only consult the "weighting" option if no scheme has been chosen above
// and the match isn't being forced to boolean.
559 if (!force_boolean && scheme.empty()) {
560 auto&& it = option.find("weighting");
561 if (it != option.end()) scheme = it->second;
563 set_weighting_scheme(*enquire, scheme, force_boolean);
565 enquire->set_cutoff(threshold);
// A key maker takes precedence over sorting by a value slot.
567 if (sort_keymaker) {
568 if (sort_after) {
569 enquire->set_sort_by_relevance_then_key(sort_keymaker,
570 reverse_sort);
571 } else {
572 enquire->set_sort_by_key_then_relevance(sort_keymaker,
573 reverse_sort);
575 } else if (sort_key != Xapian::BAD_VALUENO) {
576 if (sort_after) {
577 enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
578 } else {
579 enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
583 enquire->set_docid_order(docid_order);
585 if (collapse) {
586 enquire->set_collapse_key(collapse_key);
589 if (!query.empty()) {
590 #if 0
591 // FIXME: If we start doing permissions checks based on $REMOTE_USER
592 // we're going to break some existing setups if users upgrade. We
593 // probably want a way to set this from OmegaScript.
594 const char * remote_user = getenv("REMOTE_USER");
595 if (remote_user)
596 apply_unix_permissions(query, remote_user);
597 #endif
599 enquire->set_query(query);
600 // We could use the value of topdoc as first parameter, but we
601 // need to know the first few items in the mset to fake a
602 // relevance set for topterms.
604 // If min_hits isn't set, check at least one extra result so we
605 // know if we've reached the end of the matches or not - then we
606 // can avoid offering a "next" button which leads to an empty page.
607 mset = enquire->get_mset(0, topdoc + hits_per_page,
608 topdoc + max(hits_per_page + 1, min_hits),
609 &rset);
/** Escape a string for inclusion in HTML.
 *
 *  Replaces '<', '>', '&' and '"' with the corresponding character entity;
 *  every other character is copied through unchanged.
 */
std::string
html_escape(const std::string &str)
{
    std::string escaped;
    for (const char ch : str) {
        if (ch == '<') {
            escaped += "&lt;";
        } else if (ch == '>') {
            escaped += "&gt;";
        } else if (ch == '&') {
            escaped += "&amp;";
        } else if (ch == '"') {
            escaped += "&quot;";
        } else {
            escaped += ch;
        }
    }
    return escaped;
}
/** Remove HTML tags from a string.
 *
 *  Everything from a '<' up to and including the next '>' is dropped (or up
 *  to the end of the string if there's no '>').  A '>' outside of a tag is
 *  dropped too.
 */
static std::string
html_strip(const std::string &str)
{
    std::string stripped;
    bool in_tag = false;
    for (const char ch : str) {
        if (ch == '<') {
            in_tag = true;
        } else if (ch == '>') {
            in_tag = false;
        } else if (!in_tag) {
            stripped += ch;
        }
    }
    return stripped;
}
/** Maps each distinct word in a tab-separated list to its position.
 *
 *  The map is shared between all WordList objects and is only rebuilt when
 *  a different list is passed in.
 */
class WordList {
    /// The list the current map was built from.
    static std::string prev_list;

    /// Word -> index among the distinct words, in order of first appearance.
    static std::unordered_map<std::string, int> word_to_occurrence;

  public:
    /** Build the word map for tab-separated @a list.
     *
     *  Does nothing if @a list is the same as the list last built from.
     */
    void build_word_map(const std::string& list) {
        if (list == prev_list) return;

        word_to_occurrence.clear();
        int next_index = 0;
        std::string::size_type start = 0;
        for (;;) {
            const std::string::size_type tab = list.find('\t', start);
            const std::string::size_type len =
                (tab == std::string::npos) ? std::string::npos : tab - start;
            // Only a word which isn't already present uses up an index.
            if (word_to_occurrence.emplace(list.substr(start, len),
                                           next_index).second) {
                ++next_index;
            }
            if (tab == std::string::npos) break;
            start = tab + 1;
        }
        prev_list = list;
    }

    /** Return the index of @a word, or -1 if it isn't in the list. */
    int word_in_list(const std::string& word) {
        const auto found = word_to_occurrence.find(word);
        return found != word_to_occurrence.end() ? found->second : -1;
    }
};

std::string WordList::prev_list;
std::unordered_map<std::string, int> WordList::word_to_occurrence;
695 // Not a character in an identifier
696 inline static bool
697 p_notid(unsigned int c)
699 return !C_isalnum(c) && c != '_';
702 // Not a character in an HTML tag name
703 inline static bool
704 p_nottag(unsigned int c)
706 return !C_isalnum(c) && c != '.' && c != '-';
709 // FIXME: shares algorithm with indextext.cc!
// HTML-escape s, highlighting the words which appear in list.
//
// s is split into words; each word is lower-cased and looked up in the
// tab-separated list, and failing that looked up again as "Z" followed by
// its stem.  A word which matches is wrapped in bra ... ket, or if bra is
// empty in a <b> element whose background colour is picked by the position
// of the match in list.  All text, highlighted or not, is passed through
// html_escape().
710 static string
711 html_highlight(const string &s, const string &list,
712 const string &bra, const string &ket)
// The stemmer is created on first use, from the "stemmer" option.
714 if (!stemmer) {
715 stemmer = new Xapian::Stem(option["stemmer"]);
718 string res;
720 Utf8Iterator j(s);
721 const Utf8Iterator s_end;
722 while (true) {
// Skip ahead to the start of the next word; stop if there isn't one.
723 Utf8Iterator first = j;
724 while (first != s_end && !is_wordchar(*first)) ++first;
725 if (first == s_end) break;
726 Utf8Iterator term_end;
727 string term;
728 string word;
// l marks the start of the text not yet copied to res.
729 const char *l = j.raw();
// Try to read an acronym written as ASCII capitals separated by '.'
// (e.g. "U.S.A"), collecting just the letters.  It's rejected if fewer
// than two letters were collected or a word character follows directly.
730 if (*first < 128 && C_isupper(*first)) {
731 j = first;
732 Xapian::Unicode::append_utf8(term, *j);
733 while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
734 Xapian::Unicode::append_utf8(term, *j);
736 if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
737 term.resize(0);
739 term_end = j;
// Not an acronym, so read an ordinary word.  An embedded '&' or '\'' is
// kept as part of the word when a word character follows it.
741 if (term.empty()) {
742 j = first;
743 while (is_wordchar(*j)) {
744 Xapian::Unicode::append_utf8(term, *j);
745 ++j;
746 if (j == s_end) break;
747 if (*j == '&' || *j == '\'') {
748 Utf8Iterator next = j;
749 ++next;
750 if (next == s_end || !is_wordchar(*next)) break;
751 term += *j;
752 j = next;
755 term_end = j;
// Allow a short suffix of '+'/'-' characters, or of '#' characters (which
// only add a single '#' to the term).  The suffix is dropped again if it
// added more than 3 characters or a word character follows it.
756 if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
757 string::size_type len = term.length();
758 if (*j == '#') {
759 term += '#';
760 do { ++j; } while (j != s_end && *j == '#');
761 } else {
762 while (j != s_end && (*j == '+' || *j == '-')) {
763 Xapian::Unicode::append_utf8(term, *j);
764 ++j;
767 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
768 term.resize(len);
769 } else {
770 term_end = j;
774 j = term_end;
775 term = Xapian::Unicode::tolower(term);
// Look the term up as it stands first, then as a stemmed term.
776 WordList w;
777 w.build_word_map(list);
778 int match = w.word_in_list(term);
779 if (match == -1) {
780 string stem = "Z";
781 stem += (*stemmer)(term);
782 match = w.word_in_list(stem);
784 if (match >= 0) {
// Output the text before the word, then the highlighted word itself.
785 res += html_escape(string(l, first.raw() - l));
786 if (!bra.empty()) {
787 res += bra;
788 } else {
789 static const char * colours[] = {
790 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
791 "990000", "009900", "996600", "006699", "990099"
793 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
794 const char * bg = colours[idx];
// Colours containing an 'f' get black text; the others get white text.
795 if (strchr(bg, 'f')) {
796 res += "<b style=\"color:black;background-color:#";
797 } else {
798 res += "<b style=\"color:white;background-color:#";
800 res += bg;
801 res += "\">";
803 word.assign(first.raw(), j.raw() - first.raw());
804 res += html_escape(word);
// Note that ket is only used when bra is non-empty.
805 if (!bra.empty()) {
806 res += ket;
807 } else {
808 res += "</b>";
810 } else {
811 res += html_escape(string(l, j.raw() - l));
// Copy whatever is left after the last word.
814 if (j != s_end) res += html_escape(string(j.raw(), j.left()));
815 return res;
818 #if 0
// NB: This function is compiled out by the surrounding "#if 0", and the
// url_query_string variable it reads is itself commented out at file scope.
//
// Writes url_query_string to cout.  If after starts with "&B=", parameters
// of the form "B=" followed by the character after[3] are left out of the
// output.
819 static void
820 print_query_string(const char *after)
822 if (after && strncmp(after, "&B=", 3) == 0) {
823 char prefix = after[3];
824 string::size_type start = 0, amp = 0;
825 while (true) {
826 amp = url_query_string.find('&', amp);
827 if (amp == string::npos) {
// No further parameters, so output the remainder and finish.
828 cout << url_query_string.substr(start);
829 return;
831 amp++;
// Skip over each consecutive parameter which starts "B=<prefix>".
832 while (url_query_string[amp] == 'B' &&
833 url_query_string[amp + 1] == '=' &&
834 url_query_string[amp + 2] == prefix) {
835 cout << url_query_string.substr(start, amp - start - 1);
836 start = url_query_string.find('&', amp + 3);
837 if (start == string::npos) return;
838 amp = start + 1;
842 cout << url_query_string;
844 #endif
846 class Fields {
847 mutable Xapian::docid did_cached;
848 mutable map<string, string> fields;
850 void read_fields(Xapian::docid did) const;
852 public:
853 Fields() : did_cached(0) { }
855 const string & get_field(Xapian::docid did, const string & field) const {
856 if (did != did_cached) read_fields(did);
857 return fields[field];
// Replace the cached fields with those parsed from the data of document
// did, and record did as the cached document.
//
// How the data is parsed depends on whether the "fieldnames" option is set
// (see the comments in each branch below).
861 void
862 Fields::read_fields(Xapian::docid did) const
864 fields.clear();
865 did_cached = did;
866 const string & data = db.get_document(did).get_data();
868 // Parse document data.
869 string::size_type i = 0;
870 const string & names = option["fieldnames"];
871 if (!names.empty()) {
872 // Each line is a field, with fieldnames taken from corresponding
873 // entries in the tab-separated list specified by $opt{fieldnames}.
874 string::size_type n = 0;
875 do {
876 string::size_type n0 = n;
877 n = names.find('\t', n);
878 string::size_type i0 = i;
879 i = data.find('\n', i);
// insert() doesn't overwrite, so if a name is repeated in the list the
// first line paired with it is the one kept.
880 fields.insert(make_pair(names.substr(n0, n - n0),
881 data.substr(i0, i - i0)));
// find() returns npos once there are no more separators, and incrementing
// npos wraps round to 0, so the loop ends when either the names or the
// lines run out.
882 } while (++n && ++i);
883 } else {
884 // Each line is a field, in the format NAME=VALUE. We assume the field
885 // name doesn't contain an "=". Lines without an "=" are currently
886 // just ignored.
887 do {
888 string::size_type i0 = i;
889 i = data.find('\n', i);
890 string line(data, i0, i - i0);
891 string::size_type j = line.find('=');
892 if (j != string::npos) {
// A field which occurs more than once has its values joined with tabs.
893 string & value = fields[line.substr(0, j)];
894 if (!value.empty()) value += '\t';
895 value.append(line, j + 1, string::npos);
// As above: ++i wraps round to 0 after the last line, ending the loop.
897 } while (++i);
// Shared field cache.
901 static Fields fields;
// NOTE(review): the variables below look like per-hit state (docid, hit
// number, percentage score, weight, collapse count) but the code which sets
// and reads them isn't in this part of the file - confirm before relying on
// that.
902 static Xapian::docid q0;
903 static Xapian::doccount hit_no;
904 static int percent;
905 static double weight;
906 static Xapian::doccount collapsed;
908 static string print_caption(const string &fmt, const vector<string> &param);
// Identifies each command.  There's one CMD_<name> enumerator per named
// command (the T() macro forms the enumerator name by pasting CMD_ onto the
// command name), plus CMD_ for the entry with an empty name and CMD_MACRO.
910 enum tagval {
911 CMD_,
912 CMD_add,
913 CMD_addfilter,
914 CMD_allterms,
915 CMD_and,
916 CMD_cgi,
917 CMD_cgilist,
918 CMD_cgiparams,
919 CMD_chr,
920 CMD_collapsed,
921 CMD_contains,
922 CMD_csv,
923 CMD_date,
924 CMD_dbname,
925 CMD_dbsize,
926 CMD_def,
927 CMD_defaultop,
928 CMD_div,
929 CMD_eq,
930 CMD_emptydocs,
931 CMD_env,
932 CMD_error,
933 CMD_field,
934 CMD_filesize,
935 CMD_filters,
936 CMD_filterterms,
937 CMD_find,
938 CMD_fmt,
939 CMD_freq,
940 CMD_ge,
941 CMD_gt,
942 CMD_hash,
943 CMD_highlight,
944 CMD_hit,
945 CMD_hitlist,
946 CMD_hitsperpage,
947 CMD_hostname,
948 CMD_html,
949 CMD_htmlstrip,
950 CMD_httpheader,
951 CMD_id,
952 CMD_if,
953 CMD_include,
954 CMD_json,
955 CMD_jsonarray,
956 CMD_last,
957 CMD_lastpage,
958 CMD_le,
959 CMD_length,
960 CMD_list,
961 CMD_log,
962 CMD_lookup,
963 CMD_lower,
964 CMD_lt,
965 CMD_map,
966 CMD_match,
967 CMD_max,
968 CMD_min,
969 CMD_mod,
970 CMD_msize,
971 CMD_msizeexact,
972 CMD_msizelower,
973 CMD_msizeupper,
974 CMD_mul,
975 CMD_muldiv,
976 CMD_ne,
977 CMD_nice,
978 CMD_not,
979 CMD_now,
980 CMD_opt,
981 CMD_or,
982 CMD_ord,
983 CMD_pack,
984 CMD_percentage,
985 CMD_prettyterm,
986 CMD_prettyurl,
987 CMD_query,
988 CMD_querydescription,
989 CMD_queryterms,
990 CMD_range,
991 CMD_record,
992 CMD_relevant,
993 CMD_relevants,
994 CMD_score,
995 CMD_set,
996 CMD_seterror,
997 CMD_setmap,
998 CMD_setrelevant,
999 CMD_slice,
1000 CMD_snippet,
1001 CMD_split,
1002 CMD_stoplist,
1003 CMD_sub,
1004 CMD_substr,
1005 CMD_suggestion,
1006 CMD_termprefix,
1007 CMD_terms,
1008 CMD_thispage,
1009 CMD_time,
1010 CMD_topdoc,
1011 CMD_topterms,
1012 CMD_transform,
1013 CMD_truncate,
1014 CMD_uniq,
1015 CMD_unpack,
1016 CMD_unprefix,
1017 CMD_unstem,
1018 CMD_upper,
1019 CMD_url,
1020 CMD_value,
1021 CMD_version,
1022 CMD_weight,
1023 CMD_MACRO // special tag for macro evaluation
// Attributes of a command.
1026 struct func_attrib {
// Which command this is (a CMD_* value from enum tagval).
1027 int tag;
// Argument limits for the command.
// NOTE(review): going by the func_tab column headings and entries, N (-1)
// in maxargs appears to mean "no upper limit" and evalargs appears to be
// how many leading arguments get evaluated before the command runs (N for
// all of them) - confirm against the code which interprets these.
1028 int minargs, maxargs, evalargs;
// 0, or one of the M and Q markers defined below.
// NOTE(review): presumably M means the match must have been run and Q
// means the query must have been parsed before the command is evaluated -
// confirm.
1029 char ensure;
// Builds a func_desc initialiser for command F: the name is F as a string
// and the tag is CMD_<F>; A-D fill in minargs, maxargs, evalargs and ensure.
1032 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
// A command name together with its attributes.
1033 struct func_desc {
1034 const char *name;
1035 struct func_attrib a;
// Shorthands for use in the command table.
1038 #define N -1
1039 #define M 'M'
1040 #define Q 'Q'
1041 // NB when adding a new command which ensures M or Q, update the list in
1042 // docs/omegascript.rst
1043 static struct func_desc func_tab[] = {
1044 //name minargs maxargs evalargs ensure
1045 {"",{CMD_, N, N, 0, 0}},// commented out code
1046 T(add, 0, N, N, 0), // add a list of numbers
1047 T(addfilter, 1, 1, N, 0), // add filter term
1048 T(allterms, 0, 1, N, 0), // list of all terms matching document
1049 T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
1050 T(cgi, 1, 1, N, 0), // return cgi parameter value
1051 T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
1052 T(cgiparams, 0, 0, N, 0), // return list of cgi parameter names
1053 T(chr, 1, 1, N, 0), // return UTF-8 for given Unicode codepoint
1054 T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
1055 T(contains, 2, 2, N, 0), // return position of substring, or empty string
1056 T(csv, 1, 2, N, 0), // CSV string escaping
1057 T(date, 1, 2, N, 0), // convert time_t to strftime format
1058 // (default: YYYY-MM-DD)
1059 T(dbname, 0, 0, N, 0), // database name
1060 T(dbsize, 0, 0, N, 0), // database size (# of documents)
1061 T(def, 2, 2, 1, 0), // define a macro
1062 T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
1063 T(div, 2, 2, N, 0), // integer divide
1064 T(emptydocs, 0, 1, N, 0), // list of empty documents
1065 T(env, 1, 1, N, 0), // environment variable
1066 T(error, 0, 0, N, 0), // error message
1067 T(eq, 2, 2, N, 0), // test equality
1068 T(field, 1, 2, N, 0), // lookup field in record
1069 T(filesize, 1, 1, N, 0), // pretty printed filesize
1070 T(filters, 0, 0, N, 0), // serialisation of current filters
1071 T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
1072 T(find, 2, 2, N, 0), // find entry in list
1073 T(fmt, 0, 0, N, 0), // name of current format
1074 T(freq, 1, 1, N, 0), // frequency of a term
1075 T(ge, 2, 2, N, 0), // test >=
1076 T(gt, 2, 2, N, 0), // test >
1077 T(hash, 2, 2, N, 0), // hash a string using the specified hash function
1078 T(highlight, 2, 4, N, 0), // html escape and highlight words from list
1079 T(hit, 0, 0, N, 0), // hit number of current mset entry (0-based)
1080 T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
1081 T(hitsperpage, 0, 0, N, 0), // hits per page
1082 T(hostname, 1, 1, N, 0), // extract hostname from URL
1083 T(html, 1, 1, N, 0), // html escape string (<>&")
1084 T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
1085 T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
1086 T(id, 0, 0, N, 0), // docid of current doc
1087 T(if, 2, 3, 1, 0), // conditional
1088 T(include, 1, 1, 1, 0), // include another file
1089 T(json, 1, 1, N, 0), // JSON string escaping
1090 T(jsonarray, 1, 1, N, 0), // Format list as a JSON array of strings
1091 T(last, 0, 0, N, M), // hit number one beyond end of current page
1092 T(lastpage, 0, 0, N, M), // number of last hit page
1093 T(le, 2, 2, N, 0), // test <=
1094 T(length, 1, 1, N, 0), // length of list
1095 T(list, 2, 5, N, 0), // pretty print list
1096 T(log, 1, 2, 1, 0), // create a log entry
1097 T(lookup, 2, 2, N, 0), // lookup in named cdb file
1098 T(lower, 1, 1, N, 0), // convert string to lower case
1099 T(lt, 2, 2, N, 0), // test <
1100 T(map, 1, 2, 1, 0), // map a list into another list
1101 T(match, 2, 3, N, 0), // regex match
1102 T(max, 1, N, N, 0), // maximum of a list of values
1103 T(min, 1, N, N, 0), // minimum of a list of values
1104 T(mod, 2, 2, N, 0), // integer modulus
1105 T(msize, 0, 0, N, M), // number of matches (estimated)
1106 T(msizeexact, 0, 0, N, M), // is $msize exact?
1107 T(msizelower, 0, 0, N, M), // number of matches (lower bound)
1108 T(msizeupper, 0, 0, N, M), // number of matches (upper bound)
1109 T(mul, 2, N, N, 0), // multiply a list of numbers
1110 T(muldiv, 3, 3, N, 0), // calculate A*B/C
1111 T(ne, 2, 2, N, 0), // test not equal
1112 T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
1113 T(not, 1, 1, N, 0), // logical not
1114 T(now, 0, 0, N, 0), // current date/time as a time_t
1115 T(opt, 1, 2, N, 0), // lookup an option value
1116 T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
1117 T(ord, 1, 1, N, 0), // return codepoint for first character of UTF-8 string
1118 T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1119 T(percentage, 0, 0, N, 0), // percentage score of current hit
1120 T(prettyterm, 1, 1, N, Q), // pretty print term name
1121 T(prettyurl, 1, 1, N, 0), // pretty version of URL
1122 T(query, 0, 1, N, Q), // query
1123 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
1124 T(queryterms, 0, 0, N, Q), // list of query terms
1125 T(range, 2, 2, N, 0), // return list of values between start and end
1126 T(record, 0, 1, N, 0), // record contents of document
1127 T(relevant, 0, 1, N, Q), // is document relevant?
1128 T(relevants, 0, 0, N, Q), // return list of relevant documents
1129 T(score, 0, 0, N, 0), // score (0-10) of current hit
1130 T(set, 2, 2, N, 0), // set option value
1131 T(seterror, 1, 1, N, 0), // set error_msg, setting it early stops query execution
1132 T(setmap, 1, N, N, 0), // set map of option values
1133 T(setrelevant, 0, 1, N, Q), // set rset
1134 T(slice, 2, 2, N, 0), // slice a list using a second list
1135 T(snippet, 1, 2, N, M), // generate snippet from text
1136 T(split, 1, 2, N, 0), // split a string to give a list
1137 T(stoplist, 0, 0, N, Q), // return list of stopped terms
1138 T(sub, 2, 2, N, 0), // subtract
1139 T(substr, 2, 3, N, 0), // substring
1140 T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
1141 T(termprefix, 1, 1, N, 0), // get any prefix from a term
1142 T(terms, 0, 1, N, M), // list of matching terms
1143 T(thispage, 0, 0, N, M), // page number of current page
1144 T(time, 0, 0, N, M), // how long the match took (in seconds)
1145 T(topdoc, 0, 0, N, M), // first document on current page of hit list
1146 // (counting from 0)
1147 T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
1148 // (default 16)
1149 T(transform, 3, 4, N, 0), // transform with a regexp
1150 T(truncate, 2, 4, N, 0), // truncate after a word
1151 T(uniq, 1, 1, N, 0), // removed duplicates from a sorted list
1152 T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
1153 T(unprefix, 1, 1, N, 0), // remove any prefix from a term
1154 T(unstem, 1, 1, N, Q), // return list of probabilistic terms from
1155 // the query which stemmed to this term
1156 T(upper, 1, 1, N, 0), // convert string to upper case
1157 T(url, 1, 1, N, 0), // url encode argument
1158 T(value, 1, 2, N, 0), // return document value
1159 T(version, 0, 0, N, 0), // omega version string
1160 T(weight, 0, 0, N, 0), // weight of the current hit
1161 { NULL,{0, 0, 0, 0, 0}}
1164 #undef T // Leaving T defined screws up Sun's C++ compiler!
// Bodies of the macros created by $def, stored in definition order.  eval()
// gives each macro the tag CMD_MACRO + (its index in this vector) and uses
// that tag to find the body again when the macro is invoked.
1166 static vector<string> macros;
1168 // Call write() repeatedly until all data is written or we get a
1169 // non-recoverable error.
1170 static ssize_t
1171 write_all(int fd, const char * buf, size_t count)
1173 while (count) {
1174 ssize_t r = write(fd, buf, count);
1175 if (rare(r < 0)) {
1176 if (errno == EINTR) continue;
1177 return r;
1179 buf += r;
1180 count -= r;
1182 return 0;
1185 static string
1186 eval(const string &fmt, const vector<string> &param)
1188 static map<string, const struct func_attrib *> func_map;
1189 if (func_map.empty()) {
1190 struct func_desc *p;
1191 for (p = func_tab; p->name != NULL; ++p) {
1192 func_map[string(p->name)] = &(p->a);
1195 string res;
1196 string::size_type p = 0, q;
1197 while ((q = fmt.find('$', p)) != string::npos) try {
1198 res.append(fmt, p, q - p);
1199 string::size_type code_start = q; // note down for error reporting
1200 q++;
1201 if (q >= fmt.size()) break;
1202 unsigned char ch = fmt[q];
1203 switch (ch) {
1204 // Magic sequences:
1205 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1206 case '$':
1207 res += '$';
1208 p = q + 1;
1209 continue;
1210 case '(':
1211 res += '{';
1212 p = q + 1;
1213 continue;
1214 case ')':
1215 res += '}';
1216 p = q + 1;
1217 continue;
1218 case '.':
1219 res += ',';
1220 p = q + 1;
1221 continue;
1222 case '_':
1223 ch = '0';
1224 // FALL THRU
1225 case '1': case '2': case '3': case '4': case '5':
1226 case '6': case '7': case '8': case '9':
1227 ch -= '0';
1228 if (ch < param.size()) res += param[ch];
1229 p = q + 1;
1230 continue;
1231 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1232 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1233 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1234 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1235 case 'y': case 'z':
1236 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1237 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1238 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1239 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1240 case 'Y': case 'Z':
1241 case '{':
1242 break;
1243 default:
1244 string msg = "Unknown $ code in: $";
1245 msg.append(fmt, q, string::npos);
1246 throw msg;
1248 p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1249 string var(fmt, q, p - q);
1250 map<string, const struct func_attrib *>::const_iterator func;
1251 func = func_map.find(var);
1252 if (func == func_map.end()) {
1253 throw "Unknown function '" + var + "'";
1255 vector<string> args;
1256 if (fmt[p] == '{') {
1257 q = p + 1;
1258 int nest = 1;
1259 while (true) {
1260 p = fmt.find_first_of(",{}", p + 1);
1261 if (p == string::npos)
1262 throw "missing } in " + fmt.substr(code_start);
1263 if (fmt[p] == '{') {
1264 ++nest;
1265 } else {
1266 if (nest == 1) {
1267 // should we split the args
1268 if (func->second->minargs != N) {
1269 args.push_back(fmt.substr(q, p - q));
1270 q = p + 1;
1273 if (fmt[p] == '}' && --nest == 0) break;
1276 if (func->second->minargs == N)
1277 args.push_back(fmt.substr(q, p - q));
1278 ++p;
1281 if (func->second->minargs != N) {
1282 if (int(args.size()) < func->second->minargs)
1283 throw "too few arguments to $" + var;
1284 if (func->second->maxargs != N &&
1285 int(args.size()) > func->second->maxargs)
1286 throw "too many arguments to $" + var;
1288 vector<string>::size_type n;
1289 if (func->second->evalargs != N)
1290 n = func->second->evalargs;
1291 else
1292 n = args.size();
1294 for (vector<string>::size_type j = 0; j < n; ++j)
1295 args[j] = eval(args[j], param);
1297 if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1298 ensure_query_parsed();
1299 if (func->second->ensure == 'M') ensure_match();
1300 string value;
1301 switch (func->second->tag) {
1302 case CMD_:
1303 break;
1304 case CMD_add: {
1305 int total = 0;
1306 vector<string>::const_iterator i;
1307 for (auto&& arg : args)
1308 total += string_to_int(arg);
1309 value = str(total);
1310 break;
1312 case CMD_addfilter:
1313 add_bterm(args[0]);
1314 break;
1315 case CMD_allterms: {
1316 // list of all terms indexing document
1317 int id = q0;
1318 if (!args.empty()) id = string_to_int(args[0]);
1319 for (Xapian::TermIterator term = db.termlist_begin(id);
1320 term != db.termlist_end(id); ++term) {
1321 value += *term;
1322 value += '\t';
1325 if (!value.empty()) value.erase(value.size() - 1);
1326 break;
1328 case CMD_and: {
1329 value = "true";
1330 for (auto&& arg : args) {
1331 if (eval(arg, param).empty()) {
1332 value.resize(0);
1333 break;
1336 break;
1338 case CMD_cgi: {
1339 MCI i = cgi_params.find(args[0]);
1340 if (i != cgi_params.end()) value = i->second;
1341 break;
1343 case CMD_cgilist: {
1344 pair<MCI, MCI> g;
1345 g = cgi_params.equal_range(args[0]);
1346 for (MCI i = g.first; i != g.second; ++i) {
1347 value += i->second;
1348 value += '\t';
1350 if (!value.empty()) value.erase(value.size() - 1);
1351 break;
1353 case CMD_cgiparams: {
1354 const string* prev = NULL;
1355 for (auto&& i : cgi_params) {
1356 if (prev && i.first == *prev) continue;
1357 value += i.first;
1358 value += '\t';
1359 prev = &i.first;
1361 if (!value.empty()) value.erase(value.size() - 1);
1362 break;
1364 case CMD_chr:
1365 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1366 break;
1367 case CMD_collapsed: {
1368 value = str(collapsed);
1369 break;
1371 case CMD_contains: {
1372 size_t pos = args[1].find(args[0]);
1373 if (pos != string::npos) {
1374 value = str(pos);
1376 break;
1378 case CMD_csv:
1379 value = args[0];
1380 if (args.size() > 1 && !args[1].empty()) {
1381 csv_escape_always(value);
1382 } else {
1383 csv_escape(value);
1385 break;
1386 case CMD_date:
1387 value = args[0];
1388 if (!value.empty()) {
1389 char buf[64] = "";
1390 time_t date = string_to_int(value);
1391 if (date != static_cast<time_t>(-1)) {
1392 struct tm *then;
1393 then = gmtime(&date);
1394 string date_fmt = "%Y-%m-%d";
1395 if (args.size() > 1) date_fmt = eval(args[1], param);
1396 strftime(buf, sizeof buf, date_fmt.c_str(), then);
1398 value = buf;
1400 break;
1401 case CMD_dbname:
1402 value = dbname;
1403 break;
1404 case CMD_dbsize: {
1405 static Xapian::doccount dbsize;
1406 if (!dbsize) dbsize = db.get_doccount();
1407 value = str(dbsize);
1408 break;
1410 case CMD_def: {
1411 func_attrib *fa = new func_attrib;
1412 fa->tag = CMD_MACRO + macros.size();
1413 fa->minargs = 0;
1414 fa->maxargs = 9;
1415 fa->evalargs = N; // FIXME: or 0?
1416 fa->ensure = 0;
1418 macros.push_back(args[1]);
1419 func_map[args[0]] = fa;
1420 break;
1422 case CMD_defaultop:
1423 if (default_op == Xapian::Query::OP_AND) {
1424 value = "and";
1425 } else {
1426 value = "or";
1428 break;
1429 case CMD_div: {
1430 int denom = string_to_int(args[1]);
1431 if (denom == 0) {
1432 value = "divide by 0";
1433 } else {
1434 value = str(string_to_int(args[0]) /
1435 string_to_int(args[1]));
1437 break;
1439 case CMD_eq:
1440 if (args[0] == args[1]) value = "true";
1441 break;
1442 case CMD_emptydocs: {
1443 string t;
1444 if (!args.empty())
1445 t = args[0];
1446 Xapian::PostingIterator i;
1447 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1448 if (i.get_doclength() != 0) continue;
1449 if (!value.empty()) value += '\t';
1450 value += str(*i);
1452 break;
1454 case CMD_env: {
1455 char *env = getenv(args[0].c_str());
1456 if (env != NULL) value = env;
1457 break;
1459 case CMD_error:
1460 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1461 error_msg = "Database '" + dbname + "' couldn't be opened";
1463 value = error_msg;
1464 break;
1465 case CMD_field: {
1466 Xapian::docid did = q0;
1467 if (args.size() > 1) did = string_to_int(args[1]);
1468 value = fields.get_field(did, args[0]);
1469 break;
1471 case CMD_filesize: {
1472 // FIXME: rounding? i18n?
1473 int size = string_to_int(args[0]);
1474 int intpart = size;
1475 int fraction = -1;
1476 const char * format = 0;
1477 if (size < 0) {
1478 // Negative size -> empty result.
1479 } else if (size == 1) {
1480 format = "%d byte";
1481 } else if (size < 1024) {
1482 format = "%d bytes";
1483 } else {
1484 if (size < 1024 * 1024) {
1485 format = "%d.%cK";
1486 } else {
1487 size /= 1024;
1488 if (size < 1024 * 1024) {
1489 format = "%d.%cM";
1490 } else {
1491 size /= 1024;
1492 format = "%d.%cG";
1495 intpart = unsigned(size) / 1024;
1496 fraction = unsigned(size) % 1024;
1498 if (format) {
1499 char buf[200];
1500 int len;
1501 if (fraction == -1) {
1502 len = my_snprintf(buf, sizeof(buf), format, intpart);
1503 } else {
1504 fraction = (fraction * 10 / 1024) + '0';
1505 len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1507 if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1508 value.assign(buf, len);
1510 break;
1512 case CMD_filters:
1513 value = filters;
1514 break;
1515 case CMD_filterterms: {
1516 Xapian::TermIterator term = db.allterms_begin();
1517 term.skip_to(args[0]);
1518 while (term != db.allterms_end()) {
1519 string t = *term;
1520 if (!startswith(t, args[0])) break;
1521 value += t;
1522 value += '\t';
1523 ++term;
1526 if (!value.empty()) value.erase(value.size() - 1);
1527 break;
1529 case CMD_find: {
1530 string l = args[0], s = args[1];
1531 string::size_type i = 0, j = 0;
1532 size_t count = 0;
1533 while (j != l.size()) {
1534 j = l.find('\t', i);
1535 if (j == string::npos) j = l.size();
1536 if (j - i == s.length()) {
1537 if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1538 value = str(count);
1539 break;
1542 ++count;
1543 i = j + 1;
1545 break;
1547 case CMD_fmt:
1548 value = fmtname;
1549 break;
1550 case CMD_freq: {
1551 const string& term = args[0];
1552 Xapian::doccount termfreq = 0;
1553 if (done_query) {
1554 termfreq = mset.get_termfreq(term);
1556 if (termfreq == 0) {
1557 // We want $freq to work before the match is run, and we
1558 // don't want using it to force the match to run.
1559 termfreq = db.get_termfreq(term);
1561 value = str(termfreq);
1562 break;
1564 case CMD_ge:
1565 if (string_to_int(args[0]) >= string_to_int(args[1]))
1566 value = "true";
1567 break;
1568 case CMD_gt:
1569 if (string_to_int(args[0]) > string_to_int(args[1]))
1570 value = "true";
1571 break;
1572 case CMD_hash: {
1573 const string& data = args[0];
1574 const string& hash = args[1];
1575 if (hash == "md5") {
1576 string md5;
1577 md5_string(data, md5);
1578 value.reserve(md5.size() * 2);
1579 for (unsigned char byte : md5) {
1580 value += "0123456789abcdef"[byte >> 4];
1581 value += "0123456789abcdef"[byte & 0x0f];
1583 } else {
1584 throw "Unknown hash function: " + hash;
1586 break;
1588 case CMD_highlight: {
1589 string bra, ket;
1590 if (args.size() > 2) {
1591 bra = args[2];
1592 if (args.size() > 3) {
1593 ket = args[3];
1594 } else {
1595 string::const_iterator i;
1596 i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1597 ket = "</";
1598 ket.append(bra, 1, i - bra.begin() - 1);
1599 ket += '>';
1603 value = html_highlight(args[0], args[1], bra, ket);
1604 break;
1606 case CMD_hit:
1607 // 0-based mset index
1608 value = str(hit_no);
1609 break;
1610 case CMD_hitlist:
1611 #if 0
1612 url_query_string = "?DB=";
1613 url_query_string += dbname;
1614 multimap<string, string>::const_iterator j;
1615 for (j = probabilistic_query.begin();
1616 j != probabilistic_query.end();
1617 ++j) {
1618 if (j->first.empty()) {
1619 url_query_string += "&P=";
1620 } else {
1621 url_query_string += "&P."
1622 url_query_string += j->first;
1623 url_query_string += '=';
1625 const char *q = j->second.c_str();
1626 int ch;
1627 while ((ch = *q++) != '\0') {
1628 switch (ch) {
1629 case '+':
1630 url_query_string += "%2b";
1631 break;
1632 case '"':
1633 url_query_string += "%22";
1634 break;
1635 case '%':
1636 url_query_string += "%25";
1637 break;
1638 case '&':
1639 url_query_string += "%26";
1640 break;
1641 case ' ':
1642 ch = '+';
1643 /* fall through */
1644 default:
1645 url_query_string += ch;
1649 // add any boolean terms
1650 for (FMCI i = filter_map.begin(); i != filter_map.end(); ++i) {
1651 url_query_string += "&B=";
1652 url_query_string += i->second;
1654 #endif
1655 for (hit_no = topdoc; hit_no < last; ++hit_no)
1656 value += print_caption(args[0], param);
1657 hit_no = 0;
1658 break;
1659 case CMD_hitsperpage:
1660 value = str(hits_per_page);
1661 break;
1662 case CMD_hostname: {
1663 value = args[0];
1664 // remove URL scheme and/or path
1665 string::size_type i = value.find("://");
1666 if (i == string::npos) i = 0; else i += 3;
1667 value = value.substr(i, value.find('/', i) - i);
1668 // remove user@ or user:password@
1669 i = value.find('@');
1670 if (i != string::npos) value.erase(0, i + 1);
1671 // remove :port
1672 i = value.find(':');
1673 if (i != string::npos) value.resize(i);
1674 break;
1676 case CMD_html:
1677 value = html_escape(args[0]);
1678 break;
1679 case CMD_htmlstrip:
1680 value = html_strip(args[0]);
1681 break;
1682 case CMD_httpheader:
1683 if (!suppress_http_headers) {
1684 cout << args[0] << ": " << args[1] << endl;
1685 if (!set_content_type && args[0].length() == 12 &&
1686 strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1687 set_content_type = true;
1690 break;
1691 case CMD_id:
1692 // document id
1693 value = str(q0);
1694 break;
1695 case CMD_if:
1696 if (!args[0].empty())
1697 value = eval(args[1], param);
1698 else if (args.size() > 2)
1699 value = eval(args[2], param);
1700 break;
1701 case CMD_include:
1702 value = eval_file(args[0]);
1703 break;
1704 case CMD_json:
1705 value = args[0];
1706 json_escape(value);
1707 break;
1708 case CMD_jsonarray: {
1709 const string & l = args[0];
1710 string::size_type i = 0, j;
1711 if (l.empty()) {
1712 value = "[]";
1713 break;
1715 value = "[\"";
1716 while (true) {
1717 j = l.find('\t', i);
1718 string elt(l, i, j - i);
1719 json_escape(elt);
1720 value += elt;
1721 if (j == string::npos) break;
1722 value += "\",\"";
1723 i = j + 1;
1725 value += "\"]";
1726 break;
1728 case CMD_last:
1729 value = str(last);
1730 break;
1731 case CMD_lastpage: {
1732 int l = mset.get_matches_estimated();
1733 if (l > 0) l = (l - 1) / hits_per_page + 1;
1734 value = str(l);
1735 break;
1737 case CMD_le:
1738 if (string_to_int(args[0]) <= string_to_int(args[1]))
1739 value = "true";
1740 break;
1741 case CMD_length:
1742 if (args[0].empty()) {
1743 value = "0";
1744 } else {
1745 size_t length = count(args[0].begin(), args[0].end(), '\t');
1746 value = str(length + 1);
1748 break;
1749 case CMD_list: {
1750 if (!args[0].empty()) {
1751 string pre, inter, interlast, post;
1752 switch (args.size()) {
1753 case 2:
1754 inter = interlast = args[1];
1755 break;
1756 case 3:
1757 inter = args[1];
1758 interlast = args[2];
1759 break;
1760 case 4:
1761 pre = args[1];
1762 inter = interlast = args[2];
1763 post = args[3];
1764 break;
1765 case 5:
1766 pre = args[1];
1767 inter = args[2];
1768 interlast = args[3];
1769 post = args[4];
1770 break;
1772 value += pre;
1773 string list = args[0];
1774 string::size_type split = 0, split2;
1775 while ((split2 = list.find('\t', split)) != string::npos) {
1776 if (split) value += inter;
1777 value.append(list, split, split2 - split);
1778 split = split2 + 1;
1780 if (split) value += interlast;
1781 value.append(list, split, string::npos);
1782 value += post;
1784 break;
1786 case CMD_log: {
1787 if (!vet_filename(args[0])) break;
1788 string logfile = log_dir + args[0];
1789 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1790 if (fd == -1) break;
1791 vector<string> noargs;
1792 noargs.resize(1);
1793 string line;
1794 if (args.size() > 1) {
1795 line = args[1];
1796 } else {
1797 line = DEFAULT_LOG_ENTRY;
1799 line = eval(line, noargs);
1800 line += '\n';
1801 (void)write_all(fd, line.data(), line.length());
1802 close(fd);
1803 break;
1805 case CMD_lookup: {
1806 if (!vet_filename(args[0])) break;
1807 string cdbfile = cdb_dir + args[0];
1808 int fd = open(cdbfile.c_str(), O_RDONLY);
1809 if (fd == -1) break;
1811 struct cdb cdb;
1812 cdb_init(&cdb, fd);
1814 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1815 size_t datalen = cdb_datalen(&cdb);
1816 const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1817 if (q) {
1818 value.assign(static_cast<const char *>(dat), datalen);
1822 cdb_free(&cdb);
1823 close(fd); // FIXME: cache fds?
1824 break;
1826 case CMD_lower:
1827 value = Xapian::Unicode::tolower(args[0]);
1828 break;
1829 case CMD_lt:
1830 if (string_to_int(args[0]) < string_to_int(args[1]))
1831 value = "true";
1832 break;
1833 case CMD_map:
1834 if (!args[0].empty()) {
1835 string l = args[0], pat = args[1];
1836 vector<string> new_args(param);
1837 string::size_type i = 0, j;
1838 while (true) {
1839 j = l.find('\t', i);
1840 new_args[0] = l.substr(i, j - i);
1841 value += eval(pat, new_args);
1842 if (j == string::npos) break;
1843 value += '\t';
1844 i = j + 1;
1847 break;
1848 case CMD_match:
1849 omegascript_match(value, args);
1850 break;
1851 case CMD_max: {
1852 vector<string>::const_iterator i = args.begin();
1853 int val = string_to_int(*i++);
1854 for (; i != args.end(); ++i) {
1855 int x = string_to_int(*i);
1856 if (x > val) val = x;
1858 value = str(val);
1859 break;
1861 case CMD_min: {
1862 vector<string>::const_iterator i = args.begin();
1863 int val = string_to_int(*i++);
1864 for (; i != args.end(); ++i) {
1865 int x = string_to_int(*i);
1866 if (x < val) val = x;
1868 value = str(val);
1869 break;
1871 case CMD_msize:
1872 // Estimated number of matches.
1873 value = str(mset.get_matches_estimated());
1874 break;
1875 case CMD_msizeexact:
1876 // Is msize exact?
1877 if (mset.get_matches_lower_bound()
1878 == mset.get_matches_upper_bound())
1879 value = "true";
1880 break;
1881 case CMD_msizelower:
1882 // Lower bound on number of matches.
1883 value = str(mset.get_matches_lower_bound());
1884 break;
1885 case CMD_msizeupper:
1886 // Upper bound on number of matches.
1887 value = str(mset.get_matches_upper_bound());
1888 break;
1889 case CMD_mod: {
1890 int denom = string_to_int(args[1]);
1891 if (denom == 0) {
1892 value = "divide by 0";
1893 } else {
1894 value = str(string_to_int(args[0]) %
1895 string_to_int(args[1]));
1897 break;
1899 case CMD_mul: {
1900 vector<string>::const_iterator i = args.begin();
1901 int total = string_to_int(*i++);
1902 while (i != args.end())
1903 total *= string_to_int(*i++);
1904 value = str(total);
1905 break;
1907 case CMD_muldiv: {
1908 int denom = string_to_int(args[2]);
1909 if (denom == 0) {
1910 value = "divide by 0";
1911 } else {
1912 int num = string_to_int(args[0]) * string_to_int(args[1]);
1913 value = str(num / denom);
1915 break;
1917 case CMD_ne:
1918 if (args[0] != args[1]) value = "true";
1919 break;
1920 case CMD_nice: {
1921 string::const_iterator i = args[0].begin();
1922 int len = args[0].length();
1923 while (len) {
1924 value += *i++;
1925 if (--len && len % 3 == 0) value += option["thousand"];
1927 break;
1929 case CMD_not:
1930 if (args[0].empty()) value = "true";
1931 break;
1932 case CMD_now: {
1933 char buf[64];
1934 my_snprintf(buf, sizeof(buf), "%lu",
1935 static_cast<unsigned long>(time(NULL)));
1936 // MSVC's snprintf omits the zero byte if the string if
1937 // sizeof(buf) long.
1938 buf[sizeof(buf) - 1] = '\0';
1939 value = buf;
1940 break;
1942 case CMD_opt:
1943 if (args.size() == 2) {
1944 value = option[args[0] + "," + args[1]];
1945 } else {
1946 value = option[args[0]];
1948 break;
1949 case CMD_or: {
1950 for (auto&& arg : args) {
1951 value = eval(arg, param);
1952 if (!value.empty()) break;
1954 break;
1956 case CMD_ord: {
1957 if (!args[0].empty()) {
1958 Utf8Iterator it(args[0]);
1959 value = str(*it);
1961 break;
1963 case CMD_pack:
1964 value = int_to_binary_string(string_to_int(args[0]));
1965 break;
1966 case CMD_percentage:
1967 // percentage score
1968 value = str(percent);
1969 break;
1970 case CMD_prettyterm:
1971 value = pretty_term(args[0]);
1972 break;
1973 case CMD_prettyurl:
1974 value = args[0];
1975 url_prettify(value);
1976 break;
1977 case CMD_query: {
1978 pair<multimap<string, string>::const_iterator,
1979 multimap<string, string>::const_iterator> r;
1980 r = probabilistic_query.equal_range(args.empty() ?
1981 string() : args[0]);
1982 multimap<string, string>::const_iterator j;
1983 for (j = r.first; j != r.second; ++j) {
1984 if (!value.empty()) value += '\t';
1985 const string & s = j->second;
1986 size_t start = 0, tab;
1987 while ((tab = s.find('\t', start)) != string::npos) {
1988 value.append(s, start, tab - start);
1989 value += ' ';
1990 start = tab + 1;
1992 value.append(s, start, string::npos);
1994 break;
1996 case CMD_querydescription:
1997 value = query.get_description();
1998 break;
1999 case CMD_queryterms:
2000 value = queryterms;
2001 break;
2002 case CMD_range: {
2003 int start = string_to_int(args[0]);
2004 int end = string_to_int(args[1]);
2005 while (start <= end) {
2006 value += str(start);
2007 if (start < end) value += '\t';
2008 start++;
2010 break;
2012 case CMD_record: {
2013 int id = q0;
2014 if (!args.empty()) id = string_to_int(args[0]);
2015 value = db.get_document(id).get_data();
2016 break;
2018 case CMD_relevant: {
2019 // document id if relevant; empty otherwise
2020 int id = q0;
2021 if (!args.empty()) id = string_to_int(args[0]);
2022 map<Xapian::docid, bool>::iterator i = ticked.find(id);
2023 if (i != ticked.end()) {
2024 i->second = false; // icky side-effect
2025 value = str(id);
2027 break;
2029 case CMD_relevants: {
2030 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
2031 i != ticked.end(); ++i) {
2032 if (i->second) {
2033 value += str(i->first);
2034 value += '\t';
2037 if (!value.empty()) value.erase(value.size() - 1);
2038 break;
2040 case CMD_score:
2041 // Score (0 to 10)
2042 value = str(percent / 10);
2043 break;
2044 case CMD_set:
2045 option[args[0]] = args[1];
2046 break;
2047 case CMD_seterror:
2048 error_msg = args[0];
2049 break;
2050 case CMD_setmap: {
2051 string base = args[0] + ',';
2052 if (args.size() % 2 != 1)
2053 throw string("$setmap requires an odd number of arguments");
2054 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2055 option[base + args[i]] = args[i + 1];
2057 break;
2059 case CMD_setrelevant: {
2060 string::size_type i = 0, j;
2061 while (true) {
2062 j = args[0].find_first_not_of("0123456789", i);
2063 Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2064 if (id) {
2065 rset.add_document(id);
2066 ticked[id] = true;
2068 if (j == string::npos) break;
2069 i = j + 1;
2071 break;
2073 case CMD_slice: {
2074 string list = args[0], pos = args[1];
2075 vector<string> items;
2076 string::size_type i = 0, j;
2077 while (true) {
2078 j = list.find('\t', i);
2079 items.push_back(list.substr(i, j - i));
2080 if (j == string::npos) break;
2081 i = j + 1;
2083 i = 0;
2084 bool have_added = false;
2085 while (true) {
2086 j = pos.find('\t', i);
2087 int item = string_to_int(pos.substr(i, j - i));
2088 if (item >= 0 && size_t(item) < items.size()) {
2089 if (have_added) value += '\t';
2090 value += items[item];
2091 have_added = true;
2093 if (j == string::npos) break;
2094 i = j + 1;
2096 break;
2098 case CMD_snippet: {
2099 size_t length = 200;
2100 if (args.size() > 1) {
2101 length = string_to_int(args[1]);
2103 if (!stemmer)
2104 stemmer = new Xapian::Stem(option["stemmer"]);
2105 // FIXME: Allow start and end highlight and omit to be specified.
2106 value = mset.snippet(args[0], length, *stemmer,
2107 mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2108 "<strong>", "</strong>", "...");
2109 break;
2111 case CMD_split: {
2112 string split;
2113 if (args.size() == 1) {
2114 split = " ";
2115 value = args[0];
2116 } else {
2117 split = args[0];
2118 value = args[1];
2120 string::size_type i = 0;
2121 while (true) {
2122 if (split.empty()) {
2123 ++i;
2124 if (i >= value.size()) break;
2125 } else {
2126 i = value.find(split, i);
2127 if (i == string::npos) break;
2129 value.replace(i, split.size(), 1, '\t');
2130 ++i;
2132 break;
2134 case CMD_stoplist: {
2135 Xapian::TermIterator i = qp.stoplist_begin();
2136 Xapian::TermIterator end = qp.stoplist_end();
2137 while (i != end) {
2138 if (!value.empty()) value += '\t';
2139 value += *i;
2140 ++i;
2142 break;
2144 case CMD_sub:
2145 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2146 break;
2147 case CMD_substr: {
2148 int start = string_to_int(args[1]);
2149 if (start < 0) {
2150 if (static_cast<size_t>(-start) >= args[0].size()) {
2151 start = 0;
2152 } else {
2153 start = static_cast<int>(args[0].size()) + start;
2155 } else {
2156 if (static_cast<size_t>(start) >= args[0].size()) break;
2158 size_t len = string::npos;
2159 if (args.size() > 2) {
2160 int int_len = string_to_int(args[2]);
2161 if (int_len >= 0) {
2162 len = size_t(int_len);
2163 } else {
2164 len = args[0].size() - start;
2165 if (static_cast<size_t>(-int_len) >= len) {
2166 len = 0;
2167 } else {
2168 len -= static_cast<size_t>(-int_len);
2172 value.assign(args[0], start, len);
2173 break;
2175 case CMD_suggestion:
2176 value = qp.get_corrected_query_string();
2177 break;
2178 case CMD_termprefix:
2179 (void)prefix_from_term(&value, args[0]);
2180 break;
2181 case CMD_terms: {
2182 // list of matching terms
2183 if (!enquire) break;
2184 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2185 if (args.empty()) {
2186 while (term != enquire->get_matching_terms_end(q0)) {
2187 // check term was in the typed query so we ignore
2188 // boolean filter terms
2189 const string & t = *term;
2190 if (termset.find(t) != termset.end()) {
2191 value += t;
2192 value += '\t';
2194 ++term;
2196 } else {
2197 // Return matching terms with specified prefix. We can't
2198 // use skip_to() as the terms aren't ordered by termname.
2199 const string & pfx = args[0];
2200 while (term != enquire->get_matching_terms_end(q0)) {
2201 const string & t = *term;
2202 if (startswith(t, pfx)) {
2203 value += t;
2204 value += '\t';
2206 ++term;
2210 if (!value.empty()) value.erase(value.size() - 1);
2211 break;
2213 case CMD_thispage:
2214 value = str(topdoc / hits_per_page + 1);
2215 break;
2216 case CMD_time:
2217 if (secs >= 0) {
2218 char buf[64];
2219 my_snprintf(buf, sizeof(buf), "%.6f", secs);
2220 // MSVC's snprintf omits the zero byte if the string if
2221 // sizeof(buf) long.
2222 buf[sizeof(buf) - 1] = '\0';
2223 value = buf;
2225 break;
2226 case CMD_topdoc:
2227 // first document on current page of hit list (counting from 0)
2228 value = str(topdoc);
2229 break;
2230 case CMD_topterms:
2231 if (enquire) {
2232 int howmany = 16;
2233 if (!args.empty()) howmany = string_to_int(args[0]);
2234 if (howmany < 0) howmany = 0;
2236 // List of expand terms
2237 Xapian::ESet eset;
2238 OmegaExpandDecider decider(db, &termset);
2240 if (!rset.empty()) {
2241 set_expansion_scheme(*enquire, option);
2242 eset = enquire->get_eset(howmany * 2, rset, &decider);
2243 } else if (mset.size()) {
2244 // invent an rset
2245 Xapian::RSet tmp;
2247 int c = 5;
2248 // FIXME: what if mset does not start at first match?
2249 for (Xapian::docid did : mset) {
2250 tmp.add_document(did);
2251 if (--c == 0) break;
2254 set_expansion_scheme(*enquire, option);
2255 eset = enquire->get_eset(howmany * 2, tmp, &decider);
2258 // Don't show more than one word with the same stem.
2259 set<string> stems;
2260 Xapian::ESetIterator i;
2261 for (i = eset.begin(); i != eset.end(); ++i) {
2262 string term(*i);
2263 string stem = (*stemmer)(term);
2264 if (stems.find(stem) != stems.end()) continue;
2265 stems.insert(stem);
2266 value += term;
2267 value += '\t';
2268 if (--howmany == 0) break;
2270 if (!value.empty()) value.erase(value.size() - 1);
2272 break;
2273 case CMD_transform:
2274 omegascript_transform(value, args);
2275 break;
2276 case CMD_truncate:
2277 value = generate_sample(args[0],
2278 string_to_int(args[1]),
2279 args.size() > 2 ? args[2] : string(),
2280 args.size() > 3 ? args[3] : string());
2281 break;
2282 case CMD_uniq: {
2283 const string &list = args[0];
2284 if (list.empty()) break;
2285 string::size_type split = 0, split2;
2286 string prev;
2287 do {
2288 split2 = list.find('\t', split);
2289 string item(list, split, split2 - split);
2290 if (split == 0) {
2291 value = item;
2292 } else if (item != prev) {
2293 value += '\t';
2294 value += item;
2296 prev = item;
2297 split = split2 + 1;
2298 } while (split2 != string::npos);
2299 break;
2301 case CMD_unpack:
2302 value = str(binary_string_to_int(args[0]));
2303 break;
2304 case CMD_unprefix: {
2305 size_t prefix_len = prefix_from_term(NULL, args[0]);
2306 value.assign(args[0], prefix_len, string::npos);
2307 break;
2309 case CMD_unstem: {
2310 const string &term = args[0];
2311 Xapian::TermIterator i = qp.unstem_begin(term);
2312 Xapian::TermIterator end = qp.unstem_end(term);
2313 while (i != end) {
2314 if (!value.empty()) value += '\t';
2315 value += *i;
2316 ++i;
2318 break;
2320 case CMD_upper:
2321 value = Xapian::Unicode::toupper(args[0]);
2322 break;
2323 case CMD_url:
2324 url_encode(value, args[0]);
2325 break;
2326 case CMD_value: {
2327 Xapian::docid id = q0;
2328 Xapian::valueno value_no = string_to_int(args[0]);
2329 if (args.size() > 1) id = string_to_int(args[1]);
2330 value = db.get_document(id).get_value(value_no);
2331 break;
2333 case CMD_version:
2334 value = PACKAGE_STRING;
2335 break;
2336 case CMD_weight:
2337 value = double_to_string(weight);
2338 break;
2339 default: {
2340 args.insert(args.begin(), param[0]);
2341 int macro_no = func->second->tag - CMD_MACRO;
2342 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2343 // throw "Unknown function '" + var + "'";
2344 value = eval(macros[macro_no], args);
2345 break;
2348 res += value;
2349 } catch (const Xapian::Error & e) {
2350 // FIXME: this means we only see the most recent error in $error
2351 // - is that the best approach?
2352 error_msg = e.get_msg();
2355 res.append(fmt, p, string::npos);
2356 return res;
2359 static string
2360 eval_file(const string &fmtfile)
2362 string err;
2363 if (vet_filename(fmtfile)) {
2364 string file = template_dir + fmtfile;
2365 string fmt;
2366 if (load_file(file, fmt)) {
2367 vector<string> noargs;
2368 noargs.resize(1);
2369 return eval(fmt, noargs);
2371 err = strerror(errno);
2372 } else {
2373 err = "name contains '..'";
2376 // FIXME: report why!
2377 string msg = string("Couldn't read format template '") + fmtfile + '\'';
2378 if (!err.empty()) msg += " (" + err + ')';
2379 throw msg;
2382 extern string
2383 pretty_term(string term)
2385 // Just leave empty strings and single characters alone.
2386 if (term.length() <= 1) return term;
2388 // Assume unprefixed terms are unstemmed.
2389 if (!C_isupper(term[0])) return term;
2391 // Handle stemmed terms.
2392 bool stemmed = (term[0] == 'Z');
2393 if (stemmed) {
2394 // First of all, check if a term in the query stemmed to this one.
2395 Xapian::TermIterator u = qp.unstem_begin(term);
2396 // There might be multiple words with the same stem, but we only want
2397 // one so just take the first.
2398 if (u != qp.unstem_end(term)) return *u;
2400 // Remove the 'Z'.
2401 term.erase(0, 1);
2404 bool add_quotes = false;
2406 // Check if the term has a prefix.
2407 if (C_isupper(term[0])) {
2408 // See if we have this prefix in the termprefix_to_userprefix map. If
2409 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2410 string prefix;
2411 size_t prefix_len = prefix_from_term(&prefix, term);
2413 map<string, string>::const_iterator i;
2414 i = termprefix_to_userprefix.find(prefix);
2415 if (i != termprefix_to_userprefix.end()) {
2416 string user_prefix = i->second;
2417 user_prefix += ':';
2418 term.replace(0, prefix_len, user_prefix);
2419 } else {
2420 // We don't have a prefix mapping for this, so just set a flag to
2421 // add quotes around the term.
2422 add_quotes = true;
2426 if (stemmed) term += '.';
2428 if (add_quotes) {
2429 term.insert(0, "\"");
2430 term.append("\"");
2433 return term;
2436 static string
2437 print_caption(const string &fmt, const vector<string> &param)
2439 q0 = *(mset[hit_no]);
2441 weight = mset[hit_no].get_weight();
2442 percent = mset.convert_to_percent(mset[hit_no]);
2443 collapsed = mset[hit_no].get_collapse_count();
2445 return eval(fmt, param);
2448 void
2449 parse_omegascript()
2451 try {
2452 const char * p = getenv("SERVER_PROTOCOL");
2453 if (p && strcmp(p, "INCLUDED") == 0) {
2454 // We're being included in another page, so suppress headers.
2455 suppress_http_headers = true;
2458 string output = eval_file(fmtname);
2459 if (!set_content_type && !suppress_http_headers) {
2460 cout << "Content-Type: text/html" << endl;
2461 set_content_type = true;
2463 if (!suppress_http_headers) cout << endl;
2464 cout << output;
2465 } catch (...) {
2466 // Ensure the headers have been output so that any exception gets
2467 // reported rather than giving a server error.
2468 if (!set_content_type && !suppress_http_headers) {
2469 cout << "Content-Type: text/html" << endl;
2470 set_content_type = true;
2472 if (!suppress_http_headers) cout << endl;
2473 throw;
2477 static void
2478 ensure_query_parsed()
2480 if (query_parsed) return;
2481 query_parsed = true;
2483 MCI val;
2484 pair<MCI, MCI> g;
2486 // Should we discard the existing R-set recorded in R CGI parameters?
2487 bool discard_rset = false;
2489 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2490 // CGI parameters)?
2491 bool force_first_page = false;
2493 string v;
2494 // get list of terms from previous iteration of query
2495 val = cgi_params.find("xP");
2496 if (val != cgi_params.end()) {
2497 v = val->second;
2498 // If xP given, default to discarding any RSet and forcing the first
2499 // page of results. If the query is the same, or an extension of
2500 // the previous query, we adjust these again below.
2501 discard_rset = true;
2502 force_first_page = true;
2504 querytype result = set_probabilistic(v);
2505 switch (result) {
2506 case BAD_QUERY:
2507 break;
2508 case NEW_QUERY:
2509 break;
2510 case SAME_QUERY:
2511 case EXTENDED_QUERY:
2512 // If we've changed database, force the first page of hits
2513 // and discard the R-set (since the docids will have changed)
2514 val = cgi_params.find("xDB");
2515 if (val != cgi_params.end() && val->second != dbname) break;
2516 if (result == SAME_QUERY && force_first_page) {
2517 val = cgi_params.find("xFILTERS");
2518 if (val != cgi_params.end() && val->second != filters &&
2519 val->second != old_filters) {
2520 // Filters have changed since last query.
2521 } else {
2522 force_first_page = false;
2525 discard_rset = false;
2526 break;
2529 if (!force_first_page) {
2530 // Work out which mset element is the first hit we want
2531 // to display
2532 val = cgi_params.find("TOPDOC");
2533 if (val != cgi_params.end()) {
2534 topdoc = atol(val->second.c_str());
2537 // Handle next, previous, and page links
2538 if (cgi_params.find(">") != cgi_params.end()) {
2539 topdoc += hits_per_page;
2540 } else if (cgi_params.find("<") != cgi_params.end()) {
2541 if (topdoc >= hits_per_page)
2542 topdoc -= hits_per_page;
2543 else
2544 topdoc = 0;
2545 } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2546 (val = cgi_params.find("#")) != cgi_params.end()) {
2547 long page = atol(val->second.c_str());
2548 // Do something sensible for page 0 (we count pages from 1).
2549 if (page == 0) page = 1;
2550 topdoc = (page - 1) * hits_per_page;
2553 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2554 // Normally we snap TOPDOC like this so that things work nicely if
2555 // HITSPERPAGE is in a <select> or on radio buttons. If we're
2556 // postprocessing the output of omega and want variable sized pages,
2557 // this is unhelpful.
2558 bool raw_search = false;
2559 val = cgi_params.find("RAWSEARCH");
2560 if (val != cgi_params.end()) {
2561 raw_search = bool(atol(val->second.c_str()));
2564 if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2567 if (!discard_rset) {
2568 // put documents marked as relevant into the rset
2569 g = cgi_params.equal_range("R");
2570 for (MCI i = g.first; i != g.second; ++i) {
2571 const string & value = i->second;
2572 for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2573 while (value[j] == '.') ++j;
2574 Xapian::docid d = atoi(value.c_str() + j);
2575 if (d) {
2576 rset.add_document(d);
2577 ticked[d] = true;
2584 // run query if we haven't already
2585 static void
2586 ensure_match()
2588 if (done_query) return;
2590 secs = RealTime::now();
2591 run_query();
2592 if (secs != -1)
2593 secs = RealTime::now() - secs;
2595 done_query = true;
2596 last = mset.get_matches_lower_bound();
2597 if (last == 0) {
2598 // Otherwise topdoc ends up being -6 if it's non-zero!
2599 topdoc = 0;
2600 } else {
2601 if (topdoc >= last)
2602 topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2603 // last is the count of documents up to the end of the current page
2604 // (as returned by $last)
2605 if (topdoc + hits_per_page < last)
2606 last = topdoc + hits_per_page;
2610 // OmegaExpandDecider methods.
2612 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2613 set<string> * querytermset)
2614 : db(db_)
2616 // We'll want the stemmer for testing matches anyway.
2617 if (!stemmer)
2618 stemmer = new Xapian::Stem(option["stemmer"]);
2619 if (querytermset) {
2620 set<string>::const_iterator i;
2621 for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2622 string term(*i);
2623 if (term.empty()) continue;
2625 unsigned char ch = term[0];
2626 bool stemmed = (ch == 'Z');
2627 if (stemmed) {
2628 term.erase(0, 1);
2629 if (term.empty()) continue;
2630 ch = term[0];
2633 if (C_isupper(ch)) {
2634 size_t prefix_len = prefix_from_term(NULL, term);
2635 term.erase(0, prefix_len);
2638 if (!stemmed) term = (*stemmer)(term);
2640 exclude_stems.insert(term);
2645 bool
2646 OmegaExpandDecider::operator()(const string & term) const
2648 unsigned char ch = term[0];
2650 // Reject terms with a prefix.
2651 if (C_isupper(ch)) return false;
2654 MyStopper stopper;
2655 // Don't suggest stopwords.
2656 if (stopper(term)) return false;
2659 // Reject small numbers.
2660 if (term.size() < 4 && C_isdigit(ch)) return false;
2662 // Reject terms containing a space.
2663 if (term.find(' ') != string::npos) return false;
2665 // Skip terms with stems in the exclude_stems set, to avoid suggesting
2666 // terms which are already in the query in some form.
2667 string stem = (*stemmer)(term);
2668 if (exclude_stems.find(stem) != exclude_stems.end())
2669 return false;
2671 // Ignore terms that only occur once (hapaxes) since they aren't
2672 // useful for finding related documents - they only occur in a
2673 // document that's already been marked as relevant.
2674 // FIXME: add an expand option to ignore terms where
2675 // termfreq == rtermfreq.
2676 if (db.get_termfreq(term) <= 1) return false;
2678 return true;