Support concurrent date range filters
[xapian.git] / xapian-applications / omega / query.cc
blob9662003f4b923d47331ae24e36e3ba5a61e81048
1 /* query.cc: query executor for omega
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002 Intercede 1749 Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018 Olly Betts
8 * Copyright 2008 Thomas Viehmann
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 * USA
26 #include <config.h>
28 #include <algorithm>
29 #include <iostream>
30 #include <map>
31 #include <set>
32 #include <unordered_map>
33 #include <unordered_set>
34 #include <vector>
36 #include <cassert>
37 #include <cctype>
38 #include "safeerrno.h"
39 #include <stdio.h>
40 #include <cstdlib>
41 #include <cstring>
42 #include "strcasecmp.h"
43 #include <ctime>
45 #include "safeunistd.h"
46 #include <sys/types.h>
47 #include "safesysstat.h"
48 #include "safefcntl.h"
50 #include "realtime.h"
52 #include <cdb.h>
54 #include "csvescape.h"
55 #include "date.h"
56 #include "datevalue.h"
57 #include "jsonescape.h"
58 #include "utils.h"
59 #include "omega.h"
60 #include "query.h"
61 #include "cgiparam.h"
62 #include "loadfile.h"
63 #include "sample.h"
64 #include "sort.h"
65 #include "str.h"
66 #include "stringutils.h"
67 #include "transform.h"
68 #include "urldecode.h"
69 #include "urlencode.h"
70 #include "unixperm.h"
71 #include "values.h"
72 #include "weight.h"
73 #include "expand.h"
74 #include "md5wrap.h"
76 #include <xapian.h>
78 using namespace std;
80 using Xapian::Utf8Iterator;
82 using Xapian::Unicode::is_wordchar;
84 #ifndef SNPRINTF
85 #include <cstdarg>
/** Fallback snprintf() used when the build doesn't supply SNPRINTF.
 *
 *  Formats into @a str, which has room for @a size bytes including the
 *  terminating nul.
 *
 *  @param str     Destination buffer.
 *  @param size    Size of @a str in bytes.
 *  @param format  printf-style format string; further arguments follow.
 *
 *  @return The number of characters written, not counting the nul.
 *
 *  If the output doesn't fit (or formatting fails) the process is aborted,
 *  as before.  Formatting is done with vsnprintf() so that an over-long
 *  result is detected without ever writing beyond the end of the buffer
 *  (the previous vsprintf() call had already overrun the buffer by the time
 *  the overflow check ran).
 */
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    // A zero-sized buffer has no room even for the terminating nul.
    if (size == 0)
        abort();

    va_list ap;
    va_start(ap, format);
    int res = vsnprintf(str, size, format, ap);
    va_end(ap);

    // vsnprintf() returns the length the full output needs, so a result
    // of size or more means the output was truncated.
    if (res < 0 || size_t(res) >= size)
        abort(); /* Overflowed! */
    return res;
}
99 #else
100 #define my_snprintf SNPRINTF
101 #endif
103 static bool query_parsed = false;
104 static bool done_query = false;
105 static Xapian::docid last = 0;
107 static Xapian::MSet mset;
109 static map<Xapian::docid, bool> ticked;
111 static void ensure_query_parsed();
112 static void ensure_match();
114 static Xapian::Query query;
115 //static string url_query_string;
116 Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
118 // Maintain an explicit date_filter_set flag - date_filter.empty() will also
119 // be true if a date filter is specified which simplies to Query::MatchNothing
120 // at construction time.
121 static bool date_filter_set = false;
122 static Xapian::Query date_filter;
124 static Xapian::QueryParser qp;
125 static Xapian::NumberRangeProcessor * size_rp = NULL;
126 static Xapian::Stem *stemmer = NULL;
128 static string eval_file(const string &fmtfile);
130 static set<string> termset;
132 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
133 static map<string, string> termprefix_to_userprefix;
135 static string queryterms;
137 static string error_msg;
139 static double secs = -1;
141 static const char DEFAULT_LOG_ENTRY[] =
142 "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
143 "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
144 "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
145 "$dbname\t"
146 "$query\t"
147 "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
149 class MyStopper : public Xapian::Stopper {
150 public:
151 bool operator()(const string &t) const {
152 switch (t[0]) {
153 case 'a':
154 return (t == "a" || t == "about" || t == "an" || t == "and" ||
155 t == "are" || t == "as" || t == "at");
156 case 'b':
157 return (t == "be" || t == "by");
158 case 'e':
159 return (t == "en");
160 case 'f':
161 return (t == "for" || t == "from");
162 case 'h':
163 return (t == "how");
164 case 'i':
165 return (t == "i" || t == "in" || t == "is" || t == "it");
166 case 'o':
167 return (t == "of" || t == "on" || t == "or");
168 case 't':
169 return (t == "that" || t == "the" || t == "this" || t == "to");
170 case 'w':
171 return (t == "was" || t == "what" || t == "when" ||
172 t == "where" || t == "which" || t == "who" ||
173 t == "why" || t == "will" || t == "with");
174 case 'y':
175 return (t == "you" || t == "your");
176 default:
177 return false;
182 static size_t
183 prefix_from_term(string* prefix, const string& term)
185 if (!term.empty()) {
186 if (term[0] == 'X') {
187 const string::const_iterator begin = term.begin();
188 string::const_iterator i = begin + 1;
189 while (i != term.end() && C_isupper(*i))
190 ++i;
191 if (prefix)
192 prefix->assign(begin, i);
193 if (i != term.end() && *i == ':')
194 ++i;
195 return i - begin;
198 if (C_isupper(term[0])) {
199 if (prefix)
200 *prefix = term[0];
201 return 1;
205 if (prefix)
206 prefix->resize(0);
207 return 0;
210 // Don't allow ".." in format names, log file names, etc as this would allow
211 // people to open a format "../../etc/passwd" or similar.
212 // FIXME: make this check more exact ("foo..bar" is safe)
213 // FIXME: log when this check fails
// Returns true if filename is acceptable, i.e. ".." doesn't occur anywhere
// in it.
static bool
vet_filename(const string &filename)
{
    return filename.find("..") == string::npos;
}
// How a newly parsed query relates to the previous one.
//
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
enum querytype {
    NEW_QUERY,      // entirely new query
    SAME_QUERY,     // unchanged query
    EXTENDED_QUERY, // new query, but based on the old one
    BAD_QUERY       // parse error (message in error_msg)
};
233 static multimap<string, string> query_strings;
235 void
236 add_query_string(const string& prefix, const string& s)
238 string query_string = s;
239 // Strip leading and trailing whitespace from query_string.
240 trim(query_string);
241 if (!query_string.empty())
242 query_strings.insert(make_pair(prefix, query_string));
245 static unsigned
246 read_qp_flags(const string & opt_pfx, unsigned f)
248 map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
249 for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
250 unsigned mask = 0;
251 const char * s = i->first.c_str() + opt_pfx.size();
252 switch (s[0]) {
253 case 'a':
254 if (strcmp(s, "auto_multiword_synonyms") == 0) {
255 mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
256 break;
258 if (strcmp(s, "auto_synonyms") == 0) {
259 mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
260 break;
262 break;
263 case 'b':
264 if (strcmp(s, "boolean") == 0) {
265 mask = Xapian::QueryParser::FLAG_BOOLEAN;
266 break;
268 if (strcmp(s, "boolean_any_case") == 0) {
269 mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
270 break;
272 break;
273 case 'c':
274 if (strcmp(s, "cjk_ngram") == 0) {
275 mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
276 break;
278 break;
279 case 'd':
280 if (strcmp(s, "default") == 0) {
281 mask = Xapian::QueryParser::FLAG_DEFAULT;
282 break;
284 break;
285 case 'l':
286 if (strcmp(s, "lovehate") == 0) {
287 mask = Xapian::QueryParser::FLAG_LOVEHATE;
288 break;
290 break;
291 case 'p':
292 if (strcmp(s, "partial") == 0) {
293 mask = Xapian::QueryParser::FLAG_PARTIAL;
294 break;
296 if (strcmp(s, "phrase") == 0) {
297 mask = Xapian::QueryParser::FLAG_PHRASE;
298 break;
300 if (strcmp(s, "pure_not") == 0) {
301 mask = Xapian::QueryParser::FLAG_PURE_NOT;
302 break;
304 break;
305 case 's':
306 if (strcmp(s, "spelling_correction") == 0) {
307 mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
308 break;
310 if (strcmp(s, "synonym") == 0) {
311 mask = Xapian::QueryParser::FLAG_SYNONYM;
312 break;
314 break;
315 case 'w':
316 if (strcmp(s, "wildcard") == 0) {
317 mask = Xapian::QueryParser::FLAG_WILDCARD;
318 break;
320 break;
323 if (i->second.empty()) {
324 f &= ~mask;
325 } else {
326 f |= mask;
329 return f;
332 static querytype
333 parse_queries(const string& oldp)
335 // Parse the query string.
336 auto opt_it = option.find("stem_strategy");
337 if (opt_it != option.end()) {
338 if (opt_it->second == "all") {
339 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
340 } else if (opt_it->second == "all_z") {
341 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL_Z);
342 } else if (opt_it->second == "none") {
343 qp.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
344 } else if (opt_it->second == "some") {
345 qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
346 } else if (opt_it->second == "some_full_pos") {
347 qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME_FULL_POS);
349 } else {
350 opt_it = option.find("stem_all");
351 if (opt_it != option.end() && opt_it->second == "true") {
352 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
355 qp.set_stopper(new MyStopper());
356 qp.set_default_op(default_op);
357 qp.set_database(db);
358 // FIXME: provide a custom RP which handles size:10..20K, etc.
359 if (!size_rp)
360 size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
361 qp.add_rangeprocessor(size_rp);
362 map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
363 for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
364 string user_prefix(pfx->first, 7);
365 const string & term_pfx_list = pfx->second;
366 string::size_type i = 0;
367 do {
368 string::size_type i0 = i;
369 i = term_pfx_list.find('\t', i);
370 const string & term_pfx = term_pfx_list.substr(i0, i - i0);
371 qp.add_prefix(user_prefix, term_pfx);
372 // std::map::insert() won't overwrite an existing entry, so we'll
373 // prefer the first user_prefix for which a particular term prefix
374 // is specified.
375 termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
376 } while (++i);
378 pfx = option.lower_bound("boolprefix,");
379 for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
380 string user_prefix(pfx->first, 11, string::npos);
381 auto it = option.find("nonexclusiveprefix," + pfx->second);
382 bool exclusive = (it == option.end() || it->second.empty());
383 qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
384 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
387 try {
388 unsigned default_flags = read_qp_flags("flag_", 0);
390 vector<Xapian::Query> queries;
391 queries.reserve(query_strings.size());
393 for (auto& j : query_strings) {
394 const string& prefix = j.first;
395 const string& query_string = j.second;
397 // Choose the stemmer to use for this input.
398 string stemlang = option[prefix + ":stemmer"];
399 if (stemlang.empty())
400 stemlang = option["stemmer"];
401 qp.set_stemmer(Xapian::Stem(stemlang));
403 // Work out the flags to use for this input.
404 unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
406 Xapian::Query q = qp.parse_query(query_string, f, prefix);
407 if (!q.empty())
408 queries.push_back(q);
410 query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
411 } catch (Xapian::QueryParserError &e) {
412 error_msg = e.get_msg();
413 return BAD_QUERY;
416 Xapian::termcount n_new_terms = 0;
417 for (Xapian::TermIterator i = query.get_terms_begin();
418 i != query.get_terms_end(); ++i) {
419 if (termset.find(*i) == termset.end()) {
420 termset.insert(*i);
421 if (!queryterms.empty()) queryterms += '\t';
422 queryterms += *i;
424 n_new_terms++;
427 // Check new query against the previous one
428 if (oldp.empty()) {
429 // If oldp was empty that means there were no parsed query terms
430 // before, so if there are now this is a new query.
431 return n_new_terms ? NEW_QUERY : SAME_QUERY;
434 // The terms in oldp are separated by tabs.
435 const char oldp_separator = '\t';
436 size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
438 // short-cut: if the new query has fewer terms, it must be a new one
439 if (n_new_terms < n_old_terms) return NEW_QUERY;
441 const char *term = oldp.c_str();
442 const char *pend;
443 while ((pend = strchr(term, oldp_separator)) != NULL) {
444 if (termset.find(string(term, pend - term)) == termset.end())
445 return NEW_QUERY;
446 term = pend + 1;
448 if (*term) {
449 if (termset.find(string(term)) == termset.end())
450 return NEW_QUERY;
453 // Use termset.size() rather than n_new_terms so we correctly handle
454 // the case when the query has repeated terms.
455 // This works wrongly in the case when the user extends the query
456 // by adding a term already in it, but that's unlikely and the behaviour
457 // isn't too bad (we just don't reset page 1). We also mishandle a few
458 // other obscure cases e.g. adding quotes to turn a query into a phrase.
459 if (termset.size() > n_old_terms) return EXTENDED_QUERY;
460 return SAME_QUERY;
463 static multimap<string, string> filter_map;
464 static set<string> neg_filters;
466 void add_bterm(const string &term) {
467 string prefix;
468 if (prefix_from_term(&prefix, term) > 0)
469 filter_map.insert(multimap<string, string>::value_type(prefix, term));
472 void add_nterm(const string &term) {
473 if (!term.empty())
474 neg_filters.insert(term);
477 void
478 add_date_filter(const string& date_start,
479 const string& date_end,
480 const string& date_span,
481 Xapian::valueno date_value_slot)
483 if (date_start.empty() && date_end.empty() && date_span.empty())
484 return;
486 Xapian::Query q;
487 if (date_value_slot != Xapian::BAD_VALUENO) {
488 // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
489 // latter the sort order just works correctly between different
490 // precisions).
491 bool as_time_t =
492 db.get_value_lower_bound(date_value_slot).size() == 4 &&
493 db.get_value_upper_bound(date_value_slot).size() == 4;
494 q = date_value_range(as_time_t, date_value_slot,
495 date_start, date_end,
496 date_span);
497 } else {
498 q = date_range_filter(date_start, date_end, date_span);
499 q |= Xapian::Query("Dlatest");
502 if (date_filter_set) {
503 date_filter &= q;
504 } else {
505 date_filter_set = true;
506 date_filter = q;
510 static void
511 run_query()
513 string scheme;
514 bool force_boolean = false;
515 if (!filter_map.empty()) {
516 // OR together filters with the same prefix (or AND for non-exclusive
517 // prefixes), then AND together the resultant groups.
518 vector<Xapian::Query> filter_vec;
519 vector<string> same_vec;
520 string current;
521 for (auto i = filter_map.begin(); ; ++i) {
522 bool over = (i == filter_map.end());
523 if (over || i->first != current) {
524 switch (same_vec.size()) {
525 case 0:
526 break;
527 case 1:
528 filter_vec.push_back(Xapian::Query(same_vec[0]));
529 break;
530 default: {
531 Xapian::Query::op op = Xapian::Query::OP_OR;
532 auto it = option.find("nonexclusiveprefix," + current);
533 if (it != option.end() && !it->second.empty()) {
534 op = Xapian::Query::OP_AND;
536 filter_vec.push_back(Xapian::Query(op,
537 same_vec.begin(),
538 same_vec.end()));
539 break;
542 same_vec.clear();
543 if (over) break;
544 current = i->first;
546 same_vec.push_back(i->second);
549 Xapian::Query filter(Xapian::Query::OP_AND,
550 filter_vec.begin(), filter_vec.end());
552 if (query.empty()) {
553 // If no query strings were provided then promote the filters
554 // to be THE query - filtering an empty query will give no
555 // matches.
556 std::swap(query, filter);
557 auto&& it = option.find("weightingpurefilter");
558 if (it != option.end() && !it->second.empty()) {
559 scheme = it->second;
560 } else {
561 force_boolean = true;
563 } else {
564 query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
568 if (date_filter_set) {
569 // If no query strings were provided then promote the daterange
570 // filter to be THE query instead of filtering an empty query.
571 if (query.empty()) {
572 query = date_filter;
573 force_boolean = true;
574 } else {
575 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
579 if (!neg_filters.empty()) {
580 // OR together all negated filters.
581 Xapian::Query filter(Xapian::Query::OP_OR,
582 neg_filters.begin(), neg_filters.end());
584 if (query.empty() && !date_filter_set) {
585 // If we only have a negative filter for the query, use MatchAll as
586 // the query to apply the filters to.
587 query = Xapian::Query::MatchAll;
588 force_boolean = true;
590 query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
593 if (!enquire || !error_msg.empty()) return;
595 if (!force_boolean && scheme.empty()) {
596 auto&& it = option.find("weighting");
597 if (it != option.end()) scheme = it->second;
599 set_weighting_scheme(*enquire, scheme, force_boolean);
601 enquire->set_cutoff(threshold);
603 if (sort_keymaker) {
604 if (sort_after) {
605 enquire->set_sort_by_relevance_then_key(sort_keymaker,
606 reverse_sort);
607 } else {
608 enquire->set_sort_by_key_then_relevance(sort_keymaker,
609 reverse_sort);
611 } else if (sort_key != Xapian::BAD_VALUENO) {
612 if (sort_after) {
613 enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
614 } else {
615 enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
619 enquire->set_docid_order(docid_order);
621 if (collapse) {
622 enquire->set_collapse_key(collapse_key);
625 if (!query.empty()) {
626 #if 0
627 // FIXME: If we start doing permissions checks based on $REMOTE_USER
628 // we're going to break some existing setups if users upgrade. We
629 // probably want a way to set this from OmegaScript.
630 const char * remote_user = getenv("REMOTE_USER");
631 if (remote_user)
632 apply_unix_permissions(query, remote_user);
633 #endif
635 enquire->set_query(query);
636 // We could use the value of topdoc as first parameter, but we
637 // need to know the first few items in the mset to fake a
638 // relevance set for topterms.
640 // If min_hits isn't set, check at least one extra result so we
641 // know if we've reached the end of the matches or not - then we
642 // can avoid offering a "next" button which leads to an empty page.
643 mset = enquire->get_mset(0, topdoc + hits_per_page,
644 topdoc + max(hits_per_page + 1, min_hits),
645 &rset);
/** Escape a string for inclusion in HTML.
 *
 *  @param str  The string to escape.
 *
 *  @return @a str with '<', '>', '&' and '"' replaced by the entities
 *          "&lt;", "&gt;", "&amp;" and "&quot;"; all other characters are
 *          copied unchanged.
 */
string
html_escape(const string &str)
{
    string res;
    for (const char ch : str) {
        switch (ch) {
            case '<':
                res += "&lt;";
                break;
            case '>':
                res += "&gt;";
                break;
            case '&':
                res += "&amp;";
                break;
            case '"':
                res += "&quot;";
                break;
            default:
                res += ch;
                break;
        }
    }
    return res;
}
/** Remove HTML tags from a string.
 *
 *  Everything from a '<' up to and including the next '>' is dropped (or up
 *  to the end of the string if there's no '>').  A '>' which isn't inside a
 *  tag is dropped too.
 *
 *  @param str  The string to strip.
 *
 *  @return @a str with the tags removed.
 */
static string
html_strip(const string &str)
{
    string res;
    bool in_tag = false;
    for (const char ch : str) {
        if (ch == '<') {
            in_tag = true;
        } else if (ch == '>') {
            in_tag = false;
        } else if (!in_tag) {
            res += ch;
        }
    }
    return res;
}
/** Maps each word of a tab-separated list to its position in the list.
 *
 *  The map and the list it was built from are static, so they are shared by
 *  all WordList objects and persist between them.
 */
class WordList {
    /// The list which word_to_occurrence was last built from.
    static string prev_list;

    /// Maps each distinct word to the order in which it first occurred
    /// (0 for the first distinct word, 1 for the second, and so on).
    static unordered_map<string, int> word_to_occurrence;

  public:
    /** Build the map for a tab-separated list of words.
     *
     *  Repeated words keep the index of their first occurrence and don't
     *  use up an index.
     */
    void build_word_map(const string& list) {
        // Don't build map again if passed list of terms is same as before.
        if (list == prev_list) return;

        word_to_occurrence.clear();
        int next_index = 0;
        string::size_type start = 0;
        while (true) {
            const string::size_type tab = list.find('\t', start);
            // The last word runs to the end of the list.
            const string::size_type len =
                (tab == string::npos) ? string::npos : tab - start;
            if (word_to_occurrence.emplace(list.substr(start, len),
                                           next_index).second) {
                ++next_index;
            }
            if (tab == string::npos) break;
            start = tab + 1;
        }
        prev_list = list;
    }

    /// Return the index recorded for @a word, or -1 if it isn't in the list.
    int word_in_list(const string& word) {
        auto found = word_to_occurrence.find(word);
        return found != word_to_occurrence.end() ? found->second : -1;
    }
};

string WordList::prev_list;
unordered_map<string, int> WordList::word_to_occurrence;
731 // Not a character in an identifier
732 inline static bool
733 p_notid(unsigned int c)
735 return !C_isalnum(c) && c != '_';
738 // Not a character in an HTML tag name
739 inline static bool
740 p_nottag(unsigned int c)
742 return !C_isalnum(c) && c != '.' && c != '-';
745 // FIXME: shares algorithm with indextext.cc!
746 static string
747 html_highlight(const string &s, const string &list,
748 const string &bra, const string &ket)
750 if (!stemmer) {
751 stemmer = new Xapian::Stem(option["stemmer"]);
754 string res;
756 Utf8Iterator j(s);
757 const Utf8Iterator s_end;
758 while (true) {
759 Utf8Iterator first = j;
760 while (first != s_end && !is_wordchar(*first)) ++first;
761 if (first == s_end) break;
762 Utf8Iterator term_end;
763 string term;
764 string word;
765 const char *l = j.raw();
766 if (*first < 128 && C_isupper(*first)) {
767 j = first;
768 Xapian::Unicode::append_utf8(term, *j);
769 while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
770 Xapian::Unicode::append_utf8(term, *j);
772 if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
773 term.resize(0);
775 term_end = j;
777 if (term.empty()) {
778 j = first;
779 while (is_wordchar(*j)) {
780 Xapian::Unicode::append_utf8(term, *j);
781 ++j;
782 if (j == s_end) break;
783 if (*j == '&' || *j == '\'') {
784 Utf8Iterator next = j;
785 ++next;
786 if (next == s_end || !is_wordchar(*next)) break;
787 term += *j;
788 j = next;
791 term_end = j;
792 if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
793 string::size_type len = term.length();
794 if (*j == '#') {
795 term += '#';
796 do { ++j; } while (j != s_end && *j == '#');
797 } else {
798 while (j != s_end && (*j == '+' || *j == '-')) {
799 Xapian::Unicode::append_utf8(term, *j);
800 ++j;
803 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
804 term.resize(len);
805 } else {
806 term_end = j;
810 j = term_end;
811 term = Xapian::Unicode::tolower(term);
812 WordList w;
813 w.build_word_map(list);
814 int match = w.word_in_list(term);
815 if (match == -1) {
816 string stem = "Z";
817 stem += (*stemmer)(term);
818 match = w.word_in_list(stem);
820 if (match >= 0) {
821 res += html_escape(string(l, first.raw() - l));
822 if (!bra.empty()) {
823 res += bra;
824 } else {
825 static const char * colours[] = {
826 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
827 "990000", "009900", "996600", "006699", "990099"
829 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
830 const char * bg = colours[idx];
831 if (strchr(bg, 'f')) {
832 res += "<b style=\"color:black;background-color:#";
833 } else {
834 res += "<b style=\"color:white;background-color:#";
836 res += bg;
837 res += "\">";
839 word.assign(first.raw(), j.raw() - first.raw());
840 res += html_escape(word);
841 if (!bra.empty()) {
842 res += ket;
843 } else {
844 res += "</b>";
846 } else {
847 res += html_escape(string(l, j.raw() - l));
850 if (j != s_end) res += html_escape(string(j.raw(), j.left()));
851 return res;
#if 0
// Disabled code: writes url_query_string to cout.  If after starts with
// "&B=", any "B=" parameters in url_query_string whose value starts with
// the character following "&B=" are left out.
static void
print_query_string(const char *after)
{
    if (!after || strncmp(after, "&B=", 3) != 0) {
        cout << url_query_string;
        return;
    }

    const char prefix = after[3];
    string::size_type start = 0;
    string::size_type amp = 0;
    while (true) {
        amp = url_query_string.find('&', amp);
        if (amp == string::npos) {
            cout << url_query_string.substr(start);
            return;
        }
        amp++;
        while (url_query_string[amp] == 'B' &&
               url_query_string[amp + 1] == '=' &&
               url_query_string[amp + 2] == prefix) {
            // Output what precedes this parameter, then skip over it.
            cout << url_query_string.substr(start, amp - start - 1);
            start = url_query_string.find('&', amp + 3);
            if (start == string::npos) return;
            amp = start + 1;
        }
    }
}
#endif
882 class Fields {
883 mutable Xapian::docid did_cached;
884 mutable map<string, string> fields;
886 void read_fields(Xapian::docid did) const;
888 public:
889 Fields() : did_cached(0) { }
891 const string & get_field(Xapian::docid did, const string & field) const {
892 if (did != did_cached) read_fields(did);
893 return fields[field];
897 void
898 Fields::read_fields(Xapian::docid did) const
900 fields.clear();
901 did_cached = did;
902 const string & data = db.get_document(did).get_data();
904 // Parse document data.
905 string::size_type i = 0;
906 const string & names = option["fieldnames"];
907 if (!names.empty()) {
908 // Each line is a field, with fieldnames taken from corresponding
909 // entries in the tab-separated list specified by $opt{fieldnames}.
910 string::size_type n = 0;
911 do {
912 string::size_type n0 = n;
913 n = names.find('\t', n);
914 string::size_type i0 = i;
915 i = data.find('\n', i);
916 fields.insert(make_pair(names.substr(n0, n - n0),
917 data.substr(i0, i - i0)));
918 } while (++n && ++i);
919 } else {
920 // Each line is a field, in the format NAME=VALUE. We assume the field
921 // name doesn't contain an "=". Lines without an "=" are currently
922 // just ignored.
923 do {
924 string::size_type i0 = i;
925 i = data.find('\n', i);
926 string line(data, i0, i - i0);
927 string::size_type j = line.find('=');
928 if (j != string::npos) {
929 string & value = fields[line.substr(0, j)];
930 if (!value.empty()) value += '\t';
931 value.append(line, j + 1, string::npos);
933 } while (++i);
937 static Fields fields;
938 static Xapian::docid q0;
939 static Xapian::doccount hit_no;
940 static int percent;
941 static double weight;
942 static Xapian::doccount collapsed;
944 static string print_caption(const string &fmt, const vector<string> &param);
// Tags identifying the commands.  There's one CMD_<name> per command name
// (the T() macro forms the tag for a command by pasting its name onto
// "CMD_"), listed here in alphabetical order and grouped by initial letter.
enum tagval {
    CMD_,
    CMD_add, CMD_addfilter, CMD_allterms, CMD_and,
    CMD_cgi, CMD_cgilist, CMD_cgiparams, CMD_chr, CMD_collapsed, CMD_cond,
    CMD_contains, CMD_csv,
    CMD_date, CMD_dbname, CMD_dbsize, CMD_def, CMD_defaultop, CMD_div,
    CMD_emptydocs, CMD_env, CMD_eq, CMD_error,
    CMD_field, CMD_filesize, CMD_filters, CMD_filterterms, CMD_find, CMD_fmt,
    CMD_freq,
    CMD_ge, CMD_gt,
    CMD_hash, CMD_highlight, CMD_hit, CMD_hitlist, CMD_hitsperpage,
    CMD_hostname, CMD_html, CMD_htmlstrip, CMD_httpheader,
    CMD_id, CMD_if, CMD_include,
    CMD_json, CMD_jsonarray,
    CMD_last, CMD_lastpage, CMD_le, CMD_length, CMD_list, CMD_log, CMD_lookup,
    CMD_lower, CMD_lt,
    CMD_map, CMD_match, CMD_max, CMD_min, CMD_mod, CMD_msize, CMD_msizeexact,
    CMD_msizelower, CMD_msizeupper, CMD_mul, CMD_muldiv,
    CMD_ne, CMD_nice, CMD_not, CMD_now,
    CMD_opt, CMD_or, CMD_ord,
    CMD_pack, CMD_percentage, CMD_prettyterm, CMD_prettyurl,
    CMD_query, CMD_querydescription, CMD_queryterms,
    CMD_range, CMD_record, CMD_relevant, CMD_relevants,
    CMD_score, CMD_set, CMD_seterror, CMD_setmap, CMD_setrelevant, CMD_slice,
    CMD_snippet, CMD_sort, CMD_split, CMD_stoplist, CMD_sub, CMD_subdb,
    CMD_subid, CMD_substr, CMD_suggestion, CMD_switch,
    CMD_termprefix, CMD_terms, CMD_thispage, CMD_time, CMD_topdoc,
    CMD_topterms, CMD_transform, CMD_truncate,
    CMD_uniq, CMD_unique, CMD_unpack, CMD_unprefix, CMD_unstem, CMD_upper,
    CMD_url,
    CMD_value, CMD_version,
    CMD_weight,
    CMD_MACRO // special tag for macro evaluation
};
// Attributes of a command.  The members are in the order that the entries
// in func_tab list them in.
struct func_attrib {
    // The command's tag (a tagval value).
    int tag;
    // Minimum number of arguments.
    int minargs;
    // Maximum number of arguments.
    // NOTE(review): func_tab uses N (-1) here, presumably for "no limit" -
    // confirm.
    int maxargs;
    // NOTE(review): presumably how many of the arguments are evaluated
    // before the command is run, with N (-1) meaning all of them - confirm.
    int evalargs;
    // 0, 'M' or 'Q'.
    // NOTE(review): 'Q' and 'M' presumably select ensure_query_parsed() and
    // ensure_match() respectively - confirm.
    char ensure;
};

// Generates the func_tab entry for command F: the command's name as a
// string, followed by its attributes with the tag CMD_<F>.
#define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}

// An entry in func_tab: a command name together with its attributes.
struct func_desc {
    const char *name;
    func_attrib a;
};
1080 #define N -1
1081 #define M 'M'
1082 #define Q 'Q'
1083 // NB when adding a new command which ensures M or Q, update the list in
1084 // docs/omegascript.rst
1085 static struct func_desc func_tab[] = {
1086 //name minargs maxargs evalargs ensure
1087 {"",{CMD_, N, N, 0, 0}},// commented out code
1088 T(add, 0, N, N, 0), // add a list of numbers
1089 T(addfilter, 1, 1, N, 0), // add filter term
1090 T(allterms, 0, 1, N, 0), // list of all terms matching document
1091 T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
1092 T(cgi, 1, 1, N, 0), // return cgi parameter value
1093 T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
1094 T(cgiparams, 0, 0, N, 0), // return list of cgi parameter names
1095 T(chr, 1, 1, N, 0), // return UTF-8 for given Unicode codepoint
1096 T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
1097 T(cond, 2, N, 0, 0), // return position of substring, or empty string
1098 T(contains, 2, 2, N, 0), // return position of substring, or empty string
1099 T(csv, 1, 2, N, 0), // CSV string escaping
1100 T(date, 1, 2, N, 0), // convert time_t to strftime format
1101 // (default: YYYY-MM-DD)
1102 T(dbname, 0, 0, N, 0), // database name
1103 T(dbsize, 0, 0, N, 0), // database size (# of documents)
1104 T(def, 2, 2, 1, 0), // define a macro
1105 T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
1106 T(div, 2, 2, N, 0), // integer divide
1107 T(emptydocs, 0, 1, N, 0), // list of empty documents
1108 T(env, 1, 1, N, 0), // environment variable
1109 T(eq, 2, 2, N, 0), // test equality
1110 T(error, 0, 0, N, 0), // error message
1111 T(field, 1, 2, N, 0), // lookup field in record
1112 T(filesize, 1, 1, N, 0), // pretty printed filesize
1113 T(filters, 0, 0, N, 0), // serialisation of current filters
1114 T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
1115 T(find, 2, 2, N, 0), // find entry in list
1116 T(fmt, 0, 0, N, 0), // name of current format
1117 T(freq, 1, 1, N, 0), // frequency of a term
1118 T(ge, 2, 2, N, 0), // test >=
1119 T(gt, 2, 2, N, 0), // test >
1120 T(hash, 2, 2, N, 0), // hash a string using the specified hash function
1121 T(highlight, 2, 4, N, 0), // html escape and highlight words from list
1122 T(hit, 0, 0, N, 0), // hit number of current mset entry (0-based)
1123 T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
1124 T(hitsperpage, 0, 0, N, 0), // hits per page
1125 T(hostname, 1, 1, N, 0), // extract hostname from URL
1126 T(html, 1, 1, N, 0), // html escape string (<>&")
1127 T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
1128 T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
1129 T(id, 0, 0, N, 0), // docid of current doc
1130 T(if, 2, 3, 1, 0), // conditional
1131 T(include, 1, 1, 1, 0), // include another file
1132 T(json, 1, 1, N, 0), // JSON string escaping
1133 T(jsonarray, 1, 1, N, 0), // Format list as a JSON array of strings
1134 T(last, 0, 0, N, M), // hit number one beyond end of current page
1135 T(lastpage, 0, 0, N, M), // number of last hit page
1136 T(le, 2, 2, N, 0), // test <=
1137 T(length, 1, 1, N, 0), // length of list
1138 T(list, 2, 5, N, 0), // pretty print list
1139 T(log, 1, 2, 1, 0), // create a log entry
1140 T(lookup, 2, 2, N, 0), // lookup in named cdb file
1141 T(lower, 1, 1, N, 0), // convert string to lower case
1142 T(lt, 2, 2, N, 0), // test <
1143 T(map, 2, 2, 1, 0), // map a list into another list
1144 T(match, 2, 3, N, 0), // regex match
1145 T(max, 1, N, N, 0), // maximum of a list of values
1146 T(min, 1, N, N, 0), // minimum of a list of values
1147 T(mod, 2, 2, N, 0), // integer modulus
1148 T(msize, 0, 0, N, M), // number of matches (estimated)
1149 T(msizeexact, 0, 0, N, M), // is $msize exact?
1150 T(msizelower, 0, 0, N, M), // number of matches (lower bound)
1151 T(msizeupper, 0, 0, N, M), // number of matches (upper bound)
1152 T(mul, 2, N, N, 0), // multiply a list of numbers
1153 T(muldiv, 3, 3, N, 0), // calculate A*B/C
1154 T(ne, 2, 2, N, 0), // test not equal
1155 T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
1156 T(not, 1, 1, N, 0), // logical not
1157 T(now, 0, 0, N, 0), // current date/time as a time_t
1158 T(opt, 1, 2, N, 0), // lookup an option value
1159 T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
1160 T(ord, 1, 1, N, 0), // return codepoint for first character of UTF-8 string
1161 T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1162 T(percentage, 0, 0, N, 0), // percentage score of current hit
1163 T(prettyterm, 1, 1, N, Q), // pretty print term name
1164 T(prettyurl, 1, 1, N, 0), // pretty version of URL
1165 T(query, 0, 1, N, Q), // query
1166 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
1167 T(queryterms, 0, 0, N, Q), // list of query terms
1168 T(range, 2, 2, N, 0), // return list of values between start and end
1169 T(record, 0, 1, N, 0), // record contents of document
1170 T(relevant, 0, 1, N, Q), // is document relevant?
1171 T(relevants, 0, 0, N, Q), // return list of relevant documents
1172 T(score, 0, 0, N, 0), // score (0-10) of current hit
1173 T(set, 2, 2, N, 0), // set option value
1174 T(seterror, 1, 1, N, 0), // set error_msg, setting it early stops query execution
1175 T(setmap, 1, N, N, 0), // set map of option values
1176 T(setrelevant, 0, 1, N, Q), // set rset
1177 T(slice, 2, 2, N, 0), // slice a list using a second list
1178 T(snippet, 1, 2, N, M), // generate snippet from text
1179 T(sort, 1, 2, N, M), // alpha sort a list
1180 T(split, 1, 2, N, 0), // split a string to give a list
1181 T(stoplist, 0, 0, N, Q), // return list of stopped terms
1182 T(sub, 2, 2, N, 0), // subtract
1183 T(subdb, 0, 1, N, 0), // name of subdb docid is in
1184 T(subid, 0, 1, N, 0), // docid in the subdb#
1185 T(substr, 2, 3, N, 0), // substring
1186 T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
1187 T(switch, 3, N, 1, 0), // return position of substring, or empty string
1188 T(termprefix, 1, 1, N, 0), // get any prefix from a term
1189 T(terms, 0, 1, N, M), // list of matching terms
1190 T(thispage, 0, 0, N, M), // page number of current page
1191 T(time, 0, 0, N, M), // how long the match took (in seconds)
1192 T(topdoc, 0, 0, N, M), // first document on current page of hit list
1193 // (counting from 0)
1194 T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
1195 // (default 16)
1196 T(transform, 3, 4, N, 0), // transform with a regexp
1197 T(truncate, 2, 4, N, 0), // truncate after a word
1198 T(uniq, 1, 1, N, 0), // removed duplicates from a sorted list
1199 T(unique, 1, 1, N, 0), // removed duplicates from any list
1200 T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
1201 T(unprefix, 1, 1, N, 0), // remove any prefix from a term
1202 T(unstem, 1, 1, N, Q), // return list of terms from the parsed query
1203 // which stemmed to this term
1204 T(upper, 1, 1, N, 0), // convert string to upper case
1205 T(url, 1, 1, N, 0), // url encode argument
1206 T(value, 1, 2, N, 0), // return document value
1207 T(version, 0, 0, N, 0), // omega version string
1208 T(weight, 0, 0, N, 0), // weight of the current hit
1209 { NULL,{0, 0, 0, 0, 0}}
1212 #undef T // Leaving T defined screws up Sun's C++ compiler!
// Second argument of each $def call, appended in call order.  CMD_def gives
// the func_attrib it registers the tag CMD_MACRO + (index in this vector).
1214 static vector<string> macros;
// Write all of buf to fd, issuing further write() calls after a short write
// and retrying when a call is interrupted (EINTR).
//
// Returns 0 once every byte has been written, or the negative value returned
// by write() for the first error other than EINTR.
static ssize_t
write_all(int fd, const char * buf, size_t count)
{
    const char * const end = buf + count;
    for (const char * pos = buf; pos != end; ) {
        const ssize_t written = write(fd, pos, end - pos);
        if (rare(written < 0)) {
            // An interrupted call made no progress, so simply try again.
            if (errno != EINTR) return written;
            continue;
        }
        pos += written;
    }
    return 0;
}
1233 static const vector<string>&
1234 get_subdbs()
1236 static vector<string> subdbs;
1237 if (subdbs.empty()) {
1238 size_t p = 0, q;
1239 while (true) {
1240 q = dbname.find('/', p);
1241 subdbs.emplace_back(dbname, p, q - p);
1242 if (q == string::npos) break;
1243 p = q + 1;
1246 return subdbs;
1249 static string
1250 eval(const string &fmt, const vector<string> &param)
1252 static map<string, const struct func_attrib *> func_map;
1253 if (func_map.empty()) {
1254 struct func_desc *p;
1255 for (p = func_tab; p->name != NULL; ++p) {
1256 func_map[string(p->name)] = &(p->a);
1259 string res;
1260 string::size_type p = 0, q;
1261 while ((q = fmt.find('$', p)) != string::npos) try {
1262 res.append(fmt, p, q - p);
1263 string::size_type code_start = q; // note down for error reporting
1264 q++;
1265 if (q >= fmt.size()) break;
1266 unsigned char ch = fmt[q];
1267 switch (ch) {
1268 // Magic sequences:
1269 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1270 case '$':
1271 res += '$';
1272 p = q + 1;
1273 continue;
1274 case '(':
1275 res += '{';
1276 p = q + 1;
1277 continue;
1278 case ')':
1279 res += '}';
1280 p = q + 1;
1281 continue;
1282 case '.':
1283 res += ',';
1284 p = q + 1;
1285 continue;
1286 case '_':
1287 ch = '0';
1288 // FALL THRU
1289 case '1': case '2': case '3': case '4': case '5':
1290 case '6': case '7': case '8': case '9':
1291 ch -= '0';
1292 if (ch < param.size()) res += param[ch];
1293 p = q + 1;
1294 continue;
1295 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1296 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1297 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1298 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1299 case 'y': case 'z':
1300 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1301 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1302 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1303 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1304 case 'Y': case 'Z':
1305 case '{':
1306 break;
1307 default:
1308 string msg = "Unknown $ code in: $";
1309 msg.append(fmt, q, string::npos);
1310 throw msg;
1312 p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1313 string var(fmt, q, p - q);
1314 map<string, const struct func_attrib *>::const_iterator func;
1315 func = func_map.find(var);
1316 if (func == func_map.end()) {
1317 throw "Unknown function '" + var + "'";
1319 vector<string> args;
1320 if (fmt[p] == '{') {
1321 q = p + 1;
1322 int nest = 1;
1323 while (true) {
1324 p = fmt.find_first_of(",{}", p + 1);
1325 if (p == string::npos)
1326 throw "missing } in " + fmt.substr(code_start);
1327 if (fmt[p] == '{') {
1328 ++nest;
1329 } else {
1330 if (nest == 1) {
1331 // should we split the args
1332 if (func->second->minargs != N) {
1333 args.push_back(fmt.substr(q, p - q));
1334 q = p + 1;
1337 if (fmt[p] == '}' && --nest == 0) break;
1340 if (func->second->minargs == N)
1341 args.push_back(fmt.substr(q, p - q));
1342 ++p;
1345 if (func->second->minargs != N) {
1346 if (int(args.size()) < func->second->minargs)
1347 throw "too few arguments to $" + var;
1348 if (func->second->maxargs != N &&
1349 int(args.size()) > func->second->maxargs)
1350 throw "too many arguments to $" + var;
1352 vector<string>::size_type n;
1353 if (func->second->evalargs != N)
1354 n = func->second->evalargs;
1355 else
1356 n = args.size();
1358 for (vector<string>::size_type j = 0; j < n; ++j)
1359 args[j] = eval(args[j], param);
1361 if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1362 ensure_query_parsed();
1363 if (func->second->ensure == 'M') ensure_match();
1364 string value;
1365 switch (func->second->tag) {
1366 case CMD_:
1367 break;
1368 case CMD_add: {
1369 int total = 0;
1370 for (auto&& arg : args)
1371 total += string_to_int(arg);
1372 value = str(total);
1373 break;
1375 case CMD_addfilter:
1376 add_bterm(args[0]);
1377 break;
1378 case CMD_allterms: {
1379 // list of all terms indexing document
1380 Xapian::docid id = q0;
1381 if (!args.empty()) id = string_to_int(args[0]);
1382 for (Xapian::TermIterator term = db.termlist_begin(id);
1383 term != db.termlist_end(id); ++term) {
1384 value += *term;
1385 value += '\t';
1388 if (!value.empty()) value.erase(value.size() - 1);
1389 break;
1391 case CMD_and: {
1392 value = "true";
1393 for (auto&& arg : args) {
1394 if (eval(arg, param).empty()) {
1395 value.resize(0);
1396 break;
1399 break;
1401 case CMD_cgi: {
1402 auto i = cgi_params.find(args[0]);
1403 if (i != cgi_params.end()) value = i->second;
1404 break;
1406 case CMD_cgilist: {
1407 auto g = cgi_params.equal_range(args[0]);
1408 for (auto i = g.first; i != g.second; ++i) {
1409 value += i->second;
1410 value += '\t';
1412 if (!value.empty()) value.erase(value.size() - 1);
1413 break;
1415 case CMD_cgiparams: {
1416 const string* prev = NULL;
1417 for (auto&& i : cgi_params) {
1418 if (prev && i.first == *prev) continue;
1419 value += i.first;
1420 value += '\t';
1421 prev = &i.first;
1423 if (!value.empty()) value.erase(value.size() - 1);
1424 break;
1426 case CMD_chr:
1427 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1428 break;
1429 case CMD_collapsed: {
1430 value = str(collapsed);
1431 break;
1433 case CMD_cond:
1434 for (size_t i = 0; i < args.size(); i += 2) {
1435 if (i == args.size() - 1) {
1436 // Handle optional "else" value.
1437 value = eval(args[i], param);
1438 break;
1440 if (!eval(args[i], param).empty()) {
1441 value = eval(args[i + 1], param);
1442 break;
1445 break;
1446 case CMD_contains: {
1447 size_t pos = args[1].find(args[0]);
1448 if (pos != string::npos) {
1449 value = str(pos);
1451 break;
1453 case CMD_csv:
1454 value = args[0];
1455 if (args.size() > 1 && !args[1].empty()) {
1456 csv_escape_always(value);
1457 } else {
1458 csv_escape(value);
1460 break;
1461 case CMD_date:
1462 value = args[0];
1463 if (!value.empty()) {
1464 char buf[64] = "";
1465 time_t date = string_to_int(value);
1466 if (date != static_cast<time_t>(-1)) {
1467 struct tm *then;
1468 then = gmtime(&date);
1469 string date_fmt = "%Y-%m-%d";
1470 if (args.size() > 1) date_fmt = eval(args[1], param);
1471 strftime(buf, sizeof buf, date_fmt.c_str(), then);
1473 value = buf;
1475 break;
1476 case CMD_dbname:
1477 value = dbname;
1478 break;
1479 case CMD_dbsize: {
1480 static Xapian::doccount dbsize;
1481 if (!dbsize) dbsize = db.get_doccount();
1482 value = str(dbsize);
1483 break;
1485 case CMD_def: {
1486 func_attrib *fa = new func_attrib;
1487 fa->tag = CMD_MACRO + macros.size();
1488 fa->minargs = 0;
1489 fa->maxargs = 9;
1490 fa->evalargs = N; // FIXME: or 0?
1491 fa->ensure = 0;
1493 macros.push_back(args[1]);
1494 func_map[args[0]] = fa;
1495 break;
1497 case CMD_defaultop:
1498 if (default_op == Xapian::Query::OP_AND) {
1499 value = "and";
1500 } else {
1501 value = "or";
1503 break;
1504 case CMD_div: {
1505 int denom = string_to_int(args[1]);
1506 if (denom == 0) {
1507 value = "divide by 0";
1508 } else {
1509 value = str(string_to_int(args[0]) /
1510 string_to_int(args[1]));
1512 break;
1514 case CMD_emptydocs: {
1515 string t;
1516 if (!args.empty())
1517 t = args[0];
1518 Xapian::PostingIterator i;
1519 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1520 if (i.get_doclength() != 0) continue;
1521 if (!value.empty()) value += '\t';
1522 value += str(*i);
1524 break;
1526 case CMD_env: {
1527 char *env = getenv(args[0].c_str());
1528 if (env != NULL) value = env;
1529 break;
1531 case CMD_eq:
1532 if (args[0] == args[1]) value = "true";
1533 break;
1534 case CMD_error:
1535 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1536 error_msg = "Database '" + dbname + "' couldn't be opened";
1538 value = error_msg;
1539 break;
1540 case CMD_field: {
1541 Xapian::docid did = q0;
1542 if (args.size() > 1) did = string_to_int(args[1]);
1543 value = fields.get_field(did, args[0]);
1544 break;
1546 case CMD_filesize: {
1547 // FIXME: rounding? i18n?
1548 int size = string_to_int(args[0]);
1549 int intpart = size;
1550 int fraction = -1;
1551 const char * format = 0;
1552 if (size < 0) {
1553 // Negative size -> empty result.
1554 } else if (size == 1) {
1555 format = "%d byte";
1556 } else if (size < 1024) {
1557 format = "%d bytes";
1558 } else {
1559 if (size < 1024 * 1024) {
1560 format = "%d.%cK";
1561 } else {
1562 size /= 1024;
1563 if (size < 1024 * 1024) {
1564 format = "%d.%cM";
1565 } else {
1566 size /= 1024;
1567 format = "%d.%cG";
1570 intpart = unsigned(size) / 1024;
1571 fraction = unsigned(size) % 1024;
1573 if (format) {
1574 char buf[200];
1575 int len;
1576 if (fraction == -1) {
1577 len = my_snprintf(buf, sizeof(buf), format, intpart);
1578 } else {
1579 fraction = (fraction * 10 / 1024) + '0';
1580 len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1582 if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1583 value.assign(buf, len);
1585 break;
1587 case CMD_filters:
1588 value = filters;
1589 break;
1590 case CMD_filterterms: {
1591 Xapian::TermIterator term = db.allterms_begin();
1592 term.skip_to(args[0]);
1593 while (term != db.allterms_end()) {
1594 string t = *term;
1595 if (!startswith(t, args[0])) break;
1596 value += t;
1597 value += '\t';
1598 ++term;
1601 if (!value.empty()) value.erase(value.size() - 1);
1602 break;
1604 case CMD_find: {
1605 string l = args[0], s = args[1];
1606 string::size_type i = 0, j = 0;
1607 size_t count = 0;
1608 while (j != l.size()) {
1609 j = l.find('\t', i);
1610 if (j == string::npos) j = l.size();
1611 if (j - i == s.length()) {
1612 if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1613 value = str(count);
1614 break;
1617 ++count;
1618 i = j + 1;
1620 break;
1622 case CMD_fmt:
1623 value = fmtname;
1624 break;
1625 case CMD_freq: {
1626 const string& term = args[0];
1627 Xapian::doccount termfreq = 0;
1628 if (done_query) {
1629 termfreq = mset.get_termfreq(term);
1631 if (termfreq == 0) {
1632 // We want $freq to work before the match is run, and we
1633 // don't want using it to force the match to run.
1634 termfreq = db.get_termfreq(term);
1636 value = str(termfreq);
1637 break;
1639 case CMD_ge:
1640 if (string_to_int(args[0]) >= string_to_int(args[1]))
1641 value = "true";
1642 break;
1643 case CMD_gt:
1644 if (string_to_int(args[0]) > string_to_int(args[1]))
1645 value = "true";
1646 break;
1647 case CMD_hash: {
1648 const string& data = args[0];
1649 const string& hash = args[1];
1650 if (hash == "md5") {
1651 string md5;
1652 md5_string(data, md5);
1653 value.reserve(md5.size() * 2);
1654 for (unsigned char byte : md5) {
1655 value += "0123456789abcdef"[byte >> 4];
1656 value += "0123456789abcdef"[byte & 0x0f];
1658 } else {
1659 throw "Unknown hash function: " + hash;
1661 break;
1663 case CMD_highlight: {
1664 string bra, ket;
1665 if (args.size() > 2) {
1666 bra = args[2];
1667 if (args.size() > 3) {
1668 ket = args[3];
1669 } else {
1670 string::const_iterator i;
1671 i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1672 ket = "</";
1673 ket.append(bra, 1, i - bra.begin() - 1);
1674 ket += '>';
1678 value = html_highlight(args[0], args[1], bra, ket);
1679 break;
1681 case CMD_hit:
1682 // 0-based mset index
1683 value = str(hit_no);
1684 break;
1685 case CMD_hitlist:
1686 #if 0
1687 url_query_string = "?DB=";
1688 url_query_string += dbname;
1689 for (auto& j : query_strings) {
1690 if (j.first.empty()) {
1691 url_query_string += "&P=";
1692 } else {
1693 url_query_string += "&P."
1694 url_query_string += j.first;
1695 url_query_string += '=';
1697 const char *q = j.second.c_str();
1698 int ch;
1699 while ((ch = *q++) != '\0') {
1700 switch (ch) {
1701 case '+':
1702 url_query_string += "%2b";
1703 break;
1704 case '"':
1705 url_query_string += "%22";
1706 break;
1707 case '%':
1708 url_query_string += "%25";
1709 break;
1710 case '&':
1711 url_query_string += "%26";
1712 break;
1713 case ' ':
1714 ch = '+';
1715 /* fall through */
1716 default:
1717 url_query_string += ch;
1721 // add any boolean terms
1722 for (auto i = filter_map.begin(); i != filter_map.end(); ++i) {
1723 url_query_string += "&B=";
1724 url_query_string += i->second;
1726 #endif
1727 for (hit_no = topdoc; hit_no < last; ++hit_no)
1728 value += print_caption(args[0], param);
1729 hit_no = 0;
1730 break;
1731 case CMD_hitsperpage:
1732 value = str(hits_per_page);
1733 break;
1734 case CMD_hostname: {
1735 value = args[0];
1736 // remove URL scheme and/or path
1737 string::size_type i = value.find("://");
1738 if (i == string::npos) i = 0; else i += 3;
1739 value = value.substr(i, value.find('/', i) - i);
1740 // remove user@ or user:password@
1741 i = value.find('@');
1742 if (i != string::npos) value.erase(0, i + 1);
1743 // remove :port
1744 i = value.find(':');
1745 if (i != string::npos) value.resize(i);
1746 break;
1748 case CMD_html:
1749 value = html_escape(args[0]);
1750 break;
1751 case CMD_htmlstrip:
1752 value = html_strip(args[0]);
1753 break;
1754 case CMD_httpheader:
1755 if (!suppress_http_headers) {
1756 cout << args[0] << ": " << args[1] << endl;
1757 if (!set_content_type && args[0].length() == 12 &&
1758 strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1759 set_content_type = true;
1762 break;
1763 case CMD_id:
1764 // document id
1765 value = str(q0);
1766 break;
1767 case CMD_if:
1768 if (!args[0].empty())
1769 value = eval(args[1], param);
1770 else if (args.size() > 2)
1771 value = eval(args[2], param);
1772 break;
1773 case CMD_include:
1774 value = eval_file(args[0]);
1775 break;
1776 case CMD_json:
1777 value = args[0];
1778 json_escape(value);
1779 break;
1780 case CMD_jsonarray: {
1781 const string & l = args[0];
1782 string::size_type i = 0, j;
1783 if (l.empty()) {
1784 value = "[]";
1785 break;
1787 value = "[\"";
1788 while (true) {
1789 j = l.find('\t', i);
1790 string elt(l, i, j - i);
1791 json_escape(elt);
1792 value += elt;
1793 if (j == string::npos) break;
1794 value += "\",\"";
1795 i = j + 1;
1797 value += "\"]";
1798 break;
1800 case CMD_last:
1801 value = str(last);
1802 break;
1803 case CMD_lastpage: {
1804 int l = mset.get_matches_estimated();
1805 if (l > 0) l = (l - 1) / hits_per_page + 1;
1806 value = str(l);
1807 break;
1809 case CMD_le:
1810 if (string_to_int(args[0]) <= string_to_int(args[1]))
1811 value = "true";
1812 break;
1813 case CMD_length:
1814 if (args[0].empty()) {
1815 value = "0";
1816 } else {
1817 size_t length = count(args[0].begin(), args[0].end(), '\t');
1818 value = str(length + 1);
1820 break;
1821 case CMD_list: {
1822 if (!args[0].empty()) {
1823 string pre, inter, interlast, post;
1824 switch (args.size()) {
1825 case 2:
1826 inter = interlast = args[1];
1827 break;
1828 case 3:
1829 inter = args[1];
1830 interlast = args[2];
1831 break;
1832 case 4:
1833 pre = args[1];
1834 inter = interlast = args[2];
1835 post = args[3];
1836 break;
1837 case 5:
1838 pre = args[1];
1839 inter = args[2];
1840 interlast = args[3];
1841 post = args[4];
1842 break;
1844 value += pre;
1845 string list = args[0];
1846 string::size_type split = 0, split2;
1847 while ((split2 = list.find('\t', split)) != string::npos) {
1848 if (split) value += inter;
1849 value.append(list, split, split2 - split);
1850 split = split2 + 1;
1852 if (split) value += interlast;
1853 value.append(list, split, string::npos);
1854 value += post;
1856 break;
1858 case CMD_log: {
1859 if (!vet_filename(args[0])) break;
1860 string logfile = log_dir + args[0];
1861 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1862 if (fd == -1) break;
1863 vector<string> noargs;
1864 noargs.resize(1);
1865 string line;
1866 if (args.size() > 1) {
1867 line = args[1];
1868 } else {
1869 line = DEFAULT_LOG_ENTRY;
1871 line = eval(line, noargs);
1872 line += '\n';
1873 (void)write_all(fd, line.data(), line.length());
1874 close(fd);
1875 break;
1877 case CMD_lookup: {
1878 if (!vet_filename(args[0])) break;
1879 string cdbfile = cdb_dir + args[0];
1880 int fd = open(cdbfile.c_str(), O_RDONLY);
1881 if (fd == -1) break;
1883 struct cdb cdb;
1884 cdb_init(&cdb, fd);
1886 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1887 size_t datalen = cdb_datalen(&cdb);
1888 const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1889 if (q) {
1890 value.assign(static_cast<const char *>(dat), datalen);
1894 cdb_free(&cdb);
1895 close(fd); // FIXME: cache fds?
1896 break;
1898 case CMD_lower:
1899 value = Xapian::Unicode::tolower(args[0]);
1900 break;
1901 case CMD_lt:
1902 if (string_to_int(args[0]) < string_to_int(args[1]))
1903 value = "true";
1904 break;
1905 case CMD_map:
1906 if (!args[0].empty()) {
1907 string l = args[0], pat = args[1];
1908 vector<string> new_args(param);
1909 string::size_type i = 0, j;
1910 while (true) {
1911 j = l.find('\t', i);
1912 new_args[0] = l.substr(i, j - i);
1913 value += eval(pat, new_args);
1914 if (j == string::npos) break;
1915 value += '\t';
1916 i = j + 1;
1919 break;
1920 case CMD_match:
1921 omegascript_match(value, args);
1922 break;
1923 case CMD_max: {
1924 vector<string>::const_iterator i = args.begin();
1925 int val = string_to_int(*i++);
1926 for (; i != args.end(); ++i) {
1927 int x = string_to_int(*i);
1928 if (x > val) val = x;
1930 value = str(val);
1931 break;
1933 case CMD_min: {
1934 vector<string>::const_iterator i = args.begin();
1935 int val = string_to_int(*i++);
1936 for (; i != args.end(); ++i) {
1937 int x = string_to_int(*i);
1938 if (x < val) val = x;
1940 value = str(val);
1941 break;
1943 case CMD_msize:
1944 // Estimated number of matches.
1945 value = str(mset.get_matches_estimated());
1946 break;
1947 case CMD_msizeexact:
1948 // Is msize exact?
1949 if (mset.get_matches_lower_bound()
1950 == mset.get_matches_upper_bound())
1951 value = "true";
1952 break;
1953 case CMD_msizelower:
1954 // Lower bound on number of matches.
1955 value = str(mset.get_matches_lower_bound());
1956 break;
1957 case CMD_msizeupper:
1958 // Upper bound on number of matches.
1959 value = str(mset.get_matches_upper_bound());
1960 break;
1961 case CMD_mod: {
1962 int denom = string_to_int(args[1]);
1963 if (denom == 0) {
1964 value = "divide by 0";
1965 } else {
1966 value = str(string_to_int(args[0]) %
1967 string_to_int(args[1]));
1969 break;
1971 case CMD_mul: {
1972 vector<string>::const_iterator i = args.begin();
1973 int total = string_to_int(*i++);
1974 while (i != args.end())
1975 total *= string_to_int(*i++);
1976 value = str(total);
1977 break;
1979 case CMD_muldiv: {
1980 int denom = string_to_int(args[2]);
1981 if (denom == 0) {
1982 value = "divide by 0";
1983 } else {
1984 int num = string_to_int(args[0]) * string_to_int(args[1]);
1985 value = str(num / denom);
1987 break;
1989 case CMD_ne:
1990 if (args[0] != args[1]) value = "true";
1991 break;
1992 case CMD_nice: {
1993 string::const_iterator i = args[0].begin();
1994 int len = args[0].length();
1995 while (len) {
1996 value += *i++;
1997 if (--len && len % 3 == 0) value += option["thousand"];
1999 break;
2001 case CMD_not:
2002 if (args[0].empty()) value = "true";
2003 break;
2004 case CMD_now:
2005 value = str(static_cast<unsigned long>(time(NULL)));
2006 break;
2007 case CMD_opt:
2008 if (args.size() == 2) {
2009 value = option[args[0] + "," + args[1]];
2010 } else {
2011 value = option[args[0]];
2013 break;
2014 case CMD_or: {
2015 for (auto&& arg : args) {
2016 value = eval(arg, param);
2017 if (!value.empty()) break;
2019 break;
2021 case CMD_ord: {
2022 if (!args[0].empty()) {
2023 Utf8Iterator it(args[0]);
2024 value = str(*it);
2026 break;
2028 case CMD_pack:
2029 value = int_to_binary_string(string_to_int(args[0]));
2030 break;
2031 case CMD_percentage:
2032 // percentage score
2033 value = str(percent);
2034 break;
2035 case CMD_prettyterm:
2036 value = pretty_term(args[0]);
2037 break;
2038 case CMD_prettyurl:
2039 value = args[0];
2040 url_prettify(value);
2041 break;
2042 case CMD_query: {
2043 auto r = query_strings.equal_range(args.empty() ?
2044 string() : args[0]);
2045 for (auto j = r.first; j != r.second; ++j) {
2046 if (!value.empty()) value += '\t';
2047 const string & s = j->second;
2048 size_t start = 0, tab;
2049 while ((tab = s.find('\t', start)) != string::npos) {
2050 value.append(s, start, tab - start);
2051 value += ' ';
2052 start = tab + 1;
2054 value.append(s, start, string::npos);
2056 break;
2058 case CMD_querydescription:
2059 value = query.get_description();
2060 break;
2061 case CMD_queryterms:
2062 value = queryterms;
2063 break;
2064 case CMD_range: {
2065 int start = string_to_int(args[0]);
2066 int end = string_to_int(args[1]);
2067 while (start <= end) {
2068 value += str(start);
2069 if (start < end) value += '\t';
2070 start++;
2072 break;
2074 case CMD_record: {
2075 Xapian::docid id = q0;
2076 if (!args.empty()) id = string_to_int(args[0]);
2077 value = db.get_document(id).get_data();
2078 break;
2080 case CMD_relevant: {
2081 // document id if relevant; empty otherwise
2082 Xapian::docid id = q0;
2083 if (!args.empty()) id = string_to_int(args[0]);
2084 auto i = ticked.find(id);
2085 if (i != ticked.end()) {
2086 i->second = false; // icky side-effect
2087 value = str(id);
2089 break;
2091 case CMD_relevants: {
2092 for (auto i : ticked) {
2093 if (i.second) {
2094 value += str(i.first);
2095 value += '\t';
2098 if (!value.empty()) value.erase(value.size() - 1);
2099 break;
2101 case CMD_score:
2102 // Score (0 to 10)
2103 value = str(percent / 10);
2104 break;
2105 case CMD_set:
2106 option[args[0]] = args[1];
2107 break;
2108 case CMD_seterror:
2109 error_msg = args[0];
2110 break;
2111 case CMD_setmap: {
2112 string base = args[0] + ',';
2113 if (args.size() % 2 != 1)
2114 throw string("$setmap requires an odd number of arguments");
2115 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2116 option[base + args[i]] = args[i + 1];
2118 break;
2120 case CMD_setrelevant: {
2121 string::size_type i = 0, j;
2122 while (true) {
2123 j = args[0].find_first_not_of("0123456789", i);
2124 Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2125 if (id) {
2126 rset.add_document(id);
2127 ticked[id] = true;
2129 if (j == string::npos) break;
2130 i = j + 1;
2132 break;
2134 case CMD_slice: {
2135 string list = args[0], pos = args[1];
2136 vector<string> items;
2137 string::size_type i = 0, j;
2138 while (true) {
2139 j = list.find('\t', i);
2140 items.push_back(list.substr(i, j - i));
2141 if (j == string::npos) break;
2142 i = j + 1;
2144 i = 0;
2145 bool have_added = false;
2146 while (true) {
2147 j = pos.find('\t', i);
2148 int item = string_to_int(pos.substr(i, j - i));
2149 if (item >= 0 && size_t(item) < items.size()) {
2150 if (have_added) value += '\t';
2151 value += items[item];
2152 have_added = true;
2154 if (j == string::npos) break;
2155 i = j + 1;
2157 break;
2159 case CMD_snippet: {
2160 size_t length = 200;
2161 if (args.size() > 1) {
2162 length = string_to_int(args[1]);
2164 if (!stemmer)
2165 stemmer = new Xapian::Stem(option["stemmer"]);
2166 // FIXME: Allow start and end highlight and omit to be specified.
2167 value = mset.snippet(args[0], length, *stemmer,
2168 mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2169 "<strong>", "</strong>", "...");
2170 break;
2172 case CMD_sort:
2173 omegascript_sort(args, value);
2174 break;
2175 case CMD_split: {
2176 string split;
2177 if (args.size() == 1) {
2178 split = " ";
2179 value = args[0];
2180 } else {
2181 split = args[0];
2182 value = args[1];
2184 string::size_type i = 0;
2185 while (true) {
2186 if (split.empty()) {
2187 ++i;
2188 if (i >= value.size()) break;
2189 } else {
2190 i = value.find(split, i);
2191 if (i == string::npos) break;
2193 value.replace(i, split.size(), 1, '\t');
2194 ++i;
2196 break;
2198 case CMD_stoplist: {
2199 Xapian::TermIterator i = qp.stoplist_begin();
2200 Xapian::TermIterator end = qp.stoplist_end();
2201 while (i != end) {
2202 if (!value.empty()) value += '\t';
2203 value += *i;
2204 ++i;
2206 break;
2208 case CMD_sub:
2209 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2210 break;
2211 case CMD_subdb: {
2212 Xapian::docid id = q0;
2213 if (args.size() > 0) id = string_to_int(args[0]);
2214 auto subdbs = get_subdbs();
2215 value = subdbs[(id - 1) % subdbs.size()];
2216 break;
2218 case CMD_subid: {
2219 Xapian::docid id = q0;
2220 if (args.size() > 0) id = string_to_int(args[0]);
2221 value = str(((id - 1) / get_subdbs().size()) + 1);
2222 break;
2224 case CMD_substr: {
2225 int start = string_to_int(args[1]);
2226 if (start < 0) {
2227 if (static_cast<size_t>(-start) >= args[0].size()) {
2228 start = 0;
2229 } else {
2230 start = static_cast<int>(args[0].size()) + start;
2232 } else {
2233 if (static_cast<size_t>(start) >= args[0].size()) break;
2235 size_t len = string::npos;
2236 if (args.size() > 2) {
2237 int int_len = string_to_int(args[2]);
2238 if (int_len >= 0) {
2239 len = size_t(int_len);
2240 } else {
2241 len = args[0].size() - start;
2242 if (static_cast<size_t>(-int_len) >= len) {
2243 len = 0;
2244 } else {
2245 len -= static_cast<size_t>(-int_len);
2249 value.assign(args[0], start, len);
2250 break;
2252 case CMD_suggestion:
2253 value = qp.get_corrected_query_string();
2254 break;
2255 case CMD_switch: {
2256 const string& val = args[0];
2257 for (size_t i = 1; i < args.size(); i += 2) {
2258 if (i == args.size() - 1) {
2259 // Handle optional "else" value.
2260 value = eval(args[i], param);
2261 break;
2263 if (val == eval(args[i], param)) {
2264 value = eval(args[i + 1], param);
2265 break;
2268 break;
2270 case CMD_termprefix:
2271 (void)prefix_from_term(&value, args[0]);
2272 break;
2273 case CMD_terms: {
2274 // list of matching terms
2275 if (!enquire) break;
2276 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2277 if (args.empty()) {
2278 while (term != enquire->get_matching_terms_end(q0)) {
2279 // check term was in the typed query so we ignore
2280 // boolean filter terms
2281 const string & t = *term;
2282 if (termset.find(t) != termset.end()) {
2283 value += t;
2284 value += '\t';
2286 ++term;
2288 } else {
2289 // Return matching terms with specified prefix. We can't
2290 // use skip_to() as the terms aren't ordered by termname.
2291 const string & pfx = args[0];
2292 while (term != enquire->get_matching_terms_end(q0)) {
2293 const string & t = *term;
2294 if (startswith(t, pfx)) {
2295 value += t;
2296 value += '\t';
2298 ++term;
2302 if (!value.empty()) value.erase(value.size() - 1);
2303 break;
2305 case CMD_thispage:
2306 value = str(topdoc / hits_per_page + 1);
2307 break;
2308 case CMD_time:
2309 if (secs >= 0) {
2310 char buf[64];
2311 my_snprintf(buf, sizeof(buf), "%.6f", secs);
2312 // MSVC's snprintf omits the zero byte if the string is
2313 // sizeof(buf) long.
2314 buf[sizeof(buf) - 1] = '\0';
2315 value = buf;
2317 break;
2318 case CMD_topdoc:
2319 // first document on current page of hit list (counting from 0)
2320 value = str(topdoc);
2321 break;
2322 case CMD_topterms:
2323 if (enquire) {
2324 int howmany = 16;
2325 if (!args.empty()) howmany = string_to_int(args[0]);
2326 if (howmany < 0) howmany = 0;
2328 // List of expand terms
2329 Xapian::ESet eset;
2330 OmegaExpandDecider decider(db, &termset);
2332 if (!rset.empty()) {
2333 set_expansion_scheme(*enquire, option);
2334 eset = enquire->get_eset(howmany * 2, rset, &decider);
2335 } else if (mset.size()) {
2336 // invent an rset
2337 Xapian::RSet tmp;
2339 int c = 5;
2340 // FIXME: what if mset does not start at first match?
2341 for (Xapian::docid did : mset) {
2342 tmp.add_document(did);
2343 if (--c == 0) break;
2346 set_expansion_scheme(*enquire, option);
2347 eset = enquire->get_eset(howmany * 2, tmp, &decider);
2350 // Don't show more than one word with the same stem.
2351 set<string> stems;
2352 Xapian::ESetIterator i;
2353 for (i = eset.begin(); i != eset.end(); ++i) {
2354 string term(*i);
2355 string stem = (*stemmer)(term);
2356 if (stems.find(stem) != stems.end()) continue;
2357 stems.insert(stem);
2358 value += term;
2359 value += '\t';
2360 if (--howmany == 0) break;
2362 if (!value.empty()) value.erase(value.size() - 1);
2364 break;
2365 case CMD_transform:
2366 omegascript_transform(value, args);
2367 break;
2368 case CMD_truncate:
2369 value = generate_sample(args[0],
2370 string_to_int(args[1]),
2371 args.size() > 2 ? args[2] : string(),
2372 args.size() > 3 ? args[3] : string());
2373 break;
2374 case CMD_uniq: {
2375 const string &list = args[0];
2376 if (list.empty()) break;
2377 string::size_type split = 0, split2;
2378 string prev;
2379 do {
2380 split2 = list.find('\t', split);
2381 string item(list, split, split2 - split);
2382 if (split == 0) {
2383 value = item;
2384 } else if (item != prev) {
2385 value += '\t';
2386 value += item;
2388 prev = item;
2389 split = split2 + 1;
2390 } while (split2 != string::npos);
2391 break;
2393 case CMD_unique: {
2394 unordered_set<string> seen;
2395 const string &list = args[0];
2396 if (list.empty()) break;
2397 string::size_type split = 0, split2;
2398 do {
2399 split2 = list.find('\t', split);
2400 string item(list, split, split2 - split);
2401 if (seen.insert(item).second) {
2402 if (split != 0)
2403 value += '\t';
2404 value += item;
2406 split = split2 + 1;
2407 } while (split2 != string::npos);
2408 break;
2410 case CMD_unpack:
2411 value = str(binary_string_to_int(args[0]));
2412 break;
2413 case CMD_unprefix: {
2414 size_t prefix_len = prefix_from_term(NULL, args[0]);
2415 value.assign(args[0], prefix_len, string::npos);
2416 break;
2418 case CMD_unstem: {
2419 const string &term = args[0];
2420 Xapian::TermIterator i = qp.unstem_begin(term);
2421 Xapian::TermIterator end = qp.unstem_end(term);
2422 while (i != end) {
2423 if (!value.empty()) value += '\t';
2424 value += *i;
2425 ++i;
2427 break;
2429 case CMD_upper:
2430 value = Xapian::Unicode::toupper(args[0]);
2431 break;
2432 case CMD_url:
2433 url_encode(value, args[0]);
2434 break;
2435 case CMD_value: {
2436 Xapian::docid id = q0;
2437 Xapian::valueno value_no = string_to_int(args[0]);
2438 if (args.size() > 1) id = string_to_int(args[1]);
2439 value = db.get_document(id).get_value(value_no);
2440 break;
2442 case CMD_version:
2443 value = PACKAGE_STRING;
2444 break;
2445 case CMD_weight:
2446 value = double_to_string(weight);
2447 break;
2448 default: {
2449 args.insert(args.begin(), param[0]);
2450 int macro_no = func->second->tag - CMD_MACRO;
2451 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2452 // throw "Unknown function '" + var + "'";
2453 value = eval(macros[macro_no], args);
2454 break;
2457 res += value;
2458 } catch (const Xapian::Error & e) {
2459 // FIXME: this means we only see the most recent error in $error
2460 // - is that the best approach?
2461 error_msg = e.get_msg();
2464 res.append(fmt, p, string::npos);
2465 return res;
2468 static string
2469 eval_file(const string &fmtfile)
2471 string err;
2472 if (vet_filename(fmtfile)) {
2473 string file = template_dir + fmtfile;
2474 string fmt;
2475 if (load_file(file, fmt)) {
2476 vector<string> noargs;
2477 noargs.resize(1);
2478 return eval(fmt, noargs);
2480 err = strerror(errno);
2481 } else {
2482 err = "name contains '..'";
2485 // FIXME: report why!
2486 string msg = string("Couldn't read format template '") + fmtfile + '\'';
2487 if (!err.empty()) msg += " (" + err + ')';
2488 throw msg;
2491 extern string
2492 pretty_term(string term)
2494 // Just leave empty strings and single characters alone.
2495 if (term.length() <= 1) return term;
2497 // Assume unprefixed terms are unstemmed.
2498 if (!C_isupper(term[0])) return term;
2500 // Handle stemmed terms.
2501 bool stemmed = (term[0] == 'Z');
2502 if (stemmed) {
2503 // First of all, check if a term in the query stemmed to this one.
2504 Xapian::TermIterator u = qp.unstem_begin(term);
2505 // There might be multiple words with the same stem, but we only want
2506 // one so just take the first.
2507 if (u != qp.unstem_end(term)) return *u;
2509 // Remove the 'Z'.
2510 term.erase(0, 1);
2513 bool add_quotes = false;
2515 // Check if the term has a prefix.
2516 if (C_isupper(term[0])) {
2517 // See if we have this prefix in the termprefix_to_userprefix map. If
2518 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2519 string prefix;
2520 size_t prefix_len = prefix_from_term(&prefix, term);
2522 map<string, string>::const_iterator i;
2523 i = termprefix_to_userprefix.find(prefix);
2524 if (i != termprefix_to_userprefix.end()) {
2525 string user_prefix = i->second;
2526 user_prefix += ':';
2527 term.replace(0, prefix_len, user_prefix);
2528 } else {
2529 // We don't have a prefix mapping for this, so just set a flag to
2530 // add quotes around the term.
2531 add_quotes = true;
2535 if (stemmed) term += '.';
2537 if (add_quotes) {
2538 term.insert(0, "\"");
2539 term.append("\"");
2542 return term;
2545 static string
2546 print_caption(const string &fmt, const vector<string> &param)
2548 q0 = *(mset[hit_no]);
2550 weight = mset[hit_no].get_weight();
2551 percent = mset.convert_to_percent(mset[hit_no]);
2552 collapsed = mset[hit_no].get_collapse_count();
2554 return eval(fmt, param);
2557 void
2558 parse_omegascript()
2560 try {
2561 const char * p = getenv("SERVER_PROTOCOL");
2562 if (p && strcmp(p, "INCLUDED") == 0) {
2563 // We're being included in another page, so suppress headers.
2564 suppress_http_headers = true;
2567 string output = eval_file(fmtname);
2568 if (!set_content_type && !suppress_http_headers) {
2569 cout << "Content-Type: text/html" << endl;
2570 set_content_type = true;
2572 if (!suppress_http_headers) cout << endl;
2573 cout << output;
2574 } catch (...) {
2575 // Ensure the headers have been output so that any exception gets
2576 // reported rather than giving a server error.
2577 if (!set_content_type && !suppress_http_headers) {
2578 cout << "Content-Type: text/html" << endl;
2579 set_content_type = true;
2581 if (!suppress_http_headers) cout << endl;
2582 throw;
2586 static void
2587 ensure_query_parsed()
2589 if (query_parsed) return;
2590 query_parsed = true;
2592 // Should we discard the existing R-set recorded in R CGI parameters?
2593 bool discard_rset = false;
2595 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2596 // CGI parameters)?
2597 bool force_first_page = false;
2599 string v;
2600 // get list of terms from previous iteration of query
2601 auto val = cgi_params.find("xP");
2602 if (val != cgi_params.end()) {
2603 v = val->second;
2604 // If xP given, default to discarding any RSet and forcing the first
2605 // page of results. If the query is the same, or an extension of
2606 // the previous query, we adjust these again below.
2607 discard_rset = true;
2608 force_first_page = true;
2610 querytype result = parse_queries(v);
2611 switch (result) {
2612 case BAD_QUERY:
2613 break;
2614 case NEW_QUERY:
2615 break;
2616 case SAME_QUERY:
2617 case EXTENDED_QUERY:
2618 // If we've changed database, force the first page of hits
2619 // and discard the R-set (since the docids will have changed)
2620 val = cgi_params.find("xDB");
2621 if (val != cgi_params.end() && val->second != dbname) break;
2622 if (result == SAME_QUERY && force_first_page) {
2623 val = cgi_params.find("xFILTERS");
2624 if (val != cgi_params.end() && val->second != filters &&
2625 val->second != old_filters) {
2626 // Filters have changed since last query.
2627 } else {
2628 force_first_page = false;
2631 discard_rset = false;
2632 break;
2635 if (!force_first_page) {
2636 // Work out which mset element is the first hit we want
2637 // to display
2638 val = cgi_params.find("TOPDOC");
2639 if (val != cgi_params.end()) {
2640 topdoc = atol(val->second.c_str());
2643 // Handle next, previous, and page links
2644 if (cgi_params.find(">") != cgi_params.end()) {
2645 topdoc += hits_per_page;
2646 } else if (cgi_params.find("<") != cgi_params.end()) {
2647 if (topdoc >= hits_per_page)
2648 topdoc -= hits_per_page;
2649 else
2650 topdoc = 0;
2651 } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2652 (val = cgi_params.find("#")) != cgi_params.end()) {
2653 long page = atol(val->second.c_str());
2654 // Do something sensible for page 0 (we count pages from 1).
2655 if (page == 0) page = 1;
2656 topdoc = (page - 1) * hits_per_page;
2659 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2660 // Normally we snap TOPDOC like this so that things work nicely if
2661 // HITSPERPAGE is in a <select> or on radio buttons. If we're
2662 // postprocessing the output of omega and want variable sized pages,
2663 // this is unhelpful.
2664 bool raw_search = false;
2665 val = cgi_params.find("RAWSEARCH");
2666 if (val != cgi_params.end()) {
2667 raw_search = bool(atol(val->second.c_str()));
2670 if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2673 if (!discard_rset) {
2674 // put documents marked as relevant into the rset
2675 auto g = cgi_params.equal_range("R");
2676 for (auto i = g.first; i != g.second; ++i) {
2677 const string & value = i->second;
2678 for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2679 while (value[j] == '.') ++j;
2680 Xapian::docid d = atoi(value.c_str() + j);
2681 if (d) {
2682 rset.add_document(d);
2683 ticked[d] = true;
2690 // run query if we haven't already
2691 static void
2692 ensure_match()
2694 if (done_query) return;
2696 secs = RealTime::now();
2697 run_query();
2698 if (secs != -1)
2699 secs = RealTime::now() - secs;
2701 done_query = true;
2702 last = mset.get_matches_lower_bound();
2703 if (last == 0) {
2704 // Otherwise topdoc ends up being -6 if it's non-zero!
2705 topdoc = 0;
2706 } else {
2707 if (topdoc >= last)
2708 topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2709 // last is the count of documents up to the end of the current page
2710 // (as returned by $last)
2711 if (topdoc + hits_per_page < last)
2712 last = topdoc + hits_per_page;
2716 // OmegaExpandDecider methods.
2718 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2719 set<string> * querytermset)
2720 : db(db_)
2722 // We'll want the stemmer for testing matches anyway.
2723 if (!stemmer)
2724 stemmer = new Xapian::Stem(option["stemmer"]);
2725 if (querytermset) {
2726 set<string>::const_iterator i;
2727 for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2728 string term(*i);
2729 if (term.empty()) continue;
2731 unsigned char ch = term[0];
2732 bool stemmed = (ch == 'Z');
2733 if (stemmed) {
2734 term.erase(0, 1);
2735 if (term.empty()) continue;
2736 ch = term[0];
2739 if (C_isupper(ch)) {
2740 size_t prefix_len = prefix_from_term(NULL, term);
2741 term.erase(0, prefix_len);
2744 if (!stemmed) term = (*stemmer)(term);
2746 exclude_stems.insert(term);
2751 bool
2752 OmegaExpandDecider::operator()(const string & term) const
2754 unsigned char ch = term[0];
2756 // Reject terms with a prefix.
2757 if (C_isupper(ch)) return false;
2760 MyStopper stopper;
2761 // Don't suggest stopwords.
2762 if (stopper(term)) return false;
2765 // Reject small numbers.
2766 if (term.size() < 4 && C_isdigit(ch)) return false;
2768 // Reject terms containing a space.
2769 if (term.find(' ') != string::npos) return false;
2771 // Skip terms with stems in the exclude_stems set, to avoid suggesting
2772 // terms which are already in the query in some form.
2773 string stem = (*stemmer)(term);
2774 if (exclude_stems.find(stem) != exclude_stems.end())
2775 return false;
2777 // Ignore terms that only occur once (hapaxes) since they aren't
2778 // useful for finding related documents - they only occur in a
2779 // document that's already been marked as relevant.
2780 // FIXME: add an expand option to ignore terms where
2781 // termfreq == rtermfreq.
2782 if (db.get_termfreq(term) <= 1) return false;
2784 return true;