Fix whitespace irregularities in code
[xapian.git] / xapian-applications / omega / query.cc
blob97c67e4acb84cb4b93ca98a3d8466c9d8712a66f
1 /* query.cc: query executor for omega
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002 Intercede 1749 Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
8 * Copyright 2008 Thomas Viehmann
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 * USA
26 #include <config.h>
28 #include <algorithm>
29 #include <iostream>
30 #include <map>
31 #include <set>
32 #include <vector>
34 #include <cassert>
35 #include <cctype>
36 #include "safeerrno.h"
37 #include <stdio.h>
38 #include <cstdlib>
39 #include <cstring>
40 #include "strcasecmp.h"
41 #include <ctime>
43 #include "safeunistd.h"
44 #include <sys/types.h>
45 #include "safesysstat.h"
46 #include "safefcntl.h"
48 #include "realtime.h"
50 #include <cdb.h>
52 #include "csvescape.h"
53 #include "date.h"
54 #include "datevalue.h"
55 #include "jsonescape.h"
56 #include "utils.h"
57 #include "omega.h"
58 #include "query.h"
59 #include "cgiparam.h"
60 #include "loadfile.h"
61 #include "sample.h"
62 #include "str.h"
63 #include "stringutils.h"
64 #include "transform.h"
65 #include "urldecode.h"
66 #include "urlencode.h"
67 #include "unixperm.h"
68 #include "values.h"
69 #include "weight.h"
70 #include "expand.h"
72 #include <xapian.h>
74 using namespace std;
76 using Xapian::Utf8Iterator;
78 using Xapian::Unicode::is_wordchar;
80 #ifndef SNPRINTF
81 #include <cstdarg>
/** Bounded sprintf() replacement, used when no SNPRINTF macro is configured.
 *
 *  Formats into @a str, which has room for @a size bytes including the
 *  terminating nul.
 *
 *  @param str     Destination buffer.
 *  @param size    Size of @a str in bytes.
 *  @param format  printf-style format string, followed by its arguments.
 *
 *  @return The number of characters written (not counting the nul).
 *
 *  Output which doesn't fit is still treated as a fatal error and abort()
 *  is called.  The formatting is done with vsnprintf() so that the overflow
 *  is detected without ever writing past the end of @a str (with vsprintf()
 *  memory had already been overwritten by the time the check ran), and so
 *  that size == 0 no longer touches str[size - 1].
 */
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    va_list ap;
    va_start(ap, format);
    int res = vsnprintf(str, size, format, ap);
    va_end(ap);
    // A return value >= size means the output was truncated.
    if (res < 0 || size_t(res) >= size)
        abort(); /* Overflowed! */
    return res;
}
95 #else
96 #define my_snprintf SNPRINTF
97 #endif
// File-scope state shared by the query parsing / matching code below.
99 static bool query_parsed = false;
100 static bool done_query = false;
101 static Xapian::docid last = 0;
// Result of the enquire->get_mset() call made by run_query().
103 static Xapian::MSet mset;
105 static map<Xapian::docid, bool> ticked;
107 static void ensure_query_parsed();
108 static void ensure_match();
// The query being built: set_probabilistic() stores the parsed query here and
// run_query() then combines it with the filters.
110 static Xapian::Query query;
111 //static string url_query_string;
112 Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
// Query parser, configured by set_probabilistic().
114 static Xapian::QueryParser qp;
// Range processor for "size:", allocated on first use by set_probabilistic().
115 static Xapian::NumberRangeProcessor * size_rp = NULL;
// Stemmer for html_highlight(), allocated on first use from option "stemmer".
116 static Xapian::Stem *stemmer = NULL;
118 static string eval_file(const string &fmtfile);
// The distinct terms of the parsed query (filled in by set_probabilistic()).
120 static set<string> termset;
122 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
123 static map<string, string> termprefix_to_userprefix;
// The terms in termset as a tab-separated list, in the order first seen.
125 static string queryterms;
// Message from a QueryParserError; if non-empty, run_query() returns without
// running the match.
127 static string error_msg;
129 static double secs = -1;
// OmegaScript template for a log line made of tab-separated fields: remote
// host (falling back to remote address, then "-"), timestamp, the kind of
// request (add / morelike / query), $dbname, $query and $msize, followed by
// the HTTP referer if one is set.
// NOTE(review): presumably the format used by $log when no explicit format is
// given - confirm against the CMD_log handling in eval().
131 static const char DEFAULT_LOG_ENTRY[] =
132 "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
133 "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
134 "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
135 "$dbname\t"
136 "$query\t"
137 "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
139 class MyStopper : public Xapian::Stopper {
140 public:
141 bool operator()(const string &t) const {
142 switch (t[0]) {
143 case 'a':
144 return (t == "a" || t == "about" || t == "an" || t == "and" ||
145 t == "are" || t == "as" || t == "at");
146 case 'b':
147 return (t == "be" || t == "by");
148 case 'e':
149 return (t == "en");
150 case 'f':
151 return (t == "for" || t == "from");
152 case 'h':
153 return (t == "how");
154 case 'i':
155 return (t == "i" || t == "in" || t == "is" || t == "it");
156 case 'o':
157 return (t == "of" || t == "on" || t == "or");
158 case 't':
159 return (t == "that" || t == "the" || t == "this" || t == "to");
160 case 'w':
161 return (t == "was" || t == "what" || t == "when" ||
162 t == "where" || t == "which" || t == "who" ||
163 t == "why" || t == "will" || t == "with");
164 case 'y':
165 return (t == "you" || t == "your");
166 default:
167 return false;
172 static size_t
173 prefix_from_term(string &prefix, const string &term)
175 if (term.empty()) {
176 prefix.resize(0);
177 return 0;
179 if (term[0] == 'X') {
180 const string::const_iterator begin = term.begin();
181 string::const_iterator i = begin + 1;
182 while (i != term.end() && C_isupper(*i)) ++i;
183 prefix.assign(begin, i);
184 if (i != term.end() && *i == ':') ++i;
185 return i - begin;
188 prefix = term[0];
189 return 1;
192 // Don't allow ".." in format names, log file names, etc as this would allow
193 // people to open a format "../../etc/passwd" or similar.
194 // FIXME: make this check more exact ("foo..bar" is safe)
195 // FIXME: log when this check fails
/** Check that a filename is safe to use.
 *
 *  @return false if @a filename contains ".." anywhere, true otherwise.
 */
static bool
vet_filename(const std::string &filename)
{
    const bool has_dotdot = (filename.find("..") != std::string::npos);
    return !has_dotdot;
}
203 // Heuristics:
204 // * If any terms have been removed, it's a "fresh query" so we discard any
205 // relevance judgements
206 // * If all previous terms are there but more have been added then we keep
207 // the relevance judgements, but return the first page of hits
209 // NEW_QUERY entirely new query
210 // SAME_QUERY unchanged query
211 // EXTENDED_QUERY new query, but based on the old one
212 // BAD_QUERY parse error (message in error_msg)
// Result type of set_probabilistic().
213 typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
// Query strings waiting to be parsed, keyed by the prefix they were registered
// with via set_probabilistic_query(); set_probabilistic() passes that prefix
// as the third argument of parse_query() for each string.
215 static multimap<string, string> probabilistic_query;
217 void
218 set_probabilistic_query(const string & prefix, const string & s)
220 string query_string = s;
221 // Strip leading and trailing whitespace from query_string.
222 trim(query_string);
223 if (!query_string.empty())
224 probabilistic_query.insert(make_pair(prefix, query_string));
227 static unsigned
228 read_qp_flags(const string & opt_pfx, unsigned f)
230 map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
231 for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
232 unsigned mask = 0;
233 const char * s = i->first.c_str() + opt_pfx.size();
234 switch (s[0]) {
235 case 'a':
236 if (strcmp(s, "auto_multiword_synonyms") == 0) {
237 mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
238 break;
240 if (strcmp(s, "auto_synonyms") == 0) {
241 mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
242 break;
244 break;
245 case 'b':
246 if (strcmp(s, "boolean") == 0) {
247 mask = Xapian::QueryParser::FLAG_BOOLEAN;
248 break;
250 if (strcmp(s, "boolean_any_case") == 0) {
251 mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
252 break;
254 break;
255 case 'c':
256 if (strcmp(s, "cjk_ngram") == 0) {
257 mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
258 break;
260 break;
261 case 'd':
262 if (strcmp(s, "default") == 0) {
263 mask = Xapian::QueryParser::FLAG_DEFAULT;
264 break;
266 break;
267 case 'l':
268 if (strcmp(s, "lovehate") == 0) {
269 mask = Xapian::QueryParser::FLAG_LOVEHATE;
270 break;
272 break;
273 case 'p':
274 if (strcmp(s, "partial") == 0) {
275 mask = Xapian::QueryParser::FLAG_PARTIAL;
276 break;
278 if (strcmp(s, "phrase") == 0) {
279 mask = Xapian::QueryParser::FLAG_PHRASE;
280 break;
282 if (strcmp(s, "pure_not") == 0) {
283 mask = Xapian::QueryParser::FLAG_PURE_NOT;
284 break;
286 break;
287 case 's':
288 if (strcmp(s, "spelling_correction") == 0) {
289 mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
290 break;
292 if (strcmp(s, "synonym") == 0) {
293 mask = Xapian::QueryParser::FLAG_SYNONYM;
294 break;
296 break;
297 case 'w':
298 if (strcmp(s, "wildcard") == 0) {
299 mask = Xapian::QueryParser::FLAG_WILDCARD;
300 break;
302 break;
305 if (i->second.empty()) {
306 f &= ~mask;
307 } else {
308 f |= mask;
311 return f;
// Configure the query parser from the current options, parse every query
// string registered with set_probabilistic_query(), AND the non-empty results
// together into `query`, and classify the result relative to the previous
// query.
//
// oldp holds the terms of the previous query as a tab-separated list (empty
// if there were none).
//
// Returns BAD_QUERY (with error_msg set) if the parser throws
// Xapian::QueryParserError; otherwise NEW_QUERY, SAME_QUERY or EXTENDED_QUERY
// according to the heuristics described above the querytype typedef.
//
// Side effects: termset, queryterms and termprefix_to_userprefix are updated.
314 static querytype
315 set_probabilistic(const string &oldp)
317 // Parse the query string.
318 qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
319 qp.set_stopper(new MyStopper());
320 qp.set_default_op(default_op);
321 qp.set_database(db);
322 // FIXME: provide a custom RP which handles size:10..20K, etc.
323 if (!size_rp)
324 size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
325 qp.add_rangeprocessor(size_rp);
// Options named "prefix,<user prefix>" hold a tab-separated list of term
// prefixes which that user prefix should map to.
326 map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
327 for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
328 string user_prefix(pfx->first, 7);
329 const string & term_pfx_list = pfx->second;
330 string::size_type i = 0;
331 do {
332 string::size_type i0 = i;
333 i = term_pfx_list.find('\t', i);
334 const string & term_pfx = term_pfx_list.substr(i0, i - i0);
335 qp.add_prefix(user_prefix, term_pfx);
336 // std::map::insert() won't overwrite an existing entry, so we'll
337 // prefer the first user_prefix for which a particular term prefix
338 // is specified.
339 termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
// find() returns npos once there are no more tabs, and incrementing npos wraps
// to 0, which ends the loop.
340 } while (++i);
// Options named "boolprefix,<user prefix>" hold a single term prefix; the
// prefix is exclusive unless option "nonexclusiveprefix,<term prefix>" is set
// to a non-empty value.
342 pfx = option.lower_bound("boolprefix,");
343 for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
344 string user_prefix(pfx->first, 11, string::npos);
345 auto it = option.find("nonexclusiveprefix," + pfx->second);
346 bool exclusive = (it == option.end() || it->second.empty());
347 qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
348 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
351 try {
352 unsigned default_flags = read_qp_flags("flag_", 0);
353 if (option["spelling"] == "true")
354 default_flags |= qp.FLAG_SPELLING_CORRECTION;
356 vector<Xapian::Query> queries;
357 queries.reserve(probabilistic_query.size());
359 multimap<string, string>::const_iterator j;
360 for (j = probabilistic_query.begin();
361 j != probabilistic_query.end();
362 ++j) {
363 const string & prefix = j->first;
365 // Choose the stemmer to use for this input.
366 string stemlang = option[prefix + ":stemmer"];
367 if (stemlang.empty())
368 stemlang = option["stemmer"];
369 qp.set_stemmer(Xapian::Stem(stemlang));
371 // Work out the flags to use for this input.
372 unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
374 const string & query_string = j->second;
375 Xapian::Query q = qp.parse_query(query_string, f, prefix);
376 if (!q.empty())
377 queries.push_back(q);
379 query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
380 } catch (Xapian::QueryParserError &e) {
381 error_msg = e.get_msg();
382 return BAD_QUERY;
// Record the terms of the new query.  n_new_terms counts every term the
// iterator yields, whether or not it was already in termset.
385 Xapian::termcount n_new_terms = 0;
386 for (Xapian::TermIterator i = query.get_terms_begin();
387 i != query.get_terms_end(); ++i) {
388 if (termset.find(*i) == termset.end()) {
389 termset.insert(*i);
390 if (!queryterms.empty()) queryterms += '\t';
391 queryterms += *i;
393 n_new_terms++;
396 // Check new query against the previous one
397 if (oldp.empty()) {
398 // If oldp was empty that means there were no probabilistic terms
399 // before, so if there are now this is a new query.
400 return n_new_terms ? NEW_QUERY : SAME_QUERY;
403 // The terms in oldp are separated by tabs.
404 const char oldp_separator = '\t';
405 size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
407 // short-cut: if the new query has fewer terms, it must be a new one
408 if (n_new_terms < n_old_terms) return NEW_QUERY;
// Every term of the old query must still be present, otherwise it's a new
// query.
410 const char *term = oldp.c_str();
411 const char *pend;
412 while ((pend = strchr(term, oldp_separator)) != NULL) {
413 if (termset.find(string(term, pend - term)) == termset.end())
414 return NEW_QUERY;
415 term = pend + 1;
// Check the final term, which has no separator after it.
417 if (*term) {
418 if (termset.find(string(term)) == termset.end())
419 return NEW_QUERY;
422 // Use termset.size() rather than n_new_terms so we correctly handle
423 // the case when the query has repeated terms.
424 // This works wrongly in the case when the user extends the query
425 // by adding a term already in it, but that's unlikely and the behaviour
426 // isn't too bad (we just don't reset page 1). We also mishandle a few
427 // other obscure cases e.g. adding quotes to turn a query into a phrase.
428 if (termset.size() > n_old_terms) return EXTENDED_QUERY;
429 return SAME_QUERY;
// Boolean filter terms keyed by their term prefix; filled in by add_bterm()
// and turned into a filter query by run_query().
432 static multimap<string, string> filter_map;
// Terms to exclude; filled in by add_nterm().  run_query() ORs them together
// and applies the result with OP_AND_NOT.
433 static set<string> neg_filters;
// Const iterator over filter_map.
435 typedef multimap<string, string>::const_iterator FMCI;
437 void add_bterm(const string &term) {
438 string prefix;
439 if (prefix_from_term(prefix, term) > 0)
440 filter_map.insert(multimap<string, string>::value_type(prefix, term));
443 void add_nterm(const string &term) {
444 if (!term.empty())
445 neg_filters.insert(term);
// Combine the parsed query with the boolean filters, the date range filter
// and the negated filters, then configure `enquire` (weighting scheme, cutoff,
// sort order, docid order, collapsing) and, provided the resulting query
// isn't empty, run the match to fill in `mset`.
//
// Returns early, after the filters have been applied but before `enquire` is
// touched, if there's no enquire object or error_msg is set.
448 static void
449 run_query()
451 string scheme;
452 bool force_boolean = false;
453 if (!filter_map.empty()) {
454 // OR together filters with the same prefix (or AND for non-exclusive
455 // prefixes), then AND together the resultant groups.
456 vector<Xapian::Query> filter_vec;
457 vector<string> same_vec;
458 string current;
// The loop deliberately runs one step beyond the last entry (over == true) so
// that the final group of same-prefix terms gets flushed too.
459 for (FMCI i = filter_map.begin(); ; i++) {
460 bool over = (i == filter_map.end());
461 if (over || i->first != current) {
462 switch (same_vec.size()) {
463 case 0:
464 break;
465 case 1:
466 filter_vec.push_back(Xapian::Query(same_vec[0]));
467 break;
468 default: {
469 Xapian::Query::op op = Xapian::Query::OP_OR;
470 auto it = option.find("nonexclusiveprefix," + current);
471 if (it != option.end() && !it->second.empty()) {
472 op = Xapian::Query::OP_AND;
474 filter_vec.push_back(Xapian::Query(op,
475 same_vec.begin(),
476 same_vec.end()));
477 break;
480 same_vec.clear();
481 if (over) break;
482 current = i->first;
484 same_vec.push_back(i->second);
487 Xapian::Query filter(Xapian::Query::OP_AND,
488 filter_vec.begin(), filter_vec.end());
490 if (query.empty()) {
491 // If no probabilistic query is provided then promote the filters
492 // to be THE query - filtering an empty query will give no
493 // matches.
494 std::swap(query, filter);
// A non-empty "weightingpurefilter" option names the weighting scheme to use
// for a filter-only query; otherwise boolean weighting is forced.
495 auto&& it = option.find("weightingpurefilter");
496 if (it != option.end() && !it->second.empty()) {
497 scheme = it->second;
498 } else {
499 force_boolean = true;
501 } else {
502 query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
506 if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
507 Xapian::Query date_filter;
508 if (date_value_slot != Xapian::BAD_VALUENO) {
509 // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
510 // latter the sort order just works correctly between different
511 // precisions).
512 bool as_time_t =
513 db.get_value_lower_bound(date_value_slot).size() == 4 &&
514 db.get_value_upper_bound(date_value_slot).size() == 4;
515 date_filter = date_value_range(as_time_t, date_value_slot,
516 date_start, date_end,
517 date_span);
518 } else {
// No value slot configured, so filter on date terms instead; documents with
// the term "Dlatest" are always allowed through.
519 date_filter = date_range_filter(date_start, date_end, date_span);
520 date_filter = Xapian::Query(Xapian::Query::OP_OR,
521 date_filter,
522 Xapian::Query("Dlatest"));
525 // If no probabilistic query is provided then promote the daterange
526 // filter to be THE query instead of filtering an empty query.
527 if (query.empty()) {
528 query = date_filter;
529 force_boolean = true;
530 } else {
531 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
535 if (!neg_filters.empty()) {
536 // OR together all negated filters.
537 Xapian::Query filter(Xapian::Query::OP_OR,
538 neg_filters.begin(), neg_filters.end());
540 if (query.empty()) {
541 // If we only have a negative filter for the query, use MatchAll as
542 // the query to apply the filters to.
543 query = Xapian::Query::MatchAll;
544 force_boolean = true;
546 query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
549 if (!enquire || !error_msg.empty()) return;
// Only consult the "weighting" option if nothing above has already decided
// the weighting scheme.
551 if (!force_boolean && scheme.empty()) {
552 auto&& it = option.find("weighting");
553 if (it != option.end()) scheme = it->second;
555 set_weighting_scheme(*enquire, scheme, force_boolean);
557 enquire->set_cutoff(threshold);
// A key maker takes precedence over a plain value slot for sorting;
// sort_after selects whether relevance is the primary or secondary ordering.
559 if (sort_keymaker) {
560 if (sort_after) {
561 enquire->set_sort_by_relevance_then_key(sort_keymaker,
562 reverse_sort);
563 } else {
564 enquire->set_sort_by_key_then_relevance(sort_keymaker,
565 reverse_sort);
567 } else if (sort_key != Xapian::BAD_VALUENO) {
568 if (sort_after) {
569 enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
570 } else {
571 enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
575 enquire->set_docid_order(docid_order);
577 if (collapse) {
578 enquire->set_collapse_key(collapse_key);
581 if (!query.empty()) {
582 #if 0
583 // FIXME: If we start doing permissions checks based on $REMOTE_USER
584 // we're going to break some existing setups if users upgrade. We
585 // probably want a way to set this from OmegaScript.
586 const char * remote_user = getenv("REMOTE_USER");
587 if (remote_user)
588 apply_unix_permissions(query, remote_user);
589 #endif
591 enquire->set_query(query);
592 // We could use the value of topdoc as first parameter, but we
593 // need to know the first few items in the mset to fake a
594 // relevance set for topterms.
596 // If min_hits isn't set, check at least one extra result so we
597 // know if we've reached the end of the matches or not - then we
598 // can avoid offering a "next" button which leads to an empty page.
599 mset = enquire->get_mset(0, topdoc + hits_per_page,
600 topdoc + max(hits_per_page + 1, min_hits),
601 &rset);
/** Escape a string for inclusion in HTML.
 *
 *  The characters <, >, & and " are replaced by &lt;, &gt;, &amp; and &quot;
 *  respectively; everything else is copied through unchanged.
 *
 *  @param str  The text to escape.
 *
 *  @return The escaped text.
 *
 *  The output is the same as before, but the result is now pre-sized and runs
 *  of characters which don't need escaping are appended in one go instead of
 *  growing the result one character at a time.
 */
std::string
html_escape(const std::string &str)
{
    std::string res;
    res.reserve(str.size());
    std::string::size_type start = 0;
    while (true) {
        const std::string::size_type p = str.find_first_of("<>&\"", start);
        if (p == std::string::npos) {
            // No more special characters - copy the remainder and stop.
            res.append(str, start, std::string::npos);
            break;
        }
        res.append(str, start, p - start);
        switch (str[p]) {
            case '<':
                res += "&lt;";
                break;
            case '>':
                res += "&gt;";
                break;
            case '&':
                res += "&amp;";
                break;
            default: // Only '"' is left.
                res += "&quot;";
                break;
        }
        start = p + 1;
    }
    return res;
}
/** Strip HTML tags from a string.
 *
 *  Everything from a '<' up to and including the next '>' is removed; an
 *  unterminated tag is removed through to the end of the string.  This is the
 *  documented behaviour of $htmlstrip: s/<[^>]*>?//g
 *
 *  @param str  The text to strip tags from.
 *
 *  @return The text with tags removed.
 *
 *  Previously a '>' which wasn't closing a tag (e.g. in "1 > 0") was silently
 *  dropped as well, which the documented substitution doesn't do; such a '>'
 *  is now kept.  Nothing else about the output changes.
 */
static std::string
html_strip(const std::string &str)
{
    std::string res;
    res.reserve(str.size());
    bool in_tag = false;
    for (char ch : str) {
        if (in_tag) {
            // Discard the tag's contents; '>' ends the tag.
            if (ch == '>') in_tag = false;
        } else if (ch == '<') {
            in_tag = true;
        } else {
            res += ch;
        }
    }
    return res;
}
654 // FIXME split list into hash or map and use that rather than linear lookup?
/** Find a word in a tab-separated list.
 *
 *  @param word  The word to look for (compared byte for byte).
 *  @param list  Tab-separated list of entries.
 *
 *  @return The 0-based index of the first entry equal to @a word, or -1 if
 *          there's no such entry.
 */
static int word_in_list(const std::string& word, const std::string& list)
{
    const std::string::size_type word_len = word.size();
    std::string::size_type start = 0;
    for (int index = 0; ; ++index) {
        const std::string::size_type tab = list.find('\t', start);
        const bool last_entry = (tab == std::string::npos);
        const std::string::size_type entry_len =
            (last_entry ? list.size() : tab) - start;
        if (entry_len == word_len &&
            list.compare(start, entry_len, word) == 0) {
            return index;
        }
        if (last_entry) return -1;
        start = tab + 1;
    }
}
674 // Not a character in an identifier
675 inline static bool
676 p_notid(unsigned int c)
678 return !C_isalnum(c) && c != '_';
681 // Not a character in an HTML tag name
682 inline static bool
683 p_nottag(unsigned int c)
685 return !C_isalnum(c) && c != '.' && c != '-';
688 // FIXME: shares algorithm with indextext.cc!
// HTML-escape s, marking up the words in it which match an entry in list.
//
// s is split into words in UTF-8; each word is lower-cased and looked up in
// the tab-separated list, first as-is and then as "Z" plus its stem (using
// the stemmer named by option "stemmer", created on first use).
//
// A matching word is wrapped in bra and ket if bra is non-empty (ket is
// ignored when bra is empty); otherwise it is wrapped in a <b> element whose
// background colour is picked by the position of the matching entry in list.
//
// Returns the escaped and highlighted text.
689 static string
690 html_highlight(const string &s, const string &list,
691 const string &bra, const string &ket)
693 if (!stemmer) {
694 stemmer = new Xapian::Stem(option["stemmer"]);
697 string res;
699 Utf8Iterator j(s);
700 const Utf8Iterator s_end;
701 while (true) {
// Find the start of the next word; l marks the start of the text not yet
// copied to res.
702 Utf8Iterator first = j;
703 while (first != s_end && !is_wordchar(*first)) ++first;
704 if (first == s_end) break;
705 Utf8Iterator term_end;
706 string term;
707 string word;
708 const char *l = j.raw();
// First try to read a run of ASCII capitals separated by single dots (e.g.
// "U.N."); the dots aren't added to term.  The attempt is abandoned if fewer
// than two letters were found or a word character follows.
709 if (*first < 128 && C_isupper(*first)) {
710 j = first;
711 Xapian::Unicode::append_utf8(term, *j);
712 while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
713 Xapian::Unicode::append_utf8(term, *j);
715 if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
716 term.resize(0);
718 term_end = j;
// Otherwise read an ordinary word; a '&' or '\'' is kept as part of the word
// only if a word character follows it.
720 if (term.empty()) {
721 j = first;
722 while (is_wordchar(*j)) {
723 Xapian::Unicode::append_utf8(term, *j);
724 ++j;
725 if (j == s_end) break;
726 if (*j == '&' || *j == '\'') {
727 Utf8Iterator next = j;
728 ++next;
729 if (next == s_end || !is_wordchar(*next)) break;
730 term += *j;
731 j = next;
734 term_end = j;
// Allow a short suffix of '+' and '-' characters, or of '#' characters (which
// contribute a single '#'), e.g. "C++" or "C#".  The suffix is dropped again
// if it adds more than 3 characters or is followed by a word character.
735 if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
736 string::size_type len = term.length();
737 if (*j == '#') {
738 term += '#';
739 do { ++j; } while (j != s_end && *j == '#');
740 } else {
741 while (j != s_end && (*j == '+' || *j == '-')) {
742 Xapian::Unicode::append_utf8(term, *j);
743 ++j;
746 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
747 term.resize(len);
748 } else {
749 term_end = j;
753 j = term_end;
754 term = Xapian::Unicode::tolower(term);
755 int match = word_in_list(term, list);
756 if (match == -1) {
757 string stem = "Z";
758 stem += (*stemmer)(term);
759 match = word_in_list(stem, list);
761 if (match >= 0) {
// Copy the text before the word, then the word itself inside the markup.
762 res += html_escape(string(l, first.raw() - l));
763 if (!bra.empty()) {
764 res += bra;
765 } else {
766 static const char * colours[] = {
767 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
768 "990000", "009900", "996600", "006699", "990099"
770 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
771 const char * bg = colours[idx];
// The colours containing an 'f' get black text, the others get white text.
772 if (strchr(bg, 'f')) {
773 res += "<b style=\"color:black;background-color:#";
774 } else {
775 res += "<b style=\"color:white;background-color:#";
777 res += bg;
778 res += "\">";
780 word.assign(first.raw(), j.raw() - first.raw());
781 res += html_escape(word);
782 if (!bra.empty()) {
783 res += ket;
784 } else {
785 res += "</b>";
787 } else {
788 res += html_escape(string(l, j.raw() - l));
// Copy whatever is left after the last word.
791 if (j != s_end) res += html_escape(string(j.raw(), j.left()));
792 return res;
795 #if 0
// NB: This function is compiled out (it's inside #if 0), and the declaration
// of url_query_string which it uses is commented out near the top of the
// file.
//
// Writes url_query_string to cout.  If after starts with "&B=" then each
// "B=" parameter following an '&' whose value starts with the character after
// that "&B=" is left out of the output.
796 static void
797 print_query_string(const char *after)
799 if (after && strncmp(after, "&B=", 3) == 0) {
800 char prefix = after[3];
801 string::size_type start = 0, amp = 0;
802 while (true) {
803 amp = url_query_string.find('&', amp);
804 if (amp == string::npos) {
805 cout << url_query_string.substr(start);
806 return;
808 amp++;
809 while (url_query_string[amp] == 'B' &&
810 url_query_string[amp + 1] == '=' &&
811 url_query_string[amp + 2] == prefix) {
812 cout << url_query_string.substr(start, amp - start - 1);
813 start = url_query_string.find('&', amp + 3);
814 if (start == string::npos) return;
815 amp = start + 1;
819 cout << url_query_string;
821 #endif
823 class Fields {
824 mutable Xapian::docid did_cached;
825 mutable map<string, string> fields;
827 void read_fields(Xapian::docid did) const;
829 public:
830 Fields() : did_cached(0) { }
832 const string & get_field(Xapian::docid did, const string & field) const {
833 if (did != did_cached) read_fields(did);
834 return fields[field];
// Discard any cached fields, then fetch the data of document did from db and
// parse it into the `fields` map, recording did as the cached docid.
//
// How the data is parsed depends on whether option "fieldnames" is set - see
// the comments on the two branches below.
838 void
839 Fields::read_fields(Xapian::docid did) const
841 fields.clear();
842 did_cached = did;
843 const string & data = db.get_document(did).get_data();
845 // Parse document data.
846 string::size_type i = 0;
847 const string & names = option["fieldnames"];
848 if (!names.empty()) {
849 // Each line is a field, with fieldnames taken from corresponding
850 // entries in the tab-separated list specified by $opt{fieldnames}.
851 string::size_type n = 0;
852 do {
853 string::size_type n0 = n;
854 n = names.find('\t', n);
855 string::size_type i0 = i;
856 i = data.find('\n', i);
// insert() doesn't overwrite, so the first line wins if a name is repeated.
857 fields.insert(make_pair(names.substr(n0, n - n0),
858 data.substr(i0, i - i0)));
// find() returns npos when there's no further separator, and incrementing
// npos wraps to 0 - so this stops once either the names or the lines run out.
859 } while (++n && ++i);
860 } else {
861 // Each line is a field, in the format NAME=VALUE. We assume the field
862 // name doesn't contain an "=". Lines without an "=" are currently
863 // just ignored.
864 do {
865 string::size_type i0 = i;
866 i = data.find('\n', i);
867 string line(data, i0, i - i0);
868 string::size_type j = line.find('=');
869 if (j != string::npos) {
// Values for a repeated field name are joined with tabs.
870 string & value = fields[line.substr(0, j)];
871 if (!value.empty()) value += '\t';
872 value.append(line, j + 1, string::npos);
// As above, ++i wraps to 0 after the last line, which ends the loop.
874 } while (++i);
// Shared field cache for document data (see class Fields).
878 static Fields fields;
// NOTE(review): the variables below look like the state of the hit currently
// being displayed, presumably read by the commands of similar name ($id,
// $hit, $percentage, $weight, $collapsed) - confirm against eval().
879 static Xapian::docid q0;
880 static Xapian::doccount hit_no;
881 static int percent;
882 static double weight;
883 static Xapian::doccount collapsed;
885 static string print_caption(const string &fmt, const vector<string> &param);
// Tags identifying the OmegaScript commands.  The T() macro used to build
// func_tab pastes the command name onto "CMD_" to form these, so each name
// here must match the command name used in func_tab.
887 enum tagval {
888 CMD_,
889 CMD_add,
890 CMD_addfilter,
891 CMD_allterms,
892 CMD_and,
893 CMD_cgi,
894 CMD_cgilist,
895 CMD_chr,
896 CMD_collapsed,
897 CMD_contains,
898 CMD_csv,
899 CMD_date,
900 CMD_dbname,
901 CMD_dbsize,
902 CMD_def,
903 CMD_defaultop,
904 CMD_div,
905 CMD_eq,
906 CMD_emptydocs,
907 CMD_env,
908 CMD_error,
909 CMD_field,
910 CMD_filesize,
911 CMD_filters,
912 CMD_filterterms,
913 CMD_find,
914 CMD_fmt,
915 CMD_freq,
916 CMD_ge,
917 CMD_gt,
918 CMD_highlight,
919 CMD_hit,
920 CMD_hitlist,
921 CMD_hitsperpage,
922 CMD_hostname,
923 CMD_html,
924 CMD_htmlstrip,
925 CMD_httpheader,
926 CMD_id,
927 CMD_if,
928 CMD_include,
929 CMD_json,
930 CMD_jsonarray,
931 CMD_last,
932 CMD_lastpage,
933 CMD_le,
934 CMD_length,
935 CMD_list,
936 CMD_log,
937 CMD_lookup,
938 CMD_lower,
939 CMD_lt,
940 CMD_map,
941 CMD_match,
942 CMD_max,
943 CMD_min,
944 CMD_mod,
945 CMD_msize,
946 CMD_msizeexact,
947 CMD_msizelower,
948 CMD_msizeupper,
949 CMD_mul,
950 CMD_muldiv,
951 CMD_ne,
952 CMD_nice,
953 CMD_not,
954 CMD_now,
955 CMD_opt,
956 CMD_or,
957 CMD_ord,
958 CMD_pack,
959 CMD_percentage,
960 CMD_prettyterm,
961 CMD_prettyurl,
962 CMD_query,
963 CMD_querydescription,
964 CMD_queryterms,
965 CMD_range,
966 CMD_record,
967 CMD_relevant,
968 CMD_relevants,
969 CMD_score,
970 CMD_set,
971 CMD_setmap,
972 CMD_setrelevant,
973 CMD_slice,
974 CMD_snippet,
975 CMD_split,
976 CMD_stoplist,
977 CMD_sub,
978 CMD_substr,
979 CMD_suggestion,
980 CMD_terms,
981 CMD_thispage,
982 CMD_time,
983 CMD_topdoc,
984 CMD_topterms,
985 CMD_transform,
986 CMD_truncate,
987 CMD_uniq,
988 CMD_unpack,
989 CMD_unstem,
990 CMD_upper,
991 CMD_url,
992 CMD_value,
993 CMD_version,
994 CMD_weight,
995 CMD_MACRO // special tag for macro evaluation
// Attributes of an OmegaScript command: its tag (a tagval value), the
// argument count limits, the evalargs setting, and the `ensure` code.  The
// fields are in the same order as the columns of func_tab.
998 struct func_attrib {
999 int tag;
1000 int minargs, maxargs, evalargs;
1001 char ensure;
// Build the func_tab entry for command F: its name as a string, then its
// attributes with the tag formed by pasting F onto "CMD_".
1004 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
// A command name together with its attributes.
1005 struct func_desc {
1006 const char *name;
1007 struct func_attrib a;
// Shorthands used in func_tab.
// NOTE(review): N (-1) appears where a count is unlimited (e.g. maxargs of
// $add) and is presumably "no limit" / "all arguments"; M and Q are values
// for `ensure`, presumably meaning the match must have been run
// (ensure_match()) or the query parsed (ensure_query_parsed()) before the
// command is evaluated - confirm against eval().
1010 #define N -1
1011 #define M 'M'
1012 #define Q 'Q'
1013 // NB when adding a new command which ensures M or Q, update the list in
1014 // docs/omegascript.rst
// Table of the OmegaScript commands and their attributes, terminated by an
// entry with a NULL name.  eval() builds a map from command name to
// attributes from this table the first time it is called.
1015 static struct func_desc func_tab[] = {
1016 //name minargs maxargs evalargs ensure
1017 {"",{CMD_, N, N, 0, 0}},// commented out code
1018 T(add, 0, N, N, 0), // add a list of numbers
1019 T(addfilter, 1, 1, N, 0), // add filter term
1020 T(allterms, 0, 1, N, 0), // list of all terms matching document
1021 T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
1022 T(cgi, 1, 1, N, 0), // return cgi parameter value
1023 T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
1024 T(chr, 1, 1, N, 0), // return UTF-8 for given Unicode codepoint
1025 T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
1026 T(contains, 2, 2, N, 0), // return position of substring, or empty string
1027 T(csv, 1, 2, N, 0), // CSV string escaping
1028 T(date, 1, 2, N, 0), // convert time_t to strftime format
1029 // (default: YYYY-MM-DD)
1030 T(dbname, 0, 0, N, 0), // database name
1031 T(dbsize, 0, 0, N, 0), // database size (# of documents)
1032 T(def, 2, 2, 1, 0), // define a macro
1033 T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
1034 T(div, 2, 2, N, 0), // integer divide
1035 T(emptydocs, 0, 1, N, 0), // list of empty documents
1036 T(env, 1, 1, N, 0), // environment variable
1037 T(error, 0, 0, N, 0), // error message
1038 T(eq, 2, 2, N, 0), // test equality
1039 T(field, 1, 2, N, 0), // lookup field in record
1040 T(filesize, 1, 1, N, 0), // pretty printed filesize
1041 T(filters, 0, 0, N, 0), // serialisation of current filters
1042 T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
1043 T(find, 2, 2, N, 0), // find entry in list
1044 T(fmt, 0, 0, N, 0), // name of current format
1045 T(freq, 1, 1, N, 0), // frequency of a term
1046 T(ge, 2, 2, N, 0), // test >=
1047 T(gt, 2, 2, N, 0), // test >
1048 T(highlight, 2, 4, N, 0), // html escape and highlight words from list
1049 T(hit, 0, 0, N, 0), // hit number of current mset entry (0-based)
1050 T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
1051 T(hitsperpage, 0, 0, N, 0), // hits per page
1052 T(hostname, 1, 1, N, 0), // extract hostname from URL
1053 T(html, 1, 1, N, 0), // html escape string (<>&")
1054 T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
1055 T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
1056 T(id, 0, 0, N, 0), // docid of current doc
1057 T(if, 2, 3, 1, 0), // conditional
1058 T(include, 1, 1, 1, 0), // include another file
1059 T(json, 1, 1, N, 0), // JSON string escaping
1060 T(jsonarray, 1, 1, N, 0), // Format list as a JSON array of strings
1061 T(last, 0, 0, N, M), // hit number one beyond end of current page
1062 T(lastpage, 0, 0, N, M), // number of last hit page
1063 T(le, 2, 2, N, 0), // test <=
1064 T(length, 1, 1, N, 0), // length of list
1065 T(list, 2, 5, N, 0), // pretty print list
1066 T(log, 1, 2, 1, 0), // create a log entry
1067 T(lookup, 2, 2, N, 0), // lookup in named cdb file
1068 T(lower, 1, 1, N, 0), // convert string to lower case
1069 T(lt, 2, 2, N, 0), // test <
1070 T(map, 1, 2, 1, 0), // map a list into another list
1071 T(match, 2, 3, N, 0), // regex match
1072 T(max, 1, N, N, 0), // maximum of a list of values
1073 T(min, 1, N, N, 0), // minimum of a list of values
1074 T(mod, 2, 2, N, 0), // integer modulus
1075 T(msize, 0, 0, N, M), // number of matches (estimated)
1076 T(msizeexact, 0, 0, N, M), // is $msize exact?
1077 T(msizelower, 0, 0, N, M), // number of matches (lower bound)
1078 T(msizeupper, 0, 0, N, M), // number of matches (upper bound)
1079 T(mul, 2, N, N, 0), // multiply a list of numbers
1080 T(muldiv, 3, 3, N, 0), // calculate A*B/C
1081 T(ne, 2, 2, N, 0), // test not equal
1082 T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
1083 T(not, 1, 1, N, 0), // logical not
1084 T(now, 0, 0, N, 0), // current date/time as a time_t
1085 T(opt, 1, 2, N, 0), // lookup an option value
1086 T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
1087 T(ord, 1, 1, N, 0), // return codepoint for first character of UTF-8 string
1088 T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1089 T(percentage, 0, 0, N, 0), // percentage score of current hit
1090 T(prettyterm, 1, 1, N, Q), // pretty print term name
1091 T(prettyurl, 1, 1, N, 0), // pretty version of URL
1092 T(query, 0, 1, N, Q), // query
1093 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
1094 T(queryterms, 0, 0, N, Q), // list of query terms
1095 T(range, 2, 2, N, 0), // return list of values between start and end
1096 T(record, 0, 1, N, 0), // record contents of document
1097 T(relevant, 0, 1, N, Q), // is document relevant?
1098 T(relevants, 0, 0, N, Q), // return list of relevant documents
1099 T(score, 0, 0, N, 0), // score (0-10) of current hit
1100 T(set, 2, 2, N, 0), // set option value
1101 T(setmap, 1, N, N, 0), // set map of option values
1102 T(setrelevant, 0, 1, N, Q), // set rset
1103 T(slice, 2, 2, N, 0), // slice a list using a second list
1104 T(snippet, 1, 2, N, M), // generate snippet from text
1105 T(split, 1, 2, N, 0), // split a string to give a list
1106 T(stoplist, 0, 0, N, Q), // return list of stopped terms
1107 T(sub, 2, 2, N, 0), // subtract
1108 T(substr, 2, 3, N, 0), // substring
1109 T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
1110 T(terms, 0, 1, N, M), // list of matching terms
1111 T(thispage, 0, 0, N, M), // page number of current page
1112 T(time, 0, 0, N, M), // how long the match took (in seconds)
1113 T(topdoc, 0, 0, N, M), // first document on current page of hit list
1114 // (counting from 0)
1115 T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
1116 // (default 16)
1117 T(transform, 3, 4, N, 0), // transform with a regexp
1118 T(truncate, 2, 4, N, 0), // truncate after a word
1119 T(uniq, 1, 1, N, 0), // remove duplicates from a sorted list
1120 T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
1121 T(unstem, 1, 1, N, Q), // return list of probabilistic terms from
1122 // the query which stemmed to this term
1123 T(upper, 1, 1, N, 0), // convert string to upper case
1124 T(url, 1, 1, N, 0), // url encode argument
1125 T(value, 1, 2, N, 0), // return document value
1126 T(version, 0, 0, N, 0), // omega version string
1127 T(weight, 0, 0, N, 0), // weight of the current hit
1128 { NULL,{0, 0, 0, 0, 0}}
1131 #undef T // Leaving T defined screws up Sun's C++ compiler!
// Bodies of the macros defined with $def.  eval() gives each macro the tag
// CMD_MACRO + <index into this vector> when it is defined, and recovers the
// index as (tag - CMD_MACRO) when the macro is invoked.
1133 static vector<string> macros;
1135 // Call write() repeatedly until all data is written or we get a
1136 // non-recoverable error.
1137 static ssize_t
1138 write_all(int fd, const char * buf, size_t count)
1140 while (count) {
1141 ssize_t r = write(fd, buf, count);
1142 if (rare(r < 0)) {
1143 if (errno == EINTR) continue;
1144 return r;
1146 buf += r;
1147 count -= r;
1149 return 0;
1152 static string
1153 eval(const string &fmt, const vector<string> &param)
1155 static map<string, const struct func_attrib *> func_map;
1156 if (func_map.empty()) {
1157 struct func_desc *p;
1158 for (p = func_tab; p->name != NULL; p++) {
1159 func_map[string(p->name)] = &(p->a);
1162 string res;
1163 string::size_type p = 0, q;
1164 while ((q = fmt.find('$', p)) != string::npos) try {
1165 res.append(fmt, p, q - p);
1166 string::size_type code_start = q; // note down for error reporting
1167 q++;
1168 if (q >= fmt.size()) break;
1169 unsigned char ch = fmt[q];
1170 switch (ch) {
1171 // Magic sequences:
1172 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1173 case '$':
1174 res += '$';
1175 p = q + 1;
1176 continue;
1177 case '(':
1178 res += '{';
1179 p = q + 1;
1180 continue;
1181 case ')':
1182 res += '}';
1183 p = q + 1;
1184 continue;
1185 case '.':
1186 res += ',';
1187 p = q + 1;
1188 continue;
1189 case '_':
1190 ch = '0';
1191 // FALL THRU
1192 case '1': case '2': case '3': case '4': case '5':
1193 case '6': case '7': case '8': case '9':
1194 ch -= '0';
1195 if (ch < param.size()) res += param[ch];
1196 p = q + 1;
1197 continue;
1198 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1199 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1200 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1201 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1202 case 'y': case 'z':
1203 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1204 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1205 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1206 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1207 case 'Y': case 'Z':
1208 case '{':
1209 break;
1210 default:
1211 string msg = "Unknown $ code in: $";
1212 msg.append(fmt, q, string::npos);
1213 throw msg;
1215 p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1216 string var(fmt, q, p - q);
1217 map<string, const struct func_attrib *>::const_iterator func;
1218 func = func_map.find(var);
1219 if (func == func_map.end()) {
1220 throw "Unknown function '" + var + "'";
1222 vector<string> args;
1223 if (fmt[p] == '{') {
1224 q = p + 1;
1225 int nest = 1;
1226 while (true) {
1227 p = fmt.find_first_of(",{}", p + 1);
1228 if (p == string::npos)
1229 throw "missing } in " + fmt.substr(code_start);
1230 if (fmt[p] == '{') {
1231 ++nest;
1232 } else {
1233 if (nest == 1) {
1234 // should we split the args
1235 if (func->second->minargs != N) {
1236 args.push_back(fmt.substr(q, p - q));
1237 q = p + 1;
1240 if (fmt[p] == '}' && --nest == 0) break;
1243 if (func->second->minargs == N)
1244 args.push_back(fmt.substr(q, p - q));
1245 p++;
1248 if (func->second->minargs != N) {
1249 if (int(args.size()) < func->second->minargs)
1250 throw "too few arguments to $" + var;
1251 if (func->second->maxargs != N &&
1252 int(args.size()) > func->second->maxargs)
1253 throw "too many arguments to $" + var;
1255 vector<string>::size_type n;
1256 if (func->second->evalargs != N)
1257 n = func->second->evalargs;
1258 else
1259 n = args.size();
1261 for (vector<string>::size_type j = 0; j < n; j++)
1262 args[j] = eval(args[j], param);
1264 if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1265 ensure_query_parsed();
1266 if (func->second->ensure == 'M') ensure_match();
1267 string value;
1268 switch (func->second->tag) {
1269 case CMD_:
1270 break;
1271 case CMD_add: {
1272 int total = 0;
1273 vector<string>::const_iterator i;
1274 for (i = args.begin(); i != args.end(); i++)
1275 total += string_to_int(*i);
1276 value = str(total);
1277 break;
1279 case CMD_addfilter:
1280 add_bterm(args[0]);
1281 break;
1282 case CMD_allterms: {
1283 // list of all terms indexing document
1284 int id = q0;
1285 if (!args.empty()) id = string_to_int(args[0]);
1286 for (Xapian::TermIterator term = db.termlist_begin(id);
1287 term != db.termlist_end(id); term++) {
1288 value += *term;
1289 value += '\t';
1292 if (!value.empty()) value.erase(value.size() - 1);
1293 break;
1295 case CMD_and: {
1296 value = "true";
1297 for (vector<string>::const_iterator i = args.begin();
1298 i != args.end(); i++) {
1299 if (eval(*i, param).empty()) {
1300 value.resize(0);
1301 break;
1304 break;
1306 case CMD_cgi: {
1307 MCI i = cgi_params.find(args[0]);
1308 if (i != cgi_params.end()) value = i->second;
1309 break;
1311 case CMD_cgilist: {
1312 pair<MCI, MCI> g;
1313 g = cgi_params.equal_range(args[0]);
1314 for (MCI i = g.first; i != g.second; i++) {
1315 value += i->second;
1316 value += '\t';
1318 if (!value.empty()) value.erase(value.size() - 1);
1319 break;
1321 case CMD_chr:
1322 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1323 break;
1324 case CMD_collapsed: {
1325 value = str(collapsed);
1326 break;
1328 case CMD_contains: {
1329 size_t pos = args[1].find(args[0]);
1330 if (pos != string::npos) {
1331 value = str(pos);
1333 break;
1335 case CMD_csv:
1336 value = args[0];
1337 if (args.size() > 1 && !args[1].empty()) {
1338 csv_escape_always(value);
1339 } else {
1340 csv_escape(value);
1342 break;
1343 case CMD_date:
1344 value = args[0];
1345 if (!value.empty()) {
1346 char buf[64] = "";
1347 time_t date = string_to_int(value);
1348 if (date != static_cast<time_t>(-1)) {
1349 struct tm *then;
1350 then = gmtime(&date);
1351 string date_fmt = "%Y-%m-%d";
1352 if (args.size() > 1) date_fmt = eval(args[1], param);
1353 strftime(buf, sizeof buf, date_fmt.c_str(), then);
1355 value = buf;
1357 break;
1358 case CMD_dbname:
1359 value = dbname;
1360 break;
1361 case CMD_dbsize: {
1362 static Xapian::doccount dbsize;
1363 if (!dbsize) dbsize = db.get_doccount();
1364 value = str(dbsize);
1365 break;
1367 case CMD_def: {
1368 func_attrib *fa = new func_attrib;
1369 fa->tag = CMD_MACRO + macros.size();
1370 fa->minargs = 0;
1371 fa->maxargs = 9;
1372 fa->evalargs = N; // FIXME: or 0?
1373 fa->ensure = 0;
1375 macros.push_back(args[1]);
1376 func_map[args[0]] = fa;
1377 break;
1379 case CMD_defaultop:
1380 if (default_op == Xapian::Query::OP_AND) {
1381 value = "and";
1382 } else {
1383 value = "or";
1385 break;
1386 case CMD_div: {
1387 int denom = string_to_int(args[1]);
1388 if (denom == 0) {
1389 value = "divide by 0";
1390 } else {
1391 value = str(string_to_int(args[0]) /
1392 string_to_int(args[1]));
1394 break;
1396 case CMD_eq:
1397 if (args[0] == args[1]) value = "true";
1398 break;
1399 case CMD_emptydocs: {
1400 string t;
1401 if (!args.empty())
1402 t = args[0];
1403 Xapian::PostingIterator i;
1404 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1405 if (i.get_doclength() != 0) continue;
1406 if (!value.empty()) value += '\t';
1407 value += str(*i);
1409 break;
1411 case CMD_env: {
1412 char *env = getenv(args[0].c_str());
1413 if (env != NULL) value = env;
1414 break;
1416 case CMD_error:
1417 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1418 error_msg = "Database '" + dbname + "' couldn't be opened";
1420 value = error_msg;
1421 break;
1422 case CMD_field: {
1423 Xapian::docid did = q0;
1424 if (args.size() > 1) did = string_to_int(args[1]);
1425 value = fields.get_field(did, args[0]);
1426 break;
1428 case CMD_filesize: {
1429 // FIXME: rounding? i18n?
1430 int size = string_to_int(args[0]);
1431 int intpart = size;
1432 int fraction = -1;
1433 const char * format = 0;
1434 if (size < 0) {
1435 // Negative size -> empty result.
1436 } else if (size == 1) {
1437 format = "%d byte";
1438 } else if (size < 1024) {
1439 format = "%d bytes";
1440 } else {
1441 if (size < 1024*1024) {
1442 format = "%d.%cK";
1443 } else {
1444 size /= 1024;
1445 if (size < 1024*1024) {
1446 format = "%d.%cM";
1447 } else {
1448 size /= 1024;
1449 format = "%d.%cG";
1452 intpart = unsigned(size) / 1024;
1453 fraction = unsigned(size) % 1024;
1455 if (format) {
1456 char buf[200];
1457 int len;
1458 if (fraction == -1) {
1459 len = my_snprintf(buf, sizeof(buf), format, intpart);
1460 } else {
1461 fraction = (fraction * 10 / 1024) + '0';
1462 len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1464 if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1465 value.assign(buf, len);
1467 break;
1469 case CMD_filters:
1470 value = filters;
1471 break;
1472 case CMD_filterterms: {
1473 Xapian::TermIterator term = db.allterms_begin();
1474 term.skip_to(args[0]);
1475 while (term != db.allterms_end()) {
1476 string t = *term;
1477 if (!startswith(t, args[0])) break;
1478 value += t;
1479 value += '\t';
1480 ++term;
1483 if (!value.empty()) value.erase(value.size() - 1);
1484 break;
1486 case CMD_find: {
1487 string l = args[0], s = args[1];
1488 string::size_type i = 0, j = 0;
1489 size_t count = 0;
1490 while (j != l.size()) {
1491 j = l.find('\t', i);
1492 if (j == string::npos) j = l.size();
1493 if (j - i == s.length()) {
1494 if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1495 value = str(count);
1496 break;
1499 ++count;
1500 i = j + 1;
1502 break;
1504 case CMD_fmt:
1505 value = fmtname;
1506 break;
1507 case CMD_freq:
1508 try {
1509 value = str(mset.get_termfreq(args[0]));
1510 } catch (const Xapian::InvalidOperationError&) {
1511 // An MSet will raise this error if it's empty and not
1512 // associated with a search.
1513 value = str(db.get_termfreq(args[0]));
1515 break;
1516 case CMD_ge:
1517 if (string_to_int(args[0]) >= string_to_int(args[1]))
1518 value = "true";
1519 break;
1520 case CMD_gt:
1521 if (string_to_int(args[0]) > string_to_int(args[1]))
1522 value = "true";
1523 break;
1524 case CMD_highlight: {
1525 string bra, ket;
1526 if (args.size() > 2) {
1527 bra = args[2];
1528 if (args.size() > 3) {
1529 ket = args[3];
1530 } else {
1531 string::const_iterator i;
1532 i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1533 ket = "</";
1534 ket.append(bra, 1, i - bra.begin() - 1);
1535 ket += '>';
1539 value = html_highlight(args[0], args[1], bra, ket);
1540 break;
1542 case CMD_hit:
1543 // 0-based mset index
1544 value = str(hit_no);
1545 break;
1546 case CMD_hitlist:
1547 #if 0
1548 url_query_string = "?DB=";
1549 url_query_string += dbname;
1550 multimap<string, string>::const_iterator j;
1551 for (j = probabilistic_query.begin();
1552 j != probabilistic_query.end();
1553 ++j) {
1554 if (j->first.empty()) {
1555 url_query_string += "&P=";
1556 } else {
1557 url_query_string += "&P."
1558 url_query_string += j->first;
1559 url_query_string += '=';
1561 const char *q = j->second.c_str();
1562 int ch;
1563 while ((ch = *q++) != '\0') {
1564 switch (ch) {
1565 case '+':
1566 url_query_string += "%2b";
1567 break;
1568 case '"':
1569 url_query_string += "%22";
1570 break;
1571 case '%':
1572 url_query_string += "%25";
1573 break;
1574 case '&':
1575 url_query_string += "%26";
1576 break;
1577 case ' ':
1578 ch = '+';
1579 /* fall through */
1580 default:
1581 url_query_string += ch;
1585 // add any boolean terms
1586 for (FMCI i = filter_map.begin(); i != filter_map.end(); i++) {
1587 url_query_string += "&B=";
1588 url_query_string += i->second;
1590 #endif
1591 for (hit_no = topdoc; hit_no < last; hit_no++)
1592 value += print_caption(args[0], param);
1593 hit_no = 0;
1594 break;
1595 case CMD_hitsperpage:
1596 value = str(hits_per_page);
1597 break;
1598 case CMD_hostname: {
1599 value = args[0];
1600 // remove URL scheme and/or path
1601 string::size_type i = value.find("://");
1602 if (i == string::npos) i = 0; else i += 3;
1603 value = value.substr(i, value.find('/', i) - i);
1604 // remove user@ or user:password@
1605 i = value.find('@');
1606 if (i != string::npos) value.erase(0, i + 1);
1607 // remove :port
1608 i = value.find(':');
1609 if (i != string::npos) value.resize(i);
1610 break;
1612 case CMD_html:
1613 value = html_escape(args[0]);
1614 break;
1615 case CMD_htmlstrip:
1616 value = html_strip(args[0]);
1617 break;
1618 case CMD_httpheader:
1619 if (!suppress_http_headers) {
1620 cout << args[0] << ": " << args[1] << endl;
1621 if (!set_content_type && args[0].length() == 12 &&
1622 strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1623 set_content_type = true;
1626 break;
1627 case CMD_id:
1628 // document id
1629 value = str(q0);
1630 break;
1631 case CMD_if:
1632 if (!args[0].empty())
1633 value = eval(args[1], param);
1634 else if (args.size() > 2)
1635 value = eval(args[2], param);
1636 break;
1637 case CMD_include:
1638 value = eval_file(args[0]);
1639 break;
1640 case CMD_json:
1641 value = args[0];
1642 json_escape(value);
1643 break;
1644 case CMD_jsonarray: {
1645 const string & l = args[0];
1646 string::size_type i = 0, j;
1647 if (l.empty()) {
1648 value = "[]";
1649 break;
1651 value = "[\"";
1652 while (true) {
1653 j = l.find('\t', i);
1654 string elt(l, i, j - i);
1655 json_escape(elt);
1656 value += elt;
1657 if (j == string::npos) break;
1658 value += "\",\"";
1659 i = j + 1;
1661 value += "\"]";
1662 break;
1664 case CMD_last:
1665 value = str(last);
1666 break;
1667 case CMD_lastpage: {
1668 int l = mset.get_matches_estimated();
1669 if (l > 0) l = (l - 1) / hits_per_page + 1;
1670 value = str(l);
1671 break;
1673 case CMD_le:
1674 if (string_to_int(args[0]) <= string_to_int(args[1]))
1675 value = "true";
1676 break;
1677 case CMD_length:
1678 if (args[0].empty()) {
1679 value = "0";
1680 } else {
1681 size_t length = count(args[0].begin(), args[0].end(), '\t');
1682 value = str(length + 1);
1684 break;
1685 case CMD_list: {
1686 if (!args[0].empty()) {
1687 string pre, inter, interlast, post;
1688 switch (args.size()) {
1689 case 2:
1690 inter = interlast = args[1];
1691 break;
1692 case 3:
1693 inter = args[1];
1694 interlast = args[2];
1695 break;
1696 case 4:
1697 pre = args[1];
1698 inter = interlast = args[2];
1699 post = args[3];
1700 break;
1701 case 5:
1702 pre = args[1];
1703 inter = args[2];
1704 interlast = args[3];
1705 post = args[4];
1706 break;
1708 value += pre;
1709 string list = args[0];
1710 string::size_type split = 0, split2;
1711 while ((split2 = list.find('\t', split)) != string::npos) {
1712 if (split) value += inter;
1713 value.append(list, split, split2 - split);
1714 split = split2 + 1;
1716 if (split) value += interlast;
1717 value.append(list, split, string::npos);
1718 value += post;
1720 break;
1722 case CMD_log: {
1723 if (!vet_filename(args[0])) break;
1724 string logfile = log_dir + args[0];
1725 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1726 if (fd == -1) break;
1727 vector<string> noargs;
1728 noargs.resize(1);
1729 string line;
1730 if (args.size() > 1) {
1731 line = args[1];
1732 } else {
1733 line = DEFAULT_LOG_ENTRY;
1735 line = eval(line, noargs);
1736 line += '\n';
1737 (void)write_all(fd, line.data(), line.length());
1738 close(fd);
1739 break;
1741 case CMD_lookup: {
1742 if (!vet_filename(args[0])) break;
1743 string cdbfile = cdb_dir + args[0];
1744 int fd = open(cdbfile.c_str(), O_RDONLY);
1745 if (fd == -1) break;
1747 struct cdb cdb;
1748 cdb_init(&cdb, fd);
1750 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1751 size_t datalen = cdb_datalen(&cdb);
1752 const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1753 if (q) {
1754 value.assign(static_cast<const char *>(dat), datalen);
1758 cdb_free(&cdb);
1759 close(fd); // FIXME: cache fds?
1760 break;
1762 case CMD_lower:
1763 value = Xapian::Unicode::tolower(args[0]);
1764 break;
1765 case CMD_lt:
1766 if (string_to_int(args[0]) < string_to_int(args[1]))
1767 value = "true";
1768 break;
1769 case CMD_map:
1770 if (!args[0].empty()) {
1771 string l = args[0], pat = args[1];
1772 vector<string> new_args(param);
1773 string::size_type i = 0, j;
1774 while (true) {
1775 j = l.find('\t', i);
1776 new_args[0] = l.substr(i, j - i);
1777 value += eval(pat, new_args);
1778 if (j == string::npos) break;
1779 value += '\t';
1780 i = j + 1;
1783 break;
1784 case CMD_match:
1785 omegascript_match(value, args);
1786 break;
1787 case CMD_max: {
1788 vector<string>::const_iterator i = args.begin();
1789 int val = string_to_int(*i++);
1790 for (; i != args.end(); i++) {
1791 int x = string_to_int(*i);
1792 if (x > val) val = x;
1794 value = str(val);
1795 break;
1797 case CMD_min: {
1798 vector<string>::const_iterator i = args.begin();
1799 int val = string_to_int(*i++);
1800 for (; i != args.end(); i++) {
1801 int x = string_to_int(*i);
1802 if (x < val) val = x;
1804 value = str(val);
1805 break;
1807 case CMD_msize:
1808 // Estimated number of matches.
1809 value = str(mset.get_matches_estimated());
1810 break;
1811 case CMD_msizeexact:
1812 // Is msize exact?
1813 if (mset.get_matches_lower_bound()
1814 == mset.get_matches_upper_bound())
1815 value = "true";
1816 break;
1817 case CMD_msizelower:
1818 // Lower bound on number of matches.
1819 value = str(mset.get_matches_lower_bound());
1820 break;
1821 case CMD_msizeupper:
1822 // Upper bound on number of matches.
1823 value = str(mset.get_matches_upper_bound());
1824 break;
1825 case CMD_mod: {
1826 int denom = string_to_int(args[1]);
1827 if (denom == 0) {
1828 value = "divide by 0";
1829 } else {
1830 value = str(string_to_int(args[0]) %
1831 string_to_int(args[1]));
1833 break;
1835 case CMD_mul: {
1836 vector<string>::const_iterator i = args.begin();
1837 int total = string_to_int(*i++);
1838 while (i != args.end())
1839 total *= string_to_int(*i++);
1840 value = str(total);
1841 break;
1843 case CMD_muldiv: {
1844 int denom = string_to_int(args[2]);
1845 if (denom == 0) {
1846 value = "divide by 0";
1847 } else {
1848 int num = string_to_int(args[0]) * string_to_int(args[1]);
1849 value = str(num / denom);
1851 break;
1853 case CMD_ne:
1854 if (args[0] != args[1]) value = "true";
1855 break;
1856 case CMD_nice: {
1857 string::const_iterator i = args[0].begin();
1858 int len = args[0].length();
1859 while (len) {
1860 value += *i++;
1861 if (--len && len % 3 == 0) value += option["thousand"];
1863 break;
1865 case CMD_not:
1866 if (args[0].empty()) value = "true";
1867 break;
1868 case CMD_now: {
1869 char buf[64];
1870 my_snprintf(buf, sizeof(buf), "%lu",
1871 static_cast<unsigned long>(time(NULL)));
1872 // MSVC's snprintf omits the zero byte if the string if
1873 // sizeof(buf) long.
1874 buf[sizeof(buf) - 1] = '\0';
1875 value = buf;
1876 break;
1878 case CMD_opt:
1879 if (args.size() == 2) {
1880 value = option[args[0] + "," + args[1]];
1881 } else {
1882 value = option[args[0]];
1884 break;
1885 case CMD_or: {
1886 for (vector<string>::const_iterator i = args.begin();
1887 i != args.end(); i++) {
1888 value = eval(*i, param);
1889 if (!value.empty()) break;
1891 break;
1893 case CMD_ord: {
1894 if (!args[0].empty()) {
1895 Utf8Iterator it(args[0]);
1896 value = str(*it);
1898 break;
1900 case CMD_pack:
1901 value = int_to_binary_string(string_to_int(args[0]));
1902 break;
1903 case CMD_percentage:
1904 // percentage score
1905 value = str(percent);
1906 break;
1907 case CMD_prettyterm:
1908 value = pretty_term(args[0]);
1909 break;
1910 case CMD_prettyurl:
1911 value = args[0];
1912 url_prettify(value);
1913 break;
1914 case CMD_query: {
1915 pair<multimap<string, string>::const_iterator,
1916 multimap<string, string>::const_iterator> r;
1917 r = probabilistic_query.equal_range(args.empty() ?
1918 string() : args[0]);
1919 multimap<string, string>::const_iterator j;
1920 for (j = r.first; j != r.second; ++j) {
1921 if (!value.empty()) value += '\t';
1922 const string & s = j->second;
1923 size_t start = 0, tab;
1924 while ((tab = s.find('\t', start)) != string::npos) {
1925 value.append(s, start, tab - start);
1926 value += ' ';
1927 start = tab + 1;
1929 value.append(s, start, string::npos);
1931 break;
1933 case CMD_querydescription:
1934 value = query.get_description();
1935 break;
1936 case CMD_queryterms:
1937 value = queryterms;
1938 break;
1939 case CMD_range: {
1940 int start = string_to_int(args[0]);
1941 int end = string_to_int(args[1]);
1942 while (start <= end) {
1943 value += str(start);
1944 if (start < end) value += '\t';
1945 start++;
1947 break;
1949 case CMD_record: {
1950 int id = q0;
1951 if (!args.empty()) id = string_to_int(args[0]);
1952 value = db.get_document(id).get_data();
1953 break;
1955 case CMD_relevant: {
1956 // document id if relevant; empty otherwise
1957 int id = q0;
1958 if (!args.empty()) id = string_to_int(args[0]);
1959 map<Xapian::docid, bool>::iterator i = ticked.find(id);
1960 if (i != ticked.end()) {
1961 i->second = false; // icky side-effect
1962 value = str(id);
1964 break;
1966 case CMD_relevants: {
1967 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
1968 i != ticked.end(); i++) {
1969 if (i->second) {
1970 value += str(i->first);
1971 value += '\t';
1974 if (!value.empty()) value.erase(value.size() - 1);
1975 break;
1977 case CMD_score:
1978 // Score (0 to 10)
1979 value = str(percent / 10);
1980 break;
1981 case CMD_set:
1982 option[args[0]] = args[1];
1983 break;
1984 case CMD_setmap: {
1985 string base = args[0] + ',';
1986 if (args.size() % 2 != 1)
1987 throw string("$setmap requires an odd number of arguments");
1988 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
1989 option[base + args[i]] = args[i + 1];
1991 break;
1993 case CMD_setrelevant: {
1994 string::size_type i = 0, j;
1995 while (true) {
1996 j = args[0].find_first_not_of("0123456789", i);
1997 Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
1998 if (id) {
1999 rset.add_document(id);
2000 ticked[id] = true;
2002 if (j == string::npos) break;
2003 i = j + 1;
2005 break;
2007 case CMD_slice: {
2008 string list = args[0], pos = args[1];
2009 vector<string> items;
2010 string::size_type i = 0, j;
2011 while (true) {
2012 j = list.find('\t', i);
2013 items.push_back(list.substr(i, j - i));
2014 if (j == string::npos) break;
2015 i = j + 1;
2017 i = 0;
2018 bool have_added = false;
2019 while (true) {
2020 j = pos.find('\t', i);
2021 int item = string_to_int(pos.substr(i, j - i));
2022 if (item >= 0 && size_t(item) < items.size()) {
2023 if (have_added) value += '\t';
2024 value += items[item];
2025 have_added = true;
2027 if (j == string::npos) break;
2028 i = j + 1;
2030 break;
2032 case CMD_snippet: {
2033 size_t length = 200;
2034 if (args.size() > 1) {
2035 length = string_to_int(args[1]);
2037 if (!stemmer)
2038 stemmer = new Xapian::Stem(option["stemmer"]);
2039 // FIXME: Allow start and end highlight and omit to be specified.
2040 value = mset.snippet(args[0], length, *stemmer,
2041 mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2042 "<strong>", "</strong>", "...");
2043 break;
2045 case CMD_split: {
2046 string split;
2047 if (args.size() == 1) {
2048 split = " ";
2049 value = args[0];
2050 } else {
2051 split = args[0];
2052 value = args[1];
2054 string::size_type i = 0;
2055 while (true) {
2056 if (split.empty()) {
2057 ++i;
2058 if (i >= value.size()) break;
2059 } else {
2060 i = value.find(split, i);
2061 if (i == string::npos) break;
2063 value.replace(i, split.size(), 1, '\t');
2064 ++i;
2066 break;
2068 case CMD_stoplist: {
2069 Xapian::TermIterator i = qp.stoplist_begin();
2070 Xapian::TermIterator end = qp.stoplist_end();
2071 while (i != end) {
2072 if (!value.empty()) value += '\t';
2073 value += *i;
2074 ++i;
2076 break;
2078 case CMD_sub:
2079 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2080 break;
2081 case CMD_substr: {
2082 int start = string_to_int(args[1]);
2083 if (start < 0) {
2084 if (static_cast<size_t>(-start) >= args[0].size()) {
2085 start = 0;
2086 } else {
2087 start = static_cast<int>(args[0].size()) + start;
2089 } else {
2090 if (static_cast<size_t>(start) >= args[0].size()) break;
2092 size_t len = string::npos;
2093 if (args.size() > 2) {
2094 int int_len = string_to_int(args[2]);
2095 if (int_len >= 0) {
2096 len = size_t(int_len);
2097 } else {
2098 len = args[0].size() - start;
2099 if (static_cast<size_t>(-int_len) >= len) {
2100 len = 0;
2101 } else {
2102 len -= static_cast<size_t>(-int_len);
2106 value.assign(args[0], start, len);
2107 break;
2109 case CMD_suggestion:
2110 value = qp.get_corrected_query_string();
2111 break;
2112 case CMD_terms: {
2113 // list of matching terms
2114 if (!enquire) break;
2115 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2116 if (args.empty()) {
2117 while (term != enquire->get_matching_terms_end(q0)) {
2118 // check term was in the typed query so we ignore
2119 // boolean filter terms
2120 const string & t = *term;
2121 if (termset.find(t) != termset.end()) {
2122 value += t;
2123 value += '\t';
2125 ++term;
2127 } else {
2128 // Return matching terms with specified prefix. We can't
2129 // use skip_to() as the terms aren't ordered by termname.
2130 const string & pfx = args[0];
2131 while (term != enquire->get_matching_terms_end(q0)) {
2132 const string & t = *term;
2133 if (startswith(t, pfx)) {
2134 value += t;
2135 value += '\t';
2137 ++term;
2141 if (!value.empty()) value.erase(value.size() - 1);
2142 break;
2144 case CMD_thispage:
2145 value = str(topdoc / hits_per_page + 1);
2146 break;
2147 case CMD_time:
2148 if (secs >= 0) {
2149 char buf[64];
2150 my_snprintf(buf, sizeof(buf), "%.6f", secs);
2151 // MSVC's snprintf omits the zero byte if the string if
2152 // sizeof(buf) long.
2153 buf[sizeof(buf) - 1] = '\0';
2154 value = buf;
2156 break;
2157 case CMD_topdoc:
2158 // first document on current page of hit list (counting from 0)
2159 value = str(topdoc);
2160 break;
2161 case CMD_topterms:
2162 if (enquire) {
2163 int howmany = 16;
2164 if (!args.empty()) howmany = string_to_int(args[0]);
2165 if (howmany < 0) howmany = 0;
2167 // List of expand terms
2168 Xapian::ESet eset;
2169 OmegaExpandDecider decider(db, &termset);
2171 if (!rset.empty()) {
2172 set_expansion_scheme(*enquire, option);
2173 #if XAPIAN_AT_LEAST(1,3,2)
2174 eset = enquire->get_eset(howmany * 2, rset, &decider);
2175 #else
2176 eset = enquire->get_eset(howmany * 2, rset, 0,
2177 expand_param_k, &decider);
2178 #endif
2179 } else if (mset.size()) {
2180 // invent an rset
2181 Xapian::RSet tmp;
2183 int c = 5;
2184 // FIXME: what if mset does not start at first match?
2185 for (Xapian::docid did : mset) {
2186 tmp.add_document(did);
2187 if (--c == 0) break;
2190 set_expansion_scheme(*enquire, option);
2191 #if XAPIAN_AT_LEAST(1,3,2)
2192 eset = enquire->get_eset(howmany * 2, tmp, &decider);
2193 #else
2194 eset = enquire->get_eset(howmany * 2, tmp, 0,
2195 expand_param_k, &decider);
2196 #endif
2199 // Don't show more than one word with the same stem.
2200 set<string> stems;
2201 Xapian::ESetIterator i;
2202 for (i = eset.begin(); i != eset.end(); ++i) {
2203 string term(*i);
2204 string stem = (*stemmer)(term);
2205 if (stems.find(stem) != stems.end()) continue;
2206 stems.insert(stem);
2207 value += term;
2208 value += '\t';
2209 if (--howmany == 0) break;
2211 if (!value.empty()) value.erase(value.size() - 1);
2213 break;
2214 case CMD_transform:
2215 omegascript_transform(value, args);
2216 break;
2217 case CMD_truncate:
2218 value = generate_sample(args[0],
2219 string_to_int(args[1]),
2220 args.size() > 2 ? args[2] : string(),
2221 args.size() > 3 ? args[3] : string());
2222 break;
2223 case CMD_uniq: {
2224 const string &list = args[0];
2225 if (list.empty()) break;
2226 string::size_type split = 0, split2;
2227 string prev;
2228 do {
2229 split2 = list.find('\t', split);
2230 string item(list, split, split2 - split);
2231 if (split == 0) {
2232 value = item;
2233 } else if (item != prev) {
2234 value += '\t';
2235 value += item;
2237 prev = item;
2238 split = split2 + 1;
2239 } while (split2 != string::npos);
2240 break;
2242 case CMD_unpack:
2243 value = str(binary_string_to_int(args[0]));
2244 break;
2245 case CMD_unstem: {
2246 const string &term = args[0];
2247 Xapian::TermIterator i = qp.unstem_begin(term);
2248 Xapian::TermIterator end = qp.unstem_end(term);
2249 while (i != end) {
2250 if (!value.empty()) value += '\t';
2251 value += *i;
2252 ++i;
2254 break;
2256 case CMD_upper:
2257 value = Xapian::Unicode::toupper(args[0]);
2258 break;
2259 case CMD_url:
2260 url_encode(value, args[0]);
2261 break;
2262 case CMD_value: {
2263 Xapian::docid id = q0;
2264 Xapian::valueno value_no = string_to_int(args[0]);
2265 if (args.size() > 1) id = string_to_int(args[1]);
2266 value = db.get_document(id).get_value(value_no);
2267 break;
2269 case CMD_version:
2270 value = PACKAGE_STRING;
2271 break;
2272 case CMD_weight:
2273 value = double_to_string(weight);
2274 break;
2275 default: {
2276 args.insert(args.begin(), param[0]);
2277 int macro_no = func->second->tag - CMD_MACRO;
2278 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2279 // throw "Unknown function '" + var + "'";
2280 value = eval(macros[macro_no], args);
2281 break;
2284 res += value;
2285 } catch (const Xapian::Error & e) {
2286 // FIXME: this means we only see the most recent error in $error
2287 // - is that the best approach?
2288 error_msg = e.get_msg();
2291 res.append(fmt, p, string::npos);
2292 return res;
2295 static string
2296 eval_file(const string &fmtfile)
2298 string err;
2299 if (vet_filename(fmtfile)) {
2300 string file = template_dir + fmtfile;
2301 string fmt;
2302 if (load_file(file, fmt)) {
2303 vector<string> noargs;
2304 noargs.resize(1);
2305 return eval(fmt, noargs);
2307 err = strerror(errno);
2308 } else {
2309 err = "name contains '..'";
2312 // FIXME: report why!
2313 string msg = string("Couldn't read format template '") + fmtfile + '\'';
2314 if (!err.empty()) msg += " (" + err + ')';
2315 throw msg;
2318 extern string
2319 pretty_term(string term)
2321 // Just leave empty strings and single characters alone.
2322 if (term.length() <= 1) return term;
2324 // Assume unprefixed terms are unstemmed.
2325 if (!C_isupper(term[0])) return term;
2327 // Handle stemmed terms.
2328 bool stemmed = (term[0] == 'Z');
2329 if (stemmed) {
2330 // First of all, check if a term in the query stemmed to this one.
2331 Xapian::TermIterator u = qp.unstem_begin(term);
2332 // There might be multiple words with the same stem, but we only want
2333 // one so just take the first.
2334 if (u != qp.unstem_end(term)) return *u;
2336 // Remove the 'Z'.
2337 term.erase(0, 1);
2340 bool add_quotes = false;
2342 // Check if the term has a prefix.
2343 if (C_isupper(term[0])) {
2344 // See if we have this prefix in the termprefix_to_userprefix map. If
2345 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2346 string prefix;
2347 size_t prefix_len = prefix_from_term(prefix, term);
2349 map<string, string>::const_iterator i;
2350 i = termprefix_to_userprefix.find(prefix);
2351 if (i != termprefix_to_userprefix.end()) {
2352 string user_prefix = i->second;
2353 user_prefix += ':';
2354 term.replace(0, prefix_len, user_prefix);
2355 } else {
2356 // We don't have a prefix mapping for this, so just set a flag to
2357 // add quotes around the term.
2358 add_quotes = true;
2362 if (stemmed) term += '.';
2364 if (add_quotes) {
2365 term.insert(0, "\"");
2366 term.append("\"");
2369 return term;
2372 static string
2373 print_caption(const string &fmt, const vector<string> &param)
2375 q0 = *(mset[hit_no]);
2377 weight = mset[hit_no].get_weight();
2378 percent = mset.convert_to_percent(mset[hit_no]);
2379 collapsed = mset[hit_no].get_collapse_count();
2381 return eval(fmt, param);
2384 void
2385 parse_omegascript()
2387 try {
2388 const char * p = getenv("SERVER_PROTOCOL");
2389 if (p && strcmp(p, "INCLUDED") == 0) {
2390 // We're being included in another page, so suppress headers.
2391 suppress_http_headers = true;
2394 string output = eval_file(fmtname);
2395 if (!set_content_type && !suppress_http_headers) {
2396 cout << "Content-Type: text/html" << endl;
2397 set_content_type = true;
2399 if (!suppress_http_headers) cout << endl;
2400 cout << output;
2401 } catch (...) {
2402 // Ensure the headers have been output so that any exception gets
2403 // reported rather than giving a server error.
2404 if (!set_content_type && !suppress_http_headers) {
2405 cout << "Content-Type: text/html" << endl;
2406 set_content_type = true;
2408 if (!suppress_http_headers) cout << endl;
2409 throw;
2413 static void
2414 ensure_query_parsed()
2416 if (query_parsed) return;
2417 query_parsed = true;
2419 MCI val;
2420 pair<MCI, MCI> g;
2422 // Should we discard the existing R-set recorded in R CGI parameters?
2423 bool discard_rset = false;
2425 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2426 // CGI parameters)?
2427 bool force_first_page = false;
2429 string v;
2430 // get list of terms from previous iteration of query
2431 val = cgi_params.find("xP");
2432 if (val != cgi_params.end()) {
2433 v = val->second;
2434 // If xP given, default to discarding any RSet and forcing the first
2435 // page of results. If the query is the same, or an extension of
2436 // the previous query, we adjust these again below.
2437 discard_rset = true;
2438 force_first_page = true;
2440 querytype result = set_probabilistic(v);
2441 switch (result) {
2442 case BAD_QUERY:
2443 break;
2444 case NEW_QUERY:
2445 break;
2446 case SAME_QUERY:
2447 case EXTENDED_QUERY:
2448 // If we've changed database, force the first page of hits
2449 // and discard the R-set (since the docids will have changed)
2450 val = cgi_params.find("xDB");
2451 if (val != cgi_params.end() && val->second != dbname) break;
2452 if (result == SAME_QUERY && force_first_page) {
2453 val = cgi_params.find("xFILTERS");
2454 if (val != cgi_params.end() && val->second != filters &&
2455 val->second != old_filters) {
2456 // Filters have changed since last query.
2457 } else {
2458 force_first_page = false;
2461 discard_rset = false;
2462 break;
2465 if (!force_first_page) {
2466 // Work out which mset element is the first hit we want
2467 // to display
2468 val = cgi_params.find("TOPDOC");
2469 if (val != cgi_params.end()) {
2470 topdoc = atol(val->second.c_str());
2473 // Handle next, previous, and page links
2474 if (cgi_params.find(">") != cgi_params.end()) {
2475 topdoc += hits_per_page;
2476 } else if (cgi_params.find("<") != cgi_params.end()) {
2477 if (topdoc >= hits_per_page)
2478 topdoc -= hits_per_page;
2479 else
2480 topdoc = 0;
2481 } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2482 (val = cgi_params.find("#")) != cgi_params.end()) {
2483 long page = atol(val->second.c_str());
2484 // Do something sensible for page 0 (we count pages from 1).
2485 if (page == 0) page = 1;
2486 topdoc = (page - 1) * hits_per_page;
2489 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2490 // Normally we snap TOPDOC like this so that things work nicely if
2491 // HITSPERPAGE is in a <select> or on radio buttons. If we're
2492 // postprocessing the output of omega and want variable sized pages,
2493 // this is unhelpful.
2494 bool raw_search = false;
2495 val = cgi_params.find("RAWSEARCH");
2496 if (val != cgi_params.end()) {
2497 raw_search = bool(atol(val->second.c_str()));
2500 if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2503 if (!discard_rset) {
2504 // put documents marked as relevant into the rset
2505 g = cgi_params.equal_range("R");
2506 for (MCI i = g.first; i != g.second; i++) {
2507 const string & value = i->second;
2508 for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2509 while (value[j] == '.') ++j;
2510 Xapian::docid d = atoi(value.c_str() + j);
2511 if (d) {
2512 rset.add_document(d);
2513 ticked[d] = true;
2520 // run query if we haven't already
2521 static void
2522 ensure_match()
2524 if (done_query) return;
2526 secs = RealTime::now();
2527 run_query();
2528 if (secs != -1)
2529 secs = RealTime::now() - secs;
2531 done_query = true;
2532 last = mset.get_matches_lower_bound();
2533 if (last == 0) {
2534 // Otherwise topdoc ends up being -6 if it's non-zero!
2535 topdoc = 0;
2536 } else {
2537 if (topdoc >= last)
2538 topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2539 // last is the count of documents up to the end of the current page
2540 // (as returned by $last)
2541 if (topdoc + hits_per_page < last)
2542 last = topdoc + hits_per_page;
2546 // OmegaExpandDecider methods.
2548 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2549 set<string> * querytermset)
2550 : db(db_)
2552 // We'll want the stemmer for testing matches anyway.
2553 if (!stemmer)
2554 stemmer = new Xapian::Stem(option["stemmer"]);
2555 if (querytermset) {
2556 set<string>::const_iterator i;
2557 for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2558 string term(*i);
2559 if (term.empty()) continue;
2561 unsigned char ch = term[0];
2562 bool stemmed = (ch == 'Z');
2563 if (stemmed) {
2564 term.erase(0, 1);
2565 if (term.empty()) continue;
2566 ch = term[0];
2569 if (C_isupper(ch)) {
2570 string prefix;
2571 size_t prefix_len = prefix_from_term(prefix, term);
2572 term.erase(0, prefix_len);
2575 if (!stemmed) term = (*stemmer)(term);
2577 exclude_stems.insert(term);
2582 bool
2583 OmegaExpandDecider::operator()(const string & term) const
2585 unsigned char ch = term[0];
2587 // Reject terms with a prefix.
2588 if (C_isupper(ch)) return false;
2591 MyStopper stopper;
2592 // Don't suggest stopwords.
2593 if (stopper(term)) return false;
2596 // Reject small numbers.
2597 if (term.size() < 4 && C_isdigit(ch)) return false;
2599 // Reject terms containing a space.
2600 if (term.find(' ') != string::npos) return false;
2602 // Skip terms with stems in the exclude_stems set, to avoid suggesting
2603 // terms which are already in the query in some form.
2604 string stem = (*stemmer)(term);
2605 if (exclude_stems.find(stem) != exclude_stems.end())
2606 return false;
2608 // Ignore terms that only occur once (hapaxes) since they aren't
2609 // useful for finding related documents - they only occur in a
2610 // document that's already been marked as relevant.
2611 // FIXME: add an expand option to ignore terms where
2612 // termfreq == rtermfreq.
2613 if (db.get_termfreq(term) <= 1) return false;
2615 return true;