query.cc: Use += to build up strings (which should be O(n)), rather
[xapian.git] / xapian-applications / omega / query.cc
blob403b3d9aa5b0f1ec81376cbe383492be5f21ba5a
1 /* query.cc: query executor for omega
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002 Intercede 1749 Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015 Olly Betts
8 * Copyright 2008 Thomas Viehmann
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 * USA
26 #include <config.h>
28 // If we're building against git after the expand API changed but before the
29 // version gets bumped to 1.3.2, we'll get a deprecation warning from
30 // get_eset() unless we suppress such warnings here.
31 #define XAPIAN_DEPRECATED(D) D
33 #include <algorithm>
34 #include <iostream>
35 #include <map>
36 #include <set>
37 #include <vector>
39 #include <cassert>
40 #include <cctype>
41 #include "safeerrno.h"
42 #include <stdio.h>
43 #include <cstdlib>
44 #include <cstring>
45 #include "strcasecmp.h"
46 #include <ctime>
48 #include "safeunistd.h"
49 #include <sys/types.h>
50 #include "safesysstat.h"
51 #include "safefcntl.h"
53 #include "realtime.h"
55 #include <cdb.h>
57 #include "date.h"
58 #include "datematchdecider.h"
59 #include "jsonescape.h"
60 #include "utils.h"
61 #include "omega.h"
62 #include "query.h"
63 #include "cgiparam.h"
64 #include "loadfile.h"
65 #include "sample.h"
66 #include "str.h"
67 #include "stringutils.h"
68 #include "transform.h"
69 #include "urldecode.h"
70 #include "urlencode.h"
71 #include "unixperm.h"
72 #include "values.h"
73 #include "weight.h"
74 #include "expand.h"
76 #include <xapian.h>
78 using namespace std;
80 using Xapian::Utf8Iterator;
82 using Xapian::Unicode::is_wordchar;
84 #ifndef SNPRINTF
85 #include <cstdarg>
// Fallback used when the build does not provide SNPRINTF: formats with
// vsprintf() and aborts the process if the output did not fit in the buffer.
//
// Returns the value returned by vsprintf().
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    // Plant a NUL in the final byte as a sentinel - if it is no longer NUL
    // after formatting then vsprintf() wrote up to (or past) the buffer end.
    str[size - 1] = '\0';

    va_list args;
    va_start(args, format);
    const int written = std::vsprintf(str, format, args);
    const bool overflowed =
        str[size - 1] != '\0' || written < 0 || size_t(written) >= size;
    if (overflowed)
        std::abort(); /* Overflowed! */
    va_end(args);
    return written;
}
99 #else
100 #define my_snprintf SNPRINTF
101 #endif
103 static bool query_parsed = false;
104 static bool done_query = false;
105 static Xapian::docid last = 0;
107 static Xapian::MSet mset;
109 static map<Xapian::docid, bool> ticked;
111 static void ensure_query_parsed();
112 static void ensure_match();
114 static Xapian::Query query;
115 //static string url_query_string;
116 Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
118 static Xapian::QueryParser qp;
119 static Xapian::NumberValueRangeProcessor * size_vrp = NULL;
120 static Xapian::Stem *stemmer = NULL;
122 static string eval_file(const string &fmtfile);
124 static set<string> termset;
126 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
127 static map<string, string> termprefix_to_userprefix;
129 static string queryterms;
131 static string error_msg;
133 static double secs = -1;
135 static const char DEFAULT_LOG_ENTRY[] =
136 "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
137 "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
138 "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
139 "$dbname\t"
140 "$query\t"
141 "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
143 class MyStopper : public Xapian::Stopper {
144 public:
145 bool operator()(const string &t) const {
146 switch (t[0]) {
147 case 'a':
148 return (t == "a" || t == "about" || t == "an" || t == "and" ||
149 t == "are" || t == "as" || t == "at");
150 case 'b':
151 return (t == "be" || t == "by");
152 case 'e':
153 return (t == "en");
154 case 'f':
155 return (t == "for" || t == "from");
156 case 'h':
157 return (t == "how");
158 case 'i':
159 return (t == "i" || t == "in" || t == "is" || t == "it");
160 case 'o':
161 return (t == "of" || t == "on" || t == "or");
162 case 't':
163 return (t == "that" || t == "the" || t == "this" || t == "to");
164 case 'w':
165 return (t == "was" || t == "what" || t == "when" ||
166 t == "where" || t == "which" || t == "who" ||
167 t == "why" || t == "will" || t == "with");
168 case 'y':
169 return (t == "you" || t == "your");
170 default:
171 return false;
176 static size_t
177 prefix_from_term(string &prefix, const string &term)
179 if (term.empty()) {
180 prefix.resize(0);
181 return 0;
183 if (term[0] == 'X') {
184 const string::const_iterator begin = term.begin();
185 string::const_iterator i = begin + 1;
186 while (i != term.end() && C_isupper(*i)) ++i;
187 prefix.assign(begin, i);
188 if (i != term.end() && *i == ':') ++i;
189 return i - begin;
192 prefix = term[0];
193 return 1;
// Reject any name containing ".." so that format names, log file names, etc
// can't be used to reach files such as "../../etc/passwd".
//
// Returns true if filename is acceptable (contains no ".." anywhere).
//
// FIXME: make this check more exact ("foo..bar" is safe)
// FIXME: log when this check fails
static bool
vet_filename(const std::string &filename)
{
    return filename.find("..") == std::string::npos;
}
207 // Heuristics:
208 // * If any terms have been removed, it's a "fresh query" so we discard any
209 // relevance judgements
210 // * If all previous terms are there but more have been added then we keep
211 // the relevance judgements, but return the first page of hits
213 // NEW_QUERY entirely new query
214 // SAME_QUERY unchanged query
215 // EXTENDED_QUERY new query, but based on the old one
216 // BAD_QUERY parse error (message in error_msg)
217 typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
219 static multimap<string, string> probabilistic_query;
221 void
222 set_probabilistic_query(const string & prefix, const string & s)
224 string query_string = s;
225 // Strip leading and trailing whitespace from query_string.
226 trim(query_string);
227 if (!query_string.empty())
228 probabilistic_query.insert(make_pair(prefix, query_string));
231 static unsigned
232 read_qp_flags(const string & opt_pfx, unsigned f)
234 map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
235 for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
236 unsigned mask = 0;
237 const char * s = i->first.c_str() + opt_pfx.size();
238 switch (s[0]) {
239 case 'a':
240 if (strcmp(s, "auto_multiword_synonyms") == 0) {
241 mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
242 break;
244 if (strcmp(s, "auto_synonyms") == 0) {
245 mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
246 break;
248 break;
249 case 'b':
250 if (strcmp(s, "boolean") == 0) {
251 mask = Xapian::QueryParser::FLAG_BOOLEAN;
252 break;
254 if (strcmp(s, "boolean_any_case") == 0) {
255 mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
256 break;
258 break;
259 case 'd':
260 if (strcmp(s, "default") == 0) {
261 mask = Xapian::QueryParser::FLAG_DEFAULT;
262 break;
264 break;
265 case 'l':
266 if (strcmp(s, "lovehate") == 0) {
267 mask = Xapian::QueryParser::FLAG_LOVEHATE;
268 break;
270 break;
271 case 'p':
272 if (strcmp(s, "partial") == 0) {
273 mask = Xapian::QueryParser::FLAG_PARTIAL;
274 break;
276 if (strcmp(s, "phrase") == 0) {
277 mask = Xapian::QueryParser::FLAG_PHRASE;
278 break;
280 if (strcmp(s, "pure_not") == 0) {
281 mask = Xapian::QueryParser::FLAG_PURE_NOT;
282 break;
284 break;
285 case 's':
286 if (strcmp(s, "spelling_correction") == 0) {
287 mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
288 break;
290 if (strcmp(s, "synonym") == 0) {
291 mask = Xapian::QueryParser::FLAG_SYNONYM;
292 break;
294 break;
295 case 'w':
296 if (strcmp(s, "wildcard") == 0) {
297 mask = Xapian::QueryParser::FLAG_WILDCARD;
298 break;
300 break;
303 if (i->second.empty()) {
304 f &= ~mask;
305 } else {
306 f |= mask;
309 return f;
312 static querytype
313 set_probabilistic(const string &oldp)
315 // Parse the query string.
316 qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
317 qp.set_stopper(new MyStopper());
318 qp.set_default_op(default_op);
319 qp.set_database(db);
320 // FIXME: provide a custom VRP which handles size:10..20K, etc.
321 if (!size_vrp)
322 size_vrp = new Xapian::NumberValueRangeProcessor(VALUE_SIZE, "size:",
323 true);
324 qp.add_valuerangeprocessor(size_vrp);
325 map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
326 for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
327 string user_prefix(pfx->first, 7);
328 const string & term_pfx_list = pfx->second;
329 string::size_type i = 0;
330 do {
331 string::size_type i0 = i;
332 i = term_pfx_list.find('\t', i);
333 const string & term_pfx = term_pfx_list.substr(i0, i - i0);
334 qp.add_prefix(user_prefix, term_pfx);
335 // std::map::insert() won't overwrite an existing entry, so we'll
336 // prefer the first user_prefix for which a particular term prefix
337 // is specified.
338 termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
339 } while (++i);
341 pfx = option.lower_bound("boolprefix,");
342 for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
343 string user_prefix = pfx->first.substr(11);
344 qp.add_boolean_prefix(user_prefix, pfx->second);
345 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
348 try {
349 unsigned default_flags = read_qp_flags("flag_", 0);
350 if (option["spelling"] == "true")
351 default_flags |= qp.FLAG_SPELLING_CORRECTION;
353 vector<Xapian::Query> queries;
354 queries.reserve(probabilistic_query.size());
356 multimap<string, string>::const_iterator j;
357 for (j = probabilistic_query.begin();
358 j != probabilistic_query.end();
359 ++j) {
360 const string & prefix = j->first;
362 // Choose the stemmer to use for this input.
363 string stemlang = option[prefix + ":stemmer"];
364 if (stemlang.empty())
365 stemlang = option["stemmer"];
366 qp.set_stemmer(Xapian::Stem(stemlang));
368 // Work out the flags to use for this input.
369 unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
371 const string & query_string = j->second;
372 Xapian::Query q = qp.parse_query(query_string, f, prefix);
373 if (!q.empty())
374 queries.push_back(q);
376 query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
377 } catch (Xapian::QueryParserError &e) {
378 error_msg = e.get_msg();
379 return BAD_QUERY;
382 Xapian::termcount n_new_terms = 0;
383 for (Xapian::TermIterator i = query.get_terms_begin();
384 i != query.get_terms_end(); ++i) {
385 if (termset.find(*i) == termset.end()) {
386 termset.insert(*i);
387 if (!queryterms.empty()) queryterms += '\t';
388 queryterms += *i;
390 n_new_terms++;
393 // Check new query against the previous one
394 if (oldp.empty()) {
395 // If oldp was empty that means there were no probabilistic terms
396 // before, so if there are now this is a new query.
397 return n_new_terms ? NEW_QUERY : SAME_QUERY;
400 // The terms in oldp are separated by tabs.
401 const char oldp_separator = '\t';
402 size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
404 // short-cut: if the new query has fewer terms, it must be a new one
405 if (n_new_terms < n_old_terms) return NEW_QUERY;
407 const char *term = oldp.c_str();
408 const char *pend;
409 while ((pend = strchr(term, oldp_separator)) != NULL) {
410 if (termset.find(string(term, pend - term)) == termset.end())
411 return NEW_QUERY;
412 term = pend + 1;
414 if (*term) {
415 if (termset.find(string(term)) == termset.end())
416 return NEW_QUERY;
419 // Use termset.size() rather than n_new_terms so we correctly handle
420 // the case when the query has repeated terms.
421 // This works wrongly in the case when the user extends the query
422 // by adding a term already in it, but that's unlikely and the behaviour
423 // isn't too bad (we just don't reset page 1). We also mishandle a few
424 // other obscure cases e.g. adding quotes to turn a query into a phrase.
425 if (termset.size() > n_old_terms) return EXTENDED_QUERY;
426 return SAME_QUERY;
429 static multimap<string, string> filter_map;
431 typedef multimap<string, string>::const_iterator FMCI;
433 void add_bterm(const string &term) {
434 string prefix;
435 if (prefix_from_term(prefix, term) > 0)
436 filter_map.insert(multimap<string, string>::value_type(prefix, term));
439 static void
440 run_query()
442 bool force_boolean = false;
443 if (!filter_map.empty()) {
444 // OR together filters with the same prefix, then AND together
445 vector<Xapian::Query> filter_vec;
446 vector<string> or_vec;
447 string current;
448 for (FMCI i = filter_map.begin(); ; i++) {
449 bool over = (i == filter_map.end());
450 if (over || i->first != current) {
451 switch (or_vec.size()) {
452 case 0:
453 break;
454 case 1:
455 filter_vec.push_back(Xapian::Query(or_vec[0]));
456 break;
457 default:
458 filter_vec.push_back(Xapian::Query(Xapian::Query::OP_OR,
459 or_vec.begin(),
460 or_vec.end()));
461 break;
463 or_vec.clear();
464 if (over) break;
465 current = i->first;
467 or_vec.push_back(i->second);
470 Xapian::Query filter(Xapian::Query::OP_AND,
471 filter_vec.begin(), filter_vec.end());
473 if (query.empty()) {
474 // If no probabilistic query is provided then promote the filters
475 // to be THE query - filtering an empty query will give no
476 // matches.
477 std::swap(query, filter);
478 if (enquire) force_boolean = true;
479 } else {
480 query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
484 Xapian::MatchDecider * mdecider = NULL;
485 if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
486 MCI i = cgi_params.find("DATEVALUE");
487 if (i != cgi_params.end()) {
488 Xapian::valueno datevalue = string_to_int(i->second);
489 mdecider = new DateMatchDecider(datevalue, date_start, date_end, date_span);
490 } else {
491 Xapian::Query date_filter(Xapian::Query::OP_OR,
492 date_range_filter(date_start, date_end,
493 date_span),
494 Xapian::Query("Dlatest"));
496 // If no probabilistic query is provided then promote the daterange
497 // filter to be THE query instead of filtering an empty query.
498 if (query.empty()) {
499 query = date_filter;
500 } else {
501 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
506 if (!enquire || !error_msg.empty()) return;
508 set_weighting_scheme(*enquire, option, force_boolean);
510 enquire->set_cutoff(threshold);
512 if (sort_key != Xapian::BAD_VALUENO) {
513 if (sort_after) {
514 enquire->set_sort_by_relevance_then_value(sort_key, sort_ascending);
515 } else {
516 enquire->set_sort_by_value_then_relevance(sort_key, sort_ascending);
520 enquire->set_docid_order(docid_order);
522 if (collapse) {
523 enquire->set_collapse_key(collapse_key);
526 if (!query.empty()) {
527 #if 0
528 // FIXME: If we start doing permissions checks based on $REMOTE_USER
529 // we're going to break some existing setups if users upgrade. We
530 // probably want a way to set this from OmegaScript.
531 const char * remote_user = getenv("REMOTE_USER");
532 if (remote_user)
533 apply_unix_permissions(query, remote_user);
534 #endif
536 enquire->set_query(query);
537 // We could use the value of topdoc as first parameter, but we
538 // need to know the first few items in the mset to fake a
539 // relevance set for topterms.
541 // If min_hits isn't set, check at least one extra result so we
542 // know if we've reached the end of the matches or not - then we
543 // can avoid offering a "next" button which leads to an empty page.
544 mset = enquire->get_mset(0, topdoc + hits_per_page,
545 topdoc + max(hits_per_page + 1, min_hits),
546 &rset, mdecider);
// Return a copy of str with the characters < > & " replaced by the entities
// &lt; &gt; &amp; &quot; respectively.  All other characters are copied
// through unchanged.
std::string
html_escape(const std::string &str)
{
    std::string res;
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
        const char ch = *it;
        if (ch == '<') {
            res += "&lt;";
        } else if (ch == '>') {
            res += "&gt;";
        } else if (ch == '&') {
            res += "&amp;";
        } else if (ch == '"') {
            res += "&quot;";
        } else {
            res += ch;
        }
    }
    return res;
}
// Return a copy of str with everything from each '<' up to and including the
// next '>' removed.  An unmatched '<' removes the rest of the string; a '>'
// outside of a tag is dropped.
static std::string
html_strip(const std::string &str)
{
    std::string res;
    bool in_tag = false;
    for (std::string::size_type k = 0; k != str.size(); ++k) {
        const char ch = str[k];
        if (ch == '<') {
            in_tag = true;
        } else if (ch == '>') {
            in_tag = false;
        } else if (!in_tag) {
            res += ch;
        }
    }
    return res;
}
// Look for word in the tab-separated list.
//
// Returns the zero-based index of the first entry which is exactly equal to
// word, or -1 if there is no such entry.
//
// FIXME split list into hash or map and use that rather than linear lookup?
static int word_in_list(const std::string& word, const std::string& list)
{
    int index = 0;
    std::string::size_type start = 0;
    while (true) {
        const std::string::size_type tab = list.find('\t', start);
        const bool last_entry = (tab == std::string::npos);
        const std::string::size_type len =
            (last_entry ? list.size() : tab) - start;
        // compare() gives 0 only if the entry and word have the same length
        // and the same contents.
        if (list.compare(start, len, word) == 0) return index;
        if (last_entry) return -1;
        start = tab + 1;
        ++index;
    }
}
619 // Not a character in an identifier
620 inline static bool
621 p_notid(unsigned int c)
623 return !C_isalnum(c) && c != '_';
626 // Not a character in an HTML tag name
627 inline static bool
628 p_nottag(unsigned int c)
630 return !C_isalnum(c) && c != '.' && c != '-';
633 // FIXME: shares algorithm with indextext.cc!
634 static string
635 html_highlight(const string &s, const string &list,
636 const string &bra, const string &ket)
638 if (!stemmer) {
639 stemmer = new Xapian::Stem(option["stemmer"]);
642 string res;
644 Utf8Iterator j(s);
645 const Utf8Iterator s_end;
646 while (true) {
647 Utf8Iterator first = j;
648 while (first != s_end && !is_wordchar(*first)) ++first;
649 if (first == s_end) break;
650 Utf8Iterator term_end;
651 string term;
652 string word;
653 const char *l = j.raw();
654 if (*first < 128 && C_isupper(*first)) {
655 j = first;
656 Xapian::Unicode::append_utf8(term, *j);
657 while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
658 Xapian::Unicode::append_utf8(term, *j);
660 if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
661 term.resize(0);
663 term_end = j;
665 if (term.empty()) {
666 j = first;
667 while (is_wordchar(*j)) {
668 Xapian::Unicode::append_utf8(term, *j);
669 ++j;
670 if (j == s_end) break;
671 if (*j == '&' || *j == '\'') {
672 Utf8Iterator next = j;
673 ++next;
674 if (next == s_end || !is_wordchar(*next)) break;
675 term += *j;
676 j = next;
679 term_end = j;
680 if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
681 string::size_type len = term.length();
682 if (*j == '#') {
683 term += '#';
684 do { ++j; } while (j != s_end && *j == '#');
685 } else {
686 while (j != s_end && (*j == '+' || *j == '-')) {
687 Xapian::Unicode::append_utf8(term, *j);
688 ++j;
691 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
692 term.resize(len);
693 } else {
694 term_end = j;
698 j = term_end;
699 term = Xapian::Unicode::tolower(term);
700 int match = word_in_list(term, list);
701 if (match == -1) {
702 string stem = "Z";
703 stem += (*stemmer)(term);
704 match = word_in_list(stem, list);
706 if (match >= 0) {
707 res += html_escape(string(l, first.raw() - l));
708 if (!bra.empty()) {
709 res += bra;
710 } else {
711 static const char * colours[] = {
712 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
713 "990000", "009900", "996600", "006699", "990099"
715 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
716 const char * bg = colours[idx];
717 if (strchr(bg, 'f')) {
718 res += "<b style=\"color:black;background-color:#";
719 } else {
720 res += "<b style=\"color:white;background-color:#";
722 res += bg;
723 res += "\">";
725 word = string(first.raw(), j.raw() - first.raw());
726 res += html_escape(word);
727 if (!bra.empty()) {
728 res += ket;
729 } else {
730 res += "</b>";
732 } else {
733 res += html_escape(string(l, j.raw() - l));
736 if (j != s_end) res += html_escape(string(j.raw(), j.left()));
737 return res;
740 #if 0
741 static void
742 print_query_string(const char *after)
744 if (after && strncmp(after, "&B=", 3) == 0) {
745 char prefix = after[3];
746 string::size_type start = 0, amp = 0;
747 while (true) {
748 amp = url_query_string.find('&', amp);
749 if (amp == string::npos) {
750 cout << url_query_string.substr(start);
751 return;
753 amp++;
754 while (url_query_string[amp] == 'B' &&
755 url_query_string[amp + 1] == '=' &&
756 url_query_string[amp + 2] == prefix) {
757 cout << url_query_string.substr(start, amp - start - 1);
758 start = url_query_string.find('&', amp + 3);
759 if (start == string::npos) return;
760 amp = start + 1;
764 cout << url_query_string;
766 #endif
768 class Fields {
769 mutable Xapian::docid did_cached;
770 mutable map<string, string> fields;
772 void read_fields(Xapian::docid did) const;
774 public:
775 Fields() : did_cached(0) { }
777 const string & get_field(Xapian::docid did, const string & field) const {
778 if (did != did_cached) read_fields(did);
779 return fields[field];
783 void
784 Fields::read_fields(Xapian::docid did) const
786 fields.clear();
787 did_cached = did;
788 const string & data = db.get_document(did).get_data();
790 // Parse document data.
791 string::size_type i = 0;
792 const string & names = option["fieldnames"];
793 if (!names.empty()) {
794 // Each line is a field, with fieldnames taken from corresponding
795 // entries in the tab-separated list specified by $opt{fieldnames}.
796 string::size_type n = 0;
797 do {
798 string::size_type n0 = n;
799 n = names.find('\t', n);
800 string::size_type i0 = i;
801 i = data.find('\n', i);
802 fields.insert(make_pair(names.substr(n0, n - n0),
803 data.substr(i0, i - i0)));
804 } while (++n && ++i);
805 } else {
806 // Each line is a field, in the format NAME=VALUE. We assume the field
807 // name doesn't contain an "=". Lines without an "=" are currently
808 // just ignored.
809 do {
810 string::size_type i0 = i;
811 i = data.find('\n', i);
812 string line = data.substr(i0, i - i0);
813 string::size_type j = line.find('=');
814 if (j != string::npos) {
815 string & value = fields[line.substr(0, j)];
816 if (!value.empty()) value += '\t';
817 value += line.substr(j + 1);
819 } while (++i);
823 static Fields fields;
824 static Xapian::docid q0;
825 static Xapian::doccount hit_no;
826 static int percent;
827 static double weight;
828 static Xapian::doccount collapsed;
830 static string print_caption(const string &fmt, const vector<string> &param);
// Tags identifying each OmegaScript command; func_tab maps command names to
// these.  The enumerators are listed several per line, grouped by initial
// letter, in the same order as before.
enum tagval {
    CMD_,
    CMD_add, CMD_addfilter, CMD_allterms, CMD_and,
    CMD_cgi, CMD_cgilist, CMD_collapsed,
    CMD_date, CMD_dbname, CMD_dbsize, CMD_def, CMD_defaultop, CMD_div,
    CMD_eq, CMD_emptydocs, CMD_env, CMD_error,
    CMD_field, CMD_filesize, CMD_filters, CMD_filterterms, CMD_find,
    CMD_fmt, CMD_freq,
    CMD_ge, CMD_gt,
    CMD_highlight, CMD_hit, CMD_hitlist, CMD_hitsperpage, CMD_hostname,
    CMD_html, CMD_htmlstrip, CMD_httpheader,
    CMD_id, CMD_if, CMD_include,
    CMD_json, CMD_jsonarray,
    CMD_last, CMD_lastpage, CMD_le, CMD_length, CMD_list, CMD_log,
    CMD_lookup, CMD_lower, CMD_lt,
    CMD_map, CMD_max, CMD_min, CMD_mod, CMD_msize, CMD_msizeexact,
    CMD_mul, CMD_muldiv,
    CMD_ne, CMD_nice, CMD_not, CMD_now,
    CMD_opt, CMD_or,
    CMD_pack, CMD_percentage, CMD_prettyterm, CMD_prettyurl,
    CMD_query, CMD_querydescription, CMD_queryterms,
    CMD_range, CMD_record, CMD_relevant, CMD_relevants,
    CMD_score, CMD_set, CMD_setmap, CMD_setrelevant, CMD_slice,
    CMD_snippet, CMD_split, CMD_stoplist, CMD_sub, CMD_substr,
    CMD_suggestion,
    CMD_terms, CMD_thispage, CMD_time, CMD_topdoc, CMD_topterms,
    CMD_transform, CMD_truncate,
    CMD_uniq, CMD_unpack, CMD_unstem, CMD_upper, CMD_url,
    CMD_value, CMD_version,
    CMD_weight,
    CMD_MACRO // special tag for macro evaluation
};
// Attributes of an OmegaScript command (see func_tab for the values).
struct func_attrib {
    // The CMD_* tag identifying the command.
    int tag;
    // Argument count limits.  eval() doesn't split the argument text on
    // commas when minargs is -1; NOTE(review): -1 for maxargs presumably
    // means "no upper limit" - confirm in eval().
    int minargs;
    int maxargs;
    // NOTE(review): presumably how many arguments get evaluated before the
    // command is run (-1 for all of them) - confirm in eval().
    int evalargs;
    // 0, 'M' or 'Q'.  NOTE(review): presumably 'M' requests that the match
    // has been run and 'Q' that the query has been parsed - confirm in
    // eval().
    char ensure;
};
942 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
943 struct func_desc {
944 const char *name;
945 struct func_attrib a;
948 #define N -1
949 #define M 'M'
950 #define Q 'Q'
951 // NB when adding a new command which ensures M or Q, update the list in
952 // docs/omegascript.rst
953 static struct func_desc func_tab[] = {
954 //name minargs maxargs evalargs ensure
955 {"",{CMD_, N, N, 0, 0}},// commented out code
956 T(add, 0, N, N, 0), // add a list of numbers
957 T(addfilter, 1, 1, N, 0), // add filter term
958 T(allterms, 0, 1, N, 0), // list of all terms matching document
959 T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
960 T(cgi, 1, 1, N, 0), // return cgi parameter value
961 T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
962 T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
963 T(date, 1, 2, N, 0), // convert time_t to strftime format
964 // (default: YYYY-MM-DD)
965 T(dbname, 0, 0, N, 0), // database name
966 T(dbsize, 0, 0, N, 0), // database size (# of documents)
967 T(def, 2, 2, 1, 0), // define a macro
968 T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
969 T(div, 2, 2, N, 0), // integer divide
970 T(emptydocs, 0, 1, N, 0), // list of empty documents
971 T(env, 1, 1, N, 0), // environment variable
972 T(error, 0, 0, N, 0), // error message
973 T(eq, 2, 2, N, 0), // test equality
974 T(field, 1, 2, N, 0), // lookup field in record
975 T(filesize, 1, 1, N, 0), // pretty printed filesize
976 T(filters, 0, 0, N, 0), // serialisation of current filters
977 T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
978 T(find, 2, 2, N, 0), // find entry in list
979 T(fmt, 0, 0, N, 0), // name of current format
980 T(freq, 1, 1, N, 0), // frequency of a term
981 T(ge, 2, 2, N, 0), // test >=
982 T(gt, 2, 2, N, 0), // test >
983 T(highlight, 2, 4, N, 0), // html escape and highlight words from list
984 T(hit, 0, 0, N, 0), // hit number of current mset entry (starting
985 // from 0
986 T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
987 T(hitsperpage, 0, 0, N, 0), // hits per page
988 T(hostname, 1, 1, N, 0), // extract hostname from URL
989 T(html, 1, 1, N, 0), // html escape string (<>&")
990 T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
991 T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
992 T(id, 0, 0, N, 0), // docid of current doc
993 T(if, 2, 3, 1, 0), // conditional
994 T(include, 1, 1, 1, 0), // include another file
995 T(json, 1, 1, N, 0), // JSON string escaping
996 T(jsonarray, 1, 1, N, 0), // Format list as a JSON array of strings
997 T(last, 0, 0, N, M), // hit number one beyond end of current page
998 T(lastpage, 0, 0, N, M), // number of last hit page
999 T(le, 2, 2, N, 0), // test <=
1000 T(length, 1, 1, N, 0), // length of list
1001 T(list, 2, 5, N, 0), // pretty print list
1002 T(log, 1, 2, 1, 0), // create a log entry
1003 T(lookup, 2, 2, N, 0), // lookup in named cdb file
1004 T(lower, 1, 1, N, 0), // convert string to lower case
1005 T(lt, 2, 2, N, 0), // test <
1006 T(map, 1, 2, 1, 0), // map a list into another list
1007 T(max, 1, N, N, 0), // maximum of a list of values
1008 T(min, 1, N, N, 0), // minimum of a list of values
1009 T(mod, 2, 2, N, 0), // integer modulus
1010 T(msize, 0, 0, N, M), // number of matches
1011 T(msizeexact, 0, 0, N, M), // is $msize exact?
1012 T(mul, 2, N, N, 0), // multiply a list of numbers
1013 T(muldiv, 3, 3, N, 0), // calculate A*B/C
1014 T(ne, 2, 2, N, 0), // test not equal
1015 T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
1016 T(not, 1, 1, N, 0), // logical not
1017 T(now, 0, 0, N, 0), // current date/time as a time_t
1018 T(opt, 1, 2, N, 0), // lookup an option value
1019 T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
1020 T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1021 T(percentage, 0, 0, N, 0), // percentage score of current hit
1022 T(prettyterm, 1, 1, N, Q), // pretty print term name
1023 T(prettyurl, 1, 1, N, 0), // pretty version of URL
1024 T(query, 0, 1, N, Q), // query
1025 T(querydescription,0, 0, N, Q), // query.get_description()
1026 T(queryterms, 0, 0, N, Q), // list of query terms
1027 T(range, 2, 2, N, 0), // return list of values between start and end
1028 T(record, 0, 1, N, 0), // record contents of document
1029 T(relevant, 0, 1, N, Q), // is document relevant?
1030 T(relevants, 0, 0, N, Q), // return list of relevant documents
1031 T(score, 0, 0, N, 0), // score (0-10) of current hit
1032 T(set, 2, 2, N, 0), // set option value
1033 T(setmap, 1, N, N, 0), // set map of option values
1034 T(setrelevant, 0, 1, N, Q), // set rset
1035 T(slice, 2, 2, N, 0), // slice a list using a second list
1036 T(snippet, 1, 2, N, 0), // generate snippet from text
1037 T(split, 1, 2, N, 0), // split a string to give a list
1038 T(stoplist, 0, 0, N, Q), // return list of stopped terms
1039 T(sub, 2, 2, N, 0), // subtract
1040 T(substr, 2, 3, N, 0), // substring
1041 T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
1042 T(terms, 0, 0, N, M), // list of matching terms
1043 T(thispage, 0, 0, N, M), // page number of current page
1044 T(time, 0, 0, N, M), // how long the match took (in seconds)
1045 T(topdoc, 0, 0, N, M), // first document on current page of hit list
1046 // (counting from 0)
1047 T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
1048 // (default 16)
1049 T(transform, 3, 3, N, 0), // transform with a regexp
1050 T(truncate, 2, 4, N, 0), // truncate after a word
1051 T(uniq, 1, 1, N, 0), // removed duplicates from a sorted list
1052 T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
1053 T(unstem, 1, 1, N, Q), // return list of probabilistic terms from
1054 // the query which stemmed to this term
1055 T(upper, 1, 1, N, 0), // convert string to upper case
1056 T(url, 1, 1, N, 0), // url encode argument
1057 T(value, 1, 2, N, 0), // return document value
1058 T(version, 0, 0, N, 0), // omega version string
1059 T(weight, 0, 0, N, 0), // weight of the current hit
1060 { NULL,{0, 0, 0, 0, 0}}
1063 #undef T // Leaving T defined screws up Sun's C++ compiler!
1065 static vector<string> macros;
1067 // Call write() repeatedly until all data is written or we get a
1068 // non-recoverable error.
1069 static ssize_t
1070 write_all(int fd, const char * buf, size_t count)
1072 while (count) {
1073 ssize_t r = write(fd, buf, count);
1074 if (rare(r < 0)) {
1075 if (errno == EINTR) continue;
1076 return r;
1078 buf += r;
1079 count -= r;
1081 return 0;
1084 static string
1085 eval(const string &fmt, const vector<string> &param)
1087 static map<string, const struct func_attrib *> func_map;
1088 if (func_map.empty()) {
1089 struct func_desc *p;
1090 for (p = func_tab; p->name != NULL; p++) {
1091 func_map[string(p->name)] = &(p->a);
1094 string res;
1095 string::size_type p = 0, q;
1096 while ((q = fmt.find('$', p)) != string::npos) try {
1097 res += fmt.substr(p, q - p);
1098 string::size_type code_start = q; // note down for error reporting
1099 q++;
1100 if (q >= fmt.size()) break;
1101 unsigned char ch = fmt[q];
1102 switch (ch) {
1103 // Magic sequences:
1104 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1105 case '$':
1106 res += '$';
1107 p = q + 1;
1108 continue;
1109 case '(':
1110 res += '{';
1111 p = q + 1;
1112 continue;
1113 case ')':
1114 res += '}';
1115 p = q + 1;
1116 continue;
1117 case '.':
1118 res += ',';
1119 p = q + 1;
1120 continue;
1121 case '_':
1122 ch = '0';
1123 // FALL THRU
1124 case '1': case '2': case '3': case '4': case '5':
1125 case '6': case '7': case '8': case '9':
1126 ch -= '0';
1127 if (ch < param.size()) res += param[ch];
1128 p = q + 1;
1129 continue;
1130 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1131 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1132 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1133 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1134 case 'y': case 'z':
1135 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1136 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1137 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1138 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1139 case 'Y': case 'Z':
1140 case '{':
1141 break;
1142 default:
1143 string msg = "Unknown $ code in: $" + fmt.substr(q);
1144 throw msg;
1146 p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1147 string var = fmt.substr(q, p - q);
1148 map<string, const struct func_attrib *>::const_iterator func;
1149 func = func_map.find(var);
1150 if (func == func_map.end()) {
1151 throw "Unknown function '" + var + "'";
1153 vector<string> args;
1154 if (fmt[p] == '{') {
1155 q = p + 1;
1156 int nest = 1;
1157 while (true) {
1158 p = fmt.find_first_of(",{}", p + 1);
1159 if (p == string::npos)
1160 throw "missing } in " + fmt.substr(code_start);
1161 if (fmt[p] == '{') {
1162 ++nest;
1163 } else {
1164 if (nest == 1) {
1165 // should we split the args
1166 if (func->second->minargs != N) {
1167 args.push_back(fmt.substr(q, p - q));
1168 q = p + 1;
1171 if (fmt[p] == '}' && --nest == 0) break;
1174 if (func->second->minargs == N)
1175 args.push_back(fmt.substr(q, p - q));
1176 p++;
1179 if (func->second->minargs != N) {
1180 if ((int)args.size() < func->second->minargs)
1181 throw "too few arguments to $" + var;
1182 if (func->second->maxargs != N &&
1183 (int)args.size() > func->second->maxargs)
1184 throw "too many arguments to $" + var;
1186 vector<string>::size_type n;
1187 if (func->second->evalargs != N)
1188 n = func->second->evalargs;
1189 else
1190 n = args.size();
1192 for (vector<string>::size_type j = 0; j < n; j++)
1193 args[j] = eval(args[j], param);
1195 if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1196 ensure_query_parsed();
1197 if (func->second->ensure == 'M') ensure_match();
1198 string value;
1199 switch (func->second->tag) {
1200 case CMD_:
1201 break;
1202 case CMD_add: {
1203 int total = 0;
1204 vector<string>::const_iterator i;
1205 for (i = args.begin(); i != args.end(); i++)
1206 total += string_to_int(*i);
1207 value = str(total);
1208 break;
1210 case CMD_addfilter:
1211 add_bterm(args[0]);
1212 break;
1213 case CMD_allterms: {
1214 // list of all terms indexing document
1215 int id = q0;
1216 if (!args.empty()) id = string_to_int(args[0]);
1217 Xapian::TermIterator term = db.termlist_begin(id);
1218 for ( ; term != db.termlist_end(id); term++) {
1219 value += *term;
1220 value += '\t';
1223 if (!value.empty()) value.erase(value.size() - 1);
1224 break;
1226 case CMD_and: {
1227 value = "true";
1228 for (vector<string>::const_iterator i = args.begin();
1229 i != args.end(); i++) {
1230 if (eval(*i, param).empty()) {
1231 value.resize(0);
1232 break;
1235 break;
1237 case CMD_cgi: {
1238 MCI i = cgi_params.find(args[0]);
1239 if (i != cgi_params.end()) value = i->second;
1240 break;
1242 case CMD_cgilist: {
1243 pair<MCI, MCI> g;
1244 g = cgi_params.equal_range(args[0]);
1245 for (MCI i = g.first; i != g.second; i++) {
1246 value += i->second;
1247 value += '\t';
1249 if (!value.empty()) value.erase(value.size() - 1);
1250 break;
1252 case CMD_collapsed: {
1253 value = str(collapsed);
1254 break;
1256 case CMD_date:
1257 value = args[0];
1258 if (!value.empty()) {
1259 char buf[64] = "";
1260 time_t date = string_to_int(value);
1261 if (date != (time_t)-1) {
1262 struct tm *then;
1263 then = gmtime(&date);
1264 string date_fmt = "%Y-%m-%d";
1265 if (args.size() > 1) date_fmt = eval(args[1], param);
1266 strftime(buf, sizeof buf, date_fmt.c_str(), then);
1268 value = buf;
1270 break;
1271 case CMD_dbname:
1272 value = dbname;
1273 break;
1274 case CMD_dbsize: {
1275 static Xapian::doccount dbsize;
1276 if (!dbsize) dbsize = db.get_doccount();
1277 value = str(dbsize);
1278 break;
1280 case CMD_def: {
1281 func_attrib *fa = new func_attrib;
1282 fa->tag = CMD_MACRO + macros.size();
1283 fa->minargs = 0;
1284 fa->maxargs = 9;
1285 fa->evalargs = N; // FIXME: or 0?
1286 fa->ensure = 0;
1288 macros.push_back(args[1]);
1289 func_map[args[0]] = fa;
1290 break;
1292 case CMD_defaultop:
1293 if (default_op == Xapian::Query::OP_AND) {
1294 value = "and";
1295 } else {
1296 value = "or";
1298 break;
1299 case CMD_div: {
1300 int denom = string_to_int(args[1]);
1301 if (denom == 0) {
1302 value = "divide by 0";
1303 } else {
1304 value = str(string_to_int(args[0]) /
1305 string_to_int(args[1]));
1307 break;
1309 case CMD_eq:
1310 if (args[0] == args[1]) value = "true";
1311 break;
1312 case CMD_emptydocs: {
1313 string t;
1314 if (!args.empty())
1315 t = args[0];
1316 Xapian::PostingIterator i;
1317 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1318 if (i.get_doclength() != 0) continue;
1319 if (!value.empty()) value += '\t';
1320 value += str(*i);
1322 break;
1324 case CMD_env: {
1325 char *env = getenv(args[0].c_str());
1326 if (env != NULL) value = env;
1327 break;
1329 case CMD_error:
1330 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1331 error_msg = "Database '" + dbname + "' couldn't be opened";
1333 value = error_msg;
1334 break;
1335 case CMD_field: {
1336 Xapian::docid did = q0;
1337 if (args.size() > 1) did = string_to_int(args[1]);
1338 value = fields.get_field(did, args[0]);
1339 break;
1341 case CMD_filesize: {
1342 // FIXME: rounding? i18n?
1343 int size = string_to_int(args[0]);
1344 int intpart = size;
1345 int fraction = -1;
1346 const char * format = 0;
1347 if (size < 0) {
1348 // Negative size -> empty result.
1349 } else if (size == 1) {
1350 format = "%d byte";
1351 } else if (size < 1024) {
1352 format = "%d bytes";
1353 } else {
1354 if (size < 1024*1024) {
1355 format = "%d.%cK";
1356 } else {
1357 size /= 1024;
1358 if (size < 1024*1024) {
1359 format = "%d.%cM";
1360 } else {
1361 size /= 1024;
1362 format = "%d.%cG";
1365 intpart = unsigned(size) / 1024;
1366 fraction = unsigned(size) % 1024;
1368 if (format) {
1369 char buf[200];
1370 int len;
1371 if (fraction == -1) {
1372 len = my_snprintf(buf, sizeof(buf), format, intpart);
1373 } else {
1374 fraction = (fraction * 10 / 1024) + '0';
1375 len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1377 if (len < 0 || (unsigned)len > sizeof(buf)) len = sizeof(buf);
1378 value.assign(buf, len);
1380 break;
1382 case CMD_filters:
1383 value = filters;
1384 break;
1385 case CMD_filterterms: {
1386 Xapian::TermIterator term = db.allterms_begin();
1387 term.skip_to(args[0]);
1388 while (term != db.allterms_end()) {
1389 string t = *term;
1390 if (!startswith(t, args[0])) break;
1391 value += t;
1392 value += '\t';
1393 ++term;
1396 if (!value.empty()) value.erase(value.size() - 1);
1397 break;
1399 case CMD_find: {
1400 string l = args[0], s = args[1];
1401 string::size_type i = 0, j = 0;
1402 size_t count = 0;
1403 while (j != l.size()) {
1404 j = l.find('\t', i);
1405 if (j == string::npos) j = l.size();
1406 if (j - i == s.length()) {
1407 if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1408 value = str(count);
1409 break;
1412 ++count;
1413 i = j + 1;
1415 break;
1417 case CMD_fmt:
1418 value = fmtname;
1419 break;
1420 case CMD_freq:
1421 try {
1422 value = str(mset.get_termfreq(args[0]));
1423 } catch (const Xapian::InvalidOperationError&) {
1424 // An MSet will raise this error if it's empty and not
1425 // associated with a search.
1426 value = str(db.get_termfreq(args[0]));
1428 break;
1429 case CMD_ge:
1430 if (string_to_int(args[0]) >= string_to_int(args[1]))
1431 value = "true";
1432 break;
1433 case CMD_gt:
1434 if (string_to_int(args[0]) > string_to_int(args[1]))
1435 value = "true";
1436 break;
1437 case CMD_highlight: {
1438 string bra, ket;
1439 if (args.size() > 2) {
1440 bra = args[2];
1441 if (args.size() > 3) {
1442 ket = args[3];
1443 } else {
1444 string::const_iterator i;
1445 i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1446 ket = "</";
1447 ket += bra.substr(1, i - bra.begin() - 1);
1448 ket += '>';
1452 value = html_highlight(args[0], args[1], bra, ket);
1453 break;
1455 case CMD_hit:
1456 // 0-based mset index
1457 value = str(hit_no);
1458 break;
1459 case CMD_hitlist:
1460 #if 0
1461 url_query_string = "?DB=";
1462 url_query_string += dbname;
1463 multimap<string, string>::const_iterator j;
1464 for (j = probabilistic_query.begin();
1465 j != probabilistic_query.end();
1466 ++j) {
1467 if (j->first.empty()) {
1468 url_query_string += "&P=";
1469 } else {
1470 url_query_string += "&P."
1471 url_query_string += j->first;
1472 url_query_string += '=';
1474 const char *q = j->second.c_str();
1475 int ch;
1476 while ((ch = *q++) != '\0') {
1477 switch (ch) {
1478 case '+':
1479 url_query_string += "%2b";
1480 break;
1481 case '"':
1482 url_query_string += "%22";
1483 break;
1484 case '%':
1485 url_query_string += "%25";
1486 break;
1487 case '&':
1488 url_query_string += "%26";
1489 break;
1490 case ' ':
1491 ch = '+';
1492 /* fall through */
1493 default:
1494 url_query_string += ch;
1498 // add any boolean terms
1499 for (FMCI i = filter_map.begin(); i != filter_map.end(); i++) {
1500 url_query_string += "&B=";
1501 url_query_string += i->second;
1503 #endif
1504 for (hit_no = topdoc; hit_no < last; hit_no++)
1505 value += print_caption(args[0], param);
1506 hit_no = 0;
1507 break;
1508 case CMD_hitsperpage:
1509 value = str(hits_per_page);
1510 break;
1511 case CMD_hostname: {
1512 value = args[0];
1513 // remove URL scheme and/or path
1514 string::size_type i = value.find("://");
1515 if (i == string::npos) i = 0; else i += 3;
1516 value = value.substr(i, value.find('/', i) - i);
1517 // remove user@ or user:password@
1518 i = value.find('@');
1519 if (i != string::npos) value.erase(0, i + 1);
1520 // remove :port
1521 i = value.find(':');
1522 if (i != string::npos) value.resize(i);
1523 break;
1525 case CMD_html:
1526 value = html_escape(args[0]);
1527 break;
1528 case CMD_htmlstrip:
1529 value = html_strip(args[0]);
1530 break;
1531 case CMD_httpheader:
1532 if (!suppress_http_headers) {
1533 cout << args[0] << ": " << args[1] << endl;
1534 if (!set_content_type && args[0].length() == 12 &&
1535 strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1536 set_content_type = true;
1539 break;
1540 case CMD_id:
1541 // document id
1542 value = str(q0);
1543 break;
1544 case CMD_if:
1545 if (!args[0].empty())
1546 value = eval(args[1], param);
1547 else if (args.size() > 2)
1548 value = eval(args[2], param);
1549 break;
1550 case CMD_include:
1551 value = eval_file(args[0]);
1552 break;
1553 case CMD_json:
1554 value = args[0];
1555 json_escape(value);
1556 break;
1557 case CMD_jsonarray: {
1558 const string & l = args[0];
1559 string::size_type i = 0, j;
1560 if (l.empty()) {
1561 value = "[]";
1562 break;
1564 value = "[\"";
1565 while (true) {
1566 j = l.find('\t', i);
1567 string elt(l, i, j - i);
1568 json_escape(elt);
1569 value += elt;
1570 if (j == string::npos) break;
1571 value += "\",\"";
1572 i = j + 1;
1574 value += "\"]";
1575 break;
1577 case CMD_last:
1578 value = str(last);
1579 break;
1580 case CMD_lastpage: {
1581 int l = mset.get_matches_estimated();
1582 if (l > 0) l = (l - 1) / hits_per_page + 1;
1583 value = str(l);
1584 break;
1586 case CMD_le:
1587 if (string_to_int(args[0]) <= string_to_int(args[1]))
1588 value = "true";
1589 break;
1590 case CMD_length:
1591 if (args[0].empty()) {
1592 value = "0";
1593 } else {
1594 size_t length = count(args[0].begin(), args[0].end(), '\t');
1595 value = str(length + 1);
1597 break;
1598 case CMD_list: {
1599 if (!args[0].empty()) {
1600 string pre, inter, interlast, post;
1601 switch (args.size()) {
1602 case 2:
1603 inter = interlast = args[1];
1604 break;
1605 case 3:
1606 inter = args[1];
1607 interlast = args[2];
1608 break;
1609 case 4:
1610 pre = args[1];
1611 inter = interlast = args[2];
1612 post = args[3];
1613 break;
1614 case 5:
1615 pre = args[1];
1616 inter = args[2];
1617 interlast = args[3];
1618 post = args[4];
1619 break;
1621 value += pre;
1622 string list = args[0];
1623 string::size_type split = 0, split2;
1624 while ((split2 = list.find('\t', split)) != string::npos) {
1625 if (split) value += inter;
1626 value += list.substr(split, split2 - split);
1627 split = split2 + 1;
1629 if (split) value += interlast;
1630 value += list.substr(split);
1631 value += post;
1633 break;
1635 case CMD_log: {
1636 if (!vet_filename(args[0])) break;
1637 string logfile = log_dir + args[0];
1638 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1639 if (fd == -1) break;
1640 vector<string> noargs;
1641 noargs.resize(1);
1642 string line;
1643 if (args.size() > 1) {
1644 line = args[1];
1645 } else {
1646 line = DEFAULT_LOG_ENTRY;
1648 line = eval(line, noargs);
1649 line += '\n';
1650 (void)write_all(fd, line.data(), line.length());
1651 close(fd);
1652 break;
1654 case CMD_lookup: {
1655 if (!vet_filename(args[0])) break;
1656 string cdbfile = cdb_dir + args[0];
1657 int fd = open(cdbfile.c_str(), O_RDONLY);
1658 if (fd == -1) break;
1660 struct cdb cdb;
1661 cdb_init(&cdb, fd);
1663 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1664 size_t datalen = cdb_datalen(&cdb);
1665 const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1666 if (q) {
1667 value = string(static_cast<const char *>(dat), datalen);
1671 cdb_free(&cdb);
1672 close(fd); // FIXME: cache fds?
1673 break;
1675 case CMD_lower:
1676 value = Xapian::Unicode::tolower(args[0]);
1677 break;
1678 case CMD_lt:
1679 if (string_to_int(args[0]) < string_to_int(args[1]))
1680 value = "true";
1681 break;
1682 case CMD_map:
1683 if (!args[0].empty()) {
1684 string l = args[0], pat = args[1];
1685 vector<string> new_args(param);
1686 string::size_type i = 0, j;
1687 while (true) {
1688 j = l.find('\t', i);
1689 new_args[0] = l.substr(i, j - i);
1690 value += eval(pat, new_args);
1691 if (j == string::npos) break;
1692 value += '\t';
1693 i = j + 1;
1696 break;
1697 case CMD_max: {
1698 vector<string>::const_iterator i = args.begin();
1699 int val = string_to_int(*i++);
1700 for (; i != args.end(); i++) {
1701 int x = string_to_int(*i);
1702 if (x > val) val = x;
1704 value = str(val);
1705 break;
1707 case CMD_min: {
1708 vector<string>::const_iterator i = args.begin();
1709 int val = string_to_int(*i++);
1710 for (; i != args.end(); i++) {
1711 int x = string_to_int(*i);
1712 if (x < val) val = x;
1714 value = str(val);
1715 break;
1717 case CMD_msize:
1718 // number of matches
1719 value = str(mset.get_matches_estimated());
1720 break;
1721 case CMD_msizeexact:
1722 // is msize exact?
1723 if (mset.get_matches_lower_bound()
1724 == mset.get_matches_upper_bound())
1725 value = "true";
1726 break;
1727 case CMD_mod: {
1728 int denom = string_to_int(args[1]);
1729 if (denom == 0) {
1730 value = "divide by 0";
1731 } else {
1732 value = str(string_to_int(args[0]) %
1733 string_to_int(args[1]));
1735 break;
1737 case CMD_mul: {
1738 vector<string>::const_iterator i = args.begin();
1739 int total = string_to_int(*i++);
1740 while (i != args.end())
1741 total *= string_to_int(*i++);
1742 value = str(total);
1743 break;
1745 case CMD_muldiv: {
1746 int denom = string_to_int(args[2]);
1747 if (denom == 0) {
1748 value = "divide by 0";
1749 } else {
1750 int num = string_to_int(args[0]) * string_to_int(args[1]);
1751 value = str(num / denom);
1753 break;
1755 case CMD_ne:
1756 if (args[0] != args[1]) value = "true";
1757 break;
1758 case CMD_nice: {
1759 string::const_iterator i = args[0].begin();
1760 int len = args[0].length();
1761 while (len) {
1762 value += *i++;
1763 if (--len && len % 3 == 0) value += option["thousand"];
1765 break;
1767 case CMD_not:
1768 if (args[0].empty()) value = "true";
1769 break;
1770 case CMD_now: {
1771 char buf[64];
1772 my_snprintf(buf, sizeof(buf), "%lu", (unsigned long)time(NULL));
1773 // MSVC's snprintf omits the zero byte if the string if
1774 // sizeof(buf) long.
1775 buf[sizeof(buf) - 1] = '\0';
1776 value = buf;
1777 break;
1779 case CMD_opt:
1780 if (args.size() == 2) {
1781 value = option[args[0] + "," + args[1]];
1782 } else {
1783 value = option[args[0]];
1785 break;
1786 case CMD_or: {
1787 for (vector<string>::const_iterator i = args.begin();
1788 i != args.end(); i++) {
1789 value = eval(*i, param);
1790 if (!value.empty()) break;
1792 break;
1794 case CMD_pack:
1795 value = int_to_binary_string(string_to_int(args[0]));
1796 break;
1797 case CMD_percentage:
1798 // percentage score
1799 value = str(percent);
1800 break;
1801 case CMD_prettyterm:
1802 value = pretty_term(args[0]);
1803 break;
1804 case CMD_prettyurl:
1805 value = args[0];
1806 url_prettify(value);
1807 break;
1808 case CMD_query: {
1809 pair<multimap<string, string>::const_iterator,
1810 multimap<string, string>::const_iterator> r;
1811 r = probabilistic_query.equal_range(args.empty() ?
1812 string() : args[0]);
1813 multimap<string, string>::const_iterator j;
1814 for (j = r.first; j != r.second; ++j) {
1815 if (!value.empty()) value += '\t';
1816 const string & s = j->second;
1817 size_t start = 0, tab;
1818 while ((tab = s.find('\t', start)) != string::npos) {
1819 value.append(s, start, tab - start);
1820 value += ' ';
1821 start = tab + 1;
1823 value.append(s, start, string::npos);
1825 break;
1827 case CMD_querydescription:
1828 value = query.get_description();
1829 break;
1830 case CMD_queryterms:
1831 value = queryterms;
1832 break;
1833 case CMD_range: {
1834 int start = string_to_int(args[0]);
1835 int end = string_to_int(args[1]);
1836 while (start <= end) {
1837 value += str(start);
1838 if (start < end) value += '\t';
1839 start++;
1841 break;
1843 case CMD_record: {
1844 int id = q0;
1845 if (!args.empty()) id = string_to_int(args[0]);
1846 value = db.get_document(id).get_data();
1847 break;
1849 case CMD_relevant: {
1850 // document id if relevant; empty otherwise
1851 int id = q0;
1852 if (!args.empty()) id = string_to_int(args[0]);
1853 map<Xapian::docid, bool>::iterator i = ticked.find(id);
1854 if (i != ticked.end()) {
1855 i->second = false; // icky side-effect
1856 value = str(id);
1858 break;
1860 case CMD_relevants: {
1861 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
1862 i != ticked.end(); i++) {
1863 if (i->second) {
1864 value += str(i->first);
1865 value += '\t';
1868 if (!value.empty()) value.erase(value.size() - 1);
1869 break;
1871 case CMD_score:
1872 // Score (0 to 10)
1873 value = str(percent / 10);
1874 break;
1875 case CMD_set:
1876 option[args[0]] = args[1];
1877 break;
1878 case CMD_setmap: {
1879 string base = args[0] + ',';
1880 if (args.size() % 2 != 1)
1881 throw string("$setmap requires an odd number of arguments");
1882 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
1883 option[base + args[i]] = args[i + 1];
1885 break;
1887 case CMD_setrelevant: {
1888 string::size_type i = 0, j;
1889 while (true) {
1890 j = args[0].find_first_not_of("0123456789", i);
1891 Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
1892 if (id) {
1893 rset.add_document(id);
1894 ticked[id] = true;
1896 if (j == string::npos) break;
1897 i = j + 1;
1899 break;
1901 case CMD_slice: {
1902 string list = args[0], pos = args[1];
1903 vector<string> items;
1904 string::size_type i = 0, j;
1905 while (true) {
1906 j = list.find('\t', i);
1907 items.push_back(list.substr(i, j - i));
1908 if (j == string::npos) break;
1909 i = j + 1;
1911 i = 0;
1912 bool have_added = false;
1913 while (true) {
1914 j = pos.find('\t', i);
1915 int item = string_to_int(pos.substr(i, j - i));
1916 if (item >= 0 && size_t(item) < items.size()) {
1917 if (have_added) value += '\t';
1918 value += items[item];
1919 have_added = true;
1921 if (j == string::npos) break;
1922 i = j + 1;
1924 break;
1926 case CMD_snippet: {
1927 Xapian::Snipper snipper;
1928 snipper.set_mset(mset);
1929 snipper.set_stemmer(Xapian::Stem(option["stemmer"]));
1930 size_t len = (args.size() == 1) ? 200 : string_to_int(args[1]);
1931 value = snipper.generate_snippet(args[0], len);
1932 break;
1934 case CMD_split: {
1935 string split;
1936 if (args.size() == 1) {
1937 split = " ";
1938 value = args[0];
1939 } else {
1940 split = args[0];
1941 value = args[1];
1943 string::size_type i = 0;
1944 while (true) {
1945 if (split.empty()) {
1946 ++i;
1947 if (i >= value.size()) break;
1948 } else {
1949 i = value.find(split, i);
1950 if (i == string::npos) break;
1952 value.replace(i, split.size(), 1, '\t');
1953 ++i;
1955 break;
1957 case CMD_stoplist: {
1958 Xapian::TermIterator i = qp.stoplist_begin();
1959 Xapian::TermIterator end = qp.stoplist_end();
1960 while (i != end) {
1961 if (!value.empty()) value += '\t';
1962 value += *i;
1963 ++i;
1965 break;
1967 case CMD_sub:
1968 value = str(string_to_int(args[0]) - string_to_int(args[1]));
1969 break;
1970 case CMD_substr: {
1971 int start = string_to_int(args[1]);
1972 if (start < 0) {
1973 if (static_cast<size_t>(-start) >= args[0].size()) {
1974 start = 0;
1975 } else {
1976 start = static_cast<int>(args[0].size()) + start;
1978 } else {
1979 if (static_cast<size_t>(start) >= args[0].size()) break;
1981 size_t len = string::npos;
1982 if (args.size() > 2) {
1983 int int_len = string_to_int(args[2]);
1984 if (int_len >= 0) {
1985 len = size_t(int_len);
1986 } else {
1987 len = args[0].size() - start;
1988 if (static_cast<size_t>(-int_len) >= len) {
1989 len = 0;
1990 } else {
1991 len -= static_cast<size_t>(-int_len);
1995 value = args[0].substr(start, len);
1996 break;
1998 case CMD_suggestion:
1999 value = qp.get_corrected_query_string();
2000 break;
2001 case CMD_terms:
2002 if (enquire) {
2003 // list of matching terms
2004 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2005 while (term != enquire->get_matching_terms_end(q0)) {
2006 // check term was in the typed query so we ignore
2007 // boolean filter terms
2008 if (termset.find(*term) != termset.end()) {
2009 value += *term;
2010 value += '\t';
2012 ++term;
2015 if (!value.empty()) value.erase(value.size() - 1);
2017 break;
2018 case CMD_thispage:
2019 value = str(topdoc / hits_per_page + 1);
2020 break;
2021 case CMD_time:
2022 if (secs >= 0) {
2023 char buf[64];
2024 my_snprintf(buf, sizeof(buf), "%.6f", secs);
2025 // MSVC's snprintf omits the zero byte if the string if
2026 // sizeof(buf) long.
2027 buf[sizeof(buf) - 1] = '\0';
2028 value = buf;
2030 break;
2031 case CMD_topdoc:
2032 // first document on current page of hit list (counting from 0)
2033 value = str(topdoc);
2034 break;
2035 case CMD_topterms:
2036 if (enquire) {
2037 int howmany = 16;
2038 if (!args.empty()) howmany = string_to_int(args[0]);
2039 if (howmany < 0) howmany = 0;
2041 // List of expand terms
2042 Xapian::ESet eset;
2043 OmegaExpandDecider decider(db, &termset);
2045 if (!rset.empty()) {
2046 set_expansion_scheme(*enquire, option);
2047 #if XAPIAN_AT_LEAST(1,3,2)
2048 eset = enquire->get_eset(howmany * 2, rset, &decider);
2049 #else
2050 eset = enquire->get_eset(howmany * 2, rset, 0,
2051 expand_param_k, &decider);
2052 #endif
2053 } else if (mset.size()) {
2054 // invent an rset
2055 Xapian::RSet tmp;
2057 int c = 5;
2058 // FIXME: what if mset does not start at first match?
2059 Xapian::MSetIterator m = mset.begin();
2060 for ( ; m != mset.end(); ++m) {
2061 tmp.add_document(*m);
2062 if (--c == 0) break;
2065 set_expansion_scheme(*enquire, option);
2066 #if XAPIAN_AT_LEAST(1,3,2)
2067 eset = enquire->get_eset(howmany * 2, tmp, &decider);
2068 #else
2069 eset = enquire->get_eset(howmany * 2, tmp, 0,
2070 expand_param_k, &decider);
2071 #endif
2074 // Don't show more than one word with the same stem.
2075 set<string> stems;
2076 Xapian::ESetIterator i;
2077 for (i = eset.begin(); i != eset.end(); ++i) {
2078 string term(*i);
2079 string stem = (*stemmer)(term);
2080 if (stems.find(stem) != stems.end()) continue;
2081 stems.insert(stem);
2082 value += term;
2083 value += '\t';
2084 if (--howmany == 0) break;
2086 if (!value.empty()) value.erase(value.size() - 1);
2088 break;
2089 case CMD_transform:
2090 omegascript_transform(value, args);
2091 break;
2092 case CMD_truncate:
2093 value = generate_sample(args[0],
2094 string_to_int(args[1]),
2095 args.size() > 2 ? args[2] : string(),
2096 args.size() > 3 ? args[3] : string());
2097 break;
2098 case CMD_uniq: {
2099 const string &list = args[0];
2100 if (list.empty()) break;
2101 string::size_type split = 0, split2;
2102 string prev;
2103 do {
2104 split2 = list.find('\t', split);
2105 string item = list.substr(split, split2 - split);
2106 if (split == 0) {
2107 value = item;
2108 } else if (item != prev) {
2109 value += '\t';
2110 value += item;
2112 prev = item;
2113 split = split2 + 1;
2114 } while (split2 != string::npos);
2115 break;
2117 case CMD_unpack:
2118 value = str(binary_string_to_int(args[0]));
2119 break;
2120 case CMD_unstem: {
2121 const string &term = args[0];
2122 Xapian::TermIterator i = qp.unstem_begin(term);
2123 Xapian::TermIterator end = qp.unstem_end(term);
2124 while (i != end) {
2125 if (!value.empty()) value += '\t';
2126 value += *i;
2127 ++i;
2129 break;
2131 case CMD_upper:
2132 value = Xapian::Unicode::toupper(args[0]);
2133 break;
2134 case CMD_url:
2135 url_encode(value, args[0]);
2136 break;
2137 case CMD_value: {
2138 Xapian::docid id = q0;
2139 Xapian::valueno value_no = string_to_int(args[0]);
2140 if (args.size() > 1) id = string_to_int(args[1]);
2141 value = db.get_document(id).get_value(value_no);
2142 break;
2144 case CMD_version:
2145 value = PACKAGE_STRING;
2146 break;
2147 case CMD_weight:
2148 value = double_to_string(weight);
2149 break;
2150 default: {
2151 args.insert(args.begin(), param[0]);
2152 int macro_no = func->second->tag - CMD_MACRO;
2153 assert(macro_no >= 0 && (unsigned int)macro_no < macros.size());
2154 // throw "Unknown function '" + var + "'";
2155 value = eval(macros[macro_no], args);
2156 break;
2159 res += value;
2160 } catch (const Xapian::Error & e) {
2161 // FIXME: this means we only see the most recent error in $error
2162 // - is that the best approach?
2163 error_msg = e.get_msg();
2166 res += fmt.substr(p);
2167 return res;
2170 static string
2171 eval_file(const string &fmtfile)
2173 string err;
2174 if (vet_filename(fmtfile)) {
2175 string file = template_dir + fmtfile;
2176 string fmt;
2177 if (load_file(file, fmt)) {
2178 vector<string> noargs;
2179 noargs.resize(1);
2180 return eval(fmt, noargs);
2182 err = strerror(errno);
2183 } else {
2184 err = "name contains '..'";
2187 // FIXME: report why!
2188 string msg = string("Couldn't read format template '") + fmtfile + '\'';
2189 if (!err.empty()) msg += " (" + err + ')';
2190 throw msg;
2193 extern string
2194 pretty_term(string term)
2196 // Just leave empty strings and single characters alone.
2197 if (term.length() <= 1) return term;
2199 // Assume unprefixed terms are unstemmed.
2200 if (!C_isupper(term[0])) return term;
2202 // Handle stemmed terms.
2203 bool stemmed = (term[0] == 'Z');
2204 if (stemmed) {
2205 // First of all, check if a term in the query stemmed to this one.
2206 Xapian::TermIterator u = qp.unstem_begin(term);
2207 // There might be multiple words with the same stem, but we only want
2208 // one so just take the first.
2209 if (u != qp.unstem_end(term)) return *u;
2211 // Remove the 'Z'.
2212 term.erase(0, 1);
2215 bool add_quotes = false;
2217 // Check if the term has a prefix.
2218 if (C_isupper(term[0])) {
2219 // See if we have this prefix in the termprefix_to_userprefix map. If
2220 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2221 string prefix;
2222 size_t prefix_len = prefix_from_term(prefix, term);
2224 map<string, string>::const_iterator i;
2225 i = termprefix_to_userprefix.find(prefix);
2226 if (i != termprefix_to_userprefix.end()) {
2227 string user_prefix = i->second;
2228 user_prefix += ':';
2229 term.replace(0, prefix_len, user_prefix);
2230 } else {
2231 // We don't have a prefix mapping for this, so just set a flag to
2232 // add quotes around the term.
2233 add_quotes = true;
2237 if (stemmed) term += '.';
2239 if (add_quotes) {
2240 term.insert(0, "\"");
2241 term.append("\"");
2244 return term;
2247 static string
2248 print_caption(const string &fmt, const vector<string> &param)
2250 q0 = *(mset[hit_no]);
2252 weight = mset[hit_no].get_weight();
2253 percent = mset.convert_to_percent(mset[hit_no]);
2254 collapsed = mset[hit_no].get_collapse_count();
2256 return eval(fmt, param);
2259 void
2260 parse_omegascript()
2262 try {
2263 const char * p = getenv("SERVER_PROTOCOL");
2264 if (p && strcmp(p, "INCLUDED") == 0) {
2265 // We're being included in another page, so suppress headers.
2266 suppress_http_headers = true;
2269 std::string output = eval_file(fmtname);
2270 if (!set_content_type && !suppress_http_headers) {
2271 cout << "Content-Type: text/html" << std::endl;
2272 set_content_type = true;
2274 cout << std::endl;
2275 cout << output;
2276 } catch (...) {
2277 // Ensure the headers have been output so that any exception gets
2278 // reported rather than giving a server error.
2279 if (!set_content_type && !suppress_http_headers) {
2280 cout << "Content-Type: text/html" << std::endl;
2281 set_content_type = true;
2283 cout << std::endl;
2284 throw;
2288 static void
2289 ensure_query_parsed()
2291 if (query_parsed) return;
2292 query_parsed = true;
2294 MCI val;
2295 pair<MCI, MCI> g;
2297 // Should we discard the existing R-set recorded in R CGI parameters?
2298 bool discard_rset = false;
2300 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2301 // CGI parameters)?
2302 bool force_first_page = false;
2304 string v;
2305 // get list of terms from previous iteration of query
2306 val = cgi_params.find("xP");
2307 if (val != cgi_params.end()) {
2308 v = val->second;
2309 // If xP given, default to discarding any RSet and forcing the first
2310 // page of results. If the query is the same, or an extension of
2311 // the previous query, we adjust these again below.
2312 discard_rset = true;
2313 force_first_page = true;
2315 querytype result = set_probabilistic(v);
2316 switch (result) {
2317 case BAD_QUERY:
2318 break;
2319 case NEW_QUERY:
2320 break;
2321 case SAME_QUERY:
2322 case EXTENDED_QUERY:
2323 // If we've changed database, force the first page of hits
2324 // and discard the R-set (since the docids will have changed)
2325 val = cgi_params.find("xDB");
2326 if (val != cgi_params.end() && val->second != dbname) break;
2327 if (result == SAME_QUERY && force_first_page) {
2328 val = cgi_params.find("xFILTERS");
2329 if (val != cgi_params.end() && val->second != filters) {
2330 // Filters have changed since last query.
2331 } else {
2332 force_first_page = false;
2335 discard_rset = false;
2336 break;
2339 if (!force_first_page) {
2340 // Work out which mset element is the first hit we want
2341 // to display
2342 val = cgi_params.find("TOPDOC");
2343 if (val != cgi_params.end()) {
2344 topdoc = atol(val->second.c_str());
2347 // Handle next, previous, and page links
2348 if (cgi_params.find(">") != cgi_params.end()) {
2349 topdoc += hits_per_page;
2350 } else if (cgi_params.find("<") != cgi_params.end()) {
2351 if (topdoc >= hits_per_page)
2352 topdoc -= hits_per_page;
2353 else
2354 topdoc = 0;
2355 } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2356 (val = cgi_params.find("#")) != cgi_params.end()) {
2357 long page = atol(val->second.c_str());
2358 // Do something sensible for page 0 (we count pages from 1).
2359 if (page == 0) page = 1;
2360 topdoc = (page - 1) * hits_per_page;
2363 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2364 // Normally we snap TOPDOC like this so that things work nicely if
2365 // HITSPERPAGE is in a <select> or on radio buttons. If we're
2366 // postprocessing the output of omega and want variable sized pages,
2367 // this is unhelpful.
2368 bool raw_search = false;
2369 val = cgi_params.find("RAWSEARCH");
2370 if (val != cgi_params.end()) {
2371 raw_search = bool(atol(val->second.c_str()));
2374 if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2377 if (!discard_rset) {
2378 // put documents marked as relevant into the rset
2379 g = cgi_params.equal_range("R");
2380 for (MCI i = g.first; i != g.second; i++) {
2381 const string & value = i->second;
2382 for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2383 while (value[j] == '.') ++j;
2384 Xapian::docid d = atoi(value.c_str() + j);
2385 if (d) {
2386 rset.add_document(d);
2387 ticked[d] = true;
2394 // run query if we haven't already
2395 static void
2396 ensure_match()
2398 if (done_query) return;
2400 secs = RealTime::now();
2401 run_query();
2402 if (secs != -1)
2403 secs = RealTime::now() - secs;
2405 done_query = true;
2406 last = mset.get_matches_lower_bound();
2407 if (last == 0) {
2408 // Otherwise topdoc ends up being -6 if it's non-zero!
2409 topdoc = 0;
2410 } else {
2411 if (topdoc >= last)
2412 topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2413 // last is the count of documents up to the end of the current page
2414 // (as returned by $last)
2415 if (topdoc + hits_per_page < last)
2416 last = topdoc + hits_per_page;
2420 // OmegaExpandDecider methods.
2422 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2423 set<string> * querytermset)
2424 : db(db_)
2426 // We'll want the stemmer for testing matches anyway.
2427 if (!stemmer)
2428 stemmer = new Xapian::Stem(option["stemmer"]);
2429 if (querytermset) {
2430 set<string>::const_iterator i;
2431 for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2432 string term(*i);
2433 if (term.empty()) continue;
2435 unsigned char ch = term[0];
2436 bool stemmed = (ch == 'Z');
2437 if (stemmed) {
2438 term.erase(0, 1);
2439 if (term.empty()) continue;
2440 ch = term[0];
2443 if (C_isupper(ch)) {
2444 string prefix;
2445 size_t prefix_len = prefix_from_term(prefix, term);
2446 term.erase(0, prefix_len);
2449 if (!stemmed) term = (*stemmer)(term);
2451 exclude_stems.insert(term);
2456 bool
2457 OmegaExpandDecider::operator()(const string & term) const
2459 unsigned char ch = term[0];
2461 // Reject terms with a prefix.
2462 if (C_isupper(ch)) return false;
2465 MyStopper stopper;
2466 // Don't suggest stopwords.
2467 if (stopper(term)) return false;
2470 // Reject small numbers.
2471 if (term.size() < 4 && C_isdigit(ch)) return false;
2473 // Reject terms containing a space.
2474 if (term.find(' ') != string::npos) return false;
2476 // Skip terms with stems in the exclude_stems set, to avoid suggesting
2477 // terms which are already in the query in some form.
2478 string stem = (*stemmer)(term);
2479 if (exclude_stems.find(stem) != exclude_stems.end())
2480 return false;
2482 // Ignore terms that only occur once (hapaxes) since they aren't
2483 // useful for finding related documents - they only occur in a
2484 // document that's already been marked as relevant.
2485 // FIXME: add an expand option to ignore terms where
2486 // termfreq == rtermfreq.
2487 if (db.get_termfreq(term) <= 1) return false;
2489 return true;