Support concurrent date range filters
[xapian.git] / xapian-applications / omega / omega.cc
blob0149567e38f2c425bb9d932d570d1d6957a0a293
1 /* omega.cc: Main module for omega (example CGI frontend for Xapian)
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015,2016,2018 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
26 // If we're building against git after the expand API changed but before the
27 // version gets bumped to 1.3.2, we'll get a deprecation warning from
28 // get_eset() unless we suppress such warnings here.
29 #define XAPIAN_DEPRECATED(D) D
31 #include <cstdio>
32 #include <ctime>
34 #include <algorithm>
35 #include <cstring>
36 #include <iostream>
37 #include <set>
39 #include "safefcntl.h"
40 #include "safeunistd.h"
42 #include "omega.h"
43 #include "utils.h"
44 #include "cgiparam.h"
45 #include "query.h"
46 #include "str.h"
47 #include "stringutils.h"
48 #include "expand.h"
50 using namespace std;
// Stemming language which main() stores in option["stemmer"] as the default.
52 static const char DEFAULT_STEM_LANGUAGE[] = "english";
54 // A character which doesn't require URL encoding, and isn't likely to appear
55 // in filter values.
56 const char filter_sep = '~';
58 // What we used for filter_sep in Omega < 1.3.4.
59 const char filter_sep_old = '-';
// Global search state.  main() allocates enquire once the database(s) have
// been added to db, and leaves it NULL if that throws a Xapian::Error.
61 Xapian::Enquire * enquire;
62 Xapian::Database db;
63 Xapian::RSet rset;
// Named option values; main() sets defaults for "flag_default", "decimal",
// "thousand" and "stemmer".
65 map<string, string> option;
// The exception handlers in main() only emit a Content-Type header when both
// of these flags are false.
67 bool set_content_type = false;
69 bool suppress_http_headers = false;
// dbname: '/'-separated list of the database names added to db.
// fmtname: value of the FMT parameter, or default_template if that is missing
// or empty.
// filters, old_filters: encodings of the filter-related settings which main()
// builds up; old_filters uses filter_sep_old as its separator.
71 string dbname;
72 string fmtname;
73 string filters, old_filters;
// hits_per_page is set from the HITSPERPAGE parameter and min_hits from the
// MINHITS parameter in main().
75 Xapian::docid topdoc = 0;
76 Xapian::docid hits_per_page = 0;
77 Xapian::docid min_hits = 0;
79 // percentage cut-off
80 int threshold = 0;
// Sort settings, filled in by main() from the SORT, SORTREVERSE, SORTAFTER and
// DOCIDORDER parameters.  sort_keymaker is only allocated when SORT lists more
// than one key; in that case sort_key is reset to BAD_VALUENO.
82 Xapian::MultiValueKeyMaker* sort_keymaker = NULL;
83 Xapian::valueno sort_key = Xapian::BAD_VALUENO; // Don't sort.
84 bool reverse_sort = true;
85 bool sort_after = false;
86 Xapian::Enquire::docid_order docid_order = Xapian::Enquire::ASCENDING;
// Collapse settings: main() sets collapse to true and collapse_key to the
// parsed value when a non-empty COLLAPSE parameter is present.
88 Xapian::valueno collapse_key = 0;
89 bool collapse = false;
91 static string
92 map_dbname_to_dir(const string &database_name)
94 return database_dir + database_name;
97 // Get database(s) to search.
98 template<typename IT>
99 void
100 parse_db_params(const pair<IT, IT>& dbs)
102 dbname.resize(0);
103 // Only add a repeated db once.
104 set<string> seen;
105 for (auto i = dbs.first; i != dbs.second; ++i) {
106 const string& v = i->second;
107 if (v.empty()) continue;
108 size_t p = 0, q;
109 while (true) {
110 q = v.find('/', p);
111 string s(v, p, q - p);
112 if (!s.empty() && seen.find(s) == seen.end()) {
113 // Translate DB parameter to path of database directory
114 if (!dbname.empty()) dbname += '/';
115 dbname += s;
116 db.add_database(Xapian::Database(map_dbname_to_dir(s)));
117 seen.insert(s);
119 if (q == string::npos) break;
120 p = q + 1;
// Program entry point.
//
// Reads the configuration file, decodes the request parameters (via
// decode_argv(), decode_test(), decode_post() or decode_get()), adds the
// requested database(s) to db, then translates the CGI parameters into query
// strings, boolean/date filters, threshold, collapse, docid order and sort
// settings before calling parse_omegascript().  Along the way it builds the
// filters and old_filters strings which encode the filter-related settings.
//
// The whole body is a function-try-block: an exception reaching the handlers
// at the end is reported on cout, preceded by a Content-Type header unless
// set_content_type or suppress_http_headers is set.
125 int main(int argc, char *argv[])
126 try {
127 read_config_file();
129 option["flag_default"] = "true";
131 // set default thousands and decimal separators: e.g. "16,729 hits" "1.4K"
132 option["decimal"] = ".";
133 option["thousand"] = ",";
135 // set the default stemming language
136 option["stemmer"] = DEFAULT_STEM_LANGUAGE;
138 // FIXME: set cout to linebuffered not stdout. Or just flush regularly...
139 // setvbuf(stdout, NULL, _IOLBF, 0);
// With REQUEST_METHOD unset, parameters come from the command line arguments
// or from NAME=VALUE lines entered interactively.
141 const char * method = getenv("REQUEST_METHOD");
142 if (method == NULL) {
143 if (argc > 1 && (argv[1][0] != '-' || strchr(argv[1], '='))) {
144 // omega 'P=information retrieval' DB=papers
145 // check for a leading '-' on the first arg so "omega --version",
146 // "omega --help", and similar take the next branch
147 decode_argv(argv + 1);
148 } else {
149 // Seems we're running from the command line so give version
150 // and allow a query to be entered for testing
151 cout << PROGRAM_NAME " - " PACKAGE " " VERSION "\n";
152 if (argc > 1) exit(0);
153 cout << "Enter NAME=VALUE lines, end with blank line\n";
154 decode_test();
156 } else {
// Only the first character is examined, so any method starting with 'P' is
// decoded as a POST and everything else as a GET.
157 if (*method == 'P')
158 decode_post();
159 else
160 decode_get();
// Add the database(s) named by the DB parameter(s), falling back to
// default_db if none were given.  If this throws a Xapian::Error we carry on
// with enquire set to NULL (the MORELIKE handling below checks for that).
163 try {
164 parse_db_params(cgi_params.equal_range("DB"));
165 if (dbname.empty()) {
166 dbname = default_db;
167 db.add_database(Xapian::Database(map_dbname_to_dir(dbname)));
169 enquire = new Xapian::Enquire(db);
170 } catch (const Xapian::Error &) {
171 enquire = NULL;
// HITSPERPAGE: a missing parameter or one which atol() converts to 0 gives
// 10; values above 1000 are reduced to 1000.
174 hits_per_page = 0;
175 auto val = cgi_params.find("HITSPERPAGE");
176 if (val != cgi_params.end()) hits_per_page = atol(val->second.c_str());
177 if (hits_per_page == 0) {
178 hits_per_page = 10;
179 } else if (hits_per_page > 1000) {
180 hits_per_page = 1000;
// DEFAULTOP: only "OR" or "or" changes default_op here; any other value
// leaves it as it was.
183 val = cgi_params.find("DEFAULTOP");
184 if (val != cgi_params.end()) {
185 const string & v = val->second;
186 if (v == "OR" || v == "or")
187 default_op = Xapian::Query::OP_OR;
// FMT: a non-empty value selects the template name, otherwise
// default_template is used.
190 val = cgi_params.find("FMT");
191 if (val != cgi_params.end()) {
192 const string & v = val->second;
193 if (!v.empty()) fmtname = v;
195 if (fmtname.empty())
196 fmtname = default_template;
// MORELIKE (only handled when enquire was created): build a query from the
// expand terms for the given document.  Otherwise the query comes from the
// P parameters (plus X parameters if ADD is present).
198 val = cgi_params.find("MORELIKE");
199 if (enquire && val != cgi_params.end()) {
200 const string & v = val->second;
201 Xapian::docid docid = atol(v.c_str());
202 if (docid == 0) {
203 // Assume it's MORELIKE=Quid1138 and that Quid1138 is a UID
204 // from an external source - we just find the correspond docid
205 Xapian::PostingIterator p = db.postlist_begin(v);
206 if (p != db.postlist_end(v)) docid = *p;
209 if (docid != 0) {
210 Xapian::RSet tmprset;
211 tmprset.add_document(docid);
213 OmegaExpandDecider decider(db);
214 set_expansion_scheme(*enquire, option);
215 Xapian::ESet eset(enquire->get_eset(40, tmprset, &decider));
216 string morelike_query;
217 for (auto&& term : eset) {
218 if (!morelike_query.empty()) morelike_query += ' ';
219 morelike_query += pretty_term(term);
221 add_query_string(string(), morelike_query);
223 } else {
224 // add expand/topterms terms if appropriate
225 string expand_terms;
226 if (cgi_params.find("ADD") != cgi_params.end()) {
227 auto g = cgi_params.equal_range("X");
228 for (auto i = g.first; i != g.second; ++i) {
229 const string & v = i->second;
230 if (!v.empty()) {
231 if (!expand_terms.empty())
232 expand_terms += ' ';
233 expand_terms += v;
238 // collect the unprefixed prob fields
239 auto g = cgi_params.equal_range("P");
240 for (auto i = g.first; i != g.second; ++i) {
241 const string & v = i->second;
242 if (!v.empty()) {
243 // If there are expand terms, append them to the first
244 // non-empty P parameter.
245 if (!expand_terms.empty()) {
246 string q = v;
247 q += ' ';
248 q += expand_terms;
249 add_query_string(string(), q);
250 expand_terms = string();
251 } else {
252 add_query_string(string(), v);
// No non-empty P parameter consumed the expand terms, so use them as a query
// string on their own.
257 if (!expand_terms.empty()) {
258 add_query_string(string(), expand_terms);
// Prefixed query strings: parameters whose names start with "P." (the keys
// from "P." up to but not including "P/").  The text after "P." is passed to
// add_query_string() as the prefix.
262 auto begin = cgi_params.lower_bound("P.");
263 auto end = cgi_params.lower_bound("P/"); // '/' is '.' + 1.
264 for (auto i = begin; i != end; ++i) {
265 const string & v = i->second;
266 if (!v.empty()) {
267 string pfx(i->first, 2, string::npos);
268 add_query_string(pfx, v);
272 // set any boolean filters
273 auto g = cgi_params.equal_range("B");
274 if (g.first != g.second) {
275 vector<string> filter_v;
276 for (auto i = g.first; i != g.second; ++i) {
277 const string & v = i->second;
278 // we'll definitely get empty B fields from "-ALL-" options
279 if (!v.empty() && C_isalnum(v[0])) {
280 add_bterm(v);
281 filter_v.push_back(v);
// The filters are encoded in sorted order - presumably so that the encoding
// doesn't depend on the order the parameters were supplied in.
284 sort(filter_v.begin(), filter_v.end());
285 vector<string>::const_iterator j;
286 for (j = filter_v.begin(); j != filter_v.end(); ++j) {
287 const string & bterm = *j;
288 string::size_type e = bterm.find(filter_sep);
289 if (usual(e == string::npos)) {
290 filters += bterm;
291 } else {
292 // If a filter contains filter_sep then double it to escape.
293 // Each filter must start with an alnum (checked above) and
294 // the value after the last filter is the default op, which
295 // is encoded as a non-alnum so filter_sep followed by
296 // something other than filter_sep must be separating filters.
// Each pass appends the text up to and including a filter_sep, then restarts
// from that same filter_sep so that it gets appended a second time.
297 string::size_type b = 0;
298 while (e != string::npos) {
299 filters.append(bterm, b, e + 1 - b);
300 b = e;
301 e = bterm.find(filter_sep, b + 1);
303 filters.append(bterm, b, string::npos);
305 filters += filter_sep;
306 old_filters += bterm;
307 old_filters += filter_sep_old;
311 // set any negated boolean filters
312 g = cgi_params.equal_range("N");
313 if (g.first != g.second) {
314 vector<string> filter_v;
315 for (auto i = g.first; i != g.second; ++i) {
316 const string & v = i->second;
317 // we'll definitely get empty N fields from "-ALL-" options
318 if (!v.empty() && C_isalnum(v[0])) {
319 add_nterm(v);
320 filter_v.push_back(v);
323 sort(filter_v.begin(), filter_v.end());
324 vector<string>::const_iterator j;
325 for (j = filter_v.begin(); j != filter_v.end(); ++j) {
326 const string & nterm = *j;
327 string::size_type e = nterm.find(filter_sep);
// Negated filters are encoded like the B filters above but with a leading
// '!'.
328 filters += '!';
329 if (usual(e == string::npos)) {
330 filters += nterm;
331 } else {
332 // If a filter contains filter_sep then double it to escape.
333 // Each filter must start with an alnum (checked above) and
334 // the value after the last filter is the default op, which
335 // is encoded as a non-alnum so filter_sep followed by
336 // something other than filter_sep must be separating filters.
337 string::size_type b = 0;
338 while (e != string::npos) {
339 filters.append(nterm, b, e + 1 - b);
340 b = e;
341 e = nterm.find(filter_sep, b + 1);
343 filters.append(nterm, b, string::npos);
345 filters += filter_sep;
346 // old_filters predates 'N' terms, so if there are 'N' terms this
347 // is definitely a different query.
348 old_filters.clear();
352 // date range filters
// START.<slot>, END.<slot> and SPAN.<slot> parameters with non-empty values
// are gathered into date_ranges, keyed by the slot number which atoi() parses
// from the part of the parameter name after the '.'.
353 struct date_range {
354 string start, end, span;
356 map<Xapian::valueno, date_range> date_ranges;
357 begin = cgi_params.lower_bound("START.");
358 end = cgi_params.lower_bound("START/"); // '/' is '.' + 1.
359 for (auto i = begin; i != end; ++i) {
360 const string & v = i->second;
361 if (!v.empty()) {
362 Xapian::valueno slot = atoi(i->first.c_str() +
363 CONST_STRLEN("START."));
364 date_ranges[slot].start = v;
367 begin = cgi_params.lower_bound("END.");
368 end = cgi_params.lower_bound("END/"); // '/' is '.' + 1.
369 for (auto i = begin; i != end; ++i) {
370 const string & v = i->second;
371 if (!v.empty()) {
372 Xapian::valueno slot = atoi(i->first.c_str() +
373 CONST_STRLEN("END."));
374 date_ranges[slot].end = v;
377 begin = cgi_params.lower_bound("SPAN.");
378 end = cgi_params.lower_bound("SPAN/"); // '/' is '.' + 1.
379 for (auto i = begin; i != end; ++i) {
380 const string & v = i->second;
381 if (!v.empty()) {
382 Xapian::valueno slot = atoi(i->first.c_str() +
383 CONST_STRLEN("SPAN."));
384 date_ranges[slot].span = v;
387 if (!date_ranges.empty()) {
388 // old_filters predates START.N, END.N and SPAN.N so use of any of
389 // these means this is definitely a different query.
390 old_filters.clear();
// Apply each per-slot range and encode it in filters as
// $<slot>$<start>$<end>$<span>.
392 for (auto i : date_ranges) {
393 auto slot = i.first;
394 auto r = i.second;
395 add_date_filter(r.start, r.end, r.span, slot);
396 filters += '$';
397 filters += str(slot);
398 filters += '$';
399 filters += r.start;
400 filters += '$';
401 filters += r.end;
402 filters += '$';
403 filters += r.span;
// The unsuffixed START, END and SPAN parameters, with DATEVALUE optionally
// giving the value slot (BAD_VALUENO is passed if DATEVALUE is absent).
406 string date_start, date_end, date_span;
407 val = cgi_params.find("START");
408 if (val != cgi_params.end()) date_start = val->second;
409 val = cgi_params.find("END");
410 if (val != cgi_params.end()) date_end = val->second;
411 val = cgi_params.find("SPAN");
412 if (val != cgi_params.end()) date_span = val->second;
413 val = cgi_params.find("DATEVALUE");
414 Xapian::valueno date_value_slot = Xapian::BAD_VALUENO;
415 if (val != cgi_params.end()) date_value_slot = string_to_int(val->second);
416 add_date_filter(date_start, date_end, date_span, date_value_slot);
418 // If more default_op values are supported, encode them as non-alnums
419 // other than filter_sep, '!' or '$'.
420 filters += (default_op == Xapian::Query::OP_AND ? '.' : '-');
421 filters += date_start;
422 filters += filter_sep;
423 filters += date_end;
424 filters += filter_sep;
425 filters += date_span;
426 if (date_value_slot != Xapian::BAD_VALUENO) {
427 // This means we'll force the first page when reloading or changing
428 // page starting from existing URLs upon upgrade to 1.4.1, but the
429 // exact same existing URL could be for a search without the date
430 // filter where we want to force the first page, so there's an inherent
431 // ambiguity there. Forcing first page in this case seems the least
432 // problematic side-effect.
433 filters += filter_sep;
434 filters += str(date_value_slot);
// old_filters is only extended while it is non-empty; it is empty here if no
// B filters were added or if it has been cleared above.
437 if (!old_filters.empty()) {
438 old_filters += date_start;
439 old_filters += filter_sep_old;
440 old_filters += date_end;
441 old_filters += filter_sep_old;
442 old_filters += date_span;
443 old_filters += (default_op == Xapian::Query::OP_AND ? 'A' : 'O');
446 // Percentage relevance cut-off
// The parsed value is clamped to the range 0 to 100.
447 val = cgi_params.find("THRESHOLD");
448 if (val != cgi_params.end()) {
449 threshold = atoi(val->second.c_str());
450 if (threshold < 0) threshold = 0;
451 if (threshold > 100) threshold = 100;
454 // collapsing
455 val = cgi_params.find("COLLAPSE");
456 if (val != cgi_params.end()) {
457 const string & v = val->second;
458 if (!v.empty()) {
459 collapse_key = atoi(v.c_str());
460 collapse = true;
461 filters += filter_sep;
462 filters += str(collapse_key);
463 if (!old_filters.empty()) {
464 old_filters += filter_sep_old;
465 old_filters += str(collapse_key);
469 if (!collapse && date_value_slot != Xapian::BAD_VALUENO) {
470 // We need to either omit filter_sep for both or neither, or else the
471 // encoding is ambiguous.
472 filters += filter_sep;
475 // docid order
// Only the first character of DOCIDORDER is examined: 'D' selects descending
// order, 'A' leaves docid_order unchanged, and anything else selects
// DONT_CARE.
476 val = cgi_params.find("DOCIDORDER");
477 if (val != cgi_params.end()) {
478 const string & v = val->second;
479 if (!v.empty()) {
480 char ch = v[0];
481 if (ch == 'D') {
482 docid_order = Xapian::Enquire::DESCENDING;
483 filters += 'D';
484 if (!old_filters.empty()) old_filters += 'D';
485 } else if (ch != 'A') {
486 docid_order = Xapian::Enquire::DONT_CARE;
487 } else {
488 // This is a bug (should add nothing here and 'X' in the (ch !=
489 // 'A') case, but the current "DONT_CARE" implementation
490 // actually always results in ascending docid order so it's not
491 // worth breaking compatibility to fix - let's just do it next
492 // time we change the encoding $filters uses.
493 filters += 'X';
494 if (!old_filters.empty()) old_filters += 'X';
499 // sorting
// SORT holds one or more slot numbers, each optionally preceded by '+' or
// '-', and separated by whitespace and/or commas.  A key without a leading
// '+' is treated as reversed.  Parsing stops at the first entry which doesn't
// start with a digit or for which strtoul() sets errno.
500 val = cgi_params.find("SORT");
501 if (val != cgi_params.end()) {
502 const char * base = val->second.c_str();
503 const char * p = base;
504 do {
505 bool rev = (*p != '+');
506 if (*p == '-' || *p == '+') {
507 // old_filters predates support for direction in SORT, so if
508 // there's a direction specified this is definitely a different
509 // query.
510 old_filters.clear();
511 ++p;
513 if (!C_isdigit(*p)) {
514 // Invalid.
515 break;
517 errno = 0;
518 char * q;
519 Xapian::valueno slot = strtoul(p, &q, 10);
520 p = q;
521 if (errno != 0) {
522 // Invalid.
523 break;
// sort_key still holding a slot here means this is the second key: move the
// first key into a newly created KeyMaker (encoding it in filters) and reset
// sort_key so that this branch isn't taken again for later keys.
526 if (sort_key != Xapian::BAD_VALUENO) {
527 // Multiple sort keys specified, so we need a KeyMaker.
529 // Omit leading '+'.
530 if (reverse_sort) filters += '-';
531 filters += str(sort_key);
533 sort_keymaker = new Xapian::MultiValueKeyMaker;
534 sort_keymaker->add_value(sort_key, !reverse_sort);
535 sort_key = Xapian::BAD_VALUENO;
536 reverse_sort = true;
537 // old_filters predates multiple sort keys, so if there are
538 // multiple sort keys this is definitely a different query.
539 old_filters.clear();
542 if (sort_keymaker) {
543 filters += (rev ? '-' : '+');
544 filters += str(slot);
545 sort_keymaker->add_value(slot, !rev);
546 } else {
547 sort_key = slot;
548 reverse_sort = rev;
550 while (C_isspace(*p) || *p == ',') ++p;
551 } while (*p);
// A non-zero SORTREVERSE flips reverse_sort; SORTAFTER sets sort_after to
// whether its value is non-zero.
553 val = cgi_params.find("SORTREVERSE");
554 if (val != cgi_params.end() && atoi(val->second.c_str()) != 0) {
555 reverse_sort = !reverse_sort;
558 val = cgi_params.find("SORTAFTER");
559 if (val != cgi_params.end()) {
560 sort_after = (atoi(val->second.c_str()) != 0);
563 // Add the sorting related options to filters too.
565 // Note: old_filters really does encode a reversed sort as 'F' and a
566 // non-reversed sort as 'R' or 'r'.
568 // filters has them the other way around for sanity (except in
569 // development snapshot 1.3.4, which was when the new filter encoding
570 // was introduced).
571 if (!sort_keymaker) filters += str(sort_key);
572 if (!old_filters.empty()) old_filters += str(sort_key);
573 if (sort_after) {
574 if (reverse_sort) {
575 filters += 'R';
576 if (!old_filters.empty()) old_filters += 'F';
577 } else {
578 filters += 'F';
579 if (!old_filters.empty()) old_filters += 'R';
581 } else {
582 if (!reverse_sort) {
583 filters += 'f';
584 if (!old_filters.empty()) old_filters += 'r';
// If old_filters is empty at this point (cleared above, or nothing was ever
// added to it) it is set to the same value as filters.
589 if (old_filters.empty()) old_filters = filters;
591 // min_hits (fill mset past topdoc+(hits_per_page+1) to
592 // topdoc+max(hits_per_page+1,min_hits)
593 val = cgi_params.find("MINHITS");
594 if (val != cgi_params.end()) {
595 min_hits = atol(val->second.c_str());
598 parse_omegascript();
// Handlers for the function-try-block.  Each one emits a Content-Type header
// if neither set_content_type nor suppress_http_headers is set, then writes a
// message to cout (HTML-escaped via html_escape() where there is exception
// text to show).
599 } catch (const Xapian::Error &e) {
600 if (!set_content_type && !suppress_http_headers)
601 cout << "Content-Type: text/html\n\n";
602 cout << "Exception: " << html_escape(e.get_msg()) << endl;
603 } catch (const std::exception &e) {
604 if (!set_content_type && !suppress_http_headers)
605 cout << "Content-Type: text/html\n\n";
606 cout << "Exception: std::exception " << html_escape(e.what()) << endl;
607 } catch (const string &s) {
608 if (!set_content_type && !suppress_http_headers)
609 cout << "Content-Type: text/html\n\n";
610 cout << "Exception: " << html_escape(s) << endl;
611 } catch (const char *s) {
612 if (!set_content_type && !suppress_http_headers)
613 cout << "Content-Type: text/html\n\n";
614 cout << "Exception: " << html_escape(s) << endl;
615 } catch (...) {
616 if (!set_content_type && !suppress_http_headers)
617 cout << "Content-Type: text/html\n\n";
618 cout << "Caught unknown exception" << endl;