Support a direction prefix on SORT
[xapian.git] / xapian-applications / omega / omega.cc
blobca6964abf05117da9fa6d316fe7d43b367acda18
1 /* omega.cc: Main module for omega (example CGI frontend for Xapian)
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015,2016 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
26 // If we're building against git after the expand API changed but before the
27 // version gets bumped to 1.3.2, we'll get a deprecation warning from
28 // get_eset() unless we suppress such warnings here.
29 #define XAPIAN_DEPRECATED(D) D
31 #include <cstdio>
32 #include <ctime>
34 #include <algorithm>
35 #include <cstring>
36 #include <iostream>
37 #include <set>
39 #include "safefcntl.h"
40 #include "safeunistd.h"
42 #include "omega.h"
43 #include "utils.h"
44 #include "cgiparam.h"
45 #include "query.h"
46 #include "str.h"
47 #include "stringutils.h"
48 #include "expand.h"
50 using namespace std;
// File-scope configuration and request state. main() fills most of these in
// from the CGI parameters before calling parse_omegascript().

// Stemming language which main() installs as option["stemmer"] by default.
52 static const char DEFAULT_STEM_LANGUAGE[] = "english";
54 // A character which doesn't require URL encoding, and isn't likely to appear
55 // in filter values.
56 const char filter_sep = '~';
58 // What we used for filter_sep in Omega < 1.3.4.
59 const char filter_sep_old = '-';
// Created by main() over `db` once the databases have been added; main() sets
// it to NULL instead if opening a database throws Xapian::Error.
61 Xapian::Enquire * enquire;
// The database(s) to search: one sub-database is added per distinct name in
// the DB parameter(s), or the default database if none were named.
62 Xapian::Database db;
// NOTE(review): not touched by main(); presumably the relevance set
// maintained by the query code - confirm against query.cc.
63 Xapian::RSet rset;
// Named option values; main() seeds "flag_default", "decimal", "thousand"
// and "stemmer" with defaults.
65 map<string, string> option;
// Raw values of the START, END and SPAN parameters (date range filters).
67 string date_start, date_end, date_span;
// main()'s exception handlers only emit a "Content-Type: text/html" header
// when both of these flags are false.  They aren't set in main() itself.
69 bool set_content_type = false;
71 bool suppress_http_headers = false;
// The distinct database names from the DB parameter(s) joined with '/', or
// default_db if no database was named.
73 string dbname;
// Value of the FMT parameter if non-empty, otherwise default_template.
74 string fmtname;
// Encoded summary of the B and N filters, default operator, date range,
// collapse key, docid order and sort settings.  `filters` uses the current
// encoding (filter_sep); `old_filters` uses the encoding from Omega < 1.3.4
// (filter_sep_old), and main() sets it to a copy of `filters` if it would
// otherwise be empty.
75 string filters, old_filters;
// NOTE(review): not assigned in main(); the MINHITS comment there refers to
// it as the start of the range of hits to fetch - confirm where it is set.
77 Xapian::docid topdoc = 0;
// From HITSPERPAGE: 0 or missing means 10, and values above 1000 are capped
// at 1000.
78 Xapian::docid hits_per_page = 0;
// From MINHITS (left at 0 if the parameter is absent).
79 Xapian::docid min_hits = 0;
81 // percentage cut-off
// From THRESHOLD, clamped by main() to the range 0..100.
82 int threshold = 0;
// Value slot number taken from the SORT parameter (after any '+'/'-' prefix).
84 Xapian::valueno sort_key = Xapian::BAD_VALUENO; // Don't sort.
// Set by a '-' (true) or '+' (false) prefix on SORT, then inverted if
// SORTREVERSE is non-zero.
85 bool reverse_sort = true;
// True if SORTAFTER is non-zero.
86 bool sort_after = false;
// From DOCIDORDER: a value starting 'D' selects DESCENDING, one starting with
// anything other than 'A' selects DONT_CARE.
87 Xapian::Enquire::docid_order docid_order = Xapian::Enquire::ASCENDING;
// Value slot number from a non-empty COLLAPSE parameter; `collapse` records
// whether such a parameter was given.
89 Xapian::valueno collapse_key = 0;
90 bool collapse = false;
92 static string
93 map_dbname_to_dir(const string &database_name)
95 return database_dir + database_name;
98 int main(int argc, char *argv[])
99 try {
100 read_config_file();
102 MCI val;
103 pair<MCI, MCI> g;
105 option["flag_default"] = "true";
107 // set default thousands and decimal separators: e.g. "16,729 hits" "1.4K"
108 option["decimal"] = ".";
109 option["thousand"] = ",";
111 // set the default stemming language
112 option["stemmer"] = DEFAULT_STEM_LANGUAGE;
114 // FIXME: set cout to linebuffered not stdout. Or just flush regularly...
115 //setvbuf(stdout, NULL, _IOLBF, 0);
117 const char * method = getenv("REQUEST_METHOD");
118 if (method == NULL) {
119 if (argc > 1 && (argv[1][0] != '-' || strchr(argv[1], '='))) {
120 // omega 'P=information retrieval' DB=papers
121 // check for a leading '-' on the first arg so "omega --version",
122 // "omega --help", and similar take the next branch
123 decode_argv(argv + 1);
124 } else {
125 // Seems we're running from the command line so give version
126 // and allow a query to be entered for testing
127 cout << PROGRAM_NAME " - " PACKAGE " " VERSION "\n";
128 if (argc > 1) exit(0);
129 cout << "Enter NAME=VALUE lines, end with blank line\n";
130 decode_test();
132 } else {
133 if (*method == 'P')
134 decode_post();
135 else
136 decode_get();
139 try {
140 // get database(s) to search
141 dbname.resize(0);
142 set<string> seen; // only add a repeated db once
143 g = cgi_params.equal_range("DB");
144 for (MCI i = g.first; i != g.second; ++i) {
145 const string & v = i->second;
146 if (!v.empty()) {
147 size_t p = 0, q;
148 while (true) {
149 q = v.find('/', p);
150 string s(v, p, q - p);
151 if (!s.empty() && seen.find(s) == seen.end()) {
152 // Translate DB parameter to path of database directory
153 if (!dbname.empty()) dbname += '/';
154 dbname += s;
155 db.add_database(Xapian::Database(map_dbname_to_dir(s)));
156 seen.insert(s);
158 if (q == string::npos) break;
159 p = q + 1;
163 if (dbname.empty()) {
164 dbname = default_db;
165 db.add_database(Xapian::Database(map_dbname_to_dir(dbname)));
167 enquire = new Xapian::Enquire(db);
169 catch (const Xapian::Error &) {
170 enquire = NULL;
173 hits_per_page = 0;
174 val = cgi_params.find("HITSPERPAGE");
175 if (val != cgi_params.end()) hits_per_page = atol(val->second.c_str());
176 if (hits_per_page == 0) {
177 hits_per_page = 10;
178 } else if (hits_per_page > 1000) {
179 hits_per_page = 1000;
182 val = cgi_params.find("DEFAULTOP");
183 if (val != cgi_params.end()) {
184 const string & v = val->second;
185 if (v == "OR" || v == "or")
186 default_op = Xapian::Query::OP_OR;
189 val = cgi_params.find("FMT");
190 if (val != cgi_params.end()) {
191 const string & v = val->second;
192 if (!v.empty()) fmtname = v;
194 if (fmtname.empty())
195 fmtname = default_template;
197 val = cgi_params.find("MORELIKE");
198 if (enquire && val != cgi_params.end()) {
199 const string & v = val->second;
200 Xapian::docid docid = atol(v.c_str());
201 if (docid == 0) {
202 // Assume it's MORELIKE=Quid1138 and that Quid1138 is a UID
203 // from an external source - we just find the corresponding docid
204 Xapian::PostingIterator p = db.postlist_begin(v);
205 if (p != db.postlist_end(v)) docid = *p;
208 if (docid != 0) {
209 Xapian::RSet tmprset;
210 tmprset.add_document(docid);
212 OmegaExpandDecider decider(db);
213 set_expansion_scheme(*enquire, option);
214 #if XAPIAN_AT_LEAST(1,3,2)
215 Xapian::ESet eset(enquire->get_eset(40, tmprset, &decider));
216 #else
217 Xapian::ESet eset(enquire->get_eset(40, tmprset, 0,
218 expand_param_k, &decider));
219 #endif
220 string morelike_query;
221 for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); i++) {
222 if (!morelike_query.empty()) morelike_query += ' ';
223 morelike_query += pretty_term(*i);
225 set_probabilistic_query(string(), morelike_query);
227 } else {
228 // add expand/topterms terms if appropriate
229 string expand_terms;
230 if (cgi_params.find("ADD") != cgi_params.end()) {
231 g = cgi_params.equal_range("X");
232 for (MCI i = g.first; i != g.second; i++) {
233 const string & v = i->second;
234 if (!v.empty()) {
235 if (!expand_terms.empty())
236 expand_terms += ' ';
237 expand_terms += v;
242 // collect the unprefixed prob fields
243 g = cgi_params.equal_range("P");
244 for (MCI i = g.first; i != g.second; i++) {
245 const string & v = i->second;
246 if (!v.empty()) {
247 // If there are expand terms, append them to the first
248 // non-empty P parameter.
249 if (!expand_terms.empty()) {
250 string q = v;
251 q += ' ';
252 q += expand_terms;
253 set_probabilistic_query(string(), q);
254 expand_terms = string();
255 } else {
256 set_probabilistic_query(string(), v);
261 if (!expand_terms.empty()) {
262 set_probabilistic_query(string(), expand_terms);
266 g.first = cgi_params.lower_bound("P.");
267 g.second = cgi_params.lower_bound("P/"); // '/' is '.' + 1.
268 for (MCI i = g.first; i != g.second; i++) {
269 const string & v = i->second;
270 if (!v.empty()) {
271 string pfx(i->first, 2, string::npos);
272 set_probabilistic_query(pfx, v);
276 // set any boolean filters
277 g = cgi_params.equal_range("B");
278 if (g.first != g.second) {
279 vector<string> filter_v;
280 for (MCI i = g.first; i != g.second; i++) {
281 const string & v = i->second;
282 // we'll definitely get empty B fields from "-ALL-" options
283 if (!v.empty() && C_isalnum(v[0])) {
284 add_bterm(v);
285 filter_v.push_back(v);
288 sort(filter_v.begin(), filter_v.end());
289 vector<string>::const_iterator j;
290 for (j = filter_v.begin(); j != filter_v.end(); ++j) {
291 const string & bterm = *j;
292 string::size_type e = bterm.find(filter_sep);
293 if (usual(e == string::npos)) {
294 filters += bterm;
295 } else {
296 // If a filter contains filter_sep then double it to escape.
297 // Each filter must start with an alnum (checked above) and
298 // the value after the last filter is the default op, which
299 // is encoded as a non-alnum so filter_sep followed by
300 // something other than filter_sep must be separating filters.
301 string::size_type b = 0;
302 while (e != string::npos) {
303 filters.append(bterm, b, e + 1 - b);
304 b = e;
305 e = bterm.find(filter_sep, b + 1);
307 filters.append(bterm, b, string::npos);
309 filters += filter_sep;
310 old_filters += bterm;
311 old_filters += filter_sep_old;
315 // set any negated boolean filters
316 g = cgi_params.equal_range("N");
317 if (g.first != g.second) {
318 vector<string> filter_v;
319 for (MCI i = g.first; i != g.second; i++) {
320 const string & v = i->second;
321 // we'll definitely get empty N fields from "-ALL-" options
322 if (!v.empty() && C_isalnum(v[0])) {
323 add_nterm(v);
324 filter_v.push_back(v);
327 sort(filter_v.begin(), filter_v.end());
328 vector<string>::const_iterator j;
329 for (j = filter_v.begin(); j != filter_v.end(); ++j) {
330 const string & nterm = *j;
331 string::size_type e = nterm.find(filter_sep);
332 filters += '!';
333 if (usual(e == string::npos)) {
334 filters += nterm;
335 } else {
336 // If a filter contains filter_sep then double it to escape.
337 // Each filter must start with an alnum (checked above) and
338 // the value after the last filter is the default op, which
339 // is encoded as a non-alnum so filter_sep followed by
340 // something other than filter_sep must be separating filters.
341 string::size_type b = 0;
342 while (e != string::npos) {
343 filters.append(nterm, b, e + 1 - b);
344 b = e;
345 e = nterm.find(filter_sep, b + 1);
347 filters.append(nterm, b, string::npos);
349 filters += filter_sep;
350 // old_filters predates 'N' terms, so if there are 'N' terms this
351 // is definitely a different query.
352 old_filters.clear();
356 // date range filters
357 val = cgi_params.find("START");
358 if (val != cgi_params.end()) date_start = val->second;
359 val = cgi_params.find("END");
360 if (val != cgi_params.end()) date_end = val->second;
361 val = cgi_params.find("SPAN");
362 if (val != cgi_params.end()) date_span = val->second;
364 // If more default_op values are supported, encode them as non-alnums
365 // other than filter_sep or '!'.
366 filters += (default_op == Xapian::Query::OP_AND ? '.' : '-');
367 filters += date_start;
368 filters += filter_sep;
369 filters += date_end;
370 filters += filter_sep;
371 filters += date_span;
373 if (!old_filters.empty()) {
374 old_filters += date_start;
375 old_filters += filter_sep_old;
376 old_filters += date_end;
377 old_filters += filter_sep_old;
378 old_filters += date_span;
379 old_filters += (default_op == Xapian::Query::OP_AND ? 'A' : 'O');
382 // Percentage relevance cut-off
383 val = cgi_params.find("THRESHOLD");
384 if (val != cgi_params.end()) {
385 threshold = atoi(val->second.c_str());
386 if (threshold < 0) threshold = 0;
387 if (threshold > 100) threshold = 100;
390 // collapsing
391 val = cgi_params.find("COLLAPSE");
392 if (val != cgi_params.end()) {
393 const string & v = val->second;
394 if (!v.empty()) {
395 collapse_key = atoi(v.c_str());
396 collapse = true;
397 filters += filter_sep;
398 filters += str(collapse_key);
399 if (!old_filters.empty()) {
400 old_filters += filter_sep_old;
401 old_filters += str(collapse_key);
406 // docid order
407 val = cgi_params.find("DOCIDORDER");
408 if (val != cgi_params.end()) {
409 const string & v = val->second;
410 if (!v.empty()) {
411 char ch = v[0];
412 if (ch == 'D') {
413 docid_order = Xapian::Enquire::DESCENDING;
414 filters += 'D';
415 if (!old_filters.empty()) old_filters += 'D';
416 } else if (ch != 'A') {
417 docid_order = Xapian::Enquire::DONT_CARE;
418 } else {
419 filters += 'X';
420 if (!old_filters.empty()) old_filters += 'X';
425 // sorting
426 val = cgi_params.find("SORT");
427 if (val != cgi_params.end()) {
428 const char * p = val->second.c_str();
429 if (*p == '-' || *p == '+') {
430 reverse_sort = (*p == '-');
431 ++p;
433 sort_key = atoi(p);
435 val = cgi_params.find("SORTREVERSE");
436 if (val != cgi_params.end() && atoi(val->second.c_str()) != 0) {
437 reverse_sort = !reverse_sort;
440 val = cgi_params.find("SORTAFTER");
441 if (val != cgi_params.end()) {
442 sort_after = (atoi(val->second.c_str()) != 0);
445 // Add the sorting related options to filters too.
447 // Note: old_filters really does encode a reversed sort as 'F' and a
448 // non-reversed sort as 'R' or 'r'.
450 // filters has them the other way around for sanity (except in
451 // development snapshot 1.3.4, which was when the new filter encoding
452 // was introduced).
453 filters += str(sort_key);
454 if (!old_filters.empty()) old_filters += str(sort_key);
455 if (sort_after) {
456 if (reverse_sort) {
457 filters += 'R';
458 if (!old_filters.empty()) old_filters += 'F';
459 } else {
460 filters += 'F';
461 if (!old_filters.empty()) old_filters += 'R';
463 } else {
464 if (!reverse_sort) {
465 filters += 'f';
466 if (!old_filters.empty()) old_filters += 'r';
471 if (old_filters.empty()) old_filters = filters;
473 // min_hits (fill mset past topdoc+(hits_per_page+1) to
474 // topdoc+max(hits_per_page+1,min_hits))
475 val = cgi_params.find("MINHITS");
476 if (val != cgi_params.end()) {
477 min_hits = atol(val->second.c_str());
480 parse_omegascript();
481 } catch (const Xapian::Error &e) {
482 if (!set_content_type && !suppress_http_headers)
483 cout << "Content-Type: text/html\n\n";
484 cout << "Exception: " << html_escape(e.get_msg()) << endl;
485 } catch (const std::exception &e) {
486 if (!set_content_type && !suppress_http_headers)
487 cout << "Content-Type: text/html\n\n";
488 cout << "Exception: std::exception " << html_escape(e.what()) << endl;
489 } catch (const string &s) {
490 if (!set_content_type && !suppress_http_headers)
491 cout << "Content-Type: text/html\n\n";
492 cout << "Exception: " << html_escape(s) << endl;
493 } catch (const char *s) {
494 if (!set_content_type && !suppress_http_headers)
495 cout << "Content-Type: text/html\n\n";
496 cout << "Exception: " << html_escape(s) << endl;
497 } catch (...) {
498 if (!set_content_type && !suppress_http_headers)
499 cout << "Content-Type: text/html\n\n";
500 cout << "Caught unknown exception" << endl;