[ci] Fix netbsd job to upgrade existing packages
[xapian.git] / xapian-applications / omega / omega.cc
blob452463ffdd66a03d3ba9d9060aa9f2946f0a93c1
1 /** @file
2 * @brief Main module for omega (example CGI frontend for Xapian)
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2023 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
25 #include <config.h>
27 #include <cerrno>
28 #include <cstdio>
29 #include <ctime>
31 #include <algorithm>
32 #include <cstring>
33 #include <iostream>
34 #include <set>
36 #include "safefcntl.h"
37 #include "safeunistd.h"
39 #include "omega.h"
40 #include "utils.h"
41 #include "cgiparam.h"
42 #include "query.h"
43 #include "str.h"
44 #include "stringutils.h"
45 #include "expand.h"
46 #include "parseint.h"
48 using namespace std;
50 static const char DEFAULT_STEM_LANGUAGE[] = "english";
52 // A character which doesn't require URL encoding, and isn't likely to appear
53 // in filter values.
54 const char filter_sep = '~';
56 Xapian::Enquire * enquire;
57 Xapian::Database db;
59 map<string, string> option;
61 bool set_content_type = false;
63 bool suppress_http_headers = false;
65 string dbname;
66 string fmtname;
67 string filters, old_filters;
69 Xapian::docid hits_per_page = 0;
70 Xapian::docid min_hits = 0;
72 // percentage cut-off
73 int threshold = 0;
75 Xapian::MultiValueKeyMaker* sort_keymaker = NULL;
76 Xapian::valueno sort_key = Xapian::BAD_VALUENO; // Don't sort.
77 bool reverse_sort = true;
78 bool sort_after = false;
79 Xapian::Enquire::docid_order docid_order = Xapian::Enquire::ASCENDING;
81 Xapian::valueno collapse_key = 0;
82 bool collapse = false;
84 static string
85 map_dbname_to_dir(const string &database_name)
87 return database_dir + database_name;
90 static void
91 add_database(const string& this_dbname)
93 if (!dbname.empty()) dbname += '/';
94 dbname += this_dbname;
96 Xapian::Database this_db(map_dbname_to_dir(this_dbname));
97 db.add_database(this_db);
99 size_t this_db_size = this_db.size();
100 size_t db_size = db.size();
101 size_t i = 0;
102 while (subdbs.size() != db_size) {
103 subdbs.emplace_back(this_dbname, i++, this_db_size);
107 // Get database(s) to search.
108 template<typename IT>
109 void
110 parse_db_params(const pair<IT, IT>& dbs)
112 dbname.resize(0);
113 // Only add a repeated db once.
114 set<string> seen;
115 for (auto i = dbs.first; i != dbs.second; ++i) {
116 const string& v = i->second;
117 if (v.empty()) continue;
118 size_t p = 0, q;
119 while (true) {
120 q = v.find('/', p);
121 string s(v, p, q - p);
122 if (!s.empty() && seen.find(s) == seen.end()) {
123 add_database(s);
124 seen.insert(s);
126 if (q == string::npos) break;
127 p = q + 1;
132 int main(int argc, char *argv[])
133 try {
135 // Check for SERVER_PROTOCOL=INCLUDED, which is set when we're being
136 // included in a page via a server-side include directive. In this
137 // case we suppress sending a Content-Type: header.
138 const char* p = getenv("SERVER_PROTOCOL");
139 if (p && strcmp(p, "INCLUDED") == 0) {
140 suppress_http_headers = true;
144 read_config_file();
146 option["flag_default"] = "true";
148 // set default thousands and decimal separators: e.g. "16,729 hits" "1.4K"
149 option["decimal"] = ".";
150 option["thousand"] = ",";
152 // set the default stemming language
153 option["stemmer"] = DEFAULT_STEM_LANGUAGE;
155 // FIXME: set cout to linebuffered not stdout. Or just flush regularly...
156 // setvbuf(stdout, NULL, _IOLBF, 0);
158 const char * method = getenv("REQUEST_METHOD");
159 if (method == NULL) {
160 if (argc > 1 && (argv[1][0] != '-' || strchr(argv[1], '='))) {
161 // omega 'P=information retrieval' DB=papers
162 // check for a leading '-' on the first arg so "omega --version",
163 // "omega --help", and similar take the next branch
164 decode_argv(argv + 1);
165 } else {
166 // Seems we're running from the command line so give version
167 // and allow a query to be entered for testing
168 cout << PROGRAM_NAME " - " PACKAGE " " VERSION "\n";
169 if (argc > 1) exit(0);
170 cout << "Enter NAME=VALUE lines, end with blank line\n";
171 decode_test();
173 } else {
174 if (*method == 'P')
175 decode_post();
176 else
177 decode_get();
180 try {
181 parse_db_params(cgi_params.equal_range("DB"));
182 if (dbname.empty()) {
183 add_database(default_db);
185 enquire = new Xapian::Enquire(db);
186 } catch (const Xapian::Error &) {
187 enquire = NULL;
188 db = Xapian::Database();
191 hits_per_page = 0;
192 auto val = cgi_params.find("HITSPERPAGE");
193 if (val != cgi_params.end()) {
194 if (!parse_unsigned(val->second.c_str(), hits_per_page)) {
195 throw "HITSPERPAGE parameter must be >= 0";
198 if (hits_per_page == 0) {
199 hits_per_page = 10;
200 } else if (hits_per_page > 1000) {
201 hits_per_page = 1000;
204 val = cgi_params.find("DEFAULTOP");
205 if (val != cgi_params.end()) {
206 const string & v = val->second;
207 if (v == "OR" || v == "or")
208 default_op = Xapian::Query::OP_OR;
211 val = cgi_params.find("FMT");
212 if (val != cgi_params.end()) {
213 const string & v = val->second;
214 if (!v.empty()) fmtname = v;
216 if (fmtname.empty())
217 fmtname = default_template;
219 auto ml = cgi_params.equal_range("MORELIKE");
220 if (enquire && ml.first != ml.second) {
221 Xapian::RSet tmprset;
222 for (auto i = ml.first; i != ml.second; ++i) {
223 const string& v = i->second;
224 Xapian::docid docid = atol(v.c_str());
225 if (docid == 0) {
226 // Assume it's MORELIKE=Quid1138 and that Quid1138 is a UID
227 // from an external source - we just find the correspond docid.
228 Xapian::PostingIterator p = db.postlist_begin(v);
229 if (p != db.postlist_end(v)) docid = *p;
231 if (docid != 0) {
232 tmprset.add_document(docid);
236 if (!tmprset.empty()) {
237 OmegaExpandDecider decider(db);
238 set_expansion_scheme(*enquire, option);
239 Xapian::ESet eset(enquire->get_eset(40, tmprset, &decider));
240 string morelike_query;
241 for (auto&& term : eset) {
242 if (!morelike_query.empty()) {
243 if (default_op == Xapian::Query::OP_OR) {
244 morelike_query += ' ';
245 } else {
246 morelike_query += " OR ";
249 morelike_query += pretty_term(term);
251 add_query_string(string(), morelike_query);
253 } else {
254 // add expand/topterms terms if appropriate
255 string expand_terms;
256 if (cgi_params.find("ADD") != cgi_params.end()) {
257 auto g = cgi_params.equal_range("X");
258 for (auto i = g.first; i != g.second; ++i) {
259 const string & v = i->second;
260 if (!v.empty()) {
261 if (!expand_terms.empty())
262 expand_terms += ' ';
263 expand_terms += v;
268 // collect the unprefixed prob fields
269 auto g = cgi_params.equal_range("P");
270 for (auto i = g.first; i != g.second; ++i) {
271 const string & v = i->second;
272 if (!v.empty()) {
273 // If there are expand terms, append them to the first
274 // non-empty P parameter.
275 if (!expand_terms.empty()) {
276 string q = v;
277 q += ' ';
278 q += expand_terms;
279 add_query_string(string(), q);
280 expand_terms = string();
281 } else {
282 add_query_string(string(), v);
287 if (!expand_terms.empty()) {
288 add_query_string(string(), expand_terms);
292 auto begin = cgi_params.lower_bound("P.");
293 auto end = cgi_params.lower_bound("P/"); // '/' is '.' + 1.
294 for (auto i = begin; i != end; ++i) {
295 const string & v = i->second;
296 if (!v.empty()) {
297 string pfx(i->first, 2, string::npos);
298 add_query_string(pfx, v);
302 // set any boolean filters
303 auto g = cgi_params.equal_range("B");
304 if (g.first != g.second) {
305 vector<string> filter_v;
306 for (auto i = g.first; i != g.second; ++i) {
307 const string & v = i->second;
308 // we'll definitely get empty B fields from "-ALL-" options
309 if (!v.empty() && C_isalnum(v[0])) {
310 add_bterm(v);
311 filter_v.push_back(v);
314 sort(filter_v.begin(), filter_v.end());
315 vector<string>::const_iterator j;
316 for (j = filter_v.begin(); j != filter_v.end(); ++j) {
317 const string & bterm = *j;
318 string::size_type e = bterm.find(filter_sep);
319 if (usual(e == string::npos)) {
320 filters += bterm;
321 } else {
322 // If a filter contains filter_sep then double it to escape.
323 // Each filter must start with an alnum (checked above) and
324 // the value after the last filter is the default op, which
325 // is encoded as a non-alnum so filter_sep followed by
326 // something other than filter_sep must be separating filters.
327 string::size_type b = 0;
328 while (e != string::npos) {
329 filters.append(bterm, b, e + 1 - b);
330 b = e;
331 e = bterm.find(filter_sep, b + 1);
333 filters.append(bterm, b, string::npos);
335 filters += filter_sep;
339 // set any negated boolean filters
340 g = cgi_params.equal_range("N");
341 if (g.first != g.second) {
342 vector<string> filter_v;
343 for (auto i = g.first; i != g.second; ++i) {
344 const string & v = i->second;
345 // we'll definitely get empty N fields from "-ALL-" options
346 if (!v.empty() && C_isalnum(v[0])) {
347 add_nterm(v);
348 filter_v.push_back(v);
351 sort(filter_v.begin(), filter_v.end());
352 vector<string>::const_iterator j;
353 for (j = filter_v.begin(); j != filter_v.end(); ++j) {
354 const string & nterm = *j;
355 string::size_type e = nterm.find(filter_sep);
356 filters += '!';
357 if (usual(e == string::npos)) {
358 filters += nterm;
359 } else {
360 // If a filter contains filter_sep then double it to escape.
361 // Each filter must start with an alnum (checked above) and
362 // the value after the last filter is the default op, which
363 // is encoded as a non-alnum so filter_sep followed by
364 // something other than filter_sep must be separating filters.
365 string::size_type b = 0;
366 while (e != string::npos) {
367 filters.append(nterm, b, e + 1 - b);
368 b = e;
369 e = nterm.find(filter_sep, b + 1);
371 filters.append(nterm, b, string::npos);
373 filters += filter_sep;
377 // date range filters
378 struct date_range {
379 string start, end, span;
381 map<Xapian::valueno, date_range> date_ranges;
382 begin = cgi_params.lower_bound("START.");
383 end = cgi_params.lower_bound("START/"); // '/' is '.' + 1.
384 for (auto i = begin; i != end; ++i) {
385 const string & v = i->second;
386 if (!v.empty()) {
387 Xapian::valueno slot;
388 if (!parse_unsigned(i->first.c_str() +
389 CONST_STRLEN("START."), slot)) {
390 throw "START slot value must be >= 0";
392 date_ranges[slot].start = v;
395 begin = cgi_params.lower_bound("END.");
396 end = cgi_params.lower_bound("END/"); // '/' is '.' + 1.
397 for (auto i = begin; i != end; ++i) {
398 const string & v = i->second;
399 if (!v.empty()) {
400 Xapian::valueno slot;
401 if (!parse_unsigned(i->first.c_str() +
402 CONST_STRLEN("END."), slot)) {
403 throw "END slot value must be >= 0";
405 date_ranges[slot].end = v;
408 begin = cgi_params.lower_bound("SPAN.");
409 end = cgi_params.lower_bound("SPAN/"); // '/' is '.' + 1.
410 for (auto i = begin; i != end; ++i) {
411 const string & v = i->second;
412 if (!v.empty()) {
413 Xapian::valueno slot;
414 if (!parse_unsigned(i->first.c_str() +
415 CONST_STRLEN("SPAN."), slot)) {
416 throw "SPAN slot value must be >= 0";
418 date_ranges[slot].span = v;
421 for (auto i : date_ranges) {
422 auto slot = i.first;
423 auto r = i.second;
424 add_date_filter(r.start, r.end, r.span, slot);
425 filters += '$';
426 filters += str(slot);
427 filters += '$';
428 filters += r.start;
429 filters += '$';
430 filters += r.end;
431 filters += '$';
432 filters += r.span;
435 string date_start, date_end, date_span;
436 val = cgi_params.find("DATEVALUE");
437 Xapian::valueno date_value_slot = Xapian::BAD_VALUENO;
438 if (val != cgi_params.end() &&
439 !parse_unsigned(val->second.c_str(), date_value_slot)) {
440 throw "DATEVALUE slot must be >= 0";
442 // Process DATEVALUE=n and associated values unless we saw START.n=...
443 // or END.n=... or SPAN.n=...
444 if (date_ranges.find(date_value_slot) == date_ranges.end()) {
445 val = cgi_params.find("START");
446 if (val != cgi_params.end()) date_start = val->second;
447 val = cgi_params.find("END");
448 if (val != cgi_params.end()) date_end = val->second;
449 val = cgi_params.find("SPAN");
450 if (val != cgi_params.end()) date_span = val->second;
451 add_date_filter(date_start, date_end, date_span, date_value_slot);
454 // If more default_op values are supported, encode them as non-alnums
455 // other than filter_sep, '!' or '$'.
456 filters += (default_op == Xapian::Query::OP_AND ? '.' : '-');
457 filters += date_start;
458 filters += filter_sep;
459 filters += date_end;
460 filters += filter_sep;
461 filters += date_span;
462 if (date_value_slot != Xapian::BAD_VALUENO) {
463 // This means we'll force the first page when reloading or changing
464 // page starting from existing URLs upon upgrade to 1.4.1, but the
465 // exact same existing URL could be for a search without the date
466 // filter where we want to force the first page, so there's an inherent
467 // ambiguity there. Forcing first page in this case seems the least
468 // problematic side-effect.
469 filters += filter_sep;
470 filters += str(date_value_slot);
473 // Percentage relevance cut-off
474 val = cgi_params.find("THRESHOLD");
475 if (val != cgi_params.end()) {
476 unsigned int temp;
477 if (val->second[0] == '-') {
478 if (!parse_unsigned(val->second.c_str() + 1, temp)) {
479 throw "THRESHOLD parameter must be an integer";
481 threshold = 0;
482 } else if (!parse_unsigned(val->second.c_str(), temp)) {
483 throw "THRESHOLD parameter must be an integer";
485 if (temp > 100) {
486 threshold = 100;
487 } else {
488 threshold = temp;
492 // collapsing
493 val = cgi_params.find("COLLAPSE");
494 if (val != cgi_params.end()) {
495 const string & v = val->second;
496 if (!v.empty()) {
497 if (!parse_unsigned(val->second.c_str(), collapse_key)) {
498 throw "COLLAPSE parameter must be >= 0";
500 collapse = true;
501 filters += filter_sep;
502 filters += str(collapse_key);
505 if (!collapse && date_value_slot != Xapian::BAD_VALUENO) {
506 // We need to either omit filter_sep for both or neither, or else the
507 // encoding is ambiguous.
508 filters += filter_sep;
511 // docid order
512 val = cgi_params.find("DOCIDORDER");
513 if (val != cgi_params.end()) {
514 const string & v = val->second;
515 if (!v.empty()) {
516 char ch = v[0];
517 if (ch == 'D') {
518 docid_order = Xapian::Enquire::DESCENDING;
519 filters += 'D';
520 } else if (ch != 'A') {
521 docid_order = Xapian::Enquire::DONT_CARE;
522 } else {
523 // This is a bug (should add nothing here and 'X' in the (ch !=
524 // 'A') case, but the current "DONT_CARE" implementation
525 // actually always results in ascending docid order so it's not
526 // worth breaking compatibility to fix - let's just do it next
527 // time we change the encoding $filters uses.
528 filters += 'X';
533 // sorting
534 val = cgi_params.find("SORT");
535 if (val != cgi_params.end()) {
536 const char * base = val->second.c_str();
537 const char * p = base;
538 do {
539 bool rev = (*p != '+');
540 if (*p == '-' || *p == '+') {
541 ++p;
543 if (!C_isdigit(*p)) {
544 // Invalid.
545 break;
547 errno = 0;
548 char * q;
549 Xapian::valueno slot = strtoul(p, &q, 10);
550 p = q;
551 if (errno != 0) {
552 // Invalid.
553 break;
556 if (sort_key != Xapian::BAD_VALUENO) {
557 // Multiple sort keys specified, so we need a KeyMaker.
559 // Omit leading '+'.
560 if (reverse_sort) filters += '-';
561 filters += str(sort_key);
563 sort_keymaker = new Xapian::MultiValueKeyMaker;
564 sort_keymaker->add_value(sort_key, !reverse_sort);
565 sort_key = Xapian::BAD_VALUENO;
566 reverse_sort = true;
569 if (sort_keymaker) {
570 filters += (rev ? '-' : '+');
571 filters += str(slot);
572 sort_keymaker->add_value(slot, !rev);
573 } else {
574 sort_key = slot;
575 reverse_sort = rev;
577 while (C_isspace(*p) || *p == ',') ++p;
578 } while (*p);
580 val = cgi_params.find("SORTREVERSE");
581 if (val != cgi_params.end()) {
582 unsigned int temp;
583 if (!parse_unsigned(val->second.c_str(), temp)) {
584 throw "SORTREVERSE parameter must be >= 0";
586 if (temp != 0) {
587 reverse_sort = !reverse_sort;
590 val = cgi_params.find("SORTAFTER");
591 if (val != cgi_params.end()) {
592 unsigned int temp;
593 if (!parse_unsigned(val->second.c_str(), temp)) {
594 throw "SORTAFTER parameter must be >= 0";
596 sort_after = bool(temp);
599 // Add the sorting related options to filters too.
600 if (!sort_keymaker) filters += str(sort_key);
601 if (sort_after) {
602 if (reverse_sort) {
603 filters += 'R';
604 } else {
605 filters += 'F';
607 } else {
608 if (!reverse_sort) {
609 filters += 'f';
614 // min_hits (fill mset past topdoc+(hits_per_page+1) to
615 // topdoc+max(hits_per_page+1,min_hits)
616 val = cgi_params.find("MINHITS");
617 if (val != cgi_params.end()) {
618 if (!parse_unsigned(val->second.c_str(), min_hits)) {
619 throw "MINHITS parameter must be >= 0";
623 parse_omegascript();
624 } catch (const Xapian::Error &e) {
625 if (!set_content_type && !suppress_http_headers)
626 cout << "Content-Type: text/html\n\n";
627 cout << "Exception: " << html_escape(e.get_description()) << endl;
628 } catch (const std::exception &e) {
629 if (!set_content_type && !suppress_http_headers)
630 cout << "Content-Type: text/html\n\n";
631 cout << "Exception: std::exception " << html_escape(e.what()) << endl;
632 } catch (const string &s) {
633 if (!set_content_type && !suppress_http_headers)
634 cout << "Content-Type: text/html\n\n";
635 cout << "Exception: " << html_escape(s) << endl;
636 } catch (const char *s) {
637 if (!set_content_type && !suppress_http_headers)
638 cout << "Content-Type: text/html\n\n";
639 cout << "Exception: " << html_escape(s) << endl;
640 } catch (...) {
641 if (!set_content_type && !suppress_http_headers)
642 cout << "Content-Type: text/html\n\n";
643 cout << "Caught unknown exception" << endl;