1 /* omega.cc: Main module for omega (example CGI frontend for Xapian)
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015,2016 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 // If we're building against git after the expand API changed but before the
27 // version gets bumped to 1.3.2, we'll get a deprecation warning from
28 // get_eset() unless we suppress such warnings here.
29 #define XAPIAN_DEPRECATED(D) D
39 #include "safefcntl.h"
40 #include "safeunistd.h"
47 #include "stringutils.h"
// File-scope state for a single request.  main() fills most of these in from
// the CGI parameters.

// Stemming language which main() stores in option["stemmer"] as the default.
52 static const char DEFAULT_STEM_LANGUAGE
[] = "english";
54 // A character which doesn't require URL encoding, and isn't likely to appear
// in filter values.  main() appends it between the components of the
// `filters` string, and doubles any occurrence inside a filter term to
// escape it.
56 const char filter_sep
= '~';
58 // What we used for filter_sep in Omega < 1.3.4.
// main() still uses it as the separator when building `old_filters`.
59 const char filter_sep_old
= '-';
// Enquire object for this request.  main() allocates it with new after adding
// the database(s), and tests it for non-null before handling MORELIKE.
61 Xapian::Enquire
* enquire
;
// Named option values.  main() seeds "flag_default", "decimal", "thousand"
// and "stemmer", and passes the map to set_expansion_scheme().
65 map
<string
, string
> option
;
// Raw values of the START, END and SPAN CGI parameters (empty if absent).
67 string date_start
, date_end
, date_span
;
// Value slot taken from the DATEVALUE CGI parameter; BAD_VALUENO if it wasn't
// given.
68 Xapian::valueno date_value_slot
= Xapian::BAD_VALUENO
;
// main()'s exception handlers only emit the default "Content-Type: text/html"
// header when this is false.
// NOTE(review): the code which sets this isn't in view - presumably it records
// that a Content-Type header has already been sent; confirm.
70 bool set_content_type
= false;
// When true, main()'s exception handlers don't emit the Content-Type header.
72 bool suppress_http_headers
= false;
// Encodings of the filter-related parameters (B and N terms, default operator,
// date range, collapse key, docid order and sort settings): `filters` uses the
// current encoding with filter_sep, `old_filters` the older encoding with
// filter_sep_old.  main() only appends the later components to old_filters
// while it is non-empty, and if it ends up empty copies `filters` into it.
76 string filters
, old_filters
;
// NOTE(review): not assigned in the code visible here; the MINHITS comment in
// main() treats it as the offset the requested range of hits starts from -
// confirm.
78 Xapian::docid topdoc
= 0;
// From the HITSPERPAGE CGI parameter; main() clamps values above 1000 to 1000
// and special-cases 0.
79 Xapian::docid hits_per_page
= 0;
// From the MINHITS CGI parameter.
80 Xapian::docid min_hits
= 0;
// Allocated by main() when SORT specifies more than one key; NULL otherwise.
85 Xapian::MultiValueKeyMaker
* sort_keymaker
= NULL
;
// Single value slot to sort on.  main() resets it to BAD_VALUENO once the key
// has been handed over to sort_keymaker.
86 Xapian::valueno sort_key
= Xapian::BAD_VALUENO
; // Don't sort.
// Sort direction; main() inverts it when SORTREVERSE is non-zero.
87 bool reverse_sort
= true;
// Set by main() to whether the SORTAFTER CGI parameter is non-zero.
88 bool sort_after
= false;
// Set from the DOCIDORDER CGI parameter: main() selects DESCENDING or
// DONT_CARE, otherwise this default of ASCENDING stands.
89 Xapian::Enquire::docid_order docid_order
= Xapian::Enquire::ASCENDING
;
// Value slot parsed (with atoi) from the COLLAPSE CGI parameter.
91 Xapian::valueno collapse_key
= 0;
// main() tests this together with date_value_slot when deciding whether to
// append a trailing filter_sep to `filters`.
// NOTE(review): the assignment isn't in view - presumably set when COLLAPSE
// is given; confirm.
92 bool collapse
= false;
// Translate a database name into the path of its database directory by
// appending the name to database_dir.  main() passes the result straight to
// the Xapian::Database constructor.  Nothing is inserted between the two
// parts and database_name isn't checked in this function.
// NOTE(review): database_dir is defined outside the code visible here -
// presumably it already ends with a directory separator; confirm.
95 map_dbname_to_dir(const string
&database_name
)
97 return database_dir
+ database_name
;
100 int main(int argc
, char *argv
[])
107 option
["flag_default"] = "true";
109 // set default thousands and decimal separators: e.g. "16,729 hits" "1.4K"
110 option
["decimal"] = ".";
111 option
["thousand"] = ",";
113 // set the default stemming language
114 option
["stemmer"] = DEFAULT_STEM_LANGUAGE
;
116 // FIXME: set cout to linebuffered not stdout. Or just flush regularly...
117 //setvbuf(stdout, NULL, _IOLBF, 0);
119 const char * method
= getenv("REQUEST_METHOD");
120 if (method
== NULL
) {
121 if (argc
> 1 && (argv
[1][0] != '-' || strchr(argv
[1], '='))) {
122 // omega 'P=information retrieval' DB=papers
123 // check for a leading '-' on the first arg so "omega --version",
124 // "omega --help", and similar take the next branch
125 decode_argv(argv
+ 1);
127 // Seems we're running from the command line so give version
128 // and allow a query to be entered for testing
129 cout
<< PROGRAM_NAME
" - " PACKAGE
" " VERSION
"\n";
130 if (argc
> 1) exit(0);
131 cout
<< "Enter NAME=VALUE lines, end with blank line\n";
142 // get database(s) to search
144 set
<string
> seen
; // only add a repeated db once
145 g
= cgi_params
.equal_range("DB");
146 for (MCI i
= g
.first
; i
!= g
.second
; ++i
) {
147 const string
& v
= i
->second
;
152 string
s(v
, p
, q
- p
);
153 if (!s
.empty() && seen
.find(s
) == seen
.end()) {
154 // Translate DB parameter to path of database directory
155 if (!dbname
.empty()) dbname
+= '/';
157 db
.add_database(Xapian::Database(map_dbname_to_dir(s
)));
160 if (q
== string::npos
) break;
165 if (dbname
.empty()) {
167 db
.add_database(Xapian::Database(map_dbname_to_dir(dbname
)));
169 enquire
= new Xapian::Enquire(db
);
170 } catch (const Xapian::Error
&) {
175 val
= cgi_params
.find("HITSPERPAGE");
176 if (val
!= cgi_params
.end()) hits_per_page
= atol(val
->second
.c_str());
177 if (hits_per_page
== 0) {
179 } else if (hits_per_page
> 1000) {
180 hits_per_page
= 1000;
183 val
= cgi_params
.find("DEFAULTOP");
184 if (val
!= cgi_params
.end()) {
185 const string
& v
= val
->second
;
186 if (v
== "OR" || v
== "or")
187 default_op
= Xapian::Query::OP_OR
;
190 val
= cgi_params
.find("FMT");
191 if (val
!= cgi_params
.end()) {
192 const string
& v
= val
->second
;
193 if (!v
.empty()) fmtname
= v
;
196 fmtname
= default_template
;
198 val
= cgi_params
.find("MORELIKE");
199 if (enquire
&& val
!= cgi_params
.end()) {
200 const string
& v
= val
->second
;
201 Xapian::docid docid
= atol(v
.c_str());
203 // Assume it's MORELIKE=Quid1138 and that Quid1138 is a UID
204 // from an external source - we just find the corresponding docid
205 Xapian::PostingIterator p
= db
.postlist_begin(v
);
206 if (p
!= db
.postlist_end(v
)) docid
= *p
;
210 Xapian::RSet tmprset
;
211 tmprset
.add_document(docid
);
213 OmegaExpandDecider
decider(db
);
214 set_expansion_scheme(*enquire
, option
);
215 #if XAPIAN_AT_LEAST(1,3,2)
216 Xapian::ESet
eset(enquire
->get_eset(40, tmprset
, &decider
));
218 Xapian::ESet
eset(enquire
->get_eset(40, tmprset
, 0,
219 expand_param_k
, &decider
));
221 string morelike_query
;
222 for (Xapian::ESetIterator i
= eset
.begin(); i
!= eset
.end(); i
++) {
223 if (!morelike_query
.empty()) morelike_query
+= ' ';
224 morelike_query
+= pretty_term(*i
);
226 set_probabilistic_query(string(), morelike_query
);
229 // add expand/topterms terms if appropriate
231 if (cgi_params
.find("ADD") != cgi_params
.end()) {
232 g
= cgi_params
.equal_range("X");
233 for (MCI i
= g
.first
; i
!= g
.second
; i
++) {
234 const string
& v
= i
->second
;
236 if (!expand_terms
.empty())
243 // collect the unprefixed prob fields
244 g
= cgi_params
.equal_range("P");
245 for (MCI i
= g
.first
; i
!= g
.second
; i
++) {
246 const string
& v
= i
->second
;
248 // If there are expand terms, append them to the first
249 // non-empty P parameter.
250 if (!expand_terms
.empty()) {
254 set_probabilistic_query(string(), q
);
255 expand_terms
= string();
257 set_probabilistic_query(string(), v
);
262 if (!expand_terms
.empty()) {
263 set_probabilistic_query(string(), expand_terms
);
267 g
.first
= cgi_params
.lower_bound("P.");
268 g
.second
= cgi_params
.lower_bound("P/"); // '/' is '.' + 1.
269 for (MCI i
= g
.first
; i
!= g
.second
; i
++) {
270 const string
& v
= i
->second
;
272 string
pfx(i
->first
, 2, string::npos
);
273 set_probabilistic_query(pfx
, v
);
277 // set any boolean filters
278 g
= cgi_params
.equal_range("B");
279 if (g
.first
!= g
.second
) {
280 vector
<string
> filter_v
;
281 for (MCI i
= g
.first
; i
!= g
.second
; i
++) {
282 const string
& v
= i
->second
;
283 // we'll definitely get empty B fields from "-ALL-" options
284 if (!v
.empty() && C_isalnum(v
[0])) {
286 filter_v
.push_back(v
);
289 sort(filter_v
.begin(), filter_v
.end());
290 vector
<string
>::const_iterator j
;
291 for (j
= filter_v
.begin(); j
!= filter_v
.end(); ++j
) {
292 const string
& bterm
= *j
;
293 string::size_type e
= bterm
.find(filter_sep
);
294 if (usual(e
== string::npos
)) {
297 // If a filter contains filter_sep then double it to escape.
298 // Each filter must start with an alnum (checked above) and
299 // the value after the last filter is the default op, which
300 // is encoded as a non-alnum so filter_sep followed by
301 // something other than filter_sep must be separating filters.
302 string::size_type b
= 0;
303 while (e
!= string::npos
) {
304 filters
.append(bterm
, b
, e
+ 1 - b
);
306 e
= bterm
.find(filter_sep
, b
+ 1);
308 filters
.append(bterm
, b
, string::npos
);
310 filters
+= filter_sep
;
311 old_filters
+= bterm
;
312 old_filters
+= filter_sep_old
;
316 // set any negated boolean filters
317 g
= cgi_params
.equal_range("N");
318 if (g
.first
!= g
.second
) {
319 vector
<string
> filter_v
;
320 for (MCI i
= g
.first
; i
!= g
.second
; i
++) {
321 const string
& v
= i
->second
;
322 // we'll definitely get empty N fields from "-ALL-" options
323 if (!v
.empty() && C_isalnum(v
[0])) {
325 filter_v
.push_back(v
);
328 sort(filter_v
.begin(), filter_v
.end());
329 vector
<string
>::const_iterator j
;
330 for (j
= filter_v
.begin(); j
!= filter_v
.end(); ++j
) {
331 const string
& nterm
= *j
;
332 string::size_type e
= nterm
.find(filter_sep
);
334 if (usual(e
== string::npos
)) {
337 // If a filter contains filter_sep then double it to escape.
338 // Each filter must start with an alnum (checked above) and
339 // the value after the last filter is the default op, which
340 // is encoded as a non-alnum so filter_sep followed by
341 // something other than filter_sep must be separating filters.
342 string::size_type b
= 0;
343 while (e
!= string::npos
) {
344 filters
.append(nterm
, b
, e
+ 1 - b
);
346 e
= nterm
.find(filter_sep
, b
+ 1);
348 filters
.append(nterm
, b
, string::npos
);
350 filters
+= filter_sep
;
351 // old_filters predates 'N' terms, so if there are 'N' terms this
352 // is definitely a different query.
357 // date range filters
358 val
= cgi_params
.find("START");
359 if (val
!= cgi_params
.end()) date_start
= val
->second
;
360 val
= cgi_params
.find("END");
361 if (val
!= cgi_params
.end()) date_end
= val
->second
;
362 val
= cgi_params
.find("SPAN");
363 if (val
!= cgi_params
.end()) date_span
= val
->second
;
364 val
= cgi_params
.find("DATEVALUE");
365 if (val
!= cgi_params
.end()) date_value_slot
= string_to_int(val
->second
);
367 // If more default_op values are supported, encode them as non-alnums
368 // other than filter_sep or '!'.
369 filters
+= (default_op
== Xapian::Query::OP_AND
? '.' : '-');
370 filters
+= date_start
;
371 filters
+= filter_sep
;
373 filters
+= filter_sep
;
374 filters
+= date_span
;
375 if (date_value_slot
!= Xapian::BAD_VALUENO
) {
376 // This means we'll force the first page when reloading or changing
377 // page starting from existing URLs upon upgrade to 1.4.1, but the
378 // exact same existing URL could be for a search without the date
379 // filter where we want to force the first page, so there's an inherent
380 // ambiguity there. Forcing first page in this case seems the least
381 // problematic side-effect.
382 filters
+= filter_sep
;
383 filters
+= str(date_value_slot
);
386 if (!old_filters
.empty()) {
387 old_filters
+= date_start
;
388 old_filters
+= filter_sep_old
;
389 old_filters
+= date_end
;
390 old_filters
+= filter_sep_old
;
391 old_filters
+= date_span
;
392 old_filters
+= (default_op
== Xapian::Query::OP_AND
? 'A' : 'O');
395 // Percentage relevance cut-off
396 val
= cgi_params
.find("THRESHOLD");
397 if (val
!= cgi_params
.end()) {
398 threshold
= atoi(val
->second
.c_str());
399 if (threshold
< 0) threshold
= 0;
400 if (threshold
> 100) threshold
= 100;
404 val
= cgi_params
.find("COLLAPSE");
405 if (val
!= cgi_params
.end()) {
406 const string
& v
= val
->second
;
408 collapse_key
= atoi(v
.c_str());
410 filters
+= filter_sep
;
411 filters
+= str(collapse_key
);
412 if (!old_filters
.empty()) {
413 old_filters
+= filter_sep_old
;
414 old_filters
+= str(collapse_key
);
418 if (!collapse
&& date_value_slot
!= Xapian::BAD_VALUENO
) {
419 // We need to either omit filter_sep for both or neither, or else the
420 // encoding is ambiguous.
421 filters
+= filter_sep
;
425 val
= cgi_params
.find("DOCIDORDER");
426 if (val
!= cgi_params
.end()) {
427 const string
& v
= val
->second
;
431 docid_order
= Xapian::Enquire::DESCENDING
;
433 if (!old_filters
.empty()) old_filters
+= 'D';
434 } else if (ch
!= 'A') {
435 docid_order
= Xapian::Enquire::DONT_CARE
;
437 // This is a bug (should add nothing here and 'X' in the (ch !=
438 // 'A') case), but the current "DONT_CARE" implementation
439 // actually always results in ascending docid order so it's not
440 // worth breaking compatibility to fix - let's just do it next
441 // time we change the encoding $filters uses.
443 if (!old_filters
.empty()) old_filters
+= 'X';
449 val
= cgi_params
.find("SORT");
450 if (val
!= cgi_params
.end()) {
451 const char * base
= val
->second
.c_str();
452 const char * p
= base
;
454 bool rev
= (*p
!= '+');
455 if (*p
== '-' || *p
== '+') {
456 // old_filters predates support for direction in SORT, so if
457 // there's a direction specified this is definitely a different query.
462 if (!C_isdigit(*p
)) {
468 Xapian::valueno slot
= strtoul(p
, &q
, 10);
475 if (sort_key
!= Xapian::BAD_VALUENO
) {
476 // Multiple sort keys specified, so we need a KeyMaker.
479 if (reverse_sort
) filters
+= '-';
480 filters
+= str(sort_key
);
482 sort_keymaker
= new Xapian::MultiValueKeyMaker
;
483 sort_keymaker
->add_value(sort_key
, !reverse_sort
);
484 sort_key
= Xapian::BAD_VALUENO
;
486 // old_filters predates multiple sort keys, so if there are
487 // multiple sort keys this is definitely a different query.
492 filters
+= (rev
? '-' : '+');
493 filters
+= str(slot
);
494 sort_keymaker
->add_value(slot
, !rev
);
499 while (C_isspace(*p
) || *p
== ',') ++p
;
502 val
= cgi_params
.find("SORTREVERSE");
503 if (val
!= cgi_params
.end() && atoi(val
->second
.c_str()) != 0) {
504 reverse_sort
= !reverse_sort
;
507 val
= cgi_params
.find("SORTAFTER");
508 if (val
!= cgi_params
.end()) {
509 sort_after
= (atoi(val
->second
.c_str()) != 0);
512 // Add the sorting related options to filters too.
514 // Note: old_filters really does encode a reversed sort as 'F' and a
515 // non-reversed sort as 'R' or 'r'.
517 // filters has them the other way around for sanity (except in
518 // development snapshot 1.3.4, which was when the new filter encoding was introduced).
520 if (!sort_keymaker
) filters
+= str(sort_key
);
521 if (!old_filters
.empty()) old_filters
+= str(sort_key
);
525 if (!old_filters
.empty()) old_filters
+= 'F';
528 if (!old_filters
.empty()) old_filters
+= 'R';
533 if (!old_filters
.empty()) old_filters
+= 'r';
538 if (old_filters
.empty()) old_filters
= filters
;
540 // min_hits (fill mset past topdoc+(hits_per_page+1) to
541 // topdoc+max(hits_per_page+1,min_hits))
542 val
= cgi_params
.find("MINHITS");
543 if (val
!= cgi_params
.end()) {
544 min_hits
= atol(val
->second
.c_str());
548 } catch (const Xapian::Error
&e
) {
549 if (!set_content_type
&& !suppress_http_headers
)
550 cout
<< "Content-Type: text/html\n\n";
551 cout
<< "Exception: " << html_escape(e
.get_msg()) << endl
;
552 } catch (const std::exception
&e
) {
553 if (!set_content_type
&& !suppress_http_headers
)
554 cout
<< "Content-Type: text/html\n\n";
555 cout
<< "Exception: std::exception " << html_escape(e
.what()) << endl
;
556 } catch (const string
&s
) {
557 if (!set_content_type
&& !suppress_http_headers
)
558 cout
<< "Content-Type: text/html\n\n";
559 cout
<< "Exception: " << html_escape(s
) << endl
;
560 } catch (const char *s
) {
561 if (!set_content_type
&& !suppress_http_headers
)
562 cout
<< "Content-Type: text/html\n\n";
563 cout
<< "Exception: " << html_escape(s
) << endl
;
565 if (!set_content_type
&& !suppress_http_headers
)
566 cout
<< "Content-Type: text/html\n\n";
567 cout
<< "Caught unknown exception" << endl
;