1 /* query.cc: query executor for omega
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002 Intercede 1749 Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015 Olly Betts
8 * Copyright 2008 Thomas Viehmann
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
28 // If we're building against git after the expand API changed but before the
29 // version gets bumped to 1.3.2, we'll get a deprecation warning from
30 // get_eset() unless we suppress such warnings here.
31 #define XAPIAN_DEPRECATED(D) D
41 #include "safeerrno.h"
45 #include "strcasecmp.h"
48 #include "safeunistd.h"
49 #include <sys/types.h>
50 #include "safesysstat.h"
51 #include "safefcntl.h"
58 #include "datematchdecider.h"
59 #include "jsonescape.h"
67 #include "stringutils.h"
68 #include "transform.h"
69 #include "urldecode.h"
70 #include "urlencode.h"
80 using Xapian::Utf8Iterator
;
82 using Xapian::Unicode::is_wordchar
;
// Local snprintf()-style helper: format into a fixed-size buffer.
//
// str     destination buffer
// size    number of bytes available at str
// format  printf-style format string, followed by its arguments
//
// Returns the number of characters written, not counting the terminating
// nul byte.  If formatting fails, or the formatted text (plus terminator)
// doesn't fit in size bytes, the process is aborted rather than returning a
// truncated result.
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    va_list ap;
    va_start(ap, format);
    // vsnprintf() never stores more than size bytes, so an over-long result
    // is detected *before* anything is written beyond the caller's buffer.
    // (Formatting with vsprintf() and checking afterwards only notices the
    // overflow once memory has already been corrupted.)
    int res = vsnprintf(str, size, format, ap);
    va_end(ap);
    if (res < 0 || size_t(res) >= size)
        abort(); /* Overflowed! */
    return res;
}
100 #define my_snprintf SNPRINTF
103 static bool query_parsed
= false;
104 static bool done_query
= false;
105 static Xapian::docid last
= 0;
107 static Xapian::MSet mset
;
109 static map
<Xapian::docid
, bool> ticked
;
111 static void ensure_query_parsed();
112 static void ensure_match();
114 static Xapian::Query query
;
115 //static string url_query_string;
116 Xapian::Query::op default_op
= Xapian::Query::OP_AND
; // default matching mode
118 static Xapian::QueryParser qp
;
119 static Xapian::NumberValueRangeProcessor
* size_vrp
= NULL
;
120 static Xapian::Stem
*stemmer
= NULL
;
122 static string
eval_file(const string
&fmtfile
);
124 static set
<string
> termset
;
126 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
127 static map
<string
, string
> termprefix_to_userprefix
;
129 static string queryterms
;
131 static string error_msg
;
133 static double secs
= -1;
135 static const char DEFAULT_LOG_ENTRY
[] =
136 "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
137 "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
138 "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
141 "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
// Stopper installed on the query parser (see the qp.set_stopper() call in
// set_probabilistic()).  operator() returns true when term t is one of a
// small, fixed set of common English words, compared exactly as given.
//
// NOTE(review): each return statement below tests a group of words sharing
// the same first letter, so presumably they are selected by the first
// character of t; the selecting code isn't visible in this excerpt - confirm,
// and confirm what happens for an empty t.
143 class MyStopper
: public Xapian::Stopper
{
145 bool operator()(const string
&t
) const {
// Words starting with 'a'.
148 return (t
== "a" || t
== "about" || t
== "an" || t
== "and" ||
149 t
== "are" || t
== "as" || t
== "at");
// Words starting with 'b'.
151 return (t
== "be" || t
== "by");
// Words starting with 'f'.
155 return (t
== "for" || t
== "from");
// Words starting with 'i'.
159 return (t
== "i" || t
== "in" || t
== "is" || t
== "it");
// Words starting with 'o'.
161 return (t
== "of" || t
== "on" || t
== "or");
// Words starting with 't'.
163 return (t
== "that" || t
== "the" || t
== "this" || t
== "to");
// Words starting with 'w'.
165 return (t
== "was" || t
== "what" || t
== "when" ||
166 t
== "where" || t
== "which" || t
== "who" ||
167 t
== "why" || t
== "will" || t
== "with");
// Words starting with 'y'.
169 return (t
== "you" || t
== "your");
// Work out the term prefix of term and store it in prefix.
//
// For a term starting with 'X', the prefix is that 'X' plus the run of
// characters immediately following it for which C_isupper() is true.
//
// NOTE(review): the return statements and the handling of terms which don't
// start with 'X' aren't visible in this excerpt.  add_bterm() treats a result
// greater than 0 as success - confirm the exact meaning of the return value.
177 prefix_from_term(string
&prefix
, const string
&term
)
183 if (term
[0] == 'X') {
184 const string::const_iterator begin
= term
.begin();
// Start scanning just after the leading 'X'.
185 string::const_iterator i
= begin
+ 1;
186 while (i
!= term
.end() && C_isupper(*i
)) ++i
;
// The prefix is everything from the 'X' up to (not including) i.
187 prefix
.assign(begin
, i
);
// Step over a ':' directly after the prefix; it isn't part of the prefix
// stored above.
188 if (i
!= term
.end() && *i
== ':') ++i
;
// Vet a name which is going to be used to build a file path (format names,
// log file names, etc).  Anything containing ".." is refused, since otherwise
// a request could name a format such as "../../etc/passwd" and climb out of
// the intended directory.
//
// Returns true if filename is acceptable, false if it contains "..".
//
// FIXME: make this check more exact ("foo..bar" is safe)
// FIXME: log when this check fails
static bool
vet_filename(const string &filename)
{
    return filename.find("..") == string::npos;
}
208 // * If any terms have been removed, it's a "fresh query" so we discard any
209 // relevance judgements
210 // * If all previous terms are there but more have been added then we keep
211 // the relevance judgements, but return the first page of hits
213 // NEW_QUERY entirely new query
214 // SAME_QUERY unchanged query
215 // EXTENDED_QUERY new query, but based on the old one
216 // BAD_QUERY parse error (message in error_msg)
217 typedef enum { NEW_QUERY
, SAME_QUERY
, EXTENDED_QUERY
, BAD_QUERY
} querytype
;
219 static multimap
<string
, string
> probabilistic_query
;
// Remember query string s for the input identified by prefix, by adding a
// (prefix, query string) entry to probabilistic_query.  Nothing is added if
// the query string is empty at the point of the check below.
//
// NOTE(review): a local copy of s is taken so it can be modified; the
// statement which actually strips the whitespace isn't visible in this
// excerpt.
222 set_probabilistic_query(const string
& prefix
, const string
& s
)
224 string query_string
= s
;
225 // Strip leading and trailing whitespace from query_string.
// Only non-empty query strings are recorded.
227 if (!query_string
.empty())
228 probabilistic_query
.insert(make_pair(prefix
, query_string
));
232 read_qp_flags(const string
& opt_pfx
, unsigned f
)
234 map
<string
, string
>::const_iterator i
= option
.lower_bound(opt_pfx
);
235 for (; i
!= option
.end() && startswith(i
->first
, opt_pfx
); ++i
) {
237 const char * s
= i
->first
.c_str() + opt_pfx
.size();
240 if (strcmp(s
, "auto_multiword_synonyms") == 0) {
241 mask
= Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS
;
244 if (strcmp(s
, "auto_synonyms") == 0) {
245 mask
= Xapian::QueryParser::FLAG_AUTO_SYNONYMS
;
250 if (strcmp(s
, "boolean") == 0) {
251 mask
= Xapian::QueryParser::FLAG_BOOLEAN
;
254 if (strcmp(s
, "boolean_any_case") == 0) {
255 mask
= Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE
;
260 if (strcmp(s
, "default") == 0) {
261 mask
= Xapian::QueryParser::FLAG_DEFAULT
;
266 if (strcmp(s
, "lovehate") == 0) {
267 mask
= Xapian::QueryParser::FLAG_LOVEHATE
;
272 if (strcmp(s
, "partial") == 0) {
273 mask
= Xapian::QueryParser::FLAG_PARTIAL
;
276 if (strcmp(s
, "phrase") == 0) {
277 mask
= Xapian::QueryParser::FLAG_PHRASE
;
280 if (strcmp(s
, "pure_not") == 0) {
281 mask
= Xapian::QueryParser::FLAG_PURE_NOT
;
286 if (strcmp(s
, "spelling_correction") == 0) {
287 mask
= Xapian::QueryParser::FLAG_SPELLING_CORRECTION
;
290 if (strcmp(s
, "synonym") == 0) {
291 mask
= Xapian::QueryParser::FLAG_SYNONYM
;
296 if (strcmp(s
, "wildcard") == 0) {
297 mask
= Xapian::QueryParser::FLAG_WILDCARD
;
303 if (i
->second
.empty()) {
313 set_probabilistic(const string
&oldp
)
315 // Parse the query string.
316 qp
.set_stemming_strategy(option
["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL
: Xapian::QueryParser::STEM_SOME
);
317 qp
.set_stopper(new MyStopper());
318 qp
.set_default_op(default_op
);
320 // FIXME: provide a custom VRP which handles size:10..20K, etc.
322 size_vrp
= new Xapian::NumberValueRangeProcessor(VALUE_SIZE
, "size:",
324 qp
.add_valuerangeprocessor(size_vrp
);
325 map
<string
, string
>::const_iterator pfx
= option
.lower_bound("prefix,");
326 for (; pfx
!= option
.end() && startswith(pfx
->first
, "prefix,"); ++pfx
) {
327 string
user_prefix(pfx
->first
, 7);
328 const string
& term_pfx_list
= pfx
->second
;
329 string::size_type i
= 0;
331 string::size_type i0
= i
;
332 i
= term_pfx_list
.find('\t', i
);
333 const string
& term_pfx
= term_pfx_list
.substr(i0
, i
- i0
);
334 qp
.add_prefix(user_prefix
, term_pfx
);
335 // std::map::insert() won't overwrite an existing entry, so we'll
336 // prefer the first user_prefix for which a particular term prefix
338 termprefix_to_userprefix
.insert(make_pair(term_pfx
, user_prefix
));
341 pfx
= option
.lower_bound("boolprefix,");
342 for (; pfx
!= option
.end() && startswith(pfx
->first
, "boolprefix,"); ++pfx
) {
343 string user_prefix
= pfx
->first
.substr(11);
344 qp
.add_boolean_prefix(user_prefix
, pfx
->second
);
345 termprefix_to_userprefix
.insert(make_pair(pfx
->second
, user_prefix
));
349 unsigned default_flags
= read_qp_flags("flag_", 0);
350 if (option
["spelling"] == "true")
351 default_flags
|= qp
.FLAG_SPELLING_CORRECTION
;
353 vector
<Xapian::Query
> queries
;
354 queries
.reserve(probabilistic_query
.size());
356 multimap
<string
, string
>::const_iterator j
;
357 for (j
= probabilistic_query
.begin();
358 j
!= probabilistic_query
.end();
360 const string
& prefix
= j
->first
;
362 // Choose the stemmer to use for this input.
363 string stemlang
= option
[prefix
+ ":stemmer"];
364 if (stemlang
.empty())
365 stemlang
= option
["stemmer"];
366 qp
.set_stemmer(Xapian::Stem(stemlang
));
368 // Work out the flags to use for this input.
369 unsigned f
= read_qp_flags(prefix
+ ":flag_", default_flags
);
371 const string
& query_string
= j
->second
;
372 Xapian::Query q
= qp
.parse_query(query_string
, f
, prefix
);
374 queries
.push_back(q
);
376 query
= Xapian::Query(query
.OP_AND
, queries
.begin(), queries
.end());
377 } catch (Xapian::QueryParserError
&e
) {
378 error_msg
= e
.get_msg();
382 Xapian::termcount n_new_terms
= 0;
383 for (Xapian::TermIterator i
= query
.get_terms_begin();
384 i
!= query
.get_terms_end(); ++i
) {
385 if (termset
.find(*i
) == termset
.end()) {
387 if (!queryterms
.empty()) queryterms
+= '\t';
393 // Check new query against the previous one
395 // If oldp was empty that means there were no probabilistic terms
396 // before, so if there are now this is a new query.
397 return n_new_terms
? NEW_QUERY
: SAME_QUERY
;
400 // The terms in oldp are separated by tabs.
401 const char oldp_separator
= '\t';
402 size_t n_old_terms
= count(oldp
.begin(), oldp
.end(), oldp_separator
) + 1;
404 // short-cut: if the new query has fewer terms, it must be a new one
405 if (n_new_terms
< n_old_terms
) return NEW_QUERY
;
407 const char *term
= oldp
.c_str();
409 while ((pend
= strchr(term
, oldp_separator
)) != NULL
) {
410 if (termset
.find(string(term
, pend
- term
)) == termset
.end())
415 if (termset
.find(string(term
)) == termset
.end())
419 // Use termset.size() rather than n_new_terms so we correctly handle
420 // the case when the query has repeated terms.
421 // This works wrongly in the case when the user extends the query
422 // by adding a term already in it, but that's unlikely and the behaviour
423 // isn't too bad (we just don't reset page 1). We also mishandle a few
424 // other obscure cases e.g. adding quotes to turn a query into a phrase.
425 if (termset
.size() > n_old_terms
) return EXTENDED_QUERY
;
429 static multimap
<string
, string
> filter_map
;
431 typedef multimap
<string
, string
>::const_iterator FMCI
;
// Record filter term `term` in filter_map, keyed by the prefix which
// prefix_from_term() extracts from it.  The term is only recorded when
// prefix_from_term() returns a value greater than 0.
//
// filter_map is a multimap, so several terms with the same prefix can be
// recorded side by side.
433 void add_bterm(const string
&term
) {
435 if (prefix_from_term(prefix
, term
) > 0)
436 filter_map
.insert(multimap
<string
, string
>::value_type(prefix
, term
));
442 bool force_boolean
= false;
443 if (!filter_map
.empty()) {
444 // OR together filters with the same prefix, then AND together
445 vector
<Xapian::Query
> filter_vec
;
446 vector
<string
> or_vec
;
448 for (FMCI i
= filter_map
.begin(); ; i
++) {
449 bool over
= (i
== filter_map
.end());
450 if (over
|| i
->first
!= current
) {
451 switch (or_vec
.size()) {
455 filter_vec
.push_back(Xapian::Query(or_vec
[0]));
458 filter_vec
.push_back(Xapian::Query(Xapian::Query::OP_OR
,
467 or_vec
.push_back(i
->second
);
470 Xapian::Query
filter(Xapian::Query::OP_AND
,
471 filter_vec
.begin(), filter_vec
.end());
474 // If no probabilistic query is provided then promote the filters
475 // to be THE query - filtering an empty query will give no
477 std::swap(query
, filter
);
478 if (enquire
) force_boolean
= true;
480 query
= Xapian::Query(Xapian::Query::OP_FILTER
, query
, filter
);
484 Xapian::MatchDecider
* mdecider
= NULL
;
485 if (!date_start
.empty() || !date_end
.empty() || !date_span
.empty()) {
486 MCI i
= cgi_params
.find("DATEVALUE");
487 if (i
!= cgi_params
.end()) {
488 Xapian::valueno datevalue
= string_to_int(i
->second
);
489 mdecider
= new DateMatchDecider(datevalue
, date_start
, date_end
, date_span
);
491 Xapian::Query
date_filter(Xapian::Query::OP_OR
,
492 date_range_filter(date_start
, date_end
,
494 Xapian::Query("Dlatest"));
496 // If no probabilistic query is provided then promote the daterange
497 // filter to be THE query instead of filtering an empty query.
501 query
= Xapian::Query(Xapian::Query::OP_FILTER
, query
, date_filter
);
506 if (!enquire
|| !error_msg
.empty()) return;
508 set_weighting_scheme(*enquire
, option
, force_boolean
);
510 enquire
->set_cutoff(threshold
);
512 if (sort_key
!= Xapian::BAD_VALUENO
) {
514 enquire
->set_sort_by_relevance_then_value(sort_key
, sort_ascending
);
516 enquire
->set_sort_by_value_then_relevance(sort_key
, sort_ascending
);
520 enquire
->set_docid_order(docid_order
);
523 enquire
->set_collapse_key(collapse_key
);
526 if (!query
.empty()) {
528 // FIXME: If we start doing permissions checks based on $REMOTE_USER
529 // we're going to break some existing setups if users upgrade. We
530 // probably want a way to set this from OmegaScript.
531 const char * remote_user
= getenv("REMOTE_USER");
533 apply_unix_permissions(query
, remote_user
);
536 enquire
->set_query(query
);
537 // We could use the value of topdoc as first parameter, but we
538 // need to know the first few items in the mset to fake a
539 // relevance set for topterms.
541 // If min_hits isn't set, check at least one extra result so we
542 // know if we've reached the end of the matches or not - then we
543 // can avoid offering a "next" button which leads to an empty page.
544 mset
= enquire
->get_mset(0, topdoc
+ hits_per_page
,
545 topdoc
+ max(hits_per_page
+ 1, min_hits
),
551 html_escape(const string
&str
)
554 string::size_type p
= 0;
555 while (p
< str
.size()) {
578 html_strip(const string
&str
)
581 string::size_type p
= 0;
583 while (p
< str
.size()) {
593 if (! skip
) res
+= ch
;
// Look word up in list, a tab-separated list of entries.
//
// Returns the zero-based position of the first entry which is byte-for-byte
// equal to word, or -1 if no entry matches.  A list without any tabs is
// treated as a single entry (so an empty list holds one empty entry).
//
// FIXME split list into hash or map and use that rather than linear lookup?
static int word_in_list(const string& word, const string& list)
{
    const string::size_type wanted_len = word.size();
    string::size_type entry_start = 0;
    for (int index = 0; ; ++index) {
        const string::size_type tab = list.find('\t', entry_start);
        const bool final_entry = (tab == string::npos);
        const string::size_type entry_end = final_entry ? list.size() : tab;
        // Cheap length test first, then compare the actual bytes.
        if (entry_end - entry_start == wanted_len &&
            list.compare(entry_start, wanted_len, word) == 0) {
            return index;
        }
        if (final_entry) return -1;
        entry_start = tab + 1;
    }
}
619 // Not a character in an identifier
621 p_notid(unsigned int c
)
623 return !C_isalnum(c
) && c
!= '_';
626 // Not a character in an HTML tag name
628 p_nottag(unsigned int c
)
630 return !C_isalnum(c
) && c
!= '.' && c
!= '-';
633 // FIXME: shares algorithm with indextext.cc!
635 html_highlight(const string
&s
, const string
&list
,
636 const string
&bra
, const string
&ket
)
639 stemmer
= new Xapian::Stem(option
["stemmer"]);
645 const Utf8Iterator s_end
;
647 Utf8Iterator first
= j
;
648 while (first
!= s_end
&& !is_wordchar(*first
)) ++first
;
649 if (first
== s_end
) break;
650 Utf8Iterator term_end
;
653 const char *l
= j
.raw();
654 if (*first
< 128 && C_isupper(*first
)) {
656 Xapian::Unicode::append_utf8(term
, *j
);
657 while (++j
!= s_end
&& *j
== '.' && ++j
!= s_end
&& *j
< 128 && C_isupper(*j
)) {
658 Xapian::Unicode::append_utf8(term
, *j
);
660 if (term
.length() < 2 || (j
!= s_end
&& is_wordchar(*j
))) {
667 while (is_wordchar(*j
)) {
668 Xapian::Unicode::append_utf8(term
, *j
);
670 if (j
== s_end
) break;
671 if (*j
== '&' || *j
== '\'') {
672 Utf8Iterator next
= j
;
674 if (next
== s_end
|| !is_wordchar(*next
)) break;
680 if (j
!= s_end
&& (*j
== '+' || *j
== '-' || *j
== '#')) {
681 string::size_type len
= term
.length();
684 do { ++j
; } while (j
!= s_end
&& *j
== '#');
686 while (j
!= s_end
&& (*j
== '+' || *j
== '-')) {
687 Xapian::Unicode::append_utf8(term
, *j
);
691 if (term
.size() - len
> 3 || (j
!= s_end
&& is_wordchar(*j
))) {
699 term
= Xapian::Unicode::tolower(term
);
700 int match
= word_in_list(term
, list
);
703 stem
+= (*stemmer
)(term
);
704 match
= word_in_list(stem
, list
);
707 res
+= html_escape(string(l
, first
.raw() - l
));
711 static const char * colours
[] = {
712 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
713 "990000", "009900", "996600", "006699", "990099"
715 size_t idx
= match
% (sizeof(colours
) / sizeof(colours
[0]));
716 const char * bg
= colours
[idx
];
717 if (strchr(bg
, 'f')) {
718 res
+= "<b style=\"color:black;background-color:#";
720 res
+= "<b style=\"color:white;background-color:#";
725 word
= string(first
.raw(), j
.raw() - first
.raw());
726 res
+= html_escape(word
);
733 res
+= html_escape(string(l
, j
.raw() - l
));
736 if (j
!= s_end
) res
+= html_escape(string(j
.raw(), j
.left()));
742 print_query_string(const char *after
)
744 if (after
&& strncmp(after
, "&B=", 3) == 0) {
745 char prefix
= after
[3];
746 string::size_type start
= 0, amp
= 0;
748 amp
= url_query_string
.find('&', amp
);
749 if (amp
== string::npos
) {
750 cout
<< url_query_string
.substr(start
);
754 while (url_query_string
[amp
] == 'B' &&
755 url_query_string
[amp
+ 1] == '=' &&
756 url_query_string
[amp
+ 2] == prefix
) {
757 cout
<< url_query_string
.substr(start
, amp
- start
- 1);
758 start
= url_query_string
.find('&', amp
+ 3);
759 if (start
== string::npos
) return;
764 cout
<< url_query_string
;
769 mutable Xapian::docid did_cached
;
770 mutable map
<string
, string
> fields
;
772 void read_fields(Xapian::docid did
) const;
775 Fields() : did_cached(0) { }
777 const string
& get_field(Xapian::docid did
, const string
& field
) const {
778 if (did
!= did_cached
) read_fields(did
);
779 return fields
[field
];
// Fetch the document data for document did from db and split it into named
// fields stored in the `fields` map.
//
// Two layouts are handled, chosen by whether option["fieldnames"] is empty:
//  * non-empty: line N of the data is the value of the N-th name in the
//    tab-separated list of field names;
//  * empty: each line has the form NAME=VALUE.
//
// NOTE(review): the loop openers and any bookkeeping of the cached docid
// aren't visible in this excerpt.
784 Fields::read_fields(Xapian::docid did
) const
788 const string
& data
= db
.get_document(did
).get_data();
790 // Parse document data.
791 string::size_type i
= 0;
792 const string
& names
= option
["fieldnames"];
793 if (!names
.empty()) {
794 // Each line is a field, with fieldnames taken from corresponding
795 // entries in the tab-separated list specified by $opt{fieldnames}.
796 string::size_type n
= 0;
// n0..n delimits the current field name; i0..i delimits the current line
// of document data.
798 string::size_type n0
= n
;
799 n
= names
.find('\t', n
);
800 string::size_type i0
= i
;
801 i
= data
.find('\n', i
);
// insert() leaves an existing entry alone, so if a name is listed more than
// once the first value stored for it is the one kept.
802 fields
.insert(make_pair(names
.substr(n0
, n
- n0
),
803 data
.substr(i0
, i
- i0
)));
// find() returns string::npos when there's no further separator, and
// incrementing npos wraps round to 0 - so the loop stops as soon as either
// the names or the data lines run out.
804 } while (++n
&& ++i
);
806 // Each line is a field, in the format NAME=VALUE. We assume the field
807 // name doesn't contain an "=". Lines without an "=" are currently
810 string::size_type i0
= i
;
811 i
= data
.find('\n', i
);
812 string line
= data
.substr(i0
, i
- i0
);
813 string::size_type j
= line
.find('=');
814 if (j
!= string::npos
) {
// A name which appears on several lines accumulates all of its values,
// separated by tabs.
815 string
& value
= fields
[line
.substr(0, j
)];
816 if (!value
.empty()) value
+= '\t';
817 value
+= line
.substr(j
+ 1);
823 static Fields fields
;
824 static Xapian::docid q0
;
825 static Xapian::doccount hit_no
;
827 static double weight
;
828 static Xapian::doccount collapsed
;
830 static string
print_caption(const string
&fmt
, const vector
<string
> ¶m
);
901 CMD_querydescription
,
933 CMD_MACRO
// special tag for macro evaluation
938 int minargs
, maxargs
, evalargs
;
942 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
945 struct func_attrib a
;
951 // NB when adding a new command which ensures M or Q, update the list in
952 // docs/omegascript.rst
953 static struct func_desc func_tab
[] = {
954 //name minargs maxargs evalargs ensure
955 {"",{CMD_
, N
, N
, 0, 0}},// commented out code
956 T(add
, 0, N
, N
, 0), // add a list of numbers
957 T(addfilter
, 1, 1, N
, 0), // add filter term
958 T(allterms
, 0, 1, N
, 0), // list of all terms matching document
959 T(and, 1, N
, 0, 0), // logical shortcutting and of a list of values
960 T(cgi
, 1, 1, N
, 0), // return cgi parameter value
961 T(cgilist
, 1, 1, N
, 0), // return list of values for cgi parameter
962 T(collapsed
, 0, 0, N
, 0), // return number of hits collapsed into this
963 T(date
, 1, 2, N
, 0), // convert time_t to strftime format
964 // (default: YYYY-MM-DD)
965 T(dbname
, 0, 0, N
, 0), // database name
966 T(dbsize
, 0, 0, N
, 0), // database size (# of documents)
967 T(def
, 2, 2, 1, 0), // define a macro
968 T(defaultop
, 0, 0, N
, 0), // default operator: "and" or "or"
969 T(div
, 2, 2, N
, 0), // integer divide
970 T(emptydocs
, 0, 1, N
, 0), // list of empty documents
971 T(env
, 1, 1, N
, 0), // environment variable
972 T(error
, 0, 0, N
, 0), // error message
973 T(eq
, 2, 2, N
, 0), // test equality
974 T(field
, 1, 2, N
, 0), // lookup field in record
975 T(filesize
, 1, 1, N
, 0), // pretty printed filesize
976 T(filters
, 0, 0, N
, 0), // serialisation of current filters
977 T(filterterms
, 1, 1, N
, 0), // list of terms with a given prefix
978 T(find
, 2, 2, N
, 0), // find entry in list
979 T(fmt
, 0, 0, N
, 0), // name of current format
980 T(freq
, 1, 1, N
, 0), // frequency of a term
981 T(ge
, 2, 2, N
, 0), // test >=
982 T(gt
, 2, 2, N
, 0), // test >
983 T(highlight
, 2, 4, N
, 0), // html escape and highlight words from list
984 T(hit
, 0, 0, N
, 0), // hit number of current mset entry (starting
986 T(hitlist
, 1, 1, 0, M
), // display hitlist using format in argument
987 T(hitsperpage
, 0, 0, N
, 0), // hits per page
988 T(hostname
, 1, 1, N
, 0), // extract hostname from URL
989 T(html
, 1, 1, N
, 0), // html escape string (<>&")
990 T(htmlstrip
, 1, 1, N
, 0), // html strip tags string (s/<[^>]*>?//g)
991 T(httpheader
, 2, 2, N
, 0), // arbitrary HTTP header
992 T(id
, 0, 0, N
, 0), // docid of current doc
993 T(if, 2, 3, 1, 0), // conditional
994 T(include
, 1, 1, 1, 0), // include another file
995 T(json
, 1, 1, N
, 0), // JSON string escaping
996 T(jsonarray
, 1, 1, N
, 0), // Format list as a JSON array of strings
997 T(last
, 0, 0, N
, M
), // hit number one beyond end of current page
998 T(lastpage
, 0, 0, N
, M
), // number of last hit page
999 T(le
, 2, 2, N
, 0), // test <=
1000 T(length
, 1, 1, N
, 0), // length of list
1001 T(list
, 2, 5, N
, 0), // pretty print list
1002 T(log
, 1, 2, 1, 0), // create a log entry
1003 T(lookup
, 2, 2, N
, 0), // lookup in named cdb file
1004 T(lower
, 1, 1, N
, 0), // convert string to lower case
1005 T(lt
, 2, 2, N
, 0), // test <
1006 T(map
, 1, 2, 1, 0), // map a list into another list
1007 T(max
, 1, N
, N
, 0), // maximum of a list of values
1008 T(min
, 1, N
, N
, 0), // minimum of a list of values
1009 T(mod
, 2, 2, N
, 0), // integer modulus
1010 T(msize
, 0, 0, N
, M
), // number of matches
1011 T(msizeexact
, 0, 0, N
, M
), // is $msize exact?
1012 T(mul
, 2, N
, N
, 0), // multiply a list of numbers
1013 T(muldiv
, 3, 3, N
, 0), // calculate A*B/C
1014 T(ne
, 2, 2, N
, 0), // test not equal
1015 T(nice
, 1, 1, N
, 0), // pretty print integer (with thousands sep)
1016 T(not, 1, 1, N
, 0), // logical not
1017 T(now
, 0, 0, N
, 0), // current date/time as a time_t
1018 T(opt
, 1, 2, N
, 0), // lookup an option value
1019 T(or, 1, N
, 0, 0), // logical shortcutting or of a list of values
1020 T(pack
, 1, 1, N
, 0), // convert a number to a 4 byte big endian binary string
1021 T(percentage
, 0, 0, N
, 0), // percentage score of current hit
1022 T(prettyterm
, 1, 1, N
, Q
), // pretty print term name
1023 T(prettyurl
, 1, 1, N
, 0), // pretty version of URL
1024 T(query
, 0, 1, N
, Q
), // query
1025 T(querydescription
,0, 0, N
, Q
), // query.get_description()
1026 T(queryterms
, 0, 0, N
, Q
), // list of query terms
1027 T(range
, 2, 2, N
, 0), // return list of values between start and end
1028 T(record
, 0, 1, N
, 0), // record contents of document
1029 T(relevant
, 0, 1, N
, Q
), // is document relevant?
1030 T(relevants
, 0, 0, N
, Q
), // return list of relevant documents
1031 T(score
, 0, 0, N
, 0), // score (0-10) of current hit
1032 T(set
, 2, 2, N
, 0), // set option value
1033 T(setmap
, 1, N
, N
, 0), // set map of option values
1034 T(setrelevant
, 0, 1, N
, Q
), // set rset
1035 T(slice
, 2, 2, N
, 0), // slice a list using a second list
1036 T(snippet
, 1, 2, N
, 0), // generate snippet from text
1037 T(split
, 1, 2, N
, 0), // split a string to give a list
1038 T(stoplist
, 0, 0, N
, Q
), // return list of stopped terms
1039 T(sub
, 2, 2, N
, 0), // subtract
1040 T(substr
, 2, 3, N
, 0), // substring
1041 T(suggestion
, 0, 0, N
, Q
), // misspelled word correction suggestion
1042 T(terms
, 0, 0, N
, M
), // list of matching terms
1043 T(thispage
, 0, 0, N
, M
), // page number of current page
1044 T(time
, 0, 0, N
, M
), // how long the match took (in seconds)
1045 T(topdoc
, 0, 0, N
, M
), // first document on current page of hit list
1046 // (counting from 0)
1047 T(topterms
, 0, 1, N
, M
), // list of up to N top relevance feedback terms
1049 T(transform
, 3, 3, N
, 0), // transform with a regexp
1050 T(truncate
, 2, 4, N
, 0), // truncate after a word
1051 T(uniq
, 1, 1, N
, 0), // removed duplicates from a sorted list
1052 T(unpack
, 1, 1, N
, 0), // convert 4 byte big endian binary string to a number
1053 T(unstem
, 1, 1, N
, Q
), // return list of probabilistic terms from
1054 // the query which stemmed to this term
1055 T(upper
, 1, 1, N
, 0), // convert string to upper case
1056 T(url
, 1, 1, N
, 0), // url encode argument
1057 T(value
, 1, 2, N
, 0), // return document value
1058 T(version
, 0, 0, N
, 0), // omega version string
1059 T(weight
, 0, 0, N
, 0), // weight of the current hit
1060 { NULL
,{0, 0, 0, 0, 0}}
1063 #undef T // Leaving T defined screws up Sun's C++ compiler!
1065 static vector
<string
> macros
;
1067 // Call write() repeatedly until all data is written or we get a
1068 // non-recoverable error.
//
// fd     file descriptor to write to
// buf    start of the data to write
// count  number of bytes to write
//
// NOTE(review): the return statements and the code which advances buf/count
// after a partial write aren't visible in this excerpt - confirm the return
// convention against the callers.
1070 write_all(int fd
, const char * buf
, size_t count
)
1073 ssize_t r
= write(fd
, buf
, count
);
// EINTR isn't treated as a failure: go round the loop and try the write
// again.
1075 if (errno
== EINTR
) continue;
1085 eval(const string
&fmt
, const vector
<string
> ¶m
)
1087 static map
<string
, const struct func_attrib
*> func_map
;
1088 if (func_map
.empty()) {
1089 struct func_desc
*p
;
1090 for (p
= func_tab
; p
->name
!= NULL
; p
++) {
1091 func_map
[string(p
->name
)] = &(p
->a
);
1095 string::size_type p
= 0, q
;
1096 while ((q
= fmt
.find('$', p
)) != string::npos
) try {
1097 res
+= fmt
.substr(p
, q
- p
);
1098 string::size_type code_start
= q
; // note down for error reporting
1100 if (q
>= fmt
.size()) break;
1101 unsigned char ch
= fmt
[q
];
1104 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1124 case '1': case '2': case '3': case '4': case '5':
1125 case '6': case '7': case '8': case '9':
1127 if (ch
< param
.size()) res
+= param
[ch
];
1130 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1131 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1132 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1133 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1135 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1136 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1137 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1138 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1143 string msg
= "Unknown $ code in: $" + fmt
.substr(q
);
1146 p
= find_if(fmt
.begin() + q
, fmt
.end(), p_notid
) - fmt
.begin();
1147 string var
= fmt
.substr(q
, p
- q
);
1148 map
<string
, const struct func_attrib
*>::const_iterator func
;
1149 func
= func_map
.find(var
);
1150 if (func
== func_map
.end()) {
1151 throw "Unknown function '" + var
+ "'";
1153 vector
<string
> args
;
1154 if (fmt
[p
] == '{') {
1158 p
= fmt
.find_first_of(",{}", p
+ 1);
1159 if (p
== string::npos
)
1160 throw "missing } in " + fmt
.substr(code_start
);
1161 if (fmt
[p
] == '{') {
1165 // should we split the args
1166 if (func
->second
->minargs
!= N
) {
1167 args
.push_back(fmt
.substr(q
, p
- q
));
1171 if (fmt
[p
] == '}' && --nest
== 0) break;
1174 if (func
->second
->minargs
== N
)
1175 args
.push_back(fmt
.substr(q
, p
- q
));
1179 if (func
->second
->minargs
!= N
) {
1180 if ((int)args
.size() < func
->second
->minargs
)
1181 throw "too few arguments to $" + var
;
1182 if (func
->second
->maxargs
!= N
&&
1183 (int)args
.size() > func
->second
->maxargs
)
1184 throw "too many arguments to $" + var
;
1186 vector
<string
>::size_type n
;
1187 if (func
->second
->evalargs
!= N
)
1188 n
= func
->second
->evalargs
;
1192 for (vector
<string
>::size_type j
= 0; j
< n
; j
++)
1193 args
[j
] = eval(args
[j
], param
);
1195 if (func
->second
->ensure
== 'Q' || func
->second
->ensure
== 'M')
1196 ensure_query_parsed();
1197 if (func
->second
->ensure
== 'M') ensure_match();
1199 switch (func
->second
->tag
) {
1204 vector
<string
>::const_iterator i
;
1205 for (i
= args
.begin(); i
!= args
.end(); i
++)
1206 total
+= string_to_int(*i
);
1213 case CMD_allterms
: {
1214 // list of all terms indexing document
1216 if (!args
.empty()) id
= string_to_int(args
[0]);
1217 Xapian::TermIterator term
= db
.termlist_begin(id
);
1218 for ( ; term
!= db
.termlist_end(id
); term
++) {
1223 if (!value
.empty()) value
.erase(value
.size() - 1);
1228 for (vector
<string
>::const_iterator i
= args
.begin();
1229 i
!= args
.end(); i
++) {
1230 if (eval(*i
, param
).empty()) {
1238 MCI i
= cgi_params
.find(args
[0]);
1239 if (i
!= cgi_params
.end()) value
= i
->second
;
1244 g
= cgi_params
.equal_range(args
[0]);
1245 for (MCI i
= g
.first
; i
!= g
.second
; i
++) {
1249 if (!value
.empty()) value
.erase(value
.size() - 1);
1252 case CMD_collapsed
: {
1253 value
= str(collapsed
);
1258 if (!value
.empty()) {
1260 time_t date
= string_to_int(value
);
1261 if (date
!= (time_t)-1) {
1263 then
= gmtime(&date
);
1264 string date_fmt
= "%Y-%m-%d";
1265 if (args
.size() > 1) date_fmt
= eval(args
[1], param
);
1266 strftime(buf
, sizeof buf
, date_fmt
.c_str(), then
);
1275 static Xapian::doccount dbsize
;
1276 if (!dbsize
) dbsize
= db
.get_doccount();
1277 value
= str(dbsize
);
1281 func_attrib
*fa
= new func_attrib
;
1282 fa
->tag
= CMD_MACRO
+ macros
.size();
1285 fa
->evalargs
= N
; // FIXME: or 0?
1288 macros
.push_back(args
[1]);
1289 func_map
[args
[0]] = fa
;
1293 if (default_op
== Xapian::Query::OP_AND
) {
1300 int denom
= string_to_int(args
[1]);
1302 value
= "divide by 0";
1304 value
= str(string_to_int(args
[0]) /
1305 string_to_int(args
[1]));
1310 if (args
[0] == args
[1]) value
= "true";
1312 case CMD_emptydocs
: {
1316 Xapian::PostingIterator i
;
1317 for (i
= db
.postlist_begin(t
); i
!= db
.postlist_end(t
); ++i
) {
1318 if (i
.get_doclength() != 0) continue;
1319 if (!value
.empty()) value
+= '\t';
1325 char *env
= getenv(args
[0].c_str());
1326 if (env
!= NULL
) value
= env
;
1330 if (error_msg
.empty() && enquire
== NULL
&& !dbname
.empty()) {
1331 error_msg
= "Database '" + dbname
+ "' couldn't be opened";
1336 Xapian::docid did
= q0
;
1337 if (args
.size() > 1) did
= string_to_int(args
[1]);
1338 value
= fields
.get_field(did
, args
[0]);
1341 case CMD_filesize
: {
1342 // FIXME: rounding? i18n?
1343 int size
= string_to_int(args
[0]);
1346 const char * format
= 0;
1348 // Negative size -> empty result.
1349 } else if (size
== 1) {
1351 } else if (size
< 1024) {
1352 format
= "%d bytes";
1354 if (size
< 1024*1024) {
1358 if (size
< 1024*1024) {
1365 intpart
= unsigned(size
) / 1024;
1366 fraction
= unsigned(size
) % 1024;
1371 if (fraction
== -1) {
1372 len
= my_snprintf(buf
, sizeof(buf
), format
, intpart
);
1374 fraction
= (fraction
* 10 / 1024) + '0';
1375 len
= my_snprintf(buf
, sizeof(buf
), format
, intpart
, fraction
);
1377 if (len
< 0 || (unsigned)len
> sizeof(buf
)) len
= sizeof(buf
);
1378 value
.assign(buf
, len
);
1385 case CMD_filterterms
: {
1386 Xapian::TermIterator term
= db
.allterms_begin();
1387 term
.skip_to(args
[0]);
1388 while (term
!= db
.allterms_end()) {
1390 if (!startswith(t
, args
[0])) break;
1396 if (!value
.empty()) value
.erase(value
.size() - 1);
1400 string l
= args
[0], s
= args
[1];
1401 string::size_type i
= 0, j
= 0;
1403 while (j
!= l
.size()) {
1404 j
= l
.find('\t', i
);
1405 if (j
== string::npos
) j
= l
.size();
1406 if (j
- i
== s
.length()) {
1407 if (memcmp(s
.data(), l
.data() + i
, j
- i
) == 0) {
1422 value
= str(mset
.get_termfreq(args
[0]));
1423 } catch (const Xapian::InvalidOperationError
&) {
1424 // An MSet will raise this error if it's empty and not
1425 // associated with a search.
1426 value
= str(db
.get_termfreq(args
[0]));
1430 if (string_to_int(args
[0]) >= string_to_int(args
[1]))
1434 if (string_to_int(args
[0]) > string_to_int(args
[1]))
1437 case CMD_highlight
: {
1439 if (args
.size() > 2) {
1441 if (args
.size() > 3) {
1444 string::const_iterator i
;
1445 i
= find_if(bra
.begin() + 2, bra
.end(), p_nottag
);
1447 ket
+= bra
.substr(1, i
- bra
.begin() - 1);
1452 value
= html_highlight(args
[0], args
[1], bra
, ket
);
1456 // 0-based mset index
1457 value
= str(hit_no
);
1461 url_query_string
= "?DB=";
1462 url_query_string
+= dbname
;
1463 multimap
<string
, string
>::const_iterator j
;
1464 for (j
= probabilistic_query
.begin();
1465 j
!= probabilistic_query
.end();
1467 if (j
->first
.empty()) {
1468 url_query_string
+= "&P=";
1470 url_query_string
+= "&P."
1471 url_query_string
+= j
->first
;
1472 url_query_string
+= '=';
1474 const char *q
= j
->second
.c_str();
1476 while ((ch
= *q
++) != '\0') {
1479 url_query_string
+= "%2b";
1482 url_query_string
+= "%22";
1485 url_query_string
+= "%25";
1488 url_query_string
+= "%26";
1494 url_query_string
+= ch
;
1498 // add any boolean terms
1499 for (FMCI i
= filter_map
.begin(); i
!= filter_map
.end(); i
++) {
1500 url_query_string
+= "&B=";
1501 url_query_string
+= i
->second
;
1504 for (hit_no
= topdoc
; hit_no
< last
; hit_no
++)
1505 value
+= print_caption(args
[0], param
);
1508 case CMD_hitsperpage
:
1509 value
= str(hits_per_page
);
1511 case CMD_hostname
: {
1513 // remove URL scheme and/or path
1514 string::size_type i
= value
.find("://");
1515 if (i
== string::npos
) i
= 0; else i
+= 3;
1516 value
= value
.substr(i
, value
.find('/', i
) - i
);
1517 // remove user@ or user:password@
1518 i
= value
.find('@');
1519 if (i
!= string::npos
) value
.erase(0, i
+ 1);
1521 i
= value
.find(':');
1522 if (i
!= string::npos
) value
.resize(i
);
1526 value
= html_escape(args
[0]);
1529 value
= html_strip(args
[0]);
1531 case CMD_httpheader
:
1532 if (!suppress_http_headers
) {
1533 cout
<< args
[0] << ": " << args
[1] << endl
;
1534 if (!set_content_type
&& args
[0].length() == 12 &&
1535 strcasecmp(args
[0].c_str(), "Content-Type") == 0) {
1536 set_content_type
= true;
1545 if (!args
[0].empty())
1546 value
= eval(args
[1], param
);
1547 else if (args
.size() > 2)
1548 value
= eval(args
[2], param
);
1551 value
= eval_file(args
[0]);
1557 case CMD_jsonarray
: {
1558 const string
& l
= args
[0];
1559 string::size_type i
= 0, j
;
1566 j
= l
.find('\t', i
);
1567 string
elt(l
, i
, j
- i
);
1570 if (j
== string::npos
) break;
1580 case CMD_lastpage
: {
1581 int l
= mset
.get_matches_estimated();
1582 if (l
> 0) l
= (l
- 1) / hits_per_page
+ 1;
1587 if (string_to_int(args
[0]) <= string_to_int(args
[1]))
1591 if (args
[0].empty()) {
1594 size_t length
= count(args
[0].begin(), args
[0].end(), '\t');
1595 value
= str(length
+ 1);
1599 if (!args
[0].empty()) {
1600 string pre
, inter
, interlast
, post
;
1601 switch (args
.size()) {
1603 inter
= interlast
= args
[1];
1607 interlast
= args
[2];
1611 inter
= interlast
= args
[2];
1617 interlast
= args
[3];
1622 string list
= args
[0];
1623 string::size_type split
= 0, split2
;
1624 while ((split2
= list
.find('\t', split
)) != string::npos
) {
1625 if (split
) value
+= inter
;
1626 value
+= list
.substr(split
, split2
- split
);
1629 if (split
) value
+= interlast
;
1630 value
+= list
.substr(split
);
1636 if (!vet_filename(args
[0])) break;
1637 string logfile
= log_dir
+ args
[0];
1638 int fd
= open(logfile
.c_str(), O_CREAT
|O_APPEND
|O_WRONLY
, 0644);
1639 if (fd
== -1) break;
1640 vector
<string
> noargs
;
1643 if (args
.size() > 1) {
1646 line
= DEFAULT_LOG_ENTRY
;
1648 line
= eval(line
, noargs
);
1650 (void)write_all(fd
, line
.data(), line
.length());
1655 if (!vet_filename(args
[0])) break;
1656 string cdbfile
= cdb_dir
+ args
[0];
1657 int fd
= open(cdbfile
.c_str(), O_RDONLY
);
1658 if (fd
== -1) break;
1663 if (cdb_find(&cdb
, args
[1].data(), args
[1].length()) > 0) {
1664 size_t datalen
= cdb_datalen(&cdb
);
1665 const void *dat
= cdb_get(&cdb
, datalen
, cdb_datapos(&cdb
));
1667 value
= string(static_cast<const char *>(dat
), datalen
);
1672 close(fd
); // FIXME: cache fds?
1676 value
= Xapian::Unicode::tolower(args
[0]);
1679 if (string_to_int(args
[0]) < string_to_int(args
[1]))
1683 if (!args
[0].empty()) {
1684 string l
= args
[0], pat
= args
[1];
1685 vector
<string
> new_args(param
);
1686 string::size_type i
= 0, j
;
1688 j
= l
.find('\t', i
);
1689 new_args
[0] = l
.substr(i
, j
- i
);
1690 value
+= eval(pat
, new_args
);
1691 if (j
== string::npos
) break;
1698 vector
<string
>::const_iterator i
= args
.begin();
1699 int val
= string_to_int(*i
++);
1700 for (; i
!= args
.end(); i
++) {
1701 int x
= string_to_int(*i
);
1702 if (x
> val
) val
= x
;
1708 vector
<string
>::const_iterator i
= args
.begin();
1709 int val
= string_to_int(*i
++);
1710 for (; i
!= args
.end(); i
++) {
1711 int x
= string_to_int(*i
);
1712 if (x
< val
) val
= x
;
1718 // number of matches
1719 value
= str(mset
.get_matches_estimated());
1721 case CMD_msizeexact
:
1723 if (mset
.get_matches_lower_bound()
1724 == mset
.get_matches_upper_bound())
1728 int denom
= string_to_int(args
[1]);
1730 value
= "divide by 0";
1732 value
= str(string_to_int(args
[0]) %
1733 string_to_int(args
[1]));
1738 vector
<string
>::const_iterator i
= args
.begin();
1739 int total
= string_to_int(*i
++);
1740 while (i
!= args
.end())
1741 total
*= string_to_int(*i
++);
1746 int denom
= string_to_int(args
[2]);
1748 value
= "divide by 0";
1750 int num
= string_to_int(args
[0]) * string_to_int(args
[1]);
1751 value
= str(num
/ denom
);
1756 if (args
[0] != args
[1]) value
= "true";
1759 string::const_iterator i
= args
[0].begin();
1760 int len
= args
[0].length();
1763 if (--len
&& len
% 3 == 0) value
+= option
["thousand"];
1768 if (args
[0].empty()) value
= "true";
1772 my_snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)time(NULL
));
1773 // MSVC's snprintf omits the zero byte if the string is
1774 // sizeof(buf) long.
1775 buf
[sizeof(buf
) - 1] = '\0';
1780 if (args
.size() == 2) {
1781 value
= option
[args
[0] + "," + args
[1]];
1783 value
= option
[args
[0]];
1787 for (vector
<string
>::const_iterator i
= args
.begin();
1788 i
!= args
.end(); i
++) {
1789 value
= eval(*i
, param
);
1790 if (!value
.empty()) break;
1795 value
= int_to_binary_string(string_to_int(args
[0]));
1797 case CMD_percentage
:
1799 value
= str(percent
);
1801 case CMD_prettyterm
:
1802 value
= pretty_term(args
[0]);
1806 url_prettify(value
);
1809 pair
<multimap
<string
, string
>::const_iterator
,
1810 multimap
<string
, string
>::const_iterator
> r
;
1811 r
= probabilistic_query
.equal_range(args
.empty() ?
1812 string() : args
[0]);
1813 multimap
<string
, string
>::const_iterator j
;
1814 for (j
= r
.first
; j
!= r
.second
; ++j
) {
1815 if (!value
.empty()) value
+= '\t';
1816 const string
& s
= j
->second
;
1817 size_t start
= 0, tab
;
1818 while ((tab
= s
.find('\t', start
)) != string::npos
) {
1819 value
.append(s
, start
, tab
- start
);
1823 value
.append(s
, start
, string::npos
);
1827 case CMD_querydescription
:
1828 value
= query
.get_description();
1830 case CMD_queryterms
:
1834 int start
= string_to_int(args
[0]);
1835 int end
= string_to_int(args
[1]);
1836 while (start
<= end
) {
1837 value
+= str(start
);
1838 if (start
< end
) value
+= '\t';
1845 if (!args
.empty()) id
= string_to_int(args
[0]);
1846 value
= db
.get_document(id
).get_data();
1849 case CMD_relevant
: {
1850 // document id if relevant; empty otherwise
1852 if (!args
.empty()) id
= string_to_int(args
[0]);
1853 map
<Xapian::docid
, bool>::iterator i
= ticked
.find(id
);
1854 if (i
!= ticked
.end()) {
1855 i
->second
= false; // icky side-effect
1860 case CMD_relevants
: {
1861 for (map
<Xapian::docid
, bool>::const_iterator i
= ticked
.begin();
1862 i
!= ticked
.end(); i
++) {
1864 value
+= str(i
->first
);
1868 if (!value
.empty()) value
.erase(value
.size() - 1);
1873 value
= str(percent
/ 10);
1876 option
[args
[0]] = args
[1];
1879 string base
= args
[0] + ',';
1880 if (args
.size() % 2 != 1)
1881 throw string("$setmap requires an odd number of arguments");
1882 for (unsigned int i
= 1; i
+ 1 < args
.size(); i
+= 2) {
1883 option
[base
+ args
[i
]] = args
[i
+ 1];
1887 case CMD_setrelevant
: {
1888 string::size_type i
= 0, j
;
1890 j
= args
[0].find_first_not_of("0123456789", i
);
1891 Xapian::docid id
= atoi(args
[0].substr(i
, j
- i
).c_str());
1893 rset
.add_document(id
);
1896 if (j
== string::npos
) break;
1902 string list
= args
[0], pos
= args
[1];
1903 vector
<string
> items
;
1904 string::size_type i
= 0, j
;
1906 j
= list
.find('\t', i
);
1907 items
.push_back(list
.substr(i
, j
- i
));
1908 if (j
== string::npos
) break;
1912 bool have_added
= false;
1914 j
= pos
.find('\t', i
);
1915 int item
= string_to_int(pos
.substr(i
, j
- i
));
1916 if (item
>= 0 && size_t(item
) < items
.size()) {
1917 if (have_added
) value
+= '\t';
1918 value
+= items
[item
];
1921 if (j
== string::npos
) break;
1927 Xapian::Snipper snipper
;
1928 snipper
.set_mset(mset
);
1929 snipper
.set_stemmer(Xapian::Stem(option
["stemmer"]));
1930 size_t len
= (args
.size() == 1) ? 200 : string_to_int(args
[1]);
1931 value
= snipper
.generate_snippet(args
[0], len
);
1936 if (args
.size() == 1) {
1943 string::size_type i
= 0;
1945 if (split
.empty()) {
1947 if (i
>= value
.size()) break;
1949 i
= value
.find(split
, i
);
1950 if (i
== string::npos
) break;
1952 value
.replace(i
, split
.size(), 1, '\t');
1957 case CMD_stoplist
: {
1958 Xapian::TermIterator i
= qp
.stoplist_begin();
1959 Xapian::TermIterator end
= qp
.stoplist_end();
1961 if (!value
.empty()) value
+= '\t';
1968 value
= str(string_to_int(args
[0]) - string_to_int(args
[1]));
1971 int start
= string_to_int(args
[1]);
1973 if (static_cast<size_t>(-start
) >= args
[0].size()) {
1976 start
= static_cast<int>(args
[0].size()) + start
;
1979 if (static_cast<size_t>(start
) >= args
[0].size()) break;
1981 size_t len
= string::npos
;
1982 if (args
.size() > 2) {
1983 int int_len
= string_to_int(args
[2]);
1985 len
= size_t(int_len
);
1987 len
= args
[0].size() - start
;
1988 if (static_cast<size_t>(-int_len
) >= len
) {
1991 len
-= static_cast<size_t>(-int_len
);
1995 value
= args
[0].substr(start
, len
);
1998 case CMD_suggestion
:
1999 value
= qp
.get_corrected_query_string();
2003 // list of matching terms
2004 Xapian::TermIterator term
= enquire
->get_matching_terms_begin(q0
);
2005 while (term
!= enquire
->get_matching_terms_end(q0
)) {
2006 // check term was in the typed query so we ignore
2007 // boolean filter terms
2008 if (termset
.find(*term
) != termset
.end()) {
2015 if (!value
.empty()) value
.erase(value
.size() - 1);
2019 value
= str(topdoc
/ hits_per_page
+ 1);
2024 my_snprintf(buf
, sizeof(buf
), "%.6f", secs
);
2025 // MSVC's snprintf omits the zero byte if the string is
2026 // sizeof(buf) long.
2027 buf
[sizeof(buf
) - 1] = '\0';
2032 // first document on current page of hit list (counting from 0)
2033 value
= str(topdoc
);
2038 if (!args
.empty()) howmany
= string_to_int(args
[0]);
2039 if (howmany
< 0) howmany
= 0;
2041 // List of expand terms
2043 OmegaExpandDecider
decider(db
, &termset
);
2045 if (!rset
.empty()) {
2046 set_expansion_scheme(*enquire
, option
);
2047 #if XAPIAN_AT_LEAST(1,3,2)
2048 eset
= enquire
->get_eset(howmany
* 2, rset
, &decider
);
2050 eset
= enquire
->get_eset(howmany
* 2, rset
, 0,
2051 expand_param_k
, &decider
);
2053 } else if (mset
.size()) {
2058 // FIXME: what if mset does not start at first match?
2059 Xapian::MSetIterator m
= mset
.begin();
2060 for ( ; m
!= mset
.end(); ++m
) {
2061 tmp
.add_document(*m
);
2062 if (--c
== 0) break;
2065 set_expansion_scheme(*enquire
, option
);
2066 #if XAPIAN_AT_LEAST(1,3,2)
2067 eset
= enquire
->get_eset(howmany
* 2, tmp
, &decider
);
2069 eset
= enquire
->get_eset(howmany
* 2, tmp
, 0,
2070 expand_param_k
, &decider
);
2074 // Don't show more than one word with the same stem.
2076 Xapian::ESetIterator i
;
2077 for (i
= eset
.begin(); i
!= eset
.end(); ++i
) {
2079 string stem
= (*stemmer
)(term
);
2080 if (stems
.find(stem
) != stems
.end()) continue;
2084 if (--howmany
== 0) break;
2086 if (!value
.empty()) value
.erase(value
.size() - 1);
2090 omegascript_transform(value
, args
);
2093 value
= generate_sample(args
[0],
2094 string_to_int(args
[1]),
2095 args
.size() > 2 ? args
[2] : string(),
2096 args
.size() > 3 ? args
[3] : string());
2099 const string
&list
= args
[0];
2100 if (list
.empty()) break;
2101 string::size_type split
= 0, split2
;
2104 split2
= list
.find('\t', split
);
2105 string item
= list
.substr(split
, split2
- split
);
2108 } else if (item
!= prev
) {
2114 } while (split2
!= string::npos
);
2118 value
= str(binary_string_to_int(args
[0]));
2121 const string
&term
= args
[0];
2122 Xapian::TermIterator i
= qp
.unstem_begin(term
);
2123 Xapian::TermIterator end
= qp
.unstem_end(term
);
2125 if (!value
.empty()) value
+= '\t';
2132 value
= Xapian::Unicode::toupper(args
[0]);
2135 url_encode(value
, args
[0]);
2138 Xapian::docid id
= q0
;
2139 Xapian::valueno value_no
= string_to_int(args
[0]);
2140 if (args
.size() > 1) id
= string_to_int(args
[1]);
2141 value
= db
.get_document(id
).get_value(value_no
);
2145 value
= PACKAGE_STRING
;
2148 value
= double_to_string(weight
);
2151 args
.insert(args
.begin(), param
[0]);
2152 int macro_no
= func
->second
->tag
- CMD_MACRO
;
2153 assert(macro_no
>= 0 && (unsigned int)macro_no
< macros
.size());
2154 // throw "Unknown function '" + var + "'";
2155 value
= eval(macros
[macro_no
], args
);
2160 } catch (const Xapian::Error
& e
) {
2161 // FIXME: this means we only see the most recent error in $error
2162 // - is that the best approach?
2163 error_msg
= e
.get_msg();
2166 res
+= fmt
.substr(p
);
// eval_file: evaluate the format template file named by fmtfile.
//
// Visible success path: the name must be accepted by vet_filename(), the
// file at template_dir + fmtfile must be read by load_file(), and the result
// is eval() applied to the loaded text with the local noargs vector.
//
// NOTE(review): this extract drops several source lines (the declarations
// of err and fmt, closing braces, and whatever is finally done with msg), so
// the failure path is only partly visible here - confirm against the full
// file before relying on these notes.
2171 eval_file(const string
&fmtfile
)
2174 if (vet_filename(fmtfile
)) {
// Templates are looked up relative to template_dir.
2175 string file
= template_dir
+ fmtfile
;
2177 if (load_file(file
, fmt
)) {
2178 vector
<string
> noargs
;
2180 return eval(fmt
, noargs
);
// Presumably reached when load_file() fails: keep the OS error text for
// errno so it can be included in the message built below.
2182 err
= strerror(errno
);
// Presumably the branch taken when vet_filename() rejects the name.
2184 err
= "name contains '..'";
2187 // FIXME: report why!
// Build the error text; the detail held in err is appended in parentheses
// only when it is non-empty.
2188 string msg
= string("Couldn't read format template '") + fmtfile
+ '\'';
2189 if (!err
.empty()) msg
+= " (" + err
+ ')';
// pretty_term: convert an index term into a more readable form.
//
// term is taken by value and edited in place.  Terms of length <= 1 and
// terms whose first character fails C_isupper() are returned unchanged.
// A first character of 'Z' marks the term as stemmed; if the query parser
// (qp) recorded an unstemmed form for the term, the first such form is
// returned.  Otherwise a known term prefix is replaced by its user-facing
// prefix from termprefix_to_userprefix.
//
// NOTE(review): this extract omits several source lines (some guards, the
// declaration of prefix, closing braces and the final return), so the exact
// nesting of the statements below cannot be confirmed from here.
2194 pretty_term(string term
)
2196 // Just leave empty strings and single characters alone.
2197 if (term
.length() <= 1) return term
;
2199 // Assume unprefixed terms are unstemmed.
2200 if (!C_isupper(term
[0])) return term
;
2202 // Handle stemmed terms.
2203 bool stemmed
= (term
[0] == 'Z');
2205 // First of all, check if a term in the query stemmed to this one.
2206 Xapian::TermIterator u
= qp
.unstem_begin(term
);
2207 // There might be multiple words with the same stem, but we only want
2208 // one so just take the first.
2209 if (u
!= qp
.unstem_end(term
)) return *u
;
// Set when the term should be wrapped in double quotes (see below).
2215 bool add_quotes
= false;
2217 // Check if the term has a prefix.
2218 if (C_isupper(term
[0])) {
2219 // See if we have this prefix in the termprefix_to_userprefix map. If
2220 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
// prefix_from_term() fills prefix and returns a length, which is used below
// as the number of leading characters of term to replace.
2222 size_t prefix_len
= prefix_from_term(prefix
, term
);
2224 map
<string
, string
>::const_iterator i
;
2225 i
= termprefix_to_userprefix
.find(prefix
);
2226 if (i
!= termprefix_to_userprefix
.end()) {
2227 string user_prefix
= i
->second
;
// Swap the first prefix_len characters of term for the user-facing prefix.
2229 term
.replace(0, prefix_len
, user_prefix
);
2231 // We don't have a prefix mapping for this, so just set a flag to
2232 // add quotes around the term.
// A trailing '.' is appended when the term was flagged as stemmed above.
2237 if (stemmed
) term
+= '.';
// Prepends the opening double quote; presumably guarded by add_quotes (the
// guard and the matching closing quote are not visible in this extract).
2240 term
.insert(0, "\"");
// print_caption: evaluate the format fmt for the hit at index hit_no of mset
// and return the result of eval(fmt, param).
//
// Before evaluating, the per-hit state is refreshed from mset[hit_no]:
// q0 (the dereferenced MSet entry, used elsewhere in this file as a
// document id), weight, percent and collapsed.
//
// NOTE(review): "¶m" in the parameter list below is a mis-encoded "&param"
// (the "&para" prefix was turned into a pilcrow by the extraction); the
// body refers to the parameter as param.  Restore it when repairing the
// file.
2248 print_caption(const string
&fmt
, const vector
<string
> ¶m
)
2250 q0
= *(mset
[hit_no
]);
2252 weight
= mset
[hit_no
].get_weight();
2253 percent
= mset
.convert_to_percent(mset
[hit_no
]);
2254 collapsed
= mset
[hit_no
].get_collapse_count();
2256 return eval(fmt
, param
);
// NOTE(review): the header of the enclosing function is missing from this
// extract (the extraction dropped it), as are its braces and any try/catch
// lines.  The statements below evaluate the template named by fmtname via
// eval_file() and make sure a Content-Type header is written to cout unless
// header output is suppressed.
2263 const char * p
= getenv("SERVER_PROTOCOL");
2264 if (p
&& strcmp(p
, "INCLUDED") == 0) {
2265 // We're being included in another page, so suppress headers.
2266 suppress_http_headers
= true;
// The whole template is evaluated into output first; the default
// Content-Type header is written afterwards only if nothing has set
// set_content_type already and headers are not suppressed.
2269 std::string output
= eval_file(fmtname
);
2270 if (!set_content_type
&& !suppress_http_headers
) {
2271 cout
<< "Content-Type: text/html" << std::endl
;
2272 set_content_type
= true;
// Presumably an exception-handling path (the handler line itself is not
// visible here); it repeats the same header check as above.
2277 // Ensure the headers have been output so that any exception gets
2278 // reported rather than giving a server error.
2279 if (!set_content_type
&& !suppress_http_headers
) {
2280 cout
<< "Content-Type: text/html" << std::endl
;
2281 set_content_type
= true;
// ensure_query_parsed: one-shot processing of the query-related CGI
// parameters.  The query_parsed flag makes every call after the first
// return immediately.
//
// Visible effects:
//  - decides (discard_rset / force_first_page) whether the relevance set
//    and the page position from the previous request are kept, based on
//    the "xP", "xDB" and "xFILTERS" parameters and on the result of
//    set_probabilistic();
//  - computes topdoc from the "TOPDOC", ">", "<", "[" and "#" parameters,
//    rounding it down to a page boundary unless "RAWSEARCH" is non-zero;
//  - adds the document ids listed in "R" parameters to rset.
//
// NOTE(review): this extract omits several source lines (declarations of
// val, v and g, the switch header and some case labels, closing braces),
// so the exact nesting below cannot be confirmed from here.
2289 ensure_query_parsed()
2291 if (query_parsed
) return;
2292 query_parsed
= true;
2297 // Should we discard the existing R-set recorded in R CGI parameters?
2298 bool discard_rset
= false;
2300 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2302 bool force_first_page
= false;
2305 // get list of terms from previous iteration of query
2306 val
= cgi_params
.find("xP");
2307 if (val
!= cgi_params
.end()) {
2309 // If xP given, default to discarding any RSet and forcing the first
2310 // page of results. If the query is the same, or an extension of
2311 // the previous query, we adjust these again below.
2312 discard_rset
= true;
2313 force_first_page
= true;
// set_probabilistic() returns a querytype; the SAME_QUERY and
// EXTENDED_QUERY values are the ones handled in the visible code below.
2315 querytype result
= set_probabilistic(v
);
2322 case EXTENDED_QUERY
:
2323 // If we've changed database, force the first page of hits
2324 // and discard the R-set (since the docids will have changed)
2325 val
= cgi_params
.find("xDB");
// Breaking here leaves both flags at the "true" defaults set above.
2326 if (val
!= cgi_params
.end() && val
->second
!= dbname
) break;
2327 if (result
== SAME_QUERY
&& force_first_page
) {
2328 val
= cgi_params
.find("xFILTERS");
2329 if (val
!= cgi_params
.end() && val
->second
!= filters
) {
2330 // Filters have changed since last query.
// Presumably reached only when the filters are unchanged, so the current
// page position may be kept (the intervening lines are not visible).
2332 force_first_page
= false;
// The previous relevance set is kept for this query.
2335 discard_rset
= false;
2339 if (!force_first_page
) {
2340 // Work out which mset element is the first hit we want
// "TOPDOC" supplies the starting hit offset directly; it is parsed with
// atol(), so text with no leading number yields 0.
2342 val
= cgi_params
.find("TOPDOC");
2343 if (val
!= cgi_params
.end()) {
2344 topdoc
= atol(val
->second
.c_str());
2347 // Handle next, previous, and page links
// ">" moves forward one page; "<" moves back one page when topdoc is at
// least hits_per_page; "[" or "#" carries a page number counted from 1.
2348 if (cgi_params
.find(">") != cgi_params
.end()) {
2349 topdoc
+= hits_per_page
;
2350 } else if (cgi_params
.find("<") != cgi_params
.end()) {
2351 if (topdoc
>= hits_per_page
)
2352 topdoc
-= hits_per_page
;
2355 } else if ((val
= cgi_params
.find("[")) != cgi_params
.end() ||
2356 (val
= cgi_params
.find("#")) != cgi_params
.end()) {
2357 long page
= atol(val
->second
.c_str());
2358 // Do something sensible for page 0 (we count pages from 1).
2359 if (page
== 0) page
= 1;
2360 topdoc
= (page
- 1) * hits_per_page
;
2363 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2364 // Normally we snap TOPDOC like this so that things work nicely if
2365 // HITSPERPAGE is in a <select> or on radio buttons. If we're
2366 // postprocessing the output of omega and want variable sized pages,
2367 // this is unhelpful.
2368 bool raw_search
= false;
2369 val
= cgi_params
.find("RAWSEARCH");
2370 if (val
!= cgi_params
.end()) {
2371 raw_search
= bool(atol(val
->second
.c_str()));
// Integer division rounds topdoc down to the first hit of its page.
2374 if (!raw_search
) topdoc
= (topdoc
/ hits_per_page
) * hits_per_page
;
2377 if (!discard_rset
) {
2378 // put documents marked as relevant into the rset
2379 g
= cgi_params
.equal_range("R");
2380 for (MCI i
= g
.first
; i
!= g
.second
; i
++) {
2381 const string
& value
= i
->second
;
// Each "R" value is scanned as a '.'-separated list: skip any separator
// dots at j, parse the number starting there with atoi(), then jump to the
// next '.' (find() returning npos ends the loop).
2382 for (size_t j
= 0; j
< value
.size(); j
= value
.find('.', j
)) {
2383 while (value
[j
] == '.') ++j
;
2384 Xapian::docid d
= atoi(value
.c_str() + j
);
2386 rset
.add_document(d
);
// NOTE(review): the header of the enclosing function is missing from this
// extract, as are the lines that actually execute the query and the
// conditions guarding the topdoc adjustment.  What is visible: the work is
// done at most once (guarded by done_query), the elapsed time is stored in
// secs, and last / topdoc are adjusted from the match count.
2394 // run query if we haven't already
2398 if (done_query
) return;
// secs first holds the start time, then is overwritten with the elapsed
// time (difference of two RealTime::now() readings).
2400 secs
= RealTime::now();
2403 secs
= RealTime::now() - secs
;
// last starts as the lower bound on the number of matches.
2406 last
= mset
.get_matches_lower_bound();
2408 // Otherwise topdoc ends up being -6 if it's non-zero!
// Moves topdoc to the first hit of the page that contains hit last - 1;
// presumably applied only when topdoc lies beyond the matches (the guarding
// condition is not visible here).
2412 topdoc
= ((last
- 1) / hits_per_page
) * hits_per_page
;
2413 // last is the count of documents up to the end of the current page
2414 // (as returned by $last)
2415 if (topdoc
+ hits_per_page
< last
)
2416 last
= topdoc
+ hits_per_page
;
2420 // OmegaExpandDecider methods.
// Constructor: creates the stemmer from option["stemmer"] and fills
// exclude_stems from the terms in *querytermset.
//
// For each term visible below: empty terms are skipped; a first character
// of 'Z' marks the term as already stemmed; when the first character passes
// C_isupper() the leading prefix (length given by prefix_from_term()) is
// erased; a term not flagged as stemmed is run through the stemmer; the
// result is inserted into exclude_stems.
//
// NOTE(review): this extract omits several source lines (the initializer
// list, the declarations of term and prefix, and some statements between
// the two emptiness checks).  stemmer is allocated with new here; where it
// is released is not visible from this chunk.
2422 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database
& db_
,
2423 set
<string
> * querytermset
)
2426 // We'll want the stemmer for testing matches anyway.
2428 stemmer
= new Xapian::Stem(option
["stemmer"]);
2430 set
<string
>::const_iterator i
;
2431 for (i
= querytermset
->begin(); i
!= querytermset
->end(); ++i
) {
2433 if (term
.empty()) continue;
// ch is read before any editing of term, so the prefix test below uses the
// term's original first character.
2435 unsigned char ch
= term
[0];
2436 bool stemmed
= (ch
== 'Z');
// Second emptiness check; presumably term was shortened by a line that is
// not visible in this extract.
2439 if (term
.empty()) continue;
2443 if (C_isupper(ch
)) {
// Drop the leading prefix so only the word part remains.
2445 size_t prefix_len
= prefix_from_term(prefix
, term
);
2446 term
.erase(0, prefix_len
);
// Terms not already flagged as stemmed are stemmed here, so exclude_stems
// holds stems only.
2449 if (!stemmed
) term
= (*stemmer
)(term
);
2451 exclude_stems
.insert(term
);
2457 OmegaExpandDecider::operator()(const string
& term
) const
2459 unsigned char ch
= term
[0];
2461 // Reject terms with a prefix.
2462 if (C_isupper(ch
)) return false;
2466 // Don't suggest stopwords.
2467 if (stopper(term
)) return false;
2470 // Reject small numbers.
2471 if (term
.size() < 4 && C_isdigit(ch
)) return false;
2473 // Reject terms containing a space.
2474 if (term
.find(' ') != string::npos
) return false;
2476 // Skip terms with stems in the exclude_stems set, to avoid suggesting
2477 // terms which are already in the query in some form.
2478 string stem
= (*stemmer
)(term
);
2479 if (exclude_stems
.find(stem
) != exclude_stems
.end())
2482 // Ignore terms that only occur once (hapaxes) since they aren't
2483 // useful for finding related documents - they only occur in a
2484 // document that's already been marked as relevant.
2485 // FIXME: add an expand option to ignore terms where
2486 // termfreq == rtermfreq.
2487 if (db
.get_termfreq(term
) <= 1) return false;