From cf32d39172f4580a7ece3315b2516692fe3f1994 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 2 Feb 2016 17:17:20 +1300 Subject: [PATCH] Add support for negated boolean filter terms Specified by CGI parameter "N". --- xapian-applications/omega/docs/cgiparams.rst | 13 +++- xapian-applications/omega/docs/overview.rst | 95 +++++++++++++++++++++++----- xapian-applications/omega/omega.cc | 79 ++++++++++++++++++----- xapian-applications/omega/omegatest | 10 ++- xapian-applications/omega/query.cc | 22 ++++++- xapian-applications/omega/query.h | 4 +- 6 files changed, 185 insertions(+), 38 deletions(-) diff --git a/xapian-applications/omega/docs/cgiparams.rst b/xapian-applications/omega/docs/cgiparams.rst index 3bc517bee..9837c270b 100644 --- a/xapian-applications/omega/docs/cgiparams.rst +++ b/xapian-applications/omega/docs/cgiparams.rst @@ -89,8 +89,17 @@ Filtering parameters -------------------- B - general boolean filter terms. See the `overview document - `_ for details of how multiple B parameters are handled. + general boolean filter terms. + + See the `overview document `_ for details of how + multiple `B` and `N` parameters are handled. + +N + negated general boolean filter terms (new in Omega 1.3.5 - older + versions will just ignore any `N` parameters). + + See the `overview document `_ for details of how + multiple `B` and `N` parameters are handled. COLLAPSE value number to use for removing duplicate documents. diff --git a/xapian-applications/omega/docs/overview.rst b/xapian-applications/omega/docs/overview.rst index 8255e2c0b..6e0835a4a 100644 --- a/xapian-applications/omega/docs/overview.rst +++ b/xapian-applications/omega/docs/overview.rst @@ -27,11 +27,12 @@ Term construction Documents within an omega database are stored with two types of terms: those used for probabilistic searching (the CGI parameter 'P'), and -those used for boolean filtering (the CGI parameter 'B'). Boolean -terms start with an initial capital letter denoting the 'group' of the -term (e.g. 'M' for MIME type), while probabilistic terms are all -lower-case, and are also stemmed before adding to the -database. +those used for boolean filtering (the CGI parameters 'B' and 'N' - the +latter is a negated variant of 'B' and was added in Omega 1.3.5). + +Boolean terms start with an initial capital letter denoting the 'group' of the +term (e.g. 'M' for MIME type), while probabilistic terms are all lower-case, +and are also stemmed before adding to the database. The "english" stemmer is used by default - you can configure this for omindex and scriptindex with ``--stemmer=LANGUAGE`` (use ``--stemmer=none`` to disable @@ -40,21 +41,77 @@ search time you can configure the stemmer by adding ``$set{stemmer,LANGUAGE}`` to the top of your OmegaScript template. The two term types are used as follows when building the query: -B(oolean) terms with the same prefix are ORed together, with all the -different prefix groups being ANDed together. This is then FILTERed -against the P(robabilistic) terms. This will look something like:: - - [ FILTER ] - / \ - / \ - P-terms [ AND ] - / |...\ - / + +The 'P' parameter is parsed using `Xapian::QueryParser` to give a +`Xapian::Query` object denoted as `P-terms` below. + +There are two ways that 'B' and 'N' parameters are handled, depending if the +term-prefix has been configured as "non-exclusive" or not. The default is +"exclusive" (and in versions before 1.3.4, this was how all 'B' parameters +were handled). + +Exclusive Boolean Prefix +------------------------ + +B(oolean) terms from 'B' parameters with the same prefix are ORed together, +like so:: + + [ OR ] / | ... \ B(F,1) B(F,2)...B(F,n) -Where B(F,1) is the first boolean term with prefix F, and so on. +Where B(F,1) is the first boolean term with prefix F from a 'B' parameter, and +so on. + +Non-Exclusive Boolean Prefix +---------------------------- + +For example, ``$setmap{nonexclusiveprefix,K,true}`` sets prefix `K` as +non-exclusive, which means that multiple filter terms from 'B' parameters will +be combined with "AND" instead of "OR", like so:: + + [ AND ] + / | ... \ + B(K,1) B(K,2)... B(K,m) + +Combining the Boolean Filters +----------------------------- + +The subqueries for each prefix from "B" parameters are combined with AND, +to make this (which we refer to as "B-filter" below):: + + [ AND ] + / | ... \ + / \ + [ OR ] [ AND ] + / | ... \ / | ... \ + B(F,1) B(F,2)...B(F,n) B(K,1) B(K,2)...B(K,m) + + +Negated Boolean Terms +--------------------- + +All the terms from all 'N' parameters are combined together with "OR", to +make this (which we refer to as "N-filter" below):: + + [ OR ] + / ... | | ... \ + N(F,1)...N(F,n) N(K,1)...N(K,m) + +Putting it all together +----------------------- + +The P-terms are filtered by the B-filter using "FILTER" and by the N-filter +using "AND_NOT":: + + [ AND_NOT ] + / \ + / \ + [ FILTER ] N-terms + / \ + / \ + P-terms B-terms The intent here is to allow filtering on arbitrary (and, typically, orthogonal) characteristics of the document. For instance, by adding @@ -63,9 +120,13 @@ filtering the probabilistic search for only documents that are both in the "/press" site *and* which are either of MIME type text/html or text/plain. (See below for more information about sites.) +If B-terms or N-terms is absent, that part of the query is simply omitted. + If there is no probabilistic query, the boolean filter is promoted to be the query, and the weighting scheme is set to boolean. This has -the effect of applying the boolean filter to the whole database. +the effect of applying the boolean filter to the whole database. If +there are only N-terms, then ``Query::MatchAll`` is used for the left +side of the "AND_NOT". In order to add more boolean prefixes, you will need to alter the ``index_file()`` function in omindex.cc. Currently omindex adds several diff --git a/xapian-applications/omega/omega.cc b/xapian-applications/omega/omega.cc index d7d7fb72f..8d6f090bb 100644 --- a/xapian-applications/omega/omega.cc +++ b/xapian-applications/omega/omega.cc @@ -3,7 +3,7 @@ * Copyright 1999,2000,2001 BrightStation PLC * Copyright 2001 James Aylett * Copyright 2001,2002 Ananova Ltd - * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015 Olly Betts + * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015,2016 Olly Betts * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -312,6 +312,47 @@ try { } } + // set any negated boolean filters + g = cgi_params.equal_range("N"); + if (g.first != g.second) { + vector filter_v; + for (MCI i = g.first; i != g.second; i++) { + const string & v = i->second; + // we'll definitely get empty N fields from "-ALL-" options + if (!v.empty() && C_isalnum(v[0])) { + add_nterm(v); + filter_v.push_back(v); + } + } + sort(filter_v.begin(), filter_v.end()); + vector::const_iterator j; + for (j = filter_v.begin(); j != filter_v.end(); ++j) { + const string & nterm = *j; + string::size_type e = nterm.find(filter_sep); + filters += '!'; + if (usual(e == string::npos)) { + filters += nterm; + } else { + // If a filter contains filter_sep then double it to escape. + // Each filter must start with an alnum (checked above) and + // the value after the last filter is the default op, which + // is encoded as a non-alnum so filter_sep followed by + // something other than filter_sep must be separating filters. + string::size_type b = 0; + while (e != string::npos) { + filters.append(nterm, b, e + 1 - b); + b = e; + e = nterm.find(filter_sep, b + 1); + } + filters.append(nterm, b, string::npos); + } + filters += filter_sep; + // old_filters predates 'N' terms, so if there are 'N' terms this + // is definitely a different query. + old_filters.clear(); + } + } + // date range filters val = cgi_params.find("START"); if (val != cgi_params.end()) date_start = val->second; @@ -321,7 +362,7 @@ try { if (val != cgi_params.end()) date_span = val->second; // If more default_op values are supported, encode them as non-alnums - // other than filter_sep. + // other than filter_sep or '!'. filters += (default_op == Xapian::Query::OP_AND ? '.' : '-'); filters += date_start; filters += filter_sep; @@ -329,12 +370,14 @@ try { filters += filter_sep; filters += date_span; - old_filters += date_start; - old_filters += filter_sep_old; - old_filters += date_end; - old_filters += filter_sep_old; - old_filters += date_span; - old_filters += (default_op == Xapian::Query::OP_AND ? 'A' : 'O'); + if (!old_filters.empty()) { + old_filters += date_start; + old_filters += filter_sep_old; + old_filters += date_end; + old_filters += filter_sep_old; + old_filters += date_span; + old_filters += (default_op == Xapian::Query::OP_AND ? 'A' : 'O'); + } // Percentage relevance cut-off val = cgi_params.find("THRESHOLD"); @@ -353,8 +396,10 @@ try { collapse = true; filters += filter_sep; filters += str(collapse_key); - old_filters += filter_sep_old; - old_filters += str(collapse_key); + if (!old_filters.empty()) { + old_filters += filter_sep_old; + old_filters += str(collapse_key); + } } } @@ -367,12 +412,12 @@ try { if (ch == 'D') { docid_order = Xapian::Enquire::DESCENDING; filters += 'D'; - old_filters += 'D'; + if (!old_filters.empty()) old_filters += 'D'; } else if (ch != 'A') { docid_order = Xapian::Enquire::DONT_CARE; } else { filters += 'X'; - old_filters += 'X'; + if (!old_filters.empty()) old_filters += 'X'; } } } @@ -391,23 +436,25 @@ try { } // Add the sorting related options to filters too. filters += str(sort_key); - old_filters += str(sort_key); + if (!old_filters.empty()) old_filters += str(sort_key); if (sort_after) { if (sort_ascending) { filters += 'F'; - old_filters += 'F'; + if (!old_filters.empty()) old_filters += 'F'; } else { filters += 'R'; - old_filters += 'R'; + if (!old_filters.empty()) old_filters += 'R'; } } else { if (!sort_ascending) { filters += 'r'; - old_filters += 'r'; + if (!old_filters.empty()) old_filters += 'r'; } } } + if (old_filters.empty()) old_filters = filters; + // min_hits (fill mset past topdoc+(hits_per_page+1) to // topdoc+max(hits_per_page+1,min_hits) val = cgi_params.find("MINHITS"); diff --git a/xapian-applications/omega/omegatest b/xapian-applications/omega/omegatest index 3071d25f1..2621665d4 100755 --- a/xapian-applications/omega/omegatest +++ b/xapian-applications/omega/omegatest @@ -1,7 +1,7 @@ #!/bin/sh # omegatest: Test omega CGI # -# Copyright (C) 2015 Olly Betts +# Copyright (C) 2015,2016 Olly Betts # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as @@ -163,6 +163,14 @@ testcase '0 * ((Horg OR Hcom) AND (Len AND Lfr))' P='lang:en lang:fr host:org ho testcase '((XANDa AND XANDb AND XANDc) AND (Y1998 OR Y2001))' B=Y1998 B=Y2001 B=XANDa B=XANDb B=XANDc testcase '0 * (((XANDa AND XANDb) AND XANDc) AND (Y1998 OR Y2001))' P='year:1998 year:2001 and:a and:b and:c' +testcase '(Ztruth@1 AND_NOT Epdf)' P=truth N=Epdf +testcase '(Ztruth@1 AND_NOT (Ehtm OR Epdf))' P=truth N=Epdf N=Ehtm +testcase '(Ztruth@1 AND_NOT (Ehtm OR Epdf OR Lde OR Lfr))' P=truth N=Lfr N=Epdf N=Ehtm N=Lde +testcase '((Ztruth@1 FILTER (Lfr AND Lzh)) AND_NOT (Lde OR Len))' P=truth N=Lde N=Len B=Lfr B=Lzh +testcase '((Ztruth@1 FILTER Lfr) AND_NOT (Ehtm OR Epdf))' P=truth N=Epdf N=Ehtm B=Lfr +testcase '( AND_NOT (Len OR Lfr))' N=Lfr N=Len +testcase '(VALUE_RANGE 0 2015 201501~ AND_NOT Len)' DATEVALUE=0 START=20150101 END=20150131 N=Len + # If faketime is available, test a range back from now. if faketime --version > /dev/null 2>&1; then TZ=UTC diff --git a/xapian-applications/omega/query.cc b/xapian-applications/omega/query.cc index a13dbf3e3..255d4f63a 100644 --- a/xapian-applications/omega/query.cc +++ b/xapian-applications/omega/query.cc @@ -436,13 +436,19 @@ set_probabilistic(const string &oldp) } static multimap filter_map; +static set neg_filters; typedef multimap::const_iterator FMCI; void add_bterm(const string &term) { string prefix; if (prefix_from_term(prefix, term) > 0) - filter_map.insert(multimap::value_type(prefix, term)); + filter_map.emplace(prefix, term); +} + +void add_nterm(const string &term) { + if (!term.empty()) + neg_filters.emplace(term); } static void @@ -528,6 +534,20 @@ run_query() } } + if (!neg_filters.empty()) { + // OR together all negated filters. + Xapian::Query filter(Xapian::Query::OP_OR, + neg_filters.begin(), neg_filters.end()); + + if (query.empty()) { + // If we only have a negative filter for the query, use MatchAll as + // the query to apply the filters to. + query = Xapian::Query::MatchAll; + force_boolean = true; + } + query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter); + } + if (!enquire || !error_msg.empty()) return; set_weighting_scheme(*enquire, option, force_boolean); diff --git a/xapian-applications/omega/query.h b/xapian-applications/omega/query.h index 47c5f32f5..6cd2f4ab1 100644 --- a/xapian-applications/omega/query.h +++ b/xapian-applications/omega/query.h @@ -1,7 +1,7 @@ /** @file query.h * @brief: Omega functions for running queries, etc. * - * Copyright (C) 2007,2011 Olly Betts + * Copyright (C) 2007,2011,2016 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -30,6 +30,8 @@ extern Xapian::Query::op default_op; void add_bterm(const std::string & term); +void add_nterm(const std::string & term); + void set_probabilistic_query(const std::string & prefix, const std::string & s); -- 2.11.4.GIT