From cf32d39172f4580a7ece3315b2516692fe3f1994 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Tue, 2 Feb 2016 17:17:20 +1300
Subject: [PATCH] Add support for negated boolean filter terms

Specified by CGI parameter "N".
---
 xapian-applications/omega/docs/cgiparams.rst | 13 +++-
 xapian-applications/omega/docs/overview.rst  | 95 +++++++++++++++++++++++-----
 xapian-applications/omega/omega.cc           | 79 ++++++++++++++++++-----
 xapian-applications/omega/omegatest          | 10 ++-
 xapian-applications/omega/query.cc           | 22 ++++++-
 xapian-applications/omega/query.h            |  4 +-
 6 files changed, 185 insertions(+), 38 deletions(-)

diff --git a/xapian-applications/omega/docs/cgiparams.rst b/xapian-applications/omega/docs/cgiparams.rst
index 3bc517bee..9837c270b 100644
--- a/xapian-applications/omega/docs/cgiparams.rst
+++ b/xapian-applications/omega/docs/cgiparams.rst
@@ -89,8 +89,17 @@ Filtering parameters
 --------------------
 
 B
-        general boolean filter terms.  See the `overview document
-        <overview.html>`_ for details of how multiple B parameters are handled.
+        general boolean filter terms.
+
+        See the `overview document <overview.html>`_ for details of how
+        multiple `B` and `N` parameters are handled.
+
+N
+        negated general boolean filter terms (new in Omega 1.3.5 - older
+        versions will just ignore any `N` parameters).
+
+        See the `overview document <overview.html>`_ for details of how
+        multiple `B` and `N` parameters are handled.
 
 COLLAPSE
 	value number to use for removing duplicate documents.
diff --git a/xapian-applications/omega/docs/overview.rst b/xapian-applications/omega/docs/overview.rst
index 8255e2c0b..6e0835a4a 100644
--- a/xapian-applications/omega/docs/overview.rst
+++ b/xapian-applications/omega/docs/overview.rst
@@ -27,11 +27,12 @@ Term construction
 
 Documents within an omega database are stored with two types of terms:
 those used for probabilistic searching (the CGI parameter 'P'), and
-those used for boolean filtering (the CGI parameter 'B'). Boolean
-terms start with an initial capital letter denoting the 'group' of the
-term (e.g. 'M' for MIME type), while probabilistic terms are all
-lower-case, and are also stemmed before adding to the
-database.
+those used for boolean filtering (the CGI parameters 'B' and 'N' - the
+latter is a negated variant of 'B' and was added in Omega 1.3.5).
+
+Boolean terms start with an initial capital letter denoting the 'group' of the
+term (e.g. 'M' for MIME type), while probabilistic terms are all lower-case,
+and are also stemmed before adding to the database.
 
 The "english" stemmer is used by default - you can configure this for omindex
 and scriptindex with ``--stemmer=LANGUAGE`` (use ``--stemmer=none`` to disable
@@ -40,21 +41,77 @@ search time you can configure the stemmer by adding ``$set{stemmer,LANGUAGE}``
 to the top of your OmegaScript template.
 
 The two term types are used as follows when building the query:
-B(oolean) terms with the same prefix are ORed together, with all the
-different prefix groups being ANDed together. This is then FILTERed
-against the P(robabilistic) terms. This will look something like::
-
-                  [ FILTER ]
-                   /      \
-                  /        \
-             P-terms      [   AND   ]
-                           /   |...\
-                          /
+
+The 'P' parameter is parsed using `Xapian::QueryParser` to give a
+`Xapian::Query` object denoted as `P-terms` below.
+
+There are two ways that 'B' and 'N' parameters are handled, depending if the
+term-prefix has been configured as "non-exclusive" or not.  The default is
+"exclusive" (and in versions before 1.3.4, this was how all 'B' parameters
+were handled).
+
+Exclusive Boolean Prefix
+------------------------
+
+B(oolean) terms from 'B' parameters with the same prefix are ORed together,
+like so::
+
+
                     [   OR   ]
                    /    | ... \
               B(F,1) B(F,2)...B(F,n)
 
-Where B(F,1) is the first boolean term with prefix F, and so on.
+Where B(F,1) is the first boolean term with prefix F from a 'B' parameter, and
+so on.
+
+Non-Exclusive Boolean Prefix
+----------------------------
+
+For example, ``$setmap{nonexclusiveprefix,K,true}`` sets prefix `K` as
+non-exclusive, which means that multiple filter terms from 'B' parameters will
+be combined with "AND" instead of "OR", like so::
+
+                    [   AND   ]
+                   /     | ... \
+              B(K,1) B(K,2)... B(K,m)
+
+Combining the Boolean Filters
+-----------------------------
+
+The subqueries for each prefix from "B" parameters are combined with AND,
+to make this (which we refer to as "B-filter" below)::
+
+                         [     AND     ]
+                        /       |  ...  \
+                       /                 \
+                 [   OR   ]               [   AND  ]
+                /    | ... \             /    | ... \
+           B(F,1) B(F,2)...B(F,n)   B(K,1) B(K,2)...B(K,m)
+
+
+Negated Boolean Terms
+---------------------
+
+All the terms from all 'N' parameters are combined together with "OR", to
+make this (which we refer to as "N-filter" below)::
+
+                    [       OR       ]
+                   / ... |     |  ... \
+              N(F,1)...N(F,n) N(K,1)...N(K,m)
+
+Putting it all together
+-----------------------
+
+The P-terms are filtered by the B-filter using "FILTER" and by the N-filter
+using "AND_NOT"::
+
+                        [ AND_NOT ]
+                       /           \
+                      /             \
+            [ FILTER ]             N-terms
+             /      \
+            /        \
+       P-terms      B-terms
 
 The intent here is to allow filtering on arbitrary (and, typically,
 orthogonal) characteristics of the document. For instance, by adding
@@ -63,9 +120,13 @@ filtering the probabilistic search for only documents that are both in
 the "/press" site *and* which are either of MIME type text/html or
 text/plain. (See below for more information about sites.)
 
+If B-terms or N-terms is absent, that part of the query is simply omitted.
+
 If there is no probabilistic query, the boolean filter is promoted to
 be the query, and the weighting scheme is set to boolean.  This has
-the effect of applying the boolean filter to the whole database.
+the effect of applying the boolean filter to the whole database.  If
+there are only N-terms, then ``Query::MatchAll`` is used for the left
+side of the "AND_NOT".
 
 In order to add more boolean prefixes, you will need to alter the
 ``index_file()`` function in omindex.cc. Currently omindex adds several
diff --git a/xapian-applications/omega/omega.cc b/xapian-applications/omega/omega.cc
index d7d7fb72f..8d6f090bb 100644
--- a/xapian-applications/omega/omega.cc
+++ b/xapian-applications/omega/omega.cc
@@ -3,7 +3,7 @@
  * Copyright 1999,2000,2001 BrightStation PLC
  * Copyright 2001 James Aylett
  * Copyright 2001,2002 Ananova Ltd
- * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015 Olly Betts
+ * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015,2016 Olly Betts
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -312,6 +312,47 @@ try {
 	}
     }
 
+    // set any negated boolean filters
+    g = cgi_params.equal_range("N");
+    if (g.first != g.second) {
+	vector<string> filter_v;
+	for (MCI i = g.first; i != g.second; i++) {
+	    const string & v = i->second;
+	    // we'll definitely get empty N fields from "-ALL-" options
+	    if (!v.empty() && C_isalnum(v[0])) {
+		add_nterm(v);
+		filter_v.push_back(v);
+	    }
+	}
+	sort(filter_v.begin(), filter_v.end());
+	vector<string>::const_iterator j;
+	for (j = filter_v.begin(); j != filter_v.end(); ++j) {
+	    const string & nterm = *j;
+	    string::size_type e = nterm.find(filter_sep);
+	    filters += '!';
+	    if (usual(e == string::npos)) {
+		filters += nterm;
+	    } else {
+		// If a filter contains filter_sep then double it to escape.
+		// Each filter must start with an alnum (checked above) and
+		// the value after the last filter is the default op, which
+		// is encoded as a non-alnum so filter_sep followed by
+		// something other than filter_sep must be separating filters.
+		string::size_type b = 0;
+		while (e != string::npos) {
+		    filters.append(nterm, b, e + 1 - b);
+		    b = e;
+		    e = nterm.find(filter_sep, b + 1);
+		}
+		filters.append(nterm, b, string::npos);
+	    }
+	    filters += filter_sep;
+	    // old_filters predates 'N' terms, so if there are 'N' terms this
+	    // is definitely a different query.
+	    old_filters.clear();
+	}
+    }
+
     // date range filters
     val = cgi_params.find("START");
     if (val != cgi_params.end()) date_start = val->second;
@@ -321,7 +362,7 @@ try {
     if (val != cgi_params.end()) date_span = val->second;
 
     // If more default_op values are supported, encode them as non-alnums
-    // other than filter_sep.
+    // other than filter_sep or '!'.
     filters += (default_op == Xapian::Query::OP_AND ? '.' : '-');
     filters += date_start;
     filters += filter_sep;
@@ -329,12 +370,14 @@ try {
     filters += filter_sep;
     filters += date_span;
 
-    old_filters += date_start;
-    old_filters += filter_sep_old;
-    old_filters += date_end;
-    old_filters += filter_sep_old;
-    old_filters += date_span;
-    old_filters += (default_op == Xapian::Query::OP_AND ? 'A' : 'O');
+    if (!old_filters.empty()) {
+	old_filters += date_start;
+	old_filters += filter_sep_old;
+	old_filters += date_end;
+	old_filters += filter_sep_old;
+	old_filters += date_span;
+	old_filters += (default_op == Xapian::Query::OP_AND ? 'A' : 'O');
+    }
 
     // Percentage relevance cut-off
     val = cgi_params.find("THRESHOLD");
@@ -353,8 +396,10 @@ try {
 	    collapse = true;
 	    filters += filter_sep;
 	    filters += str(collapse_key);
-	    old_filters += filter_sep_old;
-	    old_filters += str(collapse_key);
+	    if (!old_filters.empty()) {
+		old_filters += filter_sep_old;
+		old_filters += str(collapse_key);
+	    }
 	}
     }
 
@@ -367,12 +412,12 @@ try {
 	    if (ch == 'D') {
 		docid_order = Xapian::Enquire::DESCENDING;
 		filters += 'D';
-		old_filters += 'D';
+		if (!old_filters.empty()) old_filters += 'D';
 	    } else if (ch != 'A') {
 		docid_order = Xapian::Enquire::DONT_CARE;
 	    } else {
 		filters += 'X';
-		old_filters += 'X';
+		if (!old_filters.empty()) old_filters += 'X';
 	    }
 	}
     }
@@ -391,23 +436,25 @@ try {
 	}
 	// Add the sorting related options to filters too.
 	filters += str(sort_key);
-	old_filters += str(sort_key);
+	if (!old_filters.empty()) old_filters += str(sort_key);
 	if (sort_after) {
 	    if (sort_ascending) {
 		filters += 'F';
-		old_filters += 'F';
+		if (!old_filters.empty()) old_filters += 'F';
 	    } else {
 		filters += 'R';
-		old_filters += 'R';
+		if (!old_filters.empty()) old_filters += 'R';
 	    }
 	} else {
 	    if (!sort_ascending) {
 		filters += 'r';
-		old_filters += 'r';
+		if (!old_filters.empty()) old_filters += 'r';
 	    }
 	}
     }
 
+    if (old_filters.empty()) old_filters = filters;
+
     // min_hits (fill mset past topdoc+(hits_per_page+1) to
     // topdoc+max(hits_per_page+1,min_hits)
     val = cgi_params.find("MINHITS");
diff --git a/xapian-applications/omega/omegatest b/xapian-applications/omega/omegatest
index 3071d25f1..2621665d4 100755
--- a/xapian-applications/omega/omegatest
+++ b/xapian-applications/omega/omegatest
@@ -1,7 +1,7 @@
 #!/bin/sh
 # omegatest: Test omega CGI
 #
-# Copyright (C) 2015 Olly Betts
+# Copyright (C) 2015,2016 Olly Betts
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -163,6 +163,14 @@ testcase '0 * ((Horg OR Hcom) AND (Len AND Lfr))' P='lang:en lang:fr host:org ho
 testcase '((XANDa AND XANDb AND XANDc) AND (Y1998 OR Y2001))' B=Y1998 B=Y2001 B=XANDa B=XANDb B=XANDc
 testcase '0 * (((XANDa AND XANDb) AND XANDc) AND (Y1998 OR Y2001))' P='year:1998 year:2001 and:a and:b and:c'
 
+testcase '(Ztruth@1 AND_NOT Epdf)' P=truth N=Epdf
+testcase '(Ztruth@1 AND_NOT (Ehtm OR Epdf))' P=truth N=Epdf N=Ehtm
+testcase '(Ztruth@1 AND_NOT (Ehtm OR Epdf OR Lde OR Lfr))' P=truth N=Lfr N=Epdf N=Ehtm N=Lde
+testcase '((Ztruth@1 FILTER (Lfr AND Lzh)) AND_NOT (Lde OR Len))' P=truth N=Lde N=Len B=Lfr B=Lzh
+testcase '((Ztruth@1 FILTER Lfr) AND_NOT (Ehtm OR Epdf))' P=truth N=Epdf N=Ehtm B=Lfr
+testcase '(<alldocuments> AND_NOT (Len OR Lfr))' N=Lfr N=Len
+testcase '(VALUE_RANGE 0 2015 201501~ AND_NOT Len)' DATEVALUE=0 START=20150101 END=20150131 N=Len
+
 # If faketime is available, test a range back from now.
 if faketime --version > /dev/null 2>&1; then
     TZ=UTC
diff --git a/xapian-applications/omega/query.cc b/xapian-applications/omega/query.cc
index a13dbf3e3..255d4f63a 100644
--- a/xapian-applications/omega/query.cc
+++ b/xapian-applications/omega/query.cc
@@ -436,13 +436,19 @@ set_probabilistic(const string &oldp)
 }
 
 static multimap<string, string> filter_map;
+static set<string> neg_filters;
 
 typedef multimap<string, string>::const_iterator FMCI;
 
 void add_bterm(const string &term) {
     string prefix;
     if (prefix_from_term(prefix, term) > 0)
-	filter_map.insert(multimap<string, string>::value_type(prefix, term));
+	filter_map.emplace(prefix, term);
+}
+
+void add_nterm(const string &term) {
+    if (!term.empty())
+	neg_filters.emplace(term);
 }
 
 static void
@@ -528,6 +534,20 @@ run_query()
 	}
     }
 
+    if (!neg_filters.empty()) {
+	// OR together all negated filters.
+	Xapian::Query filter(Xapian::Query::OP_OR,
+			     neg_filters.begin(), neg_filters.end());
+
+	if (query.empty()) {
+	    // If we only have a negative filter for the query, use MatchAll as
+	    // the query to apply the filters to.
+	    query = Xapian::Query::MatchAll;
+	    force_boolean = true;
+	}
+	query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
+    }
+
     if (!enquire || !error_msg.empty()) return;
 
     set_weighting_scheme(*enquire, option, force_boolean);
diff --git a/xapian-applications/omega/query.h b/xapian-applications/omega/query.h
index 47c5f32f5..6cd2f4ab1 100644
--- a/xapian-applications/omega/query.h
+++ b/xapian-applications/omega/query.h
@@ -1,7 +1,7 @@
 /** @file query.h
  * @brief: Omega functions for running queries, etc.
  *
- * Copyright (C) 2007,2011 Olly Betts
+ * Copyright (C) 2007,2011,2016 Olly Betts
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -30,6 +30,8 @@ extern Xapian::Query::op default_op;
 
 void add_bterm(const std::string & term);
 
+void add_nterm(const std::string & term);
+
 void set_probabilistic_query(const std::string & prefix,
 			     const std::string & s);
 
-- 
2.11.4.GIT