Reimplement the matcher
[xapian.git] / xapian-core / matcher / collapser.h
blobe818acc55d6bdd023bef3b65f406d906996e47eb
1 /** @file collapser.h
2 * @brief Collapse documents with the same collapse key during the match.
3 */
4 /* Copyright (C) 2009,2011,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_COLLAPSER_H
22 #define XAPIAN_INCLUDED_COLLAPSER_H
24 #include "backends/documentinternal.h"
25 #include "msetcmp.h"
26 #include "api/postlist.h"
27 #include "api/result.h"
29 #include <unordered_map>
30 #include <vector>
32 /// Enumeration reporting how a result will be handled by the Collapser.
33 typedef enum {
34 EMPTY,
35 NEW,
36 ADD,
37 REJECT,
38 REPLACE
39 } collapse_result;
41 /// Class tracking information for a given value of the collapse key.
42 class CollapseData {
43 /** Currently kept MSet entries for this value of the collapse key.
45 * If collapse_max > 1, then this is a min-heap once collapse_count > 0.
47 * The first member of the pair is the index into proto_mset.results
48 * and the second is the docid of the entry (used to detect if the
49 * entry in proto_mset.results we were referring to has been dropped).
51 * FIXME: We expect collapse_max to be small, so perhaps we should
52 * preallocate space for that many entries and/or allocate space in
53 * larger blocks to divvy up?
55 std::vector<std::pair<Xapian::doccount, Xapian::docid>> items;
57 /// The highest weight of a document we've rejected.
58 double next_best_weight;
60 /// The number of documents we've rejected.
61 Xapian::doccount collapse_count;
63 public:
64 /// Construct with the given item.
65 CollapseData(Xapian::doccount item, Xapian::docid did)
66 : items(1, { item, did }), next_best_weight(0), collapse_count(0) {
69 /** Check a new result with this collapse key value.
71 * If this method determines the action to take is ADD or REPLACE, then
72 * the proto-mset should be updated and then add_item() called to complete
73 * the update of the CollapseData (if the result doesn't actually get
74 * added, then it's OK not to follow up with a call to add_item()).
76 * @param results The results so far.
77 * @param result The new result.
78 * @param collapse_max Max no. of items for each collapse key value.
79 * @param mcmp Result comparison functor.
80 * @param[out] old_item Item to be replaced (when REPLACE is returned).
82 * @return How to handle @a result: ADD, REJECT or REPLACE.
84 collapse_result check_item(const std::vector<Result>& results,
85 const Result& result,
86 Xapian::doccount collapse_max,
87 MSetCmp mcmp,
88 Xapian::doccount& old_item);
90 /** Set item after constructing with a placeholder.
92 * @param item The new item (index into results).
94 void set_item(Xapian::doccount item);
96 /** Complete update of new result with this collapse key value.
98 * @param results The results so far.
99 * @param item The new item (index into results).
100 * @param collapse_max Max no. of items for each collapse key value.
101 * @param mcmp Result comparison functor.
103 void add_item(const std::vector<Result>& results,
104 Xapian::doccount item,
105 Xapian::doccount collapse_max,
106 MSetCmp mcmp);
108 /// The highest weight of a document we've rejected.
109 double get_next_best_weight() const { return next_best_weight; }
111 /// The number of documents we've rejected.
112 Xapian::doccount get_collapse_count() const { return collapse_count; }
115 /// The Collapser class tracks collapse keys and the documents they match.
116 class Collapser {
117 /// Map from collapse key values to the items we're keeping for them.
118 std::unordered_map<std::string, CollapseData> table;
120 /// How many items we're currently keeping in @a table.
121 Xapian::doccount entry_count = 0;
123 /** How many documents have we seen without a collapse key?
125 * We use this statistic to improve matches_lower_bound.
127 Xapian::doccount no_collapse_key = 0;
129 /** How many documents with duplicate collapse keys we have ignored.
131 * We use this statistic to improve matches_estimated (by considering
132 * the rate of collapsing) and matches_upper_bound.
134 Xapian::doccount dups_ignored = 0;
136 /** How many documents we've considered for collapsing.
138 * We use this statistic to improve matches_estimated (by considering
139 * the rate of collapsing).
141 Xapian::doccount docs_considered = 0;
143 /** The value slot we're getting collapse keys from. */
144 Xapian::valueno slot;
146 /** The maximum number of items to keep for each collapse key value. */
147 Xapian::doccount collapse_max;
149 std::vector<Result>& results;
151 MSetCmp mcmp;
153 /** Pointer to CollapseData when NEW or ADD is in progress. */
154 CollapseData* ptr = NULL;
156 /// Adapt @a mcmp to be usable with min_heap.
157 bool operator()(Xapian::doccount a, Xapian::doccount b) const {
158 return mcmp(results[a], results[b]);
161 public:
162 /// Replaced item when REPLACE is returned by @a collapse().
163 Xapian::doccount old_item = 0;
165 Collapser(Xapian::valueno slot_,
166 Xapian::doccount collapse_max_,
167 std::vector<Result>& results_,
168 MSetCmp mcmp_)
169 : slot(slot_),
170 collapse_max(collapse_max_),
171 results(results_),
172 mcmp(mcmp_) { }
174 /// Return true if collapsing is active for this match.
175 operator bool() const { return collapse_max != 0; }
177 /** Check a new result.
179 * If this method determines the action to take is NEW or ADD then the
180 * proto-mset should be updated and then process() called to complete the
181 * update (if the result doesn't actually get added, then it's OK not
182 * to follow up with a call to process()).
184 * @param result The new result.
185 * @param vsdoc Document for getting values.
187 * @return How to handle @a result: EMPTY, NEW, ADD, REJECT or REPLACE.
189 collapse_result check(Result& result,
190 Xapian::Document::Internal & vsdoc);
192 /** Handle a new Result.
194 * @param action The collapse_result returned by check().
195 * @param item The new item (index into results).
197 void process(collapse_result action, Xapian::doccount item);
199 Xapian::doccount get_collapse_count(const std::string & collapse_key,
200 int percent_threshold,
201 double min_weight) const;
203 Xapian::doccount get_docs_considered() const { return docs_considered; }
205 Xapian::doccount get_dups_ignored() const { return dups_ignored; }
207 Xapian::doccount get_entries() const { return entry_count; }
209 Xapian::doccount get_matches_lower_bound() const;
211 void finalise(double min_weight, int percent_threshold);
214 /** Simpler version of Collapser used when merging MSet objects.
216 * We merge results in descending rank order, so collapsing is much simpler
217 * than during the match - we just need to discard documents if we've already
218 * seen collapse_max with the same key.
220 class CollapserLite {
221 /// Map from collapse key values to collapse counts.
222 std::unordered_map<std::string, Xapian::doccount> table;
224 /// How many items we're currently keeping in @a table.
225 Xapian::doccount entry_count = 0;
227 /** How many documents have we seen without a collapse key?
229 * We use this statistic to improve matches_lower_bound.
231 Xapian::doccount no_collapse_key = 0;
233 /** How many documents with duplicate collapse keys we have ignored.
235 * We use this statistic to improve matches_estimated (by considering
236 * the rate of collapsing) and matches_upper_bound.
238 Xapian::doccount dups_ignored = 0;
240 /** How many documents we've considered for collapsing.
242 * We use this statistic to improve matches_estimated (by considering
243 * the rate of collapsing).
245 Xapian::doccount docs_considered = 0;
247 /** The maximum number of items to keep for each collapse key value. */
248 Xapian::doccount collapse_max;
250 public:
251 CollapserLite(Xapian::doccount collapse_max_)
252 : collapse_max(collapse_max_) {}
254 /// Return true if collapsing is active for this match.
255 operator bool() const { return collapse_max != 0; }
257 /** Try to add a new key.
259 * @return true if accepted; false if rejected.
261 bool add(const std::string& key) {
262 ++docs_considered;
264 if (key.empty()) {
265 ++no_collapse_key;
266 return true;
269 auto r = table.emplace(key, 1);
270 if (r.second) {
271 // New entry, set to 1.
272 } else if (r.first->second == collapse_max) {
273 // Already seen collapse_max with this key so reject.
274 ++dups_ignored;
275 return false;
276 } else {
277 // Increment count.
278 ++r.first->second;
280 ++entry_count;
281 return true;
284 Xapian::doccount get_docs_considered() const { return docs_considered; }
286 Xapian::doccount get_dups_ignored() const { return dups_ignored; }
288 Xapian::doccount get_entries() const { return entry_count; }
290 Xapian::doccount get_matches_lower_bound() const {
291 return no_collapse_key + entry_count;
294 void finalise(std::vector<Result>& results, int percent_threshold) {
295 if (table.empty() || results.empty())
296 return;
298 // We need to fill in collapse_count values in results using the
299 // information stored in table.
300 Xapian::doccount todo = entry_count;
301 for (Result& result : results) {
302 const string& key = result.get_collapse_key();
303 if (key.empty())
304 continue;
306 // Adjust collapse_count.
307 if (percent_threshold) {
308 // FIXME: We can probably do better here.
309 result.set_collapse_count(1);
310 } else {
311 auto c = result.get_collapse_count() + table[key];
312 result.set_collapse_count(c);
315 if (--todo == 0) {
316 // Terminate early if we've handled all non-empty entries.
317 break;
323 #endif // XAPIAN_INCLUDED_COLLAPSER_H