2 * @brief Collapse documents with the same collapse key during the match.
4 /* Copyright (C) 2009,2011,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_COLLAPSER_H
22 #define XAPIAN_INCLUDED_COLLAPSER_H
24 #include "backends/documentinternal.h"
26 #include "api/postlist.h"
27 #include "api/result.h"
29 #include <unordered_map>
32 /// Enumeration reporting how a result will be handled by the Collapser.
41 /// Class tracking information for a given value of the collapse key.
43 /** Currently kept MSet entries for this value of the collapse key.
45 * If collapse_max > 1, then this is a min-heap once collapse_count > 0.
47 * The first member of the pair is the index into proto_mset.results
48 * and the second is the docid of the entry (used to detect if the
49 * entry in proto_mset.results we were referring to has been dropped).
51 * FIXME: We expect collapse_max to be small, so perhaps we should
52 * preallocate space for that many entries and/or allocate space in
53 * larger blocks to divvy up?
55 std::vector
<std::pair
<Xapian::doccount
, Xapian::docid
>> items
;
57 /// The highest weight of a document we've rejected.
58 double next_best_weight
;
60 /// The number of documents we've rejected.
61 Xapian::doccount collapse_count
;
64 /// Construct with the given item.
65 CollapseData(Xapian::doccount item
, Xapian::docid did
)
66 : items(1, { item
, did
}), next_best_weight(0), collapse_count(0) {
69 /** Check a new result with this collapse key value.
71 * If this method determines the action to take is ADD or REPLACE, then
72 * the proto-mset should be updated and then add_item() called to complete
73 * the update of the CollapseData (if the result doesn't actually get
74 * added, then it's OK not to follow up with a call to add_item()).
76 * @param results The results so far.
77 * @param result The new result.
78 * @param collapse_max Max no. of items for each collapse key value.
79 * @param mcmp Result comparison functor.
80 * @param[out] old_item Item to be replaced (when REPLACE is returned).
82 * @return How to handle @a result: ADD, REJECT or REPLACE.
84 collapse_result
check_item(const std::vector
<Result
>& results
,
86 Xapian::doccount collapse_max
,
88 Xapian::doccount
& old_item
);
90 /** Set item after constructing with a placeholder.
92 * @param item The new item (index into results).
94 void set_item(Xapian::doccount item
);
96 /** Complete update of new result with this collapse key value.
98 * @param results The results so far.
99 * @param item The new item (index into results).
100 * @param collapse_max Max no. of items for each collapse key value.
101 * @param mcmp Result comparison functor.
103 void add_item(const std::vector
<Result
>& results
,
104 Xapian::doccount item
,
105 Xapian::doccount collapse_max
,
108 /// The highest weight of a document we've rejected.
109 double get_next_best_weight() const { return next_best_weight
; }
111 /// The number of documents we've rejected.
112 Xapian::doccount
get_collapse_count() const { return collapse_count
; }
115 /// The Collapser class tracks collapse keys and the documents they match.
117 /// Map from collapse key values to the items we're keeping for them.
118 std::unordered_map
<std::string
, CollapseData
> table
;
120 /// How many items we're currently keeping in @a table.
121 Xapian::doccount entry_count
= 0;
123 /** How many documents have we seen without a collapse key?
125 * We use this statistic to improve matches_lower_bound.
127 Xapian::doccount no_collapse_key
= 0;
129 /** How many documents with duplicate collapse keys we have ignored.
131 * We use this statistic to improve matches_estimated (by considering
132 * the rate of collapsing) and matches_upper_bound.
134 Xapian::doccount dups_ignored
= 0;
136 /** How many documents we've considered for collapsing.
138 * We use this statistic to improve matches_estimated (by considering
139 * the rate of collapsing).
141 Xapian::doccount docs_considered
= 0;
143 /** The value slot we're getting collapse keys from. */
144 Xapian::valueno slot
;
146 /** The maximum number of items to keep for each collapse key value. */
147 Xapian::doccount collapse_max
;
149 std::vector
<Result
>& results
;
153 /** Pointer to CollapseData when NEW or ADD is in progress. */
154 CollapseData
* ptr
= NULL
;
156 /// Adapt @a mcmp to be usable with min_heap.
157 bool operator()(Xapian::doccount a
, Xapian::doccount b
) const {
158 return mcmp(results
[a
], results
[b
]);
162 /// Replaced item when REPLACE is returned by @a collapse().
163 Xapian::doccount old_item
= 0;
165 Collapser(Xapian::valueno slot_
,
166 Xapian::doccount collapse_max_
,
167 std::vector
<Result
>& results_
,
170 collapse_max(collapse_max_
),
174 /// Return true if collapsing is active for this match.
175 operator bool() const { return collapse_max
!= 0; }
177 /** Check a new result.
179 * If this method determines the action to take is NEW or ADD then the
180 * proto-mset should be updated and then process() called to complete the
181 * update (if the result doesn't actually get added, then it's OK not
182 * to follow up with a call to process()).
184 * @param result The new result.
185 * @param vsdoc Document for getting values.
187 * @return How to handle @a result: EMPTY, NEW, ADD, REJECT or REPLACE.
189 collapse_result
check(Result
& result
,
190 Xapian::Document::Internal
& vsdoc
);
192 /** Handle a new Result.
194 * @param action The collapse_result returned by check().
195 * @param item The new item (index into results).
197 void process(collapse_result action
, Xapian::doccount item
);
199 Xapian::doccount
get_collapse_count(const std::string
& collapse_key
,
200 int percent_threshold
,
201 double min_weight
) const;
203 Xapian::doccount
get_docs_considered() const { return docs_considered
; }
205 Xapian::doccount
get_dups_ignored() const { return dups_ignored
; }
207 Xapian::doccount
get_entries() const { return entry_count
; }
209 Xapian::doccount
get_matches_lower_bound() const;
211 void finalise(double min_weight
, int percent_threshold
);
214 /** Simpler version of Collapser used when merging MSet objects.
216 * We merge results in descending rank order, so collapsing is much simpler
217 * than during the match - we just need to discard documents if we've already
218 * seen collapse_max with the same key.
220 class CollapserLite
{
221 /// Map from collapse key values to collapse counts.
222 std::unordered_map
<std::string
, Xapian::doccount
> table
;
224 /// How many items we're currently keeping in @a table.
225 Xapian::doccount entry_count
= 0;
227 /** How many documents have we seen without a collapse key?
229 * We use this statistic to improve matches_lower_bound.
231 Xapian::doccount no_collapse_key
= 0;
233 /** How many documents with duplicate collapse keys we have ignored.
235 * We use this statistic to improve matches_estimated (by considering
236 * the rate of collapsing) and matches_upper_bound.
238 Xapian::doccount dups_ignored
= 0;
240 /** How many documents we've considered for collapsing.
242 * We use this statistic to improve matches_estimated (by considering
243 * the rate of collapsing).
245 Xapian::doccount docs_considered
= 0;
247 /** The maximum number of items to keep for each collapse key value. */
248 Xapian::doccount collapse_max
;
251 CollapserLite(Xapian::doccount collapse_max_
)
252 : collapse_max(collapse_max_
) {}
254 /// Return true if collapsing is active for this match.
255 operator bool() const { return collapse_max
!= 0; }
257 /** Try to add a new key.
259 * @return true if accepted; false if rejected.
261 bool add(const std::string
& key
) {
269 auto r
= table
.emplace(key
, 1);
271 // New entry, set to 1.
272 } else if (r
.first
->second
== collapse_max
) {
273 // Already seen collapse_max with this key so reject.
284 Xapian::doccount
get_docs_considered() const { return docs_considered
; }
286 Xapian::doccount
get_dups_ignored() const { return dups_ignored
; }
288 Xapian::doccount
get_entries() const { return entry_count
; }
290 Xapian::doccount
get_matches_lower_bound() const {
291 return no_collapse_key
+ entry_count
;
294 void finalise(std::vector
<Result
>& results
, int percent_threshold
) {
295 if (table
.empty() || results
.empty())
298 // We need to fill in collapse_count values in results using the
299 // information stored in table.
300 Xapian::doccount todo
= entry_count
;
301 for (Result
& result
: results
) {
302 const string
& key
= result
.get_collapse_key();
306 // Adjust collapse_count.
307 if (percent_threshold
) {
308 // FIXME: We can probably do better here.
309 result
.set_collapse_count(1);
311 auto c
= result
.get_collapse_count() + table
[key
];
312 result
.set_collapse_count(c
);
316 // Terminate early if we've handled all non-empty entries.
323 #endif // XAPIAN_INCLUDED_COLLAPSER_H