1 /** @file maxpostlist.cc
2 * @brief N-way OR postlist with wt=max(wt_i)
4 /* Copyright (C) 2007,2009,2010,2011,2012,2013,2014,2017 Olly Betts
5 * Copyright (C) 2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "maxpostlist.h"
31 MaxPostList::~MaxPostList()
34 for (size_t i
= 0; i
< n_kids
; ++i
) {
42 MaxPostList::get_termfreq_min() const
44 Xapian::doccount res
= plist
[0]->get_termfreq_min();
45 for (size_t i
= 1; i
< n_kids
; ++i
) {
46 res
= max(res
, plist
[i
]->get_termfreq_min());
52 MaxPostList::get_termfreq_max() const
54 Xapian::doccount res
= plist
[0]->get_termfreq_max();
55 for (size_t i
= 1; i
< n_kids
; ++i
) {
56 Xapian::doccount c
= plist
[i
]->get_termfreq_max();
57 if (db_size
- res
<= c
)
65 MaxPostList::get_termfreq_est() const
67 if (rare(db_size
== 0))
70 // We calculate the estimate assuming independence. The simplest
71 // way to calculate this seems to be a series of (n_kids - 1) pairwise
72 // calculations, which gives the same answer regardless of the order.
73 double scale
= 1.0 / db_size
;
74 double P_est
= plist
[0]->get_termfreq_est() * scale
;
75 for (size_t i
= 1; i
< n_kids
; ++i
) {
76 double P_i
= plist
[i
]->get_termfreq_est() * scale
;
77 P_est
+= P_i
- P_est
* P_i
;
79 return static_cast<Xapian::doccount
>(P_est
* db_size
+ 0.5);
83 MaxPostList::get_termfreq_est_using_stats(
84 const Xapian::Weight::Internal
& stats
) const
86 // We calculate the estimate assuming independence. The simplest
87 // way to calculate this seems to be a series of (n_kids - 1) pairwise
88 // calculations, which gives the same answer regardless of the order.
89 TermFreqs
freqs(plist
[0]->get_termfreq_est_using_stats(stats
));
91 // Our caller should have ensured this.
92 Assert(stats
.collection_size
);
93 double scale
= 1.0 / stats
.collection_size
;
94 double P_est
= freqs
.termfreq
* scale
;
95 double Pr_est
= freqs
.reltermfreq
* scale
;
96 double Pc_est
= freqs
.collfreq
* scale
;
98 for (size_t i
= 1; i
< n_kids
; ++i
) {
99 double P_i
= freqs
.termfreq
* scale
;
100 P_est
+= P_i
- P_est
* P_i
;
101 double Pc_i
= freqs
.collfreq
* scale
;
102 Pc_est
+= Pc_i
- Pc_est
* Pc_i
;
103 // If the rset is empty, Pr_est should be 0 already, so leave
105 if (stats
.rset_size
!= 0) {
106 double Pr_i
= freqs
.reltermfreq
/ stats
.rset_size
;
107 Pr_est
+= Pr_i
- Pr_est
* Pr_i
;
110 return TermFreqs(Xapian::doccount(P_est
* stats
.collection_size
+ 0.5),
111 Xapian::doccount(Pr_est
* stats
.rset_size
+ 0.5),
112 Xapian::termcount(Pc_est
* stats
.total_term_count
));
116 MaxPostList::get_docid() const
122 MaxPostList::get_weight(Xapian::termcount doclen
,
123 Xapian::termcount unique_terms
) const
127 for (size_t i
= 0; i
< n_kids
; ++i
) {
128 if (plist
[i
]->get_docid() == did
)
129 res
= max(res
, plist
[i
]->get_weight(doclen
, unique_terms
));
135 MaxPostList::at_end() const
141 MaxPostList::recalc_maxweight()
143 double result
= plist
[0]->recalc_maxweight();
144 for (size_t i
= 1; i
< n_kids
; ++i
) {
145 result
= max(result
, plist
[i
]->recalc_maxweight());
151 MaxPostList::next(double w_min
)
153 Xapian::docid old_did
= did
;
155 for (size_t i
= 0; i
< n_kids
; ++i
) {
156 Xapian::docid cur_did
= 0;
158 cur_did
= plist
[i
]->get_docid();
159 if (cur_did
<= old_did
) {
161 if (old_did
== 0 || cur_did
== old_did
) {
162 res
= plist
[i
]->next(w_min
);
164 res
= plist
[i
]->skip_to(old_did
+ 1, w_min
);
171 if (plist
[i
]->at_end()) {
177 matcher
->force_recalc();
179 cur_did
= plist
[i
]->get_docid();
182 if (did
== 0 || cur_did
< did
) {
196 MaxPostList::skip_to(Xapian::docid did_min
, double w_min
)
198 Xapian::docid old_did
= did
;
200 for (size_t i
= 0; i
< n_kids
; ++i
) {
201 Xapian::docid cur_did
= 0;
203 cur_did
= plist
[i
]->get_docid();
204 if (cur_did
< did_min
) {
205 PostList
* res
= plist
[i
]->skip_to(did_min
, w_min
);
211 if (plist
[i
]->at_end()) {
217 matcher
->force_recalc();
219 cur_did
= plist
[i
]->get_docid();
222 if (did
== 0 || cur_did
< did
) {
236 MaxPostList::get_description() const
239 desc
+= plist
[0]->get_description();
240 for (size_t i
= 1; i
< n_kids
; ++i
) {
242 desc
+= plist
[i
]->get_description();
249 MaxPostList::get_wdf() const
251 Xapian::termcount totwdf
= 0;
252 for (size_t i
= 0; i
< n_kids
; ++i
) {
253 if (plist
[i
]->get_docid() == did
)
254 totwdf
+= plist
[i
]->get_wdf();
260 MaxPostList::count_matching_subqs() const