Implement PIMPL for Xapian::Diversify
[xapian.git] / xapian-core / diversify / diversify.cc
blobae1fc52e3c97693ff9922562d04b0aed143eb004
1 /** @file diversify.cc
2 * @brief Diversification API
3 */
4 /* Copyright (C) 2018 Uppinder Chugh
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "xapian/diversify.h"
25 #include "xapian/error.h"
27 #include "debuglog.h"
28 #include "diversify/diversifyinternal.h"
30 #include <algorithm>
31 #include <cmath>
32 #include <limits>
33 #include <vector>
35 using namespace Xapian;
36 using namespace std;
38 Diversify::Diversify(const Diversify&) = default;
40 Diversify&
41 Diversify::operator=(const Diversify&) = default;
43 Diversify::Diversify(Diversify&&) = default;
45 Diversify&
46 Diversify::operator=(Diversify&&) = default;
48 Diversify::Diversify(unsigned int k_,
49 double lambda_,
50 double b_,
51 double sigma_sqr_)
52 : internal(new Xapian::Diversify::Internal(k_, lambda_, b_, sigma_sqr_))
54 LOGCALL_CTOR(API, "Diversify", k_ | lambda_ | b_ | sigma_sqr_);
57 Diversify::~Diversify()
59 LOGCALL_DTOR(API, "Diversify");
62 string
63 Diversify::get_description() const
65 return "Diversify()";
68 DocumentSet
69 Diversify::get_dmset(const MSet& mset)
71 LOGCALL(API, MSet, "Diversify::get_dmset", mset);
72 return internal->get_dmset(mset);
75 void
76 Diversify::Internal::initialise_points(const MSet& source)
78 unsigned int count = 0;
79 TermListGroup tlg(source);
80 for (MSetIterator it = source.begin(); it != source.end(); ++it) {
81 points.emplace(*it, Xapian::Point(tlg, it.get_document()));
82 scores[*it] = it.get_weight();
83 // Initial top-k diversified documents
84 if (count < k) {
85 main_dmset.push_back(*it);
86 ++count;
91 pair<Xapian::docid, Xapian::docid>
92 Diversify::Internal::get_key(Xapian::docid docid_a, Xapian::docid docid_b)
94 pair<Xapian::docid, Xapian::docid> key;
95 if (docid_a <= docid_b) {
96 key = make_pair(docid_a, docid_b);
97 } else {
98 key = make_pair(docid_b, docid_a);
101 return key;
104 void
105 Diversify::Internal::compute_similarities()
107 Xapian::CosineDistance d;
108 for (auto p_a : points) {
109 Xapian::docid pointid_a = p_a.first;
110 const Xapian::Point& point_a = p_a.second;
111 for (auto p_b : points) {
112 Xapian::docid pointid_b = p_b.first;
113 const Xapian::Point& point_b = p_b.second;
115 if (pointid_a > pointid_b) {
116 continue;
119 double sim = d.similarity(point_a, point_b);
120 auto key = get_key(pointid_a, pointid_b);
121 pairwise_sim[key] = sim;
126 vector<Xapian::docid>
127 Diversify::Internal::compute_diff_dmset(const vector<Xapian::docid>& dmset)
129 vector<Xapian::docid> diff_dmset;
130 for (auto point : points) {
131 Xapian::docid point_id = point.first;
132 bool found_point = false;
133 for (auto doc_id : dmset) {
134 if (point_id == doc_id) {
135 found_point = true;
136 break;
140 if (!found_point) {
141 diff_dmset.push_back(point_id);
145 return diff_dmset;
148 double
149 Diversify::Internal::evaluate_dmset(const vector<Xapian::docid>& dmset)
151 double score_1 = 0, score_2 = 0;
153 for (auto doc_id : dmset)
154 score_1 += scores[doc_id];
156 vector<Xapian::docid> diff_dmset = compute_diff_dmset(dmset);
158 for (auto point_id : diff_dmset) {
159 double min_dist = numeric_limits<double>::max();
160 unsigned int pos = 1;
161 for (auto doc_id : dmset) {
162 auto key = get_key(point_id, doc_id);
163 double sim = pairwise_sim[key];
164 double weight = 2 * b * sigma_sqr / log(1 + pos) * (1 - sim);
165 min_dist = min(min_dist, weight);
166 ++pos;
168 score_2 += min_dist;
171 return -lambda * score_1 + (1 - lambda) * score_2;
174 DocumentSet
175 Diversify::Internal::get_dmset(const MSet& mset)
177 // Return original mset if no need to diversify
178 if (k == 0 || mset.size() <= 2) {
179 DocumentSet dmset;
180 for (MSetIterator it = mset.begin(); it != mset.end(); ++it)
181 dmset.add_document(it.get_document());
182 return dmset;
185 if (k > mset.size())
186 k = mset.size();
188 initialise_points(mset);
189 compute_similarities();
191 vector<Xapian::docid> curr_dmset = main_dmset;
193 while (true) {
194 bool found_better_dmset = false;
195 for (unsigned int i = 0; i < main_dmset.size(); ++i) {
196 auto curr_doc = main_dmset[i];
197 double best_score = evaluate_dmset(curr_dmset);
199 vector<Xapian::docid> diff_dmset = compute_diff_dmset(curr_dmset);
201 bool found_better_doc = false;
202 for (unsigned int j = 0; j < diff_dmset.size(); ++j) {
203 vector<Xapian::docid> temp_dmset = curr_dmset;
204 temp_dmset[i] = diff_dmset[j];
205 double score = evaluate_dmset(temp_dmset);
206 if (score < best_score) {
207 curr_doc = temp_dmset[i];
208 best_score = score;
209 found_better_doc = true;
212 if (found_better_doc) {
213 curr_dmset[i] = curr_doc;
214 found_better_dmset = true;
218 // Terminate algorithm when there's no change in current
219 // document matchset
220 if (!found_better_dmset)
221 break;
223 main_dmset = curr_dmset;
226 // Merge main_dmset and diff_dmset into final dmset
227 DocumentSet dmset;
228 for (auto doc_id : main_dmset)
229 dmset.add_document(points.at(doc_id).get_document());
231 vector<Xapian::docid> diff_dmset = compute_diff_dmset(main_dmset);
232 for (auto doc_id : diff_dmset)
233 dmset.add_document(points.at(doc_id).get_document());
235 return dmset;