New OmegaScript $unique command
[xapian.git] / xapian-core / diversify / diversify.cc
blob614fb749da4226a13cd968af704bc21ac14f5d6a
1 /** @file diversify.cc
2 * @brief Diversification API
3 */
4 /* Copyright (C) 2018 Uppinder Chugh
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "xapian/diversify.h"
25 #include "xapian/error.h"
27 #include "debuglog.h"
28 #include "diversify/diversifyinternal.h"
30 #include <algorithm>
31 #include <cmath>
32 #include <limits>
33 #include <vector>
35 using namespace Xapian;
36 using namespace std;
38 Diversify::Diversify(const Diversify&) = default;
40 Diversify&
41 Diversify::operator=(const Diversify&) = default;
43 Diversify::Diversify(Diversify&&) = default;
45 Diversify&
46 Diversify::operator=(Diversify&&) = default;
48 Diversify::Diversify(Xapian::doccount k_,
49 Xapian::doccount r_,
50 double lambda_,
51 double b_,
52 double sigma_sqr_)
53 : internal(new Xapian::Diversify::Internal(k_, r_, lambda_, b_, sigma_sqr_))
55 LOGCALL_CTOR(API, "Diversify", k_ | r_ | lambda_ | b_ | sigma_sqr_);
56 if (r_ == 0)
57 throw InvalidArgumentError("Value of r should be greater than zero.");
60 Diversify::~Diversify()
62 LOGCALL_DTOR(API, "Diversify");
65 string
66 Diversify::get_description() const
68 return "Diversify()";
71 DocumentSet
72 Diversify::get_dmset(const MSet& mset)
74 LOGCALL(API, MSet, "Diversify::get_dmset", mset);
75 return internal->get_dmset(mset);
78 void
79 Diversify::Internal::initialise_points(const MSet& source)
81 unsigned int count = 0;
82 TermListGroup tlg(source);
83 for (MSetIterator it = source.begin(); it != source.end(); ++it) {
84 points.emplace(*it, Xapian::Point(tlg, it.get_document()));
85 scores[*it] = it.get_weight();
86 // Initial top-k diversified documents
87 if (count < k) {
88 main_dmset.push_back(*it);
89 ++count;
94 pair<Xapian::docid, unsigned int>
95 Diversify::Internal::get_key(Xapian::docid doc_id, unsigned int centroid_id)
97 return make_pair(doc_id, centroid_id);
100 void
101 Diversify::Internal::compute_similarities(const Xapian::ClusterSet& cset)
103 Xapian::CosineDistance d;
104 for (auto p : points) {
105 Xapian::docid point_id = p.first;
106 Xapian::Point point = p.second;
107 for (unsigned int c = 0; c < cset.size(); ++c) {
108 double dist = d.similarity(point, cset[c].get_centroid());
109 auto key = get_key(point_id, c);
110 pairwise_sim[key] = dist;
115 vector<Xapian::docid>
116 Diversify::Internal::compute_diff_dmset(const vector<Xapian::docid>& dmset)
118 vector<Xapian::docid> diff_dmset;
119 for (auto point : points) {
120 Xapian::docid point_id = point.first;
121 bool found_point = false;
122 for (auto doc_id : dmset) {
123 if (point_id == doc_id) {
124 found_point = true;
125 break;
129 if (!found_point) {
130 diff_dmset.push_back(point_id);
134 return diff_dmset;
137 double
138 Diversify::Internal::evaluate_dmset(const vector<Xapian::docid>& dmset,
139 const Xapian::ClusterSet& cset)
141 double score_1 = 0, score_2 = 0;
143 for (auto doc_id : dmset)
144 score_1 += scores[doc_id];
146 for (unsigned int c = 0; c < cset.size(); ++c) {
147 double min_dist = numeric_limits<double>::max();
148 unsigned int pos = 1;
149 for (auto doc_id : dmset) {
150 auto key = get_key(doc_id, c);
151 double sim = pairwise_sim[key];
152 double weight = 2 * b * sigma_sqr / log(1 + pos) * (1 - sim);
153 min_dist = min(min_dist, weight);
154 ++pos;
156 score_2 += min_dist;
159 return -lambda * score_1 + (1 - lambda) * score_2;
162 DocumentSet
163 Diversify::Internal::get_dmset(const MSet& mset)
165 // Return original mset if no need to diversify
166 if (k == 0 || mset.size() <= 2) {
167 DocumentSet dmset;
168 for (MSetIterator it = mset.begin(); it != mset.end(); ++it)
169 dmset.add_document(it.get_document());
170 return dmset;
173 if (k > mset.size())
174 k = mset.size();
176 initialise_points(mset);
178 // Cluster the given mset into k clusters
179 KMeans km(k);
180 Xapian::ClusterSet cset = km.cluster(mset);
181 compute_similarities(cset);
183 // topC contains union of top-r relevant documents of each cluster
184 vector<Xapian::docid> topc;
186 // Build topC
187 for (unsigned int c = 0; c < cset.size(); ++c) {
188 auto documents = cset[c].get_documents();
189 for (unsigned int d = 0; d < r && d < documents.size(); ++d) {
190 auto doc_id = documents[d].get_docid();
191 topc.push_back(doc_id);
195 vector<Xapian::docid> curr_dmset = main_dmset;
197 while (true) {
198 bool found_better_dmset = false;
199 for (unsigned int i = 0; i < main_dmset.size(); ++i) {
200 auto curr_doc = main_dmset[i];
201 double best_score = evaluate_dmset(curr_dmset, cset);
202 bool found_better_doc = false;
204 for (unsigned int j = 0; j < topc.size(); ++j) {
205 // Continue if candidate document from topC already
206 // exists in curr_dmset
207 auto candidate_doc = find(curr_dmset.begin(), curr_dmset.end(),
208 topc[j]);
209 if (candidate_doc != curr_dmset.end()) {
210 continue;
213 auto temp_doc = curr_dmset[i];
214 curr_dmset[i] = topc[j];
215 double score = evaluate_dmset(curr_dmset, cset);
217 if (score < best_score) {
218 curr_doc = curr_dmset[i];
219 best_score = score;
220 found_better_doc = true;
223 curr_dmset[i] = temp_doc;
225 if (found_better_doc) {
226 curr_dmset[i] = curr_doc;
227 found_better_dmset = true;
231 // Terminate algorithm when there's no change in current
232 // document matchset
233 if (!found_better_dmset)
234 break;
236 main_dmset = curr_dmset;
239 // Merge main_dmset and diff_dmset into final dmset
240 DocumentSet dmset;
241 for (auto doc_id : main_dmset)
242 dmset.add_document(points.at(doc_id).get_document());
244 vector<Xapian::docid> diff_dmset = compute_diff_dmset(main_dmset);
245 for (auto doc_id : diff_dmset)
246 dmset.add_document(points.at(doc_id).get_document());
248 return dmset;