2 * @brief Diversification API
4 /* Copyright (C) 2018 Uppinder Chugh
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "xapian/diversify.h"
25 #include "xapian/error.h"
28 #include "diversify/diversifyinternal.h"
35 using namespace Xapian
;
38 Diversify::Diversify(const Diversify
&) = default;
41 Diversify::operator=(const Diversify
&) = default;
43 Diversify::Diversify(Diversify
&&) = default;
46 Diversify::operator=(Diversify
&&) = default;
48 Diversify::Diversify(Xapian::doccount k_
,
53 : internal(new Xapian::Diversify::Internal(k_
, r_
, lambda_
, b_
, sigma_sqr_
))
55 LOGCALL_CTOR(API
, "Diversify", k_
| r_
| lambda_
| b_
| sigma_sqr_
);
57 throw InvalidArgumentError("Value of r should be greater than zero.");
60 Diversify::~Diversify()
62 LOGCALL_DTOR(API
, "Diversify");
66 Diversify::get_description() const
72 Diversify::get_dmset(const MSet
& mset
)
74 LOGCALL(API
, MSet
, "Diversify::get_dmset", mset
);
75 return internal
->get_dmset(mset
);
79 Diversify::Internal::initialise_points(const MSet
& source
)
81 unsigned int count
= 0;
82 TermListGroup
tlg(source
);
83 for (MSetIterator it
= source
.begin(); it
!= source
.end(); ++it
) {
84 points
.emplace(*it
, Xapian::Point(tlg
, it
.get_document()));
85 scores
[*it
] = it
.get_weight();
86 // Initial top-k diversified documents
88 main_dmset
.push_back(*it
);
94 pair
<Xapian::docid
, unsigned int>
95 Diversify::Internal::get_key(Xapian::docid doc_id
, unsigned int centroid_id
)
97 return make_pair(doc_id
, centroid_id
);
101 Diversify::Internal::compute_similarities(const Xapian::ClusterSet
& cset
)
103 Xapian::CosineDistance d
;
104 for (auto p
: points
) {
105 Xapian::docid point_id
= p
.first
;
106 Xapian::Point point
= p
.second
;
107 for (unsigned int c
= 0; c
< cset
.size(); ++c
) {
108 double dist
= d
.similarity(point
, cset
[c
].get_centroid());
109 auto key
= get_key(point_id
, c
);
110 pairwise_sim
[key
] = dist
;
115 vector
<Xapian::docid
>
116 Diversify::Internal::compute_diff_dmset(const vector
<Xapian::docid
>& dmset
)
118 vector
<Xapian::docid
> diff_dmset
;
119 for (auto point
: points
) {
120 Xapian::docid point_id
= point
.first
;
121 bool found_point
= false;
122 for (auto doc_id
: dmset
) {
123 if (point_id
== doc_id
) {
130 diff_dmset
.push_back(point_id
);
138 Diversify::Internal::evaluate_dmset(const vector
<Xapian::docid
>& dmset
,
139 const Xapian::ClusterSet
& cset
)
141 double score_1
= 0, score_2
= 0;
143 for (auto doc_id
: dmset
)
144 score_1
+= scores
[doc_id
];
146 for (unsigned int c
= 0; c
< cset
.size(); ++c
) {
147 double min_dist
= numeric_limits
<double>::max();
148 unsigned int pos
= 1;
149 for (auto doc_id
: dmset
) {
150 auto key
= get_key(doc_id
, c
);
151 double sim
= pairwise_sim
[key
];
152 double weight
= 2 * b
* sigma_sqr
/ log(1 + pos
) * (1 - sim
);
153 min_dist
= min(min_dist
, weight
);
159 return -lambda
* score_1
+ (1 - lambda
) * score_2
;
163 Diversify::Internal::get_dmset(const MSet
& mset
)
165 // Return original mset if no need to diversify
166 if (k
== 0 || mset
.size() <= 2) {
168 for (MSetIterator it
= mset
.begin(); it
!= mset
.end(); ++it
)
169 dmset
.add_document(it
.get_document());
176 initialise_points(mset
);
178 // Cluster the given mset into k clusters
180 Xapian::ClusterSet cset
= km
.cluster(mset
);
181 compute_similarities(cset
);
183 // topC contains union of top-r relevant documents of each cluster
184 vector
<Xapian::docid
> topc
;
187 for (unsigned int c
= 0; c
< cset
.size(); ++c
) {
188 auto documents
= cset
[c
].get_documents();
189 for (unsigned int d
= 0; d
< r
&& d
< documents
.size(); ++d
) {
190 auto doc_id
= documents
[d
].get_docid();
191 topc
.push_back(doc_id
);
195 vector
<Xapian::docid
> curr_dmset
= main_dmset
;
198 bool found_better_dmset
= false;
199 for (unsigned int i
= 0; i
< main_dmset
.size(); ++i
) {
200 auto curr_doc
= main_dmset
[i
];
201 double best_score
= evaluate_dmset(curr_dmset
, cset
);
202 bool found_better_doc
= false;
204 for (unsigned int j
= 0; j
< topc
.size(); ++j
) {
205 // Continue if candidate document from topC already
206 // exists in curr_dmset
207 auto candidate_doc
= find(curr_dmset
.begin(), curr_dmset
.end(),
209 if (candidate_doc
!= curr_dmset
.end()) {
213 auto temp_doc
= curr_dmset
[i
];
214 curr_dmset
[i
] = topc
[j
];
215 double score
= evaluate_dmset(curr_dmset
, cset
);
217 if (score
< best_score
) {
218 curr_doc
= curr_dmset
[i
];
220 found_better_doc
= true;
223 curr_dmset
[i
] = temp_doc
;
225 if (found_better_doc
) {
226 curr_dmset
[i
] = curr_doc
;
227 found_better_dmset
= true;
231 // Terminate algorithm when there's no change in current
233 if (!found_better_dmset
)
236 main_dmset
= curr_dmset
;
239 // Merge main_dmset and diff_dmset into final dmset
241 for (auto doc_id
: main_dmset
)
242 dmset
.add_document(points
.at(doc_id
).get_document());
244 vector
<Xapian::docid
> diff_dmset
= compute_diff_dmset(main_dmset
);
245 for (auto doc_id
: diff_dmset
)
246 dmset
.add_document(points
.at(doc_id
).get_document());