2 * @brief Diversification API
4 /* Copyright (C) 2018 Uppinder Chugh
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "xapian/diversify.h"
25 #include "xapian/error.h"
28 #include "diversify/diversifyinternal.h"
35 using namespace Xapian
;
38 Diversify::Diversify(const Diversify
&) = default;
41 Diversify::operator=(const Diversify
&) = default;
43 Diversify::Diversify(Diversify
&&) = default;
46 Diversify::operator=(Diversify
&&) = default;
48 Diversify::Diversify(unsigned int k_
,
52 : internal(new Xapian::Diversify::Internal(k_
, lambda_
, b_
, sigma_sqr_
))
54 LOGCALL_CTOR(API
, "Diversify", k_
| lambda_
| b_
| sigma_sqr_
);
57 Diversify::~Diversify()
59 LOGCALL_DTOR(API
, "Diversify");
63 Diversify::get_description() const
69 Diversify::get_dmset(const MSet
& mset
)
71 LOGCALL(API
, MSet
, "Diversify::get_dmset", mset
);
72 return internal
->get_dmset(mset
);
76 Diversify::Internal::initialise_points(const MSet
& source
)
78 unsigned int count
= 0;
79 TermListGroup
tlg(source
);
80 for (MSetIterator it
= source
.begin(); it
!= source
.end(); ++it
) {
81 points
.emplace(*it
, Xapian::Point(tlg
, it
.get_document()));
82 scores
[*it
] = it
.get_weight();
83 // Initial top-k diversified documents
85 main_dmset
.push_back(*it
);
91 pair
<Xapian::docid
, Xapian::docid
>
92 Diversify::Internal::get_key(Xapian::docid docid_a
, Xapian::docid docid_b
)
94 pair
<Xapian::docid
, Xapian::docid
> key
;
95 if (docid_a
<= docid_b
) {
96 key
= make_pair(docid_a
, docid_b
);
98 key
= make_pair(docid_b
, docid_a
);
105 Diversify::Internal::compute_similarities()
107 Xapian::CosineDistance d
;
108 for (auto p_a
: points
) {
109 Xapian::docid pointid_a
= p_a
.first
;
110 const Xapian::Point
& point_a
= p_a
.second
;
111 for (auto p_b
: points
) {
112 Xapian::docid pointid_b
= p_b
.first
;
113 const Xapian::Point
& point_b
= p_b
.second
;
115 if (pointid_a
> pointid_b
) {
119 double sim
= d
.similarity(point_a
, point_b
);
120 auto key
= get_key(pointid_a
, pointid_b
);
121 pairwise_sim
[key
] = sim
;
126 vector
<Xapian::docid
>
127 Diversify::Internal::compute_diff_dmset(const vector
<Xapian::docid
>& dmset
)
129 vector
<Xapian::docid
> diff_dmset
;
130 for (auto point
: points
) {
131 Xapian::docid point_id
= point
.first
;
132 bool found_point
= false;
133 for (auto doc_id
: dmset
) {
134 if (point_id
== doc_id
) {
141 diff_dmset
.push_back(point_id
);
149 Diversify::Internal::evaluate_dmset(const vector
<Xapian::docid
>& dmset
)
151 double score_1
= 0, score_2
= 0;
153 for (auto doc_id
: dmset
)
154 score_1
+= scores
[doc_id
];
156 vector
<Xapian::docid
> diff_dmset
= compute_diff_dmset(dmset
);
158 for (auto point_id
: diff_dmset
) {
159 double min_dist
= numeric_limits
<double>::max();
160 unsigned int pos
= 1;
161 for (auto doc_id
: dmset
) {
162 auto key
= get_key(point_id
, doc_id
);
163 double sim
= pairwise_sim
[key
];
164 double weight
= 2 * b
* sigma_sqr
/ log(1 + pos
) * (1 - sim
);
165 min_dist
= min(min_dist
, weight
);
171 return -lambda
* score_1
+ (1 - lambda
) * score_2
;
175 Diversify::Internal::get_dmset(const MSet
& mset
)
177 // Return original mset if no need to diversify
178 if (k
== 0 || mset
.size() <= 2) {
180 for (MSetIterator it
= mset
.begin(); it
!= mset
.end(); ++it
)
181 dmset
.add_document(it
.get_document());
188 initialise_points(mset
);
189 compute_similarities();
191 vector
<Xapian::docid
> curr_dmset
= main_dmset
;
194 bool found_better_dmset
= false;
195 for (unsigned int i
= 0; i
< main_dmset
.size(); ++i
) {
196 auto curr_doc
= main_dmset
[i
];
197 double best_score
= evaluate_dmset(curr_dmset
);
199 vector
<Xapian::docid
> diff_dmset
= compute_diff_dmset(curr_dmset
);
201 bool found_better_doc
= false;
202 for (unsigned int j
= 0; j
< diff_dmset
.size(); ++j
) {
203 vector
<Xapian::docid
> temp_dmset
= curr_dmset
;
204 temp_dmset
[i
] = diff_dmset
[j
];
205 double score
= evaluate_dmset(temp_dmset
);
206 if (score
< best_score
) {
207 curr_doc
= temp_dmset
[i
];
209 found_better_doc
= true;
212 if (found_better_doc
) {
213 curr_dmset
[i
] = curr_doc
;
214 found_better_dmset
= true;
218 // Terminate algorithm when there's no change in current
220 if (!found_better_dmset
)
223 main_dmset
= curr_dmset
;
226 // Merge main_dmset and diff_dmset into final dmset
228 for (auto doc_id
: main_dmset
)
229 dmset
.add_document(points
.at(doc_id
).get_document());
231 vector
<Xapian::docid
> diff_dmset
= compute_diff_dmset(main_dmset
);
232 for (auto doc_id
: diff_dmset
)
233 dmset
.add_document(points
.at(doc_id
).get_document());