xapian-core/diversify/diversify.cc

   1 /** @file diversify.cc
   2  *  @brief Diversification API
   3  */
   4 /* Copyright (C) 2018 Uppinder Chugh
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation; either version 2 of the
   9  * License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  19  * USA
  20  */
  21
  22 #include <config.h>
  23
  24 #include "xapian/diversify.h"
  25 #include "xapian/error.h"
  26
  27 #include "debuglog.h"
  28 #include "diversify/diversifyinternal.h"
  29
  30 #include <algorithm>
  31 #include <cmath>
  32 #include <limits>
  33 #include <vector>
  34
  35 using namespace Xapian;
  36 using namespace std;
  37
  38 Diversify::Diversify(const Diversify&) = default;
  39
  40 Diversify&
  41 Diversify::operator=(const Diversify&) = default;
  42
  43 Diversify::Diversify(Diversify&&) = default;
  44
  45 Diversify&
  46 Diversify::operator=(Diversify&&) = default;
  47
  48 Diversify::Diversify(Xapian::doccount k_,
  49                      Xapian::doccount r_,
  50                      double lambda_,
  51                      double b_,
  52                      double sigma_sqr_)
  53     : internal(new Xapian::Diversify::Internal(k_, r_, lambda_, b_, sigma_sqr_))
  54 {
  55     LOGCALL_CTOR(API, "Diversify", k_ | r_ | lambda_ | b_ | sigma_sqr_);
  56     if (r_ == 0)
  57         throw InvalidArgumentError("Value of r should be greater than zero.");
  58 }
  59
  60 Diversify::~Diversify()
  61 {
  62     LOGCALL_DTOR(API, "Diversify");
  63 }
  64
  65 string
  66 Diversify::get_description() const
  67 {
  68     return "Diversify()";
  69 }
  70
  71 DocumentSet
  72 Diversify::get_dmset(const MSet& mset)
  73 {
  74     LOGCALL(API, MSet, "Diversify::get_dmset", mset);
  75     return internal->get_dmset(mset);
  76 }
  77
  78 void
  79 Diversify::Internal::initialise_points(const MSet& source)
  80 {
  81     unsigned int count = 0;
  82     TermListGroup tlg(source);
  83     for (MSetIterator it = source.begin(); it != source.end(); ++it) {
  84         points.emplace(*it, Xapian::Point(tlg, it.get_document()));
  85         scores[*it] = it.get_weight();
  86         // Initial top-k diversified documents
  87         if (count < k) {
  88             main_dmset.push_back(*it);
  89             ++count;
  90         }
  91     }
  92 }
  93
  94 pair<Xapian::docid, unsigned int>
  95 Diversify::Internal::get_key(Xapian::docid doc_id, unsigned int centroid_id)
  96 {
  97     return make_pair(doc_id, centroid_id);
  98 }
  99
 100 void
 101 Diversify::Internal::compute_similarities(const Xapian::ClusterSet& cset)
 102 {
 103     Xapian::CosineDistance d;
 104     for (auto p : points) {
 105         Xapian::docid point_id = p.first;
 106         Xapian::Point point = p.second;
 107         for (unsigned int c = 0; c < cset.size(); ++c) {
 108             double dist = d.similarity(point, cset[c].get_centroid());
 109             auto key = get_key(point_id, c);
 110             pairwise_sim[key] = dist;
 111         }
 112     }
 113 }
 114
 115 vector<Xapian::docid>
 116 Diversify::Internal::compute_diff_dmset(const vector<Xapian::docid>& dmset)
 117 {
 118     vector<Xapian::docid> diff_dmset;
 119     for (auto point : points) {
 120         Xapian::docid point_id = point.first;
 121         bool found_point = false;
 122         for (auto doc_id : dmset) {
 123             if (point_id == doc_id) {
 124                 found_point = true;
 125                 break;
 126             }
 127         }
 128
 129         if (!found_point) {
 130             diff_dmset.push_back(point_id);
 131         }
 132     }
 133
 134     return diff_dmset;
 135 }
 136
 137 double
 138 Diversify::Internal::evaluate_dmset(const vector<Xapian::docid>& dmset,
 139                                     const Xapian::ClusterSet& cset)
 140 {
 141     double score_1 = 0, score_2 = 0;
 142
 143     for (auto doc_id : dmset)
 144         score_1 += scores[doc_id];
 145
 146     for (unsigned int c = 0; c < cset.size(); ++c) {
 147         double min_dist = numeric_limits<double>::max();
 148         unsigned int pos = 1;
 149         for (auto doc_id : dmset) {
 150             auto key = get_key(doc_id, c);
 151             double sim = pairwise_sim[key];
 152             double weight = 2 * b * sigma_sqr / log(1 + pos) * (1 - sim);
 153             min_dist = min(min_dist, weight);
 154             ++pos;
 155         }
 156         score_2 += min_dist;
 157     }
 158
 159     return -lambda * score_1 + (1 - lambda) * score_2;
 160 }
 161
 162 DocumentSet
 163 Diversify::Internal::get_dmset(const MSet& mset)
 164 {
 165     // Return original mset if no need to diversify
 166     if (k == 0 || mset.size() <= 2) {
 167         DocumentSet dmset;
 168         for (MSetIterator it = mset.begin(); it != mset.end(); ++it)
 169             dmset.add_document(it.get_document());
 170         return dmset;
 171     }
 172
 173     if (k > mset.size())
 174         k = mset.size();
 175
 176     initialise_points(mset);
 177
 178     // Cluster the given mset into k clusters
 179     KMeans km(k);
 180     Xapian::ClusterSet cset = km.cluster(mset);
 181     compute_similarities(cset);
 182
 183     // topC contains union of top-r relevant documents of each cluster
 184     vector<Xapian::docid> topc;
 185
 186     // Build topC
 187     for (unsigned int c = 0; c < cset.size(); ++c) {
 188         auto documents = cset[c].get_documents();
 189         for (unsigned int d = 0; d < r && d < documents.size(); ++d) {
 190             auto doc_id = documents[d].get_docid();
 191             topc.push_back(doc_id);
 192         }
 193     }
 194
 195     vector<Xapian::docid> curr_dmset = main_dmset;
 196
 197     while (true) {
 198         bool found_better_dmset = false;
 199         for (unsigned int i = 0; i < main_dmset.size(); ++i) {
 200             auto curr_doc = main_dmset[i];
 201             double best_score = evaluate_dmset(curr_dmset, cset);
 202             bool found_better_doc = false;
 203
 204             for (unsigned int j = 0; j < topc.size(); ++j) {
 205                 // Continue if candidate document from topC already
 206                 // exists in curr_dmset
 207                 auto candidate_doc = find(curr_dmset.begin(), curr_dmset.end(),
 208                                           topc[j]);
 209                 if (candidate_doc != curr_dmset.end()) {
 210                     continue;
 211                 }
 212
 213                 auto temp_doc = curr_dmset[i];
 214                 curr_dmset[i] = topc[j];
 215                 double score = evaluate_dmset(curr_dmset, cset);
 216
 217                 if (score < best_score) {
 218                     curr_doc = curr_dmset[i];
 219                     best_score = score;
 220                     found_better_doc = true;
 221                 }
 222
 223                 curr_dmset[i] = temp_doc;
 224             }
 225             if (found_better_doc) {
 226                 curr_dmset[i] = curr_doc;
 227                 found_better_dmset = true;
 228             }
 229         }
 230
 231         // Terminate algorithm when there's no change in current
 232         // document matchset
 233         if (!found_better_dmset)
 234             break;
 235
 236         main_dmset = curr_dmset;
 237     }
 238
 239     // Merge main_dmset and diff_dmset into final dmset
 240     DocumentSet dmset;
 241     for (auto doc_id : main_dmset)
 242         dmset.add_document(points.at(doc_id).get_document());
 243
 244     vector<Xapian::docid> diff_dmset = compute_diff_dmset(main_dmset);
 245     for (auto doc_id : diff_dmset)
 246         dmset.add_document(points.at(doc_id).get_document());
 247
 248     return dmset;
 249 }