4 /* Copyright (C) 2010 Richard Boulton
5 * Copyright (C) 2016 Richhiey Thomas
6 * Copyright (C) 2018 Uppinder Chugh
7 * Copyright (C) 2024 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #ifndef XAPIAN_INCLUDED_CLUSTER_H
26 #define XAPIAN_INCLUDED_CLUSTER_H
28 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
29 #error Never use <xapian/cluster.h> directly; include <xapian.h> instead.
32 #include <xapian/attributes.h>
33 #include <xapian/mset.h>
34 #include <xapian/queryparser.h>
35 #include <xapian/types.h>
36 #include <xapian/visibility.h>
38 #include <string_view>
39 #include <unordered_map>
40 #include <unordered_set>
45 /** Stopper subclass which checks for both stemmed and unstemmed stopwords.
47 * This is intended for use with Xapian::Cluster.
49 class XAPIAN_VISIBILITY_DEFAULT StemStopper
: public Xapian::Stopper
{
51 /// Stemming strategies
53 STEM_NONE
, STEM_SOME
, STEM_ALL
, STEM_ALL_Z
, STEM_SOME_FULL_POS
58 * @param stemmer The Xapian::Stem object to set.
59 * @param strategy The stemming strategy to be used.
61 explicit StemStopper(const Xapian::Stem
&stemmer
, stem_strategy strategy
= STEM_SOME
);
63 std::string
get_description() const override
;
65 bool operator()(const std::string
& term
) const override
{
66 return stop_words
.find(term
) != stop_words
.end();
69 /// Add a single stop word and its stemmed equivalent
70 void add(std::string_view term
);
73 stem_strategy stem_action
;
74 std::unordered_set
<std::string
> stop_words
;
78 /** Class representing a set of documents in a cluster
80 class XAPIAN_VISIBILITY_DEFAULT DocumentSet
{
83 /// @private @internal Reference counted internals.
84 Xapian::Internal::intrusive_ptr_nonnull
<Internal
> internal
;
86 /** Copying is allowed. The internals are reference counted, so
89 * @param other The object to copy.
91 DocumentSet(const DocumentSet
&other
);
93 /** Assignment is allowed. The internals are reference counted,
94 * so assignment is cheap.
96 * @param other The object to copy.
98 DocumentSet
& operator=(const DocumentSet
&other
);
100 /** Move constructor.
102 * @param other The object to move.
104 DocumentSet(DocumentSet
&& other
);
106 /** Move assignment operator.
108 * @param other The object to move.
110 DocumentSet
& operator=(DocumentSet
&& other
);
112 /// Default constructor
118 /// Return the size of the DocumentSet
119 Xapian::doccount
size() const;
121 /// Return the Document in the DocumentSet at index i
122 const Xapian::Document
& operator[](Xapian::doccount i
) const;
124 /** Add a new Document to the DocumentSet
126 * @param document Document object that is to be added to
129 void add_document(const Document
&document
);
132 /** Base class for TermListGroup
133 * Stores and provides terms that are contained in a document and
134 * their respective term frequencies
136 class XAPIAN_VISIBILITY_DEFAULT FreqSource
137 : public Xapian::Internal::opt_intrusive_base
{
138 /// Don't allow assignment.
139 void operator=(const FreqSource
&) = delete;
141 /// Don't allow copying.
142 FreqSource(const FreqSource
&) = delete;
145 /// Default constructor
149 virtual ~FreqSource();
151 /** Return the term frequency of a particular term 'tname'
153 * @param tname The term for which to return the term frequency
155 virtual doccount
get_termfreq(const std::string
&tname
) const = 0;
157 /// Return the number of documents within the MSet
158 virtual doccount
get_doccount() const = 0;
160 /** Start reference counting this object.
162 * You can transfer ownership of a dynamically allocated FreqSource
163 * object to Xapian by calling release() and then passing the object to a
164 * Xapian method. Xapian will arrange to delete the object once it is no
167 FreqSource
* release() {
168 opt_intrusive_base::release();
172 /** Start reference counting this object.
174 * You can transfer ownership of a dynamically allocated FreqSource
175 * object to Xapian by calling release() and then passing the object to a
176 * Xapian method. Xapian will arrange to delete the object once it is no
179 const FreqSource
* release() const {
180 opt_intrusive_base::release();
185 /** A class for construction of termlists which store the terms for a
186 * document along with the number of documents it indexes i.e. term
189 class XAPIAN_VISIBILITY_DEFAULT TermListGroup
: public FreqSource
{
190 /** Map of the terms and its corresponding term frequencies.
191 * The term frequency of a term stands for the number of documents it indexes
193 std::unordered_map
<std::string
, doccount
> termfreq
;
195 /// Number of documents added to the termlist
196 doccount num_of_documents
;
198 /** Add a single document and calculates its corresponding term frequencies
200 * @param document Adds a document and updates the TermListGroup
201 * based on the terms found in the document
202 * @param stopper Xapian::Stopper object to identify stopwords
204 void add_document(const Document
&document
, const Stopper
*stopper
= NULL
);
209 * @param docs MSet object used to construct the TermListGroup
210 * @param stopper Xapian::Stopper object to identify stopwords
212 explicit TermListGroup(const MSet
&docs
, const Stopper
*stopper
= NULL
);
214 /** Return the number of documents that the term 'tname' exists in
216 * @param tname The term for which to return the term frequency
218 doccount
get_termfreq(const std::string
& tname
) const override
;
220 doccount
get_doccount() const override
;
223 /** Abstract class representing a point in the VSM
225 class XAPIAN_VISIBILITY_DEFAULT PointType
226 : public Xapian::Internal::opt_intrusive_base
{
228 /** Implement a map to store the terms within a document
229 * and their pre-computed TF-IDF weights
231 std::unordered_map
<std::string
, double> weights
;
233 /// Store the squared magnitude of the PointType
234 double magnitude
= 0.0;
236 /** Set the weight 'weight' to the mapping of a term
238 * @param term Term for which the weight is supposed
240 * @param weight The weight to which the mapping of the
243 void set_weight(std::string_view term
, double weight
) {
244 weights
[std::string(term
)] = weight
;
248 /// Default constructor
251 /// Return a TermIterator to the beginning of the termlist
252 TermIterator
termlist_begin() const;
254 /// Return a TermIterator to the end of the termlist
255 TermIterator
termlist_end() const noexcept
{
256 return TermIterator(NULL
);
259 /** Validate whether a certain term exists in the termlist
260 * or not by performing a lookup operation in the existing values
262 * @param term Term which is to be searched
264 bool contains(std::string_view term
) const {
265 return weights
.find(std::string(term
)) != weights
.end();
268 /** Return the TF-IDF weight associated with a certain term
270 * @param term Term for which TF-IDF weight is returned
272 double get_weight(std::string_view term
) const {
273 auto it
= weights
.find(std::string(term
));
274 return (it
== weights
.end()) ? 0.0 : it
->second
;
277 /** Add the weight 'weight' to the mapping of a term
279 * @param term Term to which the weight is to be added
280 * @param weight Weight which has to be added to the existing
281 * mapping of the term
283 void add_weight(std::string_view term
, double weight
) {
284 weights
[std::string(term
)] += weight
;
287 /// Return the pre-computed squared magnitude
288 double get_magnitude() const { return magnitude
; }
290 /// Return the size of the termlist
291 Xapian::termcount
termlist_size() const { return weights
.size(); }
293 /** Start reference counting this object.
295 * You can transfer ownership of a dynamically allocated PointType
296 * object to Xapian by calling release() and then passing the object to a
297 * Xapian method. Xapian will arrange to delete the object once it is no
300 PointType
* release() {
301 opt_intrusive_base::release();
305 /** Start reference counting this object.
307 * You can transfer ownership of a dynamically allocated PointType
308 * object to Xapian by calling release() and then passing the object to a
309 * Xapian method. Xapian will arrange to delete the object once it is no
312 const PointType
* release() const {
313 opt_intrusive_base::release();
318 /** Class to represent a document as a point in the Vector Space
321 class XAPIAN_VISIBILITY_DEFAULT Point
: public PointType
{
322 /// The document which is being represented by the Point
327 * Initialise the point with terms and corresponding TF-IDF weights
329 * @param freqsource FreqSource object which provides the term
330 * frequencies. It is used for TF-IDF weight
332 * @param document The Document object over which the Point object
333 * will be initialised
335 Point(const FreqSource
& freqsource
, const Document
& document
);
337 /// Returns the document corresponding to this Point
338 Document
get_document() const { return document
; }
341 /** Class to represent cluster centroids in the vector space
343 class XAPIAN_VISIBILITY_DEFAULT Centroid
: public PointType
{
345 /// Default constructor
348 /** Constructor with Point argument
350 * @param point Point object to which Centroid object is
351 * initialised. The document vector and the
352 * magnitude are made equal
354 explicit Centroid(const Point
&point
);
356 /** Divide the weight of terms in the centroid by 'size' and
357 * recalculate the magnitude
359 * @param cluster_size Value by which Centroid document vector is
362 void divide(double cluster_size
);
364 /// Clear the terms and corresponding values of the centroid
365 void clear() { weights
.clear(); }
368 /** Class to represents a Cluster which contains Points and Centroid
371 class XAPIAN_VISIBILITY_DEFAULT Cluster
{
374 /// @private @internal Reference counted internals.
375 Xapian::Internal::intrusive_ptr_nonnull
<Internal
> internal
;
377 /** Copying is allowed. The internals are reference counted, so
380 * @param other The object to copy.
382 Cluster(const Cluster
&other
);
384 /** Assignment is allowed. The internals are reference counted,
385 * so assignment is cheap.
387 * @param other The object to copy.
389 Cluster
& operator=(const Cluster
&other
);
391 /** Move constructor.
393 * @param other The object to move.
395 Cluster(Cluster
&& other
);
397 /** Move assignment operator.
399 * @param other The object to move.
401 Cluster
& operator=(Cluster
&& other
);
405 * @param centroid The centroid of the cluster object is
406 * assigned to 'centroid'
408 explicit Cluster(const Centroid
¢roid
);
410 /// Default constructor
416 /// Return size of the cluster
417 Xapian::doccount
size() const;
419 /** Add a document to the Cluster
421 * @param point The Point object representing the document which
422 * needs to be added to the cluster
424 void add_point(const Point
&point
);
426 /// Clear the cluster weights
429 /// Return the point at the given index in the cluster
430 const Point
& operator[](Xapian::doccount i
) const;
432 /// Return the documents that are contained within the cluster
433 DocumentSet
get_documents() const;
435 /// Return the current centroid of the cluster
436 const Centroid
& get_centroid() const;
438 /** Set the centroid of the Cluster to 'centroid'
440 * @param centroid Centroid object for the Cluster
442 void set_centroid(const Centroid
¢roid
);
444 /** Recalculate the centroid of the Cluster after each iteration
445 * of the KMeans algorithm by taking the mean of all document vectors (Points)
446 * that belong to the Cluster
451 /** Class for storing the results returned by the Clusterer
453 class XAPIAN_VISIBILITY_DEFAULT ClusterSet
{
456 /// @private @internal Reference counted internals.
457 Xapian::Internal::intrusive_ptr_nonnull
<Internal
> internal
;
459 /** Copying is allowed. The internals are reference counted, so
462 * @param other The object to copy.
464 ClusterSet(const ClusterSet
&other
);
466 /** Assignment is allowed. The internals are reference counted,
467 * so assignment is cheap.
469 * @param other The object to copy.
471 ClusterSet
& operator=(const ClusterSet
&other
);
473 /** Move constructor.
475 * @param other The object to move.
477 ClusterSet(ClusterSet
&& other
);
479 /** Move assignment operator.
481 * @param other The object to move.
483 ClusterSet
& operator=(ClusterSet
&& other
);
485 /// Default constructor
491 /** Add a cluster to the ClusterSet
493 * @param cluster Cluster object which is to be added to the ClusterSet
495 void add_cluster(const Cluster
&cluster
);
497 /** Add the point to the cluster at position 'index'
499 * @param point Point object which needs to be added to
500 * a Cluster within the ClusterSet
501 * @param index Index of the Cluster within the ClusterSet to
502 * which the Point is to be added
504 void add_to_cluster(const Point
&point
, unsigned int index
);
506 /// Return the number of clusters
507 Xapian::doccount
size() const;
509 /// Return the cluster at index 'i'
510 const Cluster
& operator[](Xapian::doccount i
) const;
512 /// Clear all the clusters in the ClusterSet
513 void clear_clusters();
515 /** Recalculate the centroid for all the clusters in the ClusterSet */
516 void recalculate_centroids();
519 /** Base class for calculating the similarity between documents
521 class XAPIAN_VISIBILITY_DEFAULT Similarity
{
524 virtual ~Similarity();
526 /** Calculates the similarity between the two documents
528 * @param a First point object for distance calculation
529 * @param b Second point object for distance calculation
531 virtual double similarity(const PointType
&a
, const PointType
&b
) const = 0;
533 /// Returns a string describing the similarity metric being used
534 virtual std::string
get_description() const = 0;
537 /** Class for calculating the cosine distance between two documents
539 class XAPIAN_VISIBILITY_DEFAULT CosineDistance
: public Similarity
{
541 /** Calculates and returns the cosine similarity using the
542 * formula cos(theta) = a.b/(|a|*|b|)
544 double similarity(const PointType
& a
, const PointType
& b
) const override
;
546 /// Return a string describing this object
547 std::string
get_description() const override
;
550 /** Class representing an abstract class for a clusterer to be implemented
552 class XAPIAN_VISIBILITY_DEFAULT Clusterer
553 : public Xapian::Internal::opt_intrusive_base
{
556 virtual ~Clusterer();
558 /** Implement the required clustering algorithm in the subclass and
559 * and return clustered output as ClusterSet
561 * @param mset The MSet object which contains the documents to be
564 virtual ClusterSet
cluster(const MSet
&mset
) = 0;
566 /// Returns a string describing the clusterer being used
567 virtual std::string
get_description() const = 0;
569 /** Start reference counting this object.
571 * You can transfer ownership of a dynamically allocated Clusterer
572 * object to Xapian by calling release() and then passing the object to a
573 * Xapian method. Xapian will arrange to delete the object once it is no
576 Clusterer
* release() {
577 opt_intrusive_base::release();
581 /** Start reference counting this object.
583 * You can transfer ownership of a dynamically allocated Clusterer
584 * object to Xapian by calling release() and then passing the object to a
585 * Xapian method. Xapian will arrange to delete the object once it is no
588 const Clusterer
* release() const {
589 opt_intrusive_base::release();
594 /** Kmeans clusterer:
595 * This clusterer implements the K-Means clustering algorithm
597 class XAPIAN_VISIBILITY_DEFAULT KMeans
: public Clusterer
{
598 /// Contains the initialised points that are to be clustered
599 std::vector
<Point
> points
;
601 /// Specifies that the clusterer needs to form 'k' clusters
604 /// Specifies the maximum number of iterations that KMeans will have
605 unsigned int max_iters
;
607 /// Pointer to stopper object for identifying stopwords
608 Xapian::Internal::opt_intrusive_ptr
<const Xapian::Stopper
> stopper
;
610 /** Initialise 'k' clusters by selecting 'k' centroids and assigning
611 * them to different clusters
613 * @param cset ClusterSet object to be initialised by assigning
614 * centroids to each cluster
615 * @param num_of_points Number of points passed to clusterer
617 void initialise_clusters(ClusterSet
&cset
, Xapian::doccount num_of_points
);
619 /** Initialise the Points to be fed into the Clusterer with the MSet object
620 * 'source'. The TF-IDF weights for the documents are calculated and stored
621 * within the Points to be used later during distance calculations
623 * @param source MSet object containing the documents which will be
624 * used to create document vectors that are represented
627 void initialise_points(const MSet
&source
);
630 /** Constructor specifying number of clusters and maximum iterations
632 * @param k_ Number of required clusters
633 * @param max_iters_ The maximum number of iterations for which KMeans
634 * will run if it doesn't converge
636 explicit KMeans(unsigned int k_
, unsigned int max_iters_
= 0);
638 /** Implements the KMeans clustering algorithm
640 * @param mset MSet object containing the documents that are to
643 ClusterSet
cluster(const MSet
&mset
) override
;
645 /** Set the Xapian::Stopper object to be used for identifying stopwords.
647 * Stopwords are discarded while calculating term frequency for terms.
649 * @param stop The Stopper object to set (default NULL, which means no
652 void set_stopper(const Xapian::Stopper
* stop
= NULL
) { stopper
= stop
; }
654 /// Return a string describing this object
655 std::string
get_description() const override
;
659 * This clusterer implements the LCD clustering algorithm adapted from
660 * Modelling efficient novelty-based search result diversification in metric
661 * spaces Gil-Costa et al. 2013
663 class XAPIAN_VISIBILITY_DEFAULT LCDClusterer
: public Clusterer
{
664 /// Specifies that the clusterer needs to form 'k' clusters
668 /** Constructor specifying number of clusters
670 * @param k_ Number of required clusters
672 explicit LCDClusterer(unsigned int k_
);
674 /** Implements the LCD clustering algorithm
676 * @param mset MSet object containing the documents that are to
679 ClusterSet
cluster(const MSet
&mset
) override
;
681 /// Return a string describing this object
682 std::string
get_description() const override
;
685 #endif // XAPIAN_INCLUDED_CLUSTER_H