[ci] Update macos jobs
[xapian.git] / xapian-core / include / xapian / cluster.h
blob22b3394d5f08cbd212adbed917e11c009e0764fd
1 /** @file
2 * @brief Cluster API
3 */
4 /* Copyright (C) 2010 Richard Boulton
5 * Copyright (C) 2016 Richhiey Thomas
6 * Copyright (C) 2018 Uppinder Chugh
7 * Copyright (C) 2024 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
25 #ifndef XAPIAN_INCLUDED_CLUSTER_H
26 #define XAPIAN_INCLUDED_CLUSTER_H
28 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
29 #error Never use <xapian/cluster.h> directly; include <xapian.h> instead.
30 #endif
32 #include <xapian/attributes.h>
33 #include <xapian/mset.h>
34 #include <xapian/queryparser.h>
35 #include <xapian/types.h>
36 #include <xapian/visibility.h>
38 #include <string_view>
39 #include <unordered_map>
40 #include <unordered_set>
41 #include <vector>
43 namespace Xapian {
45 /** Stopper subclass which checks for both stemmed and unstemmed stopwords.
47 * This is intended for use with Xapian::Cluster.
49 class XAPIAN_VISIBILITY_DEFAULT StemStopper : public Xapian::Stopper {
50 public:
51 /// Stemming strategies
52 typedef enum {
53 STEM_NONE, STEM_SOME, STEM_ALL, STEM_ALL_Z, STEM_SOME_FULL_POS
54 } stem_strategy;
56 /** Constructor
58 * @param stemmer The Xapian::Stem object to set.
59 * @param strategy The stemming strategy to be used.
61 explicit StemStopper(const Xapian::Stem &stemmer, stem_strategy strategy = STEM_SOME);
63 std::string get_description() const override;
65 bool operator()(const std::string& term) const override {
66 return stop_words.find(term) != stop_words.end();
69 /// Add a single stop word and its stemmed equivalent
70 void add(std::string_view term);
72 private:
73 stem_strategy stem_action;
74 std::unordered_set<std::string> stop_words;
75 Xapian::Stem stemmer;
78 /** Class representing a set of documents in a cluster
80 class XAPIAN_VISIBILITY_DEFAULT DocumentSet {
81 public:
82 class Internal;
83 /// @private @internal Reference counted internals.
84 Xapian::Internal::intrusive_ptr_nonnull<Internal> internal;
86 /** Copying is allowed. The internals are reference counted, so
87 * copying is cheap.
89 * @param other The object to copy.
91 DocumentSet(const DocumentSet &other);
93 /** Assignment is allowed. The internals are reference counted,
94 * so assignment is cheap.
96 * @param other The object to copy.
98 DocumentSet & operator=(const DocumentSet &other);
100 /** Move constructor.
102 * @param other The object to move.
104 DocumentSet(DocumentSet && other);
106 /** Move assignment operator.
108 * @param other The object to move.
110 DocumentSet & operator=(DocumentSet && other);
112 /// Default constructor
113 DocumentSet();
115 /// Destructor
116 ~DocumentSet();
118 /// Return the size of the DocumentSet
119 Xapian::doccount size() const;
121 /// Return the Document in the DocumentSet at index i
122 const Xapian::Document& operator[](Xapian::doccount i) const;
124 /** Add a new Document to the DocumentSet
126 * @param document Document object that is to be added to
127 * the DocumentSet
129 void add_document(const Document &document);
132 /** Base class for TermListGroup
133 * Stores and provides terms that are contained in a document and
134 * their respective term frequencies
136 class XAPIAN_VISIBILITY_DEFAULT FreqSource
137 : public Xapian::Internal::opt_intrusive_base {
138 /// Don't allow assignment.
139 void operator=(const FreqSource &) = delete;
141 /// Don't allow copying.
142 FreqSource(const FreqSource &) = delete;
144 public:
145 /// Default constructor
146 FreqSource() {}
148 /// Destructor
149 virtual ~FreqSource();
151 /** Return the term frequency of a particular term 'tname'
153 * @param tname The term for which to return the term frequency
155 virtual doccount get_termfreq(const std::string &tname) const = 0;
157 /// Return the number of documents within the MSet
158 virtual doccount get_doccount() const = 0;
160 /** Start reference counting this object.
162 * You can transfer ownership of a dynamically allocated FreqSource
163 * object to Xapian by calling release() and then passing the object to a
164 * Xapian method. Xapian will arrange to delete the object once it is no
165 * longer required.
167 FreqSource * release() {
168 opt_intrusive_base::release();
169 return this;
172 /** Start reference counting this object.
174 * You can transfer ownership of a dynamically allocated FreqSource
175 * object to Xapian by calling release() and then passing the object to a
176 * Xapian method. Xapian will arrange to delete the object once it is no
177 * longer required.
179 const FreqSource * release() const {
180 opt_intrusive_base::release();
181 return this;
185 /** A class for construction of termlists which store the terms for a
186 * document along with the number of documents it indexes i.e. term
187 * frequency
189 class XAPIAN_VISIBILITY_DEFAULT TermListGroup : public FreqSource {
190 /** Map of the terms and its corresponding term frequencies.
191 * The term frequency of a term stands for the number of documents it indexes
193 std::unordered_map<std::string, doccount> termfreq;
195 /// Number of documents added to the termlist
196 doccount num_of_documents;
198 /** Add a single document and calculates its corresponding term frequencies
200 * @param document Adds a document and updates the TermListGroup
201 * based on the terms found in the document
202 * @param stopper Xapian::Stopper object to identify stopwords
204 void add_document(const Document &document, const Stopper *stopper = NULL);
206 public:
207 /** Constructor
209 * @param docs MSet object used to construct the TermListGroup
210 * @param stopper Xapian::Stopper object to identify stopwords
212 explicit TermListGroup(const MSet &docs, const Stopper *stopper = NULL);
214 /** Return the number of documents that the term 'tname' exists in
216 * @param tname The term for which to return the term frequency
218 doccount get_termfreq(const std::string& tname) const override;
220 doccount get_doccount() const override;
223 /** Abstract class representing a point in the VSM
225 class XAPIAN_VISIBILITY_DEFAULT PointType
226 : public Xapian::Internal::opt_intrusive_base {
227 protected:
228 /** Implement a map to store the terms within a document
229 * and their pre-computed TF-IDF weights
231 std::unordered_map<std::string, double> weights;
233 /// Store the squared magnitude of the PointType
234 double magnitude = 0.0;
236 /** Set the weight 'weight' to the mapping of a term
238 * @param term Term for which the weight is supposed
239 * to be changed
240 * @param weight The weight to which the mapping of the
241 * term is to be set
243 void set_weight(std::string_view term, double weight) {
244 weights[std::string(term)] = weight;
247 public:
248 /// Default constructor
249 PointType() {}
251 /// Return a TermIterator to the beginning of the termlist
252 TermIterator termlist_begin() const;
254 /// Return a TermIterator to the end of the termlist
255 TermIterator termlist_end() const noexcept {
256 return TermIterator(NULL);
259 /** Validate whether a certain term exists in the termlist
260 * or not by performing a lookup operation in the existing values
262 * @param term Term which is to be searched
264 bool contains(std::string_view term) const {
265 return weights.find(std::string(term)) != weights.end();
268 /** Return the TF-IDF weight associated with a certain term
270 * @param term Term for which TF-IDF weight is returned
272 double get_weight(std::string_view term) const {
273 auto it = weights.find(std::string(term));
274 return (it == weights.end()) ? 0.0 : it->second;
277 /** Add the weight 'weight' to the mapping of a term
279 * @param term Term to which the weight is to be added
280 * @param weight Weight which has to be added to the existing
281 * mapping of the term
283 void add_weight(std::string_view term, double weight) {
284 weights[std::string(term)] += weight;
287 /// Return the pre-computed squared magnitude
288 double get_magnitude() const { return magnitude; }
290 /// Return the size of the termlist
291 Xapian::termcount termlist_size() const { return weights.size(); }
293 /** Start reference counting this object.
295 * You can transfer ownership of a dynamically allocated PointType
296 * object to Xapian by calling release() and then passing the object to a
297 * Xapian method. Xapian will arrange to delete the object once it is no
298 * longer required.
300 PointType * release() {
301 opt_intrusive_base::release();
302 return this;
305 /** Start reference counting this object.
307 * You can transfer ownership of a dynamically allocated PointType
308 * object to Xapian by calling release() and then passing the object to a
309 * Xapian method. Xapian will arrange to delete the object once it is no
310 * longer required.
312 const PointType * release() const {
313 opt_intrusive_base::release();
314 return this;
318 /** Class to represent a document as a point in the Vector Space
319 * Model
321 class XAPIAN_VISIBILITY_DEFAULT Point : public PointType {
322 /// The document which is being represented by the Point
323 Document document;
325 public:
326 /** Constructor
327 * Initialise the point with terms and corresponding TF-IDF weights
329 * @param freqsource FreqSource object which provides the term
330 * frequencies. It is used for TF-IDF weight
331 * calculations
332 * @param document The Document object over which the Point object
333 * will be initialised
335 Point(const FreqSource& freqsource, const Document& document);
337 /// Returns the document corresponding to this Point
338 Document get_document() const { return document; }
341 /** Class to represent cluster centroids in the vector space
343 class XAPIAN_VISIBILITY_DEFAULT Centroid : public PointType {
344 public:
345 /// Default constructor
346 Centroid() { }
348 /** Constructor with Point argument
350 * @param point Point object to which Centroid object is
351 * initialised. The document vector and the
352 * magnitude are made equal
354 explicit Centroid(const Point &point);
356 /** Divide the weight of terms in the centroid by 'size' and
357 * recalculate the magnitude
359 * @param cluster_size Value by which Centroid document vector is
360 * divided
362 void divide(double cluster_size);
364 /// Clear the terms and corresponding values of the centroid
365 void clear() { weights.clear(); }
368 /** Class to represents a Cluster which contains Points and Centroid
369 * of the Cluster
371 class XAPIAN_VISIBILITY_DEFAULT Cluster {
372 public:
373 class Internal;
374 /// @private @internal Reference counted internals.
375 Xapian::Internal::intrusive_ptr_nonnull<Internal> internal;
377 /** Copying is allowed. The internals are reference counted, so
378 * copying is cheap.
380 * @param other The object to copy.
382 Cluster(const Cluster &other);
384 /** Assignment is allowed. The internals are reference counted,
385 * so assignment is cheap.
387 * @param other The object to copy.
389 Cluster& operator=(const Cluster &other);
391 /** Move constructor.
393 * @param other The object to move.
395 Cluster(Cluster && other);
397 /** Move assignment operator.
399 * @param other The object to move.
401 Cluster & operator=(Cluster && other);
403 /** Constructor
405 * @param centroid The centroid of the cluster object is
406 * assigned to 'centroid'
408 explicit Cluster(const Centroid &centroid);
410 /// Default constructor
411 Cluster();
413 /// Destructor
414 ~Cluster();
416 /// Return size of the cluster
417 Xapian::doccount size() const;
419 /** Add a document to the Cluster
421 * @param point The Point object representing the document which
422 * needs to be added to the cluster
424 void add_point(const Point &point);
426 /// Clear the cluster weights
427 void clear();
429 /// Return the point at the given index in the cluster
430 const Point& operator[](Xapian::doccount i) const;
432 /// Return the documents that are contained within the cluster
433 DocumentSet get_documents() const;
435 /// Return the current centroid of the cluster
436 const Centroid& get_centroid() const;
438 /** Set the centroid of the Cluster to 'centroid'
440 * @param centroid Centroid object for the Cluster
442 void set_centroid(const Centroid &centroid);
444 /** Recalculate the centroid of the Cluster after each iteration
445 * of the KMeans algorithm by taking the mean of all document vectors (Points)
446 * that belong to the Cluster
448 void recalculate();
451 /** Class for storing the results returned by the Clusterer
453 class XAPIAN_VISIBILITY_DEFAULT ClusterSet {
454 public:
455 class Internal;
456 /// @private @internal Reference counted internals.
457 Xapian::Internal::intrusive_ptr_nonnull<Internal> internal;
459 /** Copying is allowed. The internals are reference counted, so
460 * copying is cheap.
462 * @param other The object to copy.
464 ClusterSet(const ClusterSet &other);
466 /** Assignment is allowed. The internals are reference counted,
467 * so assignment is cheap.
469 * @param other The object to copy.
471 ClusterSet& operator=(const ClusterSet &other);
473 /** Move constructor.
475 * @param other The object to move.
477 ClusterSet(ClusterSet && other);
479 /** Move assignment operator.
481 * @param other The object to move.
483 ClusterSet & operator=(ClusterSet && other);
485 /// Default constructor
486 ClusterSet();
488 /// Destructor
489 ~ClusterSet();
491 /** Add a cluster to the ClusterSet
493 * @param cluster Cluster object which is to be added to the ClusterSet
495 void add_cluster(const Cluster &cluster);
497 /** Add the point to the cluster at position 'index'
499 * @param point Point object which needs to be added to
500 * a Cluster within the ClusterSet
501 * @param index Index of the Cluster within the ClusterSet to
502 * which the Point is to be added
504 void add_to_cluster(const Point &point, unsigned int index);
506 /// Return the number of clusters
507 Xapian::doccount size() const;
509 /// Return the cluster at index 'i'
510 const Cluster& operator[](Xapian::doccount i) const;
512 /// Clear all the clusters in the ClusterSet
513 void clear_clusters();
515 /** Recalculate the centroid for all the clusters in the ClusterSet */
516 void recalculate_centroids();
519 /** Base class for calculating the similarity between documents
521 class XAPIAN_VISIBILITY_DEFAULT Similarity {
522 public:
523 /// Destructor
524 virtual ~Similarity();
526 /** Calculates the similarity between the two documents
528 * @param a First point object for distance calculation
529 * @param b Second point object for distance calculation
531 virtual double similarity(const PointType &a, const PointType &b) const = 0;
533 /// Returns a string describing the similarity metric being used
534 virtual std::string get_description() const = 0;
537 /** Class for calculating the cosine distance between two documents
539 class XAPIAN_VISIBILITY_DEFAULT CosineDistance : public Similarity {
540 public:
541 /** Calculates and returns the cosine similarity using the
542 * formula cos(theta) = a.b/(|a|*|b|)
544 double similarity(const PointType& a, const PointType& b) const override;
546 /// Return a string describing this object
547 std::string get_description() const override;
550 /** Class representing an abstract class for a clusterer to be implemented
552 class XAPIAN_VISIBILITY_DEFAULT Clusterer
553 : public Xapian::Internal::opt_intrusive_base {
554 public:
555 /// Destructor
556 virtual ~Clusterer();
558 /** Implement the required clustering algorithm in the subclass and
559 * and return clustered output as ClusterSet
561 * @param mset The MSet object which contains the documents to be
562 * clustered
564 virtual ClusterSet cluster(const MSet &mset) = 0;
566 /// Returns a string describing the clusterer being used
567 virtual std::string get_description() const = 0;
569 /** Start reference counting this object.
571 * You can transfer ownership of a dynamically allocated Clusterer
572 * object to Xapian by calling release() and then passing the object to a
573 * Xapian method. Xapian will arrange to delete the object once it is no
574 * longer required.
576 Clusterer * release() {
577 opt_intrusive_base::release();
578 return this;
581 /** Start reference counting this object.
583 * You can transfer ownership of a dynamically allocated Clusterer
584 * object to Xapian by calling release() and then passing the object to a
585 * Xapian method. Xapian will arrange to delete the object once it is no
586 * longer required.
588 const Clusterer * release() const {
589 opt_intrusive_base::release();
590 return this;
594 /** Kmeans clusterer:
595 * This clusterer implements the K-Means clustering algorithm
597 class XAPIAN_VISIBILITY_DEFAULT KMeans : public Clusterer {
598 /// Contains the initialised points that are to be clustered
599 std::vector<Point> points;
601 /// Specifies that the clusterer needs to form 'k' clusters
602 unsigned int k;
604 /// Specifies the maximum number of iterations that KMeans will have
605 unsigned int max_iters;
607 /// Pointer to stopper object for identifying stopwords
608 Xapian::Internal::opt_intrusive_ptr<const Xapian::Stopper> stopper;
610 /** Initialise 'k' clusters by selecting 'k' centroids and assigning
611 * them to different clusters
613 * @param cset ClusterSet object to be initialised by assigning
614 * centroids to each cluster
615 * @param num_of_points Number of points passed to clusterer
617 void initialise_clusters(ClusterSet &cset, Xapian::doccount num_of_points);
619 /** Initialise the Points to be fed into the Clusterer with the MSet object
620 * 'source'. The TF-IDF weights for the documents are calculated and stored
621 * within the Points to be used later during distance calculations
623 * @param source MSet object containing the documents which will be
624 * used to create document vectors that are represented
625 * as Point objects
627 void initialise_points(const MSet &source);
629 public:
630 /** Constructor specifying number of clusters and maximum iterations
632 * @param k_ Number of required clusters
633 * @param max_iters_ The maximum number of iterations for which KMeans
634 * will run if it doesn't converge
636 explicit KMeans(unsigned int k_, unsigned int max_iters_ = 0);
638 /** Implements the KMeans clustering algorithm
640 * @param mset MSet object containing the documents that are to
641 * be clustered
643 ClusterSet cluster(const MSet &mset) override;
645 /** Set the Xapian::Stopper object to be used for identifying stopwords.
647 * Stopwords are discarded while calculating term frequency for terms.
649 * @param stop The Stopper object to set (default NULL, which means no
650 * stopwords)
652 void set_stopper(const Xapian::Stopper* stop = NULL) { stopper = stop; }
654 /// Return a string describing this object
655 std::string get_description() const override;
658 /** LCD clusterer:
659 * This clusterer implements the LCD clustering algorithm adapted from
660 * Modelling efficient novelty-based search result diversification in metric
661 * spaces Gil-Costa et al. 2013
663 class XAPIAN_VISIBILITY_DEFAULT LCDClusterer : public Clusterer {
664 /// Specifies that the clusterer needs to form 'k' clusters
665 unsigned int k;
667 public:
668 /** Constructor specifying number of clusters
670 * @param k_ Number of required clusters
672 explicit LCDClusterer(unsigned int k_);
674 /** Implements the LCD clustering algorithm
676 * @param mset MSet object containing the documents that are to
677 * be clustered
679 ClusterSet cluster(const MSet &mset) override;
681 /// Return a string describing this object
682 std::string get_description() const override;
685 #endif // XAPIAN_INCLUDED_CLUSTER_H