4 /* Copyright (C) 2010 Richard Boulton
5 * Copyright (C) 2016 Richhiey Thomas
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "xapian/cluster.h"
27 #include "cluster/clusterinternal.h"
28 #include "xapian/error.h"
29 #include "api/termlist.h"
34 #include <unordered_map>
37 using namespace Xapian
;
40 FreqSource::~FreqSource()
42 LOGCALL_DTOR(API
, "FreqSource");
45 Similarity::~Similarity()
47 LOGCALL_DTOR(API
, "Similarity");
50 Clusterer::~Clusterer()
52 LOGCALL_DTOR(API
, "Clusterer");
55 TermListGroup::TermListGroup(const MSet
& docs
, const Stopper
* stopper
)
57 LOGCALL_CTOR(API
, "TermListGroup", docs
| stopper
);
58 for (MSetIterator it
= docs
.begin(); it
!= docs
.end(); ++it
)
59 add_document(it
.get_document(), stopper
);
60 num_of_documents
= docs
.size();
// Walks the term list of `document` and looks each term up in the `termfreq`
// map. A non-null `stopper` is applied to every term first.
// NOTE(review): this listing has lost the return-type line, the statements
// guarded by each `if` below, and the closing braces, so what happens to a
// stopped term and to a found / not-found map entry is not visible here -
// confirm against the complete file before changing this function.
64 TermListGroup::add_document(const Document
& document
, const Stopper
* stopper
)
66 LOGCALL_VOID(API
, "TermListGroup::add_document", document
| stopper
);
68 TermIterator
titer(document
.termlist_begin());
70 for (; titer
!= document
.termlist_end(); ++titer
) {
71 const string
& term
= *titer
;
73 // Remove stopwords by using the Xapian::Stopper object
74 if (stopper
&& (*stopper
)(term
))
77 // Remove unstemmed terms since document vector should
78 // contain only stemmed terms
// NOTE(review): the condition implementing the comment above is missing from
// this listing.
82 unordered_map
<string
, doccount
>::iterator i
;
83 i
= termfreq
.find(term
);
84 if (i
== termfreq
.end())
92 TermListGroup::get_doccount() const
94 LOGCALL(API
, doccount
, "TermListGroup::get_doccount", NO_ARGS
);
95 return num_of_documents
;
// Looks `tname` up in the `termfreq` map and branches on whether an entry
// exists.
// NOTE(review): the return-type line and both return statements are missing
// from this listing; presumably the stored count is returned for a known term
// - confirm, including what is returned for an unknown term.
99 TermListGroup::get_termfreq(const string
& tname
) const
101 LOGCALL(API
, doccount
, "TermListGroup::get_termfreq", tname
);
102 unordered_map
<string
, doccount
>::const_iterator it
= termfreq
.find(tname
);
103 if (it
!= termfreq
.end())
109 DocumentSet::DocumentSet(const DocumentSet
&) = default;
112 DocumentSet::operator=(const DocumentSet
&) = default;
114 DocumentSet::DocumentSet(DocumentSet
&&) = default;
117 DocumentSet::operator=(DocumentSet
&&) = default;
// Default constructor: initialises `internal` with a newly allocated
// DocumentSet::Internal.
// NOTE(review): the constructor body is not present in this listing.
119 DocumentSet::DocumentSet()
120 : internal(new Xapian::DocumentSet::Internal
)
125 DocumentSet::size() const
127 LOGCALL(API
, doccount
, "DocumentSet::size", NO_ARGS
);
128 return internal
->size();
// Returns the number of elements held in `documents`.
// NOTE(review): the return-type line and braces are missing from this listing.
132 DocumentSet::Internal::size() const
134 return documents
.size();
138 DocumentSet::add_document(const Document
& document
)
140 LOGCALL_VOID(API
, "DocumentSet::add_document", document
);
141 internal
->add_document(document
);
145 DocumentSet::Internal::add_document(const Document
& document
)
147 documents
.push_back(document
);
// Non-const element access: forwards index `i` to Internal::get_document().
// No bounds check is made in this function.
// NOTE(review): the return-type line and braces are missing from this listing.
151 DocumentSet::operator[](doccount i
)
153 return internal
->get_document(i
);
// Non-const lookup used by DocumentSet::operator[](doccount).
// NOTE(review): return type and body are not present in this listing.
157 DocumentSet::Internal::get_document(doccount i
)
// Const element access: forwards index `i` to the const overload of
// Internal::get_document(). No bounds check is made in this function.
// NOTE(review): the return-type line and braces are missing from this listing.
163 DocumentSet::operator[](doccount i
) const
165 return internal
->get_document(i
);
// Const lookup used by DocumentSet::operator[](doccount) const.
// NOTE(review): return type and body are not present in this listing.
169 DocumentSet::Internal::get_document(doccount i
) const
174 DocumentSet::~DocumentSet()
176 LOGCALL_DTOR(API
, "DocumentSet");
// TermIterator::Internal implementation that iterates over the keys of an
// unordered_map<string, double> (a point's term -> weight table).
// Only get_approx_size() and get_termname() return data here; the other
// accessors shown throw UnimplementedError.
// NOTE(review): this listing has lost the declarations of `size` and
// `started`, any access specifiers, the next()/at_end() declarations and all
// closing braces - confirm against the complete file.
179 class PointTermIterator
: public TermIterator::Internal
{
// Current position and one-past-the-end of the map being walked.
180 unordered_map
<string
, double>::const_iterator i
;
181 unordered_map
<string
, double>::const_iterator end
;
// Captures begin/end/size of `termlist`; `started` begins as false.
185 PointTermIterator(const unordered_map
<string
, double>& termlist
)
186 : i(termlist
.begin()), end(termlist
.end()),
187 size(termlist
.size()), started(false)
189 termcount
get_approx_size() const { return size
; }
190 termcount
get_wdf() const {
191 throw UnimplementedError("PointIterator doesn't support get_wdf()");
// Returns the key of the map entry `i` currently points at.
193 string
get_termname() const { return i
->first
; }
194 doccount
get_termfreq() const {
195 throw UnimplementedError("PointIterator doesn't support "
199 termcount
positionlist_count() const {
200 throw UnimplementedError("PointTermIterator doesn't support "
201 "positionlist_count()");
204 PositionList
* positionlist_begin() const {
205 throw UnimplementedError("PointTermIterator doesn't support "
206 "positionlist_begin()");
208 Internal
* skip_to(const string
&) {
209 throw UnimplementedError("PointTermIterator doesn't support skip_to()");
// Advance operation of the iterator; returns a TermIterator::Internal pointer.
// NOTE(review): the body is not present in this listing, so how `started`
// and `i` are updated and what pointer is returned cannot be stated here.
213 TermIterator::Internal
*
214 PointTermIterator::next()
// Reports whether iteration has finished. While `started` is still false the
// answer is always false.
// NOTE(review): the return type and the statement handling the started case
// are missing from this listing - presumably a comparison of `i` with `end`;
// confirm.
226 PointTermIterator::at_end() const
228 if (!started
) return false;
233 PointType::termlist_begin() const
235 LOGCALL(API
, TermIterator
, "PointType::termlist_begin", NO_ARGS
);
236 return TermIterator(new PointTermIterator(weights
));
240 PointType::contains(const string
& term
) const
242 LOGCALL(API
, bool, "PointType::contains", term
);
243 return weights
.find(term
) != weights
.end();
247 PointType::get_weight(const string
& term
) const
249 LOGCALL(API
, double, "PointType::get_weight", term
);
250 unordered_map
<string
, double>::const_iterator it
= weights
.find(term
);
251 return (it
== weights
.end()) ? 0.0 : it
->second
;
// Accessor for the point's magnitude; logged as returning a double.
// NOTE(review): the return-type line and the return statement are missing
// from this listing - presumably it returns the `magnitude` member; confirm.
255 PointType::get_magnitude() const {
256 LOGCALL(API
, double, "PointType::get_magnitude", NO_ARGS
);
261 PointType::add_weight(const string
& term
, double weight
)
263 LOGCALL_VOID(API
, "PointType::add_weight", term
| weight
);
264 unordered_map
<string
, double>::iterator it
;
265 it
= weights
.find(term
);
266 if (it
!= weights
.end())
267 it
->second
+= weight
;
269 weights
[term
] = weight
;
273 PointType::set_weight(const string
& term
, double weight
)
275 LOGCALL_VOID(API
, "PointType::set_weight", term
| weight
);
276 weights
[term
] = weight
;
280 PointType::termlist_size() const
282 LOGCALL(API
, termcount
, "PointType::termlist_size", NO_ARGS
);
283 return weights
.size();
// Accessor for the document this point was built from; logged as returning a
// Document.
// NOTE(review): the return-type line and the return statement are missing
// from this listing - presumably it returns the `document` member; confirm.
287 Point::get_document() const
289 LOGCALL(API
, Document
, "Point::get_document", NO_ARGS
);
// Builds a point from `document_`, using `freqsource` for the collection
// size (get_doccount) and per-term frequency (get_termfreq).
// For each term that passes the filter below, a weight
// wt = (1 + log(wdf)) * log(size / termfreq) is computed and wt*wt is added
// to `magnitude`, so `magnitude` accumulates squared weights.
// NOTE(review): this listing has lost the loop increment, the declaration of
// `term`, the statement guarded by the `if`, whatever stores `wt`, and the
// closing braces. Where `magnitude` gets its initial value is also not
// visible here - confirm against the complete file.
293 Point::Point(const FreqSource
& freqsource
, const Document
& document_
)
295 LOGCALL_CTOR(API
, "Point::initialize", freqsource
| document_
);
296 doccount size
= freqsource
.get_doccount();
297 document
= document_
;
298 for (TermIterator it
= document
.termlist_begin();
299 it
!= document
.termlist_end();
301 doccount wdf
= it
.get_wdf();
// termfreq is held as a double, so `size / termfreq` below is a
// floating-point division.
303 double termfreq
= freqsource
.get_termfreq(term
);
305 // If the term exists in only one document, or if it exists in
306 // every document within the MSet, or if it is a filter term, then
307 // these terms are not used for document vector calculations
308 if (wdf
< 1 || termfreq
<= 1 || size
== termfreq
)
311 double tf
= 1 + log(double(wdf
));
312 double idf
= log(size
/ termfreq
);
313 double wt
= tf
* idf
;
316 magnitude
+= wt
* wt
;
320 Centroid::Centroid(const Point
& point
)
322 LOGCALL_CTOR(API
, "Centroid", point
);
323 for (TermIterator it
= point
.termlist_begin();
324 it
!= point
.termlist_end();
326 weights
[*it
] = point
.get_weight(*it
);
328 magnitude
= point
.get_magnitude();
332 Centroid::divide(double cluster_size
)
334 LOGCALL_VOID(API
, "Centroid::divide", cluster_size
);
336 unordered_map
<string
, double>::iterator it
;
337 for (it
= weights
.begin(); it
!= weights
.end(); ++it
) {
338 double new_weight
= it
->second
/ cluster_size
;
339 it
->second
= new_weight
;
340 magnitude
+= new_weight
* new_weight
;
// Log statement of Centroid::clear().
// NOTE(review): the function header and the rest of its body are missing
// from this listing.
347 LOGCALL_VOID(API
, "Centroid::clear", NO_ARGS
);
352 Cluster::operator=(const Cluster
&) = default;
354 Cluster::Cluster(const Cluster
&) = default;
356 Cluster::Cluster(Cluster
&&) = default;
359 Cluster::operator=(Cluster
&&) = default;
361 Cluster::Cluster() : internal(new Xapian::Cluster::Internal
)
363 LOGCALL_CTOR(API
, "Cluster", NO_ARGS
);
366 Cluster::Cluster(const Centroid
& centroid
)
367 : internal(new Xapian::Cluster::Internal(centroid
))
369 LOGCALL_CTOR(API
, "Cluster", centroid
);
// Log statement of the Cluster destructor.
// NOTE(review): the destructor header and braces are missing from this
// listing.
374 LOGCALL_DTOR(API
, "Cluster");
// Log statement of the argument-less Centroid constructor.
// NOTE(review): the constructor header and braces are missing from this
// listing.
379 LOGCALL_CTOR(API
, "Centroid", NO_ARGS
);
// Returns the documents of this cluster by forwarding to the internal
// implementation object; logged as returning a DocumentSet.
// NOTE(review): the return-type line and braces are missing from this listing.
383 Cluster::get_documents() const
385 LOGCALL(API
, DocumentSet
, "Cluster::get_documents", NO_ARGS
);
386 return internal
->get_documents();
// Adds the document of every point in `cluster_docs` to `docs`.
// NOTE(review): the return type, the declaration of `docs` and the return
// statement are missing from this listing.
390 Cluster::Internal::get_documents() const
393 for (auto&& point
: cluster_docs
)
394 docs
.add_document(point
.get_document());
// Non-const element access: forwards index `i` to Internal::get_point().
// No bounds check is made in this function.
// NOTE(review): the return-type line and braces are missing from this listing.
399 Cluster::operator[](Xapian::doccount i
)
401 return internal
->get_point(i
);
// Returns element `i` of `cluster_docs` via operator[]; the index is not
// range-checked here.
// NOTE(review): the return-type line and braces are missing from this listing.
405 Cluster::Internal::get_point(Xapian::doccount i
)
407 return cluster_docs
[i
];
// Const element access: forwards index `i` to the const overload of
// Internal::get_point(). No bounds check is made in this function.
// NOTE(review): the return-type line and braces are missing from this listing.
411 Cluster::operator[](Xapian::doccount i
) const
413 return internal
->get_point(i
);
// Const overload: returns element `i` of `cluster_docs` via operator[]; the
// index is not range-checked here.
// NOTE(review): the return-type line and braces are missing from this listing.
417 Cluster::Internal::get_point(Xapian::doccount i
) const
419 return cluster_docs
[i
];
423 ClusterSet::operator=(const ClusterSet
&) = default;
425 ClusterSet::ClusterSet(const ClusterSet
&) = default;
428 ClusterSet::operator=(ClusterSet
&&) = default;
430 ClusterSet::ClusterSet(ClusterSet
&&) = default;
// Default constructor: initialises `internal` with a newly allocated
// ClusterSet::Internal.
// NOTE(review): the constructor body is not present in this listing.
432 ClusterSet::ClusterSet() : internal(new Xapian::ClusterSet::Internal
)
// Out-of-line destructor.
// NOTE(review): the destructor body is not present in this listing.
436 ClusterSet::~ClusterSet()
// Returns the number of elements held in `clusters`.
// NOTE(review): the return-type line and braces are missing from this listing.
441 ClusterSet::Internal::size() const
443 return clusters
.size();
447 ClusterSet::size() const
449 LOGCALL(API
, doccount
, "ClusterSet::size", NO_ARGS
);
450 return internal
->size();
454 ClusterSet::Internal::add_cluster(const Cluster
& cluster
)
456 clusters
.push_back(cluster
);
460 ClusterSet::add_cluster(const Cluster
& cluster
)
462 LOGCALL_VOID(API
, "ClusterSet::add_cluster", cluster
);
463 internal
->add_cluster(cluster
);
// Non-const lookup used by ClusterSet::operator[](doccount).
// NOTE(review): return type and body are not present in this listing.
467 ClusterSet::Internal::get_cluster(doccount i
)
// Non-const element access: forwards index `i` to Internal::get_cluster().
// No bounds check is made in this function.
// NOTE(review): the return-type line and braces are missing from this listing.
473 ClusterSet::operator[](doccount i
)
475 return internal
->get_cluster(i
);
// Const lookup used by ClusterSet::operator[](doccount) const.
// NOTE(review): return type and body are not present in this listing.
479 ClusterSet::Internal::get_cluster(doccount i
) const
// Const element access: forwards index `i` to the const overload of
// Internal::get_cluster(). No bounds check is made in this function.
// NOTE(review): the return-type line and braces are missing from this listing.
485 ClusterSet::operator[](doccount i
) const
487 return internal
->get_cluster(i
);
491 ClusterSet::Internal::add_to_cluster(const Point
& point
, unsigned int index
)
493 clusters
[index
].add_point(point
);
497 ClusterSet::add_to_cluster(const Point
& point
, unsigned int index
)
499 LOGCALL_VOID(API
, "ClusterSet::add_to_cluster", point
| index
);
500 internal
->add_to_cluster(point
, index
);
504 ClusterSet::Internal::recalculate_centroids()
506 for (auto&& cluster
: clusters
)
507 cluster
.recalculate();
511 ClusterSet::recalculate_centroids()
513 LOGCALL_VOID(API
, "ClusterSet::recalculate_centroids", NO_ARGS
);
514 internal
->recalculate_centroids();
518 ClusterSet::clear_clusters()
520 LOGCALL_VOID(API
, "ClusterSet::clear_clusters", NO_ARGS
);
521 internal
->clear_clusters();
// Iterates over every cluster in `clusters`.
// NOTE(review): the return type and the loop body are missing from this
// listing - presumably each cluster is cleared; confirm.
525 ClusterSet::Internal::clear_clusters()
527 for (auto&& cluster
: clusters
)
532 Cluster::size() const
534 LOGCALL(API
, doccount
, "Cluster::size", NO_ARGS
);
535 return internal
->size();
// Returns the number of points held in `cluster_docs`.
// NOTE(review): the return-type line and braces are missing from this listing.
539 Cluster::Internal::size() const
541 return (cluster_docs
.size());
545 Cluster::add_point(const Point
& point
)
547 LOGCALL_VOID(API
, "Cluster::add_point", point
);
548 internal
->add_point(point
);
552 Cluster::Internal::add_point(const Point
& point
)
554 cluster_docs
.push_back(point
);
// Log statement of Cluster::clear().
// NOTE(review): the function header and the rest of its body are missing
// from this listing.
560 LOGCALL_VOID(API
, "Cluster::clear", NO_ARGS
);
565 Cluster::Internal::clear()
567 cluster_docs
.clear();
// Returns the cluster's centroid by forwarding to the internal implementation
// object; logged as returning a Centroid.
// NOTE(review): the return-type line and braces are missing from this listing.
571 Cluster::get_centroid() const
573 LOGCALL(API
, Centroid
, "Cluster::get_centroid", NO_ARGS
);
574 return internal
->get_centroid();
// Accessor used by Cluster::get_centroid().
// NOTE(review): return type and body are not present in this listing.
578 Cluster::Internal::get_centroid() const
584 Cluster::set_centroid(const Centroid
& centroid_
)
586 LOGCALL_VOID(API
, "Cluster::set_centroid", centroid_
);
587 internal
->set_centroid(centroid_
);
591 Cluster::Internal::set_centroid(const Centroid
& centroid_
)
593 centroid
= centroid_
;
597 Cluster::recalculate()
599 LOGCALL_VOID(API
, "Cluster::recalculate", NO_ARGS
);
600 internal
->recalculate();
// Rebuilds `centroid` from the points in `cluster_docs`: every term weight of
// every point is added into the centroid, then the centroid is divided by the
// number of points (size()).
// NOTE(review): this listing has lost the return type, one line before the
// outer loop, the inner loop's increment and the closing braces. Whether the
// centroid is emptied before accumulating is therefore not visible here -
// confirm against the complete file.
604 Cluster::Internal::recalculate()
607 for (const Point
& temp
: cluster_docs
) {
608 for (TermIterator titer
= temp
.termlist_begin();
609 titer
!= temp
.termlist_end();
611 centroid
.add_weight(*titer
, temp
.get_weight(*titer
));
614 centroid
.divide(size());
617 StemStopper::StemStopper(const Stem
& stemmer_
, stem_strategy strategy
)
618 : stem_action(strategy
), stemmer(stemmer_
)
620 LOGCALL_CTOR(API
, "StemStopper", stemmer_
| strategy
);
// Builds a description string that starts with "Xapian::StemStopper(" and
// walks `stop_words`, writing a single space before every entry except the
// first.
// NOTE(review): the return type, the statement that appends each word, the
// closing text and the return statement are missing from this listing.
624 StemStopper::get_description() const
626 string
desc("Xapian::StemStopper(");
627 unordered_set
<string
>::const_iterator i
;
628 for (i
= stop_words
.begin(); i
!= stop_words
.end(); ++i
) {
629 if (i
!= stop_words
.begin()) desc
+= ' ';
637 StemStopper::add(const string
& term
)
639 LOGCALL_VOID(API
, "StemStopper::add", term
);
640 switch (stem_action
) {
642 stop_words
.insert(term
);
645 stop_words
.insert('Z' + stemmer(term
));
648 stop_words
.insert(stemmer(term
));
651 case STEM_SOME_FULL_POS
:
652 stop_words
.insert(term
);
653 stop_words
.insert('Z' + stemmer(term
));