/** @file tfidfweight.cc
 * @brief Xapian::TfIdfWeight class - The TfIdf weighting scheme
 */
/* Copyright (C) 2013 Aarsh Shah
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xapian/weight.h"

#include <cmath>
#include <cstring>

#include "xapian/error.h"
36 TfIdfWeight::TfIdfWeight(const std::string
&normals
)
37 : normalizations(normals
)
39 if (normalizations
.length() != 3 ||
40 !strchr("nbsl", normalizations
[0]) ||
41 !strchr("ntp", normalizations
[1]) ||
42 !strchr("n", normalizations
[2]))
43 throw Xapian::InvalidArgumentError("Normalization string is invalid");
44 if (normalizations
[1] != 'n') {
46 need_stat(COLLECTION_SIZE
);
53 TfIdfWeight::clone() const
55 return new TfIdfWeight(normalizations
);
59 TfIdfWeight::init(double factor_
)
65 TfIdfWeight::name() const
67 return "Xapian::TfIdfWeight";
71 TfIdfWeight::serialise() const
73 return normalizations
;
77 TfIdfWeight::unserialise(const string
& s
) const
80 throw Xapian::SerialisationError("Extra data in TfIdfWeight::unserialise()");
81 return new TfIdfWeight(s
);
85 TfIdfWeight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount
,
86 Xapian::termcount
) const
88 Xapian::doccount termfreq
= 1;
89 if (normalizations
[1] != 'n') termfreq
= get_termfreq();
90 double wt
= get_wdfn(wdf
, normalizations
[0]) *
91 get_idfn(termfreq
, normalizations
[1]);
92 return get_wtn(wt
, normalizations
[2]) * factor
;
95 // An upper bound can be calculated simply on the basis of wdf_max as termfreq
96 // and N are constants.
98 TfIdfWeight::get_maxpart() const
100 Xapian::doccount termfreq
= 1;
101 if (normalizations
[1] != 'n') termfreq
= get_termfreq();
102 Xapian::termcount wdf_max
= get_wdf_upper_bound();
103 double wt
= get_wdfn(wdf_max
, normalizations
[0]) *
104 get_idfn(termfreq
, normalizations
[1]);
105 return get_wtn(wt
, normalizations
[2]) * factor
;
108 // There is no extra per document component in the TfIdfWeighting scheme.
110 TfIdfWeight::get_sumextra(Xapian::termcount
, Xapian::termcount
) const
116 TfIdfWeight::get_maxextra() const
121 // Return normalized wdf, idf and weight depending on the normalization string.
123 TfIdfWeight::get_wdfn(Xapian::termcount wdf
, char c
) const
127 if (wdf
== 0) return 0;
132 if (wdf
== 0) return 0;
133 return (1 + log(double(wdf
)));
141 TfIdfWeight::get_idfn(Xapian::doccount termfreq
, char c
) const
144 if (c
!= 'n') N
= get_collection_size();
149 // All documents are indexed by the term
150 if (N
== termfreq
) return 0;
151 return log((N
- termfreq
) / termfreq
);
154 return (log(N
/ termfreq
));
159 TfIdfWeight::get_wtn(double wt
, char c
) const