Add Indonesian stemmer
[xapian.git] / xapian-letor / api / featurelist_internal.cc
blob893bf65177ed08714562c836b2cbca3fd5e04091
/** @file featurelist_internal.cc
 * @brief Definition of FeatureList::Internal class.
 */
/* Copyright (C) 2012 Parth Gupta
 * Copyright (C) 2016 Ayush Tomar
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
23 #include <config.h>
25 #include "xapian-letor/featurelist.h"
26 #include "featurelist_internal.h"
28 #include <cmath>
29 #include <cstdio>
30 #include <cstdlib>
31 #include <cstring>
32 #include "debuglog.h"
34 using namespace std;
35 using namespace Xapian;
37 void
38 FeatureList::Internal::set_database(const Xapian::Database & db)
40 featurelist_db = db;
43 void
44 FeatureList::Internal::set_query(const Xapian::Query & query)
46 featurelist_query = query;
49 void
50 FeatureList::Internal::set_doc(const Xapian::Document & doc)
52 featurelist_doc = doc;
55 void
56 FeatureList::Internal::set_data(const Xapian::Query & letor_query,
57 const Xapian::Database & letor_db,
58 const Xapian::Document & letor_doc)
60 set_query(letor_query);
61 set_doc(letor_doc);
62 set_database(letor_db);
65 void
66 FeatureList::Internal::compute_termfreq()
68 std::map<std::string, Xapian::termcount> tf;
70 Xapian::TermIterator docterms = featurelist_doc.termlist_begin();
71 for (Xapian::TermIterator qt = featurelist_query.get_unique_terms_begin();
72 qt != featurelist_query.get_terms_end(); ++qt) {
73 docterms.skip_to(*qt);
74 if (docterms != featurelist_doc.termlist_end() && *qt == *docterms)
75 tf[*qt] = docterms.get_wdf();
77 std::swap(termfreq, tf);
80 void
81 FeatureList::Internal::compute_inverse_doc_freq()
83 std::map<std::string, double> idf;
84 Xapian::doccount totaldocs = featurelist_db.get_doccount();
86 for (Xapian::TermIterator qt = featurelist_query.get_unique_terms_begin();
87 qt != featurelist_query.get_terms_end(); ++qt) {
88 Xapian::doccount df = featurelist_db.get_termfreq(*qt);
89 if (df != 0)
90 idf[*qt] = log10((double)totaldocs / (double)(1 + df));
92 std::swap(inverse_doc_freq, idf);
95 void
96 FeatureList::Internal::compute_doc_length()
98 std::map<std::string, Xapian::termcount> len;
100 Xapian::termcount title_len = 0;
101 Xapian::TermIterator dt = featurelist_doc.termlist_begin();
102 // reach the iterator to the start of the title terms i.e. prefix "S"
103 dt.skip_to("S");
104 for ( ; dt != featurelist_doc.termlist_end(); ++dt) {
105 if ((*dt)[0] != 'S') {
106 // We've reached the end of the S-prefixed terms.
107 break;
109 title_len += dt.get_wdf();
111 len["title"] = title_len;
112 Xapian::termcount whole_len =
113 featurelist_db.get_doclength(featurelist_doc.get_docid());
114 len["whole"] = whole_len;
115 len["body"] = whole_len - title_len;
116 std::swap(doc_length, len);
119 void
120 FeatureList::Internal::compute_collection_length()
122 std::map<std::string, Xapian::termcount> len;
124 if (!featurelist_db.get_metadata("collection_len_title").empty() &&
125 !featurelist_db.get_metadata("collection_len_body").empty() &&
126 !featurelist_db.get_metadata("collection_len_whole").empty()) {
127 len["title"] =
128 atol(featurelist_db.get_metadata("collection_len_title").c_str());
129 len["body"] =
130 atol(featurelist_db.get_metadata("collection_len_body").c_str());
131 len["whole"] =
132 atol(featurelist_db.get_metadata("collection_len_whole").c_str());
133 } else {
134 Xapian::termcount title_len = 0;
135 Xapian::TermIterator dt = featurelist_db.allterms_begin("S");
136 for ( ; dt != featurelist_db.allterms_end("S"); ++dt) {
137 // because we don't want the unique terms so we want their
138 // original frequencies and i.e. the total size of the title collection.
139 title_len += featurelist_db.get_collection_freq(*dt);
141 len["title"] = title_len;
142 Xapian::termcount whole_len = featurelist_db.get_avlength() *
143 featurelist_db.get_doccount();
144 len["whole"] = whole_len;
145 len["body"] = whole_len - title_len;
147 std::swap(collection_length, len);
150 void
151 FeatureList::Internal::compute_collection_termfreq()
153 std::map<std::string, Xapian::termcount> tf;
155 for (Xapian::TermIterator qt = featurelist_query.get_unique_terms_begin();
156 qt != featurelist_query.get_terms_end(); ++qt) {
157 Xapian::termcount coll_tf = featurelist_db.get_collection_freq(*qt);
158 if (coll_tf != 0)
159 tf[*qt] = coll_tf;
161 std::swap(collection_termfreq, tf);
164 void
165 FeatureList::Internal::populate_feature(Feature *feature_)
167 stat_flags stats_needed = stat_flags(feature_->get_stats());
168 if (stats_needed & TERM_FREQUENCY) {
169 if (termfreq.empty())
170 compute_termfreq();
171 feature_->set_termfreq(termfreq);
173 if (stats_needed & INVERSE_DOCUMENT_FREQUENCY) {
174 if (inverse_doc_freq.empty())
175 compute_inverse_doc_freq();
176 feature_->set_inverse_doc_freq(inverse_doc_freq);
178 if (stats_needed & DOCUMENT_LENGTH) {
179 if (doc_length.empty())
180 compute_doc_length();
181 feature_->set_doc_length(doc_length);
183 if (stats_needed & COLLECTION_LENGTH) {
184 if (collection_length.empty())
185 compute_collection_length();
186 feature_->set_collection_length(collection_length);
188 if (stats_needed & COLLECTION_TERM_FREQ) {
189 if (collection_termfreq.empty())
190 compute_collection_termfreq();
191 feature_->set_collection_termfreq(collection_termfreq);
195 void
196 FeatureList::Internal::clear_stats()
198 termfreq.clear();
199 inverse_doc_freq.clear();
200 doc_length.clear();
201 collection_length.clear();
202 collection_termfreq.clear();