Fix replication for files > 4GB on 32-bit platforms
[xapian.git] / xapian-letor / letor_features.cc
blob876944f245d28a69def0c38df9737a63fd212638
1 /** @file letor_features.cc
2 * @brief Features class
3 */
4 /* Copyright (C) 2012 Parth Gupta
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include "letor_features.h"
23 #include <math.h>
24 #include <cstdio>
25 #include <cstdlib>
26 #include <cstring>
31 using namespace std;
33 using namespace Xapian;
35 map<string, long int>
36 Features::termfreq(const Xapian::Document & doc, const Xapian::Query & query) {
37 map<string, long int> tf;
39 Xapian::TermIterator docterms = doc.termlist_begin();
40 for (Xapian::TermIterator qt = query.get_terms_begin();
41 qt != query.get_terms_end(); ++qt) {
42 docterms.skip_to(*qt);
43 if (docterms != doc.termlist_end() && *qt == *docterms) {
44 tf[*qt] = docterms.get_wdf();
45 } else {
46 tf[*qt] = 0;
49 return tf;
52 map<string, double>
53 Features::inverse_doc_freq(const Xapian::Database & db, const Xapian::Query & query) {
54 map<string, double> idf;
56 for (Xapian::TermIterator qt = query.get_terms_begin();
57 qt != query.get_terms_end(); ++qt) {
58 if (db.term_exists(*qt)) {
59 long int totaldocs = db.get_doccount();
60 long int df = db.get_termfreq(*qt);
61 idf[*qt] = log10(totaldocs / (1 + df));
62 } else {
63 idf[*qt] = 0;
66 return idf;
69 map<string, long int>
70 Features::doc_length(const Xapian::Database & db, const Xapian::Document & doc) {
71 map<string, long int> len;
73 long int temp_count = 0;
74 Xapian::TermIterator dt = doc.termlist_begin();
75 dt.skip_to("S"); //reach the iterator to the start of the title terms i.e. prefix "S"
76 for ( ; dt != doc.termlist_end(); ++dt) {
77 if ((*dt)[0] != 'S') {
78 // We've reached the end of the S-prefixed terms.
79 break;
81 temp_count += dt.get_wdf();
83 len["title"] = temp_count;
84 len["whole"] = db.get_doclength(doc.get_docid());
85 len["body"] = len["whole"] - len["title"];
86 return len;
89 map<string, long int>
90 Features::collection_length(const Xapian::Database & db) {
91 map<string, long int> len;
93 if (!db.get_metadata("collection_len_title").empty() && !db.get_metadata("collection_len_body").empty() && !db.get_metadata("collection_len_whole").empty()) {
94 len["title"] = atol(db.get_metadata("collection_len_title").c_str());
95 len["body"] = atol(db.get_metadata("collection_len_body").c_str());
96 len["whole"] = atol(db.get_metadata("collection_len_whole").c_str());
97 } else {
98 long int temp_count = 0;
99 Xapian::TermIterator dt = db.allterms_begin("S");
100 for ( ; dt != db.allterms_end("S"); ++dt) {
101 temp_count += db.get_collection_freq(*dt); // because we don't want the unique terms so we want their original frequencies and i.e. the total size of the title collection.
103 len["title"] = temp_count;
104 len["whole"] = db.get_avlength() * db.get_doccount();
105 len["body"] = len["whole"] - len["title"];
107 return len;
110 map<string, long int>
111 Features::collection_termfreq(const Xapian::Database & db, const Xapian::Query & query) {
112 map<string, long int> tf;
114 for (Xapian::TermIterator qt = query.get_terms_begin();
115 qt != query.get_terms_end(); ++qt) {
116 if (db.term_exists(*qt))
117 tf[*qt] = db.get_collection_freq(*qt);
118 else
119 tf[*qt] = 0;
121 return tf;
124 double
125 Features::calculate_f1(const Xapian::Query & query, map<string, long int> & tf, char ch) {
126 double value = 0;
128 if (ch == 't') { // if feature1 for title
129 for (Xapian::TermIterator qt = query.get_terms_begin();
130 qt != query.get_terms_end(); ++qt) {
131 if ((*qt).substr(0, 1) == "S" || (*qt).substr(1, 1) == "S") {
132 value += log10(1 + tf[*qt]); // always use log10(1+quantity) because log(1) = 0 and log(0) = -inf
133 } else // if there is no title information stored with standart "S" prefix
134 value += 0;
136 return value;
137 } else if (ch == 'b') { // if for body only
138 for (Xapian::TermIterator qt = query.get_terms_begin();
139 qt != query.get_terms_end(); ++qt) {
140 if ((*qt).substr(0, 1) != "S" && (*qt).substr(1, 1) != "S") {
141 value += log10(1 + tf[*qt]); // always use log10(1+quantity) because log(1) = 0 and log(0) = -inf
142 } else
143 value += 0;
145 return value;
146 } else { // if for whole document
147 for (Xapian::TermIterator qt = query.get_terms_begin();
148 qt != query.get_terms_end(); ++qt) {
149 value += log10(1 + tf[*qt]); // always use log10(1+quantity) because log(1) = 0 and log(0) = -inf
151 return value;
156 double
157 Features::calculate_f2(const Xapian::Query & query, map<string, long int> & tf, map<string, long int> & doc_len, char ch) {
158 double value = 0;
160 if (ch == 't') { //if feature1 for title then
161 for (Xapian::TermIterator qt = query.get_terms_begin();
162 qt != query.get_terms_end(); ++qt) {
163 if ((*qt).substr(0, 1) == "S" || (*qt).substr(1, 1) == "S") {
164 value += log10(1 + ((double)tf[*qt] / (1 + (double)doc_len["title"]))); //always use log10(1+quantity) because log(1) = 0 and log(0) = -inf
167 return value;
168 } else if (ch == 'b') {
169 for (Xapian::TermIterator qt = query.get_terms_begin();
170 qt != query.get_terms_end(); ++qt) {
171 if ((*qt).substr(0, 1) != "S" && (*qt).substr(1, 1) != "S") {
172 value += log10(1 + ((double)tf[*qt] / (1 + (double)doc_len["body"])));
175 return value;
176 } else {
177 for (Xapian::TermIterator qt = query.get_terms_begin();
178 qt != query.get_terms_end(); ++qt) {
179 value += log10(1 + ((double)tf[*qt] / (1 + (double)doc_len["whole"])));
181 return value;
185 double
186 Features::calculate_f3(const Xapian::Query & query, map<string, double> & idf, char ch) {
187 double value = 0;
189 if (ch == 't') {
190 for (Xapian::TermIterator qt = query.get_terms_begin();
191 qt != query.get_terms_end(); ++qt) {
192 if ((*qt).substr(0, 1) == "S" || (*qt).substr(1, 1) == "S") {
193 value += log10(1 + idf[*qt]);
194 } else
195 value += 0;
197 return value;
198 } else if (ch == 'b') {
199 for (Xapian::TermIterator qt = query.get_terms_begin();
200 qt != query.get_terms_end(); ++qt) {
201 if ((*qt).substr(0, 1) != "S" && (*qt).substr(1, 1) != "S") {
202 value += log10(1 + idf[*qt]);
203 } else
204 value += 0;
206 return value;
207 } else {
208 for (Xapian::TermIterator qt = query.get_terms_begin();
209 qt != query.get_terms_end(); ++qt) {
210 value += log10(1 + idf[*qt]);
212 return value;
216 double
217 Features::calculate_f4(const Xapian::Query & query, map<string, long int> & tf, map<string, long int> & coll_len, char ch) {
218 double value = 0;
220 if (ch == 't') {
221 for (Xapian::TermIterator qt = query.get_terms_begin();
222 qt != query.get_terms_end(); ++qt) {
223 if ((*qt).substr(0, 1) == "S" || (*qt).substr(1, 1) == "S") {
224 value += log10(1 + ((double)coll_len["title"] / (double)(1 + tf[*qt])));
225 } else
226 value += 0;
228 return value;
229 } else if (ch == 'b') {
230 for (Xapian::TermIterator qt = query.get_terms_begin();
231 qt != query.get_terms_end(); ++qt) {
232 if ((*qt).substr(0, 1) != "S" && (*qt).substr(1, 1) != "S") {
233 value += log10(1 + ((double)coll_len["body"] / (double)(1 + tf[*qt])));
234 } else
235 value += 0;
237 return value;
238 } else {
239 for (Xapian::TermIterator qt = query.get_terms_begin();
240 qt != query.get_terms_end(); ++qt) {
241 value += log10(1 + ((double)coll_len["whole"] / (double)(1 + tf[*qt])));
243 return value;
247 double
248 Features::calculate_f5(const Xapian::Query & query, map<string, long int> & tf, map<string, double> & idf, map<string, long int> & doc_len, char ch) {
249 double value = 0;
251 if (ch == 't') {
252 for (Xapian::TermIterator qt = query.get_terms_begin();
253 qt != query.get_terms_end(); ++qt) {
254 if ((*qt).substr(0, 1) == "S" || (*qt).substr(1, 1) == "S") {
255 value += log10(1 + ((double)(tf[*qt] * idf[*qt]) / (1 + (double)doc_len["title"]))); // 1 + doc_len because if title info is not available then doc_len["title"] will be zero.
256 } else
257 value += 0;
259 return value;
260 } else if (ch == 'b') {
261 for (Xapian::TermIterator qt = query.get_terms_begin();
262 qt != query.get_terms_end(); ++qt) {
263 if ((*qt).substr(0, 1) != "S" && (*qt).substr(1, 1) != "S") {
264 value += log10(1 + ((double)(tf[*qt] * idf[*qt]) / (1 + (double)doc_len["body"])));
265 } else
266 value += 0;
268 return value;
269 } else {
270 for (Xapian::TermIterator qt = query.get_terms_begin();
271 qt != query.get_terms_end(); ++qt) {
272 value += log10(1 + ((double)(tf[*qt] * idf[*qt]) / (1 + (double)doc_len["whole"])));
274 return value;
278 double
279 Features::calculate_f6(const Xapian::Query & query, map<string, long int> & tf, map<string, long int> & doc_len, map<string, long int> & coll_tf, map<string, long int> & coll_length, char ch) {
280 double value = 0;
282 if (ch == 't') {
283 for (Xapian::TermIterator qt = query.get_terms_begin();
284 qt != query.get_terms_end(); ++qt) {
285 if ((*qt).substr(0, 1) == "S" || (*qt).substr(1, 1) == "S") {
286 value += log10(1 + (((double)tf[*qt] * (double)coll_length["title"]) / (double)(1 + ((double)doc_len["title"] * (double)coll_tf[*qt]))));
287 } else
288 value += 0;
290 return value;
291 } else if (ch == 'b') {
292 for (Xapian::TermIterator qt = query.get_terms_begin();
293 qt != query.get_terms_end(); ++qt) {
294 if ((*qt).substr(0, 1) != "S" && (*qt).substr(1, 1) != "S") {
295 value += log10(1 + (((double)tf[*qt] * (double)coll_length["body"]) / (double)(1 + ((double)doc_len["body"] * (double)coll_tf[*qt]))));
296 } else
297 value += 0;
299 return value;
300 } else {
301 for (Xapian::TermIterator qt = query.get_terms_begin();
302 qt != query.get_terms_end(); ++qt) {
303 value += log10(1+(((double)tf[*qt] * (double)coll_length["whole"]) / (double)(1 + ((double)doc_len["whole"] * (double)coll_tf[*qt]))));
305 return value;