4 /* Copyright (C) 2012 Parth Gupta
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 #include "letor_features.h"
33 using namespace Xapian
;
36 Features::termfreq(const Xapian::Document
& doc
, const Xapian::Query
& query
) {
37 map
<string
, long int> tf
;
39 Xapian::TermIterator docterms
= doc
.termlist_begin();
40 for (Xapian::TermIterator qt
= query
.get_terms_begin();
41 qt
!= query
.get_terms_end(); ++qt
) {
42 docterms
.skip_to(*qt
);
43 if (docterms
!= doc
.termlist_end() && *qt
== *docterms
) {
44 tf
[*qt
] = docterms
.get_wdf();
53 Features::inverse_doc_freq(const Xapian::Database
& db
, const Xapian::Query
& query
) {
54 map
<string
, double> idf
;
56 for (Xapian::TermIterator qt
= query
.get_terms_begin();
57 qt
!= query
.get_terms_end(); ++qt
) {
58 if (db
.term_exists(*qt
)) {
59 long int totaldocs
= db
.get_doccount();
60 long int df
= db
.get_termfreq(*qt
);
61 idf
[*qt
] = log10(totaldocs
/ (1 + df
));
70 Features::doc_length(const Xapian::Database
& db
, const Xapian::Document
& doc
) {
71 map
<string
, long int> len
;
73 long int temp_count
= 0;
74 Xapian::TermIterator dt
= doc
.termlist_begin();
75 dt
.skip_to("S"); //reach the iterator to the start of the title terms i.e. prefix "S"
76 for ( ; dt
!= doc
.termlist_end(); ++dt
) {
77 if ((*dt
)[0] != 'S') {
78 // We've reached the end of the S-prefixed terms.
81 temp_count
+= dt
.get_wdf();
83 len
["title"] = temp_count
;
84 len
["whole"] = db
.get_doclength(doc
.get_docid());
85 len
["body"] = len
["whole"] - len
["title"];
90 Features::collection_length(const Xapian::Database
& db
) {
91 map
<string
, long int> len
;
93 if (!db
.get_metadata("collection_len_title").empty() && !db
.get_metadata("collection_len_body").empty() && !db
.get_metadata("collection_len_whole").empty()) {
94 len
["title"] = atol(db
.get_metadata("collection_len_title").c_str());
95 len
["body"] = atol(db
.get_metadata("collection_len_body").c_str());
96 len
["whole"] = atol(db
.get_metadata("collection_len_whole").c_str());
98 long int temp_count
= 0;
99 Xapian::TermIterator dt
= db
.allterms_begin("S");
100 for ( ; dt
!= db
.allterms_end("S"); ++dt
) {
101 temp_count
+= db
.get_collection_freq(*dt
); // because we don't want the unique terms so we want their original frequencies and i.e. the total size of the title collection.
103 len
["title"] = temp_count
;
104 len
["whole"] = db
.get_avlength() * db
.get_doccount();
105 len
["body"] = len
["whole"] - len
["title"];
110 map
<string
, long int>
111 Features::collection_termfreq(const Xapian::Database
& db
, const Xapian::Query
& query
) {
112 map
<string
, long int> tf
;
114 for (Xapian::TermIterator qt
= query
.get_terms_begin();
115 qt
!= query
.get_terms_end(); ++qt
) {
116 if (db
.term_exists(*qt
))
117 tf
[*qt
] = db
.get_collection_freq(*qt
);
125 Features::calculate_f1(const Xapian::Query
& query
, map
<string
, long int> & tf
, char ch
) {
128 if (ch
== 't') { // if feature1 for title
129 for (Xapian::TermIterator qt
= query
.get_terms_begin();
130 qt
!= query
.get_terms_end(); ++qt
) {
131 if ((*qt
).substr(0, 1) == "S" || (*qt
).substr(1, 1) == "S") {
132 value
+= log10(1 + tf
[*qt
]); // always use log10(1+quantity) because log(1) = 0 and log(0) = -inf
133 } else // if there is no title information stored with standart "S" prefix
137 } else if (ch
== 'b') { // if for body only
138 for (Xapian::TermIterator qt
= query
.get_terms_begin();
139 qt
!= query
.get_terms_end(); ++qt
) {
140 if ((*qt
).substr(0, 1) != "S" && (*qt
).substr(1, 1) != "S") {
141 value
+= log10(1 + tf
[*qt
]); // always use log10(1+quantity) because log(1) = 0 and log(0) = -inf
146 } else { // if for whole document
147 for (Xapian::TermIterator qt
= query
.get_terms_begin();
148 qt
!= query
.get_terms_end(); ++qt
) {
149 value
+= log10(1 + tf
[*qt
]); // always use log10(1+quantity) because log(1) = 0 and log(0) = -inf
157 Features::calculate_f2(const Xapian::Query
& query
, map
<string
, long int> & tf
, map
<string
, long int> & doc_len
, char ch
) {
160 if (ch
== 't') { //if feature1 for title then
161 for (Xapian::TermIterator qt
= query
.get_terms_begin();
162 qt
!= query
.get_terms_end(); ++qt
) {
163 if ((*qt
).substr(0, 1) == "S" || (*qt
).substr(1, 1) == "S") {
164 value
+= log10(1 + ((double)tf
[*qt
] / (1 + (double)doc_len
["title"]))); //always use log10(1+quantity) because log(1) = 0 and log(0) = -inf
168 } else if (ch
== 'b') {
169 for (Xapian::TermIterator qt
= query
.get_terms_begin();
170 qt
!= query
.get_terms_end(); ++qt
) {
171 if ((*qt
).substr(0, 1) != "S" && (*qt
).substr(1, 1) != "S") {
172 value
+= log10(1 + ((double)tf
[*qt
] / (1 + (double)doc_len
["body"])));
177 for (Xapian::TermIterator qt
= query
.get_terms_begin();
178 qt
!= query
.get_terms_end(); ++qt
) {
179 value
+= log10(1 + ((double)tf
[*qt
] / (1 + (double)doc_len
["whole"])));
186 Features::calculate_f3(const Xapian::Query
& query
, map
<string
, double> & idf
, char ch
) {
190 for (Xapian::TermIterator qt
= query
.get_terms_begin();
191 qt
!= query
.get_terms_end(); ++qt
) {
192 if ((*qt
).substr(0, 1) == "S" || (*qt
).substr(1, 1) == "S") {
193 value
+= log10(1 + idf
[*qt
]);
198 } else if (ch
== 'b') {
199 for (Xapian::TermIterator qt
= query
.get_terms_begin();
200 qt
!= query
.get_terms_end(); ++qt
) {
201 if ((*qt
).substr(0, 1) != "S" && (*qt
).substr(1, 1) != "S") {
202 value
+= log10(1 + idf
[*qt
]);
208 for (Xapian::TermIterator qt
= query
.get_terms_begin();
209 qt
!= query
.get_terms_end(); ++qt
) {
210 value
+= log10(1 + idf
[*qt
]);
217 Features::calculate_f4(const Xapian::Query
& query
, map
<string
, long int> & tf
, map
<string
, long int> & coll_len
, char ch
) {
221 for (Xapian::TermIterator qt
= query
.get_terms_begin();
222 qt
!= query
.get_terms_end(); ++qt
) {
223 if ((*qt
).substr(0, 1) == "S" || (*qt
).substr(1, 1) == "S") {
224 value
+= log10(1 + ((double)coll_len
["title"] / (double)(1 + tf
[*qt
])));
229 } else if (ch
== 'b') {
230 for (Xapian::TermIterator qt
= query
.get_terms_begin();
231 qt
!= query
.get_terms_end(); ++qt
) {
232 if ((*qt
).substr(0, 1) != "S" && (*qt
).substr(1, 1) != "S") {
233 value
+= log10(1 + ((double)coll_len
["body"] / (double)(1 + tf
[*qt
])));
239 for (Xapian::TermIterator qt
= query
.get_terms_begin();
240 qt
!= query
.get_terms_end(); ++qt
) {
241 value
+= log10(1 + ((double)coll_len
["whole"] / (double)(1 + tf
[*qt
])));
248 Features::calculate_f5(const Xapian::Query
& query
, map
<string
, long int> & tf
, map
<string
, double> & idf
, map
<string
, long int> & doc_len
, char ch
) {
252 for (Xapian::TermIterator qt
= query
.get_terms_begin();
253 qt
!= query
.get_terms_end(); ++qt
) {
254 if ((*qt
).substr(0, 1) == "S" || (*qt
).substr(1, 1) == "S") {
255 value
+= log10(1 + ((double)(tf
[*qt
] * idf
[*qt
]) / (1 + (double)doc_len
["title"]))); // 1 + doc_len because if title info is not available then doc_len["title"] will be zero.
260 } else if (ch
== 'b') {
261 for (Xapian::TermIterator qt
= query
.get_terms_begin();
262 qt
!= query
.get_terms_end(); ++qt
) {
263 if ((*qt
).substr(0, 1) != "S" && (*qt
).substr(1, 1) != "S") {
264 value
+= log10(1 + ((double)(tf
[*qt
] * idf
[*qt
]) / (1 + (double)doc_len
["body"])));
270 for (Xapian::TermIterator qt
= query
.get_terms_begin();
271 qt
!= query
.get_terms_end(); ++qt
) {
272 value
+= log10(1 + ((double)(tf
[*qt
] * idf
[*qt
]) / (1 + (double)doc_len
["whole"])));
279 Features::calculate_f6(const Xapian::Query
& query
, map
<string
, long int> & tf
, map
<string
, long int> & doc_len
, map
<string
, long int> & coll_tf
, map
<string
, long int> & coll_length
, char ch
) {
283 for (Xapian::TermIterator qt
= query
.get_terms_begin();
284 qt
!= query
.get_terms_end(); ++qt
) {
285 if ((*qt
).substr(0, 1) == "S" || (*qt
).substr(1, 1) == "S") {
286 value
+= log10(1 + (((double)tf
[*qt
] * (double)coll_length
["title"]) / (double)(1 + ((double)doc_len
["title"] * (double)coll_tf
[*qt
]))));
291 } else if (ch
== 'b') {
292 for (Xapian::TermIterator qt
= query
.get_terms_begin();
293 qt
!= query
.get_terms_end(); ++qt
) {
294 if ((*qt
).substr(0, 1) != "S" && (*qt
).substr(1, 1) != "S") {
295 value
+= log10(1 + (((double)tf
[*qt
] * (double)coll_length
["body"]) / (double)(1 + ((double)doc_len
["body"] * (double)coll_tf
[*qt
]))));
301 for (Xapian::TermIterator qt
= query
.get_terms_begin();
302 qt
!= query
.get_terms_end(); ++qt
) {
303 value
+= log10(1+(((double)tf
[*qt
] * (double)coll_length
["whole"]) / (double)(1 + ((double)doc_len
["whole"] * (double)coll_tf
[*qt
]))));