Correct typo: .clangformat -> .clang-format
[xapian.git] / xapian-letor / feature / tfidfdoclenfeature.cc
bloba4828335a9fad4ef548aa913e4213522c15fc6cf
1 /** @file tfidfdoclenfeature.cc
2 * @brief TfIdfDoclenFeature class
3 */
4 /* Copyright (C) 2012 Parth Gupta
5 * Copyright (C) 2016 Ayush Tomar
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "xapian-letor/feature.h"
26 #include "debuglog.h"
27 #include "stringutils.h"
29 using namespace std;
31 namespace Xapian {
33 string
34 TfIdfDoclenFeature::name() const
36 return "TfIdfDoclenFeature";
39 /** A helper function for feature->get_value()
41 * Checks if the term belongs to the title or is stemmed from the title.
43 inline bool
44 is_title_term(const std::string& term)
46 return startswith(term, 'S') || startswith(term, "ZS");
49 vector<double>
50 TfIdfDoclenFeature::get_values() const
52 LOGCALL(API, vector<double>, "TfIdfDoclenFeature::get_values", NO_ARGS);
54 vector<double> values;
55 double value = 0;
56 double doc_len;
57 auto doc_len_iterator = doc_length.find("title");
58 if (doc_len_iterator != doc_length.end())
59 doc_len = (double)doc_len_iterator->second;
60 else
61 doc_len = 0;
63 for (Xapian::TermIterator qt = feature_query.get_unique_terms_begin();
64 qt != feature_query.get_terms_end(); ++qt) {
65 if (is_title_term((*qt))) {
66 double tf;
67 double idf;
68 auto tf_iterator = termfreq.find(*qt);
69 auto idf_iterator = inverse_doc_freq.find(*qt);
70 if (tf_iterator != termfreq.end())
71 tf = (double)tf_iterator->second;
72 else
73 tf = 0;
74 if (idf_iterator != inverse_doc_freq.end())
75 idf = idf_iterator->second;
76 else
77 idf = 0;
78 value += log10(1 + ((tf * idf) / (1 + doc_len)));
81 values.push_back(value);
82 value = 0;
83 doc_len_iterator = doc_length.find("body");
84 if (doc_len_iterator != doc_length.end())
85 doc_len = (double)doc_len_iterator->second;
86 else
87 doc_len = 0;
89 for (Xapian::TermIterator qt = feature_query.get_unique_terms_begin();
90 qt != feature_query.get_terms_end(); ++qt) {
91 if (!is_title_term((*qt))) {
92 double tf;
93 double idf;
94 auto tf_iterator = termfreq.find(*qt);
95 auto idf_iterator = inverse_doc_freq.find(*qt);
96 if (tf_iterator != termfreq.end())
97 tf = (double)tf_iterator->second;
98 else
99 tf = 0;
100 if (idf_iterator != inverse_doc_freq.end())
101 idf = idf_iterator->second;
102 else
103 idf = 0;
104 value += log10(1 + ((tf * idf) / (1 + doc_len)));
107 values.push_back(value);
108 value = 0;
109 doc_len_iterator = doc_length.find("whole");
110 if (doc_len_iterator != doc_length.end())
111 doc_len = (double)doc_len_iterator->second;
112 else
113 doc_len = 0;
115 for (Xapian::TermIterator qt = feature_query.get_unique_terms_begin();
116 qt != feature_query.get_terms_end(); ++qt) {
117 double tf;
118 double idf;
119 auto tf_iterator = termfreq.find(*qt);
120 auto idf_iterator = inverse_doc_freq.find(*qt);
121 if (tf_iterator != termfreq.end())
122 tf = (double)tf_iterator->second;
123 else
124 tf = 0;
125 if (idf_iterator != inverse_doc_freq.end())
126 idf = idf_iterator->second;
127 else
128 idf = 0;
129 value += log10(1 + ((tf * idf) / (1 + doc_len)));
131 values.push_back(value);
133 return values;