/** @file featurelist_internal.cc
 * @brief Definition of FeatureList::Internal class.
 */
/* Copyright (C) 2012 Parth Gupta
 * Copyright (C) 2016 Ayush Tomar
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 */
25 #include "xapian-letor/featurelist.h"
26 #include "featurelist_internal.h"
35 using namespace Xapian
;
38 FeatureList::Internal::set_database(const Xapian::Database
& db
)
44 FeatureList::Internal::set_query(const Xapian::Query
& query
)
46 featurelist_query
= query
;
50 FeatureList::Internal::set_doc(const Xapian::Document
& doc
)
52 featurelist_doc
= doc
;
56 FeatureList::Internal::set_data(const Xapian::Query
& letor_query
,
57 const Xapian::Database
& letor_db
,
58 const Xapian::Document
& letor_doc
)
60 set_query(letor_query
);
62 set_database(letor_db
);
66 FeatureList::Internal::compute_termfreq()
68 std::map
<std::string
, Xapian::termcount
> tf
;
70 Xapian::TermIterator docterms
= featurelist_doc
.termlist_begin();
71 for (Xapian::TermIterator qt
= featurelist_query
.get_unique_terms_begin();
72 qt
!= featurelist_query
.get_terms_end(); ++qt
) {
73 docterms
.skip_to(*qt
);
74 if (docterms
!= featurelist_doc
.termlist_end() && *qt
== *docterms
)
75 tf
[*qt
] = docterms
.get_wdf();
77 std::swap(termfreq
, tf
);
81 FeatureList::Internal::compute_inverse_doc_freq()
83 std::map
<std::string
, double> idf
;
84 Xapian::doccount totaldocs
= featurelist_db
.get_doccount();
86 for (Xapian::TermIterator qt
= featurelist_query
.get_unique_terms_begin();
87 qt
!= featurelist_query
.get_terms_end(); ++qt
) {
88 Xapian::doccount df
= featurelist_db
.get_termfreq(*qt
);
90 idf
[*qt
] = log10((double)totaldocs
/ (double)(1 + df
));
92 std::swap(inverse_doc_freq
, idf
);
96 FeatureList::Internal::compute_doc_length()
98 std::map
<std::string
, Xapian::termcount
> len
;
100 Xapian::termcount title_len
= 0;
101 Xapian::TermIterator dt
= featurelist_doc
.termlist_begin();
102 // reach the iterator to the start of the title terms i.e. prefix "S"
104 for ( ; dt
!= featurelist_doc
.termlist_end(); ++dt
) {
105 if ((*dt
)[0] != 'S') {
106 // We've reached the end of the S-prefixed terms.
109 title_len
+= dt
.get_wdf();
111 len
["title"] = title_len
;
112 Xapian::termcount whole_len
=
113 featurelist_db
.get_doclength(featurelist_doc
.get_docid());
114 len
["whole"] = whole_len
;
115 len
["body"] = whole_len
- title_len
;
116 std::swap(doc_length
, len
);
120 FeatureList::Internal::compute_collection_length()
122 std::map
<std::string
, Xapian::termcount
> len
;
124 if (!featurelist_db
.get_metadata("collection_len_title").empty() &&
125 !featurelist_db
.get_metadata("collection_len_body").empty() &&
126 !featurelist_db
.get_metadata("collection_len_whole").empty()) {
128 atol(featurelist_db
.get_metadata("collection_len_title").c_str());
130 atol(featurelist_db
.get_metadata("collection_len_body").c_str());
132 atol(featurelist_db
.get_metadata("collection_len_whole").c_str());
134 Xapian::termcount title_len
= 0;
135 Xapian::TermIterator dt
= featurelist_db
.allterms_begin("S");
136 for ( ; dt
!= featurelist_db
.allterms_end("S"); ++dt
) {
137 // because we don't want the unique terms so we want their
138 // original frequencies and i.e. the total size of the title collection.
139 title_len
+= featurelist_db
.get_collection_freq(*dt
);
141 len
["title"] = title_len
;
142 Xapian::termcount whole_len
= featurelist_db
.get_avlength() *
143 featurelist_db
.get_doccount();
144 len
["whole"] = whole_len
;
145 len
["body"] = whole_len
- title_len
;
147 std::swap(collection_length
, len
);
151 FeatureList::Internal::compute_collection_termfreq()
153 std::map
<std::string
, Xapian::termcount
> tf
;
155 for (Xapian::TermIterator qt
= featurelist_query
.get_unique_terms_begin();
156 qt
!= featurelist_query
.get_terms_end(); ++qt
) {
157 Xapian::termcount coll_tf
= featurelist_db
.get_collection_freq(*qt
);
161 std::swap(collection_termfreq
, tf
);
165 FeatureList::Internal::populate_feature(Feature
*feature_
)
167 stat_flags stats_needed
= stat_flags(feature_
->get_stats());
168 if (stats_needed
& TERM_FREQUENCY
) {
169 if (termfreq
.empty())
171 feature_
->set_termfreq(termfreq
);
173 if (stats_needed
& INVERSE_DOCUMENT_FREQUENCY
) {
174 if (inverse_doc_freq
.empty())
175 compute_inverse_doc_freq();
176 feature_
->set_inverse_doc_freq(inverse_doc_freq
);
178 if (stats_needed
& DOCUMENT_LENGTH
) {
179 if (doc_length
.empty())
180 compute_doc_length();
181 feature_
->set_doc_length(doc_length
);
183 if (stats_needed
& COLLECTION_LENGTH
) {
184 if (collection_length
.empty())
185 compute_collection_length();
186 feature_
->set_collection_length(collection_length
);
188 if (stats_needed
& COLLECTION_TERM_FREQ
) {
189 if (collection_termfreq
.empty())
190 compute_collection_termfreq();
191 feature_
->set_collection_termfreq(collection_termfreq
);
196 FeatureList::Internal::clear_stats()
199 inverse_doc_freq
.clear();
201 collection_length
.clear();
202 collection_termfreq
.clear();