Make sure EOF is defined
[xapian.git] / xapian-letor / api / featurelist_internal.h
blob613aa40d484d8e831b3bf35663e9b9cb212f0ba8
1 /** @file featurelist_internal.h
2 * @brief Internals of FeatureList class
3 */
4 /* Copyright (C) 2012 Parth Gupta
5 * Copyright (C) 2016 Ayush Tomar
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
23 #ifndef XAPIAN_LETOR_INCLUDED_FEATURELIST_INTERNAL_H
24 #define XAPIAN_LETOR_INCLUDED_FEATURELIST_INTERNAL_H
26 #include "xapian-letor/featurelist.h"
28 #include <map>
30 namespace Xapian {
32 /** Class defining internals of FeatureList class. */
33 class FeatureList::Internal : public Xapian::Internal::intrusive_base {
34 friend class FeatureList;
36 /// Stats which FeatureList can use to determine the stats needed by a Feature.
37 typedef enum {
38 /// Frequency of the query terms in the document.
39 TERM_FREQUENCY = 1,
40 /// Inverse document frequency of the query terms in the database.
41 INVERSE_DOCUMENT_FREQUENCY = 2,
42 /// Length of the document in terms ('title', 'body' and 'whole').
43 DOCUMENT_LENGTH = 4,
44 /// Length of the collection in terms ('title', 'body' and 'whole').
45 COLLECTION_LENGTH = 8,
46 /// Frequency of the query terms in the whole database.
47 COLLECTION_TERM_FREQ = 16,
48 } stat_flags;
50 /// Xapian::Database using which features will be calculated.
51 Database featurelist_db;
53 /// Xapian::Query using which features will be calculated.
54 Query featurelist_query;
56 /// Xapian::Document using which features will be calculated.
57 Document featurelist_doc;
59 /// Frequency of the Query Terms in the specified documents.
60 std::map<std::string, Xapian::termcount> termfreq;
62 /// Inverse Document Frequency of Query terms in the database.
63 std::map<std::string, double> inverse_doc_freq;
65 /** Length of the document as number of terms for different parts like
66 * 'title', 'body' and 'whole'.
68 std::map<std::string, Xapian::termcount> doc_length;
70 /** Length of the collection in number of terms for different parts like
71 * 'title', 'body' and 'whole'.
73 std::map<std::string, Xapian::termcount> collection_length;
75 /// Frequency of the Query Terms in the whole database
76 std::map<std::string, Xapian::termcount> collection_termfreq;
78 /** This method finds the frequency of the query terms in the
79 * specified documents.
81 * This method is a helper method and statistics gathered through
82 * this method are used in feature value calculation. This information
83 * is stored in termfreq.
85 void compute_termfreq();
87 /** This method calculates the inverse document frequency (idf) of query
88 * terms in the database.
90 * This method is a helper method and statistics gathered through
91 * this method are used in feature value calculation. This information
92 * is stored in inverse_doc_freq.
94 * Note: idf of a term 't' is calculated as below:
96 * idf(t) = log(N/df(t))
97 * Where,
98 * N = Total number of documents in database and
99 * df(t) = number of documents containing term 't'
101 void compute_inverse_doc_freq();
103 /** This method calculates the length of the document as a number of terms.
104 * It calculates the length for three different parts:
105 * title, body and whole document.
107 * This method is a helper method and statistics gathered through
108 * this method are used in feature value calculation. This information
109 * is stored in doc_length in the following format:
111 * @code
112 * map<string, Xapian::termcount> len;
113 * len["title"];
114 * len["body"];
115 * len["whole"];
116 * @endcode
118 void compute_doc_length();
120 /** This method calculates the length of the collection in number of terms
121 * for different parts like 'title', 'body' and 'whole'.
123 * This is read from user metadata stored by omindex if available, otherwise
124 * it is calculated from scratch
125 * (this might take some time depending upon the size of the database).
127 * This method is a helper method and statistics gathered through
128 * this method are used in feature value calculation. This information
129 * is stored in collection_length in the following format.
131 * @code
132 * map<string, Xapian::termcount> len;
133 * len["title"];
134 * len["body"];
135 * len["whole"];
136 * @endcode
139 void compute_collection_length();
141 /** This method calculates the frequency of query terms in
142 * the whole database.
144 * This method is a helper method and statistics gathered through
145 * this method are used in feature value calculation. This information
146 * is stored in collection_termfreq.
148 void compute_collection_termfreq();
150 /** Specify the database to use for feature building.
152 * This will be used by the Internal class.
154 void set_database(const Xapian::Database & db);
156 /** Specify the query to use for feature building.
158 * This will be used by the Internal class.
160 void set_query(const Xapian::Query & query);
162 /** Specify the document to use for feature building.
164 * This will be used by the Internal class.
166 void set_doc(const Xapian::Document & doc);
168 public:
170 /** Vector of pointers to Feature objects.
171 * Each will be used to return a feature value.
173 std::vector<Feature *> feature;
175 /// This method sets all the data members required for computing stats.
176 void set_data(const Xapian::Query & query,
177 const Xapian::Database & db,
178 const Xapian::Document & doc);
180 /// Computes and populates the stats needed by a Feature.
181 void populate_feature(Feature *feature_);
183 /// Clears all the stats.
184 void clear_stats();
189 #endif // XAPIAN_LETOR_INCLUDED_FEATURELIST_INTERNAL_H