1 /** @file featurelist_internal.h
2 * @brief Internals of FeatureList class
4 /* Copyright (C) 2012 Parth Gupta
5 * Copyright (C) 2016 Ayush Tomar
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 #ifndef XAPIAN_LETOR_INCLUDED_FEATURELIST_INTERNAL_H
24 #define XAPIAN_LETOR_INCLUDED_FEATURELIST_INTERNAL_H
26 #include "xapian-letor/featurelist.h"
32 /** Class defining internals of FeatureList class. */
33 class FeatureList::Internal
: public Xapian::Internal::intrusive_base
{
34 friend class FeatureList
;
36 /// Stats which FeatureList can use to determine the stats needed by a Feature.
38 /// Number of documents in the collection.
40 /// Inverse document frequency of the query terms in the database.
41 INVERSE_DOCUMENT_FREQUENCY
= 2,
42 /// Average length of documents in the collection.
44 /// Length of the collection in number of terms.
45 COLLECTION_LENGTH
= 8,
46 /// Frequency of the query terms in the whole database.
47 COLLECTION_TERM_FREQ
= 16,
50 /// Xapian::Database from which features will be calculated.
51 Database featurelist_db
;
53 /// Xapian::Query for which features will be calculated.
54 Query featurelist_query
;
56 /// Xapian::Document for which features will be calculated.
57 Document featurelist_doc
;
59 /// Frequency of the Query Terms in the specified documents.
60 std::map
<std::string
, Xapian::termcount
> termfreq
;
62 /// Inverse Document Frequency of Query terms in the database.
63 std::map
<std::string
, double> inverse_doc_freq
;
65 /** Length of the document as number of terms for different parts like
66 * 'title', 'body' and 'whole'.
68 std::map
<std::string
, Xapian::termcount
> doc_length
;
70 /** Length of the collection in number of terms for different parts like
71 * 'title', 'body' and 'whole'.
73 std::map
<std::string
, Xapian::termcount
> collection_length
;
75 /// Frequency of the Query Terms in the whole database
76 std::map
<std::string
, Xapian::termcount
> collection_termfreq
;
78 /** This method finds the frequency of the query terms in the
79 * specified documents.
81 * This method is a helper method and statistics gathered through
82 * this method are used in feature value calculation. This information
83 * is stored in termfreq.
85 void compute_termfreq();
87 /** This method calculates the inverse document frequency (idf) of query
88 * terms in the database.
90 * This method is a helper method and statistics gathered through
91 * this method are used in feature value calculation. This information
92 * is stored in inverse_doc_freq.
94 * Note: idf of a term 't' is calculated as below:
96 * idf(t) = log(N/df(t))
98 * N = Total number of documents in database and
99 * df(t) = number of documents containing term 't'
101 void compute_inverse_doc_freq();
103 /** This method calculates the length of the documents as number of 'terms'.
104 * It calculates the length for three different parts:
105 * title, body and whole document.
107 * This method is a helper method and statistics gathered through
108 * this method are used in feature value calculation. This information
109 * is stored in doc_length in the following format:
112 * std::map<std::string, Xapian::termcount> len;
118 void compute_doc_length();
120 /** This method calculates the length of the collection in number of terms
121 * for different parts like 'title', 'body' and 'whole'.
123 * This is read from the user metadata stored by omindex if available;
124 * otherwise it is calculated from scratch
125 * (this might take some time depending upon the size of the database).
127 * This method is a helper method and statistics gathered through
128 * this method are used in feature value calculation. This information
129 * is stored in collection_length in the following format.
132 * std::map<std::string, Xapian::termcount> len;
139 void compute_collection_length();
141 /** This method calculates the frequency of query terms in
142 * the whole database.
144 * This method is a helper method and statistics gathered through
145 * this method are used in feature value calculation. This information
146 * is stored in collection_termfreq.
148 void compute_collection_termfreq();
150 /** Specify the database to use for feature building.
152 * This will be used by the Internal class.
154 void set_database(const Xapian::Database
& db
);
156 /** Specify the query to use for feature building.
158 * This will be used by the Internal class.
160 void set_query(const Xapian::Query
& query
);
162 /** Specify the document to use for feature building.
164 * This will be used by the Internal class.
166 void set_doc(const Xapian::Document
& doc
);
170 /** Vector containing pointers to Feature objects.
171 * Each will be used to return a feature value.
173 std::vector
<Feature
*> feature
;
175 /// This method sets all the data members required for computing stats.
///
/// @param query  Query to use for feature building.
/// @param db     Database to use for feature building.
/// @param doc    Document to use for feature building.
176 void set_data(const Xapian::Query
& query
,
177 const Xapian::Database
& db
,
178 const Xapian::Document
& doc
);
180 /// Computes and populates the stats needed by a Feature.
///
/// @param feature_  Pointer to the Feature whose required stats are to be
///                  computed and populated.
181 void populate_feature(Feature
*feature_
);
183 /// Clears all the stats.
189 #endif // XAPIAN_LETOR_INCLUDED_FEATURELIST_INTERNAL_H