HoneyTable::read_item(): Read tag in one go
[xapian.git] / xapian-core / backends / documentinternal.h
blob491640f55edc572aed751f17503e1cc17543735a
/** @file documentinternal.h
 * @brief Abstract base class for a document
 */
/* Copyright 2017 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #ifndef XAPIAN_INCLUDED_DOCUMENTINTERNAL_H
22 #define XAPIAN_INCLUDED_DOCUMENTINTERNAL_H
24 #include <xapian/document.h>
25 #include <xapian/intrusive_ptr.h>
26 #include <xapian/types.h>
28 #include "api/terminfo.h"
29 #include "api/termlist.h"
30 #include "backends/databaseinternal.h"
32 #include <map>
33 #include <memory>
34 #include <string>
36 class DocumentTermList;
37 class DocumentValueList;
38 class GlassValueManager;
39 class HoneyValueManager;
40 class ValueStreamDocument;
42 namespace Xapian {
44 /// Abstract base class for a document.
45 class Document::Internal : public Xapian::Internal::intrusive_base {
46 friend class ::DocumentTermList;
47 friend class ::DocumentValueList;
48 // For ensure_values_fetched():
49 friend class ::GlassValueManager;
50 friend class ::HoneyValueManager;
51 friend class ::ValueStreamDocument;
53 /// Don't allow assignment.
54 void operator=(const Internal &) = delete;
56 /// Don't allow copying.
57 Internal(const Internal &) = delete;
59 /** The document data.
61 * If NULL, this hasn't been fetched or set yet.
63 std::unique_ptr<std::string> data;
65 /** Terms in the document and their associated metadata.
67 * If NULL, the terms haven't been fetched or set yet.
69 * We use std::map<> rather than std::unordered_map<> because the latter
70 * invalidates existing iterators upon insert() if rehashing occurs,
71 * whereas existing iterators remain valid for std::map<>.
73 mutable std::unique_ptr<std::map<std::string, TermInfo>> terms;
75 /** The number of distinct terms in @a terms.
77 * Only valid when terms is non-NULL.
79 * This may be less than terms.size() if any terms have been deleted.
81 mutable Xapian::termcount termlist_size;
83 /** Are there any changes to term positions in @a terms?
85 * If a document is read from a database, modified and then replaced at
86 * the same docid, then we can save a lot of work if we know when there
87 * are no changes to term positions, even if there are changes to terms
88 * (a common example is adding filter terms to an existing document).
90 * It's OK for this to be true when there aren't any modifications (it
91 * just means that the backend can't shortcut as directly).
93 mutable bool positions_modified_ = false;
95 /** Ensure terms have been fetched from @a database.
97 * After this call, @a terms will be non-NULL. If @a database is NULL,
98 * @a terms will be initialised to an empty map if it was NULL.
100 void ensure_terms_fetched() const;
102 /** Ensure values have been fetched from @a database.
104 * After this call, @a values will be non-NULL. If @a database is NULL,
105 * @a values will be initialised to an empty map if it was NULL.
107 void ensure_values_fetched() const;
109 protected:
110 /** Document value slots and their contents.
112 * If NULL, the values haven't been fetched or set yet.
114 * We use std::map<> rather than std::unordered_map<> because the latter
115 * invalidates existing iterators upon insert() if rehashing occurs,
116 * whereas existing iterators remain valid for std::map<>.
118 mutable std::unique_ptr<std::map<Xapian::valueno, std::string>> values;
120 /** Database this document came from.
122 * If this document didn't come from a database, this will be NULL.
124 Xapian::Internal::intrusive_ptr<const Xapian::Database::Internal> database;
126 /** The document ID this document came from in @a database.
128 * If this document didn't come from a database, this will be 0.
130 * Note that this is the docid in the sub-database when multiple databases
131 * are being searched.
133 Xapian::docid did;
135 /// Constructor used by subclasses.
136 Internal(Xapian::Internal::intrusive_ptr<const Xapian::Database::Internal> database_,
137 Xapian::docid did_)
138 : database(database_), did(did_) {}
140 /// Constructor used by RemoteDocument subclass.
141 Internal(const Xapian::Database::Internal* database_,
142 Xapian::docid did_,
143 const std::string& data_,
144 std::map<Xapian::valueno, std::string>&& values_)
145 : data(new std::string(data_)),
146 values(new std::map<Xapian::valueno, std::string>(std::move(values_))),
147 database(database_),
148 did(did_) {}
150 /** Fetch the document data from the database.
152 * The default implementation (used when there's no associated database)
153 * returns an empty string.
155 virtual std::string fetch_data() const;
157 /** Fetch all set values from the database.
159 * The default implementation (used when there's no associated database)
160 * clears @a values_.
162 virtual void fetch_all_values(std::map<Xapian::valueno,
163 std::string>& values_) const;
165 /** Fetch a single value from the database.
167 * The default implementation (used when there's no associated database)
168 * returns an empty string.
170 virtual std::string fetch_value(Xapian::valueno slot) const;
172 public:
173 /// Construct an empty document.
174 Internal() : did(0) {}
176 /** We have virtual methods and want to be able to delete derived classes
177 * using a pointer to the base class, so we need a virtual destructor.
179 virtual ~Internal();
181 /** Return true if the document data might have been modified.
183 * If the document is from a database, this means modifications
184 * compared to the version read, otherwise it means modifications
185 * compared to an empty database.
187 bool data_modified() const { return data != NULL; }
189 /** Return true if the document's terms might have been modified.
191 * If the document is from a database, this means modifications
192 * compared to the version read, otherwise it means modifications
193 * compared to an empty database.
195 bool terms_modified() const { return terms != NULL; }
197 /** Return true if the document's values might have been modified.
199 * If the document is from a database, this means modifications
200 * compared to the version read, otherwise it means modifications
201 * compared to an empty database.
203 bool values_modified() const { return values != NULL; }
205 /** Return true if the document might have been modified in any way.
207 * If the document is from a database, this means modifications
208 * compared to the version read, otherwise it means modifications
209 * compared to an empty database.
211 bool modified() const {
212 return data_modified() || terms_modified() || values_modified();
215 /** Return true if the document's term positions might have been modified.
217 * If the document is from a database, this means modifications
218 * compared to the version read, otherwise it means modifications
219 * compared to an empty database.
221 bool positions_modified() const { return positions_modified_; }
223 /** Get the document ID this document came from.
225 * If this document didn't come from a database, this will be 0.
227 * Note that this is the docid in the sub-database when multiple databases
228 * are being searched.
230 Xapian::docid get_docid() const { return did; }
232 /// Get the document data.
233 std::string get_data() const {
234 if (data)
235 return *data;
236 return fetch_data();
239 /// Set the document data.
240 void set_data(const std::string& data_) {
241 data.reset(new std::string(data_));
244 /// Add a term to this document.
245 void add_term(const std::string& term, Xapian::termcount wdf_inc) {
246 ensure_terms_fetched();
248 auto i = terms->find(term);
249 if (i == terms->end()) {
250 ++termlist_size;
251 terms->emplace(make_pair(term, TermInfo(wdf_inc)));
252 } else {
253 if (i->second.increase_wdf(wdf_inc))
254 ++termlist_size;
258 /// Remove a term from this document.
259 bool remove_term(const std::string& term) {
260 ensure_terms_fetched();
262 auto i = terms->find(term);
263 if (i == terms->end()) {
264 return false;
266 if (!i->second.get_positions()->empty()) {
267 positions_modified_ = true;
269 if (!i->second.remove()) {
270 return false;
272 --termlist_size;
273 return true;
276 /// Add a posting for a term.
277 void add_posting(const std::string& term,
278 Xapian::termpos term_pos,
279 Xapian::termcount wdf_inc) {
280 ensure_terms_fetched();
281 positions_modified_ = true;
283 auto i = terms->find(term);
284 if (i == terms->end()) {
285 ++termlist_size;
286 terms->emplace(term, TermInfo(wdf_inc, term_pos));
287 return;
289 if (i->second.add_position(wdf_inc, term_pos))
290 ++termlist_size;
293 enum remove_posting_result { OK, NO_TERM, NO_POS };
295 /// Remove a posting for a term.
296 remove_posting_result
297 remove_posting(const std::string& term,
298 Xapian::termpos term_pos,
299 Xapian::termcount wdf_dec) {
300 ensure_terms_fetched();
302 auto i = terms->find(term);
303 if (i == terms->end() || i->second.is_deleted()) {
304 return remove_posting_result::NO_TERM;
306 if (!i->second.remove_position(term_pos)) {
307 return remove_posting_result::NO_POS;
309 if (wdf_dec)
310 i->second.decrease_wdf(wdf_dec);
311 positions_modified_ = true;
312 return remove_posting_result::OK;
315 /// Clear all terms from the document.
316 void clear_terms() {
317 if (!terms) {
318 if (database.get()) {
319 terms.reset(new map<string, TermInfo>());
320 termlist_size = 0;
321 } else {
322 // We didn't come from a database, so there are no unfetched
323 // terms to clear.
325 } else {
326 terms->clear();
327 termlist_size = 0;
328 // Assume there was positional data if there's any in the database.
329 positions_modified_ = database.get() && database->has_positions();
333 /// Return the number of distinct terms in this document.
334 Xapian::termcount termlist_count() const {
335 if (terms)
336 return termlist_size;
338 if (!database.get())
339 return 0;
341 std::unique_ptr<TermList> tl(database->open_term_list(did));
342 // get_approx_size() is exact for TermList from a database.
343 return tl->get_approx_size();
346 /** Start iterating the terms in this document.
348 * @return A new TermList object (caller takes ownership) or NULL if
349 * there are no terms.
351 TermList* open_term_list() const;
353 /** Read a value slot in this document.
355 * @return The value in slot @a slot, or an empty string if not set.
357 std::string get_value(Xapian::valueno slot) const {
358 if (values) {
359 auto i = values->find(slot);
360 if (i != values->end())
361 return i->second;
362 return std::string();
365 return fetch_value(slot);
368 /// Add a value to a slot in this document.
369 void add_value(Xapian::valueno slot, const std::string& value) {
370 ensure_values_fetched();
372 if (!value.empty()) {
373 (*values)[slot] = value;
374 } else {
375 // Empty values aren't stored, but replace any existing value by
376 // removing it.
377 values->erase(slot);
381 /// Clear all value slots in this document.
382 void clear_values() {
383 if (!values) {
384 if (database.get()) {
385 values.reset(new map<Xapian::valueno, string>());
386 } else {
387 // We didn't come from a database, so there are no unfetched
388 // values to clear.
390 } else {
391 values->clear();
395 /// Count the value slots used in this document.
396 Xapian::valueno values_count() const {
397 ensure_values_fetched();
398 return values->size();
401 Xapian::ValueIterator values_begin() const;
403 /// Return a string describing this object.
404 std::string get_description() const;
409 #endif // XAPIAN_INCLUDED_DOCUMENTINTERNAL_H