/** @file chert_database.h
 * @brief C++ class definition for chert database
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016 Olly Betts
 * Copyright 2008 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
25 #ifndef OM_HGUARD_CHERT_DATABASE_H
26 #define OM_HGUARD_CHERT_DATABASE_H
28 #include "backends/backends.h"
29 #include "backends/database.h"
30 #include "chert_dbstats.h"
31 #include "chert_positionlist.h"
32 #include "chert_postlist.h"
33 #include "chert_record.h"
34 #include "chert_spelling.h"
35 #include "chert_synonym.h"
36 #include "chert_termlisttable.h"
37 #include "chert_values.h"
38 #include "chert_version.h"
39 #include "../flint_lock.h"
40 #include "chert_types.h"
41 #include "backends/valuestats.h"
45 #include "xapian/compactor.h"
46 #include "xapian/constants.h"
// Forward declarations: this header only refers to these classes by name
// (as a friend and as a reference parameter respectively).
class ChertAllDocsPostList;
class RemoteConnection;
/** A backend designed for efficient indexing and retrieval, using
 *  compressed posting lists and a btree storage scheme.
 */
59 class ChertDatabase
: public Xapian::Database::Internal
{
60 friend class ChertWritableDatabase
;
61 friend class ChertTermList
;
62 friend class ChertPostList
;
63 friend class ChertAllTermsList
;
64 friend class ChertAllDocsPostList
;
66 /** Directory to store databases in.
70 /** Whether the database is readonly.
74 /** The file describing the Chert database.
75 * This file has information about the format of the database
76 * which can't easily be stored in any of the individual tables.
78 ChertVersion version_file
;
80 /** Table storing posting lists.
82 * Whenever an update is performed, this table is the first to be
83 * updated: therefore, its most recent revision number is the most
84 * recent anywhere in the database.
86 mutable ChertPostListTable postlist_table
;
88 /** Table storing position lists.
90 ChertPositionListTable position_table
;
92 /** Table storing term lists.
94 ChertTermListTable termlist_table
;
97 mutable ChertValueManager value_manager
;
99 /** Table storing synonym data.
101 mutable ChertSynonymTable synonym_table
;
103 /** Table storing spelling correction data.
105 mutable ChertSpellingTable spelling_table
;
107 /** Table storing records.
109 * Whenever an update is performed, this table is the last to be
110 * updated: therefore, its most recent revision number is the most
111 * recent consistent revision available. If this table's most
112 * recent revision number is not available for all tables, there
113 * is no consistent revision available, and the database is corrupt.
115 ChertRecordTable record_table
;
/** The maximum number of changesets which should be kept in the
 *  database.
 */
unsigned int max_changesets;
124 /// Database statistics.
125 ChertDatabaseStats stats
;
/** Return true if a database exists at the path specified for this
 *  database.
 */
bool database_exists();

/** Create new tables, and open them.
 *
 *  Any existing tables will be removed first.
 *
 *  @param blocksize  NOTE(review): presumably the block size to create
 *                    the tables with - confirm against the definition.
 */
void create_and_open_tables(unsigned int blocksize);
/** Open all tables at most recent consistent revision.
 *
 *  @return true if the tables were reopened; false if we could
 *          tell they were already open at the latest revision.
 *
 *  @exception Xapian::DatabaseCorruptError is thrown if there is no
 *             consistent revision available.
 */
bool open_tables_consistent();
/** Get a write lock on the database, or throw an
 *  Xapian::DatabaseLockError if failure.
 *
 *  @param flags     Bit-wise or of zero or more Xapian::DB_* constants
 *
 *  @param creating  true if the database is in the process of being
 *                   created - if false, will throw a
 *                   Xapian::DatabaseOpeningError if the lock can't be
 *                   acquired and the database doesn't exist.
 */
void get_database_write_lock(int flags, bool creating);
158 /** Open tables at specified revision number.
160 * @exception Xapian::InvalidArgumentError is thrown if the specified
161 * revision is not available.
163 void open_tables(chert_revision_number_t revision
);
165 /** Get an object holding the next revision number which should be
166 * used in the tables.
168 * @return the next revision number.
170 chert_revision_number_t
get_next_revision_number() const;
172 /** Set the revision number in the tables.
174 * This updates the disk tables so that the currently open revision
175 * becomes the specified revision number.
177 * @param new_revision The new revision number to store. This must
178 * be greater than the latest revision number (see
179 * get_latest_revision_number()), or undefined behaviour will
182 void set_revision_number(chert_revision_number_t new_revision
);
184 /** Re-open tables to recover from an overwritten condition,
185 * or just get most up-to-date version.
189 /** Close all the tables permanently.
193 /** Called if a modifications fail.
195 * @param msg is a string description of the exception that was
196 * raised when the modifications failed.
198 void modifications_failed(chert_revision_number_t old_revision
,
199 chert_revision_number_t new_revision
,
200 const std::string
& msg
);
202 /** Apply any outstanding changes to the tables.
204 * If an error occurs during this operation, this will be signalled
205 * by an exception being thrown. In this case the contents of the
206 * tables on disk will be left in an unmodified state (though possibly
207 * with increased revision numbers), and the outstanding changes will
212 /** Cancel any outstanding changes to the tables.
216 /** Send a set of messages which transfer the whole database.
218 void send_whole_database(RemoteConnection
& conn
, double end_time
);
220 /** Get the revision stored in a changeset.
222 void get_changeset_revisions(const string
& path
,
223 chert_revision_number_t
* startrev
,
224 chert_revision_number_t
* endrev
) const;
226 /** Create and open a chert database.
228 * @exception Xapian::DatabaseCorruptError is thrown if there is no
229 * consistent revision available.
231 * @exception Xapian::DatabaseOpeningError thrown if database can't
234 * @exception Xapian::DatabaseVersionError thrown if database is in an
235 * unsupported format. This implies that the database was
236 * created by an older or newer version of Xapian.
238 * @param dbdir directory holding chert tables
240 * @param block_size Block size, in bytes, to use when creating
241 * tables. This is only important, and has the
242 * correct value, when the database is being
245 ChertDatabase(const string
&db_dir_
, int action
= Xapian::DB_READONLY_
,
246 unsigned int block_size
= 0u);
250 /// Get a postlist table cursor (used by ChertValueList).
251 ChertCursor
* get_postlist_cursor() const {
252 return postlist_table
.cursor_get();
255 /** Get an object holding the revision number which the tables are
258 * @return the current revision number.
260 chert_revision_number_t
get_revision_number() const;
262 /** Virtual methods of Database::Internal. */
264 Xapian::doccount
get_doccount() const;
265 Xapian::docid
get_lastdocid() const;
266 totlen_t
get_total_length() const;
267 Xapian::termcount
get_doclength(Xapian::docid did
) const;
268 Xapian::termcount
get_unique_terms(Xapian::docid did
) const;
269 void get_freqs(const string
& term
,
270 Xapian::doccount
* termfreq_ptr
,
271 Xapian::termcount
* collfreq_ptr
) const;
272 Xapian::doccount
get_value_freq(Xapian::valueno slot
) const;
273 std::string
get_value_lower_bound(Xapian::valueno slot
) const;
274 std::string
get_value_upper_bound(Xapian::valueno slot
) const;
275 Xapian::termcount
get_doclength_lower_bound() const;
276 Xapian::termcount
get_doclength_upper_bound() const;
277 Xapian::termcount
get_wdf_upper_bound(const string
& term
) const;
278 bool term_exists(const string
& tname
) const;
279 bool has_positions() const;
281 LeafPostList
* open_post_list(const string
& tname
) const;
282 ValueList
* open_value_list(Xapian::valueno slot
) const;
283 Xapian::Document::Internal
* open_document(Xapian::docid did
, bool lazy
) const;
285 PositionList
* open_position_list(Xapian::docid did
, const string
& term
) const;
286 TermList
* open_term_list(Xapian::docid did
) const;
287 TermList
* open_allterms(const string
& prefix
) const;
289 TermList
* open_spelling_termlist(const string
& word
) const;
290 TermList
* open_spelling_wordlist() const;
291 Xapian::doccount
get_spelling_frequency(const string
& word
) const;
293 TermList
* open_synonym_termlist(const string
& term
) const;
294 TermList
* open_synonym_keylist(const string
& prefix
) const;
296 string
get_metadata(const string
& key
) const;
297 TermList
* open_metadata_keylist(const std::string
&prefix
) const;
298 void write_changesets_to_fd(int fd
,
299 const string
& start_revision
,
301 Xapian::ReplicationInfo
* info
);
302 string
get_revision_info() const;
303 string
get_uuid() const;
305 void request_document(Xapian::docid
/*did*/) const;
306 void readahead_for_query(const Xapian::Query
&query
);
309 XAPIAN_NORETURN(void throw_termlist_table_close_exception() const);
311 int get_backend_info(string
* path
) const {
312 if (path
) *path
= db_dir
;
313 return BACKEND_CHERT
;
316 void get_used_docid_range(Xapian::docid
& first
,
317 Xapian::docid
& last
) const;
319 static void compact(Xapian::Compactor
* compactor
,
320 const char * destdir
,
321 const std::vector
<Xapian::Database::Internal
*> & sources
,
322 const std::vector
<Xapian::docid
> & offset
,
324 Xapian::Compactor::compaction_level compaction
,
326 Xapian::docid last_docid
);
/** A writable chert database.
 */
331 class ChertWritableDatabase
: public ChertDatabase
{
332 /** Unflushed changes to term frequencies and collection frequencies. */
333 mutable map
<string
, pair
<Xapian::termcount_diff
, Xapian::termcount_diff
> >
336 /** Document lengths of new and modified documents which haven't been flushed yet. */
337 mutable map
<Xapian::docid
, Xapian::termcount
> doclens
;
339 /// Modifications to posting lists.
340 mutable map
<string
, map
<Xapian::docid
,
341 pair
<char, Xapian::termcount
> > > mod_plists
;
343 mutable map
<Xapian::valueno
, ValueStats
> value_stats
;
345 /** The number of documents added, deleted, or replaced since the last
348 mutable Xapian::doccount change_count
;
350 /// If change_count reaches this threshold we automatically flush.
351 Xapian::doccount flush_threshold
;
353 /** A pointer to the last document which was returned by
354 * open_document(), or NULL if there is no such valid document. This
355 * is used purely for comparing with a supplied document to help with
356 * optimising replace_document. When the document internals are
357 * deleted, this pointer gets set to NULL.
359 mutable Xapian::Document::Internal
* modify_shortcut_document
;
361 /** The document ID for the last document returned by open_document().
363 mutable Xapian::docid modify_shortcut_docid
;
/** Check if we should autoflush.
 *
 *  Called at the end of each document changing operation.
 */
void check_flush_threshold();
371 /// Flush any unflushed postlist changes, but don't commit them.
372 void flush_postlist_changes() const;
374 /// Close all the tables permanently.
380 /** Add or modify an entry in freq_deltas.
382 * @param tname The term to modify the entry for.
383 * @param tf_delta The change in the term frequency delta.
384 * @param cf_delta The change in the collection frequency delta.
386 void add_freq_delta(const string
& tname
,
387 Xapian::termcount_diff tf_delta
,
388 Xapian::termcount_diff cf_delta
);
390 /** Insert modifications for a new document to the postlists.
392 * @param did The document ID to insert the entry for.
393 * @param tname The term to insert the entry for.
394 * @param wdf The new wdf value to store.
396 void insert_mod_plist(Xapian::docid did
,
397 const string
& tname
,
398 Xapian::termcount wdf
);
400 /** Update the stored modifications to the postlists.
402 * @param did The document ID to modify the entry for.
403 * @param tname The term to modify the entry for.
404 * @param type The type of change to the postlist.
405 * @param wdf The new wdf value to store.
407 * If type is 'A', and an existing entry is in the stored
408 * modifications, the stored type will be set to 'M'. In all other
409 * cases, the stored type is simply the value supplied.
411 void update_mod_plist(Xapian::docid did
,
412 const string
& tname
,
414 Xapian::termcount wdf
);
417 /** Implementation of virtual methods: see Database::Internal for
422 /** Cancel pending modifications to the database. */
425 Xapian::docid
add_document(const Xapian::Document
& document
);
426 Xapian::docid
add_document_(Xapian::docid did
, const Xapian::Document
& document
);
427 // Stop the default implementation of delete_document(term) and
428 // replace_document(term) from being hidden. This isn't really
429 // a problem as we only try to call them through the base class
430 // (where they aren't hidden) but some compilers generate a warning
433 using Xapian::Database::Internal::delete_document
;
434 using Xapian::Database::Internal::replace_document
;
436 void delete_document(Xapian::docid did
);
437 void replace_document(Xapian::docid did
, const Xapian::Document
& document
);
439 Xapian::Document::Internal
* open_document(Xapian::docid did
,
445 /** Create and open a writable chert database.
447 * @exception Xapian::DatabaseOpeningError thrown if database can't
450 * @exception Xapian::DatabaseVersionError thrown if database is in an
451 * unsupported format. This implies that the database was
452 * created by an older or newer version of Xapian.
454 * @param dir directory holding chert tables
456 ChertWritableDatabase(const string
&dir
, int action
, int block_size
);
458 ~ChertWritableDatabase();
460 /** Virtual methods of Database::Internal. */
462 Xapian::termcount
get_doclength(Xapian::docid did
) const;
463 void get_freqs(const string
& term
,
464 Xapian::doccount
* termfreq_ptr
,
465 Xapian::termcount
* collfreq_ptr
) const;
466 Xapian::doccount
get_value_freq(Xapian::valueno slot
) const;
467 std::string
get_value_lower_bound(Xapian::valueno slot
) const;
468 std::string
get_value_upper_bound(Xapian::valueno slot
) const;
469 bool term_exists(const string
& tname
) const;
471 LeafPostList
* open_post_list(const string
& tname
) const;
472 ValueList
* open_value_list(Xapian::valueno slot
) const;
473 TermList
* open_allterms(const string
& prefix
) const;
475 void add_spelling(const string
& word
, Xapian::termcount freqinc
) const;
476 void remove_spelling(const string
& word
, Xapian::termcount freqdec
) const;
477 TermList
* open_spelling_wordlist() const;
479 TermList
* open_synonym_keylist(const string
& prefix
) const;
480 void add_synonym(const string
& word
, const string
& synonym
) const;
481 void remove_synonym(const string
& word
, const string
& synonym
) const;
482 void clear_synonyms(const string
& word
) const;
484 void set_metadata(const string
& key
, const string
& value
);
485 void invalidate_doc_object(Xapian::Document::Internal
* obj
) const;
489 #endif /* OM_HGUARD_CHERT_DATABASE_H */