2 * @brief Database API class
4 /* Copyright 2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017 Olly Betts
5 * Copyright 2007,2008,2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include <xapian/database.h>
26 #include "backends/databaseinternal.h"
27 #include "backends/empty_database.h"
28 #include "backends/multi/multi_database.h"
30 #include "editdistance.h"
32 #include "postingiteratorinternal.h"
33 #include <xapian/constants.h>
34 #include <xapian/error.h>
35 #include <xapian/positioniterator.h>
36 #include <xapian/postingiterator.h>
37 #include <xapian/termiterator.h>
38 #include <xapian/unicode.h>
41 #include <cstdlib> // For abs().
49 static void docid_zero_invalid()
51 throw Xapian::InvalidArgumentError("Document ID 0 is invalid");
55 static void empty_metadata_key()
57 throw Xapian::InvalidArgumentError("Empty metadata keys are invalid");
61 static void empty_term_invalid()
63 throw Xapian::InvalidArgumentError("Empty terms are invalid");
68 Database::Database(Database::Internal
* internal_
)
73 Database::Database(const Database
& o
)
74 : internal(o
.internal
)
79 Database::operator=(const Database
& o
)
81 internal
= o
.internal
;
86 : internal(new EmptyDatabase
)
97 return internal
->reopen();
// Append the shard(s) of database `o` to this database.
//
// `read_only` selects the wording of the "to itself" error message and is
// passed on to the MultiDatabase constructor when a new one is created.
// Throws InvalidArgumentError (see the throw statements below).
//
// NOTE(review): the return type, braces and most branch conditions of this
// function are not visible in this excerpt, so the exact condition guarding
// each step below is not documented here - confirm against the full file.
107 Database::add_database_(const Database
& o
, bool read_only
)
// Choose the error message for adding a database to itself; the Database
// wording is used when read_only is true, the WritableDatabase one otherwise.
110 const char* msg
= read_only
?
111 "Database::add_database(): Can't add a Database to itself" :
112 "WritableDatabase::add_database(): Can't add a WritableDatabase "
114 throw InvalidArgumentError(msg
);
// size() of the database being added.
117 auto o_size
= o
.internal
->size();
119 // Adding an empty database is a no-op.
// size() of this database before anything is added.
123 auto my_size
= internal
->size();
// Share o's internal directly rather than building a MultiDatabase
// (presumably only when this database is empty - TODO confirm the guard).
126 internal
= o
.internal
;
131 // This doesn't work - for example:
134 // db.add_database(WritableDatabase("one.db"));
135 // db.add_database(WritableDatabase("two.db"));
137 // The first add_database() assigns the internal across, so at the second
138 // call internal->is_read_only() returns false but read_only is true.
140 // I'm not entirely convinced the extra complexity required to make this
141 // work is worthwhile. We catch static violations such as this at compile
144 // WritableDatabase db;
145 // db.add_database(Database("one.db"));
147 // The case we don't catch is:
149 // WritableDatabase db;
150 // Database ro_db = db;
151 // ro_db.add_database(Database("one.db"));
153 // But performing WritableDatabase actions using such a WritableDatabase
154 // should now throw InvalidOperationError.
// NOTE(review): the comment above says this check "doesn't work"; whether it
// is compiled out (e.g. by a preprocessor guard) is not visible here.
155 if (!internal
->is_read_only() && read_only
) {
156 throw InvalidArgumentError("Database::add_database(): Can't add a "
157 "Database to a WritableDatabase");
161 // Make sure internal is a MultiDatabase with enough space reserved.
162 auto new_size
= my_size
+ o_size
;
163 MultiDatabase
* multi_db
;
// One branch creates a new MultiDatabase sized for new_size and makes the
// current internal its first shard ...
165 multi_db
= new MultiDatabase(new_size
, read_only
);
166 multi_db
->push_back(internal
.get());
// ... the other reuses the existing MultiDatabase and grows its reservation.
169 // Must already be a MultiDatabase as everything else reports 1 for
171 multi_db
= static_cast<MultiDatabase
*>(internal
.get());
172 multi_db
->reserve(new_size
);
// Either o's internal is appended as a single shard ...
176 multi_db
->push_back(o
.internal
.get());
// ... or each of its shards is appended individually.
178 // Must be a MultiDatabase.
179 auto o_multi
= static_cast<MultiDatabase
*>(o
.internal
.get());
180 // Add the shards from o to ourself.
181 for (auto&& shard
: o_multi
->shards
) {
182 multi_db
->push_back(shard
);
188 Database::postlist_begin(const string
& term
) const
190 PostList
* pl
= internal
->open_post_list(term
);
191 if (!pl
) return PostingIterator();
192 return PostingIterator(new PostingIterator::Internal(pl
, *this));
196 Database::termlist_begin(Xapian::docid did
) const
199 docid_zero_invalid();
201 return TermIterator(internal
->open_term_list(did
));
205 Database::allterms_begin(const string
& prefix
) const
207 return TermIterator(internal
->open_allterms(prefix
));
211 Database::has_positions() const
213 return internal
->has_positions();
217 Database::positionlist_begin(Xapian::docid did
, const string
& term
) const
220 docid_zero_invalid();
223 empty_term_invalid();
225 return PositionIterator(internal
->open_position_list(did
, term
));
229 Database::get_doccount() const
231 return internal
->get_doccount();
235 Database::get_lastdocid() const
237 return internal
->get_lastdocid();
241 Database::get_average_length() const
243 Xapian::doccount doc_count
= internal
->get_doccount();
244 if (rare(doc_count
== 0))
247 Xapian::totallength total_length
= internal
->get_total_length();
248 return total_length
/ double(doc_count
);
252 Database::get_total_length() const
254 return internal
->get_total_length();
258 Database::get_termfreq(const string
& term
) const
261 return get_doccount();
263 Xapian::doccount result
;
264 internal
->get_freqs(term
, &result
, NULL
);
269 Database::get_collection_freq(const string
& term
) const
272 return get_doccount();
274 Xapian::termcount result
;
275 internal
->get_freqs(term
, NULL
, &result
);
280 Database::get_value_freq(Xapian::valueno slot
) const
282 return internal
->get_value_freq(slot
);
286 Database::get_value_lower_bound(Xapian::valueno slot
) const
288 return internal
->get_value_lower_bound(slot
);
292 Database::get_value_upper_bound(Xapian::valueno slot
) const
294 return internal
->get_value_upper_bound(slot
);
298 Database::get_doclength_lower_bound() const
300 return internal
->get_doclength_lower_bound();
304 Database::get_doclength_upper_bound() const
306 return internal
->get_doclength_upper_bound();
310 Database::get_wdf_upper_bound(const string
& term
) const
315 return internal
->get_wdf_upper_bound(term
);
319 Database::valuestream_begin(Xapian::valueno slot
) const
321 return ValueIterator(internal
->open_value_list(slot
));
325 Database::get_doclength(Xapian::docid did
) const
328 docid_zero_invalid();
330 return internal
->get_doclength(did
);
334 Database::get_unique_terms(Xapian::docid did
) const
337 docid_zero_invalid();
339 return internal
->get_unique_terms(did
);
343 Database::get_document(Xapian::docid did
, unsigned flags
) const
346 docid_zero_invalid();
348 bool assume_valid
= flags
& Xapian::DOC_ASSUME_VALID
;
349 return Document(internal
->open_document(did
, assume_valid
));
353 Database::term_exists(const string
& term
) const
355 // NB Internal::term_exists() handles term.empty().
356 return internal
->term_exists(term
);
360 Database::keep_alive()
362 internal
->keep_alive();
366 Database::get_description() const
368 string desc
= "Database(";
369 desc
+= internal
->get_description();
// Slack, in trigram score units, used by Database::get_spelling_suggestion():
// a candidate is examined only if score + TRIGRAM_SCORE_THRESHOLD >= best.
374 // Word must have a trigram score at least this close to the best score seen
376 #define TRIGRAM_SCORE_THRESHOLD 2
// Suggest a spelling correction for `word`.
//
// Candidates come from the backend's spelling term list for `word`.  Each one
// is filtered by trigram score and by progressively more expensive
// edit-distance lower bounds before the true edit distance is computed.
// `max_edit_distance` is the initial value of the best edit distance so far.
//
// NOTE(review): the return type, the loop header, several early exits and the
// bookkeeping that records the winning candidate are not visible in this
// excerpt - confirm the exact control flow against the full file.
379 Database::get_spelling_suggestion(const string
& word
,
380 unsigned max_edit_distance
) const
// Words of at most one byte are special-cased (the action taken is not
// visible here).
382 if (word
.size() <= 1)
// Candidate stream from the backend, owned by a unique_ptr.
385 unique_ptr
<TermList
> merger(internal
->open_spelling_termlist(word
));
389 // Convert word to UTF-32.
390 vector
<unsigned> utf32_word
{Utf8Iterator(word
), Utf8Iterator()};
// Scratch buffer for the UTF-32 form of each candidate term.
392 vector
<unsigned> utf32_term
;
// Best trigram score seen so far.
394 Xapian::termcount best
= 1;
// Smallest edit distance accepted so far.
// NOTE(review): unsigned -> int conversion, and `edist_best * 4` below is
// done in int, so a very large max_edit_distance could overflow - confirm
// whether callers bound this value.
396 int edist_best
= max_edit_distance
;
397 Xapian::doccount freq_best
= 0;
398 Xapian::doccount freq_exact
= 0;
// Advance the candidate stream; a non-null result from next() replaces the
// current TermList object.
400 TermList
* ret
= merger
->next();
401 if (ret
) merger
.reset(ret
);
403 if (merger
->at_end()) break;
405 string term
= merger
->get_termname();
// The wdf of the merged list is used as the candidate's trigram score.
406 Xapian::termcount score
= merger
->get_wdf();
408 LOGVALUE(SPELLING
, term
);
409 LOGVALUE(SPELLING
, score
);
// Only examine candidates whose score is within TRIGRAM_SCORE_THRESHOLD of
// the best score seen.
410 if (score
+ TRIGRAM_SCORE_THRESHOLD
>= best
) {
411 if (score
> best
) best
= score
;
413 // There's no point considering a word where the difference
414 // in length is greater than the smallest number of edits we've
417 // First check the length of the encoded UTF-8 version of term.
418 // Each UTF-32 character is 1-4 bytes in UTF-8.
419 if (abs(long(term
.size()) - long(word
.size())) > edist_best
* 4) {
420 LOGLINE(SPELLING
, "Lengths much too different");
424 // Now convert to UTF-32, and compare the true lengths more
426 utf32_term
.assign(Utf8Iterator(term
), Utf8Iterator());
428 // Check a very cheap length-based lower bound first.
429 long lb
= abs(long(utf32_term
.size()) - long(utf32_word
.size()));
430 if (lb
> edist_best
) {
// Second lower bound, from freq_edit_lower_bound().
434 if (freq_edit_lower_bound(utf32_term
, utf32_word
) > edist_best
) {
435 LOGLINE(SPELLING
, "Rejected by character frequency test");
// The cheap bounds passed, so compute the edit distance itself (some
// arguments of this call are not visible in this excerpt).
439 int edist
= edit_distance_unsigned(&utf32_term
[0],
440 int(utf32_term
.size()),
442 int(utf32_word
.size()),
444 LOGVALUE(SPELLING
, edist
);
446 if (edist
<= edist_best
) {
447 Xapian::doccount freq
= internal
->get_spelling_frequency(term
);
449 LOGVALUE(SPELLING
, freq
);
450 LOGVALUE(SPELLING
, freq_best
);
451 // Even if we have an exact match, there may be a much more
452 // frequent potential correction which will still be
// A candidate becomes the new best if it is strictly closer, or equally
// close but more frequent.
459 if (edist
< edist_best
|| freq
> freq_best
) {
460 LOGLINE(SPELLING
, "Best so far: \"" << term
<<
461 "\" edist " << edist
<< " freq " << freq
);
// Final comparison of the best correction's frequency against freq_exact
// (the action taken is not visible here).
469 if (freq_best
< freq_exact
)
475 Database::spellings_begin() const
477 return TermIterator(internal
->open_spelling_wordlist());
481 Database::synonyms_begin(const string
& term
) const
483 return TermIterator(internal
->open_synonym_termlist(term
));
487 Database::synonym_keys_begin(const string
& prefix
) const
489 return TermIterator(internal
->open_synonym_keylist(prefix
));
493 Database::get_metadata(const string
& key
) const
495 if (rare(key
.empty()))
496 empty_metadata_key();
498 return internal
->get_metadata(key
);
502 Database::metadata_keys_begin(const string
& prefix
) const
504 return TermIterator(internal
->open_metadata_keylist(prefix
));
508 Database::get_uuid() const
510 return internal
->get_uuid();
514 Database::locked() const
516 return internal
->locked();
520 Database::get_revision() const
522 return internal
->get_revision();
526 WritableDatabase::commit()
532 WritableDatabase::begin_transaction(bool flushed
)
534 internal
->begin_transaction(flushed
);
538 WritableDatabase::end_transaction_(bool do_commit
)
540 internal
->end_transaction(do_commit
);
544 WritableDatabase::add_document(const Document
& doc
)
546 return internal
->add_document(doc
);
550 WritableDatabase::delete_document(Xapian::docid did
)
552 internal
->delete_document(did
);
556 WritableDatabase::delete_document(const string
& term
)
559 empty_term_invalid();
561 internal
->delete_document(term
);
565 WritableDatabase::replace_document(Xapian::docid did
, const Document
& doc
)
568 docid_zero_invalid();
570 internal
->replace_document(did
, doc
);
574 WritableDatabase::replace_document(const string
& term
, const Document
& doc
)
577 empty_term_invalid();
579 return internal
->replace_document(term
, doc
);
583 WritableDatabase::add_spelling(const string
& word
,
584 Xapian::termcount freqinc
) const
586 internal
->add_spelling(word
, freqinc
);
590 WritableDatabase::remove_spelling(const string
& word
,
591 Xapian::termcount freqdec
) const
593 return internal
->remove_spelling(word
, freqdec
);
597 WritableDatabase::add_synonym(const string
& term
,
598 const string
& synonym
) const
600 internal
->add_synonym(term
, synonym
);
604 WritableDatabase::remove_synonym(const string
& term
,
605 const string
& synonym
) const
607 internal
->remove_synonym(term
, synonym
);
611 WritableDatabase::clear_synonyms(const string
& term
) const
613 internal
->clear_synonyms(term
);
617 WritableDatabase::set_metadata(const string
& key
, const string
& value
)
619 if (rare(key
.empty()))
620 empty_metadata_key();
622 internal
->set_metadata(key
, value
);
626 WritableDatabase::get_description() const
628 string desc
= "WritableDatabase(";
629 desc
+= internal
->get_description();