/** @file
 * @brief Database API class
 */
/* Copyright 2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017 Olly Betts
 * Copyright 2007,2008,2009 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
24 #include <xapian/database.h>
26 #include "backends/databaseinternal.h"
27 #include "backends/empty_database.h"
28 #include "backends/multi/multi_database.h"
30 #include "editdistance.h"
33 #include "postingiteratorinternal.h"
34 #include <xapian/constants.h>
35 #include <xapian/error.h>
36 #include <xapian/positioniterator.h>
37 #include <xapian/postingiterator.h>
38 #include <xapian/termiterator.h>
39 #include <xapian/unicode.h>
42 #include <cstdlib> // For abs().
50 static void docid_zero_invalid()
52 throw Xapian::InvalidArgumentError("Document ID 0 is invalid");
56 static void empty_metadata_key()
58 throw Xapian::InvalidArgumentError("Empty metadata keys are invalid");
62 static void empty_term_invalid()
64 throw Xapian::InvalidArgumentError("Empty terms are invalid");
// Constructor taking a pointer to a Database::Internal object.
// NOTE(review): only the signature survives in this extract (the leading
// number is an original line number fused into the text); the member
// initialiser and body lines are absent - restore them from upstream.
69 Database::Database(Database::Internal
* internal_
)
74 Database::Database(const Database
& o
)
75 : internal(o
.internal
)
80 Database::operator=(const Database
& o
)
82 internal
= o
.internal
;
// Member initialiser which makes `internal` a newly allocated EmptyDatabase.
// NOTE(review): the constructor's signature line is absent from this
// extract - presumably this belongs to the default constructor; confirm
// against upstream before restoring.
87 : internal(new EmptyDatabase
)
// Forwards to the internal object's reopen() and returns its result.
// NOTE(review): the enclosing function's signature and braces are absent
// from this extract - presumably Database::reopen(); confirm against
// upstream before restoring.
98 return internal
->reopen();
// Add database `o` to this database; `read_only` selects the wording of the
// "can't add to itself" error (Database vs WritableDatabase) and is passed
// on when a new MultiDatabase is created.
//
// NOTE(review): this span is a lossy extract.  The leading numbers are
// original line numbers fused into the text, and the lines carrying the
// return type, the braces and every branch condition are absent (the
// numbering jumps 113->115, 118->120, 124->127, 164->166, 173->177 ...).
// Do not infer the control flow from statement order alone - restore the
// missing lines from upstream before compiling.
108 Database::add_database_(const Database
& o
, bool read_only
)
// Error text for the self-add case; which message is used depends on
// read_only.  (The tail of the second literal is on a dropped line.)
111 const char* msg
= read_only
?
112 "Database::add_database(): Can't add a Database to itself" :
113 "WritableDatabase::add_database(): Can't add a WritableDatabase "
115 throw InvalidArgumentError(msg
);
// Size reported by the database being added.
118 auto o_size
= o
.internal
->size();
120 // Adding an empty database is a no-op.
// Size reported by this database before the addition.
124 auto my_size
= internal
->size();
// Shares o's internal object directly (guarding condition not visible here).
127 internal
= o
.internal
;
132 // This doesn't work - for example:
135 // db.add_database(WritableDatabase("one.db"));
136 // db.add_database(WritableDatabase("two.db"));
138 // The first add_database() assigns the internal across, so at the second
139 // call internal->is_read_only() returns false but read_only is true.
141 // I'm not entirely convinced the extra complexity required to make this
142 // work is worthwhile. We catch static violations such as this at compile
145 // WritableDatabase db;
146 // db.add_database(Database("one.db"));
148 // The case we don't catch is:
150 // WritableDatabase db;
151 // Database ro_db = db;
152 // ro_db.add_database(Database("one.db"));
154 // But performing WritableDatabase actions using such a WritableDatabase
155 // should now throw InvalidOperationError.
// NOTE(review): the comment above says this check "doesn't work", so it is
// presumably compiled out by preprocessor lines missing from this extract
// (original lines 131 and 160) - confirm before restoring.
156 if (!internal
->is_read_only() && read_only
) {
157 throw InvalidArgumentError("Database::add_database(): Can't add a "
158 "Database to a WritableDatabase");
162 // Make sure internal is a MultiDatabase with enough space reserved.
163 auto new_size
= my_size
+ o_size
;
164 MultiDatabase
* multi_db
;
// One branch: create a new MultiDatabase sized for all the shards and add
// the current internal object to it.
166 multi_db
= new MultiDatabase(new_size
, read_only
);
167 multi_db
->push_back(internal
.get());
// Other branch: reuse the existing MultiDatabase and reserve room in it.
170 // Must already be a MultiDatabase as everything else reports 1 for
172 multi_db
= static_cast<MultiDatabase
*>(internal
.get());
173 multi_db
->reserve(new_size
);
// Append o itself...
177 multi_db
->push_back(o
.internal
.get());
// ...or, in the other branch, each of o's shards.
179 // Must be a MultiDatabase.
180 auto o_multi
= static_cast<MultiDatabase
*>(o
.internal
.get());
181 // Add the shards from o to ourself.
182 for (auto&& shard
: o_multi
->shards
) {
183 multi_db
->push_back(shard
);
189 Database::postlist_begin(const string
& term
) const
191 PostList
* pl
= internal
->open_post_list(term
);
192 if (!pl
) return PostingIterator();
193 return PostingIterator(new PostingIterator::Internal(pl
));
197 Database::termlist_begin(Xapian::docid did
) const
200 docid_zero_invalid();
202 return TermIterator(internal
->open_term_list(did
));
206 Database::allterms_begin(const string
& prefix
) const
208 return TermIterator(internal
->open_allterms(prefix
));
212 Database::has_positions() const
214 return internal
->has_positions();
218 Database::positionlist_begin(Xapian::docid did
, const string
& term
) const
221 docid_zero_invalid();
224 empty_term_invalid();
226 return PositionIterator(internal
->open_position_list(did
, term
));
230 Database::get_doccount() const
232 return internal
->get_doccount();
236 Database::get_lastdocid() const
238 return internal
->get_lastdocid();
242 Database::get_average_length() const
244 Xapian::doccount doc_count
= internal
->get_doccount();
245 if (rare(doc_count
== 0))
248 Xapian::totallength total_length
= internal
->get_total_length();
249 return total_length
/ double(doc_count
);
253 Database::get_total_length() const
255 return internal
->get_total_length();
259 Database::get_termfreq(const string
& term
) const
262 return get_doccount();
264 Xapian::doccount result
;
265 internal
->get_freqs(term
, &result
, NULL
);
270 Database::get_collection_freq(const string
& term
) const
273 return get_doccount();
275 Xapian::termcount result
;
276 internal
->get_freqs(term
, NULL
, &result
);
281 Database::get_value_freq(Xapian::valueno slot
) const
283 return internal
->get_value_freq(slot
);
287 Database::get_value_lower_bound(Xapian::valueno slot
) const
289 return internal
->get_value_lower_bound(slot
);
293 Database::get_value_upper_bound(Xapian::valueno slot
) const
295 return internal
->get_value_upper_bound(slot
);
299 Database::get_doclength_lower_bound() const
301 return internal
->get_doclength_lower_bound();
305 Database::get_doclength_upper_bound() const
307 return internal
->get_doclength_upper_bound();
// Returns the backend's upper bound on the wdf of `term`.
// NOTE(review): lossy extract - the leading numbers are original line
// numbers, the return type and braces are absent, and the numbering jumps
// from 311 to 316, so statements preceding the forwarding call (possibly
// special handling of an empty term - TODO confirm) are missing here.
311 Database::get_wdf_upper_bound(const string
& term
) const
316 return internal
->get_wdf_upper_bound(term
);
320 Database::valuestream_begin(Xapian::valueno slot
) const
322 return ValueIterator(internal
->open_value_list(slot
));
326 Database::get_doclength(Xapian::docid did
) const
329 docid_zero_invalid();
331 return internal
->get_doclength(did
);
335 Database::get_unique_terms(Xapian::docid did
) const
338 docid_zero_invalid();
340 return internal
->get_unique_terms(did
);
344 Database::get_document(Xapian::docid did
, unsigned flags
) const
347 docid_zero_invalid();
349 bool assume_valid
= flags
& Xapian::DOC_ASSUME_VALID
;
350 return Document(internal
->open_document(did
, assume_valid
));
354 Database::term_exists(const string
& term
) const
356 // NB Internal::term_exists() handles term.empty().
357 return internal
->term_exists(term
);
361 Database::keep_alive()
363 internal
->keep_alive();
367 Database::get_description() const
369 string desc
= "Database(";
370 desc
+= internal
->get_description();
// Suggest a spelling correction for `word`, considering candidates up to
// `max_edit_distance` edits away.  Candidates come from the backend's
// spelling term list; each is filtered by trigram score, by cheap length
// and character-frequency lower bounds, and finally by edit distance, with
// spelling frequency used to choose between candidates.
//
// NOTE(review): this span is a lossy extract.  The leading numbers are
// original line numbers fused into the text; the return type, braces, loop
// header, the statements taken when a candidate is rejected, the result
// bookkeeping and the final returns are all absent (e.g. the numbering
// jumps 383->386, 399->401, 421->425, 453->460, 462->470).  Restore the
// missing lines from upstream before compiling.
375 // Word must have a trigram score at least this close to the best score seen
377 #define TRIGRAM_SCORE_THRESHOLD 2
380 Database::get_spelling_suggestion(const string
& word
,
381 unsigned max_edit_distance
) const
// Words of at most one byte are special-cased (the action is not visible).
383 if (word
.size() <= 1)
// Candidate list from the backend, owned here via unique_ptr.
386 unique_ptr
<TermList
> merger(internal
->open_spelling_termlist(word
));
390 // Convert word to UTF-32.
391 vector
<unsigned> utf32_word
{Utf8Iterator(word
), Utf8Iterator()};
// Scratch buffer reused for each candidate's UTF-32 form.
393 vector
<unsigned> utf32_term
;
// Best trigram score seen so far.
395 Xapian::termcount best
= 1;
// Smallest edit distance accepted so far, starting at the caller's limit.
397 int edist_best
= max_edit_distance
;
398 Xapian::doccount freq_best
= 0;
399 Xapian::doccount freq_exact
= 0;
// Advance the term list; a non-NULL return replaces the list being iterated.
401 TermList
* ret
= merger
->next();
402 if (ret
) merger
.reset(ret
);
404 if (merger
->at_end()) break;
406 string term
= merger
->get_termname();
// The candidate's score is read from the term list's wdf.
407 Xapian::termcount score
= merger
->get_wdf();
409 LOGVALUE(SPELLING
, term
);
410 LOGVALUE(SPELLING
, score
);
411 if (score
+ TRIGRAM_SCORE_THRESHOLD
>= best
) {
412 if (score
> best
) best
= score
;
414 // There's no point considering a word where the difference
415 // in length is greater than the smallest number of edits we've
418 // First check the length of the encoded UTF-8 version of term.
419 // Each UTF-32 character is 1-4 bytes in UTF-8.
420 if (abs(long(term
.size()) - long(word
.size())) > edist_best
* 4) {
421 LOGLINE(SPELLING
, "Lengths much too different");
425 // Now convert to UTF-32, and compare the true lengths more
427 utf32_term
.assign(Utf8Iterator(term
), Utf8Iterator());
429 // Check a very cheap length-based lower bound first.
430 long lb
= abs(long(utf32_term
.size()) - long(utf32_word
.size()));
431 if (lb
> edist_best
) {
435 if (freq_edit_lower_bound(utf32_term
, utf32_word
) > edist_best
) {
436 LOGLINE(SPELLING
, "Rejected by character frequency test");
// Full edit distance computation, only reached once the cheaper tests pass.
// (Two of the call's arguments are on lines missing from this extract.)
440 int edist
= edit_distance_unsigned(&utf32_term
[0],
441 int(utf32_term
.size()),
443 int(utf32_word
.size()),
445 LOGVALUE(SPELLING
, edist
);
447 if (edist
<= edist_best
) {
448 Xapian::doccount freq
= internal
->get_spelling_frequency(term
);
450 LOGVALUE(SPELLING
, freq
);
451 LOGVALUE(SPELLING
, freq_best
);
452 // Even if we have an exact match, there may be a much more
453 // frequent potential correction which will still be
// A candidate becomes the new best if it needs fewer edits or is more
// frequent than the current best.
460 if (edist
< edist_best
|| freq
> freq_best
) {
461 LOGLINE(SPELLING
, "Best so far: \"" << term
<<
462 "\" edist " << edist
<< " freq " << freq
);
// Final comparison of the best correction's frequency against that of the
// exact match (the resulting action is not visible in this extract).
470 if (freq_best
< freq_exact
)
476 Database::spellings_begin() const
478 return TermIterator(internal
->open_spelling_wordlist());
482 Database::synonyms_begin(const string
& term
) const
484 return TermIterator(internal
->open_synonym_termlist(term
));
488 Database::synonym_keys_begin(const string
& prefix
) const
490 return TermIterator(internal
->open_synonym_keylist(prefix
));
494 Database::get_metadata(const string
& key
) const
496 if (rare(key
.empty()))
497 empty_metadata_key();
499 return internal
->get_metadata(key
);
503 Database::metadata_keys_begin(const string
& prefix
) const
505 return TermIterator(internal
->open_metadata_keylist(prefix
));
509 Database::get_uuid() const
511 return internal
->get_uuid();
515 Database::locked() const
517 return internal
->locked();
521 Database::get_revision() const
523 const string
& s
= internal
->get_revision_info();
524 const char* p
= s
.data();
525 const char* end
= p
+ s
.size();
526 Xapian::rev revision
;
527 if (!unpack_uint(&p
, end
, &revision
))
528 throw Xapian::UnimplementedError("Database::get_revision() only "
529 "supported for glass");
// WritableDatabase::commit() - signature only.
// NOTE(review): the return type and the entire body are absent from this
// extract (the leading number is an original line number); presumably it
// forwards to the internal object like its siblings - confirm against
// upstream before restoring.
534 WritableDatabase::commit()
540 WritableDatabase::begin_transaction(bool flushed
)
542 internal
->begin_transaction(flushed
);
546 WritableDatabase::end_transaction_(bool do_commit
)
548 internal
->end_transaction(do_commit
);
552 WritableDatabase::add_document(const Document
& doc
)
554 return internal
->add_document(doc
);
558 WritableDatabase::delete_document(Xapian::docid did
)
560 internal
->delete_document(did
);
564 WritableDatabase::delete_document(const string
& term
)
567 empty_term_invalid();
569 internal
->delete_document(term
);
573 WritableDatabase::replace_document(Xapian::docid did
, const Document
& doc
)
576 docid_zero_invalid();
578 internal
->replace_document(did
, doc
);
582 WritableDatabase::replace_document(const string
& term
, const Document
& doc
)
585 empty_term_invalid();
587 return internal
->replace_document(term
, doc
);
591 WritableDatabase::add_spelling(const string
& word
,
592 Xapian::termcount freqinc
) const
594 internal
->add_spelling(word
, freqinc
);
598 WritableDatabase::remove_spelling(const string
& word
,
599 Xapian::termcount freqdec
) const
601 return internal
->remove_spelling(word
, freqdec
);
605 WritableDatabase::add_synonym(const string
& term
,
606 const string
& synonym
) const
608 internal
->add_synonym(term
, synonym
);
612 WritableDatabase::remove_synonym(const string
& term
,
613 const string
& synonym
) const
615 internal
->remove_synonym(term
, synonym
);
619 WritableDatabase::clear_synonyms(const string
& term
) const
621 internal
->clear_synonyms(term
);
625 WritableDatabase::set_metadata(const string
& key
, const string
& value
)
627 if (rare(key
.empty()))
628 empty_metadata_key();
630 internal
->set_metadata(key
, value
);
634 WritableDatabase::get_description() const
636 string desc
= "WritableDatabase(";
637 desc
+= internal
->get_description();