Split PostList and PostingIterator::Internal
[xapian.git] / xapian-core / api / database.cc
blob7c4096887a758dab307e02314ece03627e6c0df8
1 /** @file database.cc
2 * @brief Database API class
3 */
4 /* Copyright 2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017 Olly Betts
5 * Copyright 2007,2008,2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include <xapian/database.h>
26 #include "backends/databaseinternal.h"
27 #include "backends/empty_database.h"
28 #include "backends/multi/multi_database.h"
29 #include "debuglog.h"
30 #include "editdistance.h"
31 #include "omassert.h"
32 #include "pack.h"
33 #include "postingiteratorinternal.h"
34 #include <xapian/constants.h>
35 #include <xapian/error.h>
36 #include <xapian/positioniterator.h>
37 #include <xapian/postingiterator.h>
38 #include <xapian/termiterator.h>
39 #include <xapian/unicode.h>
41 #include <algorithm>
42 #include <cstdlib> // For abs().
43 #include <memory>
44 #include <string>
45 #include <vector>
47 using namespace std;
49 [[noreturn]]
50 static void docid_zero_invalid()
52 throw Xapian::InvalidArgumentError("Document ID 0 is invalid");
55 [[noreturn]]
56 static void empty_metadata_key()
58 throw Xapian::InvalidArgumentError("Empty metadata keys are invalid");
61 [[noreturn]]
62 static void empty_term_invalid()
64 throw Xapian::InvalidArgumentError("Empty terms are invalid");
67 namespace Xapian {
69 Database::Database(Database::Internal* internal_)
70 : internal(internal_)
74 Database::Database(const Database& o)
75 : internal(o.internal)
79 Database&
80 Database::operator=(const Database& o)
82 internal = o.internal;
83 return *this;
86 Database::Database()
87 : internal(new EmptyDatabase)
91 Database::~Database()
95 bool
96 Database::reopen()
98 return internal->reopen();
101 void
102 Database::close()
104 internal->close();
107 void
108 Database::add_database_(const Database& o, bool read_only)
110 if (this == &o) {
111 const char* msg = read_only ?
112 "Database::add_database(): Can't add a Database to itself" :
113 "WritableDatabase::add_database(): Can't add a WritableDatabase "
114 "to itself";
115 throw InvalidArgumentError(msg);
118 auto o_size = o.internal->size();
119 if (o_size == 0) {
120 // Adding an empty database is a no-op.
121 return;
124 auto my_size = internal->size();
125 if (my_size == 0) {
126 // Just copy.
127 internal = o.internal;
128 return;
131 #if 0
132 // This doesn't work - for example:
134 // Database db;
135 // db.add_database(WritableDatabase("one.db"));
136 // db.add_database(WritableDatabase("two.db"));
138 // The first add_database() assigns the internal across, so at the second
139 // call internal->is_read_only() returns false but read_only is true.
141 // I'm not entirely convinced the extra complexity required to make this
142 // work is worthwhile. We catch static violations such as this at compile
143 // time:
145 // WritableDatabase db;
146 // db.add_database(Database("one.db"));
148 // The case we don't catch is:
150 // WritableDatabase db;
151 // Database ro_db = db;
152 // ro_db.add_database(Database("one.db"));
154 // But performing WritableDatabase actions using such a WritableDatabase
155 // should now throw InvalidOperationError.
156 if (!internal->is_read_only() && read_only) {
157 throw InvalidArgumentError("Database::add_database(): Can't add a "
158 "Database to a WritableDatabase");
160 #endif
162 // Make sure internal is a MultiDatabase with enough space reserved.
163 auto new_size = my_size + o_size;
164 MultiDatabase* multi_db;
165 if (my_size == 1) {
166 multi_db = new MultiDatabase(new_size, read_only);
167 multi_db->push_back(internal.get());
168 internal = multi_db;
169 } else {
170 // Must already be a MultiDatabase as everything else reports 1 for
171 // size().
172 multi_db = static_cast<MultiDatabase*>(internal.get());
173 multi_db->reserve(new_size);
176 if (o_size == 1) {
177 multi_db->push_back(o.internal.get());
178 } else {
179 // Must be a MultiDatabase.
180 auto o_multi = static_cast<MultiDatabase*>(o.internal.get());
181 // Add the shards from o to ourself.
182 for (auto&& shard : o_multi->shards) {
183 multi_db->push_back(shard);
188 PostingIterator
189 Database::postlist_begin(const string& term) const
191 PostList* pl = internal->open_post_list(term);
192 if (!pl) return PostingIterator();
193 return PostingIterator(new PostingIterator::Internal(pl));
196 TermIterator
197 Database::termlist_begin(Xapian::docid did) const
199 if (did == 0)
200 docid_zero_invalid();
202 return TermIterator(internal->open_term_list(did));
205 TermIterator
206 Database::allterms_begin(const string& prefix) const
208 return TermIterator(internal->open_allterms(prefix));
211 bool
212 Database::has_positions() const
214 return internal->has_positions();
217 PositionIterator
218 Database::positionlist_begin(Xapian::docid did, const string& term) const
220 if (did == 0)
221 docid_zero_invalid();
223 if (term.empty())
224 empty_term_invalid();
226 return PositionIterator(internal->open_position_list(did, term));
229 Xapian::doccount
230 Database::get_doccount() const
232 return internal->get_doccount();
235 Xapian::docid
236 Database::get_lastdocid() const
238 return internal->get_lastdocid();
241 double
242 Database::get_average_length() const
244 Xapian::doccount doc_count = internal->get_doccount();
245 if (rare(doc_count == 0))
246 return 0.0;
248 Xapian::totallength total_length = internal->get_total_length();
249 return total_length / double(doc_count);
252 Xapian::totallength
253 Database::get_total_length() const
255 return internal->get_total_length();
258 Xapian::doccount
259 Database::get_termfreq(const string& term) const
261 if (term.empty())
262 return get_doccount();
264 Xapian::doccount result;
265 internal->get_freqs(term, &result, NULL);
266 return result;
269 Xapian::termcount
270 Database::get_collection_freq(const string& term) const
272 if (term.empty())
273 return get_doccount();
275 Xapian::termcount result;
276 internal->get_freqs(term, NULL, &result);
277 return result;
280 Xapian::doccount
281 Database::get_value_freq(Xapian::valueno slot) const
283 return internal->get_value_freq(slot);
286 string
287 Database::get_value_lower_bound(Xapian::valueno slot) const
289 return internal->get_value_lower_bound(slot);
292 string
293 Database::get_value_upper_bound(Xapian::valueno slot) const
295 return internal->get_value_upper_bound(slot);
298 Xapian::termcount
299 Database::get_doclength_lower_bound() const
301 return internal->get_doclength_lower_bound();
304 Xapian::termcount
305 Database::get_doclength_upper_bound() const
307 return internal->get_doclength_upper_bound();
310 Xapian::termcount
311 Database::get_wdf_upper_bound(const string& term) const
313 if (term.empty())
314 return 0;
316 return internal->get_wdf_upper_bound(term);
319 ValueIterator
320 Database::valuestream_begin(Xapian::valueno slot) const
322 return ValueIterator(internal->open_value_list(slot));
325 Xapian::termcount
326 Database::get_doclength(Xapian::docid did) const
328 if (did == 0)
329 docid_zero_invalid();
331 return internal->get_doclength(did);
334 Xapian::termcount
335 Database::get_unique_terms(Xapian::docid did) const
337 if (did == 0)
338 docid_zero_invalid();
340 return internal->get_unique_terms(did);
343 Document
344 Database::get_document(Xapian::docid did, unsigned flags) const
346 if (rare(did == 0))
347 docid_zero_invalid();
349 bool assume_valid = flags & Xapian::DOC_ASSUME_VALID;
350 return Document(internal->open_document(did, assume_valid));
353 bool
354 Database::term_exists(const string& term) const
356 // NB Internal::term_exists() handles term.empty().
357 return internal->term_exists(term);
360 void
361 Database::keep_alive()
363 internal->keep_alive();
366 string
367 Database::get_description() const
369 string desc = "Database(";
370 desc += internal->get_description();
371 desc += ')';
372 return desc;
// Word must have a trigram score at least this close to the best score seen
// so far.
//
// A typed, scoped constant rather than a preprocessor macro so the name obeys
// normal C++ scoping rules and can't clash with other translation units'
// macros.
static constexpr unsigned TRIGRAM_SCORE_THRESHOLD = 2;
379 string
380 Database::get_spelling_suggestion(const string& word,
381 unsigned max_edit_distance) const
383 if (word.size() <= 1)
384 return string();
386 unique_ptr<TermList> merger(internal->open_spelling_termlist(word));
387 if (!merger.get())
388 return string();
390 // Convert word to UTF-32.
391 vector<unsigned> utf32_word{Utf8Iterator(word), Utf8Iterator()};
393 vector<unsigned> utf32_term;
395 Xapian::termcount best = 1;
396 string result;
397 int edist_best = max_edit_distance;
398 Xapian::doccount freq_best = 0;
399 Xapian::doccount freq_exact = 0;
400 while (true) {
401 TermList* ret = merger->next();
402 if (ret) merger.reset(ret);
404 if (merger->at_end()) break;
406 string term = merger->get_termname();
407 Xapian::termcount score = merger->get_wdf();
409 LOGVALUE(SPELLING, term);
410 LOGVALUE(SPELLING, score);
411 if (score + TRIGRAM_SCORE_THRESHOLD >= best) {
412 if (score > best) best = score;
414 // There's no point considering a word where the difference
415 // in length is greater than the smallest number of edits we've
416 // found so far.
418 // First check the length of the encoded UTF-8 version of term.
419 // Each UTF-32 character is 1-4 bytes in UTF-8.
420 if (abs(long(term.size()) - long(word.size())) > edist_best * 4) {
421 LOGLINE(SPELLING, "Lengths much too different");
422 continue;
425 // Now convert to UTF-32, and compare the true lengths more
426 // strictly.
427 utf32_term.assign(Utf8Iterator(term), Utf8Iterator());
429 // Check a very cheap length-based lower bound first.
430 long lb = abs(long(utf32_term.size()) - long(utf32_word.size()));
431 if (lb > edist_best) {
432 continue;
435 if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) {
436 LOGLINE(SPELLING, "Rejected by character frequency test");
437 continue;
440 int edist = edit_distance_unsigned(&utf32_term[0],
441 int(utf32_term.size()),
442 &utf32_word[0],
443 int(utf32_word.size()),
444 edist_best);
445 LOGVALUE(SPELLING, edist);
447 if (edist <= edist_best) {
448 Xapian::doccount freq = internal->get_spelling_frequency(term);
450 LOGVALUE(SPELLING, freq);
451 LOGVALUE(SPELLING, freq_best);
452 // Even if we have an exact match, there may be a much more
453 // frequent potential correction which will still be
454 // interesting.
455 if (edist == 0) {
456 freq_exact = freq;
457 continue;
460 if (edist < edist_best || freq > freq_best) {
461 LOGLINE(SPELLING, "Best so far: \"" << term <<
462 "\" edist " << edist << " freq " << freq);
463 result = term;
464 edist_best = edist;
465 freq_best = freq;
470 if (freq_best < freq_exact)
471 return string();
472 return result;
475 TermIterator
476 Database::spellings_begin() const
478 return TermIterator(internal->open_spelling_wordlist());
481 TermIterator
482 Database::synonyms_begin(const string& term) const
484 return TermIterator(internal->open_synonym_termlist(term));
487 TermIterator
488 Database::synonym_keys_begin(const string& prefix) const
490 return TermIterator(internal->open_synonym_keylist(prefix));
493 string
494 Database::get_metadata(const string& key) const
496 if (rare(key.empty()))
497 empty_metadata_key();
499 return internal->get_metadata(key);
502 Xapian::TermIterator
503 Database::metadata_keys_begin(const string& prefix) const
505 return TermIterator(internal->open_metadata_keylist(prefix));
508 string
509 Database::get_uuid() const
511 return internal->get_uuid();
514 bool
515 Database::locked() const
517 return internal->locked();
520 Xapian::rev
521 Database::get_revision() const
523 const string& s = internal->get_revision_info();
524 const char* p = s.data();
525 const char* end = p + s.size();
526 Xapian::rev revision;
527 if (!unpack_uint(&p, end, &revision))
528 throw Xapian::UnimplementedError("Database::get_revision() only "
529 "supported for glass");
530 return revision;
533 void
534 WritableDatabase::commit()
536 internal->commit();
539 void
540 WritableDatabase::begin_transaction(bool flushed)
542 internal->begin_transaction(flushed);
545 void
546 WritableDatabase::end_transaction_(bool do_commit)
548 internal->end_transaction(do_commit);
551 Xapian::docid
552 WritableDatabase::add_document(const Document& doc)
554 return internal->add_document(doc);
557 void
558 WritableDatabase::delete_document(Xapian::docid did)
560 internal->delete_document(did);
563 void
564 WritableDatabase::delete_document(const string& term)
566 if (term.empty())
567 empty_term_invalid();
569 internal->delete_document(term);
572 void
573 WritableDatabase::replace_document(Xapian::docid did, const Document& doc)
575 if (rare(did == 0))
576 docid_zero_invalid();
578 internal->replace_document(did, doc);
581 Xapian::docid
582 WritableDatabase::replace_document(const string& term, const Document& doc)
584 if (term.empty())
585 empty_term_invalid();
587 return internal->replace_document(term, doc);
590 void
591 WritableDatabase::add_spelling(const string& word,
592 Xapian::termcount freqinc) const
594 internal->add_spelling(word, freqinc);
597 Xapian::termcount
598 WritableDatabase::remove_spelling(const string& word,
599 Xapian::termcount freqdec) const
601 return internal->remove_spelling(word, freqdec);
604 void
605 WritableDatabase::add_synonym(const string& term,
606 const string& synonym) const
608 internal->add_synonym(term, synonym);
611 void
612 WritableDatabase::remove_synonym(const string& term,
613 const string& synonym) const
615 internal->remove_synonym(term, synonym);
618 void
619 WritableDatabase::clear_synonyms(const string& term) const
621 internal->clear_synonyms(term);
624 void
625 WritableDatabase::set_metadata(const string& key, const string& value)
627 if (rare(key.empty()))
628 empty_metadata_key();
630 internal->set_metadata(key, value);
633 string
634 WritableDatabase::get_description() const
636 string desc = "WritableDatabase(";
637 desc += internal->get_description();
638 desc += ')';
639 return desc;