[python3] Simplify generated wrapper post-processing
[xapian.git] / xapian-core / api / database.cc
blobbe87d738bb64d2766c92fa01073ccef18270daeb
1 /** @file database.cc
2 * @brief Database API class
3 */
4 /* Copyright 2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017 Olly Betts
5 * Copyright 2007,2008,2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include <xapian/database.h>
26 #include "backends/databaseinternal.h"
27 #include "backends/empty_database.h"
28 #include "backends/multi/multi_database.h"
29 #include "debuglog.h"
30 #include "editdistance.h"
31 #include "omassert.h"
32 #include "postingiteratorinternal.h"
33 #include <xapian/constants.h>
34 #include <xapian/error.h>
35 #include <xapian/positioniterator.h>
36 #include <xapian/postingiterator.h>
37 #include <xapian/termiterator.h>
38 #include <xapian/unicode.h>
40 #include <algorithm>
41 #include <cstdlib> // For abs().
42 #include <memory>
43 #include <string>
44 #include <vector>
46 using namespace std;
48 [[noreturn]]
49 static void docid_zero_invalid()
51 throw Xapian::InvalidArgumentError("Document ID 0 is invalid");
54 [[noreturn]]
55 static void empty_metadata_key()
57 throw Xapian::InvalidArgumentError("Empty metadata keys are invalid");
60 [[noreturn]]
61 static void empty_term_invalid()
63 throw Xapian::InvalidArgumentError("Empty terms are invalid");
66 namespace Xapian {
68 Database::Database(Database::Internal* internal_)
69 : internal(internal_)
73 Database::Database(const Database& o)
74 : internal(o.internal)
78 Database&
79 Database::operator=(const Database& o)
81 internal = o.internal;
82 return *this;
85 Database::Database()
86 : internal(new EmptyDatabase)
90 Database::~Database()
94 bool
95 Database::reopen()
97 return internal->reopen();
100 void
101 Database::close()
103 internal->close();
106 void
107 Database::add_database_(const Database& o, bool read_only)
109 if (this == &o) {
110 const char* msg = read_only ?
111 "Database::add_database(): Can't add a Database to itself" :
112 "WritableDatabase::add_database(): Can't add a WritableDatabase "
113 "to itself";
114 throw InvalidArgumentError(msg);
117 auto o_size = o.internal->size();
118 if (o_size == 0) {
119 // Adding an empty database is a no-op.
120 return;
123 auto my_size = internal->size();
124 if (my_size == 0) {
125 // Just copy.
126 internal = o.internal;
127 return;
130 #if 0
131 // This doesn't work - for example:
133 // Database db;
134 // db.add_database(WritableDatabase("one.db"));
135 // db.add_database(WritableDatabase("two.db"));
137 // The first add_database() assigns the internal across, so at the second
138 // call internal->is_read_only() returns false but read_only is true.
140 // I'm not entirely convinced the extra complexity required to make this
141 // work is worthwhile. We catch static violations such as this at compile
142 // time:
144 // WritableDatabase db;
145 // db.add_database(Database("one.db"));
147 // The case we don't catch is:
149 // WritableDatabase db;
150 // Database ro_db = db;
151 // ro_db.add_database(Database("one.db"));
153 // But performing WritableDatabase actions using such a WritableDatabase
154 // should now throw InvalidOperationError.
155 if (!internal->is_read_only() && read_only) {
156 throw InvalidArgumentError("Database::add_database(): Can't add a "
157 "Database to a WritableDatabase");
159 #endif
161 // Make sure internal is a MultiDatabase with enough space reserved.
162 auto new_size = my_size + o_size;
163 MultiDatabase* multi_db;
164 if (my_size == 1) {
165 multi_db = new MultiDatabase(new_size, read_only);
166 multi_db->push_back(internal.get());
167 internal = multi_db;
168 } else {
169 // Must already be a MultiDatabase as everything else reports 1 for
170 // size().
171 multi_db = static_cast<MultiDatabase*>(internal.get());
172 multi_db->reserve(new_size);
175 if (o_size == 1) {
176 multi_db->push_back(o.internal.get());
177 } else {
178 // Must be a MultiDatabase.
179 auto o_multi = static_cast<MultiDatabase*>(o.internal.get());
180 // Add the shards from o to ourself.
181 for (auto&& shard : o_multi->shards) {
182 multi_db->push_back(shard);
187 PostingIterator
188 Database::postlist_begin(const string& term) const
190 PostList* pl = internal->open_post_list(term);
191 if (!pl) return PostingIterator();
192 return PostingIterator(new PostingIterator::Internal(pl, *this));
195 TermIterator
196 Database::termlist_begin(Xapian::docid did) const
198 if (did == 0)
199 docid_zero_invalid();
201 return TermIterator(internal->open_term_list(did));
204 TermIterator
205 Database::allterms_begin(const string& prefix) const
207 return TermIterator(internal->open_allterms(prefix));
210 bool
211 Database::has_positions() const
213 return internal->has_positions();
216 PositionIterator
217 Database::positionlist_begin(Xapian::docid did, const string& term) const
219 if (did == 0)
220 docid_zero_invalid();
222 if (term.empty())
223 empty_term_invalid();
225 return PositionIterator(internal->open_position_list(did, term));
228 Xapian::doccount
229 Database::get_doccount() const
231 return internal->get_doccount();
234 Xapian::docid
235 Database::get_lastdocid() const
237 return internal->get_lastdocid();
240 double
241 Database::get_average_length() const
243 Xapian::doccount doc_count = internal->get_doccount();
244 if (rare(doc_count == 0))
245 return 0.0;
247 Xapian::totallength total_length = internal->get_total_length();
248 return total_length / double(doc_count);
251 Xapian::totallength
252 Database::get_total_length() const
254 return internal->get_total_length();
257 Xapian::doccount
258 Database::get_termfreq(const string& term) const
260 if (term.empty())
261 return get_doccount();
263 Xapian::doccount result;
264 internal->get_freqs(term, &result, NULL);
265 return result;
268 Xapian::termcount
269 Database::get_collection_freq(const string& term) const
271 if (term.empty())
272 return get_doccount();
274 Xapian::termcount result;
275 internal->get_freqs(term, NULL, &result);
276 return result;
279 Xapian::doccount
280 Database::get_value_freq(Xapian::valueno slot) const
282 return internal->get_value_freq(slot);
285 string
286 Database::get_value_lower_bound(Xapian::valueno slot) const
288 return internal->get_value_lower_bound(slot);
291 string
292 Database::get_value_upper_bound(Xapian::valueno slot) const
294 return internal->get_value_upper_bound(slot);
297 Xapian::termcount
298 Database::get_doclength_lower_bound() const
300 return internal->get_doclength_lower_bound();
303 Xapian::termcount
304 Database::get_doclength_upper_bound() const
306 return internal->get_doclength_upper_bound();
309 Xapian::termcount
310 Database::get_wdf_upper_bound(const string& term) const
312 if (term.empty())
313 return 0;
315 return internal->get_wdf_upper_bound(term);
318 ValueIterator
319 Database::valuestream_begin(Xapian::valueno slot) const
321 return ValueIterator(internal->open_value_list(slot));
324 Xapian::termcount
325 Database::get_doclength(Xapian::docid did) const
327 if (did == 0)
328 docid_zero_invalid();
330 return internal->get_doclength(did);
333 Xapian::termcount
334 Database::get_unique_terms(Xapian::docid did) const
336 if (did == 0)
337 docid_zero_invalid();
339 return internal->get_unique_terms(did);
342 Document
343 Database::get_document(Xapian::docid did, unsigned flags) const
345 if (rare(did == 0))
346 docid_zero_invalid();
348 bool assume_valid = flags & Xapian::DOC_ASSUME_VALID;
349 return Document(internal->open_document(did, assume_valid));
352 bool
353 Database::term_exists(const string& term) const
355 // NB Internal::term_exists() handles term.empty().
356 return internal->term_exists(term);
359 void
360 Database::keep_alive()
362 internal->keep_alive();
365 string
366 Database::get_description() const
368 string desc = "Database(";
369 desc += internal->get_description();
370 desc += ')';
371 return desc;
374 // Word must have a trigram score at least this close to the best score seen
375 // so far.
376 #define TRIGRAM_SCORE_THRESHOLD 2
378 string
379 Database::get_spelling_suggestion(const string& word,
380 unsigned max_edit_distance) const
382 if (word.size() <= 1)
383 return string();
385 unique_ptr<TermList> merger(internal->open_spelling_termlist(word));
386 if (!merger.get())
387 return string();
389 // Convert word to UTF-32.
390 vector<unsigned> utf32_word{Utf8Iterator(word), Utf8Iterator()};
392 vector<unsigned> utf32_term;
394 Xapian::termcount best = 1;
395 string result;
396 int edist_best = max_edit_distance;
397 Xapian::doccount freq_best = 0;
398 Xapian::doccount freq_exact = 0;
399 while (true) {
400 TermList* ret = merger->next();
401 if (ret) merger.reset(ret);
403 if (merger->at_end()) break;
405 string term = merger->get_termname();
406 Xapian::termcount score = merger->get_wdf();
408 LOGVALUE(SPELLING, term);
409 LOGVALUE(SPELLING, score);
410 if (score + TRIGRAM_SCORE_THRESHOLD >= best) {
411 if (score > best) best = score;
413 // There's no point considering a word where the difference
414 // in length is greater than the smallest number of edits we've
415 // found so far.
417 // First check the length of the encoded UTF-8 version of term.
418 // Each UTF-32 character is 1-4 bytes in UTF-8.
419 if (abs(long(term.size()) - long(word.size())) > edist_best * 4) {
420 LOGLINE(SPELLING, "Lengths much too different");
421 continue;
424 // Now convert to UTF-32, and compare the true lengths more
425 // strictly.
426 utf32_term.assign(Utf8Iterator(term), Utf8Iterator());
428 // Check a very cheap length-based lower bound first.
429 long lb = abs(long(utf32_term.size()) - long(utf32_word.size()));
430 if (lb > edist_best) {
431 continue;
434 if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) {
435 LOGLINE(SPELLING, "Rejected by character frequency test");
436 continue;
439 int edist = edit_distance_unsigned(&utf32_term[0],
440 int(utf32_term.size()),
441 &utf32_word[0],
442 int(utf32_word.size()),
443 edist_best);
444 LOGVALUE(SPELLING, edist);
446 if (edist <= edist_best) {
447 Xapian::doccount freq = internal->get_spelling_frequency(term);
449 LOGVALUE(SPELLING, freq);
450 LOGVALUE(SPELLING, freq_best);
451 // Even if we have an exact match, there may be a much more
452 // frequent potential correction which will still be
453 // interesting.
454 if (edist == 0) {
455 freq_exact = freq;
456 continue;
459 if (edist < edist_best || freq > freq_best) {
460 LOGLINE(SPELLING, "Best so far: \"" << term <<
461 "\" edist " << edist << " freq " << freq);
462 result = term;
463 edist_best = edist;
464 freq_best = freq;
469 if (freq_best < freq_exact)
470 return string();
471 return result;
474 TermIterator
475 Database::spellings_begin() const
477 return TermIterator(internal->open_spelling_wordlist());
480 TermIterator
481 Database::synonyms_begin(const string& term) const
483 return TermIterator(internal->open_synonym_termlist(term));
486 TermIterator
487 Database::synonym_keys_begin(const string& prefix) const
489 return TermIterator(internal->open_synonym_keylist(prefix));
492 string
493 Database::get_metadata(const string& key) const
495 if (rare(key.empty()))
496 empty_metadata_key();
498 return internal->get_metadata(key);
501 Xapian::TermIterator
502 Database::metadata_keys_begin(const string& prefix) const
504 return TermIterator(internal->open_metadata_keylist(prefix));
507 string
508 Database::get_uuid() const
510 return internal->get_uuid();
513 bool
514 Database::locked() const
516 return internal->locked();
519 Xapian::rev
520 Database::get_revision() const
522 return internal->get_revision();
525 void
526 WritableDatabase::commit()
528 internal->commit();
531 void
532 WritableDatabase::begin_transaction(bool flushed)
534 internal->begin_transaction(flushed);
537 void
538 WritableDatabase::end_transaction_(bool do_commit)
540 internal->end_transaction(do_commit);
543 Xapian::docid
544 WritableDatabase::add_document(const Document& doc)
546 return internal->add_document(doc);
549 void
550 WritableDatabase::delete_document(Xapian::docid did)
552 internal->delete_document(did);
555 void
556 WritableDatabase::delete_document(const string& term)
558 if (term.empty())
559 empty_term_invalid();
561 internal->delete_document(term);
564 void
565 WritableDatabase::replace_document(Xapian::docid did, const Document& doc)
567 if (rare(did == 0))
568 docid_zero_invalid();
570 internal->replace_document(did, doc);
573 Xapian::docid
574 WritableDatabase::replace_document(const string& term, const Document& doc)
576 if (term.empty())
577 empty_term_invalid();
579 return internal->replace_document(term, doc);
582 void
583 WritableDatabase::add_spelling(const string& word,
584 Xapian::termcount freqinc) const
586 internal->add_spelling(word, freqinc);
589 Xapian::termcount
590 WritableDatabase::remove_spelling(const string& word,
591 Xapian::termcount freqdec) const
593 return internal->remove_spelling(word, freqdec);
596 void
597 WritableDatabase::add_synonym(const string& term,
598 const string& synonym) const
600 internal->add_synonym(term, synonym);
603 void
604 WritableDatabase::remove_synonym(const string& term,
605 const string& synonym) const
607 internal->remove_synonym(term, synonym);
610 void
611 WritableDatabase::clear_synonyms(const string& term) const
613 internal->clear_synonyms(term);
616 void
617 WritableDatabase::set_metadata(const string& key, const string& value)
619 if (rare(key.empty()))
620 empty_metadata_key();
622 internal->set_metadata(key, value);
625 string
626 WritableDatabase::get_description() const
628 string desc = "WritableDatabase(";
629 desc += internal->get_description();
630 desc += ')';
631 return desc;