1 /** @file honey_values.cc
2 * @brief HoneyValueManager class
4 /* Copyright (C) 2008,2009,2010,2011,2012,2016,2017,2018 Olly Betts
5 * Copyright (C) 2008,2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "honey_values.h"
26 #include "honey_cursor.h"
27 #include "honey_postlist.h"
28 #include "honey_postlisttable.h"
29 #include "honey_termlist.h"
30 #include "honey_termlisttable.h"
32 #include "bitstream.h"
34 #include "backends/documentinternal.h"
37 #include "xapian/error.h"
38 #include "xapian/valueiterator.h"
43 using namespace Honey
;
48 // * values named instead of numbered?
51 ValueChunkReader::assign(const char * p_
, size_t len
, Xapian::docid last_did
)
55 if (!unpack_uint(&p
, end
, &did
))
56 throw Xapian::DatabaseCorruptError("Failed to unpack docid delta");
58 if (!unpack_string(&p
, end
, value
))
59 throw Xapian::DatabaseCorruptError("Failed to unpack first value");
63 ValueChunkReader::next()
71 if (!unpack_uint(&p
, end
, &delta
)) {
72 throw Xapian::DatabaseCorruptError("Failed to unpack streamed value "
76 if (!unpack_string(&p
, end
, value
))
77 throw Xapian::DatabaseCorruptError("Failed to unpack streamed value");
81 ValueChunkReader::skip_to(Xapian::docid target
)
83 if (p
== NULL
|| target
<= did
)
90 if (rare(!unpack_uint(&p
, end
, &delta
))) {
91 throw Xapian::DatabaseCorruptError("Failed to unpack streamed "
96 // Get the length of the string
97 if (rare(!unpack_uint(&p
, end
, &value_len
))) {
98 throw Xapian::DatabaseCorruptError("Failed to unpack streamed "
102 // Check that it's not too long
103 if (rare(value_len
> size_t(end
- p
))) {
104 throw Xapian::DatabaseCorruptError("Failed to unpack streamed "
108 // Assign the value and return only if we've reached the target
110 value
.assign(p
, value_len
);
120 HoneyValueManager::add_value(Xapian::docid did
, Xapian::valueno slot
,
123 map
<Xapian::valueno
, map
<Xapian::docid
, string
> >::iterator i
;
124 i
= changes
.find(slot
);
125 if (i
== changes
.end()) {
126 i
= changes
.insert(make_pair(slot
, map
<Xapian::docid
, string
>())).first
;
128 i
->second
[did
] = val
;
132 HoneyValueManager::remove_value(Xapian::docid did
, Xapian::valueno slot
)
134 map
<Xapian::valueno
, map
<Xapian::docid
, string
> >::iterator i
;
135 i
= changes
.find(slot
);
136 if (i
== changes
.end()) {
137 i
= changes
.insert(make_pair(slot
, map
<Xapian::docid
, string
>())).first
;
139 i
->second
[did
] = string();
143 HoneyValueManager::get_chunk_containing_did(Xapian::valueno slot
,
147 LOGCALL(DB
, Xapian::docid
, "HoneyValueManager::get_chunk_containing_did", slot
| did
| chunk
);
149 cursor
.reset(postlist_table
.cursor_get());
150 if (!cursor
.get()) RETURN(0);
152 bool exact
= cursor
->find_entry_ge(make_valuechunk_key(slot
, did
));
154 // The chunk doesn't end with docid did, so we need to get
155 // the last docid in the chunk from the key to return.
156 did
= docid_from_key(slot
, cursor
->current_key
);
160 chunk
= cursor
->current_tag
;
161 // FIXME: fails, not sure why: swap(chunk, cursor->current_tag);
166 static const size_t CHUNK_SIZE_THRESHOLD
= 2000;
171 HoneyPostListTable
& table
;
173 Xapian::valueno slot
;
177 ValueChunkReader reader
;
181 Xapian::docid prev_did
;
183 Xapian::docid last_did
;
185 Xapian::docid new_last_did
;
187 Xapian::docid last_allowed_did
;
189 void append_to_stream(Xapian::docid did
, const string
& value
) {
192 AssertRel(did
,>,prev_did
);
193 pack_uint(tag
, did
- prev_did
- 1);
197 pack_string(tag
, value
);
198 if (tag
.size() >= CHUNK_SIZE_THRESHOLD
) write_tag();
202 // If the last docid has changed, delete the old entry.
203 if (last_did
&& new_last_did
!= last_did
) {
204 table
.del(make_valuechunk_key(slot
, last_did
));
207 table
.add(make_valuechunk_key(slot
, new_last_did
), tag
);
214 ValueUpdater(HoneyPostListTable
& table_
, Xapian::valueno slot_
)
215 : table(table_
), slot(slot_
), last_did(0), last_allowed_did(0) { }
218 while (!reader
.at_end()) {
219 // FIXME: use skip_to and some splicing magic instead?
220 append_to_stream(reader
.get_docid(), reader
.get_value());
226 void update(Xapian::docid did
, const string
& value
) {
227 if (last_allowed_did
&& did
> last_allowed_did
) {
228 // The next change needs to go in a later existing chunk than the
229 // one we're currently updating, so we copy over the rest of the
230 // entries from the current chunk, write out the updated chunk and
231 // drop through to the case below will read in that later chunk.
232 // FIXME: use some string splicing magic instead of this loop.
233 while (!reader
.at_end()) {
234 // last_allowed_did should be an upper bound for this chunk.
235 AssertRel(reader
.get_docid(),<=,last_allowed_did
);
236 append_to_stream(reader
.get_docid(), reader
.get_value());
240 last_allowed_did
= 0;
242 if (last_allowed_did
== 0) {
243 last_allowed_did
= HONEY_MAX_DOCID
;
246 unique_ptr
<HoneyCursor
> cursor(table
.cursor_get());
247 if (cursor
->find_entry_ge(make_valuechunk_key(slot
, did
))) {
248 // We found an exact match, so the last docid is the one
252 Assert(!cursor
->after_end());
253 // Otherwise we need to unpack it from the key we found.
254 // We may have found a non-value-chunk entry in which case
255 // docid_from_key() returns 0.
256 last_did
= docid_from_key(slot
, cursor
->current_key
);
259 // If there are no further chunks, then the last docid that can go
260 // in this chunk is the highest valid docid. If there are further
261 // chunks then it's one less than the first docid of the next
264 // We found a value chunk.
266 // FIXME:swap(cursor->current_tag, ctag);
267 ctag
= cursor
->current_tag
;
268 reader
.assign(ctag
.data(), ctag
.size(), last_did
);
270 if (cursor
->next()) {
271 const string
& key
= cursor
->current_key
;
272 Xapian::docid next_last_did
= docid_from_key(slot
, key
);
276 const char* p
= cursor
->current_tag
.data();
277 const char* e
= p
+ cursor
->current_tag
.size();
278 if (!unpack_uint(&p
, e
, &delta
)) {
279 throw Xapian::DatabaseCorruptError("Failed to unpack "
282 Xapian::docid next_first_did
= next_last_did
- delta
;
283 last_allowed_did
= next_first_did
- 1;
285 Assert(last_allowed_did
);
286 AssertRel(last_allowed_did
,>=,last_did
);
290 // Copy over entries until we get to the one we want to
291 // add/modify/delete.
292 // FIXME: use skip_to and some splicing magic instead?
293 while (!reader
.at_end() && reader
.get_docid() < did
) {
294 append_to_stream(reader
.get_docid(), reader
.get_value());
297 if (!reader
.at_end() && reader
.get_docid() == did
) reader
.next();
298 if (!value
.empty()) {
299 // Add/update entry for did.
300 append_to_stream(did
, value
);
308 HoneyValueManager::merge_changes()
310 for (auto&& i
: changes
) {
311 Xapian::valueno slot
= i
.first
;
312 Honey::ValueUpdater
updater(postlist_table
, slot
);
313 for (auto&& j
: i
.second
) {
314 updater
.update(j
.first
, j
.second
);
321 HoneyValueManager::add_document(Xapian::docid did
, const Xapian::Document
&doc
,
322 map
<Xapian::valueno
, ValueStats
> &val_stats
)
324 Xapian::ValueIterator it
= doc
.values_begin();
325 if (it
== doc
.values_end()) {
326 // No document values.
327 auto i
= slots
.find(did
);
328 if (i
!= slots
.end()) {
329 // Document's values already added or modified in this batch.
330 i
->second
= string();
335 Xapian::valueno count
= doc
.internal
->values_count();
336 Xapian::VecCOW
<Xapian::termpos
> slotvec(count
);
338 Xapian::valueno first_slot
= it
.get_valueno();
339 Xapian::valueno last_slot
= first_slot
;
340 while (it
!= doc
.values_end()) {
341 Xapian::valueno slot
= it
.get_valueno();
342 slotvec
.push_back(slot
);
343 const string
& value
= *it
;
345 // Update the statistics.
346 auto i
= val_stats
.insert(make_pair(slot
, ValueStats()));
347 ValueStats
& stats
= i
.first
->second
;
349 // There were no statistics stored already, so read them.
350 get_value_stats(slot
, stats
);
353 // Now, modify the stored statistics.
354 if ((stats
.freq
)++ == 0) {
355 // If the value count was previously zero, set the upper and lower
356 // bounds to the newly added value.
357 stats
.lower_bound
= value
;
358 stats
.upper_bound
= value
;
360 // Otherwise, simply make sure they reflect the new value.
362 // Check the upper bound first, as for some common uses of value
363 // slots (dates) the values will tend to get larger not smaller
365 int cmp
= value
.compare(stats
.upper_bound
);
367 if (cmp
> 0) stats
.upper_bound
= value
;
368 } else if (value
< stats
.lower_bound
) {
369 stats
.lower_bound
= value
;
373 add_value(did
, slot
, value
);
378 if (!termlist_table
.is_open()) {
383 pack_uint(enc
, last_slot
);
384 BitWriter
slots_used(enc
);
386 slots_used
.encode(first_slot
, last_slot
);
387 slots_used
.encode(count
- 2, last_slot
- first_slot
);
388 slots_used
.encode_interpolative(slotvec
, 0, slotvec
.size() - 1);
391 return slots_used
.freeze();
395 HoneyValueManager::delete_document(Xapian::docid did
,
396 map
<Xapian::valueno
, ValueStats
>& val_stats
)
398 Assert(termlist_table
.is_open());
399 map
<Xapian::docid
, string
>::iterator it
= slots
.find(did
);
401 if (it
!= slots
.end()) {
404 // Get from table, making a swift exit if this document has no terms or
406 if (!termlist_table
.get_exact_entry(termlist_table
.make_key(did
), s
))
408 slots
.insert(make_pair(did
, string()));
411 const char* p
= s
.data();
412 const char* end
= p
+ s
.size();
413 size_t slot_enc_size
;
414 if (!unpack_uint(&p
, end
, &slot_enc_size
)) {
415 throw Xapian::DatabaseCorruptError("Termlist encoding corrupt");
417 if (slot_enc_size
== 0)
420 end
= p
+ slot_enc_size
;
421 Xapian::valueno last_slot
;
422 if (!unpack_uint(&p
, end
, &last_slot
)) {
423 throw Xapian::DatabaseCorruptError("Slots used data corrupt");
427 BitReader
rd(p
, end
);
428 Xapian::valueno first_slot
= rd
.decode(last_slot
);
429 Xapian::valueno slot_count
= rd
.decode(last_slot
- first_slot
) + 2;
430 rd
.decode_interpolative(0, slot_count
- 1, first_slot
, last_slot
);
432 Xapian::valueno slot
= first_slot
;
433 while (slot
!= last_slot
) {
434 auto i
= val_stats
.insert(make_pair(slot
, ValueStats()));
435 ValueStats
& stats
= i
.first
->second
;
437 // There were no statistics stored already, so read them.
438 get_value_stats(slot
, stats
);
441 // Now, modify the stored statistics.
442 AssertRelParanoid(stats
.freq
, >, 0);
443 if (--(stats
.freq
) == 0) {
444 stats
.lower_bound
.resize(0);
445 stats
.upper_bound
.resize(0);
448 remove_value(did
, slot
);
450 slot
= rd
.decode_interpolative_next();
455 Xapian::valueno slot
= last_slot
;
458 // FIXME: share code with above
459 auto i
= val_stats
.insert(make_pair(slot
, ValueStats()));
460 ValueStats
& stats
= i
.first
->second
;
462 // There were no statistics stored already, so read them.
463 get_value_stats(slot
, stats
);
466 // Now, modify the stored statistics.
467 AssertRelParanoid(stats
.freq
, >, 0);
468 if (--(stats
.freq
) == 0) {
469 stats
.lower_bound
.resize(0);
470 stats
.upper_bound
.resize(0);
473 remove_value(did
, slot
);
479 HoneyValueManager::replace_document(Xapian::docid did
,
480 const Xapian::Document
&doc
,
481 map
<Xapian::valueno
, ValueStats
>& val_stats
)
483 if (doc
.get_docid() == did
) {
484 // If we're replacing a document with itself, but the optimisation for
485 // this higher up hasn't kicked in (e.g. because we've added/replaced
486 // a document since this one was read) and the values haven't changed,
487 // then the call to delete_document() below will remove the values
488 // before the subsequent add_document() can read them.
490 // The simplest way to handle this is to force the document to read its
491 // values, which we only need to do this is the docid matches. Note
492 // that this check can give false positives as we don't also check the
493 // database, so for example replacing document 4 in one database with
494 // document 4 from another will unnecessarily trigger this, but forcing
495 // the values to be read is fairly harmless, and this is unlikely to be
496 // a common case. (Currently we will end up fetching the values
497 // anyway, but there's scope to change that in the future).
498 doc
.internal
->ensure_values_fetched();
500 delete_document(did
, val_stats
);
501 return add_document(did
, doc
, val_stats
);
505 HoneyValueManager::get_value(Xapian::docid did
, Xapian::valueno slot
) const
507 map
<Xapian::valueno
, map
<Xapian::docid
, string
> >::const_iterator i
;
508 i
= changes
.find(slot
);
509 if (i
!= changes
.end()) {
510 map
<Xapian::docid
, string
>::const_iterator j
;
511 j
= i
->second
.find(did
);
512 if (j
!= i
->second
.end()) return j
->second
;
515 // Read it from the table.
517 Xapian::docid last_did
;
518 last_did
= get_chunk_containing_did(slot
, did
, chunk
);
519 if (last_did
== 0) return string();
521 ValueChunkReader
reader(chunk
.data(), chunk
.size(), last_did
);
523 if (reader
.at_end() || reader
.get_docid() != did
) return string();
524 return reader
.get_value();
528 HoneyValueManager::get_all_values(map
<Xapian::valueno
, string
> & values
,
529 Xapian::docid did
) const
531 Assert(values
.empty());
532 if (!termlist_table
.is_open()) {
533 // Either the database has been closed, or else there's no termlist
534 // table. Check if the postlist table is open to determine which is
536 if (!postlist_table
.is_open())
537 HoneyTable::throw_database_closed();
538 throw Xapian::FeatureUnavailableError("Database has no termlist");
542 if (!termlist_table
.get_exact_entry(termlist_table
.make_key(did
), s
))
545 const char* p
= s
.data();
546 const char* end
= p
+ s
.size();
547 size_t slot_enc_size
= *p
++;
549 if ((slot_enc_size
& 0x80) == 0) {
550 // If the top bit is clear we have a 7-bit bitmap of slots used.
551 Xapian::valueno slot
= 0;
552 while (slot_enc_size
) {
553 if (slot_enc_size
& 1) {
554 values
.insert(make_pair(slot
, get_value(did
, slot
)));
562 slot_enc_size
&= 0x7f;
563 if (slot_enc_size
== 0) {
564 if (!unpack_uint(&p
, end
, &slot_enc_size
)) {
565 throw Xapian::DatabaseCorruptError("Termlist encoding corrupt");
569 end
= p
+ slot_enc_size
;
570 Xapian::valueno last_slot
;
571 if (!unpack_uint(&p
, end
, &last_slot
)) {
572 throw Xapian::DatabaseCorruptError("Slots used data corrupt");
576 BitReader
rd(p
, end
);
577 Xapian::valueno first_slot
= rd
.decode(last_slot
);
578 Xapian::valueno slot_count
= rd
.decode(last_slot
- first_slot
) + 2;
579 rd
.decode_interpolative(0, slot_count
- 1, first_slot
, last_slot
);
581 Xapian::valueno slot
= first_slot
;
582 while (slot
!= last_slot
) {
583 values
.insert(make_pair(slot
, get_value(did
, slot
)));
584 slot
= rd
.decode_interpolative_next();
587 values
.insert(make_pair(last_slot
, get_value(did
, last_slot
)));
591 HoneyValueManager::get_value_stats(Xapian::valueno slot
) const
593 LOGCALL_VOID(DB
, "HoneyValueManager::get_value_stats", slot
);
594 // Invalidate the cache first in case an exception is thrown.
595 mru_slot
= Xapian::BAD_VALUENO
;
596 get_value_stats(slot
, mru_valstats
);
601 HoneyValueManager::get_value_stats(Xapian::valueno slot
,
602 ValueStats
& stats
) const
604 LOGCALL_VOID(DB
, "HoneyValueManager::get_value_stats", slot
| Literal("[stats]"));
607 if (postlist_table
.get_exact_entry(Honey::make_valuestats_key(slot
), tag
)) {
608 const char * pos
= tag
.data();
609 const char * end
= pos
+ tag
.size();
611 if (!unpack_uint(&pos
, end
, &(stats
.freq
))) {
613 throw Xapian::DatabaseCorruptError("Incomplete stats item in "
616 throw Xapian::RangeError("Frequency statistic in value table is "
619 if (!unpack_string(&pos
, end
, stats
.lower_bound
)) {
621 throw Xapian::DatabaseCorruptError("Incomplete stats item in "
624 throw Xapian::RangeError("Lower bound in value table is too "
627 size_t len
= end
- pos
;
629 stats
.upper_bound
= stats
.lower_bound
;
631 stats
.upper_bound
.assign(pos
, len
);
639 HoneyValueManager::set_value_stats(map
<Xapian::valueno
, ValueStats
>& val_stats
)
641 LOGCALL_VOID(DB
, "HoneyValueManager::set_value_stats", val_stats
);
642 map
<Xapian::valueno
, ValueStats
>::const_iterator i
;
643 for (i
= val_stats
.begin(); i
!= val_stats
.end(); ++i
) {
644 string key
= Honey::make_valuestats_key(i
->first
);
645 const ValueStats
& stats
= i
->second
;
646 if (stats
.freq
!= 0) {
648 pack_uint(new_value
, stats
.freq
);
649 pack_string(new_value
, stats
.lower_bound
);
650 // We don't store or count empty values, so neither of the bounds
651 // can be empty. So we can safely store an empty upper bound when
652 // the bounds are equal.
653 if (stats
.lower_bound
!= stats
.upper_bound
)
654 new_value
+= stats
.upper_bound
;
655 postlist_table
.add(key
, new_value
);
657 postlist_table
.del(key
);
661 mru_slot
= Xapian::BAD_VALUENO
;