Fix failing assertion in uniq_terms bounds code
[xapian.git] / xapian-core / backends / honey / honey_version.cc
blob133badf2f99fe77f5a7aa0863d837305822253dd
1 /** @file honey_version.cc
2 * @brief HoneyVersion class
3 */
4 /* Copyright (C) 2006,2007,2008,2009,2010,2013,2014,2015,2016,2017,2018 Olly Betts
5 * Copyright (C) 2011 Dan Colish
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "honey_version.h"
26 #include "debuglog.h"
27 #include "fd.h"
28 #include "honey_defs.h"
29 #include "io_utils.h"
30 #include "min_non_zero.h"
31 #include "omassert.h"
32 #include "pack.h"
33 #include "posixy_wrapper.h"
34 #include "stringutils.h" // For STRINGIZE() and CONST_STRLEN().
36 #include <cerrno>
37 #include <cstring> // For memcmp().
38 #include <string>
39 #include <sys/types.h>
40 #include "safesysstat.h"
41 #include "safefcntl.h"
42 #include "safeunistd.h"
43 #include "str.h"
44 #include "stringutils.h"
46 #include "backends/uuids.h"
48 #include "xapian/constants.h"
49 #include "xapian/error.h"
51 using namespace std;
53 /// Honey format version (date of change):
54 #define HONEY_FORMAT_VERSION DATE_TO_VERSION(2018,4,3)
55 // 2018,4,3 1.5.0 outlaw mixed-wdf terms
56 // 2018,3,28 don't special case first entry in SSTable
57 // 2018,3,27 new key format for value stats, value chunks, doclen chunks
58 // 2018,3,26 use known suffix from spelling B and T keys
59 // 2018,3,25 use known prefix from spelling B and H keys
60 // 2018,3,15 avoid storing flat wdf
61 // 2018,3,14 store per term wdf_max
62 // 2018,3,12 binary chop index
63 // 2018,3,11 spelling key encoding changed
64 // 2018,2,22 index valuestream chunks by last docid in chunk
65 // 2018,2,21 index doclen chunks by last docid in chunk
66 // 2018,2,20 implement array index
67 // 2018,2,19 allow 1,2,3 as well as 4 byte doc length width
68 // 2018,2,2 special case tf=2; first_wdf = floor(collfreq/2)
69 // 2018,2,1 pack_uint for postlist data
70 // 2018,1,31 Special case postlist when termfreq==2
71 // 2018,1,30 More compact postlist chunk headers
72 // 2018,1,23 Elide last-first for single occurrence terms
73 // 2018,1,4 Merge values used and terms used
74 // 2018,1,3 Table start offset in RootInfo
75 // 2017,12,30 Value stats key changes
76 // 2017,12,29 User metadata key changes
77 // 2017,12,5 New Honey backend
79 /// Convert date <-> version number. Dates up to 2141-12-31 fit in 2 bytes.
80 #define DATE_TO_VERSION(Y,M,D) \
81 ((unsigned(Y) - 2014) << 9 | unsigned(M) << 5 | unsigned(D))
82 #define VERSION_TO_YEAR(V) ((unsigned(V) >> 9) + 2014)
83 #define VERSION_TO_MONTH(V) ((unsigned(V) >> 5) & 0x0f)
84 #define VERSION_TO_DAY(V) (unsigned(V) & 0x1f)
86 #define HONEY_VERSION_MAGIC_LEN 14
87 #define HONEY_VERSION_MAGIC_AND_VERSION_LEN 16
89 static const char HONEY_VERSION_MAGIC[HONEY_VERSION_MAGIC_AND_VERSION_LEN] = {
90 '\x0f', '\x0d', 'X', 'a', 'p', 'i', 'a', 'n', ' ', 'H', 'o', 'n', 'e', 'y',
91 char((HONEY_FORMAT_VERSION >> 8) & 0xff), char(HONEY_FORMAT_VERSION & 0xff)
94 HoneyVersion::HoneyVersion(int fd_)
95 : rev(0), fd(fd_), offset(0), db_dir(),
96 doccount(0), total_doclen(0), last_docid(0),
97 doclen_lbound(0), doclen_ubound(0),
98 wdf_ubound(0), spelling_wordfreq_ubound(0),
99 oldest_changeset(0),
100 uniq_terms_lbound(0), uniq_terms_ubound(0)
102 offset = lseek(fd, 0, SEEK_CUR);
103 if (rare(offset == off_t(-1))) {
104 string msg = "lseek failed on file descriptor ";
105 msg += str(fd);
106 throw Xapian::DatabaseOpeningError(msg, errno);
110 HoneyVersion::~HoneyVersion()
112 // Either this is a single-file database, or this fd is from opening a new
113 // version file in write(), but sync() was never called.
114 if (fd != -1)
115 (void)::close(fd);
118 void
119 HoneyVersion::read()
121 LOGCALL_VOID(DB, "HoneyVersion::read", NO_ARGS);
122 FD close_fd(-1);
123 int fd_in;
124 if (single_file()) {
125 if (rare(lseek(fd, offset, SEEK_SET) == off_t(-1))) {
126 string msg = "Failed to rewind file descriptor ";
127 msg += str(fd);
128 throw Xapian::DatabaseOpeningError(msg, errno);
130 fd_in = fd;
131 } else {
132 string filename = db_dir;
133 filename += "/iamhoney";
134 fd_in = posixy_open(filename.c_str(), O_RDONLY|O_BINARY);
135 if (rare(fd_in < 0)) {
136 string msg = filename;
137 msg += ": Failed to open honey revision file for reading";
138 throw Xapian::DatabaseOpeningError(msg, errno);
140 close_fd = fd_in;
143 char buf[256];
145 const char * p = buf;
146 const char * end = p + io_read(fd_in, buf, sizeof(buf), 33);
148 if (memcmp(buf, HONEY_VERSION_MAGIC, HONEY_VERSION_MAGIC_LEN) != 0)
149 throw Xapian::DatabaseCorruptError("Rev file magic incorrect");
151 unsigned version;
152 version = static_cast<unsigned char>(buf[HONEY_VERSION_MAGIC_LEN]);
153 version <<= 8;
154 version |= static_cast<unsigned char>(buf[HONEY_VERSION_MAGIC_LEN + 1]);
155 if (version != HONEY_FORMAT_VERSION) {
156 string msg;
157 if (!single_file()) {
158 msg = db_dir;
159 msg += ": ";
161 msg += "Database is format version ";
162 msg += str(VERSION_TO_YEAR(version) * 10000 +
163 VERSION_TO_MONTH(version) * 100 +
164 VERSION_TO_DAY(version));
165 msg += " but I only understand ";
166 msg += str(VERSION_TO_YEAR(HONEY_FORMAT_VERSION) * 10000 +
167 VERSION_TO_MONTH(HONEY_FORMAT_VERSION) * 100 +
168 VERSION_TO_DAY(HONEY_FORMAT_VERSION));
169 throw Xapian::DatabaseVersionError(msg);
172 p += HONEY_VERSION_MAGIC_AND_VERSION_LEN;
173 uuid.assign(p);
174 p += uuid.BINARY_SIZE;
176 if (!unpack_uint(&p, end, &rev)) {
177 throw Xapian::DatabaseCorruptError("Rev file failed to decode "
178 "revision");
181 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
182 if (!root[table_no].unserialise(&p, end)) {
183 throw Xapian::DatabaseCorruptError("Rev file root_info missing");
185 old_root[table_no] = root[table_no];
188 // For a single-file database, this will assign extra data. We read
189 // sizeof(buf) above, then skip HONEY_VERSION_MAGIC_AND_VERSION_LEN,
190 // then 16, then the size of the serialised root info.
191 serialised_stats.assign(p, end);
192 unserialise_stats();
195 void
196 HoneyVersion::serialise_stats()
198 serialised_stats.resize(0);
199 pack_uint(serialised_stats, doccount);
200 // last_docid must always be >= doccount.
201 pack_uint(serialised_stats, last_docid - doccount);
202 pack_uint(serialised_stats, doclen_lbound);
203 pack_uint(serialised_stats, wdf_ubound);
204 // doclen_ubound should always be >= wdf_ubound, so we store the
205 // difference as it may encode smaller. wdf_ubound is likely to
206 // be larger than doclen_lbound.
207 pack_uint(serialised_stats, doclen_ubound - wdf_ubound);
208 pack_uint(serialised_stats, oldest_changeset);
209 pack_uint(serialised_stats, total_doclen);
210 pack_uint(serialised_stats, spelling_wordfreq_ubound);
211 // If total_doclen == 0 then unique_terms is always zero (or there are no
212 // documents at all) so storing these just complicates things because
213 // uniq_terms_lbound could legitimately be zero.
214 if (total_doclen != 0) {
215 // We rely on uniq_terms_lbound being non-zero to detect if it's present
216 // for a single file DB.
217 Assert(uniq_terms_lbound != 0);
218 pack_uint(serialised_stats, uniq_terms_lbound);
219 pack_uint(serialised_stats, uniq_terms_ubound);
223 void
224 HoneyVersion::unserialise_stats()
226 const char * p = serialised_stats.data();
227 const char * end = p + serialised_stats.size();
228 if (p == end) {
229 doccount = 0;
230 total_doclen = 0;
231 last_docid = 0;
232 doclen_lbound = 0;
233 doclen_ubound = 0;
234 wdf_ubound = 0;
235 oldest_changeset = 0;
236 spelling_wordfreq_ubound = 0;
237 uniq_terms_lbound = 0;
238 uniq_terms_ubound = 0;
239 return;
242 if (!unpack_uint(&p, end, &doccount) ||
243 !unpack_uint(&p, end, &last_docid) ||
244 !unpack_uint(&p, end, &doclen_lbound) ||
245 !unpack_uint(&p, end, &wdf_ubound) ||
246 !unpack_uint(&p, end, &doclen_ubound) ||
247 !unpack_uint(&p, end, &oldest_changeset) ||
248 !unpack_uint(&p, end, &total_doclen) ||
249 !unpack_uint(&p, end, &spelling_wordfreq_ubound)) {
250 const char * m = p ?
251 "Bad serialised DB stats (overflowed)" :
252 "Bad serialised DB stats (out of data)";
253 throw Xapian::DatabaseCorruptError(m);
256 // last_docid must always be >= doccount.
257 last_docid += doccount;
258 // doclen_ubound should always be >= wdf_ubound, so we store the
259 // difference as it may encode smaller. wdf_ubound is likely to
260 // be larger than doclen_lbound.
261 doclen_ubound += wdf_ubound;
263 // We don't check if there's undecoded data between p and end - in the
264 // single-file DB case there will be extra zero bytes in serialised_stats,
265 // and more generally it's useful to be able to add new stats when it is
266 // safe for old versions to just ignore them and there are sensible values
267 // to use when a new version reads an old database.
269 // Read bounds on unique_terms if stored. This test relies on the first
270 // byte of pack_uint(x) being zero if and only if x is zero, and on
271 // uniq_terms_lbound being non-zero.
272 if (p == end || *p == '\0') {
273 // No bounds stored so use weak bounds based on other stats.
274 uniq_terms_lbound = 1;
275 uniq_terms_ubound = doclen_ubound;
276 } else if (!unpack_uint(&p, end, &uniq_terms_lbound) ||
277 !unpack_uint(&p, end, &uniq_terms_ubound)) {
278 const char * m = p ?
279 "Bad serialised unique_terms bounds (overflowed)" :
280 "Bad serialised unique_terms bounds (out of data)";
281 throw Xapian::DatabaseCorruptError(m);
285 void
286 HoneyVersion::merge_stats(const HoneyVersion & o)
288 doccount += o.get_doccount();
289 if (doccount < o.get_doccount()) {
290 throw Xapian::DatabaseError("doccount overflowed!");
293 doclen_lbound = min_non_zero(doclen_lbound, o.get_doclength_lower_bound());
294 doclen_ubound = max(doclen_ubound, o.get_doclength_upper_bound());
295 wdf_ubound = max(wdf_ubound, o.get_wdf_upper_bound());
296 total_doclen += o.get_total_doclen();
297 if (total_doclen < o.get_total_doclen()) {
298 throw Xapian::DatabaseError("Total document length overflowed!");
301 // The upper bounds might be on the same word, so we must sum them.
302 spelling_wordfreq_ubound += o.get_spelling_wordfreq_upper_bound();
304 uniq_terms_lbound = min_non_zero(uniq_terms_lbound,
305 o.get_unique_terms_lower_bound());
306 uniq_terms_ubound = max(uniq_terms_ubound,
307 o.get_unique_terms_upper_bound());
310 void
311 HoneyVersion::merge_stats(Xapian::doccount o_doccount,
312 Xapian::termcount o_doclen_lbound,
313 Xapian::termcount o_doclen_ubound,
314 Xapian::termcount o_wdf_ubound,
315 Xapian::totallength o_total_doclen,
316 Xapian::termcount o_spelling_wordfreq_ubound,
317 Xapian::termcount o_uniq_terms_lbound,
318 Xapian::termcount o_uniq_terms_ubound)
320 doccount += o_doccount;
321 if (doccount < o_doccount) {
322 throw Xapian::DatabaseError("doccount overflowed!");
325 doclen_lbound = min_non_zero(doclen_lbound, o_doclen_lbound);
326 doclen_ubound = max(doclen_ubound, o_doclen_ubound);
327 wdf_ubound = max(wdf_ubound, o_wdf_ubound);
328 total_doclen += o_total_doclen;
329 if (total_doclen < o_total_doclen) {
330 throw Xapian::DatabaseError("Total document length overflowed!");
333 // The upper bounds might be on the same word, so we must sum them.
334 spelling_wordfreq_ubound += o_spelling_wordfreq_ubound;
336 uniq_terms_lbound = min_non_zero(uniq_terms_lbound, o_uniq_terms_lbound);
337 uniq_terms_ubound = max(uniq_terms_ubound, o_uniq_terms_ubound);
340 void
341 HoneyVersion::cancel()
343 LOGCALL_VOID(DB, "HoneyVersion::cancel", NO_ARGS);
344 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
345 root[table_no] = old_root[table_no];
347 unserialise_stats();
350 const string
351 HoneyVersion::write(honey_revision_number_t new_rev, int flags)
353 LOGCALL(DB, const string, "HoneyVersion::write", new_rev|flags);
355 string s(HONEY_VERSION_MAGIC, HONEY_VERSION_MAGIC_AND_VERSION_LEN);
356 s.append(uuid.data(), uuid.BINARY_SIZE);
358 pack_uint(s, new_rev);
360 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
361 root[table_no].serialise(s);
364 // Serialise database statistics.
365 serialise_stats();
366 s += serialised_stats;
368 string tmpfile;
369 if (!single_file()) {
370 tmpfile = db_dir;
371 // In dangerous mode, just write the new version file in place.
372 if (flags & Xapian::DB_DANGEROUS)
373 tmpfile += "/iamhoney";
374 else
375 tmpfile += "/v.tmp";
377 int open_flags = O_CREAT|O_TRUNC|O_WRONLY|O_BINARY;
378 fd = posixy_open(tmpfile.c_str(), open_flags, 0666);
379 if (rare(fd < 0)) {
380 string msg = "Couldn't write new rev file: ";
381 msg += tmpfile;
382 throw Xapian::DatabaseOpeningError(msg, errno);
385 if (flags & Xapian::DB_DANGEROUS)
386 tmpfile = string();
389 try {
390 io_write(fd, s.data(), s.size());
391 } catch (...) {
392 if (!single_file())
393 (void)close(fd);
394 throw;
397 RETURN(tmpfile);
400 bool
401 HoneyVersion::sync(const string & tmpfile,
402 honey_revision_number_t new_rev, int flags)
404 Assert(new_rev > rev || rev == 0);
406 if (single_file()) {
407 if ((flags & Xapian::DB_NO_SYNC) == 0 &&
408 ((flags & Xapian::DB_FULL_SYNC) ?
409 !io_full_sync(fd) :
410 !io_sync(fd))) {
411 // FIXME what to do?
413 } else {
414 int fd_to_close = fd;
415 fd = -1;
416 if ((flags & Xapian::DB_NO_SYNC) == 0 &&
417 ((flags & Xapian::DB_FULL_SYNC) ?
418 !io_full_sync(fd_to_close) :
419 !io_sync(fd_to_close))) {
420 int save_errno = errno;
421 (void)close(fd_to_close);
422 if (!tmpfile.empty())
423 (void)unlink(tmpfile.c_str());
424 errno = save_errno;
425 return false;
428 if (close(fd_to_close) != 0) {
429 if (!tmpfile.empty()) {
430 int save_errno = errno;
431 (void)unlink(tmpfile.c_str());
432 errno = save_errno;
434 return false;
437 if (!tmpfile.empty()) {
438 if (!io_tmp_rename(tmpfile, db_dir + "/iamhoney")) {
439 return false;
444 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
445 old_root[table_no] = root[table_no];
448 rev = new_rev;
449 return true;
452 // Only try to compress tags longer than this many bytes.
453 const size_t COMPRESS_MIN = 4;
455 static const uint4 compress_min_tab[] = {
456 0, // POSTLIST
457 COMPRESS_MIN, // DOCDATA
458 COMPRESS_MIN, // TERMLIST
459 0, // POSITION
460 COMPRESS_MIN, // SPELLING
461 COMPRESS_MIN // SYNONYM
464 void
465 HoneyVersion::create()
467 uuid.generate();
468 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
469 root[table_no].init(compress_min_tab[table_no]);
473 namespace Honey {
475 void
476 RootInfo::init(uint4 compress_min_)
478 offset = 0;
479 root = 0;
480 num_entries = 0;
481 compress_min = compress_min_;
482 fl_serialised.resize(0);
485 void
486 RootInfo::serialise(string &s) const
488 AssertRel(offset, >=, 0);
489 std::make_unsigned<off_t>::type uoffset = offset;
490 AssertRel(root, >=, offset);
491 pack_uint(s, uoffset);
492 pack_uint(s, root - uoffset);
493 pack_uint(s, 0u);
494 pack_uint(s, num_entries);
495 pack_uint(s, 2048u >> 11);
496 pack_uint(s, compress_min);
497 pack_string(s, fl_serialised);
500 bool
501 RootInfo::unserialise(const char ** p, const char * end)
503 std::make_unsigned<off_t>::type uoffset, uroot;
504 unsigned dummy_val;
505 unsigned dummy_blocksize;
506 if (!unpack_uint(p, end, &uoffset) ||
507 !unpack_uint(p, end, &uroot) ||
508 !unpack_uint(p, end, &dummy_val) ||
509 !unpack_uint(p, end, &num_entries) ||
510 !unpack_uint(p, end, &dummy_blocksize) ||
511 !unpack_uint(p, end, &compress_min) ||
512 !unpack_string(p, end, fl_serialised)) return false;
513 offset = uoffset;
514 root = uoffset + uroot;
515 // Not meaningful, but still there so that existing honey databases
516 // continue to work.
517 (void)dummy_val;
518 (void)dummy_blocksize;
519 return true;