[honey] Don't allow mixed wdf for a term
[xapian.git] / xapian-core / backends / honey / honey_version.cc
blobf9c4a8f1633b66d2faf637e6c007efb053aafed4
/** @file honey_version.cc
 * @brief HoneyVersion class
 */
/* Copyright (C) 2006,2007,2008,2009,2010,2013,2014,2015,2016,2017,2018 Olly Betts
 * Copyright (C) 2011 Dan Colish
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 #include <config.h>
24 #include "honey_version.h"
26 #include "debuglog.h"
27 #include "fd.h"
28 #include "honey_defs.h"
29 #include "io_utils.h"
30 #include "omassert.h"
31 #include "pack.h"
32 #include "posixy_wrapper.h"
33 #include "stringutils.h" // For STRINGIZE() and CONST_STRLEN().
35 #include <cstring> // For memcmp().
36 #include <string>
37 #include "safeerrno.h"
38 #include <sys/types.h>
39 #include "safesysstat.h"
40 #include "safefcntl.h"
41 #include "safeunistd.h"
42 #include "str.h"
43 #include "stringutils.h"
45 #include "common/safeuuid.h"
47 #include "xapian/constants.h"
48 #include "xapian/error.h"
50 using namespace std;
/// Convert date <-> version number. Dates up to 2141-12-31 fit in 2 bytes.
///
/// Layout of the packed value: (year - 2014) from bit 9 upwards, the month
/// in bits 5-8 and the day of the month in bits 0-4.
#define DATE_TO_VERSION(Y,M,D) \
    (((unsigned(Y) - 2014) << 9) | (unsigned(M) << 5) | unsigned(D))
#define VERSION_TO_YEAR(V) (2014 + (unsigned(V) >> 9))
#define VERSION_TO_MONTH(V) ((unsigned(V) >> 5) & 0x0f)
#define VERSION_TO_DAY(V) (unsigned(V) & 0x1f)

/// Honey format version (date of change):
#define HONEY_FORMAT_VERSION DATE_TO_VERSION(2018,4,3)
// 2018,4,3 1.5.0 outlaw mixed-wdf terms
// 2018,3,28 don't special case first entry in SSTable
// 2018,3,27 new key format for value stats, value chunks, doclen chunks
// 2018,3,26 use known suffix from spelling B and T keys
// 2018,3,25 use known prefix from spelling B and H keys
// 2018,3,15 avoid storing flat wdf
// 2018,3,14 store per term wdf_max
// 2018,3,12 binary chop index
// 2018,3,11 spelling key encoding changed
// 2018,2,22 index valuestream chunks by last docid in chunk
// 2018,2,21 index doclen chunks by last docid in chunk
// 2018,2,20 implement array index
// 2018,2,19 allow 1,2,3 as well as 4 byte doc length width
// 2018,2,2 special case tf=2; first_wdf = floor(collfreq/2)
// 2018,2,1 pack_uint for postlist data
// 2018,1,31 Special case postlist when termfreq==2
// 2018,1,30 More compact postlist chunk headers
// 2018,1,23 Elide last-first for single occurrence terms
// 2018,1,4 Merge values used and terms used
// 2018,1,3 Table start offset in RootInfo
// 2017,12,30 Value stats key changes
// 2017,12,29 User metadata key changes
// 2017,12,5 New Honey backend
85 #define HONEY_VERSION_MAGIC_LEN 14
86 #define HONEY_VERSION_MAGIC_AND_VERSION_LEN 16
88 static const char HONEY_VERSION_MAGIC[HONEY_VERSION_MAGIC_AND_VERSION_LEN] = {
89 '\x0f', '\x0d', 'X', 'a', 'p', 'i', 'a', 'n', ' ', 'H', 'o', 'n', 'e', 'y',
90 char((HONEY_FORMAT_VERSION >> 8) & 0xff), char(HONEY_FORMAT_VERSION & 0xff)
93 HoneyVersion::HoneyVersion(int fd_)
94 : rev(0), fd(fd_), offset(0), db_dir(), changes(NULL),
95 doccount(0), total_doclen(0), last_docid(0),
96 doclen_lbound(0), doclen_ubound(0),
97 wdf_ubound(0), spelling_wordfreq_ubound(0),
98 oldest_changeset(0)
100 offset = lseek(fd, 0, SEEK_CUR);
101 if (rare(offset == off_t(-1))) {
102 string msg = "lseek failed on file descriptor ";
103 msg += str(fd);
104 throw Xapian::DatabaseOpeningError(msg, errno);
108 HoneyVersion::~HoneyVersion()
110 // Either this is a single-file database, or this fd is from opening a new
111 // version file in write(), but sync() was never called.
112 if (fd != -1)
113 (void)::close(fd);
116 void
117 HoneyVersion::read()
119 LOGCALL_VOID(DB, "HoneyVersion::read", NO_ARGS);
120 FD close_fd(-1);
121 int fd_in;
122 if (single_file()) {
123 if (rare(lseek(fd, offset, SEEK_SET) == off_t(-1))) {
124 string msg = "Failed to rewind file descriptor ";
125 msg += str(fd);
126 throw Xapian::DatabaseOpeningError(msg, errno);
128 fd_in = fd;
129 } else {
130 string filename = db_dir;
131 filename += "/iamhoney";
132 fd_in = posixy_open(filename.c_str(), O_RDONLY|O_BINARY);
133 if (rare(fd_in < 0)) {
134 string msg = filename;
135 msg += ": Failed to open honey revision file for reading";
136 throw Xapian::DatabaseOpeningError(msg, errno);
138 close_fd = fd_in;
141 char buf[256];
143 const char * p = buf;
144 const char * end = p + io_read(fd_in, buf, sizeof(buf), 33);
146 if (memcmp(buf, HONEY_VERSION_MAGIC, HONEY_VERSION_MAGIC_LEN) != 0)
147 throw Xapian::DatabaseCorruptError("Rev file magic incorrect");
149 unsigned version;
150 version = static_cast<unsigned char>(buf[HONEY_VERSION_MAGIC_LEN]);
151 version <<= 8;
152 version |= static_cast<unsigned char>(buf[HONEY_VERSION_MAGIC_LEN + 1]);
153 if (version != HONEY_FORMAT_VERSION) {
154 string msg;
155 if (!single_file()) {
156 msg = db_dir;
157 msg += ": ";
159 msg += "Database is format version ";
160 msg += str(VERSION_TO_YEAR(version) * 10000 +
161 VERSION_TO_MONTH(version) * 100 +
162 VERSION_TO_DAY(version));
163 msg += " but I only understand ";
164 msg += str(VERSION_TO_YEAR(HONEY_FORMAT_VERSION) * 10000 +
165 VERSION_TO_MONTH(HONEY_FORMAT_VERSION) * 100 +
166 VERSION_TO_DAY(HONEY_FORMAT_VERSION));
167 throw Xapian::DatabaseVersionError(msg);
170 p += HONEY_VERSION_MAGIC_AND_VERSION_LEN;
171 memcpy(uuid, p, 16);
172 p += 16;
174 if (!unpack_uint(&p, end, &rev)) {
175 throw Xapian::DatabaseCorruptError("Rev file failed to decode "
176 "revision");
179 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
180 if (!root[table_no].unserialise(&p, end)) {
181 throw Xapian::DatabaseCorruptError("Rev file root_info missing");
183 old_root[table_no] = root[table_no];
186 // For a single-file database, this will assign extra data. We read
187 // sizeof(buf) above, then skip HONEY_VERSION_MAGIC_AND_VERSION_LEN,
188 // then 16, then the size of the serialised root info.
189 serialised_stats.assign(p, end);
190 unserialise_stats();
193 void
194 HoneyVersion::serialise_stats()
196 serialised_stats.resize(0);
197 pack_uint(serialised_stats, doccount);
198 // last_docid must always be >= doccount.
199 pack_uint(serialised_stats, last_docid - doccount);
200 pack_uint(serialised_stats, doclen_lbound);
201 pack_uint(serialised_stats, wdf_ubound);
202 // doclen_ubound should always be >= wdf_ubound, so we store the
203 // difference as it may encode smaller. wdf_ubound is likely to
204 // be larger than doclen_lbound.
205 pack_uint(serialised_stats, doclen_ubound - wdf_ubound);
206 pack_uint(serialised_stats, oldest_changeset);
207 pack_uint(serialised_stats, total_doclen);
208 pack_uint(serialised_stats, spelling_wordfreq_ubound);
211 void
212 HoneyVersion::unserialise_stats()
214 const char * p = serialised_stats.data();
215 const char * end = p + serialised_stats.size();
216 if (p == end) {
217 doccount = 0;
218 total_doclen = 0;
219 last_docid = 0;
220 doclen_lbound = 0;
221 doclen_ubound = 0;
222 wdf_ubound = 0;
223 oldest_changeset = 0;
224 spelling_wordfreq_ubound = 0;
225 return;
228 if (!unpack_uint(&p, end, &doccount) ||
229 !unpack_uint(&p, end, &last_docid) ||
230 !unpack_uint(&p, end, &doclen_lbound) ||
231 !unpack_uint(&p, end, &wdf_ubound) ||
232 !unpack_uint(&p, end, &doclen_ubound) ||
233 !unpack_uint(&p, end, &oldest_changeset) ||
234 !unpack_uint(&p, end, &total_doclen) ||
235 !unpack_uint(&p, end, &spelling_wordfreq_ubound)) {
236 const char * m = p ?
237 "Bad serialised DB stats (overflowed)" :
238 "Bad serialised DB stats (out of data)";
239 throw Xapian::DatabaseCorruptError(m);
242 // In the single-file DB case, there will be extra data in
243 // serialised_stats, so suppress this check.
244 if (p != end && !single_file())
245 throw Xapian::DatabaseCorruptError("Rev file has junk at end");
247 // last_docid must always be >= doccount.
248 last_docid += doccount;
249 // doclen_ubound should always be >= wdf_ubound, so we store the
250 // difference as it may encode smaller. wdf_ubound is likely to
251 // be larger than doclen_lbound.
252 doclen_ubound += wdf_ubound;
255 void
256 HoneyVersion::merge_stats(const HoneyVersion & o)
258 doccount += o.get_doccount();
259 if (doccount < o.get_doccount()) {
260 throw Xapian::DatabaseError("doccount overflowed!");
263 Xapian::termcount o_doclen_lbound = o.get_doclength_lower_bound();
264 if (o_doclen_lbound > 0) {
265 if (doclen_lbound == 0 || o_doclen_lbound < doclen_lbound)
266 doclen_lbound = o_doclen_lbound;
269 doclen_ubound = max(doclen_ubound, o.get_doclength_upper_bound());
270 wdf_ubound = max(wdf_ubound, o.get_wdf_upper_bound());
271 total_doclen += o.get_total_doclen();
272 if (total_doclen < o.get_total_doclen()) {
273 throw Xapian::DatabaseError("Total document length overflowed!");
276 // The upper bounds might be on the same word, so we must sum them.
277 spelling_wordfreq_ubound += o.get_spelling_wordfreq_upper_bound();
280 void
281 HoneyVersion::merge_stats(Xapian::doccount o_doccount,
282 Xapian::termcount o_doclen_lbound,
283 Xapian::termcount o_doclen_ubound,
284 Xapian::termcount o_wdf_ubound,
285 Xapian::totallength o_total_doclen,
286 Xapian::termcount o_spelling_wordfreq_ubound)
288 doccount += o_doccount;
289 if (doccount < o_doccount) {
290 throw Xapian::DatabaseError("doccount overflowed!");
293 if (o_doclen_lbound > 0) {
294 if (doclen_lbound == 0 || o_doclen_lbound < doclen_lbound)
295 doclen_lbound = o_doclen_lbound;
298 doclen_ubound = max(doclen_ubound, o_doclen_ubound);
299 wdf_ubound = max(wdf_ubound, o_wdf_ubound);
300 total_doclen += o_total_doclen;
301 if (total_doclen < o_total_doclen) {
302 throw Xapian::DatabaseError("Total document length overflowed!");
305 // The upper bounds might be on the same word, so we must sum them.
306 spelling_wordfreq_ubound += o_spelling_wordfreq_ubound;
309 void
310 HoneyVersion::cancel()
312 LOGCALL_VOID(DB, "HoneyVersion::cancel", NO_ARGS);
313 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
314 root[table_no] = old_root[table_no];
316 unserialise_stats();
319 const string
320 HoneyVersion::write(honey_revision_number_t new_rev, int flags)
322 LOGCALL(DB, const string, "HoneyVersion::write", new_rev|flags);
324 string s(HONEY_VERSION_MAGIC, HONEY_VERSION_MAGIC_AND_VERSION_LEN);
325 s.append(reinterpret_cast<const char *>(uuid), 16);
327 pack_uint(s, new_rev);
329 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
330 root[table_no].serialise(s);
333 // Serialise database statistics.
334 serialise_stats();
335 s += serialised_stats;
337 string tmpfile;
338 if (!single_file()) {
339 tmpfile = db_dir;
340 // In dangerous mode, just write the new version file in place.
341 if (flags & Xapian::DB_DANGEROUS)
342 tmpfile += "/iamhoney";
343 else
344 tmpfile += "/v.tmp";
346 int open_flags = O_CREAT|O_TRUNC|O_WRONLY|O_BINARY;
347 fd = posixy_open(tmpfile.c_str(), open_flags, 0666);
348 if (rare(fd < 0)) {
349 string msg = "Couldn't write new rev file: ";
350 msg += tmpfile;
351 throw Xapian::DatabaseOpeningError(msg, errno);
354 if (flags & Xapian::DB_DANGEROUS)
355 tmpfile = string();
358 try {
359 io_write(fd, s.data(), s.size());
360 } catch (...) {
361 if (!single_file())
362 (void)close(fd);
363 throw;
366 if (changes) {
367 string changes_buf;
368 changes_buf += '\xfe';
369 pack_uint(changes_buf, new_rev);
370 pack_uint(changes_buf, s.size());
371 changes->write_block(changes_buf);
372 changes->write_block(s);
375 RETURN(tmpfile);
378 bool
379 HoneyVersion::sync(const string & tmpfile,
380 honey_revision_number_t new_rev, int flags)
382 Assert(new_rev > rev || rev == 0);
384 if (single_file()) {
385 if ((flags & Xapian::DB_NO_SYNC) == 0 &&
386 ((flags & Xapian::DB_FULL_SYNC) ?
387 !io_full_sync(fd) :
388 !io_sync(fd))) {
389 // FIXME what to do?
391 } else {
392 int fd_to_close = fd;
393 fd = -1;
394 if ((flags & Xapian::DB_NO_SYNC) == 0 &&
395 ((flags & Xapian::DB_FULL_SYNC) ?
396 !io_full_sync(fd_to_close) :
397 !io_sync(fd_to_close))) {
398 int save_errno = errno;
399 (void)close(fd_to_close);
400 if (!tmpfile.empty())
401 (void)unlink(tmpfile.c_str());
402 errno = save_errno;
403 return false;
406 if (close(fd_to_close) != 0) {
407 if (!tmpfile.empty()) {
408 int save_errno = errno;
409 (void)unlink(tmpfile.c_str());
410 errno = save_errno;
412 return false;
415 if (!tmpfile.empty()) {
416 if (!io_tmp_rename(tmpfile, db_dir + "/iamhoney")) {
417 return false;
422 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
423 old_root[table_no] = root[table_no];
426 rev = new_rev;
427 return true;
430 // Only try to compress tags longer than this many bytes.
431 const size_t COMPRESS_MIN = 4;
433 static const uint4 compress_min_tab[] = {
434 0, // POSTLIST
435 COMPRESS_MIN, // DOCDATA
436 COMPRESS_MIN, // TERMLIST
437 0, // POSITION
438 COMPRESS_MIN, // SPELLING
439 COMPRESS_MIN // SYNONYM
442 void
443 HoneyVersion::create(unsigned blocksize)
445 AssertRel(blocksize,>=,HONEY_MIN_BLOCKSIZE);
446 uuid_generate(uuid);
447 for (unsigned table_no = 0; table_no < Honey::MAX_; ++table_no) {
448 root[table_no].init(blocksize, compress_min_tab[table_no]);
452 namespace Honey {
454 void
455 RootInfo::init(unsigned blocksize_, uint4 compress_min_)
457 AssertRel(blocksize_,>=,HONEY_MIN_BLOCKSIZE);
458 offset = 0;
459 root = 0;
460 level = 0;
461 num_entries = 0;
462 root_is_fake = true;
463 sequential = true;
464 blocksize = blocksize_;
465 compress_min = compress_min_;
466 fl_serialised.resize(0);
469 void
470 RootInfo::serialise(string &s) const
472 AssertRel(offset, >=, 0);
473 std::make_unsigned<off_t>::type uoffset = offset;
474 AssertRel(root, >=, uoffset);
475 pack_uint(s, uoffset);
476 pack_uint(s, root - uoffset);
477 unsigned val = level << 2;
478 if (sequential) val |= 0x02;
479 if (root_is_fake) val |= 0x01;
480 pack_uint(s, val);
481 pack_uint(s, num_entries);
482 pack_uint(s, blocksize >> 11);
483 pack_uint(s, compress_min);
484 pack_string(s, fl_serialised);
487 bool
488 RootInfo::unserialise(const char ** p, const char * end)
490 std::make_unsigned<off_t>::type uoffset;
491 unsigned val;
492 if (!unpack_uint(p, end, &uoffset) ||
493 !unpack_uint(p, end, &root) ||
494 !unpack_uint(p, end, &val) ||
495 !unpack_uint(p, end, &num_entries) ||
496 !unpack_uint(p, end, &blocksize) ||
497 !unpack_uint(p, end, &compress_min) ||
498 !unpack_string(p, end, fl_serialised)) return false;
499 offset = uoffset;
500 root += uoffset;
501 level = val >> 2;
502 sequential = val & 0x02;
503 root_is_fake = val & 0x01;
504 blocksize <<= 11;
505 AssertRel(blocksize,>=,HONEY_MIN_BLOCKSIZE);
506 return true;