1 /** @file honey_version.cc
2 * @brief HoneyVersion class
4 /* Copyright (C) 2006,2007,2008,2009,2010,2013,2014,2015,2016,2017,2018 Olly Betts
5 * Copyright (C) 2011 Dan Colish
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "honey_version.h"
28 #include "honey_defs.h"
32 #include "posixy_wrapper.h"
33 #include "stringutils.h" // For STRINGIZE() and CONST_STRLEN().
35 #include <cstring> // For memcmp().
37 #include "safeerrno.h"
38 #include <sys/types.h>
39 #include "safesysstat.h"
40 #include "safefcntl.h"
41 #include "safeunistd.h"
43 #include "stringutils.h"
45 #include "common/safeuuid.h"
47 #include "xapian/constants.h"
48 #include "xapian/error.h"
// Format version stored in the version file header and compared against the
// value read back from disk.  It is a date packed by DATE_TO_VERSION(); the
// list that follows records the format change made on each date, newest
// entry first.
52 /// Honey format version (date of change):
53 #define HONEY_FORMAT_VERSION DATE_TO_VERSION(2018,3,27)
54 // 2018,3,27 1.5.0 new key format for value stats, value chunks, doclen chunks
55 // 2018,3,26 use known suffix from spelling B and T keys
56 // 2018,3,25 use known prefix from spelling B and H keys
57 // 2018,3,15 avoid storing flat wdf
58 // 2018,3,14 store per term wdf_max
59 // 2018,3,12 binary chop index
60 // 2018,3,11 spelling key encoding changed
61 // 2018,2,22 index valuestream chunks by last docid in chunk
62 // 2018,2,21 index doclen chunks by last docid in chunk
63 // 2018,2,20 implement array index
64 // 2018,2,19 allow 1,2,3 as well as 4 byte doc length width
65 // 2018,2,2 special case tf=2; first_wdf = floor(collfreq/2)
66 // 2018,2,1 pack_uint for postlist data
67 // 2018,1,31 Special case postlist when termfreq==2
68 // 2018,1,30 More compact postlist chunk headers
69 // 2018,1,23 Elide last-first for single occurrence terms
70 // 2018,1,4 Merge values used and terms used
71 // 2018,1,3 Table start offset in RootInfo
72 // 2017,12,30 Value stats key changes
73 // 2017,12,29 User metadata key changes
74 // 2017,12,5 New Honey backend
// Bit layout of a packed version number, as implemented by the macros below:
//   bits 9 and up : year minus 2014
//   bits 5..8     : month (extracted with mask 0x0f)
//   bits 0..4     : day of month (extracted with mask 0x1f)
// DATE_TO_VERSION() does not mask its arguments, so callers must pass a
// month that fits in 4 bits and a day that fits in 5 bits.
76 /// Convert date <-> version number. Dates up to 2141-12-31 fit in 2 bytes.
77 #define DATE_TO_VERSION(Y,M,D) \
78 ((unsigned(Y) - 2014) << 9 | unsigned(M) << 5 | unsigned(D))
79 #define VERSION_TO_YEAR(V) ((unsigned(V) >> 9) + 2014)
80 #define VERSION_TO_MONTH(V) ((unsigned(V) >> 5) & 0x0f)
81 #define VERSION_TO_DAY(V) (unsigned(V) & 0x1f)
// Header layout of the version file: a 14 byte magic string followed by the
// 2 byte format version, giving 16 bytes in total.  The array below holds
// the magic bytes and then HONEY_FORMAT_VERSION with its high byte first.
// NOTE(review): the closing of the array initialiser is not visible in this
// excerpt.
83 #define HONEY_VERSION_MAGIC_LEN 14
84 #define HONEY_VERSION_MAGIC_AND_VERSION_LEN 16
86 static const char HONEY_VERSION_MAGIC
[HONEY_VERSION_MAGIC_AND_VERSION_LEN
] = {
87 '\x0f', '\x0d', 'X', 'a', 'p', 'i', 'a', 'n', ' ', 'H', 'o', 'n', 'e', 'y',
88 char((HONEY_FORMAT_VERSION
>> 8) & 0xff), char(HONEY_FORMAT_VERSION
& 0xff)
// Construct a HoneyVersion around an already-open file descriptor.
//
// @param fd_  File descriptor stored in the fd member.
//
// All statistics members are zero-initialised, db_dir is left empty and
// changes is set to NULL.  The current position of fd is then captured in
// offset via lseek(fd, 0, SEEK_CUR).
//
// Throws Xapian::DatabaseOpeningError (carrying errno) if lseek() returns
// off_t(-1).
// NOTE(review): part of the initialiser list and of the error message
// construction is not visible in this excerpt.
91 HoneyVersion::HoneyVersion(int fd_
)
92 : rev(0), fd(fd_
), offset(0), db_dir(), changes(NULL
),
93 doccount(0), total_doclen(0), last_docid(0),
94 doclen_lbound(0), doclen_ubound(0),
95 wdf_ubound(0), spelling_wordfreq_ubound(0),
// Remember where in the file we currently are.
98 offset
= lseek(fd
, 0, SEEK_CUR
);
99 if (rare(offset
== off_t(-1))) {
100 string msg
= "lseek failed on file descriptor ";
102 throw Xapian::DatabaseOpeningError(msg
, errno
);
// Destructor.
// NOTE(review): the body is not visible in this excerpt; the comment that
// follows describes the situations in which the destructor still holds a
// file descriptor (presumably one it then has to close - confirm against
// the full source).
106 HoneyVersion::~HoneyVersion()
108 // Either this is a single-file database, or this fd is from opening a new
109 // version file in write(), but sync() was never called.
// HoneyVersion::read(): load and validate the version data.
//
// Visible steps:
//  1. Pick the input descriptor: either seek the existing fd back to offset,
//     or open "<db_dir>/iamhoney" read-only.  (The condition choosing
//     between the two is not visible in this excerpt.)
//  2. Read into buf via io_read(), passing sizeof(buf) and 33 as the size
//     arguments.
//  3. Check the 14 byte magic, then decode and check the 2 byte format
//     version.
//  4. Decode rev, then one RootInfo per table; each is also copied to
//     old_root.
//  5. Keep whatever bytes remain as serialised_stats.
//
// Throws Xapian::DatabaseOpeningError if the seek or open fails,
// Xapian::DatabaseCorruptError on bad magic or undecodable contents, and
// Xapian::DatabaseVersionError on a format version mismatch.
117 LOGCALL_VOID(DB
, "HoneyVersion::read", NO_ARGS
);
// Reposition the existing descriptor at the recorded start offset.
121 if (rare(lseek(fd
, offset
, SEEK_SET
) == off_t(-1))) {
122 string msg
= "Failed to rewind file descriptor ";
124 throw Xapian::DatabaseOpeningError(msg
, errno
);
// Otherwise the version data lives in a separate file inside db_dir.
128 string filename
= db_dir
;
129 filename
+= "/iamhoney";
130 fd_in
= posixy_open(filename
.c_str(), O_RDONLY
|O_BINARY
);
131 if (rare(fd_in
< 0)) {
132 string msg
= filename
;
133 msg
+= ": Failed to open honey revision file for reading";
134 throw Xapian::DatabaseOpeningError(msg
, errno
);
// p walks through the bytes read; end marks one past the last byte
// returned by io_read().
141 const char * p
= buf
;
142 const char * end
= p
+ io_read(fd_in
, buf
, sizeof(buf
), 33);
144 if (memcmp(buf
, HONEY_VERSION_MAGIC
, HONEY_VERSION_MAGIC_LEN
) != 0)
145 throw Xapian::DatabaseCorruptError("Rev file magic incorrect");
// The two bytes after the magic hold the format version.
// NOTE(review): a statement between the two below is not visible in this
// excerpt - presumably a shift by 8, since HONEY_VERSION_MAGIC stores the
// high byte first; confirm against the full source.
148 version
= static_cast<unsigned char>(buf
[HONEY_VERSION_MAGIC_LEN
]);
150 version
|= static_cast<unsigned char>(buf
[HONEY_VERSION_MAGIC_LEN
+ 1]);
151 if (version
!= HONEY_FORMAT_VERSION
) {
153 if (!single_file()) {
// Both versions are rendered as the decimal number YYYYMMDD.
157 msg
+= "Database is format version ";
158 msg
+= str(VERSION_TO_YEAR(version
) * 10000 +
159 VERSION_TO_MONTH(version
) * 100 +
160 VERSION_TO_DAY(version
));
161 msg
+= " but I only understand ";
162 msg
+= str(VERSION_TO_YEAR(HONEY_FORMAT_VERSION
) * 10000 +
163 VERSION_TO_MONTH(HONEY_FORMAT_VERSION
) * 100 +
164 VERSION_TO_DAY(HONEY_FORMAT_VERSION
));
165 throw Xapian::DatabaseVersionError(msg
);
// Step past the magic and version bytes.
168 p
+= HONEY_VERSION_MAGIC_AND_VERSION_LEN
;
172 if (!unpack_uint(&p
, end
, &rev
)) {
173 throw Xapian::DatabaseCorruptError("Rev file failed to decode "
// Decode each table's root info and keep a copy in old_root.
177 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
178 if (!root
[table_no
].unserialise(&p
, end
)) {
179 throw Xapian::DatabaseCorruptError("Rev file root_info missing");
181 old_root
[table_no
] = root
[table_no
];
184 // For a single-file database, this will assign extra data. We read
185 // sizeof(buf) above, then skip HONEY_VERSION_MAGIC_AND_VERSION_LEN,
186 // then 16, then the size of the serialised root info.
187 serialised_stats
.assign(p
, end
);
// Rebuild serialised_stats from the current statistics members.
//
// Fields are appended with pack_uint() in this order: doccount,
// last_docid - doccount, doclen_lbound, wdf_ubound,
// doclen_ubound - wdf_ubound, oldest_changeset, total_doclen,
// spelling_wordfreq_ubound.  unserialise_stats() reads them back in the
// same order and undoes the two subtractions.
192 HoneyVersion::serialise_stats()
// Discard any previous contents before appending.
194 serialised_stats
.resize(0);
195 pack_uint(serialised_stats
, doccount
);
196 // last_docid must always be >= doccount.
197 pack_uint(serialised_stats
, last_docid
- doccount
);
198 pack_uint(serialised_stats
, doclen_lbound
);
199 pack_uint(serialised_stats
, wdf_ubound
);
200 // doclen_ubound should always be >= wdf_ubound, so we store the
201 // difference as it may encode smaller. wdf_ubound is likely to
202 // be larger than doclen_lbound.
203 pack_uint(serialised_stats
, doclen_ubound
- wdf_ubound
);
204 pack_uint(serialised_stats
, oldest_changeset
);
205 pack_uint(serialised_stats
, total_doclen
);
206 pack_uint(serialised_stats
, spelling_wordfreq_ubound
);
// Decode serialised_stats back into the statistics members.
//
// Fields are read with unpack_uint() in the order serialise_stats() wrote
// them.  last_docid and doclen_ubound are stored as differences, so
// doccount and wdf_ubound respectively are added back after decoding.
//
// Throws Xapian::DatabaseCorruptError if a field fails to decode, or if
// bytes remain after the last field and this is not a single-file database.
210 HoneyVersion::unserialise_stats()
212 const char * p
= serialised_stats
.data();
213 const char * end
= p
+ serialised_stats
.size();
// NOTE(review): the statements surrounding these two resets (including any
// condition guarding them) are not visible in this excerpt.
221 oldest_changeset
= 0;
222 spelling_wordfreq_ubound
= 0;
// The || chain stops at the first field that fails to decode.
226 if (!unpack_uint(&p
, end
, &doccount
) ||
227 !unpack_uint(&p
, end
, &last_docid
) ||
228 !unpack_uint(&p
, end
, &doclen_lbound
) ||
229 !unpack_uint(&p
, end
, &wdf_ubound
) ||
230 !unpack_uint(&p
, end
, &doclen_ubound
) ||
231 !unpack_uint(&p
, end
, &oldest_changeset
) ||
232 !unpack_uint(&p
, end
, &total_doclen
) ||
233 !unpack_uint(&p
, end
, &spelling_wordfreq_ubound
)) {
// One of two messages is selected here; the selecting condition is on a
// line not visible in this excerpt.
235 "Bad serialised DB stats (overflowed)" :
236 "Bad serialised DB stats (out of data)";
237 throw Xapian::DatabaseCorruptError(m
);
240 // In the single-file DB case, there will be extra data in
241 // serialised_stats, so suppress this check.
242 if (p
!= end
&& !single_file())
243 throw Xapian::DatabaseCorruptError("Rev file has junk at end");
245 // last_docid must always be >= doccount.
246 last_docid
+= doccount
;
247 // doclen_ubound should always be >= wdf_ubound, so we store the
248 // difference as it may encode smaller. wdf_ubound is likely to
249 // be larger than doclen_lbound.
250 doclen_ubound
+= wdf_ubound
;
// Fold the statistics of another HoneyVersion into this one.
//
// @param o  Version object whose statistics are merged in (read through its
//           getters only).
//
// doccount and total_doclen are summed, and wrap-around is detected by
// checking whether the sum ended up smaller than the value added.  The
// document length lower bound becomes the smaller non-zero value of the
// two (0 is treated as "no bound").  doclen_ubound and wdf_ubound take the
// maximum; spelling_wordfreq_ubound is summed.
//
// Throws Xapian::DatabaseError if doccount or total_doclen overflows.
254 HoneyVersion::merge_stats(const HoneyVersion
& o
)
256 doccount
+= o
.get_doccount();
// A sum smaller than the addend means the addition wrapped.
257 if (doccount
< o
.get_doccount()) {
258 throw Xapian::DatabaseError("doccount overflowed!");
261 Xapian::termcount o_doclen_lbound
= o
.get_doclength_lower_bound();
// A lower bound of 0 in o is ignored; a lower bound of 0 here is replaced.
262 if (o_doclen_lbound
> 0) {
263 if (doclen_lbound
== 0 || o_doclen_lbound
< doclen_lbound
)
264 doclen_lbound
= o_doclen_lbound
;
267 doclen_ubound
= max(doclen_ubound
, o
.get_doclength_upper_bound());
268 wdf_ubound
= max(wdf_ubound
, o
.get_wdf_upper_bound());
269 total_doclen
+= o
.get_total_doclen();
270 if (total_doclen
< o
.get_total_doclen()) {
271 throw Xapian::DatabaseError("Total document length overflowed!");
274 // The upper bounds might be on the same word, so we must sum them.
275 spelling_wordfreq_ubound
+= o
.get_spelling_wordfreq_upper_bound();
// Fold explicitly supplied statistics into this object.  The merging rules
// are the same as in the overload taking a HoneyVersion.
//
// @param o_doccount                  Document count to add.
// @param o_doclen_lbound             Document length lower bound; 0 is
//                                    ignored.
// @param o_doclen_ubound             Document length upper bound (maximum
//                                    is kept).
// @param o_wdf_ubound                wdf upper bound (maximum is kept).
// @param o_total_doclen              Total document length to add.
// @param o_spelling_wordfreq_ubound  Spelling word frequency upper bound to
//                                    add.
//
// Throws Xapian::DatabaseError if doccount or total_doclen overflows.
279 HoneyVersion::merge_stats(Xapian::doccount o_doccount
,
280 Xapian::termcount o_doclen_lbound
,
281 Xapian::termcount o_doclen_ubound
,
282 Xapian::termcount o_wdf_ubound
,
283 Xapian::totallength o_total_doclen
,
284 Xapian::termcount o_spelling_wordfreq_ubound
)
286 doccount
+= o_doccount
;
// A sum smaller than the addend means the addition wrapped.
287 if (doccount
< o_doccount
) {
288 throw Xapian::DatabaseError("doccount overflowed!");
// Keep the smaller non-zero lower bound.
291 if (o_doclen_lbound
> 0) {
292 if (doclen_lbound
== 0 || o_doclen_lbound
< doclen_lbound
)
293 doclen_lbound
= o_doclen_lbound
;
296 doclen_ubound
= max(doclen_ubound
, o_doclen_ubound
);
297 wdf_ubound
= max(wdf_ubound
, o_wdf_ubound
);
298 total_doclen
+= o_total_doclen
;
299 if (total_doclen
< o_total_doclen
) {
300 throw Xapian::DatabaseError("Total document length overflowed!");
303 // The upper bounds might be on the same word, so we must sum them.
304 spelling_wordfreq_ubound
+= o_spelling_wordfreq_ubound
;
// Discard pending root changes by restoring every entry of root from its
// saved copy in old_root.
// NOTE(review): any statements after the loop are not visible in this
// excerpt.
308 HoneyVersion::cancel()
310 LOGCALL_VOID(DB
, "HoneyVersion::cancel", NO_ARGS
);
311 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
312 root
[table_no
] = old_root
[table_no
];
// Serialise the version data for a new revision and write it out.
//
// @param new_rev  Revision number to record.
// @param flags    Xapian::DB_* flags; DB_DANGEROUS is tested here.
//
// The serialised form s is built as: magic plus format version (16 bytes),
// 16 bytes taken from uuid, new_rev, the serialisation of every root[]
// entry, then serialised_stats.
//
// When this is not a single-file database, a file is opened with
// O_CREAT|O_TRUNC|O_WRONLY|O_BINARY and mode 0666 and its descriptor is
// stored in fd.  With DB_DANGEROUS set, "/iamhoney" is appended to the file
// name, i.e. the live version file is overwritten in place.  s is written
// with io_write().  The visible changeset code writes a header block
// ('\xfe', new_rev, s.size()) followed by s through changes->write_block().
//
// Throws Xapian::DatabaseOpeningError (carrying errno) from the open
// failure path.
// NOTE(review): many statements of this function (branch conditions, the
// non-dangerous temporary file name, the return statement) are not visible
// in this excerpt.
318 HoneyVersion::write(honey_revision_number_t new_rev
, int flags
)
320 LOGCALL(DB
, const string
, "HoneyVersion::write", new_rev
|flags
);
// Start with the fixed 16 byte header, then the 16 bytes of uuid.
322 string
s(HONEY_VERSION_MAGIC
, HONEY_VERSION_MAGIC_AND_VERSION_LEN
);
323 s
.append(reinterpret_cast<const char *>(uuid
), 16);
325 pack_uint(s
, new_rev
);
327 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
328 root
[table_no
].serialise(s
);
331 // Serialise database statistics.
333 s
+= serialised_stats
;
336 if (!single_file()) {
338 // In dangerous mode, just write the new version file in place.
339 if (flags
& Xapian::DB_DANGEROUS
)
340 tmpfile
+= "/iamhoney";
344 int open_flags
= O_CREAT
|O_TRUNC
|O_WRONLY
|O_BINARY
;
345 fd
= posixy_open(tmpfile
.c_str(), open_flags
, 0666);
347 string msg
= "Couldn't write new rev file: ";
349 throw Xapian::DatabaseOpeningError(msg
, errno
);
352 if (flags
& Xapian::DB_DANGEROUS
)
357 io_write(fd
, s
.data(), s
.size());
// Changeset record: marker byte, revision, payload size, then the payload.
366 changes_buf
+= '\xfe';
367 pack_uint(changes_buf
, new_rev
);
368 pack_uint(changes_buf
, s
.size());
369 changes
->write_block(changes_buf
);
370 changes
->write_block(s
);
// Make a version file produced by write() durable and put it in place.
//
// @param tmpfile  Name of the temporary file written by write(); when it is
//                 non-empty it is unlinked on failure and finally renamed
//                 to "<db_dir>/iamhoney".
// @param new_rev  Revision being committed; asserted to be greater than rev
//                 unless rev is 0.
// @param flags    Xapian::DB_* flags.  Syncing is skipped when DB_NO_SYNC
//                 is set; DB_FULL_SYNC selects io_full_sync() instead of
//                 io_sync().
//
// On the visible failure paths errno is saved before cleanup calls that
// could overwrite it.  After the visible steps, old_root is refreshed from
// root.
// NOTE(review): the return statements, the single-file branch body and the
// handling after a failed rename are not visible in this excerpt.
377 HoneyVersion::sync(const string
& tmpfile
,
378 honey_revision_number_t new_rev
, int flags
)
380 Assert(new_rev
> rev
|| rev
== 0);
383 if ((flags
& Xapian::DB_NO_SYNC
) == 0 &&
384 ((flags
& Xapian::DB_FULL_SYNC
) ?
// Work on a local copy of the descriptor from here on.
390 int fd_to_close
= fd
;
392 if ((flags
& Xapian::DB_NO_SYNC
) == 0 &&
393 ((flags
& Xapian::DB_FULL_SYNC
) ?
394 !io_full_sync(fd_to_close
) :
395 !io_sync(fd_to_close
))) {
// Sync failed: preserve errno, then close and remove the temporary file.
396 int save_errno
= errno
;
397 (void)close(fd_to_close
);
398 if (!tmpfile
.empty())
399 (void)unlink(tmpfile
.c_str());
404 if (close(fd_to_close
) != 0) {
405 if (!tmpfile
.empty()) {
406 int save_errno
= errno
;
407 (void)unlink(tmpfile
.c_str());
// Move the fully written temporary file over the live version file.
413 if (!tmpfile
.empty()) {
414 if (!io_tmp_rename(tmpfile
, db_dir
+ "/iamhoney")) {
// The new roots are now the baseline that cancel() restores.
420 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
421 old_root
[table_no
] = root
[table_no
];
// Per-table compression thresholds, indexed by table number in
// HoneyVersion::create() and passed to RootInfo::init().
// NOTE(review): some table entries and the closing of the initialiser are
// not visible in this excerpt; the trailing comments name the table each
// visible entry belongs to.
428 // Only try to compress tags longer than this many bytes.
429 const size_t COMPRESS_MIN
= 4;
431 static const uint4 compress_min_tab
[] = {
433 COMPRESS_MIN
, // DOCDATA
434 COMPRESS_MIN
, // TERMLIST
436 COMPRESS_MIN
, // SPELLING
437 COMPRESS_MIN
// SYNONYM
// Initialise the root info of every table for a newly created database.
//
// @param blocksize  Block size passed to each RootInfo::init(); asserted to
//                   be at least HONEY_MIN_BLOCKSIZE.
//
// Each table gets its own compression threshold from compress_min_tab.
// NOTE(review): any statements outside the loop are not visible in this
// excerpt.
441 HoneyVersion::create(unsigned blocksize
)
443 AssertRel(blocksize
,>=,HONEY_MIN_BLOCKSIZE
);
445 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
446 root
[table_no
].init(blocksize
, compress_min_tab
[table_no
]);
// Reset this RootInfo for a table that is being created.
//
// @param blocksize_     Block size to record; asserted to be at least
//                       HONEY_MIN_BLOCKSIZE.
// @param compress_min_  Compression threshold to record.
//
// fl_serialised is emptied.
// NOTE(review): the assignments to the remaining members are not visible in
// this excerpt.
453 RootInfo::init(unsigned blocksize_
, uint4 compress_min_
)
455 AssertRel(blocksize_
,>=,HONEY_MIN_BLOCKSIZE
);
462 blocksize
= blocksize_
;
463 compress_min
= compress_min_
;
464 fl_serialised
.resize(0);
// Append the serialised form of this RootInfo to s.
//
// @param s  String the encoded fields are appended to.
//
// Visible field order: offset, root - offset, (a value built from level and
// two flags - see below), num_entries, blocksize >> 11, compress_min,
// fl_serialised.  offset is asserted to be non-negative and root is
// asserted to be at least offset, so the stored difference cannot
// underflow.
468 RootInfo::serialise(string
&s
) const
470 AssertRel(offset
, >=, 0);
// Convert offset to the unsigned counterpart of off_t for packing.
471 std::make_unsigned
<off_t
>::type uoffset
= offset
;
472 AssertRel(root
, >=, uoffset
);
473 pack_uint(s
, uoffset
);
474 pack_uint(s
, root
- uoffset
);
// val combines level (shifted up by 2) with sequential in bit 1 (0x02) and
// root_is_fake in bit 0 (0x01).
// NOTE(review): the statement that packs val is not visible in this excerpt.
475 unsigned val
= level
<< 2;
476 if (sequential
) val
|= 0x02;
477 if (root_is_fake
) val
|= 0x01;
479 pack_uint(s
, num_entries
);
// blocksize is stored shifted right by 11 bits.
480 pack_uint(s
, blocksize
>> 11);
481 pack_uint(s
, compress_min
);
482 pack_string(s
, fl_serialised
);
// Decode a RootInfo from serialised data.
//
// @param p    In/out pointer to the current read position; passed on to the
//             unpack helpers.
// @param end  Pointer to one past the last readable byte.
//
// Returns false as soon as any field fails to decode.  Fields are read in
// the order serialise() writes them.  sequential and root_is_fake are
// recovered from bits 0x02 and 0x01 of the decoded flags value.
// NOTE(review): the statements that recover level, offset, the absolute
// root and the un-shifted blocksize, and the final return, are not visible
// in this excerpt.
486 RootInfo::unserialise(const char ** p
, const char * end
)
488 std::make_unsigned
<off_t
>::type uoffset
;
// The || chain stops at the first field that fails to decode.
490 if (!unpack_uint(p
, end
, &uoffset
) ||
491 !unpack_uint(p
, end
, &root
) ||
492 !unpack_uint(p
, end
, &val
) ||
493 !unpack_uint(p
, end
, &num_entries
) ||
494 !unpack_uint(p
, end
, &blocksize
) ||
495 !unpack_uint(p
, end
, &compress_min
) ||
496 !unpack_string(p
, end
, fl_serialised
)) return false;
500 sequential
= val
& 0x02;
501 root_is_fake
= val
& 0x01;
503 AssertRel(blocksize
,>=,HONEY_MIN_BLOCKSIZE
);