1 /** @file honey_version.cc
2 * @brief HoneyVersion class
4 /* Copyright (C) 2006,2007,2008,2009,2010,2013,2014,2015,2016,2017,2018 Olly Betts
5 * Copyright (C) 2011 Dan Colish
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "honey_version.h"
28 #include "honey_defs.h"
30 #include "min_non_zero.h"
33 #include "posixy_wrapper.h"
34 #include "stringutils.h" // For STRINGIZE() and CONST_STRLEN().
37 #include <cstring> // For memcmp().
39 #include <sys/types.h>
40 #include "safesysstat.h"
41 #include "safefcntl.h"
42 #include "safeunistd.h"
44 #include "stringutils.h"
46 #include "backends/uuids.h"
48 #include "xapian/constants.h"
49 #include "xapian/error.h"
53 /// Honey format version (date of change):
54 #define HONEY_FORMAT_VERSION DATE_TO_VERSION(2018,4,3)
55 // 2018,4,3 1.5.0 outlaw mixed-wdf terms
56 // 2018,3,28 don't special case first entry in SSTable
57 // 2018,3,27 new key format for value stats, value chunks, doclen chunks
58 // 2018,3,26 use known suffix from spelling B and T keys
59 // 2018,3,25 use known prefix from spelling B and H keys
60 // 2018,3,15 avoid storing flat wdf
61 // 2018,3,14 store per term wdf_max
62 // 2018,3,12 binary chop index
63 // 2018,3,11 spelling key encoding changed
64 // 2018,2,22 index valuestream chunks by last docid in chunk
65 // 2018,2,21 index doclen chunks by last docid in chunk
66 // 2018,2,20 implement array index
67 // 2018,2,19 allow 1,2,3 as well as 4 byte doc length width
68 // 2018,2,2 special case tf=2; first_wdf = floor(collfreq/2)
69 // 2018,2,1 pack_uint for postlist data
70 // 2018,1,31 Special case postlist when termfreq==2
71 // 2018,1,30 More compact postlist chunk headers
72 // 2018,1,23 Elide last-first for single occurrence terms
73 // 2018,1,4 Merge values used and terms used
74 // 2018,1,3 Table start offset in RootInfo
75 // 2017,12,30 Value stats key changes
76 // 2017,12,29 User metadata key changes
77 // 2017,12,5 New Honey backend
79 /// Convert date <-> version number. Dates up to 2141-12-31 fit in 2 bytes.
80 #define DATE_TO_VERSION(Y,M,D) \
81 ((unsigned(Y) - 2014) << 9 | unsigned(M) << 5 | unsigned(D))
82 #define VERSION_TO_YEAR(V) ((unsigned(V) >> 9) + 2014)
83 #define VERSION_TO_MONTH(V) ((unsigned(V) >> 5) & 0x0f)
84 #define VERSION_TO_DAY(V) (unsigned(V) & 0x1f)
86 #define HONEY_VERSION_MAGIC_LEN 14
87 #define HONEY_VERSION_MAGIC_AND_VERSION_LEN 16
89 static const char HONEY_VERSION_MAGIC
[HONEY_VERSION_MAGIC_AND_VERSION_LEN
] = {
90 '\x0f', '\x0d', 'X', 'a', 'p', 'i', 'a', 'n', ' ', 'H', 'o', 'n', 'e', 'y',
91 char((HONEY_FORMAT_VERSION
>> 8) & 0xff), char(HONEY_FORMAT_VERSION
& 0xff)
94 HoneyVersion::HoneyVersion(int fd_
)
95 : rev(0), fd(fd_
), offset(0), db_dir(),
96 doccount(0), total_doclen(0), last_docid(0),
97 doclen_lbound(0), doclen_ubound(0),
98 wdf_ubound(0), spelling_wordfreq_ubound(0),
100 uniq_terms_lbound(0), uniq_terms_ubound(0)
102 offset
= lseek(fd
, 0, SEEK_CUR
);
103 if (rare(offset
== off_t(-1))) {
104 string msg
= "lseek failed on file descriptor ";
106 throw Xapian::DatabaseOpeningError(msg
, errno
);
110 HoneyVersion::~HoneyVersion()
112 // Either this is a single-file database, or this fd is from opening a new
113 // version file in write(), but sync() was never called.
121 LOGCALL_VOID(DB
, "HoneyVersion::read", NO_ARGS
);
125 if (rare(lseek(fd
, offset
, SEEK_SET
) == off_t(-1))) {
126 string msg
= "Failed to rewind file descriptor ";
128 throw Xapian::DatabaseOpeningError(msg
, errno
);
132 string filename
= db_dir
;
133 filename
+= "/iamhoney";
134 fd_in
= posixy_open(filename
.c_str(), O_RDONLY
|O_BINARY
);
135 if (rare(fd_in
< 0)) {
136 string msg
= filename
;
137 msg
+= ": Failed to open honey revision file for reading";
138 throw Xapian::DatabaseOpeningError(msg
, errno
);
145 const char * p
= buf
;
146 const char * end
= p
+ io_read(fd_in
, buf
, sizeof(buf
), 33);
148 if (memcmp(buf
, HONEY_VERSION_MAGIC
, HONEY_VERSION_MAGIC_LEN
) != 0)
149 throw Xapian::DatabaseCorruptError("Rev file magic incorrect");
152 version
= static_cast<unsigned char>(buf
[HONEY_VERSION_MAGIC_LEN
]);
154 version
|= static_cast<unsigned char>(buf
[HONEY_VERSION_MAGIC_LEN
+ 1]);
155 if (version
!= HONEY_FORMAT_VERSION
) {
157 if (!single_file()) {
161 msg
+= "Database is format version ";
162 msg
+= str(VERSION_TO_YEAR(version
) * 10000 +
163 VERSION_TO_MONTH(version
) * 100 +
164 VERSION_TO_DAY(version
));
165 msg
+= " but I only understand ";
166 msg
+= str(VERSION_TO_YEAR(HONEY_FORMAT_VERSION
) * 10000 +
167 VERSION_TO_MONTH(HONEY_FORMAT_VERSION
) * 100 +
168 VERSION_TO_DAY(HONEY_FORMAT_VERSION
));
169 throw Xapian::DatabaseVersionError(msg
);
172 p
+= HONEY_VERSION_MAGIC_AND_VERSION_LEN
;
174 p
+= uuid
.BINARY_SIZE
;
176 if (!unpack_uint(&p
, end
, &rev
)) {
177 throw Xapian::DatabaseCorruptError("Rev file failed to decode "
181 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
182 if (!root
[table_no
].unserialise(&p
, end
)) {
183 throw Xapian::DatabaseCorruptError("Rev file root_info missing");
185 old_root
[table_no
] = root
[table_no
];
188 // For a single-file database, this will assign extra data. We read
189 // sizeof(buf) above, then skip HONEY_VERSION_MAGIC_AND_VERSION_LEN,
190 // then 16, then the size of the serialised root info.
191 serialised_stats
.assign(p
, end
);
196 HoneyVersion::serialise_stats()
198 serialised_stats
.resize(0);
199 pack_uint(serialised_stats
, doccount
);
200 // last_docid must always be >= doccount.
201 pack_uint(serialised_stats
, last_docid
- doccount
);
202 pack_uint(serialised_stats
, doclen_lbound
);
203 pack_uint(serialised_stats
, wdf_ubound
);
204 // doclen_ubound should always be >= wdf_ubound, so we store the
205 // difference as it may encode smaller. wdf_ubound is likely to
206 // be larger than doclen_lbound.
207 pack_uint(serialised_stats
, doclen_ubound
- wdf_ubound
);
208 pack_uint(serialised_stats
, oldest_changeset
);
209 pack_uint(serialised_stats
, total_doclen
);
210 pack_uint(serialised_stats
, spelling_wordfreq_ubound
);
211 // If total_doclen == 0 then unique_terms is always zero (or there are no
212 // documents at all) so storing these just complicates things because
213 // uniq_terms_lbound could legitimately be zero.
214 if (total_doclen
!= 0) {
215 // We rely on uniq_terms_lbound being non-zero to detect if it's present
216 // for a single file DB.
217 Assert(uniq_terms_lbound
!= 0);
218 pack_uint(serialised_stats
, uniq_terms_lbound
);
219 pack_uint(serialised_stats
, uniq_terms_ubound
);
224 HoneyVersion::unserialise_stats()
226 const char * p
= serialised_stats
.data();
227 const char * end
= p
+ serialised_stats
.size();
235 oldest_changeset
= 0;
236 spelling_wordfreq_ubound
= 0;
237 uniq_terms_lbound
= 0;
238 uniq_terms_ubound
= 0;
242 if (!unpack_uint(&p
, end
, &doccount
) ||
243 !unpack_uint(&p
, end
, &last_docid
) ||
244 !unpack_uint(&p
, end
, &doclen_lbound
) ||
245 !unpack_uint(&p
, end
, &wdf_ubound
) ||
246 !unpack_uint(&p
, end
, &doclen_ubound
) ||
247 !unpack_uint(&p
, end
, &oldest_changeset
) ||
248 !unpack_uint(&p
, end
, &total_doclen
) ||
249 !unpack_uint(&p
, end
, &spelling_wordfreq_ubound
)) {
251 "Bad serialised DB stats (overflowed)" :
252 "Bad serialised DB stats (out of data)";
253 throw Xapian::DatabaseCorruptError(m
);
256 // last_docid must always be >= doccount.
257 last_docid
+= doccount
;
258 // doclen_ubound should always be >= wdf_ubound, so we store the
259 // difference as it may encode smaller. wdf_ubound is likely to
260 // be larger than doclen_lbound.
261 doclen_ubound
+= wdf_ubound
;
263 // We don't check if there's undecoded data between p and end - in the
264 // single-file DB case there will be extra zero bytes in serialised_stats,
265 // and more generally it's useful to be able to add new stats when it is
266 // safe for old versions to just ignore them and there are sensible values
267 // to use when a new version reads an old database.
269 // Read bounds on unique_terms if stored. This test relies on the first
270 // byte of pack_uint(x) being zero if and only if x is zero, and on
271 // uniq_terms_lbound being non-zero.
272 if (p
== end
|| *p
== '\0') {
273 // No bounds stored so use weak bounds based on other stats.
274 uniq_terms_lbound
= 1;
275 uniq_terms_ubound
= doclen_ubound
;
276 } else if (!unpack_uint(&p
, end
, &uniq_terms_lbound
) ||
277 !unpack_uint(&p
, end
, &uniq_terms_ubound
)) {
279 "Bad serialised unique_terms bounds (overflowed)" :
280 "Bad serialised unique_terms bounds (out of data)";
281 throw Xapian::DatabaseCorruptError(m
);
286 HoneyVersion::merge_stats(const HoneyVersion
& o
)
288 doccount
+= o
.get_doccount();
289 if (doccount
< o
.get_doccount()) {
290 throw Xapian::DatabaseError("doccount overflowed!");
293 doclen_lbound
= min_non_zero(doclen_lbound
, o
.get_doclength_lower_bound());
294 doclen_ubound
= max(doclen_ubound
, o
.get_doclength_upper_bound());
295 wdf_ubound
= max(wdf_ubound
, o
.get_wdf_upper_bound());
296 total_doclen
+= o
.get_total_doclen();
297 if (total_doclen
< o
.get_total_doclen()) {
298 throw Xapian::DatabaseError("Total document length overflowed!");
301 // The upper bounds might be on the same word, so we must sum them.
302 spelling_wordfreq_ubound
+= o
.get_spelling_wordfreq_upper_bound();
304 uniq_terms_lbound
= min_non_zero(uniq_terms_lbound
,
305 o
.get_unique_terms_lower_bound());
306 uniq_terms_ubound
= max(uniq_terms_ubound
,
307 o
.get_unique_terms_upper_bound());
311 HoneyVersion::merge_stats(Xapian::doccount o_doccount
,
312 Xapian::termcount o_doclen_lbound
,
313 Xapian::termcount o_doclen_ubound
,
314 Xapian::termcount o_wdf_ubound
,
315 Xapian::totallength o_total_doclen
,
316 Xapian::termcount o_spelling_wordfreq_ubound
,
317 Xapian::termcount o_uniq_terms_lbound
,
318 Xapian::termcount o_uniq_terms_ubound
)
320 doccount
+= o_doccount
;
321 if (doccount
< o_doccount
) {
322 throw Xapian::DatabaseError("doccount overflowed!");
325 doclen_lbound
= min_non_zero(doclen_lbound
, o_doclen_lbound
);
326 doclen_ubound
= max(doclen_ubound
, o_doclen_ubound
);
327 wdf_ubound
= max(wdf_ubound
, o_wdf_ubound
);
328 total_doclen
+= o_total_doclen
;
329 if (total_doclen
< o_total_doclen
) {
330 throw Xapian::DatabaseError("Total document length overflowed!");
333 // The upper bounds might be on the same word, so we must sum them.
334 spelling_wordfreq_ubound
+= o_spelling_wordfreq_ubound
;
336 uniq_terms_lbound
= min_non_zero(uniq_terms_lbound
, o_uniq_terms_lbound
);
337 uniq_terms_ubound
= max(uniq_terms_ubound
, o_uniq_terms_ubound
);
341 HoneyVersion::cancel()
343 LOGCALL_VOID(DB
, "HoneyVersion::cancel", NO_ARGS
);
344 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
345 root
[table_no
] = old_root
[table_no
];
351 HoneyVersion::write(honey_revision_number_t new_rev
, int flags
)
353 LOGCALL(DB
, const string
, "HoneyVersion::write", new_rev
|flags
);
355 string
s(HONEY_VERSION_MAGIC
, HONEY_VERSION_MAGIC_AND_VERSION_LEN
);
356 s
.append(uuid
.data(), uuid
.BINARY_SIZE
);
358 pack_uint(s
, new_rev
);
360 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
361 root
[table_no
].serialise(s
);
364 // Serialise database statistics.
366 s
+= serialised_stats
;
369 if (!single_file()) {
371 // In dangerous mode, just write the new version file in place.
372 if (flags
& Xapian::DB_DANGEROUS
)
373 tmpfile
+= "/iamhoney";
377 int open_flags
= O_CREAT
|O_TRUNC
|O_WRONLY
|O_BINARY
;
378 fd
= posixy_open(tmpfile
.c_str(), open_flags
, 0666);
380 string msg
= "Couldn't write new rev file: ";
382 throw Xapian::DatabaseOpeningError(msg
, errno
);
385 if (flags
& Xapian::DB_DANGEROUS
)
390 io_write(fd
, s
.data(), s
.size());
401 HoneyVersion::sync(const string
& tmpfile
,
402 honey_revision_number_t new_rev
, int flags
)
404 Assert(new_rev
> rev
|| rev
== 0);
407 if ((flags
& Xapian::DB_NO_SYNC
) == 0 &&
408 ((flags
& Xapian::DB_FULL_SYNC
) ?
414 int fd_to_close
= fd
;
416 if ((flags
& Xapian::DB_NO_SYNC
) == 0 &&
417 ((flags
& Xapian::DB_FULL_SYNC
) ?
418 !io_full_sync(fd_to_close
) :
419 !io_sync(fd_to_close
))) {
420 int save_errno
= errno
;
421 (void)close(fd_to_close
);
422 if (!tmpfile
.empty())
423 (void)unlink(tmpfile
.c_str());
428 if (close(fd_to_close
) != 0) {
429 if (!tmpfile
.empty()) {
430 int save_errno
= errno
;
431 (void)unlink(tmpfile
.c_str());
437 if (!tmpfile
.empty()) {
438 if (!io_tmp_rename(tmpfile
, db_dir
+ "/iamhoney")) {
444 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
445 old_root
[table_no
] = root
[table_no
];
452 // Only try to compress tags longer than this many bytes.
453 const size_t COMPRESS_MIN
= 4;
455 static const uint4 compress_min_tab
[] = {
457 COMPRESS_MIN
, // DOCDATA
458 COMPRESS_MIN
, // TERMLIST
460 COMPRESS_MIN
, // SPELLING
461 COMPRESS_MIN
// SYNONYM
465 HoneyVersion::create()
468 for (unsigned table_no
= 0; table_no
< Honey::MAX_
; ++table_no
) {
469 root
[table_no
].init(compress_min_tab
[table_no
]);
476 RootInfo::init(uint4 compress_min_
)
481 compress_min
= compress_min_
;
482 fl_serialised
.resize(0);
486 RootInfo::serialise(string
&s
) const
488 AssertRel(offset
, >=, 0);
489 std::make_unsigned
<off_t
>::type uoffset
= offset
;
490 AssertRel(root
, >=, offset
);
491 pack_uint(s
, uoffset
);
492 pack_uint(s
, root
- uoffset
);
494 pack_uint(s
, num_entries
);
495 pack_uint(s
, 2048u >> 11);
496 pack_uint(s
, compress_min
);
497 pack_string(s
, fl_serialised
);
501 RootInfo::unserialise(const char ** p
, const char * end
)
503 std::make_unsigned
<off_t
>::type uoffset
, uroot
;
505 unsigned dummy_blocksize
;
506 if (!unpack_uint(p
, end
, &uoffset
) ||
507 !unpack_uint(p
, end
, &uroot
) ||
508 !unpack_uint(p
, end
, &dummy_val
) ||
509 !unpack_uint(p
, end
, &num_entries
) ||
510 !unpack_uint(p
, end
, &dummy_blocksize
) ||
511 !unpack_uint(p
, end
, &compress_min
) ||
512 !unpack_string(p
, end
, fl_serialised
)) return false;
514 root
= uoffset
+ uroot
;
515 // Not meaningful, but still there so that existing honey databases
518 (void)dummy_blocksize
;