[honey] Remove unless code copy
[xapian.git] / xapian-core / backends / honey / honey_table.h
blobb7c2b16a0cd2aae929cc08381ab4935b1385f9ae
1 /** @file honey_table.h
2 * @brief HoneyTable class
3 */
4 /* Copyright (C) 2017,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_HONEY_TABLE_H
22 #define XAPIAN_INCLUDED_HONEY_TABLE_H
24 //#define SSINDEX_ARRAY
25 //#define SSINDEX_BINARY_CHOP
26 #define SSINDEX_SKIPLIST
28 #define SSINDEX_BINARY_CHOP_KEY_SIZE 4
30 //#include "xapian/constants.h"
31 #include "xapian/error.h"
33 #include <algorithm>
34 #if 0
35 #include <iostream> // FIXME
36 #endif
38 #include <cstdio> // For EOF
39 #include <cstdlib> // std::abort()
40 #include <type_traits>
41 #ifdef HAVE_SYS_UIO_H
42 # include <sys/uio.h>
43 #endif
45 #include <sys/types.h>
46 #include "safesysstat.h"
47 #include "safeunistd.h"
49 #include "safeerrno.h"
51 #include "compression_stream.h"
52 #include "honey_defs.h"
53 #include "honey_version.h"
54 #include "internaltypes.h"
55 #include "io_utils.h"
56 #include "pack.h"
57 #include "str.h"
58 #include "stringutils.h"
59 #include "wordaccess.h"
61 #include "unicode/description_append.h"
63 #ifdef BLK_UNUSED
64 # undef BLK_UNUSED
65 #endif // FIXME: namespace it?
67 const uint4 BLK_UNUSED = uint4(-1);
69 class HoneyFreeListChecker;
71 const int FORCED_CLOSE = -2;
73 class BufferedFile {
74 int fd = -1;
75 mutable off_t pos = 0;
76 bool read_only = true;
77 mutable size_t buf_end = 0;
78 mutable char buf[4096];
80 public:
81 BufferedFile() { }
83 BufferedFile(const BufferedFile& o) : fd(o.fd) {
84 if (!o.read_only) std::abort();
85 #if 0
86 if (o.buf_end) {
87 buf_end = o.buf_end;
88 std::memcpy(buf, o.buf, buf_end);
90 #endif
93 BufferedFile(int fd_, off_t pos_, bool read_only_)
94 : fd(fd_), pos(pos_), read_only(read_only_) {}
96 ~BufferedFile() {
97 // if (fd >= 0) ::close(fd);
100 void close() {
101 if (fd >= 0) {
102 ::close(fd);
103 fd = -1;
107 void force_close() {
108 close();
109 fd = FORCED_CLOSE;
112 void reset_fd(bool permanent) {
113 fd = permanent ? FORCED_CLOSE : -1;
116 bool is_open() const { return fd >= 0; }
118 bool was_forced_closed() const { return fd == FORCED_CLOSE; }
120 bool open(const std::string& path, bool read_only_) {
121 // if (fd >= 0) ::close(fd);
122 read_only = read_only_;
123 if (read_only) {
124 // FIXME: add new io_open_stream_rd() etc?
125 fd = io_open_block_rd(path);
126 } else {
127 // FIXME: Always create anew for now...
128 fd = io_open_block_wr(path, true);
130 return fd >= 0;
133 off_t get_pos() const {
134 return read_only ? pos - buf_end : pos + buf_end;
137 void set_pos(off_t pos_) {
138 if (!read_only) flush();
139 if (false && pos_ >= pos) { // FIXME: need to take buf_end into account
140 skip(pos_ - pos);
141 } else {
142 // FIXME: salvage some of the buffer if we can?
143 buf_end = 0;
144 pos = pos_;
148 void skip(size_t delta) const {
149 if (!read_only) std::abort();
150 // Keep any buffered data we can.
151 if (delta > buf_end) {
152 pos -= buf_end;
153 pos += delta;
154 buf_end = 0;
155 } else {
156 buf_end -= delta;
160 #if 0
161 bool empty() const {
162 if (buf_end) return false;
163 struct stat sbuf;
164 if (fd == -1 || fstat(fd, &sbuf) < 0) return true;
165 return (sbuf.st_size == 0);
167 #endif
169 void write(unsigned char ch) {
170 if (buf_end == sizeof(buf)) {
171 // writev()?
172 io_write(fd, buf, buf_end);
173 pos += buf_end;
174 buf_end = 0;
176 buf[buf_end++] = ch;
179 void write(const char* p, size_t len) {
180 if (buf_end + len <= sizeof(buf)) {
181 memcpy(buf + buf_end, p, len);
182 buf_end += len;
183 return;
186 pos += buf_end + len;
187 #ifdef HAVE_WRITEV
188 while (true) {
189 struct iovec iov[2];
190 iov[0].iov_base = buf;
191 iov[0].iov_len = buf_end;
192 iov[1].iov_base = const_cast<char*>(p);
193 iov[1].iov_len = len;
194 ssize_t n_ = writev(fd, iov, 2);
195 if (n_ < 0) std::abort();
196 size_t n = n_;
197 if (n == buf_end + len) {
198 // Wrote everything.
199 buf_end = 0;
200 return;
202 if (n >= buf_end) {
203 // Wrote all of buf.
204 n -= buf_end;
205 p += n;
206 len -= n;
207 io_write(fd, p, len);
208 buf_end = 0;
209 return;
211 buf_end -= n;
212 memmove(buf, buf + n, buf_end);
214 #else
215 io_write(fd, buf, buf_end);
216 if (len >= sizeof(buf)) {
217 // If it's bigger than our buffer, just write it directly.
218 io_write(fd, p, len);
219 buf_end = 0;
220 return;
222 memcpy(buf, p, len);
223 buf_end = len;
224 #endif
227 int read() const {
228 #if 1
229 if (buf_end == 0) {
230 // The buffer is currently empty, so we need to read at least one
231 // byte.
232 size_t r = io_pread(fd, buf, sizeof(buf), pos, 0);
233 if (r < sizeof(buf)) {
234 if (r == 0) {
235 return EOF;
237 memmove(buf + sizeof(buf) - r, buf, r);
239 pos += r;
240 buf_end = r;
242 return static_cast<unsigned char>(buf[sizeof(buf) - buf_end--]);
243 #else
244 unsigned char ch;
245 if (io_pread(fd, &ch, 1, pos) != 1)
246 return EOF;
247 ++pos;
248 return ch;
249 #endif
252 void read(char* p, size_t len) const {
253 #if 1
254 if (buf_end != 0) {
255 if (len <= buf_end) {
256 memcpy(p, buf + sizeof(buf) - buf_end, len);
257 buf_end -= len;
258 return;
260 memcpy(p, buf + sizeof(buf) - buf_end, buf_end);
261 p += buf_end;
262 len -= buf_end;
263 buf_end = 0;
265 // FIXME: refill buffer if len < sizeof(buf)
266 #endif
267 size_t r = io_pread(fd, p, len, pos, len);
268 // io_pread() should throw an exception if it read < len bytes.
269 AssertEq(r, len);
270 pos += r;
273 void flush() {
274 if (!read_only && buf_end) {
275 io_write(fd, buf, buf_end);
276 pos += buf_end;
277 buf_end = 0;
281 void sync() {
282 io_sync(fd);
285 void rewind(off_t start) {
286 read_only = true;
287 pos = start;
288 buf_end = 0;
292 class HoneyCursor;
294 class SSIndex {
295 std::string data;
296 #if defined SSINDEX_BINARY_CHOP
297 size_t block = size_t(-1);
298 #elif defined SSINDEX_SKIPLIST
299 size_t block = 0;
300 #endif
301 size_t n_index = 0;
302 #if defined SSINDEX_BINARY_CHOP || defined SSINDEX_SKIPLIST
303 std::string last_index_key;
304 #endif
305 // Put an index entry every this much:
306 // FIXME: tune - seems 64K is common elsewhere
307 enum { INDEXBLOCK = 4096 };
308 SSIndex* parent_index = NULL;
310 #ifdef SSINDEX_ARRAY
311 unsigned char first, last = static_cast<unsigned char>(-1);
312 off_t* pointers = NULL;
313 #endif
315 public:
316 SSIndex() {
317 #ifdef SSINDEX_ARRAY
318 // Header added in write() method.
319 #elif defined SSINDEX_BINARY_CHOP
320 data.assign(5, '\x01');
321 #elif defined SSINDEX_SKIPLIST
322 data.assign(1, '\x02');
323 #else
324 # error "SSINDEX type not specified"
325 #endif
328 ~SSIndex() {
329 #ifdef SSINDEX_ARRAY
330 delete [] pointers;
331 #endif
334 void maybe_add_entry(const std::string& key, off_t ptr) {
335 #ifdef SSINDEX_ARRAY
336 unsigned char initial = key[0];
337 if (!pointers) {
338 pointers = new off_t[256]();
339 first = initial;
340 } else if (initial == last) {
341 return;
344 while (++last != initial) {
345 pointers[last] = ptr;
346 // FIXME: Perhaps record this differently so that an exact key
347 // search can return false?
349 pointers[initial] = ptr;
350 last = initial;
351 #elif defined SSINDEX_BINARY_CHOP
352 // We store entries truncated to a maximum width (and trailing zeros
353 // are used to indicate keys shorter than that max width). These then
354 // point to the first key that maps to this truncated value.
356 // We need constant width entries to allow binary chop to work, but
357 // there are other ways to achieve this which could be explored. We
358 // could allow the full key width of 256 bytes, but that would take a
359 // lot more space. We could store a pointer (offset) to the key data,
360 // but that's more complex to read, and adds the pointer overhead. We
361 // could use a "SKO" - a fixed width entry which encodes variable
362 // length pointer and key with short keys in the entry and long keys
363 // pointed to (or prefix included and rest pointed to).
364 if (last_index_key.size() == SSINDEX_BINARY_CHOP_KEY_SIZE) {
365 if (startswith(key, last_index_key)) {
366 return;
370 // Ensure the truncated key doesn't end in a zero byte.
371 if (key.size() >= SSINDEX_BINARY_CHOP_KEY_SIZE) {
372 // FIXME: Start from char N if we have N array index levels above.
373 last_index_key.assign(key, 0, SSINDEX_BINARY_CHOP_KEY_SIZE);
374 if (key[SSINDEX_BINARY_CHOP_KEY_SIZE - 1] == '\0')
375 return;
376 } else {
377 last_index_key = key;
378 if (key.back() == '\0')
379 return;
380 // Pad with zero bytes.
381 last_index_key.resize(SSINDEX_BINARY_CHOP_KEY_SIZE);
384 // Thin entries to at most one per INDEXBLOCK sized block.
385 size_t cur_block = ptr / INDEXBLOCK;
386 if (cur_block == block)
387 return;
389 #if 0
391 std::string esc;
392 description_append(esc, last_index_key);
393 std::cout << "Adding «" << esc << "» -> file offset " << ptr << std::endl;
395 #endif
396 data += last_index_key;
397 size_t c = data.size();
398 data.resize(c + 4);
399 unaligned_write4(reinterpret_cast<unsigned char*>(&data[c]), ptr);
401 block = cur_block;
402 #elif defined SSINDEX_SKIPLIST
403 size_t cur_block = ptr / INDEXBLOCK;
404 if (cur_block == block) return;
406 size_t reuse = common_prefix_length(last_index_key, key);
408 data += char(reuse);
409 data += char(key.size() - reuse);
410 data.append(key, reuse, key.size() - reuse);
411 pack_uint(data, static_cast<std::make_unsigned<off_t>::type>(ptr));
413 block = cur_block;
414 // FIXME: deal with parent_index...
416 last_index_key = key;
417 #else
418 # error "SSINDEX type not specified"
419 #endif
421 ++n_index;
424 off_t write(BufferedFile& fh) {
425 #ifdef SSINDEX_ARRAY
426 if (!pointers) {
427 first = last = 0;
428 pointers = new off_t[1]();
430 data.resize(0);
431 data.resize(3 + (last - first + 1) * 4);
432 data[0] = 0;
433 data[1] = first;
434 data[2] = last - first;
435 for (unsigned ch = first; ch <= last; ++ch) {
436 size_t o = 3 + (ch - first) * 4;
437 // FIXME: Just make offsets 8 bytes? Or allow different widths?
438 off_t ptr = pointers[ch];
439 if (ptr > 0xffffffff)
440 throw Xapian::DatabaseError("Index offset needs >4 bytes");
441 Assert(o + 4 <= data.size());
442 unaligned_write4(reinterpret_cast<unsigned char*>(&data[o]), ptr);
444 delete [] pointers;
445 pointers = NULL;
446 #elif defined SSINDEX_BINARY_CHOP
447 // Fill in bytes 1 to 4 with the number of entries.
448 AssertEq(n_index, (data.size() - 5) / (SSINDEX_BINARY_CHOP_KEY_SIZE + 4));
449 data[1] = n_index >> 24;
450 data[2] = n_index >> 16;
451 data[3] = n_index >> 8;
452 data[4] = n_index;
453 #elif defined SSINDEX_SKIPLIST
454 // Already built in data.
455 #else
456 # error "SSINDEX type not specified"
457 #endif
459 off_t root = fh.get_pos();
460 fh.write(data.data(), data.size());
461 // FIXME: parent stuff...
462 return root;
465 size_t size() const {
466 // FIXME: For SSINDEX_ARRAY, data.size() only correct after calling
467 // write().
468 size_t s = data.size();
469 if (parent_index) s += parent_index->size();
470 return s;
473 size_t get_num_entries() const { return n_index; }
476 class HoneyCursor;
477 class MutableHoneyCursor;
479 class HoneyTable {
480 friend class HoneyCursor; // Allow access to fh. FIXME cleaner way?
481 friend class MutableHoneyCursor; // Allow access to fh. FIXME cleaner way?
483 std::string path;
484 bool read_only;
485 int flags;
486 uint4 compress_min;
487 mutable BufferedFile fh;
488 mutable std::string last_key;
489 SSIndex index;
490 off_t root = -1;
491 honey_tablesize_t num_entries = 0;
492 bool lazy;
494 bool single_file() const { return path.empty(); }
496 /** Offset to add to pointers in this table.
498 * This is zero when each table is a separate file, but likely non-zero
499 * when the tables are all embedded in one file.
501 off_t offset = 0;
503 bool get_exact_entry(const std::string& key, std::string* tag) const;
505 bool read_key(std::string& key, size_t& val_size, bool& compressed) const;
507 void read_val(std::string& val, size_t val_size) const;
509 public:
510 HoneyTable(const char*, const std::string& path_, bool read_only_,
511 bool lazy_ = false)
512 : path(path_ + HONEY_TABLE_EXTENSION),
513 read_only(read_only_),
514 lazy(lazy_)
518 HoneyTable(const char*, int fd, off_t offset_, bool read_only_,
519 bool lazy_ = false)
520 : read_only(read_only_),
521 fh(fd, offset_, read_only_),
522 lazy(lazy_),
523 offset(offset_)
527 static size_t total_index_size;
529 ~HoneyTable() {
530 #if 0
531 size_t index_size = index.size();
532 total_index_size += index_size;
533 if (index_size)
534 std::cout << "*** " << path << " - index " << index_size << " for "
535 << index.get_num_entries() << " entries; total_size = "
536 << total_index_size << std::endl;
537 #endif
538 if (!single_file())
539 fh.close();
540 else
541 fh.reset_fd(false);
544 bool is_writable() const { return !read_only; }
546 int get_flags() const { return flags; }
548 void set_full_compaction(bool) { }
550 void set_max_item_size(unsigned) { }
552 void create_and_open(int flags_, const Honey::RootInfo& root_info);
554 void open(int flags_, const Honey::RootInfo& root_info,
555 honey_revision_number_t);
557 void close(bool permanent) {
558 if (!single_file()) {
559 if (permanent)
560 fh.force_close();
561 else
562 fh.close();
563 } else {
564 fh.reset_fd(permanent);
568 const std::string& get_path() const { return path; }
570 void add(const std::string& key,
571 const char* val,
572 size_t val_size,
573 bool compressed = false);
575 void add(const std::string& key,
576 const std::string& val,
577 bool compressed = false) {
578 add(key, val.data(), val.size(), compressed);
581 void flush_db() {
582 root = index.write(fh);
583 fh.flush();
586 void cancel(const Honey::RootInfo&, honey_revision_number_t) {
587 std::abort();
590 void commit(honey_revision_number_t, Honey::RootInfo* root_info);
592 bool sync() {
593 fh.sync();
594 return true;
597 bool empty() const {
598 return num_entries == 0;
601 bool get_exact_entry(const std::string& key, std::string& tag) const {
602 return get_exact_entry(key, &tag);
605 bool key_exists(const std::string& key) const {
606 return get_exact_entry(key, NULL);
609 bool del(const std::string&) {
610 std::abort();
613 // readahead probably not useful? (FIXME)
614 bool readahead_key(const std::string&) const { return false; }
616 bool is_modified() const { return !read_only && !empty(); }
618 HoneyCursor* cursor_get() const;
620 bool exists() const {
621 struct stat sbuf;
622 return stat(path.c_str(), &sbuf) == 0;
625 bool is_open() const { return fh.is_open(); }
627 void set_changes(HoneyChanges*) { }
629 static void throw_database_closed() {
630 throw Xapian::DatabaseError("Closed!");
633 honey_tablesize_t get_entry_count() const { return num_entries; }
635 off_t get_root() const { return root; }
637 off_t get_offset() const { return offset; }
640 #endif // XAPIAN_INCLUDED_HONEY_TABLE_H