HoneyCursor::find_entry() now uses the index
[xapian.git] / xapian-core / backends / honey / honey_cursor.h
blob14563790a9abbf0f38903ac240a9868f67f97769
1 #ifndef XAPIAN_INCLUDED_HONEY_CURSOR_H
2 #define XAPIAN_INCLUDED_HONEY_CURSOR_H
4 #include "honey_table.h"
6 #define DEBUGGING false
8 class HoneyCursor {
9 void rewind() {
10 fh.set_pos(0 * root);
11 last_key = std::string();
12 is_at_end = false;
13 index = root;
16 public:
17 BufferedFile fh;
18 std::string current_key, current_tag;
19 mutable size_t val_size = 0;
20 bool current_compressed = false;
21 mutable CompressionStream comp_stream;
22 bool is_at_end = false;
23 mutable std::string last_key;
25 // File offset to start of index and to current position in index.
26 off_t root, index;
28 // Forward to next constructor form.
29 explicit HoneyCursor(const HoneyTable* table)
30 : HoneyCursor(table->fh, table->get_root()) {}
32 HoneyCursor(const BufferedFile& fh_, off_t root_)
33 : fh(fh_),
34 comp_stream(Z_DEFAULT_STRATEGY),
35 root(root_),
36 index(root_)
38 fh.set_pos(0 * root);
41 HoneyCursor(const HoneyCursor& o)
42 : fh(o.fh),
43 current_key(o.current_key),
44 current_tag(o.current_tag), // FIXME really copy?
45 val_size(o.val_size),
46 current_compressed(o.current_compressed),
47 comp_stream(Z_DEFAULT_STRATEGY),
48 is_at_end(o.is_at_end),
49 last_key(o.last_key),
50 root(o.root),
51 index(o.index)
53 fh.set_pos(o.fh.get_pos());
56 bool after_end() const { return is_at_end; }
58 bool next() {
59 if (is_at_end) {
60 Assert(false);
61 return false;
64 if (val_size) {
65 // Skip val data we've not looked at.
66 fh.skip(val_size);
67 val_size = 0;
70 if (fh.get_pos() == root) {
71 is_at_end = true;
72 return false;
75 int ch = fh.read();
76 if (ch == EOF) {
77 is_at_end = true;
78 return false;
81 size_t reuse = 0;
82 if (!last_key.empty()) {
83 reuse = ch;
84 ch = fh.read();
85 if (ch == EOF) throw Xapian::DatabaseError("EOF/error while reading key length", errno);
87 size_t key_size = ch;
88 char buf[256];
89 if (!fh.read(buf, key_size))
90 throw Xapian::DatabaseError("read of " + str(key_size) + " bytes of key data failed", errno);
91 current_key.assign(last_key, 0, reuse);
92 current_key.append(buf, key_size);
93 last_key = current_key;
95 if (DEBUGGING) {
96 std::string esc;
97 description_append(esc, current_key);
98 std::cerr << "K:" << esc << std::endl;
101 int r;
103 // FIXME: rework to take advantage of buffering that's happening anyway?
104 char * p = buf;
105 for (int i = 0; i < 8; ++i) {
106 int ch2 = fh.read();
107 if (ch2 == EOF) {
108 break;
110 *p++ = ch2;
111 if (ch2 < 128) break;
113 r = p - buf;
115 const char* p = buf;
116 const char* end = p + r;
117 if (!unpack_uint(&p, end, &val_size)) {
118 throw Xapian::DatabaseError("val_size unpack_uint invalid");
120 if (p != end) std::abort();
121 current_compressed = val_size & 1;
122 val_size >>= 1;
124 // FIXME: Always resize to 0? Not doing so avoids always having to
125 // clear all the data before reading it.
126 if (true && val_size == 0)
127 current_tag.resize(0);
129 is_at_end = false;
130 return true;
133 bool read_tag(bool keep_compressed = false) {
134 if (val_size) {
135 current_tag.resize(val_size);
136 if (!fh.read(&(current_tag[0]), val_size))
137 throw Xapian::DatabaseError("read of " + str(val_size) + " bytes of value data failed", errno);
138 if (DEBUGGING) {
139 std::cerr << "read " << val_size << " bytes of value data ending @" << fh.get_pos() << std::endl;
141 val_size = 0;
142 if (DEBUGGING) {
143 std::string esc;
144 description_append(esc, current_tag);
145 std::cerr << "V:" << esc << std::endl;
148 if (!keep_compressed && current_compressed) {
149 // Need to decompress.
150 comp_stream.decompress_start();
151 std::string new_tag;
152 if (!comp_stream.decompress_chunk(current_tag.data(), current_tag.size(), new_tag)) {
153 // Decompression didn't complete.
154 abort();
156 swap(current_tag, new_tag);
157 current_compressed = false;
158 if (DEBUGGING) {
159 std::cerr << "decompressed to " << current_tag.size() << " bytes of value data" << std::endl;
162 return current_compressed;
165 bool find_exact(const std::string& key) {
166 if (DEBUGGING) {
167 std::string esc;
168 description_append(esc, key);
169 std::cerr << "find_exact(" << esc << ") @" << fh.get_pos() << std::endl;
171 if (is_at_end) {
172 rewind();
173 } else {
174 // FIXME: use index
175 int cmp0 = current_key.compare(key);
176 if (cmp0 == 0) return true;
177 if (cmp0 > 0) {
178 rewind();
182 while (next()) {
183 int cmp = current_key.compare(key);
184 if (cmp == 0) return true;
185 if (cmp > 0) break;
187 return false;
190 bool find_entry_ge(const std::string& key) {
191 if (DEBUGGING) {
192 std::string esc;
193 description_append(esc, key);
194 std::cerr << "find_entry_ge(" << esc << ") @" << fh.get_pos() << std::endl;
196 if (is_at_end) {
197 rewind();
198 } else {
199 // FIXME: use index
200 int cmp0 = current_key.compare(key);
201 if (cmp0 == 0) return true;
202 if (cmp0 > 0) {
203 rewind();
207 while (next()) {
208 int cmp = current_key.compare(key);
209 if (cmp == 0) return true;
210 if (cmp > 0) return false;
212 is_at_end = true;
213 return false;
216 bool find_entry(const std::string& key) {
217 if (DEBUGGING) {
218 std::string desc;
219 description_append(desc, key);
220 std::cerr << "find_entry(" << desc << ") [LE]" << std::endl;
222 #if 0
223 if (is_at_end) {
224 rewind();
225 } else {
226 // FIXME: use index
227 int cmp0 = current_key.compare(key);
228 if (cmp0 == 0) return true;
229 if (cmp0 > 0) {
230 rewind();
234 off_t pos;
235 std::string k;
236 size_t vs;
237 bool compressed;
238 while (pos = fh.get_pos(), k = current_key, vs = val_size, compressed = current_compressed, next()) {
239 int cmp = current_key.compare(key);
240 if (cmp == 0) return true;
241 if (cmp > 0) break;
243 fh.set_pos(pos);
244 current_key = last_key = k;
245 val_size = vs;
246 current_compressed = compressed;
247 return false;
248 #else
249 if (is_at_end) {
250 rewind();
251 } else {
252 int cmp0 = current_key.compare(key);
253 if (cmp0 == 0) {
254 if (DEBUGGING) std::cerr << " already on it" << std::endl;
255 return true;
257 #if 1
258 if (cmp0 > 0) {
259 rewind();
261 #else
262 // FIXME: If "close" just seek forwards? Or consider seeking from current index pos?
263 //off_t pos = fh.get_pos();
264 fh.set_pos(root);
265 std::string index_key, prev_index_key;
266 std::make_unsigned<off_t>::type ptr = 0;
267 while (true) {
268 int reuse = fh.read();
269 if (reuse == -1) break;
270 int len = fh.read();
271 if (len == -1) std::abort(); // FIXME
272 prev_index_key = index_key;
273 index_key.resize(reuse + len);
274 fh.read(&index_key[reuse], len);
276 if (DEBUGGING) {
277 std::string desc;
278 description_append(desc, index_key);
279 std::cerr << "Index key: " << desc << std::endl;
282 cmp0 = index_key.compare(key);
283 if (cmp0 > 0) break;
284 last_key = ptr ? index_key : std::string(); // for now (a lie, but the reuse part is correct).
285 char buf[8];
286 char* e = buf;
287 while (true) {
288 int b = fh.read();
289 *e++ = b;
290 if ((b & 0x80) == 0) break;
292 const char* p = buf;
293 if (!unpack_uint(&p, e, &ptr) || p != e) std::abort(); // FIXME
294 if (DEBUGGING) std::cerr << " -> " << ptr << std::endl;
295 if (cmp0 == 0) {
296 if (DEBUGGING) std::cerr << " hit straight from index" << std::endl;
297 fh.set_pos(ptr);
298 current_key = index_key;
299 int r;
301 // FIXME: rework to take advantage of buffering that's happening anyway?
302 char * q = buf;
303 for (int i = 0; i < 8; ++i) {
304 int ch2 = fh.read();
305 if (ch2 == EOF) {
306 break;
308 *q++ = ch2;
309 if (ch2 < 128) break;
311 r = q - buf;
313 p = buf;
314 const char* end = p + r;
315 if (!unpack_uint(&p, end, &val_size)) {
316 throw Xapian::DatabaseError("val_size unpack_uint invalid");
318 bool& compressed = current_compressed;
319 compressed = val_size & 1;
320 val_size >>= 1;
321 if (p != end) std::abort();
322 return true;
325 if (DEBUGGING) std::cerr << " cmp0 = " << cmp0 << ", going to " << ptr << std::endl;
326 fh.set_pos(ptr);
328 // FIXME: crude for now
329 if (ptr != 0) {
330 current_key = prev_index_key;
331 char buf[4096];
332 int r;
334 // FIXME: rework to take advantage of buffering that's happening anyway?
335 char * p = buf;
336 for (int i = 0; i < 8; ++i) {
337 int ch2 = fh.read();
338 if (ch2 == EOF) {
339 break;
341 *p++ = ch2;
342 if (ch2 < 128) break;
344 r = p - buf;
346 const char* p = buf;
347 const char* end = p + r;
348 if (!unpack_uint(&p, end, &val_size)) {
349 throw Xapian::DatabaseError("val_size unpack_uint invalid");
351 bool& compressed = current_compressed;
352 auto& val = current_tag;
353 compressed = val_size & 1;
354 val_size >>= 1;
355 val.assign(p, end);
356 if (p != end) std::abort();
357 val_size -= (end - p);
358 while (val_size) {
359 size_t n = std::min(val_size, sizeof(buf));
360 if (!fh.read(buf, n))
361 throw Xapian::DatabaseError("read of " + str(n) + "/" + str(val_size) + " bytes of value data failed", errno);
362 val.append(buf, n);
363 val_size -= n;
365 } else {
366 if (!next()) {
367 std::abort();
371 if (DEBUGGING) {
372 std::string desc;
373 description_append(desc, current_key);
374 std::cerr << "Dropped to data layer on key: " << desc << std::endl;
377 // FIXME: need to put us in the "read key not tag" state but persist that more?
378 // if (cmp0 == 0) this is an exact hit from the index...
379 #endif
382 off_t pos;
383 std::string k;
384 size_t vs;
385 bool compressed;
386 while (true) {
387 pos = fh.get_pos();
388 k = current_key;
389 vs = val_size;
390 compressed = current_compressed;
392 if (DEBUGGING) {
393 std::string desc;
394 description_append(desc, current_key);
395 std::cerr << "@ " << pos << " key: " << desc << std::endl;
398 if (!next()) break;
400 int cmp = current_key.compare(key);
401 if (cmp == 0) {
402 if (DEBUGGING) std::cerr << " found it" << std::endl;
403 return true;
405 if (cmp > 0) break;
408 if (DEBUGGING) {
409 std::string desc;
410 description_append(desc, current_key);
411 std::cerr << " NOT found - reached " << desc << std::endl;
413 // No match so back up to previous entry.
414 is_at_end = false;
415 last_key = current_key = k;
416 val_size = vs;
417 current_compressed = compressed;
418 fh.set_pos(pos);
419 if (DEBUGGING) {
420 std::string desc;
421 description_append(desc, current_key);
422 std::cerr << " NOT found - leaving us on " << desc << std::endl;
424 return false;
425 #endif
428 void find_entry_lt(const std::string& key) {
429 if (DEBUGGING) {
430 std::string esc;
431 description_append(esc, key);
432 std::cerr << "find_entry_lt(" << esc << ") @" << fh.get_pos() << std::endl;
434 // FIXME: use index
435 if (is_at_end || key < current_key) {
436 rewind();
437 current_key.resize(0);
440 off_t pos;
441 std::string k;
442 size_t vs;
443 bool compressed;
444 do {
445 pos = fh.get_pos();
446 k = current_key;
447 vs = val_size;
448 compressed = current_compressed;
449 } while (next() && current_key < key);
451 // Back up to previous entry.
452 is_at_end = false;
453 last_key = current_key = k;
454 val_size = vs;
455 current_compressed = compressed;
456 fh.set_pos(pos);
459 HoneyCursor * clone() const {
460 return new HoneyCursor(*this);
463 bool del() { return false; }
466 class MutableHoneyCursor : public HoneyCursor {
467 public:
468 MutableHoneyCursor(HoneyTable* table_)
469 : HoneyCursor(table_->fh, table_->get_root()) { }
472 #endif // XAPIAN_INCLUDED_HONEY_CURSOR_H