The eighth batch
[alt-git.git] / reftable / reader.c
blob481dff10d49b2fda49c13156b4f10e0dc0f3d276
1 /*
2 Copyright 2020 Google LLC
4 Use of this source code is governed by a BSD-style
5 license that can be found in the LICENSE file or at
6 https://developers.google.com/open-source/licenses/bsd
7 */
9 #include "reader.h"
11 #include "system.h"
12 #include "block.h"
13 #include "constants.h"
14 #include "generic.h"
15 #include "iter.h"
16 #include "record.h"
17 #include "reftable-error.h"
18 #include "reftable-generic.h"
20 uint64_t block_source_size(struct reftable_block_source *source)
22 return source->ops->size(source->arg);
25 int block_source_read_block(struct reftable_block_source *source,
26 struct reftable_block *dest, uint64_t off,
27 uint32_t size)
29 int result = source->ops->read_block(source->arg, dest, off, size);
30 dest->source = *source;
31 return result;
34 void block_source_close(struct reftable_block_source *source)
36 if (!source->ops) {
37 return;
40 source->ops->close(source->arg);
41 source->ops = NULL;
44 static struct reftable_reader_offsets *
45 reader_offsets_for(struct reftable_reader *r, uint8_t typ)
47 switch (typ) {
48 case BLOCK_TYPE_REF:
49 return &r->ref_offsets;
50 case BLOCK_TYPE_LOG:
51 return &r->log_offsets;
52 case BLOCK_TYPE_OBJ:
53 return &r->obj_offsets;
55 abort();
58 static int reader_get_block(struct reftable_reader *r,
59 struct reftable_block *dest, uint64_t off,
60 uint32_t sz)
62 if (off >= r->size)
63 return 0;
65 if (off + sz > r->size) {
66 sz = r->size - off;
69 return block_source_read_block(&r->source, dest, off, sz);
72 uint32_t reftable_reader_hash_id(struct reftable_reader *r)
74 return r->hash_id;
77 const char *reader_name(struct reftable_reader *r)
79 return r->name;
82 static int parse_footer(struct reftable_reader *r, uint8_t *footer,
83 uint8_t *header)
85 uint8_t *f = footer;
86 uint8_t first_block_typ;
87 int err = 0;
88 uint32_t computed_crc;
89 uint32_t file_crc;
91 if (memcmp(f, "REFT", 4)) {
92 err = REFTABLE_FORMAT_ERROR;
93 goto done;
95 f += 4;
97 if (memcmp(footer, header, header_size(r->version))) {
98 err = REFTABLE_FORMAT_ERROR;
99 goto done;
102 f++;
103 r->block_size = get_be24(f);
105 f += 3;
106 r->min_update_index = get_be64(f);
107 f += 8;
108 r->max_update_index = get_be64(f);
109 f += 8;
111 if (r->version == 1) {
112 r->hash_id = GIT_SHA1_FORMAT_ID;
113 } else {
114 r->hash_id = get_be32(f);
115 switch (r->hash_id) {
116 case GIT_SHA1_FORMAT_ID:
117 break;
118 case GIT_SHA256_FORMAT_ID:
119 break;
120 default:
121 err = REFTABLE_FORMAT_ERROR;
122 goto done;
124 f += 4;
127 r->ref_offsets.index_offset = get_be64(f);
128 f += 8;
130 r->obj_offsets.offset = get_be64(f);
131 f += 8;
133 r->object_id_len = r->obj_offsets.offset & ((1 << 5) - 1);
134 r->obj_offsets.offset >>= 5;
136 r->obj_offsets.index_offset = get_be64(f);
137 f += 8;
138 r->log_offsets.offset = get_be64(f);
139 f += 8;
140 r->log_offsets.index_offset = get_be64(f);
141 f += 8;
143 computed_crc = crc32(0, footer, f - footer);
144 file_crc = get_be32(f);
145 f += 4;
146 if (computed_crc != file_crc) {
147 err = REFTABLE_FORMAT_ERROR;
148 goto done;
151 first_block_typ = header[header_size(r->version)];
152 r->ref_offsets.is_present = (first_block_typ == BLOCK_TYPE_REF);
153 r->ref_offsets.offset = 0;
154 r->log_offsets.is_present = (first_block_typ == BLOCK_TYPE_LOG ||
155 r->log_offsets.offset > 0);
156 r->obj_offsets.is_present = r->obj_offsets.offset > 0;
157 if (r->obj_offsets.is_present && !r->object_id_len) {
158 err = REFTABLE_FORMAT_ERROR;
159 goto done;
162 err = 0;
163 done:
164 return err;
167 int init_reader(struct reftable_reader *r, struct reftable_block_source *source,
168 const char *name)
170 struct reftable_block footer = { NULL };
171 struct reftable_block header = { NULL };
172 int err = 0;
173 uint64_t file_size = block_source_size(source);
175 /* Need +1 to read type of first block. */
176 uint32_t read_size = header_size(2) + 1; /* read v2 because it's larger. */
177 memset(r, 0, sizeof(struct reftable_reader));
179 if (read_size > file_size) {
180 err = REFTABLE_FORMAT_ERROR;
181 goto done;
184 err = block_source_read_block(source, &header, 0, read_size);
185 if (err != read_size) {
186 err = REFTABLE_IO_ERROR;
187 goto done;
190 if (memcmp(header.data, "REFT", 4)) {
191 err = REFTABLE_FORMAT_ERROR;
192 goto done;
194 r->version = header.data[4];
195 if (r->version != 1 && r->version != 2) {
196 err = REFTABLE_FORMAT_ERROR;
197 goto done;
200 r->size = file_size - footer_size(r->version);
201 r->source = *source;
202 r->name = xstrdup(name);
203 r->hash_id = 0;
205 err = block_source_read_block(source, &footer, r->size,
206 footer_size(r->version));
207 if (err != footer_size(r->version)) {
208 err = REFTABLE_IO_ERROR;
209 goto done;
212 err = parse_footer(r, footer.data, header.data);
213 done:
214 reftable_block_done(&footer);
215 reftable_block_done(&header);
216 return err;
219 struct table_iter {
220 struct reftable_reader *r;
221 uint8_t typ;
222 uint64_t block_off;
223 struct block_reader br;
224 struct block_iter bi;
225 int is_finished;
227 #define TABLE_ITER_INIT { \
228 .bi = BLOCK_ITER_INIT \
231 static int table_iter_next_in_block(struct table_iter *ti,
232 struct reftable_record *rec)
234 int res = block_iter_next(&ti->bi, rec);
235 if (res == 0 && reftable_record_type(rec) == BLOCK_TYPE_REF) {
236 rec->u.ref.update_index += ti->r->min_update_index;
239 return res;
242 static void table_iter_block_done(struct table_iter *ti)
244 block_reader_release(&ti->br);
245 block_iter_reset(&ti->bi);
248 static int32_t extract_block_size(uint8_t *data, uint8_t *typ, uint64_t off,
249 int version)
251 int32_t result = 0;
253 if (off == 0) {
254 data += header_size(version);
257 *typ = data[0];
258 if (reftable_is_block_type(*typ)) {
259 result = get_be24(data + 1);
261 return result;
264 int reader_init_block_reader(struct reftable_reader *r, struct block_reader *br,
265 uint64_t next_off, uint8_t want_typ)
267 int32_t guess_block_size = r->block_size ? r->block_size :
268 DEFAULT_BLOCK_SIZE;
269 struct reftable_block block = { NULL };
270 uint8_t block_typ = 0;
271 int err = 0;
272 uint32_t header_off = next_off ? 0 : header_size(r->version);
273 int32_t block_size = 0;
275 if (next_off >= r->size)
276 return 1;
278 err = reader_get_block(r, &block, next_off, guess_block_size);
279 if (err < 0)
280 goto done;
282 block_size = extract_block_size(block.data, &block_typ, next_off,
283 r->version);
284 if (block_size < 0) {
285 err = block_size;
286 goto done;
288 if (want_typ != BLOCK_TYPE_ANY && block_typ != want_typ) {
289 err = 1;
290 goto done;
293 if (block_size > guess_block_size) {
294 reftable_block_done(&block);
295 err = reader_get_block(r, &block, next_off, block_size);
296 if (err < 0) {
297 goto done;
301 err = block_reader_init(br, &block, header_off, r->block_size,
302 hash_size(r->hash_id));
303 done:
304 reftable_block_done(&block);
306 return err;
309 static void table_iter_close(struct table_iter *ti)
311 table_iter_block_done(ti);
312 block_iter_close(&ti->bi);
315 static int table_iter_next_block(struct table_iter *ti)
317 uint64_t next_block_off = ti->block_off + ti->br.full_block_size;
318 int err;
320 err = reader_init_block_reader(ti->r, &ti->br, next_block_off, ti->typ);
321 if (err > 0)
322 ti->is_finished = 1;
323 if (err)
324 return err;
326 ti->block_off = next_block_off;
327 ti->is_finished = 0;
328 block_iter_seek_start(&ti->bi, &ti->br);
330 return 0;
333 static int table_iter_next(struct table_iter *ti, struct reftable_record *rec)
335 if (reftable_record_type(rec) != ti->typ)
336 return REFTABLE_API_ERROR;
338 while (1) {
339 int err;
341 if (ti->is_finished)
342 return 1;
345 * Check whether the current block still has more records. If
346 * so, return it. If the iterator returns positive then the
347 * current block has been exhausted.
349 err = table_iter_next_in_block(ti, rec);
350 if (err <= 0)
351 return err;
354 * Otherwise, we need to continue to the next block in the
355 * table and retry. If there are no more blocks then the
356 * iterator is drained.
358 err = table_iter_next_block(ti);
359 if (err) {
360 ti->is_finished = 1;
361 return err;
366 static int table_iter_next_void(void *ti, struct reftable_record *rec)
368 return table_iter_next(ti, rec);
371 static void table_iter_close_void(void *ti)
373 table_iter_close(ti);
376 static struct reftable_iterator_vtable table_iter_vtable = {
377 .next = &table_iter_next_void,
378 .close = &table_iter_close_void,
381 static void iterator_from_table_iter(struct reftable_iterator *it,
382 struct table_iter *ti)
384 assert(!it->ops);
385 it->iter_arg = ti;
386 it->ops = &table_iter_vtable;
389 static int reader_table_iter_at(struct reftable_reader *r,
390 struct table_iter *ti, uint64_t off,
391 uint8_t typ)
393 int err;
395 err = reader_init_block_reader(r, &ti->br, off, typ);
396 if (err != 0)
397 return err;
399 ti->r = r;
400 ti->typ = block_reader_type(&ti->br);
401 ti->block_off = off;
402 block_iter_seek_start(&ti->bi, &ti->br);
403 return 0;
406 static int reader_start(struct reftable_reader *r, struct table_iter *ti,
407 uint8_t typ, int index)
409 struct reftable_reader_offsets *offs = reader_offsets_for(r, typ);
410 uint64_t off = offs->offset;
411 if (index) {
412 off = offs->index_offset;
413 if (off == 0) {
414 return 1;
416 typ = BLOCK_TYPE_INDEX;
419 return reader_table_iter_at(r, ti, off, typ);
422 static int reader_seek_linear(struct table_iter *ti,
423 struct reftable_record *want)
425 struct strbuf want_key = STRBUF_INIT;
426 struct strbuf got_key = STRBUF_INIT;
427 struct reftable_record rec;
428 int err = -1;
430 reftable_record_init(&rec, reftable_record_type(want));
431 reftable_record_key(want, &want_key);
434 * First we need to locate the block that must contain our record. To
435 * do so we scan through blocks linearly until we find the first block
436 * whose first key is bigger than our wanted key. Once we have found
437 * that block we know that the key must be contained in the preceding
438 * block.
440 * This algorithm is somewhat unfortunate because it means that we
441 * always have to seek one block too far and then back up. But as we
442 * can only decode the _first_ key of a block but not its _last_ key we
443 * have no other way to do this.
445 while (1) {
446 struct table_iter next = *ti;
449 * We must be careful to not modify underlying data of `ti`
450 * because we may find that `next` does not contain our desired
451 * block, but that `ti` does. In that case, we would discard
452 * `next` and continue with `ti`.
454 * This also means that we cannot reuse allocated memory for
455 * `next` here. While it would be great if we could, it should
456 * in practice not be too bad given that we should only ever
457 * end up doing linear seeks with at most three blocks. As soon
458 * as we have more than three blocks we would have an index, so
459 * we would not do a linear search there anymore.
461 memset(&next.br.block, 0, sizeof(next.br.block));
462 next.br.zstream = NULL;
463 next.br.uncompressed_data = NULL;
464 next.br.uncompressed_cap = 0;
466 err = table_iter_next_block(&next);
467 if (err < 0)
468 goto done;
469 if (err > 0)
470 break;
472 err = block_reader_first_key(&next.br, &got_key);
473 if (err < 0)
474 goto done;
476 if (strbuf_cmp(&got_key, &want_key) > 0) {
477 table_iter_block_done(&next);
478 break;
481 table_iter_block_done(ti);
482 *ti = next;
486 * We have located the block that must contain our record, so we seek
487 * the wanted key inside of it. If the block does not contain our key
488 * we know that the corresponding record does not exist.
490 err = block_iter_seek_key(&ti->bi, &ti->br, &want_key);
491 if (err < 0)
492 goto done;
493 err = 0;
495 done:
496 reftable_record_release(&rec);
497 strbuf_release(&want_key);
498 strbuf_release(&got_key);
499 return err;
502 static int reader_seek_indexed(struct reftable_reader *r,
503 struct reftable_iterator *it,
504 struct reftable_record *rec)
506 struct reftable_record want_index = {
507 .type = BLOCK_TYPE_INDEX, .u.idx = { .last_key = STRBUF_INIT }
509 struct reftable_record index_result = {
510 .type = BLOCK_TYPE_INDEX,
511 .u.idx = { .last_key = STRBUF_INIT },
513 struct table_iter index_iter = TABLE_ITER_INIT;
514 struct table_iter empty = TABLE_ITER_INIT;
515 struct table_iter next = TABLE_ITER_INIT;
516 int err = 0;
518 reftable_record_key(rec, &want_index.u.idx.last_key);
519 err = reader_start(r, &index_iter, reftable_record_type(rec), 1);
520 if (err < 0)
521 goto done;
524 * The index may consist of multiple levels, where each level may have
525 * multiple index blocks. We start by doing a linear search in the
526 * highest layer that identifies the relevant index block as well as
527 * the record inside that block that corresponds to our wanted key.
529 err = reader_seek_linear(&index_iter, &want_index);
530 if (err < 0)
531 goto done;
534 * Traverse down the levels until we find a non-index entry.
536 while (1) {
538 * In case we seek a record that does not exist the index iter
539 * will tell us that the iterator is over. This works because
540 * the last index entry of the current level will contain the
541 * last key it knows about. So in case our seeked key is larger
542 * than the last indexed key we know that it won't exist.
544 * There is one subtlety in the layout of the index section
545 * that makes this work as expected: the highest-level index is
546 * at end of the section and will point backwards and thus we
547 * start reading from the end of the index section, not the
548 * beginning.
550 * If that wasn't the case and the order was reversed then the
551 * linear seek would seek into the lower levels and traverse
552 * all levels of the index only to find out that the key does
553 * not exist.
555 err = table_iter_next(&index_iter, &index_result);
556 if (err != 0)
557 goto done;
559 err = reader_table_iter_at(r, &next, index_result.u.idx.offset,
561 if (err != 0)
562 goto done;
564 err = block_iter_seek_key(&next.bi, &next.br, &want_index.u.idx.last_key);
565 if (err < 0)
566 goto done;
568 if (next.typ == reftable_record_type(rec)) {
569 err = 0;
570 break;
573 if (next.typ != BLOCK_TYPE_INDEX) {
574 err = REFTABLE_FORMAT_ERROR;
575 break;
578 table_iter_close(&index_iter);
579 index_iter = next;
580 next = empty;
583 if (err == 0) {
584 struct table_iter *malloced = reftable_calloc(1, sizeof(*malloced));
585 *malloced = next;
586 next = empty;
587 iterator_from_table_iter(it, malloced);
590 done:
591 table_iter_close(&next);
592 table_iter_close(&index_iter);
593 reftable_record_release(&want_index);
594 reftable_record_release(&index_result);
595 return err;
598 static int reader_seek_internal(struct reftable_reader *r,
599 struct reftable_iterator *it,
600 struct reftable_record *rec)
602 struct reftable_reader_offsets *offs =
603 reader_offsets_for(r, reftable_record_type(rec));
604 uint64_t idx = offs->index_offset;
605 struct table_iter ti = TABLE_ITER_INIT, *p;
606 int err;
608 if (idx > 0)
609 return reader_seek_indexed(r, it, rec);
611 err = reader_start(r, &ti, reftable_record_type(rec), 0);
612 if (err < 0)
613 goto out;
615 err = reader_seek_linear(&ti, rec);
616 if (err < 0)
617 goto out;
619 REFTABLE_ALLOC_ARRAY(p, 1);
620 *p = ti;
621 iterator_from_table_iter(it, p);
623 out:
624 if (err)
625 table_iter_close(&ti);
626 return err;
629 static int reader_seek(struct reftable_reader *r, struct reftable_iterator *it,
630 struct reftable_record *rec)
632 uint8_t typ = reftable_record_type(rec);
634 struct reftable_reader_offsets *offs = reader_offsets_for(r, typ);
635 if (!offs->is_present) {
636 iterator_set_empty(it);
637 return 0;
640 return reader_seek_internal(r, it, rec);
643 int reftable_reader_seek_ref(struct reftable_reader *r,
644 struct reftable_iterator *it, const char *name)
646 struct reftable_record rec = {
647 .type = BLOCK_TYPE_REF,
648 .u.ref = {
649 .refname = (char *)name,
652 return reader_seek(r, it, &rec);
655 int reftable_reader_seek_log_at(struct reftable_reader *r,
656 struct reftable_iterator *it, const char *name,
657 uint64_t update_index)
659 struct reftable_record rec = { .type = BLOCK_TYPE_LOG,
660 .u.log = {
661 .refname = (char *)name,
662 .update_index = update_index,
663 } };
664 return reader_seek(r, it, &rec);
667 int reftable_reader_seek_log(struct reftable_reader *r,
668 struct reftable_iterator *it, const char *name)
670 uint64_t max = ~((uint64_t)0);
671 return reftable_reader_seek_log_at(r, it, name, max);
674 void reader_close(struct reftable_reader *r)
676 block_source_close(&r->source);
677 FREE_AND_NULL(r->name);
680 int reftable_new_reader(struct reftable_reader **p,
681 struct reftable_block_source *src, char const *name)
683 struct reftable_reader *rd = reftable_calloc(1, sizeof(*rd));
684 int err = init_reader(rd, src, name);
685 if (err == 0) {
686 *p = rd;
687 } else {
688 block_source_close(src);
689 reftable_free(rd);
691 return err;
694 void reftable_reader_free(struct reftable_reader *r)
696 if (!r)
697 return;
698 reader_close(r);
699 reftable_free(r);
702 static int reftable_reader_refs_for_indexed(struct reftable_reader *r,
703 struct reftable_iterator *it,
704 uint8_t *oid)
706 struct reftable_record want = {
707 .type = BLOCK_TYPE_OBJ,
708 .u.obj = {
709 .hash_prefix = oid,
710 .hash_prefix_len = r->object_id_len,
713 struct reftable_iterator oit = { NULL };
714 struct reftable_record got = {
715 .type = BLOCK_TYPE_OBJ,
716 .u.obj = { 0 },
718 int err = 0;
719 struct indexed_table_ref_iter *itr = NULL;
721 /* Look through the reverse index. */
722 err = reader_seek(r, &oit, &want);
723 if (err != 0)
724 goto done;
726 /* read out the reftable_obj_record */
727 err = iterator_next(&oit, &got);
728 if (err < 0)
729 goto done;
731 if (err > 0 || memcmp(want.u.obj.hash_prefix, got.u.obj.hash_prefix,
732 r->object_id_len)) {
733 /* didn't find it; return empty iterator */
734 iterator_set_empty(it);
735 err = 0;
736 goto done;
739 err = new_indexed_table_ref_iter(&itr, r, oid, hash_size(r->hash_id),
740 got.u.obj.offsets,
741 got.u.obj.offset_len);
742 if (err < 0)
743 goto done;
744 got.u.obj.offsets = NULL;
745 iterator_from_indexed_table_ref_iter(it, itr);
747 done:
748 reftable_iterator_destroy(&oit);
749 reftable_record_release(&got);
750 return err;
753 static int reftable_reader_refs_for_unindexed(struct reftable_reader *r,
754 struct reftable_iterator *it,
755 uint8_t *oid)
757 struct table_iter ti_empty = TABLE_ITER_INIT;
758 struct table_iter *ti = reftable_calloc(1, sizeof(*ti));
759 struct filtering_ref_iterator *filter = NULL;
760 struct filtering_ref_iterator empty = FILTERING_REF_ITERATOR_INIT;
761 int oid_len = hash_size(r->hash_id);
762 int err;
764 *ti = ti_empty;
765 err = reader_start(r, ti, BLOCK_TYPE_REF, 0);
766 if (err < 0) {
767 reftable_free(ti);
768 return err;
771 filter = reftable_malloc(sizeof(struct filtering_ref_iterator));
772 *filter = empty;
774 strbuf_add(&filter->oid, oid, oid_len);
775 reftable_table_from_reader(&filter->tab, r);
776 filter->double_check = 0;
777 iterator_from_table_iter(&filter->it, ti);
779 iterator_from_filtering_ref_iterator(it, filter);
780 return 0;
783 int reftable_reader_refs_for(struct reftable_reader *r,
784 struct reftable_iterator *it, uint8_t *oid)
786 if (r->obj_offsets.is_present)
787 return reftable_reader_refs_for_indexed(r, it, oid);
788 return reftable_reader_refs_for_unindexed(r, it, oid);
791 uint64_t reftable_reader_max_update_index(struct reftable_reader *r)
793 return r->max_update_index;
796 uint64_t reftable_reader_min_update_index(struct reftable_reader *r)
798 return r->min_update_index;
801 /* generic table interface. */
803 static int reftable_reader_seek_void(void *tab, struct reftable_iterator *it,
804 struct reftable_record *rec)
806 return reader_seek(tab, it, rec);
809 static uint32_t reftable_reader_hash_id_void(void *tab)
811 return reftable_reader_hash_id(tab);
814 static uint64_t reftable_reader_min_update_index_void(void *tab)
816 return reftable_reader_min_update_index(tab);
819 static uint64_t reftable_reader_max_update_index_void(void *tab)
821 return reftable_reader_max_update_index(tab);
824 static struct reftable_table_vtable reader_vtable = {
825 .seek_record = reftable_reader_seek_void,
826 .hash_id = reftable_reader_hash_id_void,
827 .min_update_index = reftable_reader_min_update_index_void,
828 .max_update_index = reftable_reader_max_update_index_void,
831 void reftable_table_from_reader(struct reftable_table *tab,
832 struct reftable_reader *reader)
834 assert(!tab->ops);
835 tab->ops = &reader_vtable;
836 tab->table_arg = reader;
840 int reftable_reader_print_file(const char *tablename)
842 struct reftable_block_source src = { NULL };
843 int err = reftable_block_source_from_file(&src, tablename);
844 struct reftable_reader *r = NULL;
845 struct reftable_table tab = { NULL };
846 if (err < 0)
847 goto done;
849 err = reftable_new_reader(&r, &src, tablename);
850 if (err < 0)
851 goto done;
853 reftable_table_from_reader(&tab, r);
854 err = reftable_table_print(&tab);
855 done:
856 reftable_reader_free(r);
857 return err;