allow delta data reuse even if base object is a preferred base
[git/jnareb-git.git] / index-pack.c
blobfffddd25c9b9cc92b837434d7099cb7b4180ce0b
1 #include "cache.h"
2 #include "delta.h"
3 #include "pack.h"
4 #include "csum-file.h"
5 #include "blob.h"
6 #include "commit.h"
7 #include "tag.h"
8 #include "tree.h"
10 static const char index_pack_usage[] =
11 "git-index-pack [-o index-file] pack-file";
13 struct object_entry
15 unsigned long offset;
16 enum object_type type;
17 enum object_type real_type;
18 unsigned char sha1[20];
21 union delta_base {
22 unsigned char sha1[20];
23 unsigned long offset;
26 struct delta_entry
28 struct object_entry *obj;
29 union delta_base base;
32 static const char *pack_name;
33 static unsigned char *pack_base;
34 static unsigned long pack_size;
35 static struct object_entry *objects;
36 static struct delta_entry *deltas;
37 static int nr_objects;
38 static int nr_deltas;
40 static void open_pack_file(void)
42 int fd;
43 struct stat st;
45 fd = open(pack_name, O_RDONLY);
46 if (fd < 0)
47 die("cannot open packfile '%s': %s", pack_name,
48 strerror(errno));
49 if (fstat(fd, &st)) {
50 int err = errno;
51 close(fd);
52 die("cannot fstat packfile '%s': %s", pack_name,
53 strerror(err));
55 pack_size = st.st_size;
56 pack_base = mmap(NULL, pack_size, PROT_READ, MAP_PRIVATE, fd, 0);
57 if (pack_base == MAP_FAILED) {
58 int err = errno;
59 close(fd);
60 die("cannot mmap packfile '%s': %s", pack_name,
61 strerror(err));
63 close(fd);
66 static void parse_pack_header(void)
68 const struct pack_header *hdr;
69 unsigned char sha1[20];
70 SHA_CTX ctx;
72 /* Ensure there are enough bytes for the header and final SHA1 */
73 if (pack_size < sizeof(struct pack_header) + 20)
74 die("packfile '%s' is too small", pack_name);
76 /* Header consistency check */
77 hdr = (void *)pack_base;
78 if (hdr->hdr_signature != htonl(PACK_SIGNATURE))
79 die("packfile '%s' signature mismatch", pack_name);
80 if (!pack_version_ok(hdr->hdr_version))
81 die("packfile '%s' version %d unsupported",
82 pack_name, ntohl(hdr->hdr_version));
84 nr_objects = ntohl(hdr->hdr_entries);
86 /* Check packfile integrity */
87 SHA1_Init(&ctx);
88 SHA1_Update(&ctx, pack_base, pack_size - 20);
89 SHA1_Final(sha1, &ctx);
90 if (hashcmp(sha1, pack_base + pack_size - 20))
91 die("packfile '%s' SHA1 mismatch", pack_name);
94 static void bad_object(unsigned long offset, const char *format,
95 ...) NORETURN __attribute__((format (printf, 2, 3)));
97 static void bad_object(unsigned long offset, const char *format, ...)
99 va_list params;
100 char buf[1024];
102 va_start(params, format);
103 vsnprintf(buf, sizeof(buf), format, params);
104 va_end(params);
105 die("packfile '%s': bad object at offset %lu: %s",
106 pack_name, offset, buf);
109 static void *unpack_entry_data(unsigned long offset,
110 unsigned long *current_pos, unsigned long size)
112 unsigned long pack_limit = pack_size - 20;
113 unsigned long pos = *current_pos;
114 z_stream stream;
115 void *buf = xmalloc(size);
117 memset(&stream, 0, sizeof(stream));
118 stream.next_out = buf;
119 stream.avail_out = size;
120 stream.next_in = pack_base + pos;
121 stream.avail_in = pack_limit - pos;
122 inflateInit(&stream);
124 for (;;) {
125 int ret = inflate(&stream, 0);
126 if (ret == Z_STREAM_END)
127 break;
128 if (ret != Z_OK)
129 bad_object(offset, "inflate returned %d", ret);
131 inflateEnd(&stream);
132 if (stream.total_out != size)
133 bad_object(offset, "size mismatch (expected %lu, got %lu)",
134 size, stream.total_out);
135 *current_pos = pack_limit - stream.avail_in;
136 return buf;
139 static void *unpack_raw_entry(unsigned long offset,
140 enum object_type *obj_type,
141 unsigned long *obj_size,
142 union delta_base *delta_base,
143 unsigned long *next_obj_offset)
145 unsigned long pack_limit = pack_size - 20;
146 unsigned long pos = offset;
147 unsigned char c;
148 unsigned long size, base_offset;
149 unsigned shift;
150 enum object_type type;
151 void *data;
153 c = pack_base[pos++];
154 type = (c >> 4) & 7;
155 size = (c & 15);
156 shift = 4;
157 while (c & 0x80) {
158 if (pos >= pack_limit)
159 bad_object(offset, "object extends past end of pack");
160 c = pack_base[pos++];
161 size += (c & 0x7fUL) << shift;
162 shift += 7;
165 switch (type) {
166 case OBJ_REF_DELTA:
167 if (pos + 20 >= pack_limit)
168 bad_object(offset, "object extends past end of pack");
169 hashcpy(delta_base->sha1, pack_base + pos);
170 pos += 20;
171 break;
172 case OBJ_OFS_DELTA:
173 memset(delta_base, 0, sizeof(*delta_base));
174 c = pack_base[pos++];
175 base_offset = c & 127;
176 while (c & 128) {
177 base_offset += 1;
178 if (!base_offset || base_offset & ~(~0UL >> 7))
179 bad_object(offset, "offset value overflow for delta base object");
180 if (pos >= pack_limit)
181 bad_object(offset, "object extends past end of pack");
182 c = pack_base[pos++];
183 base_offset = (base_offset << 7) + (c & 127);
185 delta_base->offset = offset - base_offset;
186 if (delta_base->offset >= offset)
187 bad_object(offset, "delta base offset is out of bound");
188 break;
189 case OBJ_COMMIT:
190 case OBJ_TREE:
191 case OBJ_BLOB:
192 case OBJ_TAG:
193 break;
194 default:
195 bad_object(offset, "bad object type %d", type);
198 data = unpack_entry_data(offset, &pos, size);
199 *obj_type = type;
200 *obj_size = size;
201 *next_obj_offset = pos;
202 return data;
205 static int find_delta(const union delta_base *base)
207 int first = 0, last = nr_deltas;
209 while (first < last) {
210 int next = (first + last) / 2;
211 struct delta_entry *delta = &deltas[next];
212 int cmp;
214 cmp = memcmp(base, &delta->base, sizeof(*base));
215 if (!cmp)
216 return next;
217 if (cmp < 0) {
218 last = next;
219 continue;
221 first = next+1;
223 return -first-1;
226 static int find_delta_childs(const union delta_base *base,
227 int *first_index, int *last_index)
229 int first = find_delta(base);
230 int last = first;
231 int end = nr_deltas - 1;
233 if (first < 0)
234 return -1;
235 while (first > 0 && !memcmp(&deltas[first - 1].base, base, sizeof(*base)))
236 --first;
237 while (last < end && !memcmp(&deltas[last + 1].base, base, sizeof(*base)))
238 ++last;
239 *first_index = first;
240 *last_index = last;
241 return 0;
244 static void sha1_object(const void *data, unsigned long size,
245 enum object_type type, unsigned char *sha1)
247 SHA_CTX ctx;
248 char header[50];
249 int header_size;
250 const char *type_str;
252 switch (type) {
253 case OBJ_COMMIT: type_str = commit_type; break;
254 case OBJ_TREE: type_str = tree_type; break;
255 case OBJ_BLOB: type_str = blob_type; break;
256 case OBJ_TAG: type_str = tag_type; break;
257 default:
258 die("bad type %d", type);
261 header_size = sprintf(header, "%s %lu", type_str, size) + 1;
263 SHA1_Init(&ctx);
264 SHA1_Update(&ctx, header, header_size);
265 SHA1_Update(&ctx, data, size);
266 SHA1_Final(sha1, &ctx);
269 static void resolve_delta(struct delta_entry *delta, void *base_data,
270 unsigned long base_size, enum object_type type)
272 struct object_entry *obj = delta->obj;
273 void *delta_data;
274 unsigned long delta_size;
275 void *result;
276 unsigned long result_size;
277 enum object_type delta_type;
278 union delta_base delta_base;
279 unsigned long next_obj_offset;
280 int j, first, last;
282 obj->real_type = type;
283 delta_data = unpack_raw_entry(obj->offset, &delta_type,
284 &delta_size, &delta_base,
285 &next_obj_offset);
286 result = patch_delta(base_data, base_size, delta_data, delta_size,
287 &result_size);
288 free(delta_data);
289 if (!result)
290 bad_object(obj->offset, "failed to apply delta");
291 sha1_object(result, result_size, type, obj->sha1);
293 hashcpy(delta_base.sha1, obj->sha1);
294 if (!find_delta_childs(&delta_base, &first, &last)) {
295 for (j = first; j <= last; j++)
296 if (deltas[j].obj->type == OBJ_REF_DELTA)
297 resolve_delta(&deltas[j], result, result_size, type);
300 memset(&delta_base, 0, sizeof(delta_base));
301 delta_base.offset = obj->offset;
302 if (!find_delta_childs(&delta_base, &first, &last)) {
303 for (j = first; j <= last; j++)
304 if (deltas[j].obj->type == OBJ_OFS_DELTA)
305 resolve_delta(&deltas[j], result, result_size, type);
308 free(result);
311 static int compare_delta_entry(const void *a, const void *b)
313 const struct delta_entry *delta_a = a;
314 const struct delta_entry *delta_b = b;
315 return memcmp(&delta_a->base, &delta_b->base, sizeof(union delta_base));
318 static void parse_pack_objects(void)
320 int i;
321 unsigned long offset = sizeof(struct pack_header);
322 struct delta_entry *delta = deltas;
323 void *data;
324 unsigned long data_size;
327 * First pass:
328 * - find locations of all objects;
329 * - calculate SHA1 of all non-delta objects;
330 * - remember base SHA1 for all deltas.
332 for (i = 0; i < nr_objects; i++) {
333 struct object_entry *obj = &objects[i];
334 obj->offset = offset;
335 data = unpack_raw_entry(offset, &obj->type, &data_size,
336 &delta->base, &offset);
337 obj->real_type = obj->type;
338 if (obj->type == OBJ_REF_DELTA || obj->type == OBJ_OFS_DELTA) {
339 nr_deltas++;
340 delta->obj = obj;
341 delta++;
342 } else
343 sha1_object(data, data_size, obj->type, obj->sha1);
344 free(data);
346 if (offset != pack_size - 20)
347 die("packfile '%s' has junk at the end", pack_name);
349 /* Sort deltas by base SHA1/offset for fast searching */
350 qsort(deltas, nr_deltas, sizeof(struct delta_entry),
351 compare_delta_entry);
354 * Second pass:
355 * - for all non-delta objects, look if it is used as a base for
356 * deltas;
357 * - if used as a base, uncompress the object and apply all deltas,
358 * recursively checking if the resulting object is used as a base
359 * for some more deltas.
361 for (i = 0; i < nr_objects; i++) {
362 struct object_entry *obj = &objects[i];
363 union delta_base base;
364 int j, ref, ref_first, ref_last, ofs, ofs_first, ofs_last;
366 if (obj->type == OBJ_REF_DELTA || obj->type == OBJ_OFS_DELTA)
367 continue;
368 hashcpy(base.sha1, obj->sha1);
369 ref = !find_delta_childs(&base, &ref_first, &ref_last);
370 memset(&base, 0, sizeof(base));
371 base.offset = obj->offset;
372 ofs = !find_delta_childs(&base, &ofs_first, &ofs_last);
373 if (!ref && !ofs)
374 continue;
375 data = unpack_raw_entry(obj->offset, &obj->type, &data_size,
376 &base, &offset);
377 if (ref)
378 for (j = ref_first; j <= ref_last; j++)
379 if (deltas[j].obj->type == OBJ_REF_DELTA)
380 resolve_delta(&deltas[j], data,
381 data_size, obj->type);
382 if (ofs)
383 for (j = ofs_first; j <= ofs_last; j++)
384 if (deltas[j].obj->type == OBJ_OFS_DELTA)
385 resolve_delta(&deltas[j], data,
386 data_size, obj->type);
387 free(data);
390 /* Check for unresolved deltas */
391 for (i = 0; i < nr_deltas; i++) {
392 if (deltas[i].obj->real_type == OBJ_REF_DELTA ||
393 deltas[i].obj->real_type == OBJ_OFS_DELTA)
394 die("packfile '%s' has unresolved deltas", pack_name);
398 static int sha1_compare(const void *_a, const void *_b)
400 struct object_entry *a = *(struct object_entry **)_a;
401 struct object_entry *b = *(struct object_entry **)_b;
402 return hashcmp(a->sha1, b->sha1);
405 static void write_index_file(const char *index_name, unsigned char *sha1)
407 struct sha1file *f;
408 struct object_entry **sorted_by_sha, **list, **last;
409 unsigned int array[256];
410 int i;
411 SHA_CTX ctx;
413 if (nr_objects) {
414 sorted_by_sha =
415 xcalloc(nr_objects, sizeof(struct object_entry *));
416 list = sorted_by_sha;
417 last = sorted_by_sha + nr_objects;
418 for (i = 0; i < nr_objects; ++i)
419 sorted_by_sha[i] = &objects[i];
420 qsort(sorted_by_sha, nr_objects, sizeof(sorted_by_sha[0]),
421 sha1_compare);
424 else
425 sorted_by_sha = list = last = NULL;
427 unlink(index_name);
428 f = sha1create("%s", index_name);
431 * Write the first-level table (the list is sorted,
432 * but we use a 256-entry lookup to be able to avoid
433 * having to do eight extra binary search iterations).
435 for (i = 0; i < 256; i++) {
436 struct object_entry **next = list;
437 while (next < last) {
438 struct object_entry *obj = *next;
439 if (obj->sha1[0] != i)
440 break;
441 next++;
443 array[i] = htonl(next - sorted_by_sha);
444 list = next;
446 sha1write(f, array, 256 * sizeof(int));
448 /* recompute the SHA1 hash of sorted object names.
449 * currently pack-objects does not do this, but that
450 * can be fixed.
452 SHA1_Init(&ctx);
454 * Write the actual SHA1 entries..
456 list = sorted_by_sha;
457 for (i = 0; i < nr_objects; i++) {
458 struct object_entry *obj = *list++;
459 unsigned int offset = htonl(obj->offset);
460 sha1write(f, &offset, 4);
461 sha1write(f, obj->sha1, 20);
462 SHA1_Update(&ctx, obj->sha1, 20);
464 sha1write(f, pack_base + pack_size - 20, 20);
465 sha1close(f, NULL, 1);
466 free(sorted_by_sha);
467 SHA1_Final(sha1, &ctx);
470 int main(int argc, char **argv)
472 int i;
473 char *index_name = NULL;
474 char *index_name_buf = NULL;
475 unsigned char sha1[20];
477 for (i = 1; i < argc; i++) {
478 const char *arg = argv[i];
480 if (*arg == '-') {
481 if (!strcmp(arg, "-o")) {
482 if (index_name || (i+1) >= argc)
483 usage(index_pack_usage);
484 index_name = argv[++i];
485 } else
486 usage(index_pack_usage);
487 continue;
490 if (pack_name)
491 usage(index_pack_usage);
492 pack_name = arg;
495 if (!pack_name)
496 usage(index_pack_usage);
497 if (!index_name) {
498 int len = strlen(pack_name);
499 if (!has_extension(pack_name, ".pack"))
500 die("packfile name '%s' does not end with '.pack'",
501 pack_name);
502 index_name_buf = xmalloc(len);
503 memcpy(index_name_buf, pack_name, len - 5);
504 strcpy(index_name_buf + len - 5, ".idx");
505 index_name = index_name_buf;
508 open_pack_file();
509 parse_pack_header();
510 objects = xcalloc(nr_objects, sizeof(struct object_entry));
511 deltas = xcalloc(nr_objects, sizeof(struct delta_entry));
512 parse_pack_objects();
513 free(deltas);
514 write_index_file(index_name, sha1);
515 free(objects);
516 free(index_name_buf);
518 printf("%s\n", sha1_to_hex(sha1));
520 return 0;