8 static const char pack_usage
[] = "git-pack-objects [-q] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list";
11 unsigned char sha1
[20];
12 unsigned long size
; /* uncompressed size */
13 unsigned long offset
; /* offset into the final pack file (nonzero if already written) */
14 unsigned int depth
; /* delta depth */
15 unsigned int hash
; /* name hint hash */
16 enum object_type type
;
17 unsigned long delta_size
; /* delta data size (uncompressed) */
18 struct object_entry
*delta
; /* delta base object */
19 struct packed_git
*in_pack
; /* already in pack */
20 enum object_type in_pack_type
; /* could be delta */
21 unsigned int in_pack_offset
;
25 * Objects we are going to pack are colected in objects array (dynamically
26 * expanded). nr_objects & nr_alloc controls this array. They are stored
27 * in the order we see -- typically rev-list --objects order that gives us
28 * nice "minimum seek" order.
30 * sorted-by-sha ans sorted-by-type are arrays of pointers that point at
31 * elements in the objects array. The former is used to build the pack
32 * index (lists object names in the ascending order to help offset lookup),
33 * and the latter is used to group similar things together by try_delta()
37 static unsigned char object_list_sha1
[20];
38 static int non_empty
= 0;
40 static int incremental
= 0;
41 static struct object_entry
**sorted_by_sha
, **sorted_by_type
;
42 static struct object_entry
*objects
= NULL
;
43 static int nr_objects
= 0, nr_alloc
= 0;
44 static const char *base_name
;
45 static unsigned char pack_file_sha1
[20];
46 static int progress
= 1;
49 * The object names in objects array are hashed with this hashtable,
50 * to help looking up the entry by object name. Binary search from
51 * sorted_by_sha is also possible but this was easier to code and faster.
52 * This hashtable is built after all the objects are seen.
54 static int *object_ix
= NULL
;
55 static int object_ix_hashsz
= 0;
58 * Pack index for existing packs give us easy access to the offsets into
59 * corresponding pack file where each object's data starts, but the entries
60 * do not store the size of the compressed representation (uncompressed
61 * size is easily available by examining the pack entry header). We build
62 * a hashtable of existing packs (pack_revindex), and keep reverse index
63 * here -- pack index file is sorted by object name mapping to offset; this
64 * pack_revindex[].revindex array is an ordered list of offsets, so if you
65 * know the offset of an object, next offset is where its packed
66 * representation ends.
68 struct pack_revindex
{
70 unsigned long *revindex
;
71 } *pack_revindex
= NULL
;
72 static int pack_revindex_hashsz
= 0;
77 static int written
= 0;
78 static int reused
= 0;
80 static int pack_revindex_ix(struct packed_git
*p
)
82 unsigned int ui
= (unsigned int) p
;
85 ui
= ui
^ (ui
>> 16); /* defeat structure alignment */
86 i
= (int)(ui
% pack_revindex_hashsz
);
87 while (pack_revindex
[i
].p
) {
88 if (pack_revindex
[i
].p
== p
)
90 if (++i
== pack_revindex_hashsz
)
96 static void prepare_pack_ix(void)
100 for (num
= 0, p
= packed_git
; p
; p
= p
->next
)
104 pack_revindex_hashsz
= num
* 11;
105 pack_revindex
= xcalloc(sizeof(*pack_revindex
), pack_revindex_hashsz
);
106 for (p
= packed_git
; p
; p
= p
->next
) {
107 num
= pack_revindex_ix(p
);
109 pack_revindex
[num
].p
= p
;
111 /* revindex elements are lazily initialized */
114 static int cmp_offset(const void *a_
, const void *b_
)
116 unsigned long a
= *(unsigned long *) a_
;
117 unsigned long b
= *(unsigned long *) b_
;
127 * Ordered list of offsets of objects in the pack.
129 static void prepare_pack_revindex(struct pack_revindex
*rix
)
131 struct packed_git
*p
= rix
->p
;
132 int num_ent
= num_packed_objects(p
);
134 void *index
= p
->index_base
+ 256;
136 rix
->revindex
= xmalloc(sizeof(unsigned long) * (num_ent
+ 1));
137 for (i
= 0; i
< num_ent
; i
++) {
138 long hl
= *((long *)(index
+ 24 * i
));
139 rix
->revindex
[i
] = ntohl(hl
);
141 /* This knows the pack format -- the 20-byte trailer
142 * follows immediately after the last object data.
144 rix
->revindex
[num_ent
] = p
->pack_size
- 20;
145 qsort(rix
->revindex
, num_ent
, sizeof(unsigned long), cmp_offset
);
148 static unsigned long find_packed_object_size(struct packed_git
*p
,
153 struct pack_revindex
*rix
;
154 unsigned long *revindex
;
155 num
= pack_revindex_ix(p
);
157 die("internal error: pack revindex uninitialized");
158 rix
= &pack_revindex
[num
];
160 prepare_pack_revindex(rix
);
161 revindex
= rix
->revindex
;
163 hi
= num_packed_objects(p
) + 1;
165 int mi
= (lo
+ hi
) / 2;
166 if (revindex
[mi
] == ofs
) {
167 return revindex
[mi
+1] - ofs
;
169 else if (ofs
< revindex
[mi
])
174 die("internal error: pack revindex corrupt");
177 static void *delta_against(void *buf
, unsigned long size
, struct object_entry
*entry
)
179 unsigned long othersize
, delta_size
;
181 void *otherbuf
= read_sha1_file(entry
->delta
->sha1
, type
, &othersize
);
185 die("unable to read %s", sha1_to_hex(entry
->delta
->sha1
));
186 delta_buf
= diff_delta(otherbuf
, othersize
,
187 buf
, size
, &delta_size
, 0);
188 if (!delta_buf
|| delta_size
!= entry
->delta_size
)
189 die("delta size changed");
196 * The per-object header is a pretty dense thing, which is
197 * - first byte: low four bits are "size", then three bits of "type",
198 * and the high bit is "size continues".
199 * - each byte afterwards: low seven bits are size continuation,
200 * with the high bit being "size continues"
202 static int encode_header(enum object_type type
, unsigned long size
, unsigned char *hdr
)
207 if (type
< OBJ_COMMIT
|| type
> OBJ_DELTA
)
208 die("bad type %d", type
);
210 c
= (type
<< 4) | (size
& 15);
222 static unsigned long write_object(struct sha1file
*f
, struct object_entry
*entry
)
227 unsigned char header
[10];
228 unsigned hdrlen
, datalen
;
229 enum object_type obj_type
;
231 obj_type
= entry
->type
;
232 if (!entry
->in_pack
||
233 (obj_type
!= entry
->in_pack_type
)) {
234 buf
= read_sha1_file(entry
->sha1
, type
, &size
);
236 die("unable to read %s", sha1_to_hex(entry
->sha1
));
237 if (size
!= entry
->size
)
238 die("object %s size inconsistency (%lu vs %lu)",
239 sha1_to_hex(entry
->sha1
), size
, entry
->size
);
241 buf
= delta_against(buf
, size
, entry
);
242 size
= entry
->delta_size
;
243 obj_type
= OBJ_DELTA
;
246 * The object header is a byte of 'type' followed by zero or
247 * more bytes of length. For deltas, the 20 bytes of delta
250 hdrlen
= encode_header(obj_type
, size
, header
);
251 sha1write(f
, header
, hdrlen
);
254 sha1write(f
, entry
->delta
, 20);
257 datalen
= sha1write_compressed(f
, buf
, size
);
261 struct packed_git
*p
= entry
->in_pack
;
264 datalen
= find_packed_object_size(p
, entry
->in_pack_offset
);
265 buf
= p
->pack_base
+ entry
->in_pack_offset
;
266 sha1write(f
, buf
, datalen
);
268 hdrlen
= 0; /* not really */
272 return hdrlen
+ datalen
;
275 static unsigned long write_one(struct sha1file
*f
,
276 struct object_entry
*e
,
277 unsigned long offset
)
280 /* offset starts from header size and cannot be zero
281 * if it is written already.
285 offset
+= write_object(f
, e
);
286 /* if we are deltified, write out its base object. */
288 offset
= write_one(f
, e
->delta
, offset
);
292 static void write_pack_file(void)
296 unsigned long offset
;
298 struct pack_header hdr
;
301 f
= sha1fd(1, "<stdout>");
303 f
= sha1create("%s-%s.%s", base_name
, sha1_to_hex(object_list_sha1
), "pack");
304 hdr
.hdr_signature
= htonl(PACK_SIGNATURE
);
305 hdr
.hdr_version
= htonl(PACK_VERSION
);
306 hdr
.hdr_entries
= htonl(nr_objects
);
307 sha1write(f
, &hdr
, sizeof(hdr
));
308 offset
= sizeof(hdr
);
309 for (i
= 0; i
< nr_objects
; i
++)
310 offset
= write_one(f
, objects
+ i
, offset
);
312 sha1close(f
, pack_file_sha1
, 1);
315 static void write_index_file(void)
318 struct sha1file
*f
= sha1create("%s-%s.%s", base_name
, sha1_to_hex(object_list_sha1
), "idx");
319 struct object_entry
**list
= sorted_by_sha
;
320 struct object_entry
**last
= list
+ nr_objects
;
321 unsigned int array
[256];
324 * Write the first-level table (the list is sorted,
325 * but we use a 256-entry lookup to be able to avoid
326 * having to do eight extra binary search iterations).
328 for (i
= 0; i
< 256; i
++) {
329 struct object_entry
**next
= list
;
330 while (next
< last
) {
331 struct object_entry
*entry
= *next
;
332 if (entry
->sha1
[0] != i
)
336 array
[i
] = htonl(next
- sorted_by_sha
);
339 sha1write(f
, array
, 256 * sizeof(int));
342 * Write the actual SHA1 entries..
344 list
= sorted_by_sha
;
345 for (i
= 0; i
< nr_objects
; i
++) {
346 struct object_entry
*entry
= *list
++;
347 unsigned int offset
= htonl(entry
->offset
);
348 sha1write(f
, &offset
, 4);
349 sha1write(f
, entry
->sha1
, 20);
351 sha1write(f
, pack_file_sha1
, 20);
352 sha1close(f
, NULL
, 1);
355 static int add_object_entry(unsigned char *sha1
, unsigned int hash
)
357 unsigned int idx
= nr_objects
;
358 struct object_entry
*entry
;
359 struct packed_git
*p
;
360 unsigned int found_offset
;
361 struct packed_git
*found_pack
;
364 for (p
= packed_git
; p
; p
= p
->next
) {
366 if (find_pack_entry_one(sha1
, &e
, p
)) {
369 if (local
&& !p
->pack_local
)
372 found_offset
= e
.offset
;
378 if (idx
>= nr_alloc
) {
379 unsigned int needed
= (idx
+ 1024) * 3 / 2;
380 objects
= xrealloc(objects
, needed
* sizeof(*entry
));
383 entry
= objects
+ idx
;
384 memset(entry
, 0, sizeof(*entry
));
385 memcpy(entry
->sha1
, sha1
, 20);
388 entry
->in_pack
= found_pack
;
389 entry
->in_pack_offset
= found_offset
;
395 static int locate_object_entry_hash(unsigned char *sha1
)
399 memcpy(&ui
, sha1
, sizeof(unsigned int));
400 i
= ui
% object_ix_hashsz
;
401 while (0 < object_ix
[i
]) {
402 if (!memcmp(sha1
, objects
[object_ix
[i
]-1].sha1
, 20))
404 if (++i
== object_ix_hashsz
)
410 static struct object_entry
*locate_object_entry(unsigned char *sha1
)
412 int i
= locate_object_entry_hash(sha1
);
414 return &objects
[object_ix
[i
]-1];
418 static void check_object(struct object_entry
*entry
)
422 if (entry
->in_pack
) {
423 /* Check if it is delta, and the base is also an object
424 * we are going to pack. If so we will reuse the existing
427 unsigned char base
[20];
429 struct object_entry
*base_entry
;
430 if (!check_reuse_pack_delta(entry
->in_pack
,
431 entry
->in_pack_offset
,
433 &entry
->in_pack_type
) &&
434 (base_entry
= locate_object_entry(base
))) {
435 /* We do not know depth at this point, but it
436 * does not matter. Getting delta_chain_length
437 * with packed_object_info_detail() is not so
438 * expensive, so we could do that later if we
439 * wanted to. Calling sha1_object_info to get
440 * the true size (and later an uncompressed
441 * representation) of deeply deltified object
442 * is quite expensive.
445 /* uncompressed size */
446 entry
->size
= entry
->delta_size
= size
;
447 entry
->delta
= base_entry
;
448 entry
->type
= OBJ_DELTA
;
451 /* Otherwise we would do the usual */
454 if (sha1_object_info(entry
->sha1
, type
, &entry
->size
))
455 die("unable to get type of object %s",
456 sha1_to_hex(entry
->sha1
));
458 if (!strcmp(type
, "commit")) {
459 entry
->type
= OBJ_COMMIT
;
460 } else if (!strcmp(type
, "tree")) {
461 entry
->type
= OBJ_TREE
;
462 } else if (!strcmp(type
, "blob")) {
463 entry
->type
= OBJ_BLOB
;
464 } else if (!strcmp(type
, "tag")) {
465 entry
->type
= OBJ_TAG
;
467 die("unable to pack object %s of type %s",
468 sha1_to_hex(entry
->sha1
), type
);
471 static void hash_objects(void)
474 struct object_entry
*oe
;
476 object_ix_hashsz
= nr_objects
* 2;
477 object_ix
= xcalloc(sizeof(int), object_ix_hashsz
);
478 for (i
= 0, oe
= objects
; i
< nr_objects
; i
++, oe
++) {
479 int ix
= locate_object_entry_hash(oe
->sha1
);
481 error("the same object '%s' added twice",
482 sha1_to_hex(oe
->sha1
));
486 object_ix
[ix
] = i
+ 1;
490 static void get_object_details(void)
493 struct object_entry
*entry
= objects
;
497 for (i
= 0; i
< nr_objects
; i
++)
498 check_object(entry
++);
501 typedef int (*entry_sort_t
)(const struct object_entry
*, const struct object_entry
*);
503 static entry_sort_t current_sort
;
505 static int sort_comparator(const void *_a
, const void *_b
)
507 struct object_entry
*a
= *(struct object_entry
**)_a
;
508 struct object_entry
*b
= *(struct object_entry
**)_b
;
509 return current_sort(a
,b
);
512 static struct object_entry
**create_sorted_list(entry_sort_t sort
)
514 struct object_entry
**list
= xmalloc(nr_objects
* sizeof(struct object_entry
*));
517 for (i
= 0; i
< nr_objects
; i
++)
518 list
[i
] = objects
+ i
;
520 qsort(list
, nr_objects
, sizeof(struct object_entry
*), sort_comparator
);
524 static int sha1_sort(const struct object_entry
*a
, const struct object_entry
*b
)
526 return memcmp(a
->sha1
, b
->sha1
, 20);
529 static int type_size_sort(const struct object_entry
*a
, const struct object_entry
*b
)
531 if (a
->type
< b
->type
)
533 if (a
->type
> b
->type
)
535 if (a
->hash
< b
->hash
)
537 if (a
->hash
> b
->hash
)
539 if (a
->size
< b
->size
)
541 if (a
->size
> b
->size
)
543 return a
< b
? -1 : (a
> b
);
547 struct object_entry
*entry
;
552 * We search for deltas _backwards_ in a list sorted by type and
553 * by size, so that we see progressively smaller and smaller files.
554 * That's because we prefer deltas to be from the bigger file
555 * to the smaller - deletes are potentially cheaper, but perhaps
556 * more importantly, the bigger file is likely the more recent
559 static int try_delta(struct unpacked
*cur
, struct unpacked
*old
, unsigned max_depth
)
561 struct object_entry
*cur_entry
= cur
->entry
;
562 struct object_entry
*old_entry
= old
->entry
;
563 unsigned long size
, oldsize
, delta_size
, sizediff
;
567 /* Don't bother doing diffs between different types */
568 if (cur_entry
->type
!= old_entry
->type
)
571 size
= cur_entry
->size
;
574 oldsize
= old_entry
->size
;
575 sizediff
= oldsize
> size
? oldsize
- size
: size
- oldsize
;
576 if (sizediff
> size
/ 8)
578 if (old_entry
->depth
>= max_depth
)
584 * We always delta from the bigger to the smaller, since that's
585 * more space-efficient (deletes don't have to say _what_ they
588 max_size
= size
/ 2 - 20;
589 if (cur_entry
->delta
)
590 max_size
= cur_entry
->delta_size
-1;
591 if (sizediff
>= max_size
)
593 delta_buf
= diff_delta(old
->data
, oldsize
,
594 cur
->data
, size
, &delta_size
, max_size
);
597 cur_entry
->delta
= old_entry
;
598 cur_entry
->delta_size
= delta_size
;
599 cur_entry
->depth
= old_entry
->depth
+ 1;
604 static void find_deltas(struct object_entry
**list
, int window
, int depth
)
607 unsigned int array_size
= window
* sizeof(struct unpacked
);
608 struct unpacked
*array
= xmalloc(array_size
);
611 memset(array
, 0, array_size
);
614 eye_candy
= i
- (nr_objects
/ 20);
617 struct object_entry
*entry
= list
[i
];
618 struct unpacked
*n
= array
+ idx
;
623 if (progress
&& i
<= eye_candy
) {
624 eye_candy
-= nr_objects
/ 20;
629 /* This happens if we decided to reuse existing
636 n
->data
= read_sha1_file(entry
->sha1
, type
, &size
);
637 if (size
!= entry
->size
)
638 die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry
->sha1
), size
, entry
->size
);
641 unsigned int other_idx
= idx
+ j
;
643 if (other_idx
>= window
)
645 m
= array
+ other_idx
;
648 if (try_delta(n
, m
, depth
) < 0)
656 for (i
= 0; i
< window
; ++i
)
661 static void prepare_pack(int window
, int depth
)
664 fprintf(stderr
, "Packing %d objects", nr_objects
);
665 get_object_details();
667 fprintf(stderr
, ".");
669 sorted_by_type
= create_sorted_list(type_size_sort
);
671 find_deltas(sorted_by_type
, window
+1, depth
);
677 static int reuse_cached_pack(unsigned char *sha1
, int pack_to_stdout
)
679 static const char cache
[] = "pack-cache/pack-%s.%s";
680 char *cached_pack
, *cached_idx
;
681 int ifd
, ofd
, ifd_ix
= -1;
683 cached_pack
= git_path(cache
, sha1_to_hex(sha1
), "pack");
684 ifd
= open(cached_pack
, O_RDONLY
);
688 if (!pack_to_stdout
) {
689 cached_idx
= git_path(cache
, sha1_to_hex(sha1
), "idx");
690 ifd_ix
= open(cached_idx
, O_RDONLY
);
697 fprintf(stderr
, "Reusing %d objects pack %s\n", nr_objects
,
700 if (pack_to_stdout
) {
707 snprintf(name
, sizeof(name
),
708 "%s-%s.%s", base_name
, sha1_to_hex(sha1
), "pack");
709 ofd
= open(name
, O_CREAT
| O_EXCL
| O_WRONLY
, 0666);
711 die("unable to open %s (%s)", name
, strerror(errno
));
712 if (copy_fd(ifd
, ofd
))
716 snprintf(name
, sizeof(name
),
717 "%s-%s.%s", base_name
, sha1_to_hex(sha1
), "idx");
718 ofd
= open(name
, O_CREAT
| O_EXCL
| O_WRONLY
, 0666);
720 die("unable to open %s (%s)", name
, strerror(errno
));
721 if (copy_fd(ifd_ix
, ofd
))
724 puts(sha1_to_hex(sha1
));
730 int main(int argc
, char **argv
)
733 char line
[PATH_MAX
+ 20];
734 int window
= 10, depth
= 10, pack_to_stdout
= 0;
735 struct object_entry
**list
;
737 struct timeval prev_tv
;
739 int eye_candy_incr
= 500;
742 setup_git_directory();
744 for (i
= 1; i
< argc
; i
++) {
745 const char *arg
= argv
[i
];
748 if (!strcmp("--non-empty", arg
)) {
752 if (!strcmp("--local", arg
)) {
756 if (!strcmp("--incremental", arg
)) {
760 if (!strncmp("--window=", arg
, 9)) {
762 window
= strtoul(arg
+9, &end
, 0);
767 if (!strncmp("--depth=", arg
, 8)) {
769 depth
= strtoul(arg
+8, &end
, 0);
774 if (!strcmp("-q", arg
)) {
778 if (!strcmp("--stdout", arg
)) {
789 if (pack_to_stdout
!= !base_name
)
792 prepare_packed_git();
794 fprintf(stderr
, "Generating pack...\n");
795 gettimeofday(&prev_tv
, NULL
);
797 while (fgets(line
, sizeof(line
), stdin
) != NULL
) {
800 unsigned char sha1
[20];
802 if (progress
&& (eye_candy
<= nr_objects
)) {
803 fprintf(stderr
, "Counting objects...%d\r", nr_objects
);
804 if (eye_candy
&& (50 <= eye_candy_incr
)) {
807 gettimeofday(&tv
, NULL
);
808 time_diff
= (tv
.tv_sec
- prev_tv
.tv_sec
);
810 time_diff
+= (tv
.tv_usec
- prev_tv
.tv_usec
);
811 if ((1 << 9) < time_diff
)
812 eye_candy_incr
+= 50;
813 else if (50 < eye_candy_incr
)
814 eye_candy_incr
-= 50;
816 eye_candy
+= eye_candy_incr
;
818 if (get_sha1_hex(line
, sha1
))
819 die("expected sha1, got garbage:\n %s", line
);
823 unsigned char c
= *p
++;
826 hash
= hash
* 11 + c
;
828 add_object_entry(sha1
, hash
);
831 fprintf(stderr
, "Done counting %d objects.\n", nr_objects
);
832 if (non_empty
&& !nr_objects
)
835 sorted_by_sha
= create_sorted_list(sha1_sort
);
837 list
= sorted_by_sha
;
838 for (i
= 0; i
< nr_objects
; i
++) {
839 struct object_entry
*entry
= *list
++;
840 SHA1_Update(&ctx
, entry
->sha1
, 20);
842 SHA1_Final(object_list_sha1
, &ctx
);
844 if (reuse_cached_pack(object_list_sha1
, pack_to_stdout
))
847 prepare_pack(window
, depth
);
848 if (!pack_to_stdout
) {
850 puts(sha1_to_hex(object_list_sha1
));
853 fprintf(stderr
, "Total %d, written %d, reused %d\n",
854 nr_objects
, written
, reused
);