2 * diff-delta.c: generate a delta between two buffers
4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5 * http://www.xmailserver.org/xdiff-lib.html
7 * Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
9 * This code is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
14 #include "git-compat-util.h"
17 /* maximum hash entry list for the same hash bucket */
20 #define RABIN_SHIFT 23
21 #define RABIN_WINDOW 16
23 static const unsigned int T
[256] = {
24 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
69 static const unsigned int U
[256] = {
70 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
116 const unsigned char *ptr
;
120 struct unpacked_index_entry
{
121 struct index_entry entry
;
122 struct unpacked_index_entry
*next
;
126 unsigned long memsize
;
128 unsigned long src_size
;
129 unsigned int hash_mask
;
130 struct index_entry
*hash
[FLEX_ARRAY
];
133 struct delta_index
* create_delta_index(const void *buf
, unsigned long bufsize
)
135 unsigned int i
, hsize
, hmask
, entries
, prev_val
, *hash_count
;
136 const unsigned char *data
, *buffer
= buf
;
137 struct delta_index
*index
;
138 struct unpacked_index_entry
*entry
, **hash
;
139 struct index_entry
*packed_entry
, **packed_hash
;
141 unsigned long memsize
;
143 if (!buf
|| !bufsize
)
146 /* Determine index hash size. Note that indexing skips the
147 first byte to allow for optimizing the Rabin's polynomial
148 initialization in create_delta(). */
149 entries
= (bufsize
- 1) / RABIN_WINDOW
;
150 if (bufsize
>= 0xffffffffUL
) {
152 * Current delta format can't encode offsets into
153 * reference buffer with more than 32 bits.
155 entries
= 0xfffffffeU
/ RABIN_WINDOW
;
158 for (i
= 4; (1u << i
) < hsize
&& i
< 31; i
++);
162 /* allocate lookup index */
163 memsize
= sizeof(*hash
) * hsize
+
164 sizeof(*entry
) * entries
;
165 mem
= malloc(memsize
);
172 memset(hash
, 0, hsize
* sizeof(*hash
));
174 /* allocate an array to count hash entries */
175 hash_count
= calloc(hsize
, sizeof(*hash_count
));
181 /* then populate the index */
183 for (data
= buffer
+ entries
* RABIN_WINDOW
- RABIN_WINDOW
;
185 data
-= RABIN_WINDOW
) {
186 unsigned int val
= 0;
187 for (i
= 1; i
<= RABIN_WINDOW
; i
++)
188 val
= ((val
<< 8) | data
[i
]) ^ T
[val
>> RABIN_SHIFT
];
189 if (val
== prev_val
) {
190 /* keep the lowest of consecutive identical blocks */
191 entry
[-1].entry
.ptr
= data
+ RABIN_WINDOW
;
196 entry
->entry
.ptr
= data
+ RABIN_WINDOW
;
197 entry
->entry
.val
= val
;
198 entry
->next
= hash
[i
];
205 * Determine a limit on the number of entries in the same hash
206 * bucket. This guards us against pathological data sets causing
207 * really bad hash distribution with most entries in the same hash
208 * bucket that would bring us to O(m*n) computing costs (m and n
209 * corresponding to reference and target buffer sizes).
211 * Make sure none of the hash buckets has more entries than
212 * we're willing to test. Otherwise we cull the entry list
213 * uniformly to still preserve a good repartition across
214 * the reference buffer.
216 for (i
= 0; i
< hsize
; i
++) {
219 if (hash_count
[i
] <= HASH_LIMIT
)
222 /* We leave exactly HASH_LIMIT entries in the bucket */
223 entries
-= hash_count
[i
] - HASH_LIMIT
;
229 * Assume that this loop is gone through exactly
230 * HASH_LIMIT times and is entered and left with
231 * acc==0. So the first statement in the loop
232 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
233 * to the accumulator, and the inner loop consequently
234 * is run (hash_count[i]-HASH_LIMIT) times, removing
235 * one element from the list each time. Since acc
236 * balances out to 0 at the final run, the inner loop
237 * body can't be left with entry==NULL. So we indeed
238 * encounter entry==NULL in the outer loop only.
241 acc
+= hash_count
[i
] - HASH_LIMIT
;
243 struct unpacked_index_entry
*keep
= entry
;
248 keep
->next
= entry
->next
;
256 * Now create the packed index in array form
257 * rather than linked lists.
259 memsize
= sizeof(*index
)
260 + sizeof(*packed_hash
) * (hsize
+1)
261 + sizeof(*packed_entry
) * entries
;
262 mem
= malloc(memsize
);
269 index
->memsize
= memsize
;
270 index
->src_buf
= buf
;
271 index
->src_size
= bufsize
;
272 index
->hash_mask
= hmask
;
276 mem
= packed_hash
+ (hsize
+1);
279 for (i
= 0; i
< hsize
; i
++) {
281 * Coalesce all entries belonging to one linked list
282 * into consecutive array entries.
284 packed_hash
[i
] = packed_entry
;
285 for (entry
= hash
[i
]; entry
; entry
= entry
->next
)
286 *packed_entry
++ = entry
->entry
;
289 /* Sentinel value to indicate the length of the last hash bucket */
290 packed_hash
[hsize
] = packed_entry
;
292 assert(packed_entry
- (struct index_entry
*)mem
== entries
);
298 void free_delta_index(struct delta_index
*index
)
303 unsigned long sizeof_delta_index(struct delta_index
*index
)
306 return index
->memsize
;
312 * The maximum size for any opcode sequence, including the initial header
313 * plus Rabin window plus biggest copy.
315 #define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
318 create_delta(const struct delta_index
*index
,
319 const void *trg_buf
, unsigned long trg_size
,
320 unsigned long *delta_size
, unsigned long max_size
)
322 unsigned int i
, outpos
, outsize
, moff
, msize
, val
;
324 const unsigned char *ref_data
, *ref_top
, *data
, *top
;
327 if (!trg_buf
|| !trg_size
)
332 if (max_size
&& outsize
>= max_size
)
333 outsize
= max_size
+ MAX_OP_SIZE
+ 1;
334 out
= malloc(outsize
);
338 /* store reference buffer size */
341 out
[outpos
++] = i
| 0x80;
346 /* store target buffer size */
349 out
[outpos
++] = i
| 0x80;
354 ref_data
= index
->src_buf
;
355 ref_top
= ref_data
+ index
->src_size
;
357 top
= (const unsigned char *) trg_buf
+ trg_size
;
361 for (i
= 0; i
< RABIN_WINDOW
&& data
< top
; i
++, data
++) {
362 out
[outpos
++] = *data
;
363 val
= ((val
<< 8) | *data
) ^ T
[val
>> RABIN_SHIFT
];
371 struct index_entry
*entry
;
372 val
^= U
[data
[-RABIN_WINDOW
]];
373 val
= ((val
<< 8) | *data
) ^ T
[val
>> RABIN_SHIFT
];
374 i
= val
& index
->hash_mask
;
375 for (entry
= index
->hash
[i
]; entry
< index
->hash
[i
+1]; entry
++) {
376 const unsigned char *ref
= entry
->ptr
;
377 const unsigned char *src
= data
;
378 unsigned int ref_size
= ref_top
- ref
;
379 if (entry
->val
!= val
)
381 if (ref_size
> top
- src
)
382 ref_size
= top
- src
;
383 if (ref_size
<= msize
)
385 while (ref_size
-- && *src
++ == *ref
)
387 if (msize
< ref
- entry
->ptr
) {
388 /* this is our best match so far */
389 msize
= ref
- entry
->ptr
;
390 moff
= entry
->ptr
- ref_data
;
391 if (msize
>= 4096) /* good enough */
400 out
[outpos
++] = *data
++;
402 if (inscnt
== 0x7f) {
403 out
[outpos
- inscnt
- 1] = inscnt
;
412 while (moff
&& ref_data
[moff
-1] == data
[-1]) {
413 /* we can match one byte back */
420 outpos
--; /* remove count slot */
421 inscnt
--; /* make it -1 */
424 out
[outpos
- inscnt
- 1] = inscnt
;
428 /* A copy op is currently limited to 64KB (pack v2) */
429 left
= (msize
< 0x10000) ? 0 : (msize
- 0x10000);
435 if (moff
& 0x000000ff)
436 out
[outpos
++] = moff
>> 0, i
|= 0x01;
437 if (moff
& 0x0000ff00)
438 out
[outpos
++] = moff
>> 8, i
|= 0x02;
439 if (moff
& 0x00ff0000)
440 out
[outpos
++] = moff
>> 16, i
|= 0x04;
441 if (moff
& 0xff000000)
442 out
[outpos
++] = moff
>> 24, i
|= 0x08;
445 out
[outpos
++] = msize
>> 0, i
|= 0x10;
447 out
[outpos
++] = msize
>> 8, i
|= 0x20;
458 for (j
= -RABIN_WINDOW
; j
< 0; j
++)
459 val
= ((val
<< 8) | data
[j
])
460 ^ T
[val
>> RABIN_SHIFT
];
464 if (outpos
>= outsize
- MAX_OP_SIZE
) {
466 outsize
= outsize
* 3 / 2;
467 if (max_size
&& outsize
>= max_size
)
468 outsize
= max_size
+ MAX_OP_SIZE
+ 1;
469 if (max_size
&& outpos
> max_size
)
471 out
= realloc(out
, outsize
);
480 out
[outpos
- inscnt
- 1] = inscnt
;
482 if (max_size
&& outpos
> max_size
) {
487 *delta_size
= outpos
;