git-commit: don't add multiple Signed-off-by: from the same identity
[git.git] / diff-delta.c
blobfaf96e47130e3a2af26f55fc1173874f078617fc
1 /*
2 * diff-delta.c: generate a delta between two buffers
4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5 * http://www.xmailserver.org/xdiff-lib.html
7 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
9 * This code is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
14 #include "git-compat-util.h"
15 #include "delta.h"
17 /* maximum hash entry list for the same hash bucket */
18 #define HASH_LIMIT 64
20 #define RABIN_SHIFT 23
21 #define RABIN_WINDOW 16
23 static const unsigned int T[256] = {
24 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
69 static const unsigned int U[256] = {
70 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
115 struct index_entry {
116 const unsigned char *ptr;
117 unsigned int val;
118 struct index_entry *next;
121 struct delta_index {
122 const void *src_buf;
123 unsigned long src_size;
124 unsigned int hash_mask;
125 struct index_entry *hash[FLEX_ARRAY];
128 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
130 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
131 const unsigned char *data, *buffer = buf;
132 struct delta_index *index;
133 struct index_entry *entry, **hash;
134 void *mem;
135 unsigned long memsize;
137 if (!buf || !bufsize)
138 return NULL;
140 /* Determine index hash size. Note that indexing skips the
141 first byte to allow for optimizing the Rabin's polynomial
142 initialization in create_delta(). */
143 entries = (bufsize - 1) / RABIN_WINDOW;
144 hsize = entries / 4;
145 for (i = 4; (1u << i) < hsize && i < 31; i++);
146 hsize = 1 << i;
147 hmask = hsize - 1;
149 /* allocate lookup index */
150 memsize = sizeof(*index) +
151 sizeof(*hash) * hsize +
152 sizeof(*entry) * entries;
153 mem = malloc(memsize);
154 if (!mem)
155 return NULL;
156 index = mem;
157 mem = index + 1;
158 hash = mem;
159 mem = hash + hsize;
160 entry = mem;
162 index->src_buf = buf;
163 index->src_size = bufsize;
164 index->hash_mask = hmask;
165 memset(hash, 0, hsize * sizeof(*hash));
167 /* allocate an array to count hash entries */
168 hash_count = calloc(hsize, sizeof(*hash_count));
169 if (!hash_count) {
170 free(index);
171 return NULL;
174 /* then populate the index */
175 prev_val = ~0;
176 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
177 data >= buffer;
178 data -= RABIN_WINDOW) {
179 unsigned int val = 0;
180 for (i = 1; i <= RABIN_WINDOW; i++)
181 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
182 if (val == prev_val) {
183 /* keep the lowest of consecutive identical blocks */
184 entry[-1].ptr = data + RABIN_WINDOW;
185 } else {
186 prev_val = val;
187 i = val & hmask;
188 entry->ptr = data + RABIN_WINDOW;
189 entry->val = val;
190 entry->next = hash[i];
191 hash[i] = entry++;
192 hash_count[i]++;
197 * Determine a limit on the number of entries in the same hash
198 * bucket. This guards us against pathological data sets causing
199 * really bad hash distribution with most entries in the same hash
200 * bucket that would bring us to O(m*n) computing costs (m and n
201 * corresponding to reference and target buffer sizes).
203 * Make sure none of the hash buckets has more entries than
204 * we're willing to test. Otherwise we cull the entry list
205 * uniformly to still preserve a good repartition across
206 * the reference buffer.
208 for (i = 0; i < hsize; i++) {
209 if (hash_count[i] < HASH_LIMIT)
210 continue;
211 entry = hash[i];
212 do {
213 struct index_entry *keep = entry;
214 int skip = hash_count[i] / HASH_LIMIT / 2;
215 do {
216 entry = entry->next;
217 } while(--skip && entry);
218 keep->next = entry;
219 } while(entry);
221 free(hash_count);
223 return index;
226 void free_delta_index(struct delta_index *index)
228 free(index);
232 * The maximum size for any opcode sequence, including the initial header
233 * plus Rabin window plus biggest copy.
235 #define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
237 void *
238 create_delta(const struct delta_index *index,
239 const void *trg_buf, unsigned long trg_size,
240 unsigned long *delta_size, unsigned long max_size)
242 unsigned int i, outpos, outsize, moff, msize, val;
243 int inscnt;
244 const unsigned char *ref_data, *ref_top, *data, *top;
245 unsigned char *out;
247 if (!trg_buf || !trg_size)
248 return NULL;
250 outpos = 0;
251 outsize = 8192;
252 if (max_size && outsize >= max_size)
253 outsize = max_size + MAX_OP_SIZE + 1;
254 out = malloc(outsize);
255 if (!out)
256 return NULL;
258 /* store reference buffer size */
259 i = index->src_size;
260 while (i >= 0x80) {
261 out[outpos++] = i | 0x80;
262 i >>= 7;
264 out[outpos++] = i;
266 /* store target buffer size */
267 i = trg_size;
268 while (i >= 0x80) {
269 out[outpos++] = i | 0x80;
270 i >>= 7;
272 out[outpos++] = i;
274 ref_data = index->src_buf;
275 ref_top = ref_data + index->src_size;
276 data = trg_buf;
277 top = (const unsigned char *) trg_buf + trg_size;
279 outpos++;
280 val = 0;
281 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
282 out[outpos++] = *data;
283 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
285 inscnt = i;
287 moff = 0;
288 msize = 0;
289 while (data < top) {
290 if (msize < 4096) {
291 struct index_entry *entry;
292 val ^= U[data[-RABIN_WINDOW]];
293 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
294 i = val & index->hash_mask;
295 for (entry = index->hash[i]; entry; entry = entry->next) {
296 const unsigned char *ref = entry->ptr;
297 const unsigned char *src = data;
298 unsigned int ref_size = ref_top - ref;
299 if (entry->val != val)
300 continue;
301 if (ref_size > top - src)
302 ref_size = top - src;
303 if (ref_size <= msize)
304 break;
305 while (ref_size-- && *src++ == *ref)
306 ref++;
307 if (msize < ref - entry->ptr) {
308 /* this is our best match so far */
309 msize = ref - entry->ptr;
310 moff = entry->ptr - ref_data;
311 if (msize >= 4096) /* good enough */
312 break;
317 if (msize < 4) {
318 if (!inscnt)
319 outpos++;
320 out[outpos++] = *data++;
321 inscnt++;
322 if (inscnt == 0x7f) {
323 out[outpos - inscnt - 1] = inscnt;
324 inscnt = 0;
326 msize = 0;
327 } else {
328 unsigned int left;
329 unsigned char *op;
331 if (inscnt) {
332 while (moff && ref_data[moff-1] == data[-1]) {
333 /* we can match one byte back */
334 msize++;
335 moff--;
336 data--;
337 outpos--;
338 if (--inscnt)
339 continue;
340 outpos--; /* remove count slot */
341 inscnt--; /* make it -1 */
342 break;
344 out[outpos - inscnt - 1] = inscnt;
345 inscnt = 0;
348 /* A copy op is currently limited to 64KB (pack v2) */
349 left = (msize < 0x10000) ? 0 : (msize - 0x10000);
350 msize -= left;
352 op = out + outpos++;
353 i = 0x80;
355 if (moff & 0x000000ff)
356 out[outpos++] = moff >> 0, i |= 0x01;
357 if (moff & 0x0000ff00)
358 out[outpos++] = moff >> 8, i |= 0x02;
359 if (moff & 0x00ff0000)
360 out[outpos++] = moff >> 16, i |= 0x04;
361 if (moff & 0xff000000)
362 out[outpos++] = moff >> 24, i |= 0x08;
364 if (msize & 0x00ff)
365 out[outpos++] = msize >> 0, i |= 0x10;
366 if (msize & 0xff00)
367 out[outpos++] = msize >> 8, i |= 0x20;
369 *op = i;
371 data += msize;
372 moff += msize;
373 msize = left;
375 if (msize < 4096) {
376 int j;
377 val = 0;
378 for (j = -RABIN_WINDOW; j < 0; j++)
379 val = ((val << 8) | data[j])
380 ^ T[val >> RABIN_SHIFT];
384 if (outpos >= outsize - MAX_OP_SIZE) {
385 void *tmp = out;
386 outsize = outsize * 3 / 2;
387 if (max_size && outsize >= max_size)
388 outsize = max_size + MAX_OP_SIZE + 1;
389 if (max_size && outpos > max_size)
390 break;
391 out = realloc(out, outsize);
392 if (!out) {
393 free(tmp);
394 return NULL;
399 if (inscnt)
400 out[outpos - inscnt - 1] = inscnt;
402 if (max_size && outpos > max_size) {
403 free(out);
404 return NULL;
407 *delta_size = outpos;
408 return out;