Merge branch 'jc/again'
[git.git] / diff-delta.c
blobc61887518874d550f66d2d52bd7559d023fcb176
1 /*
2 * diff-delta.c: generate a delta between two buffers
4 * Many parts of this file have been lifted from LibXDiff version 0.10.
5 * http://www.xmailserver.org/xdiff-lib.html
7 * LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
8 * Copyright (C) 2003 Davide Libenzi
10 * Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
12 * This file is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * Use of this within git automatically means that the LGPL
18 * licensing gets turned into GPLv2 within this project.
21 #include <stdlib.h>
22 #include <string.h>
23 #include "delta.h"
26 /* maximum hash entry list for the same hash bucket */
27 #define HASH_LIMIT 64
29 #define RABIN_SHIFT 23
30 #define RABIN_WINDOW 16
32 static const unsigned int T[256] = {
33 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
34 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
35 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
36 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
37 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
38 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
39 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
40 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
41 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
42 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
43 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
44 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
45 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
46 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
47 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
48 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
49 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
50 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
51 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
52 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
53 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
54 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
55 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
56 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
57 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
58 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
59 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
60 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
61 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
62 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
63 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
64 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
65 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
66 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
67 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
68 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
69 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
70 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
71 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
72 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
73 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
74 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
75 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
78 static const unsigned int U[256] = {
79 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
80 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
81 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
82 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
83 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
84 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
85 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
86 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
87 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
88 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
89 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
90 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
91 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
92 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
93 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
94 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
95 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
96 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
97 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
98 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
99 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
100 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
101 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
102 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
103 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
104 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
105 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
106 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
107 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
108 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
109 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
110 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
111 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
112 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
113 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
114 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
115 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
116 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
117 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
118 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
119 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
120 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
121 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
124 struct index_entry {
125 const unsigned char *ptr;
126 unsigned int val;
127 struct index_entry *next;
130 struct delta_index {
131 const void *src_buf;
132 unsigned long src_size;
133 unsigned int hash_mask;
134 struct index_entry *hash[0];
137 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
139 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
140 const unsigned char *data, *buffer = buf;
141 struct delta_index *index;
142 struct index_entry *entry, **hash;
143 void *mem;
144 unsigned long memsize;
146 if (!buf || !bufsize)
147 return NULL;
149 /* Determine index hash size. Note that indexing skips the
150 first byte to allow for optimizing the rabin polynomial
151 initialization in create_delta(). */
152 entries = (bufsize - 1) / RABIN_WINDOW;
153 hsize = entries / 4;
154 for (i = 4; (1 << i) < hsize && i < 31; i++);
155 hsize = 1 << i;
156 hmask = hsize - 1;
158 /* allocate lookup index */
159 memsize = sizeof(*index) +
160 sizeof(*hash) * hsize +
161 sizeof(*entry) * entries;
162 mem = malloc(memsize);
163 if (!mem)
164 return NULL;
165 index = mem;
166 mem = index + 1;
167 hash = mem;
168 mem = hash + hsize;
169 entry = mem;
171 index->src_buf = buf;
172 index->src_size = bufsize;
173 index->hash_mask = hmask;
174 memset(hash, 0, hsize * sizeof(*hash));
176 /* allocate an array to count hash entries */
177 hash_count = calloc(hsize, sizeof(*hash_count));
178 if (!hash_count) {
179 free(index);
180 return NULL;
183 /* then populate the index */
184 prev_val = ~0;
185 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
186 data >= buffer;
187 data -= RABIN_WINDOW) {
188 unsigned int val = 0;
189 for (i = 1; i <= RABIN_WINDOW; i++)
190 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
191 if (val == prev_val) {
192 /* keep the lowest of consecutive identical blocks */
193 entry[-1].ptr = data + RABIN_WINDOW;
194 } else {
195 prev_val = val;
196 i = val & hmask;
197 entry->ptr = data + RABIN_WINDOW;
198 entry->val = val;
199 entry->next = hash[i];
200 hash[i] = entry++;
201 hash_count[i]++;
202 entries--;
207 * Determine a limit on the number of entries in the same hash
208 * bucket. This guard us against patological data sets causing
209 * really bad hash distribution with most entries in the same hash
210 * bucket that would bring us to O(m*n) computing costs (m and n
211 * corresponding to reference and target buffer sizes).
213 * Make sure none of the hash buckets has more entries than
214 * we're willing to test. Otherwise we cull the entry list
215 * uniformly to still preserve a good repartition across
216 * the reference buffer.
218 for (i = 0; i < hsize; i++) {
219 if (hash_count[i] < HASH_LIMIT)
220 continue;
221 entry = hash[i];
222 do {
223 struct index_entry *keep = entry;
224 int skip = hash_count[i] / HASH_LIMIT / 2;
225 do {
226 entry = entry->next;
227 } while(--skip && entry);
228 keep->next = entry;
229 } while(entry);
231 free(hash_count);
233 /* If we didn't use all hash entries, free the unused memory. */
234 if (entries)
235 index = realloc(index, memsize - entries * sizeof(*entry));
237 return index;
240 void free_delta_index(struct delta_index *index)
242 free(index);
246 * The maximum size for any opcode sequence, including the initial header
247 * plus rabin window plus biggest copy.
249 #define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
251 void *
252 create_delta(const struct delta_index *index,
253 const void *trg_buf, unsigned long trg_size,
254 unsigned long *delta_size, unsigned long max_size)
256 unsigned int i, outpos, outsize, val;
257 int inscnt;
258 const unsigned char *ref_data, *ref_top, *data, *top;
259 unsigned char *out;
261 if (!trg_buf || !trg_size)
262 return NULL;
264 outpos = 0;
265 outsize = 8192;
266 if (max_size && outsize >= max_size)
267 outsize = max_size + MAX_OP_SIZE + 1;
268 out = malloc(outsize);
269 if (!out)
270 return NULL;
272 /* store reference buffer size */
273 i = index->src_size;
274 while (i >= 0x80) {
275 out[outpos++] = i | 0x80;
276 i >>= 7;
278 out[outpos++] = i;
280 /* store target buffer size */
281 i = trg_size;
282 while (i >= 0x80) {
283 out[outpos++] = i | 0x80;
284 i >>= 7;
286 out[outpos++] = i;
288 ref_data = index->src_buf;
289 ref_top = ref_data + index->src_size;
290 data = trg_buf;
291 top = trg_buf + trg_size;
293 outpos++;
294 val = 0;
295 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
296 out[outpos++] = *data;
297 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
299 inscnt = i;
301 while (data < top) {
302 unsigned int moff = 0, msize = 0;
303 struct index_entry *entry;
304 val ^= U[data[-RABIN_WINDOW]];
305 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
306 i = val & index->hash_mask;
307 for (entry = index->hash[i]; entry; entry = entry->next) {
308 const unsigned char *ref = entry->ptr;
309 const unsigned char *src = data;
310 unsigned int ref_size = ref_top - ref;
311 if (entry->val != val)
312 continue;
313 if (ref_size > top - src)
314 ref_size = top - src;
315 if (ref_size > 0x10000)
316 ref_size = 0x10000;
317 if (ref_size <= msize)
318 break;
319 while (ref_size-- && *src++ == *ref)
320 ref++;
321 if (msize < ref - entry->ptr) {
322 /* this is our best match so far */
323 msize = ref - entry->ptr;
324 moff = entry->ptr - ref_data;
328 if (msize < 4) {
329 if (!inscnt)
330 outpos++;
331 out[outpos++] = *data++;
332 inscnt++;
333 if (inscnt == 0x7f) {
334 out[outpos - inscnt - 1] = inscnt;
335 inscnt = 0;
337 } else {
338 unsigned char *op;
340 if (msize >= RABIN_WINDOW) {
341 const unsigned char *sk;
342 sk = data + msize - RABIN_WINDOW;
343 val = 0;
344 for (i = 0; i < RABIN_WINDOW; i++)
345 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
346 } else {
347 const unsigned char *sk = data + 1;
348 for (i = 1; i < msize; i++) {
349 val ^= U[sk[-RABIN_WINDOW]];
350 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
354 if (inscnt) {
355 while (moff && ref_data[moff-1] == data[-1]) {
356 if (msize == 0x10000)
357 break;
358 /* we can match one byte back */
359 msize++;
360 moff--;
361 data--;
362 outpos--;
363 if (--inscnt)
364 continue;
365 outpos--; /* remove count slot */
366 inscnt--; /* make it -1 */
367 break;
369 out[outpos - inscnt - 1] = inscnt;
370 inscnt = 0;
373 data += msize;
374 op = out + outpos++;
375 i = 0x80;
377 if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
378 moff >>= 8;
379 if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
380 moff >>= 8;
381 if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
382 moff >>= 8;
383 if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
385 if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
386 msize >>= 8;
387 if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
389 *op = i;
392 if (outpos >= outsize - MAX_OP_SIZE) {
393 void *tmp = out;
394 outsize = outsize * 3 / 2;
395 if (max_size && outsize >= max_size)
396 outsize = max_size + MAX_OP_SIZE + 1;
397 if (max_size && outpos > max_size)
398 break;
399 out = realloc(out, outsize);
400 if (!out) {
401 free(tmp);
402 return NULL;
407 if (inscnt)
408 out[outpos - inscnt - 1] = inscnt;
410 if (max_size && outpos > max_size) {
411 free(out);
412 return NULL;
415 *delta_size = outpos;
416 return out;