builtin-grep: make pieces of it available as library.
[tgit.git] / diff-delta.c
blobfa16d06c8d1e85a458428c673cb2f589857f5424
1 /*
2 * diff-delta.c: generate a delta between two buffers
4 * Many parts of this file have been lifted from LibXDiff version 0.10.
5 * http://www.xmailserver.org/xdiff-lib.html
7 * LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
8 * Copyright (C) 2003 Davide Libenzi
10 * Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
12 * This file is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * Use of this within git automatically means that the LGPL
18 * licensing gets turned into GPLv2 within this project.
21 #include <stdlib.h>
22 #include <string.h>
23 #include "delta.h"
25 #include "git-compat-util.h"
27 /* maximum hash entry list for the same hash bucket */
28 #define HASH_LIMIT 64
30 #define RABIN_SHIFT 23
31 #define RABIN_WINDOW 16
33 static const unsigned int T[256] = {
34 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
35 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
36 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
37 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
38 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
39 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
40 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
41 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
42 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
43 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
44 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
45 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
46 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
47 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
48 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
49 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
50 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
51 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
52 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
53 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
54 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
55 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
56 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
57 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
58 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
59 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
60 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
61 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
62 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
63 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
64 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
65 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
66 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
67 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
68 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
69 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
70 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
71 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
72 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
73 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
74 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
75 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
76 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
79 static const unsigned int U[256] = {
80 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
81 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
82 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
83 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
84 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
85 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
86 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
87 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
88 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
89 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
90 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
91 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
92 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
93 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
94 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
95 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
96 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
97 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
98 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
99 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
100 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
101 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
102 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
103 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
104 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
105 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
106 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
107 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
108 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
109 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
110 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
111 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
112 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
113 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
114 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
115 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
116 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
117 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
118 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
119 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
120 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
121 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
122 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
125 struct index_entry {
126 const unsigned char *ptr;
127 unsigned int val;
128 struct index_entry *next;
131 struct delta_index {
132 const void *src_buf;
133 unsigned long src_size;
134 unsigned int hash_mask;
135 struct index_entry *hash[FLEX_ARRAY];
138 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
140 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
141 const unsigned char *data, *buffer = buf;
142 struct delta_index *index;
143 struct index_entry *entry, **hash;
144 void *mem;
145 unsigned long memsize;
147 if (!buf || !bufsize)
148 return NULL;
150 /* Determine index hash size. Note that indexing skips the
151 first byte to allow for optimizing the Rabin's polynomial
152 initialization in create_delta(). */
153 entries = (bufsize - 1) / RABIN_WINDOW;
154 hsize = entries / 4;
155 for (i = 4; (1u << i) < hsize && i < 31; i++);
156 hsize = 1 << i;
157 hmask = hsize - 1;
159 /* allocate lookup index */
160 memsize = sizeof(*index) +
161 sizeof(*hash) * hsize +
162 sizeof(*entry) * entries;
163 mem = malloc(memsize);
164 if (!mem)
165 return NULL;
166 index = mem;
167 mem = index + 1;
168 hash = mem;
169 mem = hash + hsize;
170 entry = mem;
172 index->src_buf = buf;
173 index->src_size = bufsize;
174 index->hash_mask = hmask;
175 memset(hash, 0, hsize * sizeof(*hash));
177 /* allocate an array to count hash entries */
178 hash_count = calloc(hsize, sizeof(*hash_count));
179 if (!hash_count) {
180 free(index);
181 return NULL;
184 /* then populate the index */
185 prev_val = ~0;
186 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
187 data >= buffer;
188 data -= RABIN_WINDOW) {
189 unsigned int val = 0;
190 for (i = 1; i <= RABIN_WINDOW; i++)
191 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
192 if (val == prev_val) {
193 /* keep the lowest of consecutive identical blocks */
194 entry[-1].ptr = data + RABIN_WINDOW;
195 } else {
196 prev_val = val;
197 i = val & hmask;
198 entry->ptr = data + RABIN_WINDOW;
199 entry->val = val;
200 entry->next = hash[i];
201 hash[i] = entry++;
202 hash_count[i]++;
207 * Determine a limit on the number of entries in the same hash
208 * bucket. This guards us against pathological data sets causing
209 * really bad hash distribution with most entries in the same hash
210 * bucket that would bring us to O(m*n) computing costs (m and n
211 * corresponding to reference and target buffer sizes).
213 * Make sure none of the hash buckets has more entries than
214 * we're willing to test. Otherwise we cull the entry list
215 * uniformly to still preserve a good repartition across
216 * the reference buffer.
218 for (i = 0; i < hsize; i++) {
219 if (hash_count[i] < HASH_LIMIT)
220 continue;
221 entry = hash[i];
222 do {
223 struct index_entry *keep = entry;
224 int skip = hash_count[i] / HASH_LIMIT / 2;
225 do {
226 entry = entry->next;
227 } while(--skip && entry);
228 keep->next = entry;
229 } while(entry);
231 free(hash_count);
233 return index;
236 void free_delta_index(struct delta_index *index)
238 free(index);
242 * The maximum size for any opcode sequence, including the initial header
243 * plus Rabin window plus biggest copy.
245 #define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
247 void *
248 create_delta(const struct delta_index *index,
249 const void *trg_buf, unsigned long trg_size,
250 unsigned long *delta_size, unsigned long max_size)
252 unsigned int i, outpos, outsize, val;
253 int inscnt;
254 const unsigned char *ref_data, *ref_top, *data, *top;
255 unsigned char *out;
257 if (!trg_buf || !trg_size)
258 return NULL;
260 outpos = 0;
261 outsize = 8192;
262 if (max_size && outsize >= max_size)
263 outsize = max_size + MAX_OP_SIZE + 1;
264 out = malloc(outsize);
265 if (!out)
266 return NULL;
268 /* store reference buffer size */
269 i = index->src_size;
270 while (i >= 0x80) {
271 out[outpos++] = i | 0x80;
272 i >>= 7;
274 out[outpos++] = i;
276 /* store target buffer size */
277 i = trg_size;
278 while (i >= 0x80) {
279 out[outpos++] = i | 0x80;
280 i >>= 7;
282 out[outpos++] = i;
284 ref_data = index->src_buf;
285 ref_top = ref_data + index->src_size;
286 data = trg_buf;
287 top = (const unsigned char *) trg_buf + trg_size;
289 outpos++;
290 val = 0;
291 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
292 out[outpos++] = *data;
293 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
295 inscnt = i;
297 while (data < top) {
298 unsigned int moff = 0, msize = 0;
299 struct index_entry *entry;
300 val ^= U[data[-RABIN_WINDOW]];
301 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
302 i = val & index->hash_mask;
303 for (entry = index->hash[i]; entry; entry = entry->next) {
304 const unsigned char *ref = entry->ptr;
305 const unsigned char *src = data;
306 unsigned int ref_size = ref_top - ref;
307 if (entry->val != val)
308 continue;
309 if (ref_size > top - src)
310 ref_size = top - src;
311 if (ref_size > 0x10000)
312 ref_size = 0x10000;
313 if (ref_size <= msize)
314 break;
315 while (ref_size-- && *src++ == *ref)
316 ref++;
317 if (msize < ref - entry->ptr) {
318 /* this is our best match so far */
319 msize = ref - entry->ptr;
320 moff = entry->ptr - ref_data;
324 if (msize < 4) {
325 if (!inscnt)
326 outpos++;
327 out[outpos++] = *data++;
328 inscnt++;
329 if (inscnt == 0x7f) {
330 out[outpos - inscnt - 1] = inscnt;
331 inscnt = 0;
333 } else {
334 unsigned char *op;
336 if (msize >= RABIN_WINDOW) {
337 const unsigned char *sk;
338 sk = data + msize - RABIN_WINDOW;
339 val = 0;
340 for (i = 0; i < RABIN_WINDOW; i++)
341 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
342 } else {
343 const unsigned char *sk = data + 1;
344 for (i = 1; i < msize; i++) {
345 val ^= U[sk[-RABIN_WINDOW]];
346 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
350 if (inscnt) {
351 while (moff && ref_data[moff-1] == data[-1]) {
352 if (msize == 0x10000)
353 break;
354 /* we can match one byte back */
355 msize++;
356 moff--;
357 data--;
358 outpos--;
359 if (--inscnt)
360 continue;
361 outpos--; /* remove count slot */
362 inscnt--; /* make it -1 */
363 break;
365 out[outpos - inscnt - 1] = inscnt;
366 inscnt = 0;
369 data += msize;
370 op = out + outpos++;
371 i = 0x80;
373 if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
374 moff >>= 8;
375 if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
376 moff >>= 8;
377 if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
378 moff >>= 8;
379 if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
381 if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
382 msize >>= 8;
383 if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
385 *op = i;
388 if (outpos >= outsize - MAX_OP_SIZE) {
389 void *tmp = out;
390 outsize = outsize * 3 / 2;
391 if (max_size && outsize >= max_size)
392 outsize = max_size + MAX_OP_SIZE + 1;
393 if (max_size && outpos > max_size)
394 break;
395 out = xrealloc(out, outsize);
396 if (!out) {
397 free(tmp);
398 return NULL;
403 if (inscnt)
404 out[outpos - inscnt - 1] = inscnt;
406 if (max_size && outpos > max_size) {
407 free(out);
408 return NULL;
411 *delta_size = outpos;
412 return out;