From 22e313f8c853c012f351f53e81c376d8724839a2 Mon Sep 17 00:00:00 2001 From: Sam Hocevar Date: Tue, 17 Mar 2009 08:49:55 +0100 Subject: [PATCH] fast-import: exclude big files from delta search. If a file is larger than core.bigFileThreshold, we deflate it on-the-fly instead of keeping an entire copy in memory. fast-import is now twice as fast and memory usage decreases more than threefold when importing big files. parse_object_data() gains a candeflate flag that enables this path (parse_data() passes 0 and keeps the old behaviour) and now returns the original, undeflated size of the data when it deflated it itself, or 0 otherwise; callers hand that size on to store_object(). The size is kept in a size_t so that files larger than INT_MAX bytes are not truncated. --- fast-import.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/fast-import.c b/fast-import.c index 41bc054ba7..6a581d0e01 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1724,11 +1724,15 @@ static void parse_mark(void) /* This actually parses a "data" command, with the addition that if sha1out * is not NULL, it will also compute the sha1 on the fly. */ -static void parse_object_data( +static size_t parse_object_data( enum object_type type, struct strbuf *sb, - unsigned char *sha1out) + unsigned char *sha1out, + int candeflate) { + size_t orig_bytes = 0; + size_t n = 0, length; + strbuf_reset(sb); if (prefixcmp(command_buf.buf, "data ")) @@ -1753,12 +1757,60 @@ static void parse_object_data( if (sha1out) sha1_object(type, sb, sha1out); } - else { - size_t n = 0, length; + else if ((length = strtoul(command_buf.buf + 5, NULL, 10)) + > big_file_threshold + && candeflate) { + /* The incoming file is really big. As it is pretty unlikely + * it will give any interesting deltas, we immediately deflate + * it instead of storing the original data in memory. 
*/ + static struct strbuf tmp = STRBUF_INIT; + git_SHA_CTX c; + z_stream zs; + + if (sha1out) { + unsigned char hdr[96]; + unsigned long hdrlen; + hdrlen = sprintf((char*)hdr,"%s %lu", typename(type), + (unsigned long)length) + 1; + git_SHA1_Init(&c); + git_SHA1_Update(&c, hdr, hdrlen); + } - length = strtoul(command_buf.buf + 5, NULL, 10); + memset(&zs, 0, sizeof(zs)); + deflateInit(&zs, pack_compression_level); + /* TODO: ideally, this should grow dynamically while we + * deflate the file. */ + zs.avail_out = deflateBound(&zs, length); + strbuf_grow(sb, zs.avail_out); + zs.next_out = (unsigned char *)sb->buf; while (n < length) { + size_t s = strbuf_fread(&tmp, length - n < 4096 ? + length - n : 4096, stdin); + if (!s && feof(stdin)) + die("EOF in data (%lu bytes remaining)", + (unsigned long)(length - n)); + if (sha1out) + git_SHA1_Update(&c, tmp.buf, s); + zs.next_in = (unsigned char *)tmp.buf; + zs.avail_in = s; + while (deflate(&zs, Z_NO_FLUSH) == Z_OK) + /* nothing */; + strbuf_reset(&tmp); + + n += s; + } + deflate(&zs, Z_FINISH); + deflateEnd(&zs); + strbuf_setlen(sb, zs.total_out); + + if (sha1out) + git_SHA1_Final(sha1out, &c); + + orig_bytes = length; + } + else { + while (n < length) { size_t s = strbuf_fread(sb, length - n, stdin); if (!s && feof(stdin)) die("EOF in data (%lu bytes remaining)", @@ -1771,11 +1823,12 @@ static void parse_object_data( } skip_optional_lf(); + return orig_bytes; } static void parse_data(struct strbuf *sb) { - parse_object_data(OBJ_NONE, sb, NULL); + parse_object_data(OBJ_NONE, sb, NULL, 0); } static int validate_raw_date(const char *src, char *result, int maxlen) @@ -1841,11 +1894,12 @@ static void parse_new_blob(void) { static struct strbuf buf = STRBUF_INIT; unsigned char sha1[20]; + size_t orig_bytes; read_next_command(); parse_mark(); - parse_object_data(OBJ_BLOB, &buf, sha1); - store_object(OBJ_BLOB, &buf, &last_blob, sha1, next_mark, 0); + orig_bytes = parse_object_data(OBJ_BLOB, &buf, sha1, 1); + 
store_object(OBJ_BLOB, &buf, &last_blob, sha1, next_mark, orig_bytes); } static void unload_one_branch(void) @@ -1957,14 +2011,15 @@ static void file_change_m(struct branch *b) */ } else if (inline_data) { static struct strbuf buf = STRBUF_INIT; + size_t orig_bytes; if (p != uq.buf) { strbuf_addstr(&uq, p); p = uq.buf; } read_next_command(); - parse_object_data(OBJ_BLOB, &buf, sha1); - store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0, 0); + orig_bytes = parse_object_data(OBJ_BLOB, &buf, sha1, 1); + store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0, orig_bytes); } else if (oe) { if (oe->type != OBJ_BLOB) die("Not a blob (actually a %s): %s", -- 2.11.4.GIT