From 4dd1fbc7b1df0030f813a05cee19cad2c7a9cbf9 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 8 May 2011 01:47:35 -0700 Subject: [PATCH] Bigfile: teach "git add" to send a large file straight to a pack When adding a new content to the repository, we have always slurped the blob in its entirety in-core first, and computed the object name and compressed it into a loose object file. Handling large binary files (e.g. video and audio asset for games) has been problematic because of this design. At the middle level of "git add" callchain is an internal API index_fd() that takes an open file descriptor to read from the working tree file being added with its size. Teach it to call out to fast-import when adding a large blob. The write-out codepath in entry.c::write_entry() should be taught to stream, instead of reading everything in core. This should not be so hard to implement, especially if we limit ourselves only to loose object files and non-delta representation in packfiles. Signed-off-by: Junio C Hamano --- sha1_file.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- t/t1050-large.sh | 27 ++++++++++++++++++ 2 files changed, 110 insertions(+), 1 deletion(-) create mode 100755 t/t1050-large.sh diff --git a/sha1_file.c b/sha1_file.c index 49416b0291..f0ca6a1749 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -11,6 +11,7 @@ #include "pack.h" #include "blob.h" #include "commit.h" +#include "run-command.h" #include "tag.h" #include "tree.h" #include "tree-walk.h" @@ -2658,6 +2659,85 @@ static int index_core(unsigned char *sha1, int fd, size_t size, return ret; } +/* + * This creates one packfile per large blob, because the caller + * immediately wants the result sha1, and fast-import can report the + * object name via marks mechanism only by closing the created + * packfile. + * + * This also bypasses the usual "convert-to-git" dance, and that is on + * purpose. We could write a streaming version of the converting + * functions and insert that before feeding the data to fast-import + * (or equivalent in-core API described above), but the primary + * motivation for trying to stream from the working tree file and to + * avoid mmaping it in core is to deal with large binary blobs, and + * by definition they do _not_ want to get any conversion. + */ +static int index_stream(unsigned char *sha1, int fd, size_t size, + enum object_type type, const char *path, + unsigned flags) +{ + struct child_process fast_import; + char export_marks[512]; + const char *argv[] = { "fast-import", "--quiet", export_marks, NULL }; + char tmpfile[512]; + char fast_import_cmd[512]; + char buf[512]; + int len, tmpfd; + + strcpy(tmpfile, git_path("hashstream_XXXXXX")); + tmpfd = git_mkstemp_mode(tmpfile, 0600); + if (tmpfd < 0) + die_errno("cannot create tempfile: %s", tmpfile); + if (close(tmpfd)) + die_errno("cannot close tempfile: %s", tmpfile); + sprintf(export_marks, "--export-marks=%s", tmpfile); + + memset(&fast_import, 0, sizeof(fast_import)); + fast_import.in = -1; + fast_import.argv = argv; + fast_import.git_cmd = 1; + if (start_command(&fast_import)) + die_errno("index-stream: git fast-import failed"); + + len = sprintf(fast_import_cmd, "blob\nmark :1\ndata %lu\n", + (unsigned long) size); + write_or_whine(fast_import.in, fast_import_cmd, len, + "index-stream: feeding fast-import"); + while (size) { + char buf[10240]; + size_t sz = size < sizeof(buf) ? size : sizeof(buf); + size_t actual; + + actual = read_in_full(fd, buf, sz); + if (actual < 0) + die_errno("index-stream: reading input"); + if (write_in_full(fast_import.in, buf, actual) != actual) + die_errno("index-stream: feeding fast-import"); + size -= actual; + } + if (close(fast_import.in)) + die_errno("index-stream: closing fast-import"); + if (finish_command(&fast_import)) + die_errno("index-stream: finishing fast-import"); + + tmpfd = open(tmpfile, O_RDONLY); + if (tmpfd < 0) + die_errno("index-stream: cannot open fast-import mark"); + len = read(tmpfd, buf, sizeof(buf)); + if (len < 0) + die_errno("index-stream: reading fast-import mark"); + if (close(tmpfd) < 0) + die_errno("index-stream: closing fast-import mark"); + if (unlink(tmpfile)) + die_errno("index-stream: unlinking fast-import mark"); + if (len != 44 || + memcmp(":1 ", buf, 3) || + get_sha1_hex(buf + 3, sha1)) + die_errno("index-stream: unexpected fast-import mark: <%s>", buf); + return 0; +} + int index_fd(unsigned char *sha1, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags) { @@ -2666,8 +2746,10 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, if (!S_ISREG(st->st_mode)) ret = index_pipe(sha1, fd, type, path, flags); - else + else if (size <= big_file_threshold || type != OBJ_BLOB) ret = index_core(sha1, fd, size, type, path, flags); + else + ret = index_stream(sha1, fd, size, type, path, flags); close(fd); return ret; } diff --git a/t/t1050-large.sh b/t/t1050-large.sh new file mode 100755 index 0000000000..deba111bd7 --- /dev/null +++ b/t/t1050-large.sh @@ -0,0 +1,27 @@ +#!/bin/sh +# Copyright (c) 2011, Google Inc. + +test_description='adding and checking out large blobs' + +. ./test-lib.sh + +test_expect_success setup ' + git config core.bigfilethreshold 200k && + echo X | dd of=large bs=1k seek=2000 +' + +test_expect_success 'add a large file' ' + git add large && + # make sure we got a packfile and no loose objects + test -f .git/objects/pack/pack-*.pack && + test ! -f .git/objects/??/?????????????????????????????????????? +' + +test_expect_success 'checkout a large file' ' + large=$(git rev-parse :large) && + git update-index --add --cacheinfo 100644 $large another && + git checkout another && + cmp large another ;# this must not be test_cmp +' + +test_done -- 2.11.4.GIT