From fb6fc7f3ce6b0b70a5df7f605e71c4f8541e256b Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Sat, 4 Mar 2023 11:41:03 -0800
Subject: [PATCH] split: split more evenly with -n

* src/split.c (bytes_split): New arg REM_BYTES.  Use this to split
more evenly.  All callers changed.
(lines_chunk_split, bytes_chunk_extract): Be consistent with new
byte_split.
* tests/split/b-chunk.sh, tests/split/l-chunk.sh: Test new behavior.
---
 NEWS                   |  5 +++++
 doc/coreutils.texi     |  8 +++----
 src/split.c            | 58 +++++++++++++++++++++++---------------------
 tests/split/b-chunk.sh | 24 ++++++++++++++++++---
 tests/split/l-chunk.sh | 24 +++++++++------------
 5 files changed, 67 insertions(+), 52 deletions(-)

diff --git a/NEWS b/NEWS
index 2694cf305..f7a95e7fb 100644
--- a/NEWS
+++ b/NEWS
@@ -106,6 +106,11 @@ GNU coreutils NEWS                                    -*- outline -*-
   internal errors it would exit with status 1, which was less
   distinguishable from errors from the invoked command.
 
+  'split -n N' now splits more evenly when the input size is not a
+  multiple of N, by creating N output files whose sizes differ by at
+  most 1 byte.  Formerly, it did this only when the input size was
+  less than N.
+
   'stat -c %s' now prints sizes as unsigned, consistent with 'ls'.
 
 ** New Features
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 7ea910ba8..b07a330eb 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3393,8 +3393,8 @@ r/@var{n}     like @samp{l} but use round robin distribution
 r/@var{k}/@var{n}  likewise but output only @var{k}th of @var{n} to stdout
 @end example
 
-Any excess bytes remaining after dividing the @var{input}
-into @var{n} chunks, are assigned to the last chunk.
+If the input size is not a multiple of @var{n}, early output files are
+one byte longer than later output files, to make up the difference.
 Any excess bytes appearing after the initial calculation are discarded
 (except when using @samp{r} mode).
 
@@ -3402,8 +3402,8 @@ All @var{n} files are created even if there are fewer than @var{n} lines,
 or the @var{input} is truncated.
 
 For @samp{l} mode, chunks are approximately @var{input} size / @var{n}.
-The @var{input} is partitioned into @var{n} equal sized portions, with
-the last assigned any excess.  If a line @emph{starts} within a partition
+Although the @var{input} is still partitioned as before into @var{n} regions
+of approximately equal size, if a line @emph{starts} within a partition
 it is written completely to the corresponding file.  Since lines or records
 are not split even if they overlap a partition, the files written
 can be larger or smaller than the partition size, and even empty
diff --git a/src/split.c b/src/split.c
index 574250d27..c66bc69a2 100644
--- a/src/split.c
+++ b/src/split.c
@@ -619,21 +619,23 @@ cwrite (bool new_file_flag, char const *bp, size_t bytes)
 }
 
 /* Split into pieces of exactly N_BYTES bytes.
+   However, the first REM_BYTES pieces should be 1 byte longer.
    Use buffer BUF, whose size is BUFSIZE.
    BUF contains the first INITIAL_READ input bytes.  */
 
 static void
-bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
+bytes_split (uintmax_t n_bytes, uintmax_t rem_bytes,
+             char *buf, size_t bufsize, size_t initial_read,
              uintmax_t max_files)
 {
   size_t n_read;
   bool new_file_flag = true;
   bool filter_ok = true;
-  uintmax_t to_write = n_bytes;
   uintmax_t opened = 0;
-  bool eof;
+  uintmax_t to_write = n_bytes + (0 < rem_bytes);
+  bool eof = ! to_write;
 
-  do
+  while (! eof)
     {
       if (initial_read != SIZE_MAX)
         {
@@ -646,7 +648,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
       if (! filter_ok
           && lseek (STDIN_FILENO, to_write, SEEK_CUR) != -1)
         {
-          to_write = n_bytes;
+          to_write = n_bytes + (opened + 1 < rem_bytes);
           new_file_flag = true;
         }
 
@@ -656,7 +658,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
           eof = n_read == 0;
         }
       char *bp_out = buf;
-      while (to_write <= n_read)
+      while (0 < to_write && to_write <= n_read)
         {
           if (filter_ok || new_file_flag)
             filter_ok = cwrite (new_file_flag, bp_out, to_write);
@@ -671,7 +673,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
             }
           bp_out += to_write;
           n_read -= to_write;
-          to_write = n_bytes;
+          to_write = n_bytes + (opened < rem_bytes);
         }
       if (n_read != 0)
         {
@@ -687,7 +689,6 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
           to_write -= n_read;
         }
     }
-  while (! eof);
 
   /* Ensure NUMBER files are created, which truncates
      any existing files or notifies any consumers on fifos.
@@ -864,19 +865,20 @@ static void
 lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
                    size_t initial_read, off_t file_size)
 {
-  assert (n && k <= n && n <= file_size);
+  assert (n && k <= n);
 
-  const off_t chunk_size = file_size / n;
+  uintmax_t rem_bytes = file_size % n;
+  off_t chunk_size = file_size / n;
   uintmax_t chunk_no = 1;
-  off_t chunk_end = chunk_size;
+  off_t chunk_end = chunk_size + (0 < rem_bytes);
   off_t n_written = 0;
   bool new_file_flag = true;
   bool chunk_truncated = false;
 
-  if (k > 1)
+  if (k > 1 && 0 < file_size)
     {
       /* Start reading 1 byte before kth chunk of file.  */
-      off_t start = (k - 1) * chunk_size - 1;
+      off_t start = (k - 1) * chunk_size + MIN (k - 1, rem_bytes) - 1;
       if (start < initial_read)
         {
           memmove (buf, buf + start, initial_read - start);
@@ -890,7 +892,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
         }
       n_written = start;
       chunk_no = k - 1;
-      chunk_end = chunk_no * chunk_size;
+      chunk_end = start + 1;
     }
 
   while (n_written < file_size)
@@ -904,13 +906,13 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
         }
       else
         {
-          n_read = safe_read (STDIN_FILENO, buf, bufsize);
+          n_read = safe_read (STDIN_FILENO, buf,
+                              MIN (bufsize, file_size - n_written));
           if (n_read == SAFE_READ_ERROR)
             die (EXIT_FAILURE, errno, "%s", quotef (infile));
         }
       if (n_read == 0)
         break; /* eof.  */
-      n_read = MIN (n_read, file_size - n_written);
       chunk_truncated = false;
       eob = buf + n_read;
 
@@ -956,13 +958,10 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
                   chunk_truncated = true;
                   break;
                 }
-              chunk_no++;
-              if (k && chunk_no > k)
+              if (k == chunk_no)
                 return;
-              if (chunk_no == n)
-                chunk_end = file_size; /* >= chunk_size.  */
-              else
-                chunk_end += chunk_size;
+              chunk_end += chunk_size + (chunk_no < rem_bytes);
+              chunk_no++;
               if (chunk_end <= n_written)
                 {
                   if (! k)
@@ -994,10 +993,10 @@ bytes_chunk_extract (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
   off_t start;
   off_t end;
 
-  assert (k && n && k <= n && n <= file_size);
+  assert (0 < k && k <= n);
 
-  start = (k - 1) * (file_size / n);
-  end = (k == n) ? file_size : k * (file_size / n);
+  start = (k - 1) * (file_size / n) + MIN (k - 1, file_size % n);
+  end = k == n ? file_size : k * (file_size / n) + MIN (k, file_size % n);
 
   if (start < initial_read)
     {
@@ -1607,9 +1606,6 @@ main (int argc, char **argv)
                  _("invalid number of chunks"),
                  quote (umaxtostr (n_units, buffer)));
         }
-      /* increase file_size to n_units here, so that we still process
-         any input data, and create empty files for the rest.  */
-      file_size = MAX (file_size, n_units);
     }
 
   /* When filtering, closure of one pipe must not terminate the process,
@@ -1632,7 +1628,7 @@ main (int argc, char **argv)
       break;
 
     case type_bytes:
-      bytes_split (n_units, buf, in_blk_size, SIZE_MAX, 0);
+      bytes_split (n_units, 0, buf, in_blk_size, SIZE_MAX, 0);
       break;
 
     case type_byteslines:
@@ -1641,8 +1637,8 @@ main (int argc, char **argv)
 
     case type_chunk_bytes:
       if (k_units == 0)
-        bytes_split (file_size / n_units, buf, in_blk_size, initial_read,
-                     n_units);
+        bytes_split (file_size / n_units, file_size % n_units,
+                     buf, in_blk_size, initial_read, n_units);
       else
         bytes_chunk_extract (k_units, n_units, buf, in_blk_size, initial_read,
                              file_size);
diff --git a/tests/split/b-chunk.sh b/tests/split/b-chunk.sh
index b3195000f..1e9a6f6e9 100755
--- a/tests/split/b-chunk.sh
+++ b/tests/split/b-chunk.sh
@@ -25,6 +25,24 @@ split -n 10 /dev/null || fail=1
 test "$(stat -c %s x* | uniq -c | sed 's/^ *//; s/ /x/')" = "10x0" || fail=1
 rm -f x??
 
+printf 'abc' > abc || framework_failure_
+printf 'a' > exp-a || framework_failure_
+printf 'b' > exp-b || framework_failure_
+printf 'c' > exp-c || framework_failure_
+printf 'ab' > exp-ab || framework_failure_
+split -n 4 abc || fail=1
+compare exp-a xaa || fail=1
+compare exp-b xab || fail=1
+compare exp-c xac || fail=1
+compare /dev/null xad || fail=1
+test ! -f xae || fail=1
+rm -f x??
+split -n 2 abc || fail=1
+compare exp-ab xaa || fail=1
+compare exp-c xab || fail=1
+test ! -f xac || fail=1
+rm -f x??
+
 # When extracting K of N where N > file size
 # no data is extracted, and no files are written
 split -n 2/3 /dev/null || fail=1
@@ -35,9 +53,9 @@ split -e -n 10 /dev/null || fail=1
 returns_ 1 stat x?? 2>/dev/null || fail=1
 
 printf '1\n2\n3\n4\n5\n' > input || framework_failure_
-printf '1\n2' > exp-1 || framework_failure_
-printf '\n3\n' > exp-2 || framework_failure_
-printf '4\n5\n' > exp-3 || framework_failure_
+printf '1\n2\n' > exp-1 || framework_failure_
+printf '3\n4' > exp-2 || framework_failure_
+printf '\n5\n' > exp-3 || framework_failure_
 
 for file in input /proc/version /sys/kernel/profiling; do
   test -f $file || continue
diff --git a/tests/split/l-chunk.sh b/tests/split/l-chunk.sh
index 73cd729a4..cdb201746 100755
--- a/tests/split/l-chunk.sh
+++ b/tests/split/l-chunk.sh
@@ -59,11 +59,11 @@ sed "s/': .*/'/" < err.t > err || framework_failure_
 compare exp err || fail=1
 
 printf '%s' "\
-14 16 09 15 16 10
+14 16 16 08 16 10
 14 08 08 10 14 08 08 10
-06 08 08 02 06 08 08 02 06 08 08 10
-06 08 02 06 08 00 08 02 06 08 02 06 08 00 10
-06 00 08 00 02 06 00 02 06 00 08 00 01 07 00 02 06 00 08 00 02 16
+08 06 08 08 08 08 08 02 06 08 08 02
+06 08 08 02 06 08 02 06 08 02 06 08 00 08 02
+06 02 06 02 06 02 06 02 06 02 06 02 06 02 06 00 08 00 02 06 00 02
 " > exp || framework_failure_
 
 sed 's/00 *//g' exp > exp.elide_empty || framework_failure_
@@ -120,17 +120,13 @@
 test "$DEBUGGING" && test "$VERBOSE" && set -x
 
 # Check extraction of particular chunks
-> out
-printf '1\n12345\n' > exp
-split -n l/13/15 in > out
+split -n l/13/15 in > out &&
+compare /dev/null out || fail=1
+printf '1\n12345\n' > exp || framework_failure_
+split -n l/14/15 in > out &&
 compare exp out || fail=1
-> out
-printf '' > exp
-split -n l/14/15 in > out
-compare exp out || fail=1
-> out
-printf '1\n12345\n1\n' > exp
-split -n l/15/15 in > out
+printf '1\n' > exp || framework_failure_
+split -n l/15/15 in > out &&
 compare exp out || fail=1
 
 # test input with no \n at end
-- 
2.11.4.GIT
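
Editor's note: the chunk-boundary arithmetic this patch introduces is easy to check in isolation. The sketch below is not part of the patch; it is a hypothetical standalone program that computes each chunk's byte range with the same formula the patched bytes_chunk_extract uses, so the first file_size % n chunks come out one byte longer than the rest.

```c
#include <stdio.h>
#include <stdint.h>

/* Print the [start, end) byte range of each of N chunks of a SIZE-byte
   input: chunk K starts at (K-1)*(SIZE/N) + MIN (K-1, SIZE%N), so the
   first SIZE%N chunks are one byte longer than the others.  */
static void
print_chunks (uintmax_t size, uintmax_t n)
{
  uintmax_t rem = size % n;
  for (uintmax_t k = 1; k <= n; k++)
    {
      uintmax_t start = (k - 1) * (size / n) + (k - 1 < rem ? k - 1 : rem);
      uintmax_t end = k * (size / n) + (k < rem ? k : rem);
      printf ("chunk %ju: [%ju, %ju) length %ju\n", k, start, end, end - start);
    }
}

int
main (void)
{
  print_chunks (10, 3);   /* 4 + 3 + 3; the old behavior produced 3 + 3 + 4 */
  return 0;
}
```

For a 10-byte input and n = 3 this prints lengths 4, 3, 3, matching the sizes the updated b-chunk.sh test expects; the patch's k == n special case is equivalent, since the last chunk always ends at file_size.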