1 /* split.c -- split a file into pieces.
2 Copyright (C) 1988-2022 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* By tege@sics.se, with rms.
20 * support -p REGEX as in BSD's split.
21 * support --suppress-matched as in csplit. */
28 #include <sys/types.h>
32 #include "alignalloc.h"
35 #include "fd-reopen.h"
37 #include "full-write.h"
38 #include "ioblksize.h"
40 #include "safe-read.h"
42 #include "xbinary-io.h"
43 #include "xdectoint.h"
46 /* The official name of this program (e.g., no 'g' prefix). */
47 #define PROGRAM_NAME "split"
50 proper_name ("Torbjorn Granlund"), \
51 proper_name ("Richard M. Stallman")
53 /* Shell command to filter through, instead of creating files. */
54 static char const *filter_command
;
56 /* Process ID of the filter. */
57 static int filter_pid
;
59 /* Array of open pipes. */
60 static int *open_pipes
;
61 static size_t open_pipes_alloc
;
62 static size_t n_open_pipes
;
64 /* Blocked signals. */
65 static sigset_t oldblocked
;
66 static sigset_t newblocked
;
68 /* Base name of output files. */
69 static char const *outbase
;
71 /* Name of output files. */
74 /* Pointer to the end of the prefix in OUTFILE.
75 Suffixes are inserted here. */
76 static char *outfile_mid
;
78 /* Generate new suffix when suffixes are exhausted. */
79 static bool suffix_auto
= true;
81 /* Length of OUTFILE's suffix. */
82 static size_t suffix_length
;
84 /* Alphabet of characters to use in suffix. */
85 static char const *suffix_alphabet
= "abcdefghijklmnopqrstuvwxyz";
87 /* Numerical suffix start value. */
88 static char const *numeric_suffix_start
;
90 /* Additional suffix to append to output file names. */
91 static char const *additional_suffix
;
93 /* Name of input file. May be "-". */
96 /* stat buf for input file. */
97 static struct stat in_stat_buf
;
99 /* Descriptor on which output file is open. */
100 static int output_desc
= -1;
102 /* If true, print a diagnostic on standard error just before each
103 output file is opened. */
106 /* If true, don't generate zero length output files. */
107 static bool elide_empty_files
;
109 /* If true, in round robin mode, immediately copy
110 input to output, which is much slower, so disabled by default. */
111 static bool unbuffered
;
113 /* The character marking end of line. Defaults to \n below. */
114 static int eolchar
= -1;
116 /* The split mode to use. */
119 type_undef
, type_bytes
, type_byteslines
, type_lines
, type_digits
,
120 type_chunk_bytes
, type_chunk_lines
, type_rr
123 /* For long options that have no equivalent short option, use a
124 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
127 VERBOSE_OPTION
= CHAR_MAX
+ 1,
130 ADDITIONAL_SUFFIX_OPTION
133 static struct option
const longopts
[] =
135 {"bytes", required_argument
, NULL
, 'b'},
136 {"lines", required_argument
, NULL
, 'l'},
137 {"line-bytes", required_argument
, NULL
, 'C'},
138 {"number", required_argument
, NULL
, 'n'},
139 {"elide-empty-files", no_argument
, NULL
, 'e'},
140 {"unbuffered", no_argument
, NULL
, 'u'},
141 {"suffix-length", required_argument
, NULL
, 'a'},
142 {"additional-suffix", required_argument
, NULL
,
143 ADDITIONAL_SUFFIX_OPTION
},
144 {"numeric-suffixes", optional_argument
, NULL
, 'd'},
145 {"hex-suffixes", optional_argument
, NULL
, 'x'},
146 {"filter", required_argument
, NULL
, FILTER_OPTION
},
147 {"verbose", no_argument
, NULL
, VERBOSE_OPTION
},
148 {"separator", required_argument
, NULL
, 't'},
149 {"-io-blksize", required_argument
, NULL
,
150 IO_BLKSIZE_OPTION
}, /* do not document */
151 {GETOPT_HELP_OPTION_DECL
},
152 {GETOPT_VERSION_OPTION_DECL
},
156 /* Return true if the errno value, ERR, is ignorable.
   The visible test accepts only EPIPE, and only while filter_command
   is set, i.e. while output is being sent through a filter command. */
160 return filter_command
&& err
== EPIPE
;
/* Work out suffix_length for N_UNITS output units split as SPLIT_TYPE.
   For the chunk modes (type_chunk_bytes, type_chunk_lines, type_rr) the
   needed length is computed from the last unit index and the size of
   suffix_alphabet.  A nonzero (user set) suffix_length that is smaller
   than the needed length is a fatal error.
   NOTE(review): several lines of this function are not visible here, so
   the exact branch structure around the final assignment should be
   confirmed against the full source. */
164 set_suffix_length (uintmax_t n_units
, enum Split_type split_type
)
166 #define DEFAULT_SUFFIX_LENGTH 2
168 uintmax_t suffix_length_needed
= 0;
170 /* The suffix auto length feature is incompatible with
171 a user specified start value as the generated suffixes
172 are not all consecutive. */
173 if (numeric_suffix_start
)
176 /* Auto-calculate the suffix length if the number of files is given. */
177 if (split_type
== type_chunk_bytes
|| split_type
== type_chunk_lines
178 || split_type
== type_rr
)
/* Index of the last unit, counting from zero. */
180 uintmax_t n_units_end
= n_units
- 1;
181 if (numeric_suffix_start
)
184 strtol_error e
= xstrtoumax (numeric_suffix_start
, NULL
, 10,
/* The second operand guards n_start + n_units against wrapping
   past UINTMAX_MAX. */
186 if (e
== LONGINT_OK
&& n_start
<= UINTMAX_MAX
- n_units
)
188 /* Restrict auto adjustment so we don't keep
189 incrementing a suffix size arbitrarily,
190 as that would break sort order for files
191 generated from multiple split runs. */
192 if (n_start
< n_units
)
193 n_units_end
+= n_start
;
/* Count one suffix character per division of n_units_end by the
   alphabet size, stopping when the quotient reaches zero. */
197 size_t alphabet_len
= strlen (suffix_alphabet
);
199 suffix_length_needed
++;
200 while (n_units_end
/= alphabet_len
);
205 if (suffix_length
) /* set by user */
207 if (suffix_length
< suffix_length_needed
)
209 die (EXIT_FAILURE
, 0,
210 _("the suffix length needs to be at least %"PRIuMAX
),
211 suffix_length_needed
);
/* Never go below DEFAULT_SUFFIX_LENGTH. */
217 suffix_length
= MAX (DEFAULT_SUFFIX_LENGTH
, suffix_length_needed
);
223 if (status
!= EXIT_SUCCESS
)
228 Usage: %s [OPTION]... [FILE [PREFIX]]\n\
232 Output pieces of FILE to PREFIXaa, PREFIXab, ...;\n\
233 default size is 1000 lines, and default PREFIX is 'x'.\n\
237 emit_mandatory_arg_note ();
239 fprintf (stdout
, _("\
240 -a, --suffix-length=N generate suffixes of length N (default %d)\n\
241 --additional-suffix=SUFFIX append an additional SUFFIX to file names\n\
242 -b, --bytes=SIZE put SIZE bytes per output file\n\
243 -C, --line-bytes=SIZE put at most SIZE bytes of records per output file\n\
244 -d use numeric suffixes starting at 0, not alphabetic\n\
245 --numeric-suffixes[=FROM] same as -d, but allow setting the start value\
247 -x use hex suffixes starting at 0, not alphabetic\n\
248 --hex-suffixes[=FROM] same as -x, but allow setting the start value\n\
249 -e, --elide-empty-files do not generate empty output files with '-n'\n\
250 --filter=COMMAND write to shell COMMAND; file name is $FILE\n\
251 -l, --lines=NUMBER put NUMBER lines/records per output file\n\
252 -n, --number=CHUNKS generate CHUNKS output files; see explanation below\n\
253 -t, --separator=SEP use SEP instead of newline as the record separator;\n\
254 '\\0' (zero) specifies the NUL character\n\
255 -u, --unbuffered immediately copy input to output with '-n r/...'\n\
256 "), DEFAULT_SUFFIX_LENGTH
);
258 --verbose print a diagnostic just before each\n\
259 output file is opened\n\
261 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
262 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
266 N split into N files based on size of input\n\
267 K/N output Kth of N to stdout\n\
268 l/N split into N files without splitting lines/records\n\
269 l/K/N output Kth of N to stdout without splitting lines/records\n\
270 r/N like 'l' but use round robin distribution\n\
271 r/K/N likewise but only output Kth of N to stdout\n\
273 emit_ancillary_info (PROGRAM_NAME
);
278 /* Return the number of bytes that can be read from FD with status ST.
279 Store up to the first BUFSIZE bytes of the file's data into BUF,
280 and advance the file position by the number of bytes read. On
281 input error, set errno and return -1. */
284 input_file_size (int fd
, struct stat
const *st
, char *buf
, size_t bufsize
)
286 off_t cur
= lseek (fd
, 0, SEEK_CUR
);
290 errno
= 0; /* Suppress confusing seek error. */
297 size_t n_read
= safe_read (fd
, buf
+ size
, bufsize
- size
);
300 if (n_read
== SAFE_READ_ERROR
)
304 while (size
< bufsize
);
306 /* Note we check st_size _after_ the read() above
307 because /proc files on GNU/Linux are seekable
308 but have st_size == 0. */
309 if (st
->st_size
== 0)
311 /* We've filled the buffer, from a seekable file,
312 which has an st_size==0, E.g., /dev/zero on GNU/Linux.
313 Assume there is no limit to file size. */
320 if (usable_st_size (st
) && cur
<= st
->st_size
)
324 end
= lseek (fd
, 0, SEEK_END
);
329 if (lseek (fd
, cur
, SEEK_SET
) < 0)
337 if (size
== OFF_T_MAX
)
339 /* E.g., /dev/zero on GNU/Hurd. */
347 /* Compute the next sequential output file name and store it into the
351 next_file_name (void)
353 /* Index in suffix_alphabet of each character in the suffix. */
354 static size_t *sufindex
;
355 static size_t outbase_length
;
356 static size_t outfile_length
;
357 static size_t addsuf_length
;
364 widen
= !! outfile_length
;
368 /* Allocate and initialize the first file name. */
370 outbase_length
= strlen (outbase
);
371 addsuf_length
= additional_suffix
? strlen (additional_suffix
) : 0;
372 outfile_length
= outbase_length
+ suffix_length
+ addsuf_length
;
376 /* Reallocate and initialize a new wider file name.
377 We do this by subsuming the unchanging part of
378 the generated suffix into the prefix (base), and
379 reinitializing the now one longer suffix. */
385 if (outfile_length
+ 1 < outbase_length
)
387 outfile
= xrealloc (outfile
, outfile_length
+ 1);
390 memcpy (outfile
, outbase
, outbase_length
);
393 /* Append the last alphabet character to the file name prefix. */
394 outfile
[outbase_length
] = suffix_alphabet
[sufindex
[0]];
398 outfile_mid
= outfile
+ outbase_length
;
399 memset (outfile_mid
, suffix_alphabet
[0], suffix_length
);
400 if (additional_suffix
)
401 memcpy (outfile_mid
+ suffix_length
, additional_suffix
, addsuf_length
);
402 outfile
[outfile_length
] = 0;
405 sufindex
= xcalloc (suffix_length
, sizeof *sufindex
);
407 if (numeric_suffix_start
)
411 /* Update the output file name. */
412 size_t i
= strlen (numeric_suffix_start
);
413 memcpy (outfile_mid
+ suffix_length
- i
, numeric_suffix_start
, i
);
415 /* Update the suffix index. */
416 size_t *sufindex_end
= sufindex
+ suffix_length
;
418 *--sufindex_end
= numeric_suffix_start
[i
] - '0';
421 #if ! _POSIX_NO_TRUNC && HAVE_PATHCONF && defined _PC_NAME_MAX
422 /* POSIX requires that if the output file name is too long for
423 its directory, 'split' must fail without creating any files.
424 This must be checked for explicitly on operating systems that
425 silently truncate file names. */
427 char *dir
= dir_name (outfile
);
428 long name_max
= pathconf (dir
, _PC_NAME_MAX
);
429 if (0 <= name_max
&& name_max
< base_len (last_component (outfile
)))
430 die (EXIT_FAILURE
, ENAMETOOLONG
, "%s", quotef (outfile
));
437 /* Increment the suffix in place, if possible. */
439 size_t i
= suffix_length
;
443 if (suffix_auto
&& i
== 0 && ! suffix_alphabet
[sufindex
[0] + 1])
445 outfile_mid
[i
] = suffix_alphabet
[sufindex
[i
]];
449 outfile_mid
[i
] = suffix_alphabet
[sufindex
[i
]];
451 die (EXIT_FAILURE
, 0, _("output file suffixes exhausted"));
455 /* Create or truncate a file. */
458 create (char const *name
)
463 fprintf (stdout
, _("creating file %s\n"), quoteaf (name
));
465 int fd
= open (name
, O_WRONLY
| O_CREAT
| O_BINARY
, MODE_RW_UGO
);
468 struct stat out_stat_buf
;
469 if (fstat (fd
, &out_stat_buf
) != 0)
470 die (EXIT_FAILURE
, errno
, _("failed to stat %s"), quoteaf (name
));
471 if (SAME_INODE (in_stat_buf
, out_stat_buf
))
472 die (EXIT_FAILURE
, 0, _("%s would overwrite input; aborting"),
474 if (ftruncate (fd
, 0) != 0
475 && (S_ISREG (out_stat_buf
.st_mode
) || S_TYPEISSHM (&out_stat_buf
)))
476 die (EXIT_FAILURE
, errno
, _("%s: error truncating"), quotef (name
));
484 char const *shell_prog
= getenv ("SHELL");
485 if (shell_prog
== NULL
)
486 shell_prog
= "/bin/sh";
487 if (setenv ("FILE", name
, 1) != 0)
488 die (EXIT_FAILURE
, errno
,
489 _("failed to set FILE environment variable"));
491 fprintf (stdout
, _("executing with FILE=%s\n"), quotef (name
));
492 if (pipe (fd_pair
) != 0)
493 die (EXIT_FAILURE
, errno
, _("failed to create pipe"));
497 /* This is the child process. If an error occurs here, the
498 parent will eventually learn about it after doing a wait,
499 at which time it will emit its own error message. */
501 /* We have to close any pipes that were opened during an
502 earlier call, otherwise this process will be holding a
503 write-pipe that will prevent the earlier process from
504 reading an EOF on the corresponding read-pipe. */
505 for (j
= 0; j
< n_open_pipes
; ++j
)
506 if (close (open_pipes
[j
]) != 0)
507 die (EXIT_FAILURE
, errno
, _("closing prior pipe"));
508 if (close (fd_pair
[1]))
509 die (EXIT_FAILURE
, errno
, _("closing output pipe"));
510 if (fd_pair
[0] != STDIN_FILENO
)
512 if (dup2 (fd_pair
[0], STDIN_FILENO
) != STDIN_FILENO
)
513 die (EXIT_FAILURE
, errno
, _("moving input pipe"));
514 if (close (fd_pair
[0]) != 0)
515 die (EXIT_FAILURE
, errno
, _("closing input pipe"));
517 sigprocmask (SIG_SETMASK
, &oldblocked
, NULL
);
518 execl (shell_prog
, last_component (shell_prog
), "-c",
519 filter_command
, (char *) NULL
);
520 die (EXIT_FAILURE
, errno
, _("failed to run command: \"%s -c %s\""),
521 shell_prog
, filter_command
);
524 die (EXIT_FAILURE
, errno
, _("fork system call failed"));
525 if (close (fd_pair
[0]) != 0)
526 die (EXIT_FAILURE
, errno
, _("failed to close input pipe"));
527 filter_pid
= child_pid
;
528 if (n_open_pipes
== open_pipes_alloc
)
529 open_pipes
= x2nrealloc (open_pipes
, &open_pipes_alloc
,
531 open_pipes
[n_open_pipes
++] = fd_pair
[1];
536 /* Close the output file, and do any associated cleanup.
537 If FP and FD are both specified, they refer to the same open file;
538 in this case FP is closed, but FD is still used in cleanup. */
540 closeout (FILE *fp
, int fd
, pid_t pid
, char const *name
)
542 if (fp
!= NULL
&& fclose (fp
) != 0 && ! ignorable (errno
))
543 die (EXIT_FAILURE
, errno
, "%s", quotef (name
));
546 if (fp
== NULL
&& close (fd
) < 0)
547 die (EXIT_FAILURE
, errno
, "%s", quotef (name
));
549 for (j
= 0; j
< n_open_pipes
; ++j
)
551 if (open_pipes
[j
] == fd
)
553 open_pipes
[j
] = open_pipes
[--n_open_pipes
];
561 if (waitpid (pid
, &wstatus
, 0) == -1 && errno
!= ECHILD
)
562 die (EXIT_FAILURE
, errno
, _("waiting for child process"));
563 if (WIFSIGNALED (wstatus
))
565 int sig
= WTERMSIG (wstatus
);
568 char signame
[MAX (SIG2STR_MAX
, INT_BUFSIZE_BOUND (int))];
569 if (sig2str (sig
, signame
) != 0)
570 sprintf (signame
, "%d", sig
);
572 _("with FILE=%s, signal %s from command: %s"),
573 quotef (name
), signame
, filter_command
);
576 else if (WIFEXITED (wstatus
))
578 int ex
= WEXITSTATUS (wstatus
);
580 error (ex
, 0, _("with FILE=%s, exit %d from command: %s"),
581 quotef (name
), ex
, filter_command
);
585 /* shouldn't happen. */
586 die (EXIT_FAILURE
, 0,
587 _("unknown status from command (0x%X)"), wstatus
+ 0u);
592 /* Write BYTES bytes at BP to an output file.
593 If NEW_FILE_FLAG is true, open the next output file.
594 Otherwise add to the same output file already in use.
595 Return true if successful. */
598 cwrite (bool new_file_flag
, char const *bp
, size_t bytes
)
/* A null BP with zero BYTES is how callers ask for an empty output
   file; that request is tested here together with elide_empty_files. */
602 if (!bp
&& bytes
== 0 && elide_empty_files
)
/* Finish the previous output (descriptor plus any filter process)
   before creating the file currently named by outfile. */
604 closeout (NULL
, output_desc
, filter_pid
, outfile
);
606 output_desc
= create (outfile
);
608 die (EXIT_FAILURE
, errno
, "%s", quotef (outfile
));
/* full_write returning exactly BYTES means the whole buffer was written. */
611 if (full_write (output_desc
, bp
, bytes
) == bytes
)
/* A short write is fatal unless ignorable () accepts the errno. */
615 if (! ignorable (errno
))
616 die (EXIT_FAILURE
, errno
, "%s", quotef (outfile
));
621 /* Split into pieces of exactly N_BYTES bytes.
622 Use buffer BUF, whose size is BUFSIZE.
623 BUF contains the first INITIAL_READ input bytes. */
626 bytes_split (uintmax_t n_bytes
, char *buf
, size_t bufsize
, size_t initial_read
,
630 bool new_file_flag
= true;
631 bool filter_ok
= true;
632 uintmax_t to_write
= n_bytes
;
633 uintmax_t opened
= 0;
638 if (initial_read
!= SIZE_MAX
)
640 n_read
= initial_read
;
641 initial_read
= SIZE_MAX
;
642 eof
= n_read
< bufsize
;
647 && lseek (STDIN_FILENO
, to_write
, SEEK_CUR
) != -1)
650 new_file_flag
= true;
653 n_read
= safe_read (STDIN_FILENO
, buf
, bufsize
);
654 if (n_read
== SAFE_READ_ERROR
)
655 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
659 while (to_write
<= n_read
)
661 if (filter_ok
|| new_file_flag
)
662 filter_ok
= cwrite (new_file_flag
, bp_out
, to_write
);
663 opened
+= new_file_flag
;
664 new_file_flag
= !max_files
|| (opened
< max_files
);
665 if (! filter_ok
&& ! new_file_flag
)
667 /* If filters no longer accepting input, stop reading. */
678 if (filter_ok
|| new_file_flag
)
679 filter_ok
= cwrite (new_file_flag
, bp_out
, n_read
);
680 opened
+= new_file_flag
;
681 new_file_flag
= false;
682 if (! filter_ok
&& opened
== max_files
)
684 /* If filters no longer accepting input, stop reading. */
692 /* Ensure NUMBER files are created, which truncates
693 any existing files or notifies any consumers on fifos.
694 FIXME: Should we do this before EXIT_FAILURE? */
695 while (opened
++ < max_files
)
696 cwrite (true, NULL
, 0);
699 /* Split into pieces of exactly N_LINES lines.
700 Use buffer BUF, whose size is BUFSIZE. */
703 lines_split (uintmax_t n_lines
, char *buf
, size_t bufsize
)
706 char *bp
, *bp_out
, *eob
;
707 bool new_file_flag
= true;
712 n_read
= safe_read (STDIN_FILENO
, buf
, bufsize
);
713 if (n_read
== SAFE_READ_ERROR
)
714 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
720 bp
= rawmemchr (bp
, eolchar
);
723 if (eob
!= bp_out
) /* do not write 0 bytes! */
725 size_t len
= eob
- bp_out
;
726 cwrite (new_file_flag
, bp_out
, len
);
727 new_file_flag
= false;
735 cwrite (new_file_flag
, bp_out
, bp
- bp_out
);
737 new_file_flag
= true;
745 /* Split into pieces that are as large as possible while still not more
746 than N_BYTES bytes, and are split on line boundaries except
747 where lines longer than N_BYTES bytes occur. */
750 line_bytes_split (uintmax_t n_bytes
, char *buf
, size_t bufsize
)
753 uintmax_t n_out
= 0; /* for each split. */
755 char *hold
= NULL
; /* for lines > bufsize. */
756 size_t hold_size
= 0;
757 bool split_line
= false; /* Whether a \n was output in a split. */
761 n_read
= safe_read (STDIN_FILENO
, buf
, bufsize
);
762 if (n_read
== SAFE_READ_ERROR
)
763 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
764 size_t n_left
= n_read
;
768 size_t split_rest
= 0;
772 /* Determine End Of Chunk and/or End of Line,
773 which are used below to select what to write or buffer. */
774 if (n_bytes
- n_out
- n_hold
<= n_left
)
776 /* Have enough for split. */
777 split_rest
= n_bytes
- n_out
- n_hold
;
778 eoc
= sob
+ split_rest
- 1;
779 eol
= memrchr (sob
, eolchar
, split_rest
);
782 eol
= memrchr (sob
, eolchar
, n_left
);
784 /* Output hold space if possible. */
785 if (n_hold
&& !(!eol
&& n_out
))
787 cwrite (n_out
== 0, hold
, n_hold
);
789 if (n_hold
> bufsize
)
790 hold
= xrealloc (hold
, bufsize
);
795 /* Output to eol if present. */
799 size_t n_write
= eol
- sob
+ 1;
800 cwrite (n_out
== 0, sob
, n_write
);
805 split_rest
-= n_write
;
808 /* Output to eoc or eob if possible. */
809 if (n_left
&& !split_line
)
811 size_t n_write
= eoc
? split_rest
: n_left
;
812 cwrite (n_out
== 0, sob
, n_write
);
817 split_rest
-= n_write
;
820 /* Update hold if needed. */
821 if ((eoc
&& split_rest
) || (!eoc
&& n_left
))
823 size_t n_buf
= eoc
? split_rest
: n_left
;
824 if (hold_size
- n_hold
< n_buf
)
826 if (hold_size
<= SIZE_MAX
- bufsize
)
827 hold_size
+= bufsize
;
830 hold
= xrealloc (hold
, hold_size
);
832 memcpy (hold
+ n_hold
, sob
, n_buf
);
838 /* Reset for new split. */
848 /* Handle no eol at end of file. */
850 cwrite (n_out
== 0, hold
, n_hold
);
855 /* -n l/[K/]N: Write lines to files of approximately file size / N.
856 The file is partitioned into file size / N sized portions, with the
857 last assigned any excess. If a line _starts_ within a partition
858 it is written completely to the corresponding file. Since lines
859 are not split even if they overlap a partition, the files written
860 can be larger or smaller than the partition size, and even empty
861 if a line is so long as to completely overlap the partition. */
864 lines_chunk_split (uintmax_t k
, uintmax_t n
, char *buf
, size_t bufsize
,
865 size_t initial_read
, off_t file_size
)
867 assert (n
&& k
<= n
&& n
<= file_size
);
869 const off_t chunk_size
= file_size
/ n
;
870 uintmax_t chunk_no
= 1;
871 off_t chunk_end
= chunk_size
- 1;
873 bool new_file_flag
= true;
874 bool chunk_truncated
= false;
878 /* Start reading 1 byte before kth chunk of file. */
879 off_t start
= (k
- 1) * chunk_size
- 1;
880 if (start
< initial_read
)
882 memmove (buf
, buf
+ start
, initial_read
- start
);
883 initial_read
-= start
;
887 if (lseek (STDIN_FILENO
, start
- initial_read
, SEEK_CUR
) < 0)
888 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
889 initial_read
= SIZE_MAX
;
893 chunk_end
= chunk_no
* chunk_size
- 1;
896 while (n_written
< file_size
)
898 char *bp
= buf
, *eob
;
900 if (initial_read
!= SIZE_MAX
)
902 n_read
= initial_read
;
903 initial_read
= SIZE_MAX
;
907 n_read
= safe_read (STDIN_FILENO
, buf
, bufsize
);
908 if (n_read
== SAFE_READ_ERROR
)
909 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
913 n_read
= MIN (n_read
, file_size
- n_written
);
914 chunk_truncated
= false;
922 /* Begin looking for '\n' at last byte of chunk. */
923 off_t skip
= MIN (n_read
, MAX (0, chunk_end
- n_written
));
924 char *bp_out
= memchr (bp
+ skip
, eolchar
, n_read
- skip
);
932 to_write
= bp_out
- bp
;
936 /* We don't use the stdout buffer here since we're writing
937 large chunks from an existing file, so it's more efficient
938 to write out directly. */
939 if (full_write (STDOUT_FILENO
, bp
, to_write
) != to_write
)
940 die (EXIT_FAILURE
, errno
, "%s", _("write error"));
943 cwrite (new_file_flag
, bp
, to_write
);
944 n_written
+= to_write
;
947 new_file_flag
= next
;
949 /* A line could have been so long that it skipped
950 entire chunks. So create empty files in that case. */
951 while (next
|| chunk_end
<= n_written
- 1)
953 if (!next
&& bp
== eob
)
955 /* replenish buf, before going to next chunk. */
956 chunk_truncated
= true;
960 if (k
&& chunk_no
> k
)
963 chunk_end
= file_size
- 1; /* >= chunk_size. */
965 chunk_end
+= chunk_size
;
966 if (chunk_end
<= n_written
- 1)
969 cwrite (true, NULL
, 0);
980 /* Ensure NUMBER files are created, which truncates
981 any existing files or notifies any consumers on fifos.
982 FIXME: Should we do this before EXIT_FAILURE? */
983 while (!k
&& chunk_no
++ <= n
)
984 cwrite (true, NULL
, 0);
987 /* -n K/N: Extract Kth of N chunks.
   The chunk starts at offset (K - 1) * (FILE_SIZE / N).  It ends at
   K * (FILE_SIZE / N), except that the last chunk (K == N) ends at
   FILE_SIZE.  Data is written to STDOUT_FILENO.
   BUF holds INITIAL_READ bytes already read from the input; an
   INITIAL_READ of SIZE_MAX means no such data is pending. */
990 bytes_chunk_extract (uintmax_t k
, uintmax_t n
, char *buf
, size_t bufsize
,
991 size_t initial_read
, off_t file_size
)
996 assert (k
&& n
&& k
<= n
&& n
<= file_size
);
998 start
= (k
- 1) * (file_size
/ n
);
999 end
= (k
== n
) ? file_size
: k
* (file_size
/ n
);
/* The chunk begins inside the data already in BUF: slide that data
   to the front of BUF and keep only the part from START onwards. */
1001 if (start
< initial_read
)
1003 memmove (buf
, buf
+ start
, initial_read
- start
);
1004 initial_read
-= start
;
/* Seek relative to the current position by START less the
   INITIAL_READ bytes held in BUF, then mark that data as consumed. */
1008 if (lseek (STDIN_FILENO
, start
- initial_read
, SEEK_CUR
) < 0)
1009 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
1010 initial_read
= SIZE_MAX
;
/* Use the pending initial data once, then fall back to reading. */
1016 if (initial_read
!= SIZE_MAX
)
1018 n_read
= initial_read
;
1019 initial_read
= SIZE_MAX
;
1023 n_read
= safe_read (STDIN_FILENO
, buf
, bufsize
);
1024 if (n_read
== SAFE_READ_ERROR
)
1025 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
/* Limit the write to at most END - START bytes. */
1029 n_read
= MIN (n_read
, end
- start
);
1030 if (full_write (STDOUT_FILENO
, buf
, n_read
) != n_read
1031 && ! ignorable (errno
))
1032 die (EXIT_FAILURE
, errno
, "%s", quotef ("-"));
1037 typedef struct of_info
1051 /* Rotate file descriptors when we're writing to more output files than we
1052 have available file descriptors.
1053 Return whether we came under file resource pressure.
1054 If so, it's probably best to close each file when finished with it. */
1057 ofile_open (of_t
*files
, size_t i_check
, size_t nfiles
)
1059 bool file_limit
= false;
1061 if (files
[i_check
].ofd
<= OFD_NEW
)
1064 size_t i_reopen
= i_check
? i_check
- 1 : nfiles
- 1;
1066 /* Another process could have opened a file in between the calls to
1067 close and open, so we should keep trying until open succeeds or
1068 we've closed all of our files. */
1071 if (files
[i_check
].ofd
== OFD_NEW
)
1072 fd
= create (files
[i_check
].of_name
);
1073 else /* OFD_APPEND */
1075 /* Attempt to append to previously opened file.
1076 We use O_NONBLOCK to support writing to fifos,
1077 where the other end has closed because of our
1078 previous close. In that case we'll immediately
1079 get an error, rather than waiting indefinitely.
1080 In specialised cases the consumer can keep reading
1081 from the fifo, terminating on conditions in the data
1082 itself, or perhaps never in the case of 'tail -f'.
1083 I.e., for fifos it is valid to attempt this reopen.
1085 We don't handle the filter_command case here, as create()
1086 will exit if there are not enough files in that case.
1087 I.e., we don't support restarting filters, as that would
1088 put too much burden on users specifying --filter commands. */
1089 fd
= open (files
[i_check
].of_name
,
1090 O_WRONLY
| O_BINARY
| O_APPEND
| O_NONBLOCK
);
1096 if (!(errno
== EMFILE
|| errno
== ENFILE
))
1097 die (EXIT_FAILURE
, errno
, "%s", quotef (files
[i_check
].of_name
));
1101 /* Search backwards for an open file to close. */
1102 while (files
[i_reopen
].ofd
< 0)
1104 i_reopen
= i_reopen
? i_reopen
- 1 : nfiles
- 1;
1105 /* No more open files to close, exit with E[NM]FILE. */
1106 if (i_reopen
== i_check
)
1107 die (EXIT_FAILURE
, errno
, "%s",
1108 quotef (files
[i_check
].of_name
));
1111 if (fclose (files
[i_reopen
].ofile
) != 0)
1112 die (EXIT_FAILURE
, errno
, "%s", quotef (files
[i_reopen
].of_name
));
1113 files
[i_reopen
].ofile
= NULL
;
1114 files
[i_reopen
].ofd
= OFD_APPEND
;
1117 files
[i_check
].ofd
= fd
;
1118 if (!(files
[i_check
].ofile
= fdopen (fd
, "a")))
1119 die (EXIT_FAILURE
, errno
, "%s", quotef (files
[i_check
].of_name
));
1120 files
[i_check
].opid
= filter_pid
;
1127 /* -n r/[K/]N: Divide file into N chunks in round robin fashion.
1128 When K == 0, we try to keep the files open in parallel.
1129 If we run out of file resources, then we revert
1130 to opening and closing each file for each line. */
1133 lines_rr (uintmax_t k
, uintmax_t n
, char *buf
, size_t bufsize
)
1135 bool wrapped
= false;
1139 of_t
*files
IF_LINT (= NULL
);
1148 files
= xnmalloc (n
, sizeof *files
);
1150 /* Generate output file names. */
1151 for (i_file
= 0; i_file
< n
; i_file
++)
1154 files
[i_file
].of_name
= xstrdup (outfile
);
1155 files
[i_file
].ofd
= OFD_NEW
;
1156 files
[i_file
].ofile
= NULL
;
1157 files
[i_file
].opid
= 0;
1165 char *bp
= buf
, *eob
;
1166 size_t n_read
= safe_read (STDIN_FILENO
, buf
, bufsize
);
1167 if (n_read
== SAFE_READ_ERROR
)
1168 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
1169 else if (n_read
== 0)
1178 /* Find end of line. */
1179 char *bp_out
= memchr (bp
, eolchar
, eob
- bp
);
1187 to_write
= bp_out
- bp
;
1191 if (line_no
== k
&& unbuffered
)
1193 if (full_write (STDOUT_FILENO
, bp
, to_write
) != to_write
)
1194 die (EXIT_FAILURE
, errno
, "%s", _("write error"));
1196 else if (line_no
== k
&& fwrite (bp
, to_write
, 1, stdout
) != 1)
1198 clearerr (stdout
); /* To silence close_stdout(). */
1199 die (EXIT_FAILURE
, errno
, "%s", _("write error"));
1202 line_no
= (line_no
== n
) ? 1 : line_no
+ 1;
1206 /* Secure file descriptor. */
1207 file_limit
|= ofile_open (files
, i_file
, n
);
1210 /* Note writing to fd, rather than flushing the FILE gives
1211 an 8% performance benefit, due to reduced data copying. */
1212 if (full_write (files
[i_file
].ofd
, bp
, to_write
) != to_write
1213 && ! ignorable (errno
))
1215 die (EXIT_FAILURE
, errno
, "%s",
1216 quotef (files
[i_file
].of_name
));
1219 else if (fwrite (bp
, to_write
, 1, files
[i_file
].ofile
) != 1
1220 && ! ignorable (errno
))
1222 die (EXIT_FAILURE
, errno
, "%s",
1223 quotef (files
[i_file
].of_name
));
1226 if (! ignorable (errno
))
1231 if (fclose (files
[i_file
].ofile
) != 0)
1233 die (EXIT_FAILURE
, errno
, "%s",
1234 quotef (files
[i_file
].of_name
));
1236 files
[i_file
].ofile
= NULL
;
1237 files
[i_file
].ofd
= OFD_APPEND
;
1239 if (next
&& ++i_file
== n
)
1242 /* If no filters are accepting input, stop reading. */
1255 /* Ensure all files created, so that any existing files are truncated,
1256 and to signal any waiting fifo consumers.
1257 Also, close any open file descriptors.
1258 FIXME: Should we do this before EXIT_FAILURE? */
1261 int ceiling
= (wrapped
? n
: i_file
);
1262 for (i_file
= 0; i_file
< n
; i_file
++)
1264 if (i_file
>= ceiling
&& !elide_empty_files
)
1265 file_limit
|= ofile_open (files
, i_file
, n
);
1266 if (files
[i_file
].ofd
>= 0)
1267 closeout (files
[i_file
].ofile
, files
[i_file
].ofd
,
1268 files
[i_file
].opid
, files
[i_file
].of_name
);
1269 files
[i_file
].ofd
= OFD_APPEND
;
1272 IF_LINT (free (files
));
1275 #define FAIL_ONLY_ONE_WAY() \
1278 error (0, 0, _("cannot split in more than one way")); \
1279 usage (EXIT_FAILURE); \
1284 /* Parse K/N syntax of chunk options.
   SLASH points at the '/' inside optarg.  The text after SLASH is
   converted into *N_UNITS, with 1 and UINTMAX_MAX passed as bounds.
   When SLASH is not the first character of optarg, a leading K is
   present and is converted into *K_UNITS, with 1 and *N_UNITS passed
   as bounds.  */
1287 parse_chunk (uintmax_t *k_units
, uintmax_t *n_units
, char *slash
)
1289 *n_units
= xdectoumax (slash
+ 1, 1, UINTMAX_MAX
, "",
1290 _("invalid number of chunks"), 0);
1291 if (slash
!= optarg
) /* a leading number is specified. */
1294 *k_units
= xdectoumax (optarg
, 1, *n_units
, "",
1295 _("invalid chunk number"), 0);
1301 main (int argc
, char **argv
)
1303 enum Split_type split_type
= type_undef
;
1304 idx_t in_blk_size
= 0; /* optimal block size of input file device */
1305 size_t page_size
= getpagesize ();
1306 uintmax_t k_units
= 0;
1307 uintmax_t n_units
= 0;
1309 static char const multipliers
[] = "bEGKkMmPTYZ0";
1311 int digits_optind
= 0;
1312 off_t file_size
= OFF_T_MAX
;
1314 initialize_main (&argc
, &argv
);
1315 set_program_name (argv
[0]);
1316 setlocale (LC_ALL
, "");
1317 bindtextdomain (PACKAGE
, LOCALEDIR
);
1318 textdomain (PACKAGE
);
1320 atexit (close_stdout
);
1322 /* Parse command line options. */
1324 infile
= bad_cast ("-");
1325 outbase
= bad_cast ("x");
1329 /* This is the argv-index of the option we will read next. */
1330 int this_optind
= optind
? optind
: 1;
1333 c
= getopt_long (argc
, argv
, "0123456789C:a:b:del:n:t:ux",
1341 suffix_length
= xdectoumax (optarg
, 0, SIZE_MAX
/ sizeof (size_t),
1342 "", _("invalid suffix length"), 0);
1345 case ADDITIONAL_SUFFIX_OPTION
:
1346 if (last_component (optarg
) != optarg
)
1349 _("invalid suffix %s, contains directory separator"),
1351 usage (EXIT_FAILURE
);
1353 additional_suffix
= optarg
;
1357 if (split_type
!= type_undef
)
1358 FAIL_ONLY_ONE_WAY ();
1359 split_type
= type_bytes
;
1360 /* Limit to OFF_T_MAX, because if input is a pipe, we could get more
1361 data than is possible to write to a single file, so indicate that
1362 immediately rather than having possibly future invocations fail. */
1363 n_units
= xdectoumax (optarg
, 1, OFF_T_MAX
, multipliers
,
1364 _("invalid number of bytes"), 0);
1368 if (split_type
!= type_undef
)
1369 FAIL_ONLY_ONE_WAY ();
1370 split_type
= type_lines
;
1371 n_units
= xdectoumax (optarg
, 1, UINTMAX_MAX
, "",
1372 _("invalid number of lines"), 0);
1376 if (split_type
!= type_undef
)
1377 FAIL_ONLY_ONE_WAY ();
1378 split_type
= type_byteslines
;
1379 n_units
= xdectoumax (optarg
, 1, MIN (SIZE_MAX
, OFF_T_MAX
),
1380 multipliers
, _("invalid number of bytes"), 0);
1384 if (split_type
!= type_undef
)
1385 FAIL_ONLY_ONE_WAY ();
1386 /* skip any whitespace */
1387 while (isspace (to_uchar (*optarg
)))
1389 if (STRNCMP_LIT (optarg
, "r/") == 0)
1391 split_type
= type_rr
;
1394 else if (STRNCMP_LIT (optarg
, "l/") == 0)
1396 split_type
= type_chunk_lines
;
1400 split_type
= type_chunk_bytes
;
1401 if ((slash
= strchr (optarg
, '/')))
1402 parse_chunk (&k_units
, &n_units
, slash
);
1404 n_units
= xdectoumax (optarg
, 1, UINTMAX_MAX
, "",
1405 _("invalid number of chunks"), 0);
1414 char neweol
= optarg
[0];
1416 die (EXIT_FAILURE
, 0, _("empty record separator"));
1419 if (STREQ (optarg
, "\\0"))
1423 /* Provoke with 'split -txx'. Complain about
1424 "multi-character separator" instead of "multibyte separator", so
1425 that the diagnostic's wording does not need to be
1426 changed once multibyte characters are supported. */
1427 die (EXIT_FAILURE
, 0, _("multi-character separator %s"),
1431 /* Make it explicit we don't support multiple separators. */
1432 if (0 <= eolchar
&& neweol
!= eolchar
)
1434 die (EXIT_FAILURE
, 0,
1435 _("multiple separator characters specified"));
1452 if (split_type
== type_undef
)
1454 split_type
= type_digits
;
1457 if (split_type
!= type_undef
&& split_type
!= type_digits
)
1458 FAIL_ONLY_ONE_WAY ();
1459 if (digits_optind
!= 0 && digits_optind
!= this_optind
)
1460 n_units
= 0; /* More than one number given; ignore other. */
1461 digits_optind
= this_optind
;
1462 if (!DECIMAL_DIGIT_ACCUMULATE (n_units
, c
- '0', uintmax_t))
1464 char buffer
[INT_BUFSIZE_BOUND (uintmax_t)];
1465 die (EXIT_FAILURE
, 0,
1466 _("line count option -%s%c... is too large"),
1467 umaxtostr (n_units
, buffer
), c
);
1474 suffix_alphabet
= "0123456789";
1476 suffix_alphabet
= "0123456789abcdef";
1479 if (strlen (optarg
) != strspn (optarg
, suffix_alphabet
))
1483 _("%s: invalid start value for numerical suffix") :
1484 _("%s: invalid start value for hexadecimal suffix"),
1486 usage (EXIT_FAILURE
);
1490 /* Skip any leading zero. */
1491 while (*optarg
== '0' && *(optarg
+ 1) != '\0')
1493 numeric_suffix_start
= optarg
;
1499 elide_empty_files
= true;
1503 filter_command
= optarg
;
1506 case IO_BLKSIZE_OPTION
:
1507 in_blk_size
= xdectoumax (optarg
, 1, MIN (IDX_MAX
, SIZE_MAX
) - 1,
1508 multipliers
, _("invalid IO block size"), 0);
1511 case VERBOSE_OPTION
:
1515 case_GETOPT_HELP_CHAR
;
1517 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
1520 usage (EXIT_FAILURE
);
1524 if (k_units
!= 0 && filter_command
)
1526 error (0, 0, _("--filter does not process a chunk extracted to stdout"));
1527 usage (EXIT_FAILURE
);
1530 /* Handle default case. */
1531 if (split_type
== type_undef
)
1533 split_type
= type_lines
;
1539 error (0, 0, "%s: %s", _("invalid number of lines"), quote ("0"));
1540 usage (EXIT_FAILURE
);
1546 set_suffix_length (n_units
, split_type
);
1548 /* Get out the filename arguments. */
1551 infile
= argv
[optind
++];
1554 outbase
= argv
[optind
++];
1558 error (0, 0, _("extra operand %s"), quote (argv
[optind
]));
1559 usage (EXIT_FAILURE
);
1562 /* Check that the suffix length is large enough for the numerical
1563 suffix start value. */
1564 if (numeric_suffix_start
&& strlen (numeric_suffix_start
) > suffix_length
)
1566 error (0, 0, _("numerical suffix start value is too large "
1567 "for the suffix length"));
1568 usage (EXIT_FAILURE
);
1571 /* Open the input file. */
1572 if (! STREQ (infile
, "-")
1573 && fd_reopen (STDIN_FILENO
, infile
, O_RDONLY
, 0) < 0)
1574 die (EXIT_FAILURE
, errno
, _("cannot open %s for reading"),
1577 /* Binary I/O is safer when byte counts are used. */
1578 xset_binary_mode (STDIN_FILENO
, O_BINARY
);
1580 /* Get the optimal block size of input device and make a buffer. */
1582 if (fstat (STDIN_FILENO
, &in_stat_buf
) != 0)
1583 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
1585 bool specified_buf_size
= !! in_blk_size
;
1586 if (! specified_buf_size
)
1587 in_blk_size
= io_blksize (in_stat_buf
);
1589 char *buf
= xalignalloc (page_size
, in_blk_size
+ 1);
1590 size_t initial_read
= SIZE_MAX
;
1592 if (split_type
== type_chunk_bytes
|| split_type
== type_chunk_lines
)
1594 file_size
= input_file_size (STDIN_FILENO
, &in_stat_buf
,
1597 die (EXIT_FAILURE
, errno
, _("%s: cannot determine file size"),
1599 initial_read
= MIN (file_size
, in_blk_size
);
1600 /* Overflow, and sanity checking. */
1601 if (OFF_T_MAX
< n_units
)
1603 char buffer
[INT_BUFSIZE_BOUND (uintmax_t)];
1604 die (EXIT_FAILURE
, EOVERFLOW
, "%s: %s",
1605 _("invalid number of chunks"),
1606 quote (umaxtostr (n_units
, buffer
)));
1608 /* increase file_size to n_units here, so that we still process
1609 any input data, and create empty files for the rest. */
1610 file_size
= MAX (file_size
, n_units
);
1613 /* When filtering, closure of one pipe must not terminate the process,
1614 as there may still be other streams expecting input from us. */
1617 struct sigaction act
;
1618 sigemptyset (&newblocked
);
1619 sigaction (SIGPIPE
, NULL
, &act
);
1620 if (act
.sa_handler
!= SIG_IGN
)
1621 sigaddset (&newblocked
, SIGPIPE
);
1622 sigprocmask (SIG_BLOCK
, &newblocked
, &oldblocked
);
1629 lines_split (n_units
, buf
, in_blk_size
);
1633 bytes_split (n_units
, buf
, in_blk_size
, SIZE_MAX
, 0);
1636 case type_byteslines
:
1637 line_bytes_split (n_units
, buf
, in_blk_size
);
1640 case type_chunk_bytes
:
1642 bytes_split (file_size
/ n_units
, buf
, in_blk_size
, initial_read
,
1645 bytes_chunk_extract (k_units
, n_units
, buf
, in_blk_size
, initial_read
,
1649 case type_chunk_lines
:
1650 lines_chunk_split (k_units
, n_units
, buf
, in_blk_size
, initial_read
,
1655 /* Note, this is like 'sed -n ${k}~${n}p' when k > 0,
1656 but the functionality is provided for symmetry. */
1657 lines_rr (k_units
, n_units
, buf
, in_blk_size
);
1664 IF_LINT (alignfree (buf
));
1666 if (close (STDIN_FILENO
) != 0)
1667 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
1668 closeout (NULL
, output_desc
, filter_pid
, outfile
);
1670 return EXIT_SUCCESS
;