1 /* Shuffle lines of text.
3 Copyright (C) 2006-2023 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>.
18 Written by Paul Eggert. */
22 #include <sys/types.h>
29 #include "linebuffer.h"
33 #include "read-file.h"
37 /* The official name of this program (e.g., no 'g' prefix). */
38 #define PROGRAM_NAME "shuf"
40 #define AUTHORS proper_name ("Paul Eggert")
42 /* For reservoir-sampling, allocate the reservoir lines in batches. */
43 enum { RESERVOIR_LINES_INCREMENT
= 1024 };
45 /* reservoir-sampling introduces CPU overhead for small inputs.
46 So only enable it for inputs >= this limit.
47 This limit was determined using these commands:
48 $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
49 $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
50 $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done .*/
51 enum { RESERVOIR_MIN_INPUT
= 8192 * 1024 };
57 if (status
!= EXIT_SUCCESS
)
62 Usage: %s [OPTION]... [FILE]\n\
63 or: %s -e [OPTION]... [ARG]...\n\
64 or: %s -i LO-HI [OPTION]...\n\
66 program_name
, program_name
, program_name
);
68 Write a random permutation of the input lines to standard output.\n\
72 emit_mandatory_arg_note ();
75 -e, --echo treat each ARG as an input line\n\
76 -i, --input-range=LO-HI treat each number LO through HI as an input line\n\
77 -n, --head-count=COUNT output at most COUNT lines\n\
78 -o, --output=FILE write result to FILE instead of standard output\n\
79 --random-source=FILE get random bytes from FILE\n\
80 -r, --repeat output lines can be repeated\n\
83 -z, --zero-terminated line delimiter is NUL, not newline\n\
85 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
86 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
87 emit_ancillary_info (PROGRAM_NAME
);
93 /* For long options that have no equivalent short option, use a
94 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
97 RANDOM_SOURCE_OPTION
= CHAR_MAX
+ 1
100 static struct option
const long_opts
[] =
102 {"echo", no_argument
, nullptr, 'e'},
103 {"input-range", required_argument
, nullptr, 'i'},
104 {"head-count", required_argument
, nullptr, 'n'},
105 {"output", required_argument
, nullptr, 'o'},
106 {"random-source", required_argument
, nullptr, RANDOM_SOURCE_OPTION
},
107 {"repeat", no_argument
, nullptr, 'r'},
108 {"zero-terminated", no_argument
, nullptr, 'z'},
109 {GETOPT_HELP_OPTION_DECL
},
110 {GETOPT_VERSION_OPTION_DECL
},
115 input_from_argv (char **operand
, int n_operands
, char eolbyte
)
118 size_t size
= n_operands
;
121 for (i
= 0; i
< n_operands
; i
++)
122 size
+= strlen (operand
[i
]);
125 for (i
= 0; i
< n_operands
; i
++)
127 char *p1
= stpcpy (p
, operand
[i
]);
133 operand
[n_operands
] = p
;
136 /* Return the start of the next line after LINE, which is guaranteed
137 to end in EOLBYTE. */
140 next_line (char *line
, char eolbyte
)
142 char *p
= rawmemchr (line
, eolbyte
);
146 /* Return the size of the input if possible or OFF_T_MAX if not. */
153 struct stat stat_buf
;
154 if (fstat (STDIN_FILENO
, &stat_buf
) != 0)
156 if (usable_st_size (&stat_buf
))
157 file_size
= stat_buf
.st_size
;
161 off_t input_offset
= lseek (STDIN_FILENO
, 0, SEEK_CUR
);
162 if (input_offset
< 0)
165 file_size
-= input_offset
;
170 /* Read all lines and store up to K permuted lines in *OUT_RSRV.
171 Return the number of lines read, up to a maximum of K. */
174 read_input_reservoir_sampling (FILE *in
, char eolbyte
, size_t k
,
175 struct randint_source
*s
,
176 struct linebuffer
**out_rsrv
)
179 size_t n_alloc_lines
= MIN (k
, RESERVOIR_LINES_INCREMENT
);
180 struct linebuffer
*line
= nullptr;
181 struct linebuffer
*rsrv
;
183 rsrv
= xcalloc (n_alloc_lines
, sizeof (struct linebuffer
));
185 /* Fill the first K lines, directly into the reservoir. */
188 readlinebuffer_delim (&rsrv
[n_lines
], in
, eolbyte
)) != nullptr)
192 /* Enlarge reservoir. */
193 if (n_lines
>= n_alloc_lines
)
195 n_alloc_lines
+= RESERVOIR_LINES_INCREMENT
;
196 rsrv
= xnrealloc (rsrv
, n_alloc_lines
, sizeof (struct linebuffer
));
197 memset (&rsrv
[n_lines
], 0,
198 RESERVOIR_LINES_INCREMENT
* sizeof (struct linebuffer
));
202 /* last line wasn't null - so there may be more lines to read. */
205 struct linebuffer dummy
;
206 initbuffer (&dummy
); /* space for lines not put in reservoir. */
208 /* Choose the fate of the next line, with decreasing probability (as
209 n_lines increases in size).
211 If the line will be used, store it directly in the reservoir.
212 Otherwise, store it in dummy space.
214 With 'struct linebuffer', storing into existing buffer will reduce
215 re-allocations (will only re-allocate if the new line is longer than
216 the currently allocated space). */
219 randint j
= randint_choose (s
, n_lines
+ 1); /* 0 .. n_lines. */
220 line
= (j
< k
) ? (&rsrv
[j
]) : (&dummy
);
222 while (readlinebuffer_delim (line
, in
, eolbyte
) != nullptr && n_lines
++);
225 die (EXIT_FAILURE
, EOVERFLOW
, _("too many input lines"));
230 /* no more input lines, or an input error. */
232 die (EXIT_FAILURE
, errno
, _("read error"));
235 return MIN (k
, n_lines
);
239 write_permuted_output_reservoir (size_t n_lines
, struct linebuffer
*lines
,
240 size_t const *permutation
)
242 for (size_t i
= 0; i
< n_lines
; i
++)
244 const struct linebuffer
*p
= &lines
[permutation
[i
]];
245 if (fwrite (p
->buffer
, sizeof (char), p
->length
, stdout
) != p
->length
)
252 /* Read data from file IN. Input lines are delimited by EOLBYTE;
253 silently append a trailing EOLBYTE if the file ends in some other
254 byte. Store a pointer to the resulting array of lines into *PLINE.
255 Return the number of lines read. Report an error and exit on
259 read_input (FILE *in
, char eolbyte
, char ***pline
)
268 /* TODO: We should limit the amount of data read here,
269 to less than RESERVOIR_MIN_INPUT. I.e., adjust fread_file() to support
270 taking a byte limit. We'd then need to ensure we handle a line spanning
271 this boundary. With that in place we could set use_reservoir_sampling
272 when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
273 call a wrapper function to populate a linebuffer from the internal pline
274 or if none left, stdin. Doing that would give better performance by
275 avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
276 from a pipe, and allow us to dispense with the input_size() function. */
277 if (!(buf
= fread_file (in
, 0, &used
)))
278 die (EXIT_FAILURE
, errno
, _("read error"));
280 if (used
&& buf
[used
- 1] != eolbyte
)
281 buf
[used
++] = eolbyte
;
286 for (p
= buf
; p
< lim
; p
= next_line (p
, eolbyte
))
289 *pline
= line
= xnmalloc (n_lines
+ 1, sizeof *line
);
292 for (size_t i
= 1; i
<= n_lines
; i
++)
293 line
[i
] = p
= next_line (p
, eolbyte
);
298 /* Output N_LINES lines to stdout from LINE array,
299 chosen by the indices in PERMUTATION.
300 PERMUTATION and LINE must have at least N_LINES elements.
301 Strings in LINE must include the line-terminator character. */
303 write_permuted_lines (size_t n_lines
, char *const *line
,
304 size_t const *permutation
)
306 for (size_t i
= 0; i
< n_lines
; i
++)
308 char *const *p
= line
+ permutation
[i
];
309 size_t len
= p
[1] - p
[0];
310 if (fwrite (p
[0], sizeof *p
[0], len
, stdout
) != len
)
317 /* Output N_LINES of numbers to stdout, from PERMUTATION array.
318 PERMUTATION must have at least N_LINES elements. */
320 write_permuted_numbers (size_t n_lines
, size_t lo_input
,
321 size_t const *permutation
, char eolbyte
)
323 for (size_t i
= 0; i
< n_lines
; i
++)
325 unsigned long int n
= lo_input
+ permutation
[i
];
326 if (printf ("%lu%c", n
, eolbyte
) < 0)
333 /* Output COUNT numbers to stdout, chosen randomly from range
334 LO_INPUT through HI_INPUT. */
336 write_random_numbers (struct randint_source
*s
, size_t count
,
337 size_t lo_input
, size_t hi_input
, char eolbyte
)
339 const randint range
= hi_input
- lo_input
+ 1;
341 for (size_t i
= 0; i
< count
; i
++)
343 unsigned long int j
= lo_input
+ randint_choose (s
, range
);
344 if (printf ("%lu%c", j
, eolbyte
) < 0)
351 /* Output COUNT lines to stdout from LINES array.
352 LINES must have at least N_LINES elements in it.
353 Strings in LINES_ must include the line-terminator character. */
355 write_random_lines (struct randint_source
*s
, size_t count
,
356 char *const *lines
, size_t n_lines
)
358 for (size_t i
= 0; i
< count
; i
++)
360 const randint j
= randint_choose (s
, n_lines
);
361 char *const *p
= lines
+ j
;
362 size_t len
= p
[1] - p
[0];
363 if (fwrite (p
[0], sizeof *p
[0], len
, stdout
) != len
)
371 main (int argc
, char **argv
)
374 bool input_range
= false;
375 size_t lo_input
= SIZE_MAX
;
377 size_t head_lines
= SIZE_MAX
;
378 char const *outfile
= nullptr;
379 char *random_source
= nullptr;
381 char **input_lines
= nullptr;
382 bool use_reservoir_sampling
= false;
389 char **line
= nullptr;
390 struct linebuffer
*reservoir
= nullptr;
391 struct randint_source
*randint_source
;
392 size_t *permutation
= nullptr;
395 initialize_main (&argc
, &argv
);
396 set_program_name (argv
[0]);
397 setlocale (LC_ALL
, "");
398 bindtextdomain (PACKAGE
, LOCALEDIR
);
399 textdomain (PACKAGE
);
401 atexit (close_stdout
);
403 while ((optc
= getopt_long (argc
, argv
, "ei:n:o:rz", long_opts
, nullptr))
414 die (EXIT_FAILURE
, 0, _("multiple -i options specified"));
419 strtol_error err
= xstrtoumax (optarg
, &lo_end
, 10, &u
, nullptr);
420 if (err
== LONGINT_OK
)
424 err
= LONGINT_OVERFLOW
;
425 else if (*lo_end
!= '-')
426 err
= LONGINT_INVALID
;
429 err
= xstrtoumax (lo_end
+ 1, nullptr, 10, &u
, "");
430 if (err
== LONGINT_OK
)
434 err
= LONGINT_OVERFLOW
;
439 n_lines
= hi_input
- lo_input
+ 1;
441 if (err
!= LONGINT_OK
|| (lo_input
<= hi_input
) == (n_lines
== 0))
442 die (EXIT_FAILURE
, err
== LONGINT_OVERFLOW
? EOVERFLOW
: 0,
443 "%s: %s", _("invalid input range"), quote (optarg
));
450 strtol_error e
= xstrtoumax (optarg
, nullptr, 10, &argval
, "");
453 head_lines
= MIN (head_lines
, argval
);
454 else if (e
!= LONGINT_OVERFLOW
)
455 die (EXIT_FAILURE
, 0, _("invalid line count: %s"),
461 if (outfile
&& !STREQ (outfile
, optarg
))
462 die (EXIT_FAILURE
, 0, _("multiple output files specified"));
466 case RANDOM_SOURCE_OPTION
:
467 if (random_source
&& !STREQ (random_source
, optarg
))
468 die (EXIT_FAILURE
, 0, _("multiple random sources specified"));
469 random_source
= optarg
;
480 case_GETOPT_HELP_CHAR
;
481 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
483 usage (EXIT_FAILURE
);
486 n_operands
= argc
- optind
;
487 operand
= argv
+ optind
;
489 /* Check invalid usage. */
490 if (echo
&& input_range
)
492 error (0, 0, _("cannot combine -e and -i options"));
493 usage (EXIT_FAILURE
);
495 if (input_range
? 0 < n_operands
: !echo
&& 1 < n_operands
)
497 error (0, 0, _("extra operand %s"), quote (operand
[!input_range
]));
498 usage (EXIT_FAILURE
);
509 input_from_argv (operand
, n_operands
, eolbyte
);
510 n_lines
= n_operands
;
513 else if (input_range
)
515 n_lines
= hi_input
- lo_input
+ 1;
520 /* If an input file is specified, re-open it as stdin. */
522 && ! (STREQ (operand
[0], "-")
523 || freopen (operand
[0], "r", stdin
)))
524 die (EXIT_FAILURE
, errno
, "%s", quotef (operand
[0]));
526 fadvise (stdin
, FADVISE_SEQUENTIAL
);
528 if (repeat
|| head_lines
== SIZE_MAX
529 || input_size () <= RESERVOIR_MIN_INPUT
)
531 n_lines
= read_input (stdin
, eolbyte
, &input_lines
);
536 use_reservoir_sampling
= true;
537 n_lines
= SIZE_MAX
; /* unknown number of input lines, for now. */
541 /* The adjusted head line count; can be less than HEAD_LINES if the
542 input is small and if not repeating. */
543 size_t ahead_lines
= repeat
|| head_lines
< n_lines
? head_lines
: n_lines
;
545 randint_source
= randint_all_new (random_source
,
546 (use_reservoir_sampling
|| repeat
548 : randperm_bound (ahead_lines
, n_lines
)));
549 if (! randint_source
)
550 die (EXIT_FAILURE
, errno
, "%s",
551 quotef (random_source
? random_source
: "getrandom"));
553 if (use_reservoir_sampling
)
555 /* Instead of reading the entire file into 'line',
556 use reservoir-sampling to store just AHEAD_LINES random lines. */
557 n_lines
= read_input_reservoir_sampling (stdin
, eolbyte
, ahead_lines
,
558 randint_source
, &reservoir
);
559 ahead_lines
= n_lines
;
562 /* Close stdin now, rather than earlier, so that randint_all_new
563 doesn't have to worry about opening something other than
565 if (! (head_lines
== 0 || echo
|| input_range
|| fclose (stdin
) == 0))
566 die (EXIT_FAILURE
, errno
, _("read error"));
569 permutation
= randperm_new (randint_source
, ahead_lines
, n_lines
);
571 if (outfile
&& ! freopen (outfile
, "w", stdout
))
572 die (EXIT_FAILURE
, errno
, "%s", quotef (outfile
));
574 /* Generate output according to requested method */
582 die (EXIT_FAILURE
, 0, _("no lines to repeat"));
584 i
= write_random_numbers (randint_source
, ahead_lines
,
585 lo_input
, hi_input
, eolbyte
);
587 i
= write_random_lines (randint_source
, ahead_lines
, line
, n_lines
);
592 if (use_reservoir_sampling
)
593 i
= write_permuted_output_reservoir (n_lines
, reservoir
, permutation
);
594 else if (input_range
)
595 i
= write_permuted_numbers (ahead_lines
, lo_input
,
596 permutation
, eolbyte
);
598 i
= write_permuted_lines (ahead_lines
, line
, permutation
);
602 die (EXIT_FAILURE
, errno
, _("write error"));
604 main_exit (EXIT_SUCCESS
);