maint: prefer C23-style nullptr
[coreutils.git] / src / shuf.c
blob2338cdb78e3de1f74e6b77c1d5b2d0a583a6f23a
1 /* Shuffle lines of text.
3 Copyright (C) 2006-2023 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>.
18 Written by Paul Eggert. */
20 #include <config.h>
22 #include <sys/types.h>
23 #include "system.h"
25 #include "die.h"
26 #include "error.h"
27 #include "fadvise.h"
28 #include "getopt.h"
29 #include "linebuffer.h"
30 #include "quote.h"
31 #include "randint.h"
32 #include "randperm.h"
33 #include "read-file.h"
34 #include "stdio--.h"
35 #include "xstrtol.h"
37 /* The official name of this program (e.g., no 'g' prefix). */
38 #define PROGRAM_NAME "shuf"
40 #define AUTHORS proper_name ("Paul Eggert")
42 /* For reservoir-sampling, allocate the reservoir lines in batches. */
43 enum { RESERVOIR_LINES_INCREMENT = 1024 };
45 /* reservoir-sampling introduces CPU overhead for small inputs.
46 So only enable it for inputs >= this limit.
47 This limit was determined using these commands:
48 $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
49 $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
50 $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done .*/
51 enum { RESERVOIR_MIN_INPUT = 8192 * 1024 };
54 void
55 usage (int status)
57 if (status != EXIT_SUCCESS)
58 emit_try_help ();
59 else
61 printf (_("\
62 Usage: %s [OPTION]... [FILE]\n\
63 or: %s -e [OPTION]... [ARG]...\n\
64 or: %s -i LO-HI [OPTION]...\n\
65 "),
66 program_name, program_name, program_name);
67 fputs (_("\
68 Write a random permutation of the input lines to standard output.\n\
69 "), stdout);
71 emit_stdin_note ();
72 emit_mandatory_arg_note ();
74 fputs (_("\
75 -e, --echo treat each ARG as an input line\n\
76 -i, --input-range=LO-HI treat each number LO through HI as an input line\n\
77 -n, --head-count=COUNT output at most COUNT lines\n\
78 -o, --output=FILE write result to FILE instead of standard output\n\
79 --random-source=FILE get random bytes from FILE\n\
80 -r, --repeat output lines can be repeated\n\
81 "), stdout);
82 fputs (_("\
83 -z, --zero-terminated line delimiter is NUL, not newline\n\
84 "), stdout);
85 fputs (HELP_OPTION_DESCRIPTION, stdout);
86 fputs (VERSION_OPTION_DESCRIPTION, stdout);
87 emit_ancillary_info (PROGRAM_NAME);
90 exit (status);
93 /* For long options that have no equivalent short option, use a
94 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
95 enum
97 RANDOM_SOURCE_OPTION = CHAR_MAX + 1
100 static struct option const long_opts[] =
102 {"echo", no_argument, nullptr, 'e'},
103 {"input-range", required_argument, nullptr, 'i'},
104 {"head-count", required_argument, nullptr, 'n'},
105 {"output", required_argument, nullptr, 'o'},
106 {"random-source", required_argument, nullptr, RANDOM_SOURCE_OPTION},
107 {"repeat", no_argument, nullptr, 'r'},
108 {"zero-terminated", no_argument, nullptr, 'z'},
109 {GETOPT_HELP_OPTION_DECL},
110 {GETOPT_VERSION_OPTION_DECL},
111 {0, 0, 0, 0},
114 static void
115 input_from_argv (char **operand, int n_operands, char eolbyte)
117 char *p;
118 size_t size = n_operands;
119 int i;
121 for (i = 0; i < n_operands; i++)
122 size += strlen (operand[i]);
123 p = xmalloc (size);
125 for (i = 0; i < n_operands; i++)
127 char *p1 = stpcpy (p, operand[i]);
128 operand[i] = p;
129 p = p1;
130 *p++ = eolbyte;
133 operand[n_operands] = p;
136 /* Return the start of the next line after LINE, which is guaranteed
137 to end in EOLBYTE. */
139 static char *
140 next_line (char *line, char eolbyte)
142 char *p = rawmemchr (line, eolbyte);
143 return p + 1;
146 /* Return the size of the input if possible or OFF_T_MAX if not. */
148 static off_t
149 input_size (void)
151 off_t file_size;
153 struct stat stat_buf;
154 if (fstat (STDIN_FILENO, &stat_buf) != 0)
155 return OFF_T_MAX;
156 if (usable_st_size (&stat_buf))
157 file_size = stat_buf.st_size;
158 else
159 return OFF_T_MAX;
161 off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR);
162 if (input_offset < 0)
163 return OFF_T_MAX;
165 file_size -= input_offset;
167 return file_size;
170 /* Read all lines and store up to K permuted lines in *OUT_RSRV.
171 Return the number of lines read, up to a maximum of K. */
173 static size_t
174 read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k,
175 struct randint_source *s,
176 struct linebuffer **out_rsrv)
178 randint n_lines = 0;
179 size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT);
180 struct linebuffer *line = nullptr;
181 struct linebuffer *rsrv;
183 rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer));
185 /* Fill the first K lines, directly into the reservoir. */
186 while (n_lines < k
187 && (line =
188 readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != nullptr)
190 n_lines++;
192 /* Enlarge reservoir. */
193 if (n_lines >= n_alloc_lines)
195 n_alloc_lines += RESERVOIR_LINES_INCREMENT;
196 rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer));
197 memset (&rsrv[n_lines], 0,
198 RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer));
202 /* last line wasn't null - so there may be more lines to read. */
203 if (line != nullptr)
205 struct linebuffer dummy;
206 initbuffer (&dummy); /* space for lines not put in reservoir. */
208 /* Choose the fate of the next line, with decreasing probability (as
209 n_lines increases in size).
211 If the line will be used, store it directly in the reservoir.
212 Otherwise, store it in dummy space.
214 With 'struct linebuffer', storing into existing buffer will reduce
215 re-allocations (will only re-allocate if the new line is longer than
216 the currently allocated space). */
219 randint j = randint_choose (s, n_lines + 1); /* 0 .. n_lines. */
220 line = (j < k) ? (&rsrv[j]) : (&dummy);
222 while (readlinebuffer_delim (line, in, eolbyte) != nullptr && n_lines++);
224 if (! n_lines)
225 die (EXIT_FAILURE, EOVERFLOW, _("too many input lines"));
227 freebuffer (&dummy);
230 /* no more input lines, or an input error. */
231 if (ferror (in))
232 die (EXIT_FAILURE, errno, _("read error"));
234 *out_rsrv = rsrv;
235 return MIN (k, n_lines);
238 static int
239 write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines,
240 size_t const *permutation)
242 for (size_t i = 0; i < n_lines; i++)
244 const struct linebuffer *p = &lines[permutation[i]];
245 if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length)
246 return -1;
249 return 0;
252 /* Read data from file IN. Input lines are delimited by EOLBYTE;
253 silently append a trailing EOLBYTE if the file ends in some other
254 byte. Store a pointer to the resulting array of lines into *PLINE.
255 Return the number of lines read. Report an error and exit on
256 failure. */
258 static size_t
259 read_input (FILE *in, char eolbyte, char ***pline)
261 char *p;
262 char *buf = nullptr;
263 size_t used;
264 char *lim;
265 char **line;
266 size_t n_lines;
268 /* TODO: We should limit the amount of data read here,
269 to less than RESERVOIR_MIN_INPUT. I.e., adjust fread_file() to support
270 taking a byte limit. We'd then need to ensure we handle a line spanning
271 this boundary. With that in place we could set use_reservoir_sampling
272 when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
273 call a wrapper function to populate a linebuffer from the internal pline
274 or if none left, stdin. Doing that would give better performance by
275 avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
276 from a pipe, and allow us to dispense with the input_size() function. */
277 if (!(buf = fread_file (in, 0, &used)))
278 die (EXIT_FAILURE, errno, _("read error"));
280 if (used && buf[used - 1] != eolbyte)
281 buf[used++] = eolbyte;
283 lim = buf + used;
285 n_lines = 0;
286 for (p = buf; p < lim; p = next_line (p, eolbyte))
287 n_lines++;
289 *pline = line = xnmalloc (n_lines + 1, sizeof *line);
291 line[0] = p = buf;
292 for (size_t i = 1; i <= n_lines; i++)
293 line[i] = p = next_line (p, eolbyte);
295 return n_lines;
298 /* Output N_LINES lines to stdout from LINE array,
299 chosen by the indices in PERMUTATION.
300 PERMUTATION and LINE must have at least N_LINES elements.
301 Strings in LINE must include the line-terminator character. */
302 static int
303 write_permuted_lines (size_t n_lines, char *const *line,
304 size_t const *permutation)
306 for (size_t i = 0; i < n_lines; i++)
308 char *const *p = line + permutation[i];
309 size_t len = p[1] - p[0];
310 if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
311 return -1;
314 return 0;
317 /* Output N_LINES of numbers to stdout, from PERMUTATION array.
318 PERMUTATION must have at least N_LINES elements. */
319 static int
320 write_permuted_numbers (size_t n_lines, size_t lo_input,
321 size_t const *permutation, char eolbyte)
323 for (size_t i = 0; i < n_lines; i++)
325 unsigned long int n = lo_input + permutation[i];
326 if (printf ("%lu%c", n, eolbyte) < 0)
327 return -1;
330 return 0;
333 /* Output COUNT numbers to stdout, chosen randomly from range
334 LO_INPUT through HI_INPUT. */
335 static int
336 write_random_numbers (struct randint_source *s, size_t count,
337 size_t lo_input, size_t hi_input, char eolbyte)
339 const randint range = hi_input - lo_input + 1;
341 for (size_t i = 0; i < count; i++)
343 unsigned long int j = lo_input + randint_choose (s, range);
344 if (printf ("%lu%c", j, eolbyte) < 0)
345 return -1;
348 return 0;
351 /* Output COUNT lines to stdout from LINES array.
352 LINES must have at least N_LINES elements in it.
353 Strings in LINES_ must include the line-terminator character. */
354 static int
355 write_random_lines (struct randint_source *s, size_t count,
356 char *const *lines, size_t n_lines)
358 for (size_t i = 0; i < count; i++)
360 const randint j = randint_choose (s, n_lines);
361 char *const *p = lines + j;
362 size_t len = p[1] - p[0];
363 if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
364 return -1;
367 return 0;
371 main (int argc, char **argv)
373 bool echo = false;
374 bool input_range = false;
375 size_t lo_input = SIZE_MAX;
376 size_t hi_input = 0;
377 size_t head_lines = SIZE_MAX;
378 char const *outfile = nullptr;
379 char *random_source = nullptr;
380 char eolbyte = '\n';
381 char **input_lines = nullptr;
382 bool use_reservoir_sampling = false;
383 bool repeat = false;
385 int optc;
386 int n_operands;
387 char **operand;
388 size_t n_lines;
389 char **line = nullptr;
390 struct linebuffer *reservoir = nullptr;
391 struct randint_source *randint_source;
392 size_t *permutation = nullptr;
393 int i;
395 initialize_main (&argc, &argv);
396 set_program_name (argv[0]);
397 setlocale (LC_ALL, "");
398 bindtextdomain (PACKAGE, LOCALEDIR);
399 textdomain (PACKAGE);
401 atexit (close_stdout);
403 while ((optc = getopt_long (argc, argv, "ei:n:o:rz", long_opts, nullptr))
404 != -1)
405 switch (optc)
407 case 'e':
408 echo = true;
409 break;
411 case 'i':
413 if (input_range)
414 die (EXIT_FAILURE, 0, _("multiple -i options specified"));
415 input_range = true;
417 uintmax_t u;
418 char *lo_end;
419 strtol_error err = xstrtoumax (optarg, &lo_end, 10, &u, nullptr);
420 if (err == LONGINT_OK)
422 lo_input = u;
423 if (lo_input != u)
424 err = LONGINT_OVERFLOW;
425 else if (*lo_end != '-')
426 err = LONGINT_INVALID;
427 else
429 err = xstrtoumax (lo_end + 1, nullptr, 10, &u, "");
430 if (err == LONGINT_OK)
432 hi_input = u;
433 if (hi_input != u)
434 err = LONGINT_OVERFLOW;
439 n_lines = hi_input - lo_input + 1;
441 if (err != LONGINT_OK || (lo_input <= hi_input) == (n_lines == 0))
442 die (EXIT_FAILURE, err == LONGINT_OVERFLOW ? EOVERFLOW : 0,
443 "%s: %s", _("invalid input range"), quote (optarg));
445 break;
447 case 'n':
449 uintmax_t argval;
450 strtol_error e = xstrtoumax (optarg, nullptr, 10, &argval, "");
452 if (e == LONGINT_OK)
453 head_lines = MIN (head_lines, argval);
454 else if (e != LONGINT_OVERFLOW)
455 die (EXIT_FAILURE, 0, _("invalid line count: %s"),
456 quote (optarg));
458 break;
460 case 'o':
461 if (outfile && !STREQ (outfile, optarg))
462 die (EXIT_FAILURE, 0, _("multiple output files specified"));
463 outfile = optarg;
464 break;
466 case RANDOM_SOURCE_OPTION:
467 if (random_source && !STREQ (random_source, optarg))
468 die (EXIT_FAILURE, 0, _("multiple random sources specified"));
469 random_source = optarg;
470 break;
472 case 'r':
473 repeat = true;
474 break;
476 case 'z':
477 eolbyte = '\0';
478 break;
480 case_GETOPT_HELP_CHAR;
481 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
482 default:
483 usage (EXIT_FAILURE);
486 n_operands = argc - optind;
487 operand = argv + optind;
489 /* Check invalid usage. */
490 if (echo && input_range)
492 error (0, 0, _("cannot combine -e and -i options"));
493 usage (EXIT_FAILURE);
495 if (input_range ? 0 < n_operands : !echo && 1 < n_operands)
497 error (0, 0, _("extra operand %s"), quote (operand[!input_range]));
498 usage (EXIT_FAILURE);
501 /* Prepare input. */
502 if (head_lines == 0)
504 n_lines = 0;
505 line = nullptr;
507 else if (echo)
509 input_from_argv (operand, n_operands, eolbyte);
510 n_lines = n_operands;
511 line = operand;
513 else if (input_range)
515 n_lines = hi_input - lo_input + 1;
516 line = nullptr;
518 else
520 /* If an input file is specified, re-open it as stdin. */
521 if (n_operands == 1
522 && ! (STREQ (operand[0], "-")
523 || freopen (operand[0], "r", stdin)))
524 die (EXIT_FAILURE, errno, "%s", quotef (operand[0]));
526 fadvise (stdin, FADVISE_SEQUENTIAL);
528 if (repeat || head_lines == SIZE_MAX
529 || input_size () <= RESERVOIR_MIN_INPUT)
531 n_lines = read_input (stdin, eolbyte, &input_lines);
532 line = input_lines;
534 else
536 use_reservoir_sampling = true;
537 n_lines = SIZE_MAX; /* unknown number of input lines, for now. */
541 /* The adjusted head line count; can be less than HEAD_LINES if the
542 input is small and if not repeating. */
543 size_t ahead_lines = repeat || head_lines < n_lines ? head_lines : n_lines;
545 randint_source = randint_all_new (random_source,
546 (use_reservoir_sampling || repeat
547 ? SIZE_MAX
548 : randperm_bound (ahead_lines, n_lines)));
549 if (! randint_source)
550 die (EXIT_FAILURE, errno, "%s",
551 quotef (random_source ? random_source : "getrandom"));
553 if (use_reservoir_sampling)
555 /* Instead of reading the entire file into 'line',
556 use reservoir-sampling to store just AHEAD_LINES random lines. */
557 n_lines = read_input_reservoir_sampling (stdin, eolbyte, ahead_lines,
558 randint_source, &reservoir);
559 ahead_lines = n_lines;
562 /* Close stdin now, rather than earlier, so that randint_all_new
563 doesn't have to worry about opening something other than
564 stdin. */
565 if (! (head_lines == 0 || echo || input_range || fclose (stdin) == 0))
566 die (EXIT_FAILURE, errno, _("read error"));
568 if (!repeat)
569 permutation = randperm_new (randint_source, ahead_lines, n_lines);
571 if (outfile && ! freopen (outfile, "w", stdout))
572 die (EXIT_FAILURE, errno, "%s", quotef (outfile));
574 /* Generate output according to requested method */
575 if (repeat)
577 if (head_lines == 0)
578 i = 0;
579 else
581 if (n_lines == 0)
582 die (EXIT_FAILURE, 0, _("no lines to repeat"));
583 if (input_range)
584 i = write_random_numbers (randint_source, ahead_lines,
585 lo_input, hi_input, eolbyte);
586 else
587 i = write_random_lines (randint_source, ahead_lines, line, n_lines);
590 else
592 if (use_reservoir_sampling)
593 i = write_permuted_output_reservoir (n_lines, reservoir, permutation);
594 else if (input_range)
595 i = write_permuted_numbers (ahead_lines, lo_input,
596 permutation, eolbyte);
597 else
598 i = write_permuted_lines (ahead_lines, line, permutation);
601 if (i != 0)
602 die (EXIT_FAILURE, errno, _("write error"));
604 main_exit (EXIT_SUCCESS);