maint: factor out common macros of stat and printf
[coreutils.git] / src / sort.c
blob32e4c73a2fe867479a58561867e7adca3d7e21da
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <ctype.h>
26 #include <getopt.h>
27 #include <pthread.h>
28 #include <sys/resource.h>
29 #include <sys/types.h>
30 #include <sys/wait.h>
31 #include <signal.h>
32 #include "system.h"
33 #include "argmatch.h"
34 #include "assure.h"
35 #include "fadvise.h"
36 #include "filevercmp.h"
37 #include "flexmember.h"
38 #include "hard-locale.h"
39 #include "hash.h"
40 #include "heap.h"
41 #include "ignore-value.h"
42 #include "mbswidth.h"
43 #include "nproc.h"
44 #include "physmem.h"
45 #include "posixver.h"
46 #include "quote.h"
47 #include "randread.h"
48 #include "readtokens0.h"
49 #include "stdlib--.h"
50 #include "strnumcmp.h"
51 #include "xmemcoll.h"
52 #include "xnanosleep.h"
53 #include "xstrtol.h"
54 #include "xstrtol-error.h"
56 #ifndef RLIMIT_DATA
57 struct rlimit { size_t rlim_cur; };
58 # define getrlimit(Resource, Rlp) (-1)
59 #endif
61 /* The official name of this program (e.g., no 'g' prefix). */
62 #define PROGRAM_NAME "sort"
64 #define AUTHORS \
65 proper_name ("Mike Haertel"), \
66 proper_name ("Paul Eggert")
68 #if HAVE_LANGINFO_CODESET
69 # include <langinfo.h>
70 #endif
72 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
73 present. */
74 #ifndef SA_NOCLDSTOP
75 # define SA_NOCLDSTOP 0
76 /* No sigprocmask. Always 'return' zero. */
77 # define sigprocmask(How, Set, Oset) (0)
78 # define sigset_t int
79 # if ! HAVE_SIGINTERRUPT
80 # define siginterrupt(sig, flag) /* empty */
81 # endif
82 #endif
84 #if !defined OPEN_MAX && defined NR_OPEN
85 # define OPEN_MAX NR_OPEN
86 #endif
87 #if !defined OPEN_MAX
88 # define OPEN_MAX 20
89 #endif
91 #define UCHAR_LIM (UCHAR_MAX + 1)
93 #ifndef DEFAULT_TMPDIR
94 # define DEFAULT_TMPDIR "/tmp"
95 #endif
97 /* Maximum number of lines to merge every time a NODE is taken from
98 the merge queue. Node is at LEVEL in the binary merge tree,
99 and is responsible for merging TOTAL lines. */
100 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
102 /* Heuristic value for the number of lines for which it is worth creating
103 a subthread, during an internal merge sort. I.e., it is a small number
104 of "average" lines for which sorting via two threads is faster than
105 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
106 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
107 lines of gensort -a output is sorted slightly faster with --parallel=2
108 than with --parallel=1. By contrast, using --parallel=1 is about 10%
109 faster than using --parallel=2 with a 64K-line input. */
110 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
111 static_assert (4 <= SUBTHREAD_LINES_HEURISTIC);
113 /* The number of threads after which there are
114 diminishing performance gains. */
115 enum { DEFAULT_MAX_THREADS = 8 };
117 /* Exit statuses. */
118 enum
120 /* POSIX says to exit with status 1 if invoked with -c and the
121 input is not properly sorted. */
122 SORT_OUT_OF_ORDER = 1,
124 /* POSIX says any other irregular exit must exit with a status
125 code greater than 1. */
126 SORT_FAILURE = 2
129 enum
131 /* The number of times we should try to fork a compression process
132 (we retry if the fork call fails). We don't _need_ to compress
133 temp files, this is just to reduce file system access, so this number
134 can be small. Each retry doubles in duration. */
135 MAX_FORK_TRIES_COMPRESS = 4,
137 /* The number of times we should try to fork a decompression process.
138 If we can't fork a decompression process, we can't sort, so this
139 number should be big. Each retry doubles in duration. */
140 MAX_FORK_TRIES_DECOMPRESS = 9
143 enum
145 /* Level of the end-of-merge node, one level above the root. */
146 MERGE_END = 0,
148 /* Level of the root node in merge tree. */
149 MERGE_ROOT = 1
152 /* The representation of the decimal point in the current locale. */
153 static char decimal_point;
155 /* Thousands separator; if outside char range, there is no separator. */
156 static int thousands_sep;
157 /* We currently ignore multi-byte grouping chars. */
158 static bool thousands_sep_ignored;
160 /* Nonzero if the corresponding locales are hard. */
161 static bool hard_LC_COLLATE;
162 #if HAVE_NL_LANGINFO
163 static bool hard_LC_TIME;
164 #endif
166 #define NONZERO(x) ((x) != 0)
168 /* The kind of blanks for '-b' to skip in various options. */
169 enum blanktype { bl_start, bl_end, bl_both };
171 /* The character marking end of line. Default to \n. */
172 static char eolchar = '\n';
174 /* Lines are held in memory as counted strings. */
175 struct line
177 char *text; /* Text of the line. */
178 size_t length; /* Length including final newline. */
179 char *keybeg; /* Start of first key. */
180 char *keylim; /* Limit of first key. */
183 /* Input buffers. */
184 struct buffer
186 char *buf; /* Dynamically allocated buffer,
187 partitioned into 3 regions:
188 - input data;
189 - unused area;
190 - an array of lines, in reverse order. */
191 size_t used; /* Number of bytes used for input data. */
192 size_t nlines; /* Number of lines in the line array. */
193 size_t alloc; /* Number of bytes allocated. */
194 size_t left; /* Number of bytes left from previous reads. */
195 size_t line_bytes; /* Number of bytes to reserve for each line. */
196 bool eof; /* An EOF has been read. */
199 /* Sort key. */
200 struct keyfield
202 size_t sword; /* Zero-origin 'word' to start at. */
203 size_t schar; /* Additional characters to skip. */
204 size_t eword; /* Zero-origin last 'word' of key. */
205 size_t echar; /* Additional characters in field. */
206 bool const *ignore; /* Boolean array of characters to ignore. */
207 char const *translate; /* Translation applied to characters. */
208 bool skipsblanks; /* Skip leading blanks when finding start. */
209 bool skipeblanks; /* Skip leading blanks when finding end. */
210 bool numeric; /* Flag for numeric comparison. Handle
211 strings of digits with optional decimal
212 point, but no exponential notation. */
213 bool random; /* Sort by random hash of key. */
214 bool general_numeric; /* Flag for general, numeric comparison.
215 Handle numbers in exponential notation. */
216 bool human_numeric; /* Flag for sorting by human readable
217 units with either SI or IEC prefixes. */
218 bool month; /* Flag for comparison by month name. */
219 bool reverse; /* Reverse the sense of comparison. */
220 bool version; /* sort by version number */
221 bool traditional_used; /* Traditional key option format is used. */
222 struct keyfield *next; /* Next keyfield to try. */
225 struct month
227 char const *name;
228 int val;
231 /* Binary merge tree node. */
232 struct merge_node
234 struct line *lo; /* Lines to merge from LO child node. */
235 struct line *hi; /* Lines to merge from HI child node. */
236 struct line *end_lo; /* End of available lines from LO. */
237 struct line *end_hi; /* End of available lines from HI. */
238 struct line **dest; /* Pointer to destination of merge. */
239 size_t nlo; /* Total Lines remaining from LO. */
240 size_t nhi; /* Total lines remaining from HI. */
241 struct merge_node *parent; /* Parent node. */
242 struct merge_node *lo_child; /* LO child node. */
243 struct merge_node *hi_child; /* HI child node. */
244 unsigned int level; /* Level in merge tree. */
245 bool queued; /* Node is already in heap. */
246 pthread_mutex_t lock; /* Lock for node operations. */
249 /* Priority queue of merge nodes. */
250 struct merge_node_queue
252 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
253 pthread_mutex_t mutex; /* Lock for queue operations. */
254 pthread_cond_t cond; /* Conditional wait for empty queue to populate
255 when popping. */
258 /* Used to implement --unique (-u). */
259 static struct line saved_line;
261 /* FIXME: None of these tables work with multibyte character sets.
262 Also, there are many other bugs when handling multibyte characters.
263 One way to fix this is to rewrite 'sort' to use wide characters
264 internally, but doing this with good performance is a bit
265 tricky. */
267 /* Table of blanks. */
268 static bool blanks[UCHAR_LIM];
270 /* Table of non-printing characters. */
271 static bool nonprinting[UCHAR_LIM];
273 /* Table of non-dictionary characters (not letters, digits, or blanks). */
274 static bool nondictionary[UCHAR_LIM];
276 /* Translation table folding lower case to upper. */
277 static char fold_toupper[UCHAR_LIM];
279 #define MONTHS_PER_YEAR 12
281 /* Table mapping month names to integers.
282 Alphabetic order allows binary search. */
283 static struct month monthtab[] =
285 {"APR", 4},
286 {"AUG", 8},
287 {"DEC", 12},
288 {"FEB", 2},
289 {"JAN", 1},
290 {"JUL", 7},
291 {"JUN", 6},
292 {"MAR", 3},
293 {"MAY", 5},
294 {"NOV", 11},
295 {"OCT", 10},
296 {"SEP", 9}
299 /* During the merge phase, the number of files to merge at once. */
300 #define NMERGE_DEFAULT 16
302 /* Minimum size for a merge or check buffer. */
303 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
305 /* Minimum sort size; the code might not work with smaller sizes. */
306 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
308 /* The number of bytes needed for a merge or check buffer, which can
309 function relatively efficiently even if it holds only one line. If
310 a longer line is seen, this value is increased. */
311 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
313 /* The approximate maximum number of bytes of main memory to use, as
314 specified by the user. Zero if the user has not specified a size. */
315 static size_t sort_size;
317 /* The initial allocation factor for non-regular files.
318 This is used, e.g., when reading from a pipe.
319 Don't make it too big, since it is multiplied by ~130 to
320 obtain the size of the actual buffer sort will allocate.
321 Also, there may be 8 threads all doing this at the same time. */
322 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
324 /* Array of directory names in which any temporary files are to be created. */
325 static char const **temp_dirs;
327 /* Number of temporary directory names used. */
328 static size_t temp_dir_count;
330 /* Number of allocated slots in temp_dirs. */
331 static size_t temp_dir_alloc;
333 /* Flag to reverse the order of all comparisons. */
334 static bool reverse;
336 /* Flag for stable sort. This turns off the last ditch bytewise
337 comparison of lines, and instead leaves lines in the same order
338 they were read if all keys compare equal. */
339 static bool stable;
341 /* An int value outside char range. */
342 enum { NON_CHAR = CHAR_MAX + 1 };
344 /* If TAB has this value, blanks separate fields. */
345 enum { TAB_DEFAULT = CHAR_MAX + 1 };
347 /* Tab character separating fields. If TAB_DEFAULT, then fields are
348 separated by the empty string between a non-blank character and a blank
349 character. */
350 static int tab = TAB_DEFAULT;
352 /* Flag to remove consecutive duplicate lines from the output.
353 Only the last of a sequence of equal lines will be output. */
354 static bool unique;
356 /* Nonzero if any of the input files are the standard input. */
357 static bool have_read_stdin;
359 /* List of key field comparisons to be tried. */
360 static struct keyfield *keylist;
362 /* Program used to (de)compress temp files. Must accept -d. */
363 static char const *compress_program;
365 /* Annotate the output with extra info to aid the user. */
366 static bool debug;
368 /* Maximum number of files to merge in one go. If more than this
369 number are present, temp files will be used. */
370 static unsigned int nmerge = NMERGE_DEFAULT;
372 /* Output an error to stderr and exit using async-signal-safe routines.
373 This can be used safely from signal handlers,
374 and between fork and exec of multithreaded processes. */
376 static _Noreturn void
377 async_safe_die (int errnum, char const *errstr)
379 ignore_value (write (STDERR_FILENO, errstr, strlen (errstr)));
381 /* Even if defined HAVE_STRERROR_R, we can't use it,
382 as it may return a translated string etc. and even if not
383 may call malloc which is unsafe. We might improve this
384 by testing for sys_errlist and using that if available.
385 For now just report the error number. */
386 if (errnum)
388 char errbuf[INT_BUFSIZE_BOUND (errnum)];
389 char *p = inttostr (errnum, errbuf);
390 ignore_value (write (STDERR_FILENO, ": errno ", 8));
391 ignore_value (write (STDERR_FILENO, p, strlen (p)));
394 ignore_value (write (STDERR_FILENO, "\n", 1));
396 _exit (SORT_FAILURE);
399 /* Report MESSAGE for FILE, then clean up and exit.
400 If FILE is null, it represents standard output. */
402 static void
403 sort_die (char const *message, char const *file)
405 error (SORT_FAILURE, errno, "%s: %s", message,
406 quotef (file ? file : _("standard output")));
409 void
410 usage (int status)
412 if (status != EXIT_SUCCESS)
413 emit_try_help ();
414 else
416 printf (_("\
417 Usage: %s [OPTION]... [FILE]...\n\
418 or: %s [OPTION]... --files0-from=F\n\
420 program_name, program_name);
421 fputs (_("\
422 Write sorted concatenation of all FILE(s) to standard output.\n\
423 "), stdout);
425 emit_stdin_note ();
426 emit_mandatory_arg_note ();
428 fputs (_("\
429 Ordering options:\n\
431 "), stdout);
432 fputs (_("\
433 -b, --ignore-leading-blanks ignore leading blanks\n\
434 -d, --dictionary-order consider only blanks and alphanumeric characters\
436 -f, --ignore-case fold lower case to upper case characters\n\
437 "), stdout);
438 fputs (_("\
439 -g, --general-numeric-sort compare according to general numerical value\n\
440 -i, --ignore-nonprinting consider only printable characters\n\
441 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
442 "), stdout);
443 fputs (_("\
444 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
445 "), stdout);
446 fputs (_("\
447 -n, --numeric-sort compare according to string numerical value;\n\
448 see manual for which strings are supported\n\
449 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
450 --random-source=FILE get random bytes from FILE\n\
451 -r, --reverse reverse the result of comparisons\n\
452 "), stdout);
453 fputs (_("\
454 --sort=WORD sort according to WORD:\n\
455 general-numeric -g, human-numeric -h, month -M,\
457 numeric -n, random -R, version -V\n\
458 -V, --version-sort natural sort of (version) numbers within text\n\
460 "), stdout);
461 fputs (_("\
462 Other options:\n\
464 "), stdout);
465 fputs (_("\
466 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
467 for more use temp files\n\
468 "), stdout);
469 fputs (_("\
470 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
471 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
473 --compress-program=PROG compress temporaries with PROG;\n\
474 decompress them with PROG -d\n\
475 "), stdout);
476 fputs (_("\
477 --debug annotate the part of the line used to sort,\n\
478 and warn about questionable usage to stderr\n\
479 --files0-from=F read input from the files specified by\n\
480 NUL-terminated names in file F;\n\
481 If F is - then read names from standard input\n\
482 "), stdout);
483 fputs (_("\
484 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
485 -m, --merge merge already sorted files; do not sort\n\
486 "), stdout);
487 fputs (_("\
488 -o, --output=FILE write result to FILE instead of standard output\n\
489 -s, --stable stabilize sort by disabling last-resort comparison\
491 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
492 "), stdout);
493 printf (_("\
494 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
495 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
496 multiple options specify multiple directories\n\
497 --parallel=N change the number of sorts run concurrently to N\n\
498 -u, --unique with -c, check for strict ordering;\n\
499 without -c, output only the first of an equal run\
501 "), DEFAULT_TMPDIR);
502 fputs (_("\
503 -z, --zero-terminated line delimiter is NUL, not newline\n\
504 "), stdout);
505 fputs (HELP_OPTION_DESCRIPTION, stdout);
506 fputs (VERSION_OPTION_DESCRIPTION, stdout);
507 fputs (_("\
509 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
510 field number and C a character position in the field; both are origin 1, and\n\
511 the stop position defaults to the line's end. If neither -t nor -b is in\n\
512 effect, characters in a field are counted from the beginning of the preceding\n\
513 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
515 which override global ordering options for that key. If no key is given, use\n\
516 the entire line as the key. Use --debug to diagnose incorrect key usage.\n\
518 SIZE may be followed by the following multiplicative suffixes:\n\
519 "), stdout);
520 fputs (_("\
521 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y, R, Q.\
522 \n\n\
523 *** WARNING ***\n\
524 The locale specified by the environment affects sort order.\n\
525 Set LC_ALL=C to get the traditional sort order that uses\n\
526 native byte values.\n\
527 "), stdout );
528 emit_ancillary_info (PROGRAM_NAME);
531 exit (status);
534 /* For long options that have no equivalent short option, use a
535 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
536 enum
538 CHECK_OPTION = CHAR_MAX + 1,
539 COMPRESS_PROGRAM_OPTION,
540 DEBUG_PROGRAM_OPTION,
541 FILES0_FROM_OPTION,
542 NMERGE_OPTION,
543 RANDOM_SOURCE_OPTION,
544 SORT_OPTION,
545 PARALLEL_OPTION
548 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
550 static struct option const long_options[] =
552 {"ignore-leading-blanks", no_argument, nullptr, 'b'},
553 {"check", optional_argument, nullptr, CHECK_OPTION},
554 {"compress-program", required_argument, nullptr, COMPRESS_PROGRAM_OPTION},
555 {"debug", no_argument, nullptr, DEBUG_PROGRAM_OPTION},
556 {"dictionary-order", no_argument, nullptr, 'd'},
557 {"ignore-case", no_argument, nullptr, 'f'},
558 {"files0-from", required_argument, nullptr, FILES0_FROM_OPTION},
559 {"general-numeric-sort", no_argument, nullptr, 'g'},
560 {"ignore-nonprinting", no_argument, nullptr, 'i'},
561 {"key", required_argument, nullptr, 'k'},
562 {"merge", no_argument, nullptr, 'm'},
563 {"month-sort", no_argument, nullptr, 'M'},
564 {"numeric-sort", no_argument, nullptr, 'n'},
565 {"human-numeric-sort", no_argument, nullptr, 'h'},
566 {"version-sort", no_argument, nullptr, 'V'},
567 {"random-sort", no_argument, nullptr, 'R'},
568 {"random-source", required_argument, nullptr, RANDOM_SOURCE_OPTION},
569 {"sort", required_argument, nullptr, SORT_OPTION},
570 {"output", required_argument, nullptr, 'o'},
571 {"reverse", no_argument, nullptr, 'r'},
572 {"stable", no_argument, nullptr, 's'},
573 {"batch-size", required_argument, nullptr, NMERGE_OPTION},
574 {"buffer-size", required_argument, nullptr, 'S'},
575 {"field-separator", required_argument, nullptr, 't'},
576 {"temporary-directory", required_argument, nullptr, 'T'},
577 {"unique", no_argument, nullptr, 'u'},
578 {"zero-terminated", no_argument, nullptr, 'z'},
579 {"parallel", required_argument, nullptr, PARALLEL_OPTION},
580 {GETOPT_HELP_OPTION_DECL},
581 {GETOPT_VERSION_OPTION_DECL},
582 {nullptr, 0, nullptr, 0},
585 #define CHECK_TABLE \
586 _ct_("quiet", 'C') \
587 _ct_("silent", 'C') \
588 _ct_("diagnose-first", 'c')
590 static char const *const check_args[] =
592 #define _ct_(_s, _c) _s,
593 CHECK_TABLE nullptr
594 #undef _ct_
596 static char const check_types[] =
598 #define _ct_(_s, _c) _c,
599 CHECK_TABLE
600 #undef _ct_
603 #define SORT_TABLE \
604 _st_("general-numeric", 'g') \
605 _st_("human-numeric", 'h') \
606 _st_("month", 'M') \
607 _st_("numeric", 'n') \
608 _st_("random", 'R') \
609 _st_("version", 'V')
611 static char const *const sort_args[] =
613 #define _st_(_s, _c) _s,
614 SORT_TABLE nullptr
615 #undef _st_
617 static char const sort_types[] =
619 #define _st_(_s, _c) _c,
620 SORT_TABLE
621 #undef _st_
624 /* The set of signals that are caught. */
625 static sigset_t caught_signals;
627 /* Critical section status. */
628 struct cs_status
630 bool valid;
631 sigset_t sigs;
634 /* Enter a critical section. */
635 static void
636 cs_enter (struct cs_status *status)
638 int ret = pthread_sigmask (SIG_BLOCK, &caught_signals, &status->sigs);
639 status->valid = ret == 0;
642 /* Leave a critical section. */
643 static void
644 cs_leave (struct cs_status const *status)
646 if (status->valid)
648 /* Ignore failure when restoring the signal mask. */
649 pthread_sigmask (SIG_SETMASK, &status->sigs, nullptr);
653 /* Possible states for a temp file. If compressed, the file's status
654 is unreaped or reaped, depending on whether 'sort' has waited for
655 the subprocess to finish. */
656 enum { UNCOMPRESSED, UNREAPED, REAPED };
658 /* The list of temporary files. */
659 struct tempnode
661 struct tempnode *volatile next;
662 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
663 char state;
664 char name[FLEXIBLE_ARRAY_MEMBER];
666 static struct tempnode *volatile temphead;
667 static struct tempnode *volatile *temptail = &temphead;
669 /* A file to be sorted. */
670 struct sortfile
672 /* The file's name. */
673 char const *name;
675 /* Non-null if this is a temporary file, in which case NAME == TEMP->name. */
676 struct tempnode *temp;
679 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
680 static Hash_table *proctab;
682 enum { INIT_PROCTAB_SIZE = 47 };
684 static size_t
685 proctab_hasher (void const *entry, size_t tabsize)
687 struct tempnode const *node = entry;
688 return node->pid % tabsize;
691 static bool
692 proctab_comparator (void const *e1, void const *e2)
694 struct tempnode const *n1 = e1;
695 struct tempnode const *n2 = e2;
696 return n1->pid == n2->pid;
699 /* The number of unreaped child processes. */
700 static pid_t nprocs;
702 static bool delete_proc (pid_t);
704 /* If PID is positive, wait for the child process with that PID to
705 exit, and assume that PID has already been removed from the process
706 table. If PID is 0 or -1, clean up some child that has exited (by
707 waiting for it, and removing it from the proc table) and return the
708 child's process ID. However, if PID is 0 and no children have
709 exited, return 0 without waiting. */
711 static pid_t
712 reap (pid_t pid)
714 int status;
715 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
717 if (cpid < 0)
718 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
719 quoteaf (compress_program));
720 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
722 if (! WIFEXITED (status) || WEXITSTATUS (status))
723 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
724 quoteaf (compress_program));
725 --nprocs;
728 return cpid;
731 /* TEMP represents a new process; add it to the process table. Create
732 the process table the first time it's called. */
734 static void
735 register_proc (struct tempnode *temp)
737 if (! proctab)
739 proctab = hash_initialize (INIT_PROCTAB_SIZE, nullptr,
740 proctab_hasher,
741 proctab_comparator,
742 nullptr);
743 if (! proctab)
744 xalloc_die ();
747 temp->state = UNREAPED;
749 if (! hash_insert (proctab, temp))
750 xalloc_die ();
753 /* If PID is in the process table, remove it and return true.
754 Otherwise, return false. */
756 static bool
757 delete_proc (pid_t pid)
759 struct tempnode test;
761 test.pid = pid;
762 struct tempnode *node = hash_remove (proctab, &test);
763 if (! node)
764 return false;
765 node->state = REAPED;
766 return true;
769 /* Remove PID from the process table, and wait for it to exit if it
770 hasn't already. */
772 static void
773 wait_proc (pid_t pid)
775 if (delete_proc (pid))
776 reap (pid);
779 /* Reap any exited children. Do not block; reap only those that have
780 already exited. */
782 static void
783 reap_exited (void)
785 while (0 < nprocs && reap (0))
786 continue;
789 /* Reap at least one exited child, waiting if necessary. */
791 static void
792 reap_some (void)
794 reap (-1);
795 reap_exited ();
798 /* Reap all children, waiting if necessary. */
800 static void
801 reap_all (void)
803 while (0 < nprocs)
804 reap (-1);
807 /* Clean up any remaining temporary files. */
809 static void
810 cleanup (void)
812 struct tempnode const *node;
814 for (node = temphead; node; node = node->next)
815 unlink (node->name);
816 temphead = nullptr;
819 /* Cleanup actions to take when exiting. */
821 static void
822 exit_cleanup (void)
824 if (temphead)
826 /* Clean up any remaining temporary files in a critical section so
827 that a signal handler does not try to clean them too. */
828 struct cs_status cs;
829 cs_enter (&cs);
830 cleanup ();
831 cs_leave (&cs);
834 close_stdout ();
837 /* Create a new temporary file, returning its newly allocated tempnode.
838 Store into *PFD the file descriptor open for writing.
839 If the creation fails, return nullptr and store -1 into *PFD if the
840 failure is due to file descriptor exhaustion and
841 SURVIVE_FD_EXHAUSTION; otherwise, die. */
843 static struct tempnode *
844 create_temp_file (int *pfd, bool survive_fd_exhaustion)
846 static char const slashbase[] = "/sortXXXXXX";
847 static size_t temp_dir_index;
848 int fd;
849 int saved_errno;
850 char const *temp_dir = temp_dirs[temp_dir_index];
851 size_t len = strlen (temp_dir);
852 struct tempnode *node =
853 xmalloc (FLEXSIZEOF (struct tempnode, name, len + sizeof slashbase));
854 char *file = node->name;
855 struct cs_status cs;
857 memcpy (file, temp_dir, len);
858 memcpy (file + len, slashbase, sizeof slashbase);
859 node->next = nullptr;
860 if (++temp_dir_index == temp_dir_count)
861 temp_dir_index = 0;
863 /* Create the temporary file in a critical section, to avoid races. */
864 cs_enter (&cs);
865 fd = mkostemp (file, O_CLOEXEC);
866 if (0 <= fd)
868 *temptail = node;
869 temptail = &node->next;
871 saved_errno = errno;
872 cs_leave (&cs);
873 errno = saved_errno;
875 if (fd < 0)
877 if (! (survive_fd_exhaustion && errno == EMFILE))
878 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
879 quoteaf (temp_dir));
880 free (node);
881 node = nullptr;
884 *pfd = fd;
885 return node;
888 /* Return a pointer to stdout status, or nullptr on failure. */
890 static struct stat *
891 get_outstatus (void)
893 static int outstat_errno;
894 static struct stat outstat;
895 if (outstat_errno == 0)
896 outstat_errno = fstat (STDOUT_FILENO, &outstat) == 0 ? -1 : errno;
897 return outstat_errno < 0 ? &outstat : nullptr;
900 /* Return a stream for FILE, opened with mode HOW. If HOW is "w",
901 the file is already open on standard output, and needs to be
902 truncated unless FILE is null. When opening for input, "-"
903 means standard input. To avoid confusion, do not return file
904 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
905 opening an ordinary FILE. Return nullptr if unsuccessful.
907 Use fadvise to specify an access pattern for input files.
908 There are a few hints we could possibly provide,
909 and after careful testing it was decided that
910 specifying FADVISE_SEQUENTIAL was not detrimental
911 to any cases. On Linux 2.6.31, this option doubles
912 the size of read ahead performed and thus was seen to
913 benefit these cases:
914 Merging
915 Sorting with a smaller internal buffer
916 Reading from faster flash devices
918 In _addition_ one could also specify other hints...
920 FADVISE_WILLNEED was tested, but Linux 2.6.31
921 at least uses that to _synchronously_ prepopulate the cache
922 with the specified range. While sort does need to
923 read all of its input before outputting, a synchronous
924 read of the whole file up front precludes any processing
925 that sort could do in parallel with the system doing
926 read ahead of the data. This was seen to have negative effects
927 in a couple of cases:
928 Merging
929 Sorting with a smaller internal buffer
930 This option was seen to shorten the runtime for sort
931 on a multicore system with lots of RAM and other processes
932 competing for CPU. It could be argued that more explicit
933 scheduling hints with 'nice' et. al. are more appropriate
934 for this situation.
936 FADVISE_NOREUSE is a possibility as it could lower
937 the priority of input data in the cache as sort will
938 only need to process it once. However its functionality
939 has changed over Linux kernel versions and as of 2.6.31
940 it does nothing and thus we can't depend on what it might
941 do in future.
943 FADVISE_DONTNEED is not appropriate for user specified
944 input files, but for temp files we do want to drop the
945 cache immediately after processing. This is done implicitly
946 however when the files are unlinked. */
948 static FILE *
949 stream_open (char const *file, char const *how)
951 FILE *fp;
953 if (*how == 'r')
955 if (STREQ (file, "-"))
957 have_read_stdin = true;
958 fp = stdin;
960 else
962 int fd = open (file, O_RDONLY | O_CLOEXEC);
963 fp = fd < 0 ? nullptr : fdopen (fd, how);
965 fadvise (fp, FADVISE_SEQUENTIAL);
967 else if (*how == 'w')
969 if (file && ftruncate (STDOUT_FILENO, 0) != 0)
971 int ftruncate_errno = errno;
972 struct stat *outst = get_outstatus ();
973 if (!outst || S_ISREG (outst->st_mode) || S_TYPEISSHM (outst))
974 error (SORT_FAILURE, ftruncate_errno, _("%s: error truncating"),
975 quotef (file));
977 fp = stdout;
979 else
980 affirm (!"unexpected mode passed to stream_open");
982 return fp;
985 /* Same as stream_open, except always return a non-null value; die on
986 failure. */
988 static FILE *
989 xfopen (char const *file, char const *how)
991 FILE *fp = stream_open (file, how);
992 if (!fp)
993 sort_die (_("open failed"), file);
994 return fp;
997 /* Close FP, whose name is FILE, and report any errors. */
999 static void
1000 xfclose (FILE *fp, char const *file)
1002 switch (fileno (fp))
1004 case STDIN_FILENO:
1005 /* Allow reading stdin from tty more than once. */
1006 clearerr (fp);
1007 break;
1009 case STDOUT_FILENO:
1010 /* Don't close stdout just yet. close_stdout does that. */
1011 if (fflush (fp) != 0)
1012 sort_die (_("fflush failed"), file);
1013 break;
1015 default:
1016 if (fclose (fp) != 0)
1017 sort_die (_("close failed"), file);
1018 break;
1022 /* Move OLDFD to NEWFD. If OLDFD != NEWFD, NEWFD is not close-on-exec. */
1024 static void
1025 move_fd (int oldfd, int newfd)
1027 if (oldfd != newfd)
1029 /* These should never fail for our usage. */
1030 ignore_value (dup2 (oldfd, newfd));
1031 ignore_value (close (oldfd));
1035 /* Fork a child process for piping to and do common cleanup. The
1036 TRIES parameter specifies how many times to try to fork before
1037 giving up. Return the PID of the child, or -1 (setting errno)
1038 on failure. */
1040 static pid_t
1041 pipe_fork (int pipefds[2], size_t tries)
1043 #if HAVE_WORKING_FORK
1044 struct tempnode *saved_temphead;
1045 int saved_errno;
1046 double wait_retry = 0.25;
1047 pid_t pid;
1048 struct cs_status cs;
1050 if (pipe2 (pipefds, O_CLOEXEC) < 0)
1051 return -1;
1053 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1054 uncontrolled subprocess generation can hurt performance significantly.
1055 Allow at most NMERGE + 2 subprocesses, on the theory that there
1056 may be some useful parallelism by letting compression for the
1057 previous merge finish (1 subprocess) in parallel with the current
1058 merge (NMERGE + 1 subprocesses). */
1060 if (nmerge + 1 < nprocs)
1061 reap_some ();
1063 while (tries--)
1065 /* This is so the child process won't delete our temp files
1066 if it receives a signal before exec-ing. */
1067 cs_enter (&cs);
1068 saved_temphead = temphead;
1069 temphead = nullptr;
1071 pid = fork ();
1072 saved_errno = errno;
1073 if (pid)
1074 temphead = saved_temphead;
1076 cs_leave (&cs);
1077 errno = saved_errno;
1079 if (0 <= pid || errno != EAGAIN)
1080 break;
1081 else
1083 xnanosleep (wait_retry);
1084 wait_retry *= 2;
1085 reap_exited ();
1089 if (pid < 0)
1091 saved_errno = errno;
1092 close (pipefds[0]);
1093 close (pipefds[1]);
1094 errno = saved_errno;
1096 else if (pid == 0)
1098 close (STDIN_FILENO);
1099 close (STDOUT_FILENO);
1101 else
1102 ++nprocs;
1104 return pid;
1106 #else /* ! HAVE_WORKING_FORK */
1107 return -1;
1108 #endif
1111 /* Create a temporary file and, if asked for, start a compressor
1112 to that file. Set *PFP to the file handle and return
1113 the address of the new temp node. If the creation
1114 fails, return nullptr if the failure is due to file descriptor
1115 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1117 static struct tempnode *
1118 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1120 int tempfd;
1121 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1122 if (! node)
1123 return nullptr;
1125 node->state = UNCOMPRESSED;
1127 if (compress_program)
1129 int pipefds[2];
1131 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1132 if (0 < node->pid)
1134 close (tempfd);
1135 close (pipefds[0]);
1136 tempfd = pipefds[1];
1138 register_proc (node);
1140 else if (node->pid == 0)
1142 /* Being the child of a multithreaded program before exec,
1143 we're restricted to calling async-signal-safe routines here. */
1144 close (pipefds[1]);
1145 move_fd (tempfd, STDOUT_FILENO);
1146 move_fd (pipefds[0], STDIN_FILENO);
1148 execlp (compress_program, compress_program, (char *) nullptr);
1150 async_safe_die (errno, "couldn't execute compress program");
1154 *pfp = fdopen (tempfd, "w");
1155 if (! *pfp)
1156 sort_die (_("couldn't create temporary file"), node->name);
1158 return node;
1161 /* Create a temporary file and, if asked for, start a compressor
1162 to that file. Set *PFP to the file handle and return the address
1163 of the new temp node. Die on failure. */
1165 static struct tempnode *
1166 create_temp (FILE **pfp)
1168 return maybe_create_temp (pfp, false);
1171 /* Open a compressed temp file and start a decompression process through
1172 which to filter the input. Return nullptr (setting errno to
1173 EMFILE) if we ran out of file descriptors, and die on any other
1174 kind of failure. */
1176 static FILE *
1177 open_temp (struct tempnode *temp)
1179 int tempfd, pipefds[2];
1180 FILE *fp = nullptr;
1182 if (temp->state == UNREAPED)
1183 wait_proc (temp->pid);
1185 tempfd = open (temp->name, O_RDONLY);
1186 if (tempfd < 0)
1187 return nullptr;
1189 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1191 switch (child)
1193 case -1:
1194 if (errno != EMFILE)
1195 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1196 quoteaf (compress_program));
1197 close (tempfd);
1198 errno = EMFILE;
1199 break;
1201 case 0:
1202 /* Being the child of a multithreaded program before exec,
1203 we're restricted to calling async-signal-safe routines here. */
1204 close (pipefds[0]);
1205 move_fd (tempfd, STDIN_FILENO);
1206 move_fd (pipefds[1], STDOUT_FILENO);
1208 execlp (compress_program, compress_program, "-d", (char *) nullptr);
1210 async_safe_die (errno, "couldn't execute compress program (with -d)");
1212 default:
1213 temp->pid = child;
1214 register_proc (temp);
1215 close (tempfd);
1216 close (pipefds[1]);
1218 fp = fdopen (pipefds[0], "r");
1219 if (! fp)
1221 int saved_errno = errno;
1222 close (pipefds[0]);
1223 errno = saved_errno;
1225 break;
1228 return fp;
1231 /* Append DIR to the array of temporary directory names. */
1232 static void
1233 add_temp_dir (char const *dir)
1235 if (temp_dir_count == temp_dir_alloc)
1236 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1238 temp_dirs[temp_dir_count++] = dir;
1241 /* Remove NAME from the list of temporary files. */
1243 static void
1244 zaptemp (char const *name)
1246 struct tempnode *volatile *pnode;
1247 struct tempnode *node;
1248 struct tempnode *next;
1249 int unlink_status;
1250 int unlink_errno = 0;
1251 struct cs_status cs;
1253 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1254 continue;
1256 if (node->state == UNREAPED)
1257 wait_proc (node->pid);
1259 /* Unlink the temporary file in a critical section to avoid races. */
1260 next = node->next;
1261 cs_enter (&cs);
1262 unlink_status = unlink (name);
1263 unlink_errno = errno;
1264 *pnode = next;
1265 cs_leave (&cs);
1267 if (unlink_status != 0)
1268 error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
1269 if (! next)
1270 temptail = pnode;
1271 free (node);
1274 #if HAVE_NL_LANGINFO
1276 static int
1277 struct_month_cmp (void const *m1, void const *m2)
1279 struct month const *month1 = m1;
1280 struct month const *month2 = m2;
1281 return strcmp (month1->name, month2->name);
1284 #endif
1286 /* Initialize the character class tables. */
1288 static void
1289 inittables (void)
1291 size_t i;
1293 for (i = 0; i < UCHAR_LIM; ++i)
1295 blanks[i] = i == '\n' || isblank (i);
1296 nondictionary[i] = ! blanks[i] && ! isalnum (i);
1297 nonprinting[i] = ! isprint (i);
1298 fold_toupper[i] = toupper (i);
1301 #if HAVE_NL_LANGINFO
1302 /* If we're not in the "C" locale, read different names for months. */
1303 if (hard_LC_TIME)
1305 for (i = 0; i < MONTHS_PER_YEAR; i++)
1307 char const *s;
1308 size_t s_len;
1309 size_t j, k;
1310 char *name;
1312 s = nl_langinfo (ABMON_1 + i);
1313 s_len = strlen (s);
1314 monthtab[i].name = name = xmalloc (s_len + 1);
1315 monthtab[i].val = i + 1;
1317 for (j = k = 0; j < s_len; j++)
1318 if (! isblank (to_uchar (s[j])))
1319 name[k++] = fold_toupper[to_uchar (s[j])];
1320 name[k] = '\0';
1322 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1324 #endif
1327 /* Specify how many inputs may be merged at once.
1328 This may be set on the command-line with the
1329 --batch-size option. */
1330 static void
1331 specify_nmerge (int oi, char c, char const *s)
1333 uintmax_t n;
1334 struct rlimit rlimit;
1335 enum strtol_error e = xstrtoumax (s, nullptr, 10, &n, "");
1337 /* Try to find out how many file descriptors we'll be able
1338 to open. We need at least nmerge + 3 (STDIN_FILENO,
1339 STDOUT_FILENO and STDERR_FILENO). */
1340 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1341 ? rlimit.rlim_cur
1342 : OPEN_MAX)
1343 - 3);
1345 if (e == LONGINT_OK)
1347 nmerge = n;
1348 if (nmerge != n)
1349 e = LONGINT_OVERFLOW;
1350 else
1352 if (nmerge < 2)
1354 error (0, 0, _("invalid --%s argument %s"),
1355 long_options[oi].name, quote (s));
1356 error (SORT_FAILURE, 0,
1357 _("minimum --%s argument is %s"),
1358 long_options[oi].name, quote ("2"));
1360 else if (max_nmerge < nmerge)
1362 e = LONGINT_OVERFLOW;
1364 else
1365 return;
1369 if (e == LONGINT_OVERFLOW)
1371 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1372 error (0, 0, _("--%s argument %s too large"),
1373 long_options[oi].name, quote (s));
1374 error (SORT_FAILURE, 0,
1375 _("maximum --%s argument with current rlimit is %s"),
1376 long_options[oi].name,
1377 uinttostr (max_nmerge, max_nmerge_buf));
1379 else
1380 xstrtol_fatal (e, oi, c, long_options, s);
1383 /* Specify the amount of main memory to use when sorting. */
1384 static void
1385 specify_sort_size (int oi, char c, char const *s)
1387 uintmax_t n;
1388 char *suffix;
1389 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPQRtTYZ");
1391 /* The default unit is KiB. */
1392 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1394 if (n <= UINTMAX_MAX / 1024)
1395 n *= 1024;
1396 else
1397 e = LONGINT_OVERFLOW;
1400 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1401 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1402 switch (suffix[0])
1404 case 'b':
1405 e = LONGINT_OK;
1406 break;
1408 case '%':
1410 double mem = physmem_total () * n / 100;
1412 /* Use "<", not "<=", to avoid problems with rounding. */
1413 if (mem < UINTMAX_MAX)
1415 n = mem;
1416 e = LONGINT_OK;
1418 else
1419 e = LONGINT_OVERFLOW;
1421 break;
1424 if (e == LONGINT_OK)
1426 /* If multiple sort sizes are specified, take the maximum, so
1427 that option order does not matter. */
1428 if (n < sort_size)
1429 return;
1431 sort_size = n;
1432 if (sort_size == n)
1434 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1435 return;
1438 e = LONGINT_OVERFLOW;
1441 xstrtol_fatal (e, oi, c, long_options, s);
1444 /* Specify the number of threads to spawn during internal sort. */
1445 static size_t
1446 specify_nthreads (int oi, char c, char const *s)
1448 uintmax_t nthreads;
1449 enum strtol_error e = xstrtoumax (s, nullptr, 10, &nthreads, "");
1450 if (e == LONGINT_OVERFLOW)
1451 return SIZE_MAX;
1452 if (e != LONGINT_OK)
1453 xstrtol_fatal (e, oi, c, long_options, s);
1454 if (SIZE_MAX < nthreads)
1455 nthreads = SIZE_MAX;
1456 if (nthreads == 0)
1457 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1458 return nthreads;
1461 /* Return the default sort size. */
1462 static size_t
1463 default_sort_size (void)
1465 /* Let SIZE be MEM, but no more than the maximum object size,
1466 total memory, or system resource limits. Don't bother to check
1467 for values like RLIM_INFINITY since in practice they are not much
1468 less than SIZE_MAX. */
1469 size_t size = SIZE_MAX;
1470 struct rlimit rlimit;
1471 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1472 size = rlimit.rlim_cur;
1473 #ifdef RLIMIT_AS
1474 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1475 size = rlimit.rlim_cur;
1476 #endif
1478 /* Leave a large safety margin for the above limits, as failure can
1479 occur when they are exceeded. */
1480 size /= 2;
1482 #ifdef RLIMIT_RSS
1483 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1484 Exceeding RSS is not fatal, but can be quite slow. */
1485 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1486 size = rlimit.rlim_cur / 16 * 15;
1487 #endif
1489 /* Let MEM be available memory or 1/8 of total memory, whichever
1490 is greater. */
1491 double avail = physmem_available ();
1492 double total = physmem_total ();
1493 double mem = MAX (avail, total / 8);
1495 /* Leave a 1/4 margin for physical memory. */
1496 if (total * 0.75 < size)
1497 size = total * 0.75;
1499 /* Return the minimum of MEM and SIZE, but no less than
1500 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1501 right when only one argument is floating point. */
1502 if (mem < size)
1503 size = mem;
1504 return MAX (size, MIN_SORT_SIZE);
1507 /* Return the sort buffer size to use with the input files identified
1508 by FPS and FILES, which are alternate names of the same files.
1509 NFILES gives the number of input files; NFPS may be less. Assume
1510 that each input line requires LINE_BYTES extra bytes' worth of line
1511 information. Do not exceed the size bound specified by the user
1512 (or a default size bound, if the user does not specify one). */
1514 static size_t
1515 sort_buffer_size (FILE *const *fps, size_t nfps,
1516 char *const *files, size_t nfiles,
1517 size_t line_bytes)
1519 /* A bound on the input size. If zero, the bound hasn't been
1520 determined yet. */
1521 static size_t size_bound;
1523 /* In the worst case, each input byte is a newline. */
1524 size_t worst_case_per_input_byte = line_bytes + 1;
1526 /* Keep enough room for one extra input line and an extra byte.
1527 This extra room might be needed when preparing to read EOF. */
1528 size_t size = worst_case_per_input_byte + 1;
1530 for (size_t i = 0; i < nfiles; i++)
1532 struct stat st;
1533 off_t file_size;
1534 size_t worst_case;
1536 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1537 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1538 : stat (files[i], &st))
1539 != 0)
1540 sort_die (_("stat failed"), files[i]);
1542 if (usable_st_size (&st) && 0 < st.st_size)
1543 file_size = st.st_size;
1544 else
1546 /* The file has unknown size. If the user specified a sort
1547 buffer size, use that; otherwise, guess the size. */
1548 if (sort_size)
1549 return sort_size;
1550 file_size = INPUT_FILE_SIZE_GUESS;
1553 if (! size_bound)
1555 size_bound = sort_size;
1556 if (! size_bound)
1557 size_bound = default_sort_size ();
1560 /* Add the amount of memory needed to represent the worst case
1561 where the input consists entirely of newlines followed by a
1562 single non-newline. Check for overflow. */
1563 worst_case = file_size * worst_case_per_input_byte + 1;
1564 if (file_size != worst_case / worst_case_per_input_byte
1565 || size_bound - size <= worst_case)
1566 return size_bound;
1567 size += worst_case;
1570 return size;
1573 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1574 must be at least sizeof (struct line). Allocate ALLOC bytes
1575 initially. */
1577 static void
1578 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1580 /* Ensure that the line array is properly aligned. If the desired
1581 size cannot be allocated, repeatedly halve it until allocation
1582 succeeds. The smaller allocation may hurt overall performance,
1583 but that's better than failing. */
1584 while (true)
1586 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1587 buf->buf = malloc (alloc);
1588 if (buf->buf)
1589 break;
1590 alloc /= 2;
1591 if (alloc <= line_bytes + 1)
1592 xalloc_die ();
1595 buf->line_bytes = line_bytes;
1596 buf->alloc = alloc;
1597 buf->used = buf->left = buf->nlines = 0;
1598 buf->eof = false;
1601 /* Return one past the limit of the line array. */
1603 static inline struct line *
1604 buffer_linelim (struct buffer const *buf)
1606 void *linelim = buf->buf + buf->alloc;
1607 return linelim;
1610 /* Return a pointer to the first character of the field specified
1611 by KEY in LINE. */
1613 static char *
1614 begfield (struct line const *line, struct keyfield const *key)
1616 char *ptr = line->text, *lim = ptr + line->length - 1;
1617 size_t sword = key->sword;
1618 size_t schar = key->schar;
1620 /* The leading field separator itself is included in a field when -t
1621 is absent. */
1623 if (tab != TAB_DEFAULT)
1624 while (ptr < lim && sword--)
1626 while (ptr < lim && *ptr != tab)
1627 ++ptr;
1628 if (ptr < lim)
1629 ++ptr;
1631 else
1632 while (ptr < lim && sword--)
1634 while (ptr < lim && blanks[to_uchar (*ptr)])
1635 ++ptr;
1636 while (ptr < lim && !blanks[to_uchar (*ptr)])
1637 ++ptr;
1640 /* If we're ignoring leading blanks when computing the Start
1641 of the field, skip past them here. */
1642 if (key->skipsblanks)
1643 while (ptr < lim && blanks[to_uchar (*ptr)])
1644 ++ptr;
1646 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1647 ptr = MIN (lim, ptr + schar);
1649 return ptr;
1652 /* Return the limit of (a pointer to the first character after) the field
1653 in LINE specified by KEY. */
1655 ATTRIBUTE_PURE
1656 static char *
1657 limfield (struct line const *line, struct keyfield const *key)
1659 char *ptr = line->text, *lim = ptr + line->length - 1;
1660 size_t eword = key->eword, echar = key->echar;
1662 if (echar == 0)
1663 eword++; /* Skip all of end field. */
1665 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1666 whichever comes first. If there are more than EWORD fields, leave
1667 PTR pointing at the beginning of the field having zero-based index,
1668 EWORD. If a delimiter character was specified (via -t), then that
1669 'beginning' is the first character following the delimiting TAB.
1670 Otherwise, leave PTR pointing at the first 'blank' character after
1671 the preceding field. */
1672 if (tab != TAB_DEFAULT)
1673 while (ptr < lim && eword--)
1675 while (ptr < lim && *ptr != tab)
1676 ++ptr;
1677 if (ptr < lim && (eword || echar))
1678 ++ptr;
1680 else
1681 while (ptr < lim && eword--)
1683 while (ptr < lim && blanks[to_uchar (*ptr)])
1684 ++ptr;
1685 while (ptr < lim && !blanks[to_uchar (*ptr)])
1686 ++ptr;
1689 #ifdef POSIX_UNSPECIFIED
1690 /* The following block of code makes GNU sort incompatible with
1691 standard Unix sort, so it's ifdef'd out for now.
1692 The POSIX spec isn't clear on how to interpret this.
1693 FIXME: request clarification.
1695 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1696 Date: Thu, 30 May 96 12:20:41 -0400
1697 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1699 [...]I believe I've found another bug in 'sort'.
1701 $ cat /tmp/sort.in
1702 a b c 2 d
1703 pq rs 1 t
1704 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1705 a b c 2 d
1706 pq rs 1 t
1707 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1708 pq rs 1 t
1709 a b c 2 d
1711 Unix sort produced the answer I expected: sort on the single character
1712 in column 7. GNU sort produced different results, because it disagrees
1713 on the interpretation of the key-end spec "M.N". Unix sort reads this
1714 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1715 "skip M-1 fields, then either N-1 characters or the rest of the current
1716 field, whichever comes first". This extra clause applies only to
1717 key-ends, not key-starts.
1720 /* Make LIM point to the end of (one byte past) the current field. */
1721 if (tab != TAB_DEFAULT)
1723 char *newlim;
1724 newlim = memchr (ptr, tab, lim - ptr);
1725 if (newlim)
1726 lim = newlim;
1728 else
1730 char *newlim;
1731 newlim = ptr;
1732 while (newlim < lim && blanks[to_uchar (*newlim)])
1733 ++newlim;
1734 while (newlim < lim && !blanks[to_uchar (*newlim)])
1735 ++newlim;
1736 lim = newlim;
1738 #endif
1740 if (echar != 0) /* We need to skip over a portion of the end field. */
1742 /* If we're ignoring leading blanks when computing the End
1743 of the field, skip past them here. */
1744 if (key->skipeblanks)
1745 while (ptr < lim && blanks[to_uchar (*ptr)])
1746 ++ptr;
1748 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1749 ptr = MIN (lim, ptr + echar);
1752 return ptr;
1755 /* Fill BUF reading from FP, moving buf->left bytes from the end
1756 of buf->buf to the beginning first. If EOF is reached and the
1757 file wasn't terminated by a newline, supply one. Set up BUF's line
1758 table too. FILE is the name of the file corresponding to FP.
1759 Return true if some input was read. */
1761 static bool
1762 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1764 struct keyfield const *key = keylist;
1765 char eol = eolchar;
1766 size_t line_bytes = buf->line_bytes;
1767 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1769 if (buf->eof)
1770 return false;
1772 if (buf->used != buf->left)
1774 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1775 buf->used = buf->left;
1776 buf->nlines = 0;
1779 while (true)
1781 char *ptr = buf->buf + buf->used;
1782 struct line *linelim = buffer_linelim (buf);
1783 struct line *line = linelim - buf->nlines;
1784 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1785 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1787 while (line_bytes + 1 < avail)
1789 /* Read as many bytes as possible, but do not read so many
1790 bytes that there might not be enough room for the
1791 corresponding line array. The worst case is when the
1792 rest of the input file consists entirely of newlines,
1793 except that the last byte is not a newline. */
1794 size_t readsize = (avail - 1) / (line_bytes + 1);
1795 size_t bytes_read = fread (ptr, 1, readsize, fp);
1796 char *ptrlim = ptr + bytes_read;
1797 char *p;
1798 avail -= bytes_read;
1800 if (bytes_read != readsize)
1802 if (ferror (fp))
1803 sort_die (_("read failed"), file);
1804 if (feof (fp))
1806 buf->eof = true;
1807 if (buf->buf == ptrlim)
1808 return false;
1809 if (line_start != ptrlim && ptrlim[-1] != eol)
1810 *ptrlim++ = eol;
1814 /* Find and record each line in the just-read input. */
1815 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1817 /* Delimit the line with NUL. This eliminates the need to
1818 temporarily replace the last byte with NUL when calling
1819 xmemcoll, which increases performance. */
1820 *p = '\0';
1821 ptr = p + 1;
1822 line--;
1823 line->text = line_start;
1824 line->length = ptr - line_start;
1825 mergesize = MAX (mergesize, line->length);
1826 avail -= line_bytes;
1828 if (key)
1830 /* Precompute the position of the first key for
1831 efficiency. */
1832 line->keylim = (key->eword == SIZE_MAX
1834 : limfield (line, key));
1836 if (key->sword != SIZE_MAX)
1837 line->keybeg = begfield (line, key);
1838 else
1840 if (key->skipsblanks)
1841 while (blanks[to_uchar (*line_start)])
1842 line_start++;
1843 line->keybeg = line_start;
1847 line_start = ptr;
1850 ptr = ptrlim;
1851 if (buf->eof)
1852 break;
1855 buf->used = ptr - buf->buf;
1856 buf->nlines = buffer_linelim (buf) - line;
1857 if (buf->nlines != 0)
1859 buf->left = ptr - line_start;
1860 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1861 return true;
1865 /* The current input line is too long to fit in the buffer.
1866 Increase the buffer size and try again, keeping it properly
1867 aligned. */
1868 size_t line_alloc = buf->alloc / sizeof (struct line);
1869 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1870 buf->alloc = line_alloc * sizeof (struct line);
1875 /* Table that maps characters to order-of-magnitude values. */
1876 static char const unit_order[UCHAR_LIM] =
1878 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1879 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'R' == 82 && 'Q' == 81 \
1880 && 'k' == 107)
1881 /* This initializer syntax works on all C99 hosts. For now, use
1882 it only on non-ASCII hosts, to ease the pain of porting to
1883 pre-C99 ASCII hosts. */
1884 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1885 ['R']=9, ['Q']=10,
1886 ['k']=1,
1887 #else
1888 /* Generate the following table with this command:
1889 perl -e 'my %a=(k=>1,K=>1,M=>2,G=>3,T=>4,P=>5,E=>6,Z=>7,Y=>8,R=>9,Q=>10);
1890 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1891 |fmt */
1892 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1893 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1894 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1895 0, 0, 0, 1, 0, 2, 0, 0, 5, 10, 9, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1902 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1903 #endif
1906 /* Traverse number given as *number consisting of digits, thousands_sep, and
1907 decimal_point chars only. Returns the highest digit found in the number,
1908 or '\0' if no digit has been found. Upon return *number points at the
1909 character that immediately follows after the given number. */
1910 static char
1911 traverse_raw_number (char const **number)
1913 char const *p = *number;
1914 char ch;
1915 char max_digit = '\0';
1916 bool ends_with_thousands_sep = false;
1918 /* Scan to end of number.
1919 Decimals or separators not followed by digits stop the scan.
1920 Numbers ending in decimals or separators are thus considered
1921 to be lacking in units.
1922 FIXME: add support for multibyte thousands_sep and decimal_point. */
1924 while (ISDIGIT (ch = *p++))
1926 if (max_digit < ch)
1927 max_digit = ch;
1929 /* Allow to skip only one occurrence of thousands_sep to avoid finding
1930 the unit in the next column in case thousands_sep matches as blank
1931 and is used as column delimiter. */
1932 ends_with_thousands_sep = (*p == thousands_sep);
1933 if (ends_with_thousands_sep)
1934 ++p;
1937 if (ends_with_thousands_sep)
1939 /* thousands_sep not followed by digit is not allowed. */
1940 *number = p - 2;
1941 return max_digit;
1944 if (ch == decimal_point)
1945 while (ISDIGIT (ch = *p++))
1946 if (max_digit < ch)
1947 max_digit = ch;
1949 *number = p - 1;
1950 return max_digit;
1953 /* Return an integer that represents the order of magnitude of the
1954 unit following the number. The number may contain thousands
1955 separators and a decimal point, but it may not contain leading blanks.
1956 Negative numbers get negative orders; zero numbers have a zero order. */
1958 ATTRIBUTE_PURE
1959 static int
1960 find_unit_order (char const *number)
1962 bool minus_sign = (*number == '-');
1963 char const *p = number + minus_sign;
1964 char max_digit = traverse_raw_number (&p);
1965 if ('0' < max_digit)
1967 unsigned char ch = *p;
1968 int order = unit_order[ch];
1969 return (minus_sign ? -order : order);
1971 else
1972 return 0;
1975 /* Compare numbers A and B ending in units with SI or IEC prefixes
1976 <none/unknown> < K/k < M < G < T < P < E < Z < Y < R < Q */
1978 ATTRIBUTE_PURE
1979 static int
1980 human_numcompare (char const *a, char const *b)
1982 while (blanks[to_uchar (*a)])
1983 a++;
1984 while (blanks[to_uchar (*b)])
1985 b++;
1987 int diff = find_unit_order (a) - find_unit_order (b);
1988 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1991 /* Compare strings A and B as numbers without explicitly converting them to
1992 machine numbers. Comparatively slow for short strings, but asymptotically
1993 hideously fast. */
1995 ATTRIBUTE_PURE
1996 static int
1997 numcompare (char const *a, char const *b)
1999 while (blanks[to_uchar (*a)])
2000 a++;
2001 while (blanks[to_uchar (*b)])
2002 b++;
2004 return strnumcmp (a, b, decimal_point, thousands_sep);
2007 static int
2008 nan_compare (long double a, long double b)
2010 char buf[2][sizeof "-nan""()" + CHAR_BIT * sizeof a];
2011 snprintf (buf[0], sizeof buf[0], "%Lf", a);
2012 snprintf (buf[1], sizeof buf[1], "%Lf", b);
2013 return strcmp (buf[0], buf[1]);
2016 static int
2017 general_numcompare (char const *sa, char const *sb)
2019 /* FIXME: maybe add option to try expensive FP conversion
2020 only if A and B can't be compared more cheaply/accurately. */
2022 char *ea;
2023 char *eb;
2024 long double a = strtold (sa, &ea);
2025 long double b = strtold (sb, &eb);
2027 /* Put conversion errors at the start of the collating sequence. */
2028 if (sa == ea)
2029 return sb == eb ? 0 : -1;
2030 if (sb == eb)
2031 return 1;
2033 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
2034 conversion errors but before numbers; sort them by internal
2035 bit-pattern, for lack of a more portable alternative. */
2036 return (a < b ? -1
2037 : a > b ? 1
2038 : a == b ? 0
2039 : b == b ? -1
2040 : a == a ? 1
2041 : nan_compare (a, b));
2044 /* Return an integer in 1..12 of the month name MONTH.
2045 Return 0 if the name in S is not recognized. */
2047 static int
2048 getmonth (char const *month, char **ea)
2050 size_t lo = 0;
2051 size_t hi = MONTHS_PER_YEAR;
2053 while (blanks[to_uchar (*month)])
2054 month++;
2058 size_t ix = (lo + hi) / 2;
2059 char const *m = month;
2060 char const *n = monthtab[ix].name;
2062 for (;; m++, n++)
2064 if (!*n)
2066 if (ea)
2067 *ea = (char *) m;
2068 return monthtab[ix].val;
2070 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2072 hi = ix;
2073 break;
2075 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2077 lo = ix + 1;
2078 break;
2082 while (lo < hi);
2084 return 0;
2087 /* When using the OpenSSL implementation, dynamically link only if -R.
2088 This saves startup time in the usual (sans -R) case. */
2090 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2091 /* In the typical case where md5.h does not #undef HAVE_OPENSSL_MD5,
2092 trick md5.h into declaring and using pointers to functions not functions.
2093 This causes the compiler's -lcrypto option to have no effect,
2094 as sort.o no longer uses any crypto symbols statically. */
2096 # if 4 < __GNUC__ + (8 <= __GNUC_MINOR__)
2097 /* Pacify gcc -Wmissing-variable-declarations through at least GCC 14. */
2098 # pragma GCC diagnostic push
2099 # pragma GCC diagnostic ignored "-Wmissing-variable-declarations"
2100 # endif
2101 # define MD5_Init (*ptr_MD5_Init)
2102 # define MD5_Update (*ptr_MD5_Update)
2103 # define MD5_Final (*ptr_MD5_Final)
2104 #endif
2106 #include "md5.h"
2108 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2109 # include <dlfcn.h>
2111 /* Diagnose a dynamic linking failure. */
2112 static void
2113 link_failure (void)
2115 error (SORT_FAILURE, 0, "%s", dlerror ());
2118 /* Return a function pointer in HANDLE for SYMBOL. */
2119 static void *
2120 symbol_address (void *handle, char const *symbol)
2122 void *address = dlsym (handle, symbol);
2123 if (!address)
2124 link_failure ();
2125 return address;
2127 #endif
2129 /* Dynamically link the crypto library, if it needs linking. */
2130 static void
2131 link_libcrypto (void)
2133 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2134 void *handle = dlopen (LIBCRYPTO_SONAME, RTLD_LAZY | RTLD_GLOBAL);
2135 if (!handle)
2136 link_failure ();
2137 ptr_MD5_Init = symbol_address (handle, "MD5_Init");
2138 ptr_MD5_Update = symbol_address (handle, "MD5_Update");
2139 ptr_MD5_Final = symbol_address (handle, "MD5_Final");
2140 # if 4 < __GNUC__ + (8 <= __GNUC_MINOR__)
2141 # pragma GCC diagnostic pop
2142 # endif
2143 #endif
2146 /* A randomly chosen MD5 state, used for random comparison. */
2147 static struct md5_ctx random_md5_state;
2149 /* Initialize the randomly chosen MD5 state. */
2151 static void
2152 random_md5_state_init (char const *random_source)
2154 unsigned char buf[MD5_DIGEST_SIZE];
2155 struct randread_source *r = randread_new (random_source, sizeof buf);
2156 if (! r)
2157 sort_die (_("open failed"), random_source ? random_source : "getrandom");
2158 randread (r, buf, sizeof buf);
2159 if (randread_free (r) != 0)
2160 sort_die (_("close failed"), random_source);
2161 link_libcrypto ();
2162 md5_init_ctx (&random_md5_state);
2163 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2166 /* This is like strxfrm, except it reports any error and exits. */
2168 static size_t
2169 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2171 errno = 0;
2172 size_t translated_size = strxfrm (dest, src, destsize);
2174 if (errno)
2176 error (0, errno, _("string transformation failed"));
2177 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2178 error (SORT_FAILURE, 0,
2179 _("the original string was %s"),
2180 quotearg_n_style (0, locale_quoting_style, src));
2183 return translated_size;
2186 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2187 using one or more random hash functions. TEXTA[LENA] and
2188 TEXTB[LENB] must be zero. */
2190 static int
2191 compare_random (char *restrict texta, size_t lena,
2192 char *restrict textb, size_t lenb)
2194 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2195 data. This is used to break ties if there is a checksum
2196 collision, and this is good enough given the astronomically low
2197 probability of a collision. */
2198 int xfrm_diff = 0;
2200 char stackbuf[4000];
2201 char *buf = stackbuf;
2202 size_t bufsize = sizeof stackbuf;
2203 void *allocated = nullptr;
2204 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2205 struct md5_ctx s[2];
2206 s[0] = s[1] = random_md5_state;
2208 if (hard_LC_COLLATE)
2210 char const *lima = texta + lena;
2211 char const *limb = textb + lenb;
2213 while (true)
2215 /* Transform the text into the basis of comparison, so that byte
2216 strings that would otherwise considered to be equal are
2217 considered equal here even if their bytes differ.
2219 Each time through this loop, transform one
2220 null-terminated string's worth from TEXTA or from TEXTB
2221 or both. That way, there's no need to store the
2222 transformation of the whole line, if it contains many
2223 null-terminated strings. */
2225 /* Store the transformed data into a big-enough buffer. */
2227 /* A 3X size guess avoids the overhead of calling strxfrm
2228 twice on typical implementations. Don't worry about
2229 size_t overflow, as the guess need not be correct. */
2230 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2231 if (bufsize < guess_bufsize)
2233 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2234 free (allocated);
2235 buf = allocated = malloc (bufsize);
2236 if (! buf)
2238 buf = stackbuf;
2239 bufsize = sizeof stackbuf;
2243 size_t sizea =
2244 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2245 bool a_fits = sizea <= bufsize;
2246 size_t sizeb =
2247 (textb < limb
2248 ? (xstrxfrm ((a_fits ? buf + sizea : nullptr), textb,
2249 (a_fits ? bufsize - sizea : 0))
2250 + 1)
2251 : 0);
2253 if (! (a_fits && sizea + sizeb <= bufsize))
2255 bufsize = sizea + sizeb;
2256 if (bufsize < SIZE_MAX / 3)
2257 bufsize = bufsize * 3 / 2;
2258 free (allocated);
2259 buf = allocated = xmalloc (bufsize);
2260 if (texta < lima)
2261 strxfrm (buf, texta, sizea);
2262 if (textb < limb)
2263 strxfrm (buf + sizea, textb, sizeb);
2266 /* Advance past NULs to the next part of each input string,
2267 exiting the loop if both strings are exhausted. When
2268 exiting the loop, prepare to finish off the tiebreaker
2269 comparison properly. */
2270 if (texta < lima)
2271 texta += strlen (texta) + 1;
2272 if (textb < limb)
2273 textb += strlen (textb) + 1;
2274 if (! (texta < lima || textb < limb))
2276 lena = sizea; texta = buf;
2277 lenb = sizeb; textb = buf + sizea;
2278 break;
2281 /* Accumulate the transformed data in the corresponding
2282 checksums. */
2283 md5_process_bytes (buf, sizea, &s[0]);
2284 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2286 /* Update the tiebreaker comparison of the transformed data. */
2287 if (! xfrm_diff)
2289 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2290 if (! xfrm_diff)
2291 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2296 /* Compute and compare the checksums. */
2297 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2298 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2299 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2301 /* Fall back on the tiebreaker if the checksums collide. */
2302 if (! diff)
2304 if (! xfrm_diff)
2306 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2307 if (! xfrm_diff)
2308 xfrm_diff = (lena > lenb) - (lena < lenb);
2311 diff = xfrm_diff;
2314 free (allocated);
2316 return diff;
2319 /* Return the printable width of the block of memory starting at
2320 TEXT and ending just before LIM, counting each tab as one byte.
2321 FIXME: Should we generally be counting non printable chars? */
2323 static size_t
2324 debug_width (char const *text, char const *lim)
2326 size_t width = mbsnwidth (text, lim - text, 0);
2327 while (text < lim)
2328 width += (*text++ == '\t');
2329 return width;
2332 /* For debug mode, "underline" a key at the
2333 specified offset and screen width. */
2335 static void
2336 mark_key (size_t offset, size_t width)
2338 while (offset--)
2339 putchar (' ');
2341 if (!width)
2342 printf (_("^ no match for key\n"));
2343 else
2346 putchar ('_');
2347 while (--width);
2349 putchar ('\n');
2353 /* Return true if KEY is a numeric key. */
2355 static inline bool
2356 key_numeric (struct keyfield const *key)
2358 return key->numeric || key->general_numeric || key->human_numeric;
2361 /* For LINE, output a debugging line that underlines KEY in LINE.
2362 If KEY is null, underline the whole line. */
2364 static void
2365 debug_key (struct line const *line, struct keyfield const *key)
2367 char *text = line->text;
2368 char *beg = text;
2369 char *lim = text + line->length - 1;
2371 if (key)
2373 if (key->sword != SIZE_MAX)
2374 beg = begfield (line, key);
2375 if (key->eword != SIZE_MAX)
2376 lim = limfield (line, key);
2378 if ((key->skipsblanks && key->sword == SIZE_MAX)
2379 || key->month || key_numeric (key))
2381 char saved = *lim;
2382 *lim = '\0';
2384 while (blanks[to_uchar (*beg)])
2385 beg++;
2387 char *tighter_lim = beg;
2389 if (lim < beg)
2390 tighter_lim = lim;
2391 else if (key->month)
2392 getmonth (beg, &tighter_lim);
2393 else if (key->general_numeric)
2394 ignore_value (strtold (beg, &tighter_lim));
2395 else if (key->numeric || key->human_numeric)
2397 char const *p = beg + (beg < lim && *beg == '-');
2398 char max_digit = traverse_raw_number (&p);
2399 if ('0' <= max_digit)
2401 unsigned char ch = *p;
2402 tighter_lim = (char *) p
2403 + (key->human_numeric && unit_order[ch]);
2406 else
2407 tighter_lim = lim;
2409 *lim = saved;
2410 lim = tighter_lim;
2414 size_t offset = debug_width (text, beg);
2415 size_t width = debug_width (beg, lim);
2416 mark_key (offset, width);
2419 /* Debug LINE by underlining its keys. */
2421 static void
2422 debug_line (struct line const *line)
2424 struct keyfield const *key = keylist;
2427 debug_key (line, key);
2428 while (key && ((key = key->next) || ! (unique || stable)));
2431 /* Return whether sorting options specified for key. */
2433 static bool
2434 default_key_compare (struct keyfield const *key)
2436 return ! (key->ignore
2437 || key->translate
2438 || key->skipsblanks
2439 || key->skipeblanks
2440 || key_numeric (key)
2441 || key->month
2442 || key->version
2443 || key->random
2444 /* || key->reverse */
2448 /* Convert a key to the short options used to specify it. */
2450 static void
2451 key_to_opts (struct keyfield const *key, char *opts)
2453 if (key->skipsblanks || key->skipeblanks)
2454 *opts++ = 'b';/* either disables global -b */
2455 if (key->ignore == nondictionary)
2456 *opts++ = 'd';
2457 if (key->translate)
2458 *opts++ = 'f';
2459 if (key->general_numeric)
2460 *opts++ = 'g';
2461 if (key->human_numeric)
2462 *opts++ = 'h';
2463 if (key->ignore == nonprinting)
2464 *opts++ = 'i';
2465 if (key->month)
2466 *opts++ = 'M';
2467 if (key->numeric)
2468 *opts++ = 'n';
2469 if (key->random)
2470 *opts++ = 'R';
2471 if (key->reverse)
2472 *opts++ = 'r';
2473 if (key->version)
2474 *opts++ = 'V';
2475 *opts = '\0';
2478 /* Output data independent key warnings to stderr. */
2480 static void
2481 key_warnings (struct keyfield const *gkey, bool gkey_only)
2483 struct keyfield const *key;
2484 struct keyfield ugkey = *gkey;
2485 unsigned long keynum = 1;
2486 bool basic_numeric_field = false;
2487 bool general_numeric_field = false;
2488 bool basic_numeric_field_span = false;
2489 bool general_numeric_field_span = false;
2491 for (key = keylist; key; key = key->next, keynum++)
2493 if (key_numeric (key))
2495 if (key->general_numeric)
2496 general_numeric_field = true;
2497 else
2498 basic_numeric_field = true;
2501 if (key->traditional_used)
2503 size_t sword = key->sword;
2504 size_t eword = key->eword;
2505 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2506 /* obsolescent syntax +A.x -B.y is equivalent to:
2507 -k A+1.x+1,B.y (when y = 0)
2508 -k A+1.x+1,B+1.y (when y > 0) */
2509 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2510 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2511 char *po = obuf;
2512 char *pn = nbuf;
2514 if (sword == SIZE_MAX)
2515 sword++;
2517 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2518 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2519 if (key->eword != SIZE_MAX)
2521 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2522 stpcpy (stpcpy (pn, ","),
2523 umaxtostr (eword + 1
2524 + (key->echar == SIZE_MAX), tmp));
2526 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2527 quote_n (0, obuf), quote_n (1, nbuf));
2530 /* Warn about field specs that will never match. */
2531 bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword;
2532 if (zero_width)
2533 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2535 /* Warn about significant leading blanks. */
2536 bool implicit_skip = key_numeric (key) || key->month;
2537 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2538 if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
2539 && ((!key->skipsblanks && !implicit_skip)
2540 || (!key->skipsblanks && key->schar)
2541 || (!key->skipeblanks && key->echar)))
2542 error (0, 0, _("leading blanks are significant in key %lu; "
2543 "consider also specifying 'b'"), keynum);
2545 /* Warn about numeric comparisons spanning fields,
2546 as field delimiters could be interpreted as part
2547 of the number (maybe only in other locales). */
2548 if (!gkey_only && key_numeric (key))
2550 size_t sword = key->sword + 1;
2551 size_t eword = key->eword + 1;
2552 if (!sword)
2553 sword++;
2554 if (!eword || sword < eword)
2556 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2557 keynum);
2558 if (key->general_numeric)
2559 general_numeric_field_span = true;
2560 else
2561 basic_numeric_field_span = true;
2565 /* Flag global options not copied or specified in any key. */
2566 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2567 ugkey.ignore = nullptr;
2568 if (ugkey.translate && (ugkey.translate == key->translate))
2569 ugkey.translate = nullptr;
2570 ugkey.skipsblanks &= !key->skipsblanks;
2571 ugkey.skipeblanks &= !key->skipeblanks;
2572 ugkey.month &= !key->month;
2573 ugkey.numeric &= !key->numeric;
2574 ugkey.general_numeric &= !key->general_numeric;
2575 ugkey.human_numeric &= !key->human_numeric;
2576 ugkey.random &= !key->random;
2577 ugkey.version &= !key->version;
2578 ugkey.reverse &= !key->reverse;
2581 /* Explicitly warn if field delimiters in this locale
2582 don't constrain numbers. */
2583 bool number_locale_warned = false;
2584 if (basic_numeric_field_span)
2586 if (tab == TAB_DEFAULT
2587 ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))
2588 : tab == thousands_sep)
2590 error (0, 0,
2591 _("field separator %s is treated as a "
2592 "group separator in numbers"),
2593 quote (((char []) {thousands_sep, 0})));
2594 number_locale_warned = true;
2597 if (basic_numeric_field_span || general_numeric_field_span)
2599 if (tab == TAB_DEFAULT
2600 ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))
2601 : tab == decimal_point)
2603 error (0, 0,
2604 _("field separator %s is treated as a "
2605 "decimal point in numbers"),
2606 quote (((char []) {decimal_point, 0})));
2607 number_locale_warned = true;
2609 else if (tab == '-')
2611 error (0, 0,
2612 _("field separator %s is treated as a "
2613 "minus sign in numbers"),
2614 quote (((char []) {tab, 0})));
2616 else if (general_numeric_field_span && tab == '+')
2618 error (0, 0,
2619 _("field separator %s is treated as a "
2620 "plus sign in numbers"),
2621 quote (((char []) {tab, 0})));
2625 /* Explicitly indicate the decimal point used in this locale,
2626 as it suggests that robust scripts need to consider
2627 setting the locale when comparing numbers. */
2628 if ((basic_numeric_field || general_numeric_field) && ! number_locale_warned)
2630 error (0, 0,
2631 _("%snumbers use %s as a decimal point in this locale"),
2632 tab == decimal_point ? "" : _("note "),
2633 quote (((char []) {decimal_point, 0})));
2637 if (basic_numeric_field && thousands_sep_ignored)
2639 error (0, 0,
2640 _("the multi-byte number group separator "
2641 "in this locale is not supported"));
2644 /* Warn about ignored global options flagged above.
2645 This clears all flags if UGKEY is the only one in the list. */
2646 if (!default_key_compare (&ugkey)
2647 || (ugkey.reverse && (stable || unique) && keylist))
2649 bool ugkey_reverse = ugkey.reverse;
2650 if (!(stable || unique))
2651 ugkey.reverse = false;
2652 /* The following is too big, but guaranteed to be "big enough". */
2653 char opts[sizeof short_options];
2654 key_to_opts (&ugkey, opts);
2655 error (0, 0,
2656 ngettext ("option '-%s' is ignored",
2657 "options '-%s' are ignored",
2658 select_plural (strlen (opts))), opts);
2659 ugkey.reverse = ugkey_reverse;
2661 if (ugkey.reverse && !(stable || unique) && keylist)
2662 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2665 /* Return either the sense of DIFF or its reverse, depending on REVERSED.
2666 If REVERSED, do not simply negate DIFF as that can mishandle INT_MIN. */
2668 static int
2669 diff_reversed (int diff, bool reversed)
2671 return reversed ? (diff < 0) - (diff > 0) : diff;
2674 /* Compare two lines A and B trying every key in sequence until there
2675 are no more keys or a difference is found. */
2677 static int
2678 keycompare (struct line const *a, struct line const *b)
2680 struct keyfield *key = keylist;
2682 /* For the first iteration only, the key positions have been
2683 precomputed for us. */
2684 char *texta = a->keybeg;
2685 char *textb = b->keybeg;
2686 char *lima = a->keylim;
2687 char *limb = b->keylim;
2689 int diff;
2691 while (true)
2693 char const *translate = key->translate;
2694 bool const *ignore = key->ignore;
2696 /* Treat field ends before field starts as empty fields. */
2697 lima = MAX (texta, lima);
2698 limb = MAX (textb, limb);
2700 /* Find the lengths. */
2701 size_t lena = lima - texta;
2702 size_t lenb = limb - textb;
2704 if (hard_LC_COLLATE || key_numeric (key)
2705 || key->month || key->random || key->version)
2707 /* Ordinarily use the keys in-place, temporarily null-terminated. */
2708 char *ta = texta;
2709 char *tb = textb;
2710 size_t tlena = lena;
2711 size_t tlenb = lenb;
2712 char enda = ta[tlena];
2713 char endb = tb[tlenb];
2715 void *allocated = nullptr;
2716 char stackbuf[4000];
2718 if (ignore || translate)
2720 /* Compute with copies of the keys, which are the result of
2721 translating or ignoring characters, and which need their
2722 own storage. */
2724 size_t i;
2726 /* Allocate space for copies. */
2727 size_t size = lena + 1 + lenb + 1;
2728 if (size <= sizeof stackbuf)
2729 ta = stackbuf;
2730 else
2731 ta = allocated = xmalloc (size);
2732 tb = ta + lena + 1;
2734 /* Put into each copy a version of the key in which the
2735 requested characters are ignored or translated. */
2736 for (tlena = i = 0; i < lena; i++)
2737 if (! (ignore && ignore[to_uchar (texta[i])]))
2738 ta[tlena++] = (translate
2739 ? translate[to_uchar (texta[i])]
2740 : texta[i]);
2742 for (tlenb = i = 0; i < lenb; i++)
2743 if (! (ignore && ignore[to_uchar (textb[i])]))
2744 tb[tlenb++] = (translate
2745 ? translate[to_uchar (textb[i])]
2746 : textb[i]);
2749 ta[tlena] = '\0';
2750 tb[tlenb] = '\0';
2752 if (key->numeric)
2753 diff = numcompare (ta, tb);
2754 else if (key->general_numeric)
2755 diff = general_numcompare (ta, tb);
2756 else if (key->human_numeric)
2757 diff = human_numcompare (ta, tb);
2758 else if (key->month)
2759 diff = getmonth (ta, nullptr) - getmonth (tb, nullptr);
2760 else if (key->random)
2761 diff = compare_random (ta, tlena, tb, tlenb);
2762 else if (key->version)
2763 diff = filenvercmp (ta, tlena, tb, tlenb);
2764 else
2766 /* Locale-dependent string sorting. This is slower than
2767 C-locale sorting, which is implemented below. */
2768 if (tlena == 0)
2769 diff = - NONZERO (tlenb);
2770 else if (tlenb == 0)
2771 diff = 1;
2772 else
2773 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2776 ta[tlena] = enda;
2777 tb[tlenb] = endb;
2779 free (allocated);
2781 else if (ignore)
2783 #define CMP_WITH_IGNORE(A, B) \
2784 do \
2786 while (true) \
2788 while (texta < lima && ignore[to_uchar (*texta)]) \
2789 ++texta; \
2790 while (textb < limb && ignore[to_uchar (*textb)]) \
2791 ++textb; \
2792 if (! (texta < lima && textb < limb)) \
2794 diff = (texta < lima) - (textb < limb); \
2795 break; \
2797 diff = to_uchar (A) - to_uchar (B); \
2798 if (diff) \
2799 break; \
2800 ++texta; \
2801 ++textb; \
2805 while (0)
2807 if (translate)
2808 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2809 translate[to_uchar (*textb)]);
2810 else
2811 CMP_WITH_IGNORE (*texta, *textb);
2813 else
2815 size_t lenmin = MIN (lena, lenb);
2816 if (lenmin == 0)
2817 diff = 0;
2818 else if (translate)
2820 size_t i = 0;
2823 diff = (to_uchar (translate[to_uchar (texta[i])])
2824 - to_uchar (translate[to_uchar (textb[i])]));
2825 if (diff)
2826 break;
2827 i++;
2829 while (i < lenmin);
2831 else
2832 diff = memcmp (texta, textb, lenmin);
2834 if (! diff)
2835 diff = (lena > lenb) - (lena < lenb);
2838 if (diff)
2839 break;
2841 key = key->next;
2842 if (! key)
2843 return 0;
2845 /* Find the beginning and limit of the next field. */
2846 if (key->eword != SIZE_MAX)
2847 lima = limfield (a, key), limb = limfield (b, key);
2848 else
2849 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2851 if (key->sword != SIZE_MAX)
2852 texta = begfield (a, key), textb = begfield (b, key);
2853 else
2855 texta = a->text, textb = b->text;
2856 if (key->skipsblanks)
2858 while (texta < lima && blanks[to_uchar (*texta)])
2859 ++texta;
2860 while (textb < limb && blanks[to_uchar (*textb)])
2861 ++textb;
2866 return diff_reversed (diff, key->reverse);
2869 /* Compare two lines A and B, returning negative, zero, or positive
2870 depending on whether A compares less than, equal to, or greater than B. */
2872 static int
2873 compare (struct line const *a, struct line const *b)
2875 int diff;
2876 size_t alen, blen;
2878 /* First try to compare on the specified keys (if any).
2879 The only two cases with no key at all are unadorned sort,
2880 and unadorned sort -r. */
2881 if (keylist)
2883 diff = keycompare (a, b);
2884 if (diff || unique || stable)
2885 return diff;
2888 /* If the keys all compare equal (or no keys were specified)
2889 fall through to the default comparison. */
2890 alen = a->length - 1, blen = b->length - 1;
2892 if (alen == 0)
2893 diff = - NONZERO (blen);
2894 else if (blen == 0)
2895 diff = 1;
2896 else if (hard_LC_COLLATE)
2898 /* xmemcoll0 is a performance enhancement as
2899 it will not unconditionally write '\0' after the
2900 passed in buffers, which was seen to give around
2901 a 3% increase in performance for short lines. */
2902 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2904 else
2906 diff = memcmp (a->text, b->text, MIN (alen, blen));
2907 if (!diff)
2908 diff = (alen > blen) - (alen < blen);
2911 return diff_reversed (diff, reverse);
2914 /* Write LINE to output stream FP; the output file's name is
2915 OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
2916 otherwise. If debugging is enabled and FP is standard output,
2917 append some debugging information. */
2919 static void
2920 write_line (struct line const *line, FILE *fp, char const *output_file)
2922 char *buf = line->text;
2923 size_t n_bytes = line->length;
2924 char *ebuf = buf + n_bytes;
2926 if (!output_file && debug)
2928 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2929 char const *c = buf;
2931 while (c < ebuf)
2933 char wc = *c++;
2934 if (wc == '\t')
2935 wc = '>';
2936 else if (c == ebuf)
2937 wc = '\n';
2938 if (fputc (wc, fp) == EOF)
2939 sort_die (_("write failed"), output_file);
2942 debug_line (line);
2944 else
2946 ebuf[-1] = eolchar;
2947 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2948 sort_die (_("write failed"), output_file);
2949 ebuf[-1] = '\0';
2953 /* Check that the lines read from FILE_NAME come in order. Return
2954 true if they are in order. If CHECKONLY == 'c', also print a
2955 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2956 they are not in order. */
2958 static bool
2959 check (char const *file_name, char checkonly)
2961 FILE *fp = xfopen (file_name, "r");
2962 struct buffer buf; /* Input buffer. */
2963 struct line temp; /* Copy of previous line. */
2964 size_t alloc = 0;
2965 uintmax_t line_number = 0;
2966 struct keyfield const *key = keylist;
2967 bool nonunique = ! unique;
2968 bool ordered = true;
2970 initbuf (&buf, sizeof (struct line),
2971 MAX (merge_buffer_size, sort_size));
2972 temp.text = nullptr;
2974 while (fillbuf (&buf, fp, file_name))
2976 struct line const *line = buffer_linelim (&buf);
2977 struct line const *linebase = line - buf.nlines;
2979 /* Make sure the line saved from the old buffer contents is
2980 less than or equal to the first line of the new buffer. */
2981 if (alloc && nonunique <= compare (&temp, line - 1))
2983 found_disorder:
2985 if (checkonly == 'c')
2987 struct line const *disorder_line = line - 1;
2988 uintmax_t disorder_line_number =
2989 buffer_linelim (&buf) - disorder_line + line_number;
2990 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2991 fprintf (stderr, _("%s: %s:%s: disorder: "),
2992 program_name, file_name,
2993 umaxtostr (disorder_line_number, hr_buf));
2994 write_line (disorder_line, stderr, _("standard error"));
2997 ordered = false;
2998 break;
3002 /* Compare each line in the buffer with its successor. */
3003 while (linebase < --line)
3004 if (nonunique <= compare (line, line - 1))
3005 goto found_disorder;
3007 line_number += buf.nlines;
3009 /* Save the last line of the buffer. */
3010 if (alloc < line->length)
3014 alloc *= 2;
3015 if (! alloc)
3017 alloc = line->length;
3018 break;
3021 while (alloc < line->length);
3023 free (temp.text);
3024 temp.text = xmalloc (alloc);
3026 memcpy (temp.text, line->text, line->length);
3027 temp.length = line->length;
3028 if (key)
3030 temp.keybeg = temp.text + (line->keybeg - line->text);
3031 temp.keylim = temp.text + (line->keylim - line->text);
3035 xfclose (fp, file_name);
3036 free (buf.buf);
3037 free (temp.text);
3038 return ordered;
3041 /* Open FILES (there are NFILES of them) and store the resulting array
3042 of stream pointers into (*PFPS). Allocate the array. Return the
3043 number of successfully opened files, setting errno if this value is
3044 less than NFILES. */
3046 static size_t
3047 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
3049 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
3050 int i;
3052 /* Open as many input files as we can. */
3053 for (i = 0; i < nfiles; i++)
3055 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
3056 ? open_temp (files[i].temp)
3057 : stream_open (files[i].name, "r"));
3058 if (!fps[i])
3059 break;
3062 return i;
3065 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3066 files (all of which are at the start of the FILES array), and
3067 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3068 FPS is the vector of open stream corresponding to the files.
3069 Close input and output streams before returning.
3070 OUTPUT_FILE gives the name of the output file. If it is null,
3071 the output file is standard output. */
3073 static void
3074 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
3075 FILE *ofp, char const *output_file, FILE **fps)
3077 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
3078 /* Input buffers for each file. */
3079 struct line saved; /* Saved line storage for unique check. */
3080 struct line const *savedline = nullptr;
3081 /* &saved if there is a saved line. */
3082 size_t savealloc = 0; /* Size allocated for the saved line. */
3083 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
3084 /* Current line in each line table. */
3085 struct line const **base = xnmalloc (nfiles, sizeof *base);
3086 /* Base of each line table. */
3087 size_t *ord = xnmalloc (nfiles, sizeof *ord);
3088 /* Table representing a permutation of fps,
3089 such that cur[ord[0]] is the smallest line
3090 and will be next output. */
3091 size_t i;
3092 size_t j;
3093 size_t t;
3094 struct keyfield const *key = keylist;
3095 saved.text = nullptr;
3097 /* Read initial lines from each input file. */
3098 for (i = 0; i < nfiles; )
3100 initbuf (&buffer[i], sizeof (struct line),
3101 MAX (merge_buffer_size, sort_size / nfiles));
3102 if (fillbuf (&buffer[i], fps[i], files[i].name))
3104 struct line const *linelim = buffer_linelim (&buffer[i]);
3105 cur[i] = linelim - 1;
3106 base[i] = linelim - buffer[i].nlines;
3107 i++;
3109 else
3111 /* fps[i] is empty; eliminate it from future consideration. */
3112 xfclose (fps[i], files[i].name);
3113 if (i < ntemps)
3115 ntemps--;
3116 zaptemp (files[i].name);
3118 free (buffer[i].buf);
3119 --nfiles;
3120 for (j = i; j < nfiles; ++j)
3122 files[j] = files[j + 1];
3123 fps[j] = fps[j + 1];
3128 /* Set up the ord table according to comparisons among input lines.
3129 Since this only reorders two items if one is strictly greater than
3130 the other, it is stable. */
3131 for (i = 0; i < nfiles; ++i)
3132 ord[i] = i;
3133 for (i = 1; i < nfiles; ++i)
3134 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
3135 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
3137 /* Repeatedly output the smallest line until no input remains. */
3138 while (nfiles)
3140 struct line const *smallest = cur[ord[0]];
3142 /* If uniquified output is turned on, output only the first of
3143 an identical series of lines. */
3144 if (unique)
3146 if (savedline && compare (savedline, smallest))
3148 savedline = nullptr;
3149 write_line (&saved, ofp, output_file);
3151 if (!savedline)
3153 savedline = &saved;
3154 if (savealloc < smallest->length)
3157 if (! savealloc)
3159 savealloc = smallest->length;
3160 break;
3162 while ((savealloc *= 2) < smallest->length);
3164 free (saved.text);
3165 saved.text = xmalloc (savealloc);
3167 saved.length = smallest->length;
3168 memcpy (saved.text, smallest->text, saved.length);
3169 if (key)
3171 saved.keybeg =
3172 saved.text + (smallest->keybeg - smallest->text);
3173 saved.keylim =
3174 saved.text + (smallest->keylim - smallest->text);
3178 else
3179 write_line (smallest, ofp, output_file);
3181 /* Check if we need to read more lines into memory. */
3182 if (base[ord[0]] < smallest)
3183 cur[ord[0]] = smallest - 1;
3184 else
3186 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
3188 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
3189 cur[ord[0]] = linelim - 1;
3190 base[ord[0]] = linelim - buffer[ord[0]].nlines;
3192 else
3194 /* We reached EOF on fps[ord[0]]. */
3195 for (i = 1; i < nfiles; ++i)
3196 if (ord[i] > ord[0])
3197 --ord[i];
3198 --nfiles;
3199 xfclose (fps[ord[0]], files[ord[0]].name);
3200 if (ord[0] < ntemps)
3202 ntemps--;
3203 zaptemp (files[ord[0]].name);
3205 free (buffer[ord[0]].buf);
3206 for (i = ord[0]; i < nfiles; ++i)
3208 fps[i] = fps[i + 1];
3209 files[i] = files[i + 1];
3210 buffer[i] = buffer[i + 1];
3211 cur[i] = cur[i + 1];
3212 base[i] = base[i + 1];
3214 for (i = 0; i < nfiles; ++i)
3215 ord[i] = ord[i + 1];
3216 continue;
3220 /* The new line just read in may be larger than other lines
3221 already in main memory; push it back in the queue until we
3222 encounter a line larger than it. Optimize for the common
3223 case where the new line is smallest. */
3225 size_t lo = 1;
3226 size_t hi = nfiles;
3227 size_t probe = lo;
3228 size_t ord0 = ord[0];
3229 size_t count_of_smaller_lines;
3231 while (lo < hi)
3233 int cmp = compare (cur[ord0], cur[ord[probe]]);
3234 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3235 hi = probe;
3236 else
3237 lo = probe + 1;
3238 probe = (lo + hi) / 2;
3241 count_of_smaller_lines = lo - 1;
3242 for (j = 0; j < count_of_smaller_lines; j++)
3243 ord[j] = ord[j + 1];
3244 ord[count_of_smaller_lines] = ord0;
3248 if (unique && savedline)
3250 write_line (&saved, ofp, output_file);
3251 free (saved.text);
3254 xfclose (ofp, output_file);
3255 free (fps);
3256 free (buffer);
3257 free (ord);
3258 free (base);
3259 free (cur);
3262 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3263 files (all of which are at the start of the FILES array), and
3264 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3265 Close input and output files before returning.
3266 OUTPUT_FILE gives the name of the output file.
3268 Return the number of files successfully merged. This number can be
3269 less than NFILES if we ran low on file descriptors, but in this
3270 case it is never less than 2. */
3272 static size_t
3273 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3274 FILE *ofp, char const *output_file)
3276 FILE **fps;
3277 size_t nopened = open_input_files (files, nfiles, &fps);
3278 if (nopened < nfiles && nopened < 2)
3279 sort_die (_("open failed"), files[nopened].name);
3280 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3281 return nopened;
3284 /* Merge into T (of size NLINES) the two sorted arrays of lines
3285 LO (with NLINES / 2 members), and
3286 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3287 T and LO point just past their respective arrays, and the arrays
3288 are in reverse order. NLINES must be at least 2. */
3290 static void
3291 mergelines (struct line *restrict t, size_t nlines,
3292 struct line const *restrict lo)
3294 size_t nlo = nlines / 2;
3295 size_t nhi = nlines - nlo;
3296 struct line *hi = t - nlo;
3298 while (true)
3299 if (compare (lo - 1, hi - 1) <= 0)
3301 *--t = *--lo;
3302 if (! --nlo)
3304 /* HI must equal T now, and there is no need to copy from
3305 HI to T. */
3306 return;
3309 else
3311 *--t = *--hi;
3312 if (! --nhi)
3315 *--t = *--lo;
3316 while (--nlo);
3318 return;
3323 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3324 Do this all within one thread. NLINES must be at least 2.
3325 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3326 Otherwise the sort is in-place and TEMP is half-sized.
3327 The input and output arrays are in reverse order, and LINES and
3328 TEMP point just past the end of their respective arrays.
3330 Use a recursive divide-and-conquer algorithm, in the style
3331 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3332 the optimization suggested by exercise 5.2.4-10; this requires room
3333 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3334 writes that this memory optimization was originally published by
3335 D. A. Bell, Comp J. 1 (1958), 75. */
3337 static void
3338 sequential_sort (struct line *restrict lines, size_t nlines,
3339 struct line *restrict temp, bool to_temp)
3341 if (nlines == 2)
3343 /* Declare 'swap' as int, not bool, to work around a bug
3344 <https://lists.gnu.org/r/bug-coreutils/2005-10/msg00086.html>
3345 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3346 int swap = (0 < compare (&lines[-1], &lines[-2]));
3347 if (to_temp)
3349 temp[-1] = lines[-1 - swap];
3350 temp[-2] = lines[-2 + swap];
3352 else if (swap)
3354 temp[-1] = lines[-1];
3355 lines[-1] = lines[-2];
3356 lines[-2] = temp[-1];
3359 else
3361 size_t nlo = nlines / 2;
3362 size_t nhi = nlines - nlo;
3363 struct line *lo = lines;
3364 struct line *hi = lines - nlo;
3366 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3367 if (1 < nlo)
3368 sequential_sort (lo, nlo, temp, !to_temp);
3369 else if (!to_temp)
3370 temp[-1] = lo[-1];
3372 struct line *dest;
3373 struct line const *sorted_lo;
3374 if (to_temp)
3376 dest = temp;
3377 sorted_lo = lines;
3379 else
3381 dest = lines;
3382 sorted_lo = temp;
3384 mergelines (dest, nlines, sorted_lo);
3388 static struct merge_node *init_node (struct merge_node *restrict,
3389 struct merge_node *restrict,
3390 struct line *, size_t, size_t, bool);
3393 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3394 lines, with destination DEST. */
3395 static struct merge_node *
3396 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3398 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3400 struct merge_node *root = merge_tree;
3401 root->lo = root->hi = root->end_lo = root->end_hi = nullptr;
3402 root->dest = nullptr;
3403 root->nlo = root->nhi = nlines;
3404 root->parent = nullptr;
3405 root->level = MERGE_END;
3406 root->queued = false;
3407 pthread_mutex_init (&root->lock, nullptr);
3409 init_node (root, root + 1, dest, nthreads, nlines, false);
3410 return merge_tree;
3413 /* Destroy the merge tree. */
3414 static void
3415 merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
3417 size_t n_nodes = nthreads * 2;
3418 struct merge_node *node = merge_tree;
3420 while (n_nodes--)
3422 pthread_mutex_destroy (&node->lock);
3423 node++;
3426 free (merge_tree);
3429 /* Initialize a merge tree node and its descendants. The node's
3430 parent is PARENT. The node and its descendants are taken from the
3431 array of nodes NODE_POOL. Their destination starts at DEST; they
3432 will consume NTHREADS threads. The total number of sort lines is
3433 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3434 its parent. */
3436 static struct merge_node *
3437 init_node (struct merge_node *restrict parent,
3438 struct merge_node *restrict node_pool,
3439 struct line *dest, size_t nthreads,
3440 size_t total_lines, bool is_lo_child)
3442 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3443 size_t nlo = nlines / 2;
3444 size_t nhi = nlines - nlo;
3445 struct line *lo = dest - total_lines;
3446 struct line *hi = lo - nlo;
3447 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3449 struct merge_node *node = node_pool++;
3450 node->lo = node->end_lo = lo;
3451 node->hi = node->end_hi = hi;
3452 node->dest = parent_end;
3453 node->nlo = nlo;
3454 node->nhi = nhi;
3455 node->parent = parent;
3456 node->level = parent->level + 1;
3457 node->queued = false;
3458 pthread_mutex_init (&node->lock, nullptr);
3460 if (nthreads > 1)
3462 size_t lo_threads = nthreads / 2;
3463 size_t hi_threads = nthreads - lo_threads;
3464 node->lo_child = node_pool;
3465 node_pool = init_node (node, node_pool, lo, lo_threads,
3466 total_lines, true);
3467 node->hi_child = node_pool;
3468 node_pool = init_node (node, node_pool, hi, hi_threads,
3469 total_lines, false);
3471 else
3473 node->lo_child = nullptr;
3474 node->hi_child = nullptr;
3476 return node_pool;
3480 /* Compare two merge nodes A and B for priority. */
3482 static int
3483 compare_nodes (void const *a, void const *b)
3485 struct merge_node const *nodea = a;
3486 struct merge_node const *nodeb = b;
3487 if (nodea->level == nodeb->level)
3488 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3489 return nodea->level < nodeb->level;
3492 /* Lock a merge tree NODE. */
3494 static inline void
3495 lock_node (struct merge_node *node)
3497 pthread_mutex_lock (&node->lock);
3500 /* Unlock a merge tree NODE. */
3502 static inline void
3503 unlock_node (struct merge_node *node)
3505 pthread_mutex_unlock (&node->lock);
3508 /* Destroy merge QUEUE. */
3510 static void
3511 queue_destroy (struct merge_node_queue *queue)
3513 heap_free (queue->priority_queue);
3514 pthread_cond_destroy (&queue->cond);
3515 pthread_mutex_destroy (&queue->mutex);
3518 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3519 NTHREADS threads. */
3521 static void
3522 queue_init (struct merge_node_queue *queue, size_t nthreads)
3524 /* Though it's highly unlikely all nodes are in the heap at the same
3525 time, the heap should accommodate all of them. Counting a null
3526 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3527 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3528 pthread_mutex_init (&queue->mutex, nullptr);
3529 pthread_cond_init (&queue->cond, nullptr);
3532 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3533 does not need to lock NODE. */
3535 static void
3536 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3538 pthread_mutex_lock (&queue->mutex);
3539 heap_insert (queue->priority_queue, node);
3540 node->queued = true;
3541 pthread_cond_signal (&queue->cond);
3542 pthread_mutex_unlock (&queue->mutex);
3545 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3547 static struct merge_node *
3548 queue_pop (struct merge_node_queue *queue)
3550 struct merge_node *node;
3551 pthread_mutex_lock (&queue->mutex);
3552 while (! (node = heap_remove_top (queue->priority_queue)))
3553 pthread_cond_wait (&queue->cond, &queue->mutex);
3554 pthread_mutex_unlock (&queue->mutex);
3555 lock_node (node);
3556 node->queued = false;
3557 return node;
3560 /* Output LINE to TFP, unless -u is specified and the line compares
3561 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3562 is null if TFP is standard output.
3564 This function does not save the line for comparison later, so it is
3565 appropriate only for internal sort. */
3567 static void
3568 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3570 if (unique)
3572 if (saved_line.text && ! compare (line, &saved_line))
3573 return;
3574 saved_line = *line;
3577 write_line (line, tfp, temp_output);
3580 /* Merge the lines currently available to a NODE in the binary
3581 merge tree. Merge a number of lines appropriate for this merge
3582 level, assuming TOTAL_LINES is the total number of lines.
3584 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3585 the name of TFP, or is null if TFP is standard output. */
3587 static void
3588 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3589 FILE *tfp, char const *temp_output)
3591 struct line *lo_orig = node->lo;
3592 struct line *hi_orig = node->hi;
3593 size_t to_merge = MAX_MERGE (total_lines, node->level);
3594 size_t merged_lo;
3595 size_t merged_hi;
3597 if (node->level > MERGE_ROOT)
3599 /* Merge to destination buffer. */
3600 struct line *dest = *node->dest;
3601 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3602 if (compare (node->lo - 1, node->hi - 1) <= 0)
3603 *--dest = *--node->lo;
3604 else
3605 *--dest = *--node->hi;
3607 merged_lo = lo_orig - node->lo;
3608 merged_hi = hi_orig - node->hi;
3610 if (node->nhi == merged_hi)
3611 while (node->lo != node->end_lo && to_merge--)
3612 *--dest = *--node->lo;
3613 else if (node->nlo == merged_lo)
3614 while (node->hi != node->end_hi && to_merge--)
3615 *--dest = *--node->hi;
3616 *node->dest = dest;
3618 else
3620 /* Merge directly to output. */
3621 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3623 if (compare (node->lo - 1, node->hi - 1) <= 0)
3624 write_unique (--node->lo, tfp, temp_output);
3625 else
3626 write_unique (--node->hi, tfp, temp_output);
3629 merged_lo = lo_orig - node->lo;
3630 merged_hi = hi_orig - node->hi;
3632 if (node->nhi == merged_hi)
3634 while (node->lo != node->end_lo && to_merge--)
3635 write_unique (--node->lo, tfp, temp_output);
3637 else if (node->nlo == merged_lo)
3639 while (node->hi != node->end_hi && to_merge--)
3640 write_unique (--node->hi, tfp, temp_output);
3644 /* Update NODE. */
3645 merged_lo = lo_orig - node->lo;
3646 merged_hi = hi_orig - node->hi;
3647 node->nlo -= merged_lo;
3648 node->nhi -= merged_hi;
3651 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3652 NODE's children has available lines and the other either has
3653 available lines or has exhausted its lines. */
3655 static void
3656 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3658 if (! node->queued)
3660 bool lo_avail = (node->lo - node->end_lo) != 0;
3661 bool hi_avail = (node->hi - node->end_hi) != 0;
3662 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3663 queue_insert (queue, node);
3667 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3669 static void
3670 queue_check_insert_parent (struct merge_node_queue *queue,
3671 struct merge_node *node)
3673 if (node->level > MERGE_ROOT)
3675 lock_node (node->parent);
3676 queue_check_insert (queue, node->parent);
3677 unlock_node (node->parent);
3679 else if (node->nlo + node->nhi == 0)
3681 /* If the MERGE_ROOT NODE has finished merging, insert the
3682 MERGE_END node. */
3683 queue_insert (queue, node->parent);
3687 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3688 some of those lines, until the MERGE_END node is popped.
3689 TOTAL_LINES is the total number of lines. If merging at the top
3690 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3691 null if TFP is standard output. */
3693 static void
3694 merge_loop (struct merge_node_queue *queue,
3695 size_t total_lines, FILE *tfp, char const *temp_output)
3697 while (true)
3699 struct merge_node *node = queue_pop (queue);
3701 if (node->level == MERGE_END)
3703 unlock_node (node);
3704 /* Reinsert so other threads can pop it. */
3705 queue_insert (queue, node);
3706 break;
3708 mergelines_node (node, total_lines, tfp, temp_output);
3709 queue_check_insert (queue, node);
3710 queue_check_insert_parent (queue, node);
3712 unlock_node (node);
3717 static void sortlines (struct line *restrict, size_t, size_t,
3718 struct merge_node *, struct merge_node_queue *,
3719 FILE *, char const *);
3721 /* Thread arguments for sortlines_thread. */
3723 struct thread_args
3725 /* Source, i.e., the array of lines to sort. This points just past
3726 the end of the array. */
3727 struct line *lines;
3729 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3730 size_t nthreads;
3732 /* Number of lines in LINES and DEST. */
3733 size_t const total_lines;
3735 /* Merge node. Lines from this node and this node's sibling will merged
3736 to this node's parent. */
3737 struct merge_node *const node;
3739 /* The priority queue controlling available work for the entire
3740 internal sort. */
3741 struct merge_node_queue *const queue;
3743 /* If at the top level, the file to output to, and the file's name.
3744 If the file is standard output, the file's name is null. */
3745 FILE *tfp;
3746 char const *output_temp;
3749 /* Like sortlines, except with a signature acceptable to pthread_create. */
3751 static void *
3752 sortlines_thread (void *data)
3754 struct thread_args const *args = data;
3755 sortlines (args->lines, args->nthreads, args->total_lines,
3756 args->node, args->queue, args->tfp,
3757 args->output_temp);
3758 return nullptr;
3761 /* Sort lines, possibly in parallel. The arguments are as in struct
3762 thread_args above.
3764 The algorithm has three phases: node creation, sequential sort,
3765 and binary merge.
3767 During node creation, sortlines recursively visits each node in the
3768 binary merge tree and creates a NODE structure corresponding to all the
3769 future line merging NODE is responsible for. For each call to
3770 sortlines, half the available threads are assigned to each recursive
3771 call, until a leaf node having only 1 available thread is reached.
3773 Each leaf node then performs two sequential sorts, one on each half of
3774 the lines it is responsible for. It records in its NODE structure that
3775 there are two sorted sublists available to merge from, and inserts its
3776 NODE into the priority queue.
3778 The binary merge phase then begins. Each thread drops into a loop
3779 where the thread retrieves a NODE from the priority queue, merges lines
3780 available to that NODE, and potentially insert NODE or its parent back
3781 into the queue if there are sufficient available lines for them to
3782 merge. This continues until all lines at all nodes of the merge tree
3783 have been merged. */
3785 static void
3786 sortlines (struct line *restrict lines, size_t nthreads,
3787 size_t total_lines, struct merge_node *node,
3788 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3790 size_t nlines = node->nlo + node->nhi;
3792 /* Calculate thread arguments. */
3793 size_t lo_threads = nthreads / 2;
3794 size_t hi_threads = nthreads - lo_threads;
3795 pthread_t thread;
3796 struct thread_args args = {lines, lo_threads, total_lines,
3797 node->lo_child, queue, tfp, temp_output};
3799 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3800 && pthread_create (&thread, nullptr, sortlines_thread, &args) == 0)
3802 sortlines (lines - node->nlo, hi_threads, total_lines,
3803 node->hi_child, queue, tfp, temp_output);
3804 pthread_join (thread, nullptr);
3806 else
3808 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3809 Sort with 1 thread. */
3810 size_t nlo = node->nlo;
3811 size_t nhi = node->nhi;
3812 struct line *temp = lines - total_lines;
3813 if (1 < nhi)
3814 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3815 if (1 < nlo)
3816 sequential_sort (lines, nlo, temp, false);
3818 /* Update merge NODE. No need to lock yet. */
3819 node->lo = lines;
3820 node->hi = lines - nlo;
3821 node->end_lo = lines - nlo;
3822 node->end_hi = lines - nlo - nhi;
3824 queue_insert (queue, node);
3825 merge_loop (queue, total_lines, tfp, temp_output);
3829 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3830 the same as OUTFILE. If found, replace each with the same
3831 temporary copy that can be merged into OUTFILE without destroying
3832 OUTFILE before it is completely read. This temporary copy does not
3833 count as a merge temp, so don't worry about incrementing NTEMPS in
3834 the caller; final cleanup will remove it, not zaptemp.
3836 This test ensures that an otherwise-erroneous use like
3837 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3838 It's not clear that POSIX requires this nicety.
3839 Detect common error cases, but don't try to catch obscure cases like
3840 "cat ... FILE ... | sort -m -o FILE"
3841 where traditional "sort" doesn't copy the input and where
3842 people should know that they're getting into trouble anyway.
3843 Catching these obscure cases would slow down performance in
3844 common cases. */
3846 static void
3847 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3848 size_t nfiles, char const *outfile)
3850 struct tempnode *tempcopy = nullptr;
3852 for (size_t i = ntemps; i < nfiles; i++)
3854 bool is_stdin = STREQ (files[i].name, "-");
3855 bool same;
3856 struct stat instat;
3858 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3859 same = true;
3860 else
3862 struct stat *outst = get_outstatus ();
3863 if (!outst)
3864 break;
3866 same = (((is_stdin
3867 ? fstat (STDIN_FILENO, &instat)
3868 : stat (files[i].name, &instat))
3869 == 0)
3870 && psame_inode (&instat, outst));
3873 if (same)
3875 if (! tempcopy)
3877 FILE *tftp;
3878 tempcopy = create_temp (&tftp);
3879 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3882 files[i].name = tempcopy->name;
3883 files[i].temp = tempcopy;
3888 /* Scan the input files to ensure all are accessible.
3889 Otherwise exit with a diagnostic.
3891 This will catch common issues with permissions etc.
3892 but will fail to notice issues where you can open but not read,
3893 like when a directory is specified on some systems.
3894 Catching these obscure cases could slow down performance in
3895 common cases. */
3897 static void
3898 check_inputs (char *const *files, size_t nfiles)
3900 for (size_t i = 0; i < nfiles; i++)
3902 if (STREQ (files[i], "-"))
3903 continue;
3905 if (euidaccess (files[i], R_OK) != 0)
3906 sort_die (_("cannot read"), files[i]);
3910 /* Ensure a specified output file can be created or written to,
3911 and point stdout to it. Do not truncate the file.
3912 Exit with a diagnostic on failure. */
3914 static void
3915 check_output (char const *outfile)
3917 if (outfile)
3919 int oflags = O_WRONLY | O_BINARY | O_CLOEXEC | O_CREAT;
3920 int outfd = open (outfile, oflags, MODE_RW_UGO);
3921 if (outfd < 0)
3922 sort_die (_("open failed"), outfile);
3923 move_fd (outfd, STDOUT_FILENO);
3927 /* Merge the input FILES. NTEMPS is the number of files at the
3928 start of FILES that are temporary; it is zero at the top level.
3929 NFILES is the total number of files. Put the output in
3930 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3932 static void
3933 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3934 char const *output_file)
3936 while (nmerge < nfiles)
3938 /* Number of input files processed so far. */
3939 size_t in;
3941 /* Number of output files generated so far. */
3942 size_t out;
3944 /* nfiles % NMERGE; this counts input files that are left over
3945 after all full-sized merges have been done. */
3946 size_t remainder;
3948 /* Number of easily-available slots at the next loop iteration. */
3949 size_t cheap_slots;
3951 /* Do as many NMERGE-size merges as possible. In the case that
3952 nmerge is bogus, increment by the maximum number of file
3953 descriptors allowed. */
3954 for (out = in = 0; nmerge <= nfiles - in; out++)
3956 FILE *tfp;
3957 struct tempnode *temp = create_temp (&tfp);
3958 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3959 nmerge, tfp, temp->name);
3960 ntemps -= MIN (ntemps, num_merged);
3961 files[out].name = temp->name;
3962 files[out].temp = temp;
3963 in += num_merged;
3966 remainder = nfiles - in;
3967 cheap_slots = nmerge - out % nmerge;
3969 if (cheap_slots < remainder)
3971 /* So many files remain that they can't all be put into the last
3972 NMERGE-sized output window. Do one more merge. Merge as few
3973 files as possible, to avoid needless I/O. */
3974 size_t nshortmerge = remainder - cheap_slots + 1;
3975 FILE *tfp;
3976 struct tempnode *temp = create_temp (&tfp);
3977 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3978 nshortmerge, tfp, temp->name);
3979 ntemps -= MIN (ntemps, num_merged);
3980 files[out].name = temp->name;
3981 files[out++].temp = temp;
3982 in += num_merged;
3985 /* Put the remaining input files into the last NMERGE-sized output
3986 window, so they will be merged in the next pass. */
3987 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3988 ntemps += out;
3989 nfiles -= in - out;
3992 avoid_trashing_input (files, ntemps, nfiles, output_file);
3994 /* We aren't guaranteed that this final mergefiles will work, therefore we
3995 try to merge into the output, and then merge as much as we can into a
3996 temp file if we can't. Repeat. */
3998 while (true)
4000 /* Merge directly into the output file if possible. */
4001 FILE **fps;
4002 size_t nopened = open_input_files (files, nfiles, &fps);
4004 if (nopened == nfiles)
4006 FILE *ofp = stream_open (output_file, "w");
4007 if (ofp)
4009 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
4010 break;
4012 if (errno != EMFILE || nopened <= 2)
4013 sort_die (_("open failed"), output_file);
4015 else if (nopened <= 2)
4016 sort_die (_("open failed"), files[nopened].name);
4018 /* We ran out of file descriptors. Close one of the input
4019 files, to gain a file descriptor. Then create a temporary
4020 file with our spare file descriptor. Retry if that failed
4021 (e.g., some other process could open a file between the time
4022 we closed and tried to create). */
4023 FILE *tfp;
4024 struct tempnode *temp;
4027 nopened--;
4028 xfclose (fps[nopened], files[nopened].name);
4029 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
4031 while (!temp);
4033 /* Merge into the newly allocated temporary. */
4034 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
4035 fps);
4036 ntemps -= MIN (ntemps, nopened);
4037 files[0].name = temp->name;
4038 files[0].temp = temp;
4040 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
4041 ntemps++;
4042 nfiles -= nopened - 1;
4046 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
4048 static void
4049 sort (char *const *files, size_t nfiles, char const *output_file,
4050 size_t nthreads)
4052 struct buffer buf;
4053 size_t ntemps = 0;
4054 bool output_file_created = false;
4056 buf.alloc = 0;
4058 while (nfiles)
4060 char const *temp_output;
4061 char const *file = *files;
4062 FILE *fp = xfopen (file, "r");
4063 FILE *tfp;
4065 size_t bytes_per_line;
4066 if (nthreads > 1)
4068 /* Get log P. */
4069 size_t tmp = 1;
4070 size_t mult = 1;
4071 while (tmp < nthreads)
4073 tmp *= 2;
4074 mult++;
4076 bytes_per_line = (mult * sizeof (struct line));
4078 else
4079 bytes_per_line = sizeof (struct line) * 3 / 2;
4081 if (! buf.alloc)
4082 initbuf (&buf, bytes_per_line,
4083 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
4084 buf.eof = false;
4085 files++;
4086 nfiles--;
4088 while (fillbuf (&buf, fp, file))
4090 struct line *line;
4092 if (buf.eof && nfiles
4093 && (bytes_per_line + 1
4094 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
4096 /* End of file, but there is more input and buffer room.
4097 Concatenate the next input file; this is faster in
4098 the usual case. */
4099 buf.left = buf.used;
4100 break;
4103 saved_line.text = nullptr;
4104 line = buffer_linelim (&buf);
4105 if (buf.eof && !nfiles && !ntemps && !buf.left)
4107 xfclose (fp, file);
4108 tfp = xfopen (output_file, "w");
4109 temp_output = output_file;
4110 output_file_created = true;
4112 else
4114 ++ntemps;
4115 temp_output = create_temp (&tfp)->name;
4117 if (1 < buf.nlines)
4119 struct merge_node_queue queue;
4120 queue_init (&queue, nthreads);
4121 struct merge_node *merge_tree =
4122 merge_tree_init (nthreads, buf.nlines, line);
4124 sortlines (line, nthreads, buf.nlines, merge_tree + 1,
4125 &queue, tfp, temp_output);
4127 merge_tree_destroy (nthreads, merge_tree);
4128 queue_destroy (&queue);
4130 else
4131 write_unique (line - 1, tfp, temp_output);
4133 xfclose (tfp, temp_output);
4135 if (output_file_created)
4136 goto finish;
4138 xfclose (fp, file);
4141 finish:
4142 free (buf.buf);
4144 if (! output_file_created)
4146 struct tempnode *node = temphead;
4147 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
4148 for (size_t i = 0; node; i++)
4150 tempfiles[i].name = node->name;
4151 tempfiles[i].temp = node;
4152 node = node->next;
4154 merge (tempfiles, ntemps, ntemps, output_file);
4155 free (tempfiles);
4158 reap_all ();
4161 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
4163 static void
4164 insertkey (struct keyfield *key_arg)
4166 struct keyfield **p;
4167 struct keyfield *key = xmemdup (key_arg, sizeof *key);
4169 for (p = &keylist; *p; p = &(*p)->next)
4170 continue;
4171 *p = key;
4172 key->next = nullptr;
4175 /* Report a bad field specification SPEC, with extra info MSGID. */
4177 static void
4178 badfieldspec (char const *spec, char const *msgid)
4180 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
4181 _(msgid), quote (spec));
4184 /* Report incompatible options. */
4186 static void
4187 incompatible_options (char const *opts)
4189 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
4192 /* Check compatibility of ordering options. */
4194 static void
4195 check_ordering_compatibility (void)
4197 struct keyfield *key;
4199 for (key = keylist; key; key = key->next)
4200 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4201 + key->month + (key->version | key->random | !!key->ignore)))
4203 /* The following is too big, but guaranteed to be "big enough". */
4204 char opts[sizeof short_options];
4205 /* Clear flags we're not interested in. */
4206 key->skipsblanks = key->skipeblanks = key->reverse = false;
4207 key_to_opts (key, opts);
4208 incompatible_options (opts);
4212 /* Parse the leading integer in STRING and store the resulting value
4213 (which must fit into size_t) into *VAL. Return the address of the
4214 suffix after the integer. If the value is too large, silently
4215 substitute SIZE_MAX. If MSGID is null, return nullptr after
4216 failure; otherwise, report MSGID and exit on failure. */
4218 static char const *
4219 parse_field_count (char const *string, size_t *val, char const *msgid)
4221 char *suffix;
4222 uintmax_t n;
4224 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4226 case LONGINT_OK:
4227 case LONGINT_INVALID_SUFFIX_CHAR:
4228 *val = n;
4229 if (*val == n)
4230 break;
4231 FALLTHROUGH;
4232 case LONGINT_OVERFLOW:
4233 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4234 *val = SIZE_MAX;
4235 break;
4237 case LONGINT_INVALID:
4238 if (msgid)
4239 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4240 _(msgid), quote (string));
4241 return nullptr;
4244 return suffix;
4247 /* Handle interrupts and hangups. */
4249 static void
4250 sighandler (int sig)
4252 if (! SA_NOCLDSTOP)
4253 signal (sig, SIG_IGN);
4255 cleanup ();
4257 signal (sig, SIG_DFL);
4258 raise (sig);
4261 /* Set the ordering options for KEY specified in S.
4262 Return the address of the first character in S that
4263 is not a valid ordering option.
4264 BLANKTYPE is the kind of blanks that 'b' should skip. */
4266 static char *
4267 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4269 while (*s)
4271 switch (*s)
4273 case 'b':
4274 if (blanktype == bl_start || blanktype == bl_both)
4275 key->skipsblanks = true;
4276 if (blanktype == bl_end || blanktype == bl_both)
4277 key->skipeblanks = true;
4278 break;
4279 case 'd':
4280 key->ignore = nondictionary;
4281 break;
4282 case 'f':
4283 key->translate = fold_toupper;
4284 break;
4285 case 'g':
4286 key->general_numeric = true;
4287 break;
4288 case 'h':
4289 key->human_numeric = true;
4290 break;
4291 case 'i':
4292 /* Option order should not matter, so don't let -i override
4293 -d. -d implies -i, but -i does not imply -d. */
4294 if (! key->ignore)
4295 key->ignore = nonprinting;
4296 break;
4297 case 'M':
4298 key->month = true;
4299 break;
4300 case 'n':
4301 key->numeric = true;
4302 break;
4303 case 'R':
4304 key->random = true;
4305 break;
4306 case 'r':
4307 key->reverse = true;
4308 break;
4309 case 'V':
4310 key->version = true;
4311 break;
4312 default:
4313 return (char *) s;
4315 ++s;
4317 return (char *) s;
4320 /* Initialize KEY. */
4322 static struct keyfield *
4323 key_init (struct keyfield *key)
4325 memset (key, 0, sizeof *key);
4326 key->eword = SIZE_MAX;
4327 return key;
4331 main (int argc, char **argv)
4333 struct keyfield *key;
4334 struct keyfield key_buf;
4335 struct keyfield gkey;
4336 bool gkey_only = false;
4337 char const *s;
4338 int c = 0;
4339 char checkonly = 0;
4340 bool mergeonly = false;
4341 char *random_source = nullptr;
4342 bool need_random = false;
4343 size_t nthreads = 0;
4344 size_t nfiles = 0;
4345 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
4346 int posix_ver = posix2_version ();
4347 bool traditional_usage = ! (200112 <= posix_ver && posix_ver < 200809);
4348 char **files;
4349 char *files_from = nullptr;
4350 struct Tokens tok;
4351 char const *outfile = nullptr;
4352 bool locale_ok;
4354 initialize_main (&argc, &argv);
4355 set_program_name (argv[0]);
4356 locale_ok = !! setlocale (LC_ALL, "");
4357 bindtextdomain (PACKAGE, LOCALEDIR);
4358 textdomain (PACKAGE);
4360 initialize_exit_failure (SORT_FAILURE);
4362 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4363 #if HAVE_NL_LANGINFO
4364 hard_LC_TIME = hard_locale (LC_TIME);
4365 #endif
4367 /* Get locale's representation of the decimal point. */
4369 struct lconv const *locale = localeconv ();
4371 /* If the locale doesn't define a decimal point, or if the decimal
4372 point is multibyte, use the C locale's decimal point. FIXME:
4373 add support for multibyte decimal points. */
4374 decimal_point = locale->decimal_point[0];
4375 if (! decimal_point || locale->decimal_point[1])
4376 decimal_point = '.';
4378 /* FIXME: add support for multibyte thousands separators. */
4379 thousands_sep = locale->thousands_sep[0];
4380 if (thousands_sep && locale->thousands_sep[1])
4381 thousands_sep_ignored = true;
4382 if (! thousands_sep || locale->thousands_sep[1])
4383 thousands_sep = NON_CHAR;
4386 have_read_stdin = false;
4387 inittables ();
4390 size_t i;
4391 static int const sig[] =
4393 /* The usual suspects. */
4394 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4395 #ifdef SIGPOLL
4396 SIGPOLL,
4397 #endif
4398 #ifdef SIGPROF
4399 SIGPROF,
4400 #endif
4401 #ifdef SIGVTALRM
4402 SIGVTALRM,
4403 #endif
4404 #ifdef SIGXCPU
4405 SIGXCPU,
4406 #endif
4407 #ifdef SIGXFSZ
4408 SIGXFSZ,
4409 #endif
4411 enum { nsigs = ARRAY_CARDINALITY (sig) };
4413 #if SA_NOCLDSTOP
4414 struct sigaction act;
4416 sigemptyset (&caught_signals);
4417 for (i = 0; i < nsigs; i++)
4419 sigaction (sig[i], nullptr, &act);
4420 if (act.sa_handler != SIG_IGN)
4421 sigaddset (&caught_signals, sig[i]);
4424 act.sa_handler = sighandler;
4425 act.sa_mask = caught_signals;
4426 act.sa_flags = 0;
4428 for (i = 0; i < nsigs; i++)
4429 if (sigismember (&caught_signals, sig[i]))
4430 sigaction (sig[i], &act, nullptr);
4431 #else
4432 for (i = 0; i < nsigs; i++)
4433 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4435 signal (sig[i], sighandler);
4436 siginterrupt (sig[i], 1);
4438 #endif
4440 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4442 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4443 atexit (exit_cleanup);
4445 key_init (&gkey);
4446 gkey.sword = SIZE_MAX;
4448 files = xnmalloc (argc, sizeof *files);
4450 while (true)
4452 /* Parse an operand as a file after "--" was seen; or if
4453 pedantic and a file was seen, unless the POSIX version
4454 is not 1003.1-2001 and -c was not seen and the operand is
4455 "-o FILE" or "-oFILE". */
4456 int oi = -1;
4458 if (c == -1
4459 || (posixly_correct && nfiles != 0
4460 && ! (traditional_usage
4461 && ! checkonly
4462 && optind != argc
4463 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4464 && (argv[optind][2] || optind + 1 != argc)))
4465 || ((c = getopt_long (argc, argv, short_options,
4466 long_options, &oi))
4467 == -1))
4469 if (argc <= optind)
4470 break;
4471 files[nfiles++] = argv[optind++];
4473 else switch (c)
4475 case 1:
4476 key = nullptr;
4477 if (optarg[0] == '+')
4479 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4480 && ISDIGIT (argv[optind][1]));
4481 traditional_usage |= minus_pos_usage && !posixly_correct;
4482 if (traditional_usage)
4484 /* Treat +POS1 [-POS2] as a key if possible; but silently
4485 treat an operand as a file if it is not a valid +POS1. */
4486 key = key_init (&key_buf);
4487 s = parse_field_count (optarg + 1, &key->sword, nullptr);
4488 if (s && *s == '.')
4489 s = parse_field_count (s + 1, &key->schar, nullptr);
4490 if (! (key->sword || key->schar))
4491 key->sword = SIZE_MAX;
4492 if (! s || *set_ordering (s, key, bl_start))
4493 key = nullptr;
4494 else
4496 if (minus_pos_usage)
4498 char const *optarg1 = argv[optind++];
4499 s = parse_field_count (optarg1 + 1, &key->eword,
4500 N_("invalid number after '-'"));
4501 if (*s == '.')
4502 s = parse_field_count (s + 1, &key->echar,
4503 N_("invalid number after '.'"));
4504 if (!key->echar && key->eword)
4506 /* obsolescent syntax +A.x -B.y is equivalent to:
4507 -k A+1.x+1,B.y (when y = 0)
4508 -k A+1.x+1,B+1.y (when y > 0)
4509 So eword is decremented as in the -k case
4510 only when the end field (B) is specified and
4511 echar (y) is 0. */
4512 key->eword--;
4514 if (*set_ordering (s, key, bl_end))
4515 badfieldspec (optarg1,
4516 N_("stray character in field spec"));
4518 key->traditional_used = true;
4519 insertkey (key);
4523 if (! key)
4524 files[nfiles++] = optarg;
4525 break;
4527 case SORT_OPTION:
4528 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4529 FALLTHROUGH;
4530 case 'b':
4531 case 'd':
4532 case 'f':
4533 case 'g':
4534 case 'h':
4535 case 'i':
4536 case 'M':
4537 case 'n':
4538 case 'r':
4539 case 'R':
4540 case 'V':
4542 char str[2];
4543 str[0] = c;
4544 str[1] = '\0';
4545 set_ordering (str, &gkey, bl_both);
4547 break;
4549 case CHECK_OPTION:
4550 c = (optarg
4551 ? XARGMATCH ("--check", optarg, check_args, check_types)
4552 : 'c');
4553 FALLTHROUGH;
4554 case 'c':
4555 case 'C':
4556 if (checkonly && checkonly != c)
4557 incompatible_options ("cC");
4558 checkonly = c;
4559 break;
4561 case COMPRESS_PROGRAM_OPTION:
4562 if (compress_program && !STREQ (compress_program, optarg))
4563 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4564 compress_program = optarg;
4565 break;
4567 case DEBUG_PROGRAM_OPTION:
4568 debug = true;
4569 break;
4571 case FILES0_FROM_OPTION:
4572 files_from = optarg;
4573 break;
4575 case 'k':
4576 key = key_init (&key_buf);
4578 /* Get POS1. */
4579 s = parse_field_count (optarg, &key->sword,
4580 N_("invalid number at field start"));
4581 if (! key->sword--)
4583 /* Provoke with 'sort -k0' */
4584 badfieldspec (optarg, N_("field number is zero"));
4586 if (*s == '.')
4588 s = parse_field_count (s + 1, &key->schar,
4589 N_("invalid number after '.'"));
4590 if (! key->schar--)
4592 /* Provoke with 'sort -k1.0' */
4593 badfieldspec (optarg, N_("character offset is zero"));
4596 if (! (key->sword || key->schar))
4597 key->sword = SIZE_MAX;
4598 s = set_ordering (s, key, bl_start);
4599 if (*s != ',')
4601 key->eword = SIZE_MAX;
4602 key->echar = 0;
4604 else
4606 /* Get POS2. */
4607 s = parse_field_count (s + 1, &key->eword,
4608 N_("invalid number after ','"));
4609 if (! key->eword--)
4611 /* Provoke with 'sort -k1,0' */
4612 badfieldspec (optarg, N_("field number is zero"));
4614 if (*s == '.')
4616 s = parse_field_count (s + 1, &key->echar,
4617 N_("invalid number after '.'"));
4619 s = set_ordering (s, key, bl_end);
4621 if (*s)
4622 badfieldspec (optarg, N_("stray character in field spec"));
4623 insertkey (key);
4624 break;
4626 case 'm':
4627 mergeonly = true;
4628 break;
4630 case NMERGE_OPTION:
4631 specify_nmerge (oi, c, optarg);
4632 break;
4634 case 'o':
4635 if (outfile && !STREQ (outfile, optarg))
4636 error (SORT_FAILURE, 0, _("multiple output files specified"));
4637 outfile = optarg;
4638 break;
4640 case RANDOM_SOURCE_OPTION:
4641 if (random_source && !STREQ (random_source, optarg))
4642 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4643 random_source = optarg;
4644 break;
4646 case 's':
4647 stable = true;
4648 break;
4650 case 'S':
4651 specify_sort_size (oi, c, optarg);
4652 break;
4654 case 't':
4656 char newtab = optarg[0];
4657 if (! newtab)
4658 error (SORT_FAILURE, 0, _("empty tab"));
4659 if (optarg[1])
4661 if (STREQ (optarg, "\\0"))
4662 newtab = '\0';
4663 else
4665 /* Provoke with 'sort -txx'. Complain about
4666 "multi-character tab" instead of "multibyte tab", so
4667 that the diagnostic's wording does not need to be
4668 changed once multibyte characters are supported. */
4669 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4670 quote (optarg));
4673 if (tab != TAB_DEFAULT && tab != newtab)
4674 error (SORT_FAILURE, 0, _("incompatible tabs"));
4675 tab = newtab;
4677 break;
4679 case 'T':
4680 add_temp_dir (optarg);
4681 break;
4683 case PARALLEL_OPTION:
4684 nthreads = specify_nthreads (oi, c, optarg);
4685 break;
4687 case 'u':
4688 unique = true;
4689 break;
4691 case 'y':
4692 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4693 through Solaris 7. It is also accepted by many non-Solaris
4694 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4695 -y is marked as obsolete starting with Solaris 8 (1999), but is
4696 still accepted as of Solaris 10 prerelease (2004).
4698 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4699 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4700 and which in general ignores the argument after "-y" if it
4701 consists entirely of digits (it can even be empty). */
4702 if (optarg == argv[optind - 1])
4704 char const *p;
4705 for (p = optarg; ISDIGIT (*p); p++)
4706 continue;
4707 optind -= (*p != '\0');
4709 break;
4711 case 'z':
4712 eolchar = 0;
4713 break;
4715 case_GETOPT_HELP_CHAR;
4717 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4719 default:
4720 usage (SORT_FAILURE);
4724 if (files_from)
4726 /* When using --files0-from=F, you may not specify any files
4727 on the command-line. */
4728 if (nfiles)
4730 error (0, 0, _("extra operand %s"), quoteaf (files[0]));
4731 fprintf (stderr, "%s\n",
4732 _("file operands cannot be combined with --files0-from"));
4733 usage (SORT_FAILURE);
4736 FILE *stream = xfopen (files_from, "r");
4738 readtokens0_init (&tok);
4740 if (! readtokens0 (stream, &tok))
4741 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4742 quoteaf (files_from));
4743 xfclose (stream, files_from);
4745 if (tok.n_tok)
4747 free (files);
4748 files = tok.tok;
4749 nfiles = tok.n_tok;
4750 for (size_t i = 0; i < nfiles; i++)
4752 if (STREQ (files[i], "-"))
4753 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4754 "no file name of %s allowed"),
4755 quoteaf (files[i]));
4756 else if (files[i][0] == '\0')
4758 /* Using the standard 'filename:line-number:' prefix here is
4759 not totally appropriate, since NUL is the separator,
4760 not NL, but it might be better than nothing. */
4761 unsigned long int file_number = i + 1;
4762 error (SORT_FAILURE, 0,
4763 _("%s:%lu: invalid zero-length file name"),
4764 quotef (files_from), file_number);
4768 else
4769 error (SORT_FAILURE, 0, _("no input from %s"),
4770 quoteaf (files_from));
4773 /* Inheritance of global options to individual keys. */
4774 for (key = keylist; key; key = key->next)
4776 if (default_key_compare (key) && !key->reverse)
4778 key->ignore = gkey.ignore;
4779 key->translate = gkey.translate;
4780 key->skipsblanks = gkey.skipsblanks;
4781 key->skipeblanks = gkey.skipeblanks;
4782 key->month = gkey.month;
4783 key->numeric = gkey.numeric;
4784 key->general_numeric = gkey.general_numeric;
4785 key->human_numeric = gkey.human_numeric;
4786 key->version = gkey.version;
4787 key->random = gkey.random;
4788 key->reverse = gkey.reverse;
4791 need_random |= key->random;
4794 if (!keylist && !default_key_compare (&gkey))
4796 gkey_only = true;
4797 insertkey (&gkey);
4798 need_random |= gkey.random;
4801 check_ordering_compatibility ();
4803 if (debug)
4805 if (checkonly || outfile)
4807 static char opts[] = "X --debug";
4808 opts[0] = (checkonly ? checkonly : 'o');
4809 incompatible_options (opts);
4812 /* Always output the locale in debug mode, since this
4813 is such a common source of confusion. */
4815 /* OpenBSD can only set some categories with LC_ALL above,
4816 so set LC_COLLATE explicitly to flag errors. */
4817 if (locale_ok)
4818 locale_ok = !! setlocale (LC_COLLATE, "");
4819 if (! locale_ok)
4820 error (0, 0, "%s", _("failed to set locale"));
4821 if (hard_LC_COLLATE)
4822 error (0, 0, _("text ordering performed using %s sorting rules"),
4823 quote (setlocale (LC_COLLATE, nullptr)));
4824 else
4825 error (0, 0, "%s",
4826 _("text ordering performed using simple byte comparison"));
4828 key_warnings (&gkey, gkey_only);
4831 reverse = gkey.reverse;
4833 if (need_random)
4834 random_md5_state_init (random_source);
4836 if (temp_dir_count == 0)
4838 char const *tmp_dir = getenv ("TMPDIR");
4839 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4842 if (nfiles == 0)
4844 nfiles = 1;
4845 free (files);
4846 files = xmalloc (sizeof *files);
4847 *files = (char *) "-";
4850 /* Need to re-check that we meet the minimum requirement for memory
4851 usage with the final value for NMERGE. */
4852 if (0 < sort_size)
4853 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4855 if (checkonly)
4857 if (nfiles > 1)
4858 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4859 quoteaf (files[1]), checkonly);
4861 if (outfile)
4863 static char opts[] = {0, 'o', 0};
4864 opts[0] = checkonly;
4865 incompatible_options (opts);
4868 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4869 input is not properly sorted. */
4870 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4873 /* Check all inputs are accessible, or exit immediately. */
4874 check_inputs (files, nfiles);
4876 /* Check output is writable, or exit immediately. */
4877 check_output (outfile);
4879 if (mergeonly)
4881 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4883 for (size_t i = 0; i < nfiles; ++i)
4884 sortfiles[i].name = files[i];
4886 merge (sortfiles, 0, nfiles, outfile);
4888 else
4890 if (!nthreads)
4892 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4893 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4896 /* Avoid integer overflow later. */
4897 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4898 nthreads = MIN (nthreads, nthreads_max);
4900 sort (files, nfiles, outfile, nthreads);
4903 if (have_read_stdin && fclose (stdin) == EOF)
4904 sort_die (_("close failed"), "-");
4906 main_exit (EXIT_SUCCESS);