stty: fix untranslated diagnostics
[coreutils.git] / src / sort.c
bloba7002d170b4a47cf580f0451547d0dcd86e7965b
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/resource.h>
28 #include <sys/types.h>
29 #include <sys/wait.h>
30 #include <signal.h>
31 #include "system.h"
32 #include "argmatch.h"
33 #include "assure.h"
34 #include "fadvise.h"
35 #include "filevercmp.h"
36 #include "flexmember.h"
37 #include "hard-locale.h"
38 #include "hash.h"
39 #include "heap.h"
40 #include "ignore-value.h"
41 #include "md5.h"
42 #include "mbswidth.h"
43 #include "nproc.h"
44 #include "physmem.h"
45 #include "posixver.h"
46 #include "quote.h"
47 #include "randread.h"
48 #include "readtokens0.h"
49 #include "stdlib--.h"
50 #include "strnumcmp.h"
51 #include "xmemcoll.h"
52 #include "xnanosleep.h"
53 #include "xstrtol.h"
54 #include "xstrtol-error.h"
56 #ifndef RLIMIT_DATA
57 struct rlimit { size_t rlim_cur; };
58 # define getrlimit(Resource, Rlp) (-1)
59 #endif
61 /* The official name of this program (e.g., no 'g' prefix). */
62 #define PROGRAM_NAME "sort"
64 #define AUTHORS \
65 proper_name ("Mike Haertel"), \
66 proper_name ("Paul Eggert")
68 #if HAVE_LANGINFO_CODESET
69 # include <langinfo.h>
70 #endif
72 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
73 present. */
74 #ifndef SA_NOCLDSTOP
75 # define SA_NOCLDSTOP 0
76 /* No sigprocmask. Always 'return' zero. */
77 # define sigprocmask(How, Set, Oset) (0)
78 # define sigset_t int
79 # if ! HAVE_SIGINTERRUPT
80 # define siginterrupt(sig, flag) /* empty */
81 # endif
82 #endif
84 #if !defined OPEN_MAX && defined NR_OPEN
85 # define OPEN_MAX NR_OPEN
86 #endif
87 #if !defined OPEN_MAX
88 # define OPEN_MAX 20
89 #endif
91 #define UCHAR_LIM (UCHAR_MAX + 1)
93 #ifndef DEFAULT_TMPDIR
94 # define DEFAULT_TMPDIR "/tmp"
95 #endif
97 /* Maximum number of lines to merge every time a NODE is taken from
98 the merge queue. Node is at LEVEL in the binary merge tree,
99 and is responsible for merging TOTAL lines. */
100 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
102 /* Heuristic value for the number of lines for which it is worth creating
103 a subthread, during an internal merge sort. I.e., it is a small number
104 of "average" lines for which sorting via two threads is faster than
105 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
106 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
107 lines of gensort -a output is sorted slightly faster with --parallel=2
108 than with --parallel=1. By contrast, using --parallel=1 is about 10%
109 faster than using --parallel=2 with a 64K-line input. */
110 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
111 static_assert (4 <= SUBTHREAD_LINES_HEURISTIC);
113 /* The number of threads after which there are
114 diminishing performance gains. */
115 enum { DEFAULT_MAX_THREADS = 8 };
117 /* Exit statuses. */
118 enum
120 /* POSIX says to exit with status 1 if invoked with -c and the
121 input is not properly sorted. */
122 SORT_OUT_OF_ORDER = 1,
124 /* POSIX says any other irregular exit must exit with a status
125 code greater than 1. */
126 SORT_FAILURE = 2
129 enum
131 /* The number of times we should try to fork a compression process
132 (we retry if the fork call fails). We don't _need_ to compress
133 temp files, this is just to reduce file system access, so this number
134 can be small. Each retry doubles in duration. */
135 MAX_FORK_TRIES_COMPRESS = 4,
137 /* The number of times we should try to fork a decompression process.
138 If we can't fork a decompression process, we can't sort, so this
139 number should be big. Each retry doubles in duration. */
140 MAX_FORK_TRIES_DECOMPRESS = 9
143 enum
145 /* Level of the end-of-merge node, one level above the root. */
146 MERGE_END = 0,
148 /* Level of the root node in merge tree. */
149 MERGE_ROOT = 1
152 /* The representation of the decimal point in the current locale. */
153 static char decimal_point;
155 /* Thousands separator; if outside char range, there is no separator. */
156 static int thousands_sep;
157 /* We currently ignore multi-byte grouping chars. */
158 static bool thousands_sep_ignored;
160 /* Nonzero if the corresponding locales are hard. */
161 static bool hard_LC_COLLATE;
162 #if HAVE_NL_LANGINFO
163 static bool hard_LC_TIME;
164 #endif
166 #define NONZERO(x) ((x) != 0)
168 /* The kind of blanks for '-b' to skip in various options. */
169 enum blanktype { bl_start, bl_end, bl_both };
171 /* The character marking end of line. Default to \n. */
172 static char eolchar = '\n';
174 /* Lines are held in memory as counted strings. */
175 struct line
177 char *text; /* Text of the line. */
178 size_t length; /* Length including final newline. */
179 char *keybeg; /* Start of first key. */
180 char *keylim; /* Limit of first key. */
183 /* Input buffers. */
184 struct buffer
186 char *buf; /* Dynamically allocated buffer,
187 partitioned into 3 regions:
188 - input data;
189 - unused area;
190 - an array of lines, in reverse order. */
191 size_t used; /* Number of bytes used for input data. */
192 size_t nlines; /* Number of lines in the line array. */
193 size_t alloc; /* Number of bytes allocated. */
194 size_t left; /* Number of bytes left from previous reads. */
195 size_t line_bytes; /* Number of bytes to reserve for each line. */
196 bool eof; /* An EOF has been read. */
199 /* Sort key. */
200 struct keyfield
202 size_t sword; /* Zero-origin 'word' to start at. */
203 size_t schar; /* Additional characters to skip. */
204 size_t eword; /* Zero-origin last 'word' of key. */
205 size_t echar; /* Additional characters in field. */
206 bool const *ignore; /* Boolean array of characters to ignore. */
207 char const *translate; /* Translation applied to characters. */
208 bool skipsblanks; /* Skip leading blanks when finding start. */
209 bool skipeblanks; /* Skip leading blanks when finding end. */
210 bool numeric; /* Flag for numeric comparison. Handle
211 strings of digits with optional decimal
212 point, but no exponential notation. */
213 bool random; /* Sort by random hash of key. */
214 bool general_numeric; /* Flag for general, numeric comparison.
215 Handle numbers in exponential notation. */
216 bool human_numeric; /* Flag for sorting by human readable
217 units with either SI or IEC prefixes. */
218 bool month; /* Flag for comparison by month name. */
219 bool reverse; /* Reverse the sense of comparison. */
220 bool version; /* sort by version number */
221 bool traditional_used; /* Traditional key option format is used. */
222 struct keyfield *next; /* Next keyfield to try. */
225 struct month
227 char const *name;
228 int val;
231 /* Binary merge tree node. */
232 struct merge_node
234 struct line *lo; /* Lines to merge from LO child node. */
235 struct line *hi; /* Lines to merge from HI child node. */
236 struct line *end_lo; /* End of available lines from LO. */
237 struct line *end_hi; /* End of available lines from HI. */
238 struct line **dest; /* Pointer to destination of merge. */
239 size_t nlo; /* Total Lines remaining from LO. */
240 size_t nhi; /* Total lines remaining from HI. */
241 struct merge_node *parent; /* Parent node. */
242 struct merge_node *lo_child; /* LO child node. */
243 struct merge_node *hi_child; /* HI child node. */
244 unsigned int level; /* Level in merge tree. */
245 bool queued; /* Node is already in heap. */
246 pthread_mutex_t lock; /* Lock for node operations. */
249 /* Priority queue of merge nodes. */
250 struct merge_node_queue
252 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
253 pthread_mutex_t mutex; /* Lock for queue operations. */
254 pthread_cond_t cond; /* Conditional wait for empty queue to populate
255 when popping. */
258 /* Used to implement --unique (-u). */
259 static struct line saved_line;
261 /* FIXME: None of these tables work with multibyte character sets.
262 Also, there are many other bugs when handling multibyte characters.
263 One way to fix this is to rewrite 'sort' to use wide characters
264 internally, but doing this with good performance is a bit
265 tricky. */
267 /* Table of blanks. */
268 static bool blanks[UCHAR_LIM];
270 /* Table of non-printing characters. */
271 static bool nonprinting[UCHAR_LIM];
273 /* Table of non-dictionary characters (not letters, digits, or blanks). */
274 static bool nondictionary[UCHAR_LIM];
276 /* Translation table folding lower case to upper. */
277 static char fold_toupper[UCHAR_LIM];
279 #define MONTHS_PER_YEAR 12
281 /* Table mapping month names to integers.
282 Alphabetic order allows binary search. */
283 static struct month monthtab[] =
285 {"APR", 4},
286 {"AUG", 8},
287 {"DEC", 12},
288 {"FEB", 2},
289 {"JAN", 1},
290 {"JUL", 7},
291 {"JUN", 6},
292 {"MAR", 3},
293 {"MAY", 5},
294 {"NOV", 11},
295 {"OCT", 10},
296 {"SEP", 9}
299 /* During the merge phase, the number of files to merge at once. */
300 #define NMERGE_DEFAULT 16
302 /* Minimum size for a merge or check buffer. */
303 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
305 /* Minimum sort size; the code might not work with smaller sizes. */
306 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
308 /* The number of bytes needed for a merge or check buffer, which can
309 function relatively efficiently even if it holds only one line. If
310 a longer line is seen, this value is increased. */
311 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
313 /* The approximate maximum number of bytes of main memory to use, as
314 specified by the user. Zero if the user has not specified a size. */
315 static size_t sort_size;
317 /* The initial allocation factor for non-regular files.
318 This is used, e.g., when reading from a pipe.
319 Don't make it too big, since it is multiplied by ~130 to
320 obtain the size of the actual buffer sort will allocate.
321 Also, there may be 8 threads all doing this at the same time. */
322 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
324 /* Array of directory names in which any temporary files are to be created. */
325 static char const **temp_dirs;
327 /* Number of temporary directory names used. */
328 static size_t temp_dir_count;
330 /* Number of allocated slots in temp_dirs. */
331 static size_t temp_dir_alloc;
333 /* Flag to reverse the order of all comparisons. */
334 static bool reverse;
336 /* Flag for stable sort. This turns off the last ditch bytewise
337 comparison of lines, and instead leaves lines in the same order
338 they were read if all keys compare equal. */
339 static bool stable;
341 /* An int value outside char range. */
342 enum { NON_CHAR = CHAR_MAX + 1 };
344 /* If TAB has this value, blanks separate fields. */
345 enum { TAB_DEFAULT = CHAR_MAX + 1 };
347 /* Tab character separating fields. If TAB_DEFAULT, then fields are
348 separated by the empty string between a non-blank character and a blank
349 character. */
350 static int tab = TAB_DEFAULT;
352 /* Flag to remove consecutive duplicate lines from the output.
353 Only the last of a sequence of equal lines will be output. */
354 static bool unique;
356 /* Nonzero if any of the input files are the standard input. */
357 static bool have_read_stdin;
359 /* List of key field comparisons to be tried. */
360 static struct keyfield *keylist;
362 /* Program used to (de)compress temp files. Must accept -d. */
363 static char const *compress_program;
365 /* Annotate the output with extra info to aid the user. */
366 static bool debug;
368 /* Maximum number of files to merge in one go. If more than this
369 number are present, temp files will be used. */
370 static unsigned int nmerge = NMERGE_DEFAULT;
372 /* Output an error to stderr and exit using async-signal-safe routines.
373 This can be used safely from signal handlers,
374 and between fork and exec of multithreaded processes. */
376 static _Noreturn void
377 async_safe_die (int errnum, char const *errstr)
379 ignore_value (write (STDERR_FILENO, errstr, strlen (errstr)));
381 /* Even if defined HAVE_STRERROR_R, we can't use it,
382 as it may return a translated string etc. and even if not
383 may call malloc which is unsafe. We might improve this
384 by testing for sys_errlist and using that if available.
385 For now just report the error number. */
386 if (errnum)
388 char errbuf[INT_BUFSIZE_BOUND (errnum)];
389 char *p = inttostr (errnum, errbuf);
390 ignore_value (write (STDERR_FILENO, ": errno ", 8));
391 ignore_value (write (STDERR_FILENO, p, strlen (p)));
394 ignore_value (write (STDERR_FILENO, "\n", 1));
396 _exit (SORT_FAILURE);
399 /* Report MESSAGE for FILE, then clean up and exit.
400 If FILE is null, it represents standard output. */
402 static void
403 sort_die (char const *message, char const *file)
405 error (SORT_FAILURE, errno, "%s: %s", message,
406 quotef (file ? file : _("standard output")));
409 void
410 usage (int status)
412 if (status != EXIT_SUCCESS)
413 emit_try_help ();
414 else
416 printf (_("\
417 Usage: %s [OPTION]... [FILE]...\n\
418 or: %s [OPTION]... --files0-from=F\n\
420 program_name, program_name);
421 fputs (_("\
422 Write sorted concatenation of all FILE(s) to standard output.\n\
423 "), stdout);
425 emit_stdin_note ();
426 emit_mandatory_arg_note ();
428 fputs (_("\
429 Ordering options:\n\
431 "), stdout);
432 fputs (_("\
433 -b, --ignore-leading-blanks ignore leading blanks\n\
434 -d, --dictionary-order consider only blanks and alphanumeric characters\
436 -f, --ignore-case fold lower case to upper case characters\n\
437 "), stdout);
438 fputs (_("\
439 -g, --general-numeric-sort compare according to general numerical value\n\
440 -i, --ignore-nonprinting consider only printable characters\n\
441 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
442 "), stdout);
443 fputs (_("\
444 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
445 "), stdout);
446 fputs (_("\
447 -n, --numeric-sort compare according to string numerical value\n\
448 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
449 --random-source=FILE get random bytes from FILE\n\
450 -r, --reverse reverse the result of comparisons\n\
451 "), stdout);
452 fputs (_("\
453 --sort=WORD sort according to WORD:\n\
454 general-numeric -g, human-numeric -h, month -M,\
456 numeric -n, random -R, version -V\n\
457 -V, --version-sort natural sort of (version) numbers within text\n\
459 "), stdout);
460 fputs (_("\
461 Other options:\n\
463 "), stdout);
464 fputs (_("\
465 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
466 for more use temp files\n\
467 "), stdout);
468 fputs (_("\
469 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
470 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
472 --compress-program=PROG compress temporaries with PROG;\n\
473 decompress them with PROG -d\n\
474 "), stdout);
475 fputs (_("\
476 --debug annotate the part of the line used to sort,\n\
477 and warn about questionable usage to stderr\n\
478 --files0-from=F read input from the files specified by\n\
479 NUL-terminated names in file F;\n\
480 If F is - then read names from standard input\n\
481 "), stdout);
482 fputs (_("\
483 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
484 -m, --merge merge already sorted files; do not sort\n\
485 "), stdout);
486 fputs (_("\
487 -o, --output=FILE write result to FILE instead of standard output\n\
488 -s, --stable stabilize sort by disabling last-resort comparison\
490 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
491 "), stdout);
492 printf (_("\
493 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
494 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
495 multiple options specify multiple directories\n\
496 --parallel=N change the number of sorts run concurrently to N\n\
497 -u, --unique with -c, check for strict ordering;\n\
498 without -c, output only the first of an equal run\
500 "), DEFAULT_TMPDIR);
501 fputs (_("\
502 -z, --zero-terminated line delimiter is NUL, not newline\n\
503 "), stdout);
504 fputs (HELP_OPTION_DESCRIPTION, stdout);
505 fputs (VERSION_OPTION_DESCRIPTION, stdout);
506 fputs (_("\
508 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
509 field number and C a character position in the field; both are origin 1, and\n\
510 the stop position defaults to the line's end. If neither -t nor -b is in\n\
511 effect, characters in a field are counted from the beginning of the preceding\n\
512 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
514 which override global ordering options for that key. If no key is given, use\n\
515 the entire line as the key. Use --debug to diagnose incorrect key usage.\n\
517 SIZE may be followed by the following multiplicative suffixes:\n\
518 "), stdout);
519 fputs (_("\
520 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y, R, Q.\
521 \n\n\
522 *** WARNING ***\n\
523 The locale specified by the environment affects sort order.\n\
524 Set LC_ALL=C to get the traditional sort order that uses\n\
525 native byte values.\n\
526 "), stdout );
527 emit_ancillary_info (PROGRAM_NAME);
530 exit (status);
533 /* For long options that have no equivalent short option, use a
534 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
535 enum
537 CHECK_OPTION = CHAR_MAX + 1,
538 COMPRESS_PROGRAM_OPTION,
539 DEBUG_PROGRAM_OPTION,
540 FILES0_FROM_OPTION,
541 NMERGE_OPTION,
542 RANDOM_SOURCE_OPTION,
543 SORT_OPTION,
544 PARALLEL_OPTION
547 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
549 static struct option const long_options[] =
551 {"ignore-leading-blanks", no_argument, nullptr, 'b'},
552 {"check", optional_argument, nullptr, CHECK_OPTION},
553 {"compress-program", required_argument, nullptr, COMPRESS_PROGRAM_OPTION},
554 {"debug", no_argument, nullptr, DEBUG_PROGRAM_OPTION},
555 {"dictionary-order", no_argument, nullptr, 'd'},
556 {"ignore-case", no_argument, nullptr, 'f'},
557 {"files0-from", required_argument, nullptr, FILES0_FROM_OPTION},
558 {"general-numeric-sort", no_argument, nullptr, 'g'},
559 {"ignore-nonprinting", no_argument, nullptr, 'i'},
560 {"key", required_argument, nullptr, 'k'},
561 {"merge", no_argument, nullptr, 'm'},
562 {"month-sort", no_argument, nullptr, 'M'},
563 {"numeric-sort", no_argument, nullptr, 'n'},
564 {"human-numeric-sort", no_argument, nullptr, 'h'},
565 {"version-sort", no_argument, nullptr, 'V'},
566 {"random-sort", no_argument, nullptr, 'R'},
567 {"random-source", required_argument, nullptr, RANDOM_SOURCE_OPTION},
568 {"sort", required_argument, nullptr, SORT_OPTION},
569 {"output", required_argument, nullptr, 'o'},
570 {"reverse", no_argument, nullptr, 'r'},
571 {"stable", no_argument, nullptr, 's'},
572 {"batch-size", required_argument, nullptr, NMERGE_OPTION},
573 {"buffer-size", required_argument, nullptr, 'S'},
574 {"field-separator", required_argument, nullptr, 't'},
575 {"temporary-directory", required_argument, nullptr, 'T'},
576 {"unique", no_argument, nullptr, 'u'},
577 {"zero-terminated", no_argument, nullptr, 'z'},
578 {"parallel", required_argument, nullptr, PARALLEL_OPTION},
579 {GETOPT_HELP_OPTION_DECL},
580 {GETOPT_VERSION_OPTION_DECL},
581 {nullptr, 0, nullptr, 0},
584 #define CHECK_TABLE \
585 _ct_("quiet", 'C') \
586 _ct_("silent", 'C') \
587 _ct_("diagnose-first", 'c')
589 static char const *const check_args[] =
591 #define _ct_(_s, _c) _s,
592 CHECK_TABLE nullptr
593 #undef _ct_
595 static char const check_types[] =
597 #define _ct_(_s, _c) _c,
598 CHECK_TABLE
599 #undef _ct_
602 #define SORT_TABLE \
603 _st_("general-numeric", 'g') \
604 _st_("human-numeric", 'h') \
605 _st_("month", 'M') \
606 _st_("numeric", 'n') \
607 _st_("random", 'R') \
608 _st_("version", 'V')
610 static char const *const sort_args[] =
612 #define _st_(_s, _c) _s,
613 SORT_TABLE nullptr
614 #undef _st_
616 static char const sort_types[] =
618 #define _st_(_s, _c) _c,
619 SORT_TABLE
620 #undef _st_
623 /* The set of signals that are caught. */
624 static sigset_t caught_signals;
626 /* Critical section status. */
627 struct cs_status
629 bool valid;
630 sigset_t sigs;
633 /* Enter a critical section. */
634 static void
635 cs_enter (struct cs_status *status)
637 int ret = pthread_sigmask (SIG_BLOCK, &caught_signals, &status->sigs);
638 status->valid = ret == 0;
641 /* Leave a critical section. */
642 static void
643 cs_leave (struct cs_status const *status)
645 if (status->valid)
647 /* Ignore failure when restoring the signal mask. */
648 pthread_sigmask (SIG_SETMASK, &status->sigs, nullptr);
652 /* Possible states for a temp file. If compressed, the file's status
653 is unreaped or reaped, depending on whether 'sort' has waited for
654 the subprocess to finish. */
655 enum { UNCOMPRESSED, UNREAPED, REAPED };
657 /* The list of temporary files. */
658 struct tempnode
660 struct tempnode *volatile next;
661 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
662 char state;
663 char name[FLEXIBLE_ARRAY_MEMBER];
665 static struct tempnode *volatile temphead;
666 static struct tempnode *volatile *temptail = &temphead;
668 /* A file to be sorted. */
669 struct sortfile
671 /* The file's name. */
672 char const *name;
674 /* Non-null if this is a temporary file, in which case NAME == TEMP->name. */
675 struct tempnode *temp;
678 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
679 static Hash_table *proctab;
681 enum { INIT_PROCTAB_SIZE = 47 };
683 static size_t
684 proctab_hasher (void const *entry, size_t tabsize)
686 struct tempnode const *node = entry;
687 return node->pid % tabsize;
690 static bool
691 proctab_comparator (void const *e1, void const *e2)
693 struct tempnode const *n1 = e1;
694 struct tempnode const *n2 = e2;
695 return n1->pid == n2->pid;
698 /* The number of unreaped child processes. */
699 static pid_t nprocs;
701 static bool delete_proc (pid_t);
703 /* If PID is positive, wait for the child process with that PID to
704 exit, and assume that PID has already been removed from the process
705 table. If PID is 0 or -1, clean up some child that has exited (by
706 waiting for it, and removing it from the proc table) and return the
707 child's process ID. However, if PID is 0 and no children have
708 exited, return 0 without waiting. */
710 static pid_t
711 reap (pid_t pid)
713 int status;
714 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
716 if (cpid < 0)
717 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
718 quoteaf (compress_program));
719 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
721 if (! WIFEXITED (status) || WEXITSTATUS (status))
722 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
723 quoteaf (compress_program));
724 --nprocs;
727 return cpid;
730 /* TEMP represents a new process; add it to the process table. Create
731 the process table the first time it's called. */
733 static void
734 register_proc (struct tempnode *temp)
736 if (! proctab)
738 proctab = hash_initialize (INIT_PROCTAB_SIZE, nullptr,
739 proctab_hasher,
740 proctab_comparator,
741 nullptr);
742 if (! proctab)
743 xalloc_die ();
746 temp->state = UNREAPED;
748 if (! hash_insert (proctab, temp))
749 xalloc_die ();
752 /* If PID is in the process table, remove it and return true.
753 Otherwise, return false. */
755 static bool
756 delete_proc (pid_t pid)
758 struct tempnode test;
760 test.pid = pid;
761 struct tempnode *node = hash_remove (proctab, &test);
762 if (! node)
763 return false;
764 node->state = REAPED;
765 return true;
768 /* Remove PID from the process table, and wait for it to exit if it
769 hasn't already. */
771 static void
772 wait_proc (pid_t pid)
774 if (delete_proc (pid))
775 reap (pid);
778 /* Reap any exited children. Do not block; reap only those that have
779 already exited. */
781 static void
782 reap_exited (void)
784 while (0 < nprocs && reap (0))
785 continue;
788 /* Reap at least one exited child, waiting if necessary. */
790 static void
791 reap_some (void)
793 reap (-1);
794 reap_exited ();
797 /* Reap all children, waiting if necessary. */
799 static void
800 reap_all (void)
802 while (0 < nprocs)
803 reap (-1);
806 /* Clean up any remaining temporary files. */
808 static void
809 cleanup (void)
811 struct tempnode const *node;
813 for (node = temphead; node; node = node->next)
814 unlink (node->name);
815 temphead = nullptr;
818 /* Cleanup actions to take when exiting. */
820 static void
821 exit_cleanup (void)
823 if (temphead)
825 /* Clean up any remaining temporary files in a critical section so
826 that a signal handler does not try to clean them too. */
827 struct cs_status cs;
828 cs_enter (&cs);
829 cleanup ();
830 cs_leave (&cs);
833 close_stdout ();
836 /* Create a new temporary file, returning its newly allocated tempnode.
837 Store into *PFD the file descriptor open for writing.
838 If the creation fails, return nullptr and store -1 into *PFD if the
839 failure is due to file descriptor exhaustion and
840 SURVIVE_FD_EXHAUSTION; otherwise, die. */
842 static struct tempnode *
843 create_temp_file (int *pfd, bool survive_fd_exhaustion)
845 static char const slashbase[] = "/sortXXXXXX";
846 static size_t temp_dir_index;
847 int fd;
848 int saved_errno;
849 char const *temp_dir = temp_dirs[temp_dir_index];
850 size_t len = strlen (temp_dir);
851 struct tempnode *node =
852 xmalloc (FLEXSIZEOF (struct tempnode, name, len + sizeof slashbase));
853 char *file = node->name;
854 struct cs_status cs;
856 memcpy (file, temp_dir, len);
857 memcpy (file + len, slashbase, sizeof slashbase);
858 node->next = nullptr;
859 if (++temp_dir_index == temp_dir_count)
860 temp_dir_index = 0;
862 /* Create the temporary file in a critical section, to avoid races. */
863 cs_enter (&cs);
864 fd = mkostemp (file, O_CLOEXEC);
865 if (0 <= fd)
867 *temptail = node;
868 temptail = &node->next;
870 saved_errno = errno;
871 cs_leave (&cs);
872 errno = saved_errno;
874 if (fd < 0)
876 if (! (survive_fd_exhaustion && errno == EMFILE))
877 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
878 quoteaf (temp_dir));
879 free (node);
880 node = nullptr;
883 *pfd = fd;
884 return node;
887 /* Return a pointer to stdout status, or nullptr on failure. */
889 static struct stat *
890 get_outstatus (void)
892 static int outstat_errno;
893 static struct stat outstat;
894 if (outstat_errno == 0)
895 outstat_errno = fstat (STDOUT_FILENO, &outstat) == 0 ? -1 : errno;
896 return outstat_errno < 0 ? &outstat : nullptr;
899 /* Return a stream for FILE, opened with mode HOW. If HOW is "w",
900 the file is already open on standard output, and needs to be
901 truncated unless FILE is null. When opening for input, "-"
902 means standard input. To avoid confusion, do not return file
903 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
904 opening an ordinary FILE. Return nullptr if unsuccessful.
906 Use fadvise to specify an access pattern for input files.
907 There are a few hints we could possibly provide,
908 and after careful testing it was decided that
909 specifying FADVISE_SEQUENTIAL was not detrimental
910 to any cases. On Linux 2.6.31, this option doubles
911 the size of read ahead performed and thus was seen to
912 benefit these cases:
913 Merging
914 Sorting with a smaller internal buffer
915 Reading from faster flash devices
917 In _addition_ one could also specify other hints...
919 FADVISE_WILLNEED was tested, but Linux 2.6.31
920 at least uses that to _synchronously_ prepopulate the cache
921 with the specified range. While sort does need to
922 read all of its input before outputting, a synchronous
923 read of the whole file up front precludes any processing
924 that sort could do in parallel with the system doing
925 read ahead of the data. This was seen to have negative effects
926 in a couple of cases:
927 Merging
928 Sorting with a smaller internal buffer
929 This option was seen to shorten the runtime for sort
930 on a multicore system with lots of RAM and other processes
931 competing for CPU. It could be argued that more explicit
932 scheduling hints with 'nice' et. al. are more appropriate
933 for this situation.
935 FADVISE_NOREUSE is a possibility as it could lower
936 the priority of input data in the cache as sort will
937 only need to process it once. However its functionality
938 has changed over Linux kernel versions and as of 2.6.31
939 it does nothing and thus we can't depend on what it might
940 do in future.
942 FADVISE_DONTNEED is not appropriate for user specified
943 input files, but for temp files we do want to drop the
944 cache immediately after processing. This is done implicitly
945 however when the files are unlinked. */
947 static FILE *
948 stream_open (char const *file, char const *how)
950 FILE *fp;
952 if (*how == 'r')
954 if (STREQ (file, "-"))
956 have_read_stdin = true;
957 fp = stdin;
959 else
961 int fd = open (file, O_RDONLY | O_CLOEXEC);
962 fp = fd < 0 ? nullptr : fdopen (fd, how);
964 fadvise (fp, FADVISE_SEQUENTIAL);
966 else if (*how == 'w')
968 if (file && ftruncate (STDOUT_FILENO, 0) != 0)
970 int ftruncate_errno = errno;
971 struct stat *outst = get_outstatus ();
972 if (!outst || S_ISREG (outst->st_mode) || S_TYPEISSHM (outst))
973 error (SORT_FAILURE, ftruncate_errno, _("%s: error truncating"),
974 quotef (file));
976 fp = stdout;
978 else
979 affirm (!"unexpected mode passed to stream_open");
981 return fp;
984 /* Same as stream_open, except always return a non-null value; die on
985 failure. */
987 static FILE *
988 xfopen (char const *file, char const *how)
990 FILE *fp = stream_open (file, how);
991 if (!fp)
992 sort_die (_("open failed"), file);
993 return fp;
996 /* Close FP, whose name is FILE, and report any errors. */
998 static void
999 xfclose (FILE *fp, char const *file)
1001 switch (fileno (fp))
1003 case STDIN_FILENO:
1004 /* Allow reading stdin from tty more than once. */
1005 clearerr (fp);
1006 break;
1008 case STDOUT_FILENO:
1009 /* Don't close stdout just yet. close_stdout does that. */
1010 if (fflush (fp) != 0)
1011 sort_die (_("fflush failed"), file);
1012 break;
1014 default:
1015 if (fclose (fp) != 0)
1016 sort_die (_("close failed"), file);
1017 break;
1021 /* Move OLDFD to NEWFD. If OLDFD != NEWFD, NEWFD is not close-on-exec. */
1023 static void
1024 move_fd (int oldfd, int newfd)
1026 if (oldfd != newfd)
1028 /* This should never fail for our usage. */
1029 dup2 (oldfd, newfd);
1030 close (oldfd);
1034 /* Fork a child process for piping to and do common cleanup. The
1035 TRIES parameter specifies how many times to try to fork before
1036 giving up. Return the PID of the child, or -1 (setting errno)
1037 on failure. */
1039 static pid_t
1040 pipe_fork (int pipefds[2], size_t tries)
1042 #if HAVE_WORKING_FORK
1043 struct tempnode *saved_temphead;
1044 int saved_errno;
1045 double wait_retry = 0.25;
1046 pid_t pid;
1047 struct cs_status cs;
1049 if (pipe2 (pipefds, O_CLOEXEC) < 0)
1050 return -1;
1052 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1053 uncontrolled subprocess generation can hurt performance significantly.
1054 Allow at most NMERGE + 2 subprocesses, on the theory that there
1055 may be some useful parallelism by letting compression for the
1056 previous merge finish (1 subprocess) in parallel with the current
1057 merge (NMERGE + 1 subprocesses). */
1059 if (nmerge + 1 < nprocs)
1060 reap_some ();
1062 while (tries--)
1064 /* This is so the child process won't delete our temp files
1065 if it receives a signal before exec-ing. */
1066 cs_enter (&cs);
1067 saved_temphead = temphead;
1068 temphead = nullptr;
1070 pid = fork ();
1071 saved_errno = errno;
1072 if (pid)
1073 temphead = saved_temphead;
1075 cs_leave (&cs);
1076 errno = saved_errno;
1078 if (0 <= pid || errno != EAGAIN)
1079 break;
1080 else
1082 xnanosleep (wait_retry);
1083 wait_retry *= 2;
1084 reap_exited ();
1088 if (pid < 0)
1090 saved_errno = errno;
1091 close (pipefds[0]);
1092 close (pipefds[1]);
1093 errno = saved_errno;
1095 else if (pid == 0)
1097 close (STDIN_FILENO);
1098 close (STDOUT_FILENO);
1100 else
1101 ++nprocs;
1103 return pid;
1105 #else /* ! HAVE_WORKING_FORK */
1106 return -1;
1107 #endif
1110 /* Create a temporary file and, if asked for, start a compressor
1111 to that file. Set *PFP to the file handle and return
1112 the address of the new temp node. If the creation
1113 fails, return nullptr if the failure is due to file descriptor
1114 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1116 static struct tempnode *
1117 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1119 int tempfd;
1120 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1121 if (! node)
1122 return nullptr;
1124 node->state = UNCOMPRESSED;
1126 if (compress_program)
1128 int pipefds[2];
1130 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1131 if (0 < node->pid)
1133 close (tempfd);
1134 close (pipefds[0]);
1135 tempfd = pipefds[1];
1137 register_proc (node);
1139 else if (node->pid == 0)
1141 /* Being the child of a multithreaded program before exec,
1142 we're restricted to calling async-signal-safe routines here. */
1143 close (pipefds[1]);
1144 move_fd (tempfd, STDOUT_FILENO);
1145 move_fd (pipefds[0], STDIN_FILENO);
1147 execlp (compress_program, compress_program, (char *) nullptr);
1149 async_safe_die (errno, "couldn't execute compress program");
1153 *pfp = fdopen (tempfd, "w");
1154 if (! *pfp)
1155 sort_die (_("couldn't create temporary file"), node->name);
1157 return node;
1160 /* Create a temporary file and, if asked for, start a compressor
1161 to that file. Set *PFP to the file handle and return the address
1162 of the new temp node. Die on failure. */
1164 static struct tempnode *
1165 create_temp (FILE **pfp)
1167 return maybe_create_temp (pfp, false);
1170 /* Open a compressed temp file and start a decompression process through
1171 which to filter the input. Return nullptr (setting errno to
1172 EMFILE) if we ran out of file descriptors, and die on any other
1173 kind of failure. */
1175 static FILE *
1176 open_temp (struct tempnode *temp)
1178 int tempfd, pipefds[2];
1179 FILE *fp = nullptr;
1181 if (temp->state == UNREAPED)
1182 wait_proc (temp->pid);
1184 tempfd = open (temp->name, O_RDONLY);
1185 if (tempfd < 0)
1186 return nullptr;
1188 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1190 switch (child)
1192 case -1:
1193 if (errno != EMFILE)
1194 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1195 quoteaf (compress_program));
1196 close (tempfd);
1197 errno = EMFILE;
1198 break;
1200 case 0:
1201 /* Being the child of a multithreaded program before exec,
1202 we're restricted to calling async-signal-safe routines here. */
1203 close (pipefds[0]);
1204 move_fd (tempfd, STDIN_FILENO);
1205 move_fd (pipefds[1], STDOUT_FILENO);
1207 execlp (compress_program, compress_program, "-d", (char *) nullptr);
1209 async_safe_die (errno, "couldn't execute compress program (with -d)");
1211 default:
1212 temp->pid = child;
1213 register_proc (temp);
1214 close (tempfd);
1215 close (pipefds[1]);
1217 fp = fdopen (pipefds[0], "r");
1218 if (! fp)
1220 int saved_errno = errno;
1221 close (pipefds[0]);
1222 errno = saved_errno;
1224 break;
1227 return fp;
1230 /* Append DIR to the array of temporary directory names. */
1231 static void
1232 add_temp_dir (char const *dir)
1234 if (temp_dir_count == temp_dir_alloc)
1235 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1237 temp_dirs[temp_dir_count++] = dir;
1240 /* Remove NAME from the list of temporary files. */
1242 static void
1243 zaptemp (char const *name)
1245 struct tempnode *volatile *pnode;
1246 struct tempnode *node;
1247 struct tempnode *next;
1248 int unlink_status;
1249 int unlink_errno = 0;
1250 struct cs_status cs;
1252 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1253 continue;
1255 if (node->state == UNREAPED)
1256 wait_proc (node->pid);
1258 /* Unlink the temporary file in a critical section to avoid races. */
1259 next = node->next;
1260 cs_enter (&cs);
1261 unlink_status = unlink (name);
1262 unlink_errno = errno;
1263 *pnode = next;
1264 cs_leave (&cs);
1266 if (unlink_status != 0)
1267 error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
1268 if (! next)
1269 temptail = pnode;
1270 free (node);
1273 #if HAVE_NL_LANGINFO
1275 static int
1276 struct_month_cmp (void const *m1, void const *m2)
1278 struct month const *month1 = m1;
1279 struct month const *month2 = m2;
1280 return strcmp (month1->name, month2->name);
1283 #endif
1285 /* Initialize the character class tables. */
1287 static void
1288 inittables (void)
1290 size_t i;
1292 for (i = 0; i < UCHAR_LIM; ++i)
1294 blanks[i] = field_sep (i);
1295 nonprinting[i] = ! isprint (i);
1296 nondictionary[i] = ! isalnum (i) && ! field_sep (i);
1297 fold_toupper[i] = toupper (i);
1300 #if HAVE_NL_LANGINFO
1301 /* If we're not in the "C" locale, read different names for months. */
1302 if (hard_LC_TIME)
1304 for (i = 0; i < MONTHS_PER_YEAR; i++)
1306 char const *s;
1307 size_t s_len;
1308 size_t j, k;
1309 char *name;
1311 s = nl_langinfo (ABMON_1 + i);
1312 s_len = strlen (s);
1313 monthtab[i].name = name = xmalloc (s_len + 1);
1314 monthtab[i].val = i + 1;
1316 for (j = k = 0; j < s_len; j++)
1317 if (! isblank (to_uchar (s[j])))
1318 name[k++] = fold_toupper[to_uchar (s[j])];
1319 name[k] = '\0';
1321 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1323 #endif
1326 /* Specify how many inputs may be merged at once.
1327 This may be set on the command-line with the
1328 --batch-size option. */
1329 static void
1330 specify_nmerge (int oi, char c, char const *s)
1332 uintmax_t n;
1333 struct rlimit rlimit;
1334 enum strtol_error e = xstrtoumax (s, nullptr, 10, &n, "");
1336 /* Try to find out how many file descriptors we'll be able
1337 to open. We need at least nmerge + 3 (STDIN_FILENO,
1338 STDOUT_FILENO and STDERR_FILENO). */
1339 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1340 ? rlimit.rlim_cur
1341 : OPEN_MAX)
1342 - 3);
1344 if (e == LONGINT_OK)
1346 nmerge = n;
1347 if (nmerge != n)
1348 e = LONGINT_OVERFLOW;
1349 else
1351 if (nmerge < 2)
1353 error (0, 0, _("invalid --%s argument %s"),
1354 long_options[oi].name, quote (s));
1355 error (SORT_FAILURE, 0,
1356 _("minimum --%s argument is %s"),
1357 long_options[oi].name, quote ("2"));
1359 else if (max_nmerge < nmerge)
1361 e = LONGINT_OVERFLOW;
1363 else
1364 return;
1368 if (e == LONGINT_OVERFLOW)
1370 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1371 error (0, 0, _("--%s argument %s too large"),
1372 long_options[oi].name, quote (s));
1373 error (SORT_FAILURE, 0,
1374 _("maximum --%s argument with current rlimit is %s"),
1375 long_options[oi].name,
1376 uinttostr (max_nmerge, max_nmerge_buf));
1378 else
1379 xstrtol_fatal (e, oi, c, long_options, s);
1382 /* Specify the amount of main memory to use when sorting. */
1383 static void
1384 specify_sort_size (int oi, char c, char const *s)
1386 uintmax_t n;
1387 char *suffix;
1388 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPQRtTYZ");
1390 /* The default unit is KiB. */
1391 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1393 if (n <= UINTMAX_MAX / 1024)
1394 n *= 1024;
1395 else
1396 e = LONGINT_OVERFLOW;
1399 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1400 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1401 switch (suffix[0])
1403 case 'b':
1404 e = LONGINT_OK;
1405 break;
1407 case '%':
1409 double mem = physmem_total () * n / 100;
1411 /* Use "<", not "<=", to avoid problems with rounding. */
1412 if (mem < UINTMAX_MAX)
1414 n = mem;
1415 e = LONGINT_OK;
1417 else
1418 e = LONGINT_OVERFLOW;
1420 break;
1423 if (e == LONGINT_OK)
1425 /* If multiple sort sizes are specified, take the maximum, so
1426 that option order does not matter. */
1427 if (n < sort_size)
1428 return;
1430 sort_size = n;
1431 if (sort_size == n)
1433 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1434 return;
1437 e = LONGINT_OVERFLOW;
1440 xstrtol_fatal (e, oi, c, long_options, s);
1443 /* Specify the number of threads to spawn during internal sort. */
1444 static size_t
1445 specify_nthreads (int oi, char c, char const *s)
1447 uintmax_t nthreads;
1448 enum strtol_error e = xstrtoumax (s, nullptr, 10, &nthreads, "");
1449 if (e == LONGINT_OVERFLOW)
1450 return SIZE_MAX;
1451 if (e != LONGINT_OK)
1452 xstrtol_fatal (e, oi, c, long_options, s);
1453 if (SIZE_MAX < nthreads)
1454 nthreads = SIZE_MAX;
1455 if (nthreads == 0)
1456 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1457 return nthreads;
1460 /* Return the default sort size. */
1461 static size_t
1462 default_sort_size (void)
1464 /* Let SIZE be MEM, but no more than the maximum object size,
1465 total memory, or system resource limits. Don't bother to check
1466 for values like RLIM_INFINITY since in practice they are not much
1467 less than SIZE_MAX. */
1468 size_t size = SIZE_MAX;
1469 struct rlimit rlimit;
1470 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1471 size = rlimit.rlim_cur;
1472 #ifdef RLIMIT_AS
1473 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1474 size = rlimit.rlim_cur;
1475 #endif
1477 /* Leave a large safety margin for the above limits, as failure can
1478 occur when they are exceeded. */
1479 size /= 2;
1481 #ifdef RLIMIT_RSS
1482 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1483 Exceeding RSS is not fatal, but can be quite slow. */
1484 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1485 size = rlimit.rlim_cur / 16 * 15;
1486 #endif
1488 /* Let MEM be available memory or 1/8 of total memory, whichever
1489 is greater. */
1490 double avail = physmem_available ();
1491 double total = physmem_total ();
1492 double mem = MAX (avail, total / 8);
1494 /* Leave a 1/4 margin for physical memory. */
1495 if (total * 0.75 < size)
1496 size = total * 0.75;
1498 /* Return the minimum of MEM and SIZE, but no less than
1499 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1500 right when only one argument is floating point. */
1501 if (mem < size)
1502 size = mem;
1503 return MAX (size, MIN_SORT_SIZE);
1506 /* Return the sort buffer size to use with the input files identified
1507 by FPS and FILES, which are alternate names of the same files.
1508 NFILES gives the number of input files; NFPS may be less. Assume
1509 that each input line requires LINE_BYTES extra bytes' worth of line
1510 information. Do not exceed the size bound specified by the user
1511 (or a default size bound, if the user does not specify one). */
1513 static size_t
1514 sort_buffer_size (FILE *const *fps, size_t nfps,
1515 char *const *files, size_t nfiles,
1516 size_t line_bytes)
1518 /* A bound on the input size. If zero, the bound hasn't been
1519 determined yet. */
1520 static size_t size_bound;
1522 /* In the worst case, each input byte is a newline. */
1523 size_t worst_case_per_input_byte = line_bytes + 1;
1525 /* Keep enough room for one extra input line and an extra byte.
1526 This extra room might be needed when preparing to read EOF. */
1527 size_t size = worst_case_per_input_byte + 1;
1529 for (size_t i = 0; i < nfiles; i++)
1531 struct stat st;
1532 off_t file_size;
1533 size_t worst_case;
1535 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1536 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1537 : stat (files[i], &st))
1538 != 0)
1539 sort_die (_("stat failed"), files[i]);
1541 if (S_ISREG (st.st_mode))
1542 file_size = st.st_size;
1543 else
1545 /* The file has unknown size. If the user specified a sort
1546 buffer size, use that; otherwise, guess the size. */
1547 if (sort_size)
1548 return sort_size;
1549 file_size = INPUT_FILE_SIZE_GUESS;
1552 if (! size_bound)
1554 size_bound = sort_size;
1555 if (! size_bound)
1556 size_bound = default_sort_size ();
1559 /* Add the amount of memory needed to represent the worst case
1560 where the input consists entirely of newlines followed by a
1561 single non-newline. Check for overflow. */
1562 worst_case = file_size * worst_case_per_input_byte + 1;
1563 if (file_size != worst_case / worst_case_per_input_byte
1564 || size_bound - size <= worst_case)
1565 return size_bound;
1566 size += worst_case;
1569 return size;
1572 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1573 must be at least sizeof (struct line). Allocate ALLOC bytes
1574 initially. */
1576 static void
1577 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1579 /* Ensure that the line array is properly aligned. If the desired
1580 size cannot be allocated, repeatedly halve it until allocation
1581 succeeds. The smaller allocation may hurt overall performance,
1582 but that's better than failing. */
1583 while (true)
1585 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1586 buf->buf = malloc (alloc);
1587 if (buf->buf)
1588 break;
1589 alloc /= 2;
1590 if (alloc <= line_bytes + 1)
1591 xalloc_die ();
1594 buf->line_bytes = line_bytes;
1595 buf->alloc = alloc;
1596 buf->used = buf->left = buf->nlines = 0;
1597 buf->eof = false;
1600 /* Return one past the limit of the line array. */
1602 static inline struct line *
1603 buffer_linelim (struct buffer const *buf)
1605 void *linelim = buf->buf + buf->alloc;
1606 return linelim;
1609 /* Return a pointer to the first character of the field specified
1610 by KEY in LINE. */
1612 static char *
1613 begfield (struct line const *line, struct keyfield const *key)
1615 char *ptr = line->text, *lim = ptr + line->length - 1;
1616 size_t sword = key->sword;
1617 size_t schar = key->schar;
1619 /* The leading field separator itself is included in a field when -t
1620 is absent. */
1622 if (tab != TAB_DEFAULT)
1623 while (ptr < lim && sword--)
1625 while (ptr < lim && *ptr != tab)
1626 ++ptr;
1627 if (ptr < lim)
1628 ++ptr;
1630 else
1631 while (ptr < lim && sword--)
1633 while (ptr < lim && blanks[to_uchar (*ptr)])
1634 ++ptr;
1635 while (ptr < lim && !blanks[to_uchar (*ptr)])
1636 ++ptr;
1639 /* If we're ignoring leading blanks when computing the Start
1640 of the field, skip past them here. */
1641 if (key->skipsblanks)
1642 while (ptr < lim && blanks[to_uchar (*ptr)])
1643 ++ptr;
1645 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1646 ptr = MIN (lim, ptr + schar);
1648 return ptr;
1651 /* Return the limit of (a pointer to the first character after) the field
1652 in LINE specified by KEY. */
1654 ATTRIBUTE_PURE
1655 static char *
1656 limfield (struct line const *line, struct keyfield const *key)
1658 char *ptr = line->text, *lim = ptr + line->length - 1;
1659 size_t eword = key->eword, echar = key->echar;
1661 if (echar == 0)
1662 eword++; /* Skip all of end field. */
1664 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1665 whichever comes first. If there are more than EWORD fields, leave
1666 PTR pointing at the beginning of the field having zero-based index,
1667 EWORD. If a delimiter character was specified (via -t), then that
1668 'beginning' is the first character following the delimiting TAB.
1669 Otherwise, leave PTR pointing at the first 'blank' character after
1670 the preceding field. */
1671 if (tab != TAB_DEFAULT)
1672 while (ptr < lim && eword--)
1674 while (ptr < lim && *ptr != tab)
1675 ++ptr;
1676 if (ptr < lim && (eword || echar))
1677 ++ptr;
1679 else
1680 while (ptr < lim && eword--)
1682 while (ptr < lim && blanks[to_uchar (*ptr)])
1683 ++ptr;
1684 while (ptr < lim && !blanks[to_uchar (*ptr)])
1685 ++ptr;
1688 #ifdef POSIX_UNSPECIFIED
1689 /* The following block of code makes GNU sort incompatible with
1690 standard Unix sort, so it's ifdef'd out for now.
1691 The POSIX spec isn't clear on how to interpret this.
1692 FIXME: request clarification.
1694 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1695 Date: Thu, 30 May 96 12:20:41 -0400
1696 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1698 [...]I believe I've found another bug in 'sort'.
1700 $ cat /tmp/sort.in
1701 a b c 2 d
1702 pq rs 1 t
1703 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1704 a b c 2 d
1705 pq rs 1 t
1706 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1707 pq rs 1 t
1708 a b c 2 d
1710 Unix sort produced the answer I expected: sort on the single character
1711 in column 7. GNU sort produced different results, because it disagrees
1712 on the interpretation of the key-end spec "M.N". Unix sort reads this
1713 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1714 "skip M-1 fields, then either N-1 characters or the rest of the current
1715 field, whichever comes first". This extra clause applies only to
1716 key-ends, not key-starts.
1719 /* Make LIM point to the end of (one byte past) the current field. */
1720 if (tab != TAB_DEFAULT)
1722 char *newlim;
1723 newlim = memchr (ptr, tab, lim - ptr);
1724 if (newlim)
1725 lim = newlim;
1727 else
1729 char *newlim;
1730 newlim = ptr;
1731 while (newlim < lim && blanks[to_uchar (*newlim)])
1732 ++newlim;
1733 while (newlim < lim && !blanks[to_uchar (*newlim)])
1734 ++newlim;
1735 lim = newlim;
1737 #endif
1739 if (echar != 0) /* We need to skip over a portion of the end field. */
1741 /* If we're ignoring leading blanks when computing the End
1742 of the field, skip past them here. */
1743 if (key->skipeblanks)
1744 while (ptr < lim && blanks[to_uchar (*ptr)])
1745 ++ptr;
1747 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1748 ptr = MIN (lim, ptr + echar);
1751 return ptr;
1754 /* Fill BUF reading from FP, moving buf->left bytes from the end
1755 of buf->buf to the beginning first. If EOF is reached and the
1756 file wasn't terminated by a newline, supply one. Set up BUF's line
1757 table too. FILE is the name of the file corresponding to FP.
1758 Return true if some input was read. */
1760 static bool
1761 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1763 struct keyfield const *key = keylist;
1764 char eol = eolchar;
1765 size_t line_bytes = buf->line_bytes;
1766 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1768 if (buf->eof)
1769 return false;
1771 if (buf->used != buf->left)
1773 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1774 buf->used = buf->left;
1775 buf->nlines = 0;
1778 while (true)
1780 char *ptr = buf->buf + buf->used;
1781 struct line *linelim = buffer_linelim (buf);
1782 struct line *line = linelim - buf->nlines;
1783 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1784 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1786 while (line_bytes + 1 < avail)
1788 /* Read as many bytes as possible, but do not read so many
1789 bytes that there might not be enough room for the
1790 corresponding line array. The worst case is when the
1791 rest of the input file consists entirely of newlines,
1792 except that the last byte is not a newline. */
1793 size_t readsize = (avail - 1) / (line_bytes + 1);
1794 size_t bytes_read = fread (ptr, 1, readsize, fp);
1795 char *ptrlim = ptr + bytes_read;
1796 char *p;
1797 avail -= bytes_read;
1799 if (bytes_read != readsize)
1801 if (ferror (fp))
1802 sort_die (_("read failed"), file);
1803 if (feof (fp))
1805 buf->eof = true;
1806 if (buf->buf == ptrlim)
1807 return false;
1808 if (line_start != ptrlim && ptrlim[-1] != eol)
1809 *ptrlim++ = eol;
1813 /* Find and record each line in the just-read input. */
1814 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1816 /* Delimit the line with NUL. This eliminates the need to
1817 temporarily replace the last byte with NUL when calling
1818 xmemcoll, which increases performance. */
1819 *p = '\0';
1820 ptr = p + 1;
1821 line--;
1822 line->text = line_start;
1823 line->length = ptr - line_start;
1824 mergesize = MAX (mergesize, line->length);
1825 avail -= line_bytes;
1827 if (key)
1829 /* Precompute the position of the first key for
1830 efficiency. */
1831 line->keylim = (key->eword == SIZE_MAX
1833 : limfield (line, key));
1835 if (key->sword != SIZE_MAX)
1836 line->keybeg = begfield (line, key);
1837 else
1839 if (key->skipsblanks)
1840 while (blanks[to_uchar (*line_start)])
1841 line_start++;
1842 line->keybeg = line_start;
1846 line_start = ptr;
1849 ptr = ptrlim;
1850 if (buf->eof)
1851 break;
1854 buf->used = ptr - buf->buf;
1855 buf->nlines = buffer_linelim (buf) - line;
1856 if (buf->nlines != 0)
1858 buf->left = ptr - line_start;
1859 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1860 return true;
1864 /* The current input line is too long to fit in the buffer.
1865 Increase the buffer size and try again, keeping it properly
1866 aligned. */
1867 size_t line_alloc = buf->alloc / sizeof (struct line);
1868 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1869 buf->alloc = line_alloc * sizeof (struct line);
1874 /* Table that maps characters to order-of-magnitude values. */
1875 static char const unit_order[UCHAR_LIM] =
1877 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1878 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'R' == 82 && 'Q' == 81 \
1879 && 'k' == 107)
1880 /* This initializer syntax works on all C99 hosts. For now, use
1881 it only on non-ASCII hosts, to ease the pain of porting to
1882 pre-C99 ASCII hosts. */
1883 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1884 ['R']=9, ['Q']=10,
1885 ['k']=1,
1886 #else
1887 /* Generate the following table with this command:
1888 perl -e 'my %a=(k=>1,K=>1,M=>2,G=>3,T=>4,P=>5,E=>6,Z=>7,Y=>8,R=>9,Q=>10);
1889 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1890 |fmt */
1891 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1892 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1893 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1894 0, 0, 0, 1, 0, 2, 0, 0, 5, 10, 9, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1895 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1902 #endif
1905 /* Traverse number given as *number consisting of digits, thousands_sep, and
1906 decimal_point chars only. Returns the highest digit found in the number,
1907 or '\0' if no digit has been found. Upon return *number points at the
1908 character that immediately follows after the given number. */
1909 static char
1910 traverse_raw_number (char const **number)
1912 char const *p = *number;
1913 char ch;
1914 char max_digit = '\0';
1915 bool ends_with_thousands_sep = false;
1917 /* Scan to end of number.
1918 Decimals or separators not followed by digits stop the scan.
1919 Numbers ending in decimals or separators are thus considered
1920 to be lacking in units.
1921 FIXME: add support for multibyte thousands_sep and decimal_point. */
1923 while (ISDIGIT (ch = *p++))
1925 if (max_digit < ch)
1926 max_digit = ch;
1928 /* Allow to skip only one occurrence of thousands_sep to avoid finding
1929 the unit in the next column in case thousands_sep matches as blank
1930 and is used as column delimiter. */
1931 ends_with_thousands_sep = (*p == thousands_sep);
1932 if (ends_with_thousands_sep)
1933 ++p;
1936 if (ends_with_thousands_sep)
1938 /* thousands_sep not followed by digit is not allowed. */
1939 *number = p - 2;
1940 return max_digit;
1943 if (ch == decimal_point)
1944 while (ISDIGIT (ch = *p++))
1945 if (max_digit < ch)
1946 max_digit = ch;
1948 *number = p - 1;
1949 return max_digit;
1952 /* Return an integer that represents the order of magnitude of the
1953 unit following the number. The number may contain thousands
1954 separators and a decimal point, but it may not contain leading blanks.
1955 Negative numbers get negative orders; zero numbers have a zero order. */
1957 ATTRIBUTE_PURE
1958 static int
1959 find_unit_order (char const *number)
1961 bool minus_sign = (*number == '-');
1962 char const *p = number + minus_sign;
1963 char max_digit = traverse_raw_number (&p);
1964 if ('0' < max_digit)
1966 unsigned char ch = *p;
1967 int order = unit_order[ch];
1968 return (minus_sign ? -order : order);
1970 else
1971 return 0;
1974 /* Compare numbers A and B ending in units with SI or IEC prefixes
1975 <none/unknown> < K/k < M < G < T < P < E < Z < Y < R < Q */
1977 ATTRIBUTE_PURE
1978 static int
1979 human_numcompare (char const *a, char const *b)
1981 while (blanks[to_uchar (*a)])
1982 a++;
1983 while (blanks[to_uchar (*b)])
1984 b++;
1986 int diff = find_unit_order (a) - find_unit_order (b);
1987 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1990 /* Compare strings A and B as numbers without explicitly converting them to
1991 machine numbers. Comparatively slow for short strings, but asymptotically
1992 hideously fast. */
1994 ATTRIBUTE_PURE
1995 static int
1996 numcompare (char const *a, char const *b)
1998 while (blanks[to_uchar (*a)])
1999 a++;
2000 while (blanks[to_uchar (*b)])
2001 b++;
2003 return strnumcmp (a, b, decimal_point, thousands_sep);
2006 static int
2007 nan_compare (long double a, long double b)
2009 char buf[2][sizeof "-nan""()" + CHAR_BIT * sizeof a];
2010 snprintf (buf[0], sizeof buf[0], "%Lf", a);
2011 snprintf (buf[1], sizeof buf[1], "%Lf", b);
2012 return strcmp (buf[0], buf[1]);
2015 static int
2016 general_numcompare (char const *sa, char const *sb)
2018 /* FIXME: maybe add option to try expensive FP conversion
2019 only if A and B can't be compared more cheaply/accurately. */
2021 char *ea;
2022 char *eb;
2023 long double a = strtold (sa, &ea);
2024 long double b = strtold (sb, &eb);
2026 /* Put conversion errors at the start of the collating sequence. */
2027 if (sa == ea)
2028 return sb == eb ? 0 : -1;
2029 if (sb == eb)
2030 return 1;
2032 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
2033 conversion errors but before numbers; sort them by internal
2034 bit-pattern, for lack of a more portable alternative. */
2035 return (a < b ? -1
2036 : a > b ? 1
2037 : a == b ? 0
2038 : b == b ? -1
2039 : a == a ? 1
2040 : nan_compare (a, b));
2043 /* Return an integer in 1..12 of the month name MONTH.
2044 Return 0 if the name in S is not recognized. */
2046 static int
2047 getmonth (char const *month, char **ea)
2049 size_t lo = 0;
2050 size_t hi = MONTHS_PER_YEAR;
2052 while (blanks[to_uchar (*month)])
2053 month++;
2057 size_t ix = (lo + hi) / 2;
2058 char const *m = month;
2059 char const *n = monthtab[ix].name;
2061 for (;; m++, n++)
2063 if (!*n)
2065 if (ea)
2066 *ea = (char *) m;
2067 return monthtab[ix].val;
2069 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2071 hi = ix;
2072 break;
2074 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2076 lo = ix + 1;
2077 break;
2081 while (lo < hi);
2083 return 0;
2086 /* A randomly chosen MD5 state, used for random comparison. */
2087 static struct md5_ctx random_md5_state;
2089 /* Initialize the randomly chosen MD5 state. */
2091 static void
2092 random_md5_state_init (char const *random_source)
2094 unsigned char buf[MD5_DIGEST_SIZE];
2095 struct randread_source *r = randread_new (random_source, sizeof buf);
2096 if (! r)
2097 sort_die (_("open failed"), random_source ? random_source : "getrandom");
2098 randread (r, buf, sizeof buf);
2099 if (randread_free (r) != 0)
2100 sort_die (_("close failed"), random_source);
2101 md5_init_ctx (&random_md5_state);
2102 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2105 /* This is like strxfrm, except it reports any error and exits. */
2107 static size_t
2108 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2110 errno = 0;
2111 size_t translated_size = strxfrm (dest, src, destsize);
2113 if (errno)
2115 error (0, errno, _("string transformation failed"));
2116 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2117 error (SORT_FAILURE, 0,
2118 _("the untransformed string was %s"),
2119 quotearg_n_style (0, locale_quoting_style, src));
2122 return translated_size;
2125 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2126 using one or more random hash functions. TEXTA[LENA] and
2127 TEXTB[LENB] must be zero. */
2129 static int
2130 compare_random (char *restrict texta, size_t lena,
2131 char *restrict textb, size_t lenb)
2133 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2134 data. This is used to break ties if there is a checksum
2135 collision, and this is good enough given the astronomically low
2136 probability of a collision. */
2137 int xfrm_diff = 0;
2139 char stackbuf[4000];
2140 char *buf = stackbuf;
2141 size_t bufsize = sizeof stackbuf;
2142 void *allocated = nullptr;
2143 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2144 struct md5_ctx s[2];
2145 s[0] = s[1] = random_md5_state;
2147 if (hard_LC_COLLATE)
2149 char const *lima = texta + lena;
2150 char const *limb = textb + lenb;
2152 while (true)
2154 /* Transform the text into the basis of comparison, so that byte
2155 strings that would otherwise considered to be equal are
2156 considered equal here even if their bytes differ.
2158 Each time through this loop, transform one
2159 null-terminated string's worth from TEXTA or from TEXTB
2160 or both. That way, there's no need to store the
2161 transformation of the whole line, if it contains many
2162 null-terminated strings. */
2164 /* Store the transformed data into a big-enough buffer. */
2166 /* A 3X size guess avoids the overhead of calling strxfrm
2167 twice on typical implementations. Don't worry about
2168 size_t overflow, as the guess need not be correct. */
2169 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2170 if (bufsize < guess_bufsize)
2172 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2173 free (allocated);
2174 buf = allocated = malloc (bufsize);
2175 if (! buf)
2177 buf = stackbuf;
2178 bufsize = sizeof stackbuf;
2182 size_t sizea =
2183 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2184 bool a_fits = sizea <= bufsize;
2185 size_t sizeb =
2186 (textb < limb
2187 ? (xstrxfrm ((a_fits ? buf + sizea : nullptr), textb,
2188 (a_fits ? bufsize - sizea : 0))
2189 + 1)
2190 : 0);
2192 if (! (a_fits && sizea + sizeb <= bufsize))
2194 bufsize = sizea + sizeb;
2195 if (bufsize < SIZE_MAX / 3)
2196 bufsize = bufsize * 3 / 2;
2197 free (allocated);
2198 buf = allocated = xmalloc (bufsize);
2199 if (texta < lima)
2200 strxfrm (buf, texta, sizea);
2201 if (textb < limb)
2202 strxfrm (buf + sizea, textb, sizeb);
2205 /* Advance past NULs to the next part of each input string,
2206 exiting the loop if both strings are exhausted. When
2207 exiting the loop, prepare to finish off the tiebreaker
2208 comparison properly. */
2209 if (texta < lima)
2210 texta += strlen (texta) + 1;
2211 if (textb < limb)
2212 textb += strlen (textb) + 1;
2213 if (! (texta < lima || textb < limb))
2215 lena = sizea; texta = buf;
2216 lenb = sizeb; textb = buf + sizea;
2217 break;
2220 /* Accumulate the transformed data in the corresponding
2221 checksums. */
2222 md5_process_bytes (buf, sizea, &s[0]);
2223 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2225 /* Update the tiebreaker comparison of the transformed data. */
2226 if (! xfrm_diff)
2228 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2229 if (! xfrm_diff)
2230 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2235 /* Compute and compare the checksums. */
2236 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2237 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2238 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2240 /* Fall back on the tiebreaker if the checksums collide. */
2241 if (! diff)
2243 if (! xfrm_diff)
2245 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2246 if (! xfrm_diff)
2247 xfrm_diff = (lena > lenb) - (lena < lenb);
2250 diff = xfrm_diff;
2253 free (allocated);
2255 return diff;
2258 /* Return the printable width of the block of memory starting at
2259 TEXT and ending just before LIM, counting each tab as one byte.
2260 FIXME: Should we generally be counting non printable chars? */
2262 static size_t
2263 debug_width (char const *text, char const *lim)
2265 size_t width = mbsnwidth (text, lim - text, 0);
2266 while (text < lim)
2267 width += (*text++ == '\t');
2268 return width;
2271 /* For debug mode, "underline" a key at the
2272 specified offset and screen width. */
2274 static void
2275 mark_key (size_t offset, size_t width)
2277 while (offset--)
2278 putchar (' ');
2280 if (!width)
2281 printf (_("^ no match for key\n"));
2282 else
2285 putchar ('_');
2286 while (--width);
2288 putchar ('\n');
2292 /* Return true if KEY is a numeric key. */
2294 static inline bool
2295 key_numeric (struct keyfield const *key)
2297 return key->numeric || key->general_numeric || key->human_numeric;
2300 /* For LINE, output a debugging line that underlines KEY in LINE.
2301 If KEY is null, underline the whole line. */
2303 static void
2304 debug_key (struct line const *line, struct keyfield const *key)
2306 char *text = line->text;
2307 char *beg = text;
2308 char *lim = text + line->length - 1;
2310 if (key)
2312 if (key->sword != SIZE_MAX)
2313 beg = begfield (line, key);
2314 if (key->eword != SIZE_MAX)
2315 lim = limfield (line, key);
2317 if ((key->skipsblanks && key->sword == SIZE_MAX)
2318 || key->month || key_numeric (key))
2320 char saved = *lim;
2321 *lim = '\0';
2323 while (blanks[to_uchar (*beg)])
2324 beg++;
2326 char *tighter_lim = beg;
2328 if (lim < beg)
2329 tighter_lim = lim;
2330 else if (key->month)
2331 getmonth (beg, &tighter_lim);
2332 else if (key->general_numeric)
2333 ignore_value (strtold (beg, &tighter_lim));
2334 else if (key->numeric || key->human_numeric)
2336 char const *p = beg + (beg < lim && *beg == '-');
2337 char max_digit = traverse_raw_number (&p);
2338 if ('0' <= max_digit)
2340 unsigned char ch = *p;
2341 tighter_lim = (char *) p
2342 + (key->human_numeric && unit_order[ch]);
2345 else
2346 tighter_lim = lim;
2348 *lim = saved;
2349 lim = tighter_lim;
2353 size_t offset = debug_width (text, beg);
2354 size_t width = debug_width (beg, lim);
2355 mark_key (offset, width);
2358 /* Debug LINE by underlining its keys. */
2360 static void
2361 debug_line (struct line const *line)
2363 struct keyfield const *key = keylist;
2366 debug_key (line, key);
2367 while (key && ((key = key->next) || ! (unique || stable)));
2370 /* Return whether sorting options specified for key. */
2372 static bool
2373 default_key_compare (struct keyfield const *key)
2375 return ! (key->ignore
2376 || key->translate
2377 || key->skipsblanks
2378 || key->skipeblanks
2379 || key_numeric (key)
2380 || key->month
2381 || key->version
2382 || key->random
2383 /* || key->reverse */
2387 /* Convert a key to the short options used to specify it. */
2389 static void
2390 key_to_opts (struct keyfield const *key, char *opts)
2392 if (key->skipsblanks || key->skipeblanks)
2393 *opts++ = 'b';/* either disables global -b */
2394 if (key->ignore == nondictionary)
2395 *opts++ = 'd';
2396 if (key->translate)
2397 *opts++ = 'f';
2398 if (key->general_numeric)
2399 *opts++ = 'g';
2400 if (key->human_numeric)
2401 *opts++ = 'h';
2402 if (key->ignore == nonprinting)
2403 *opts++ = 'i';
2404 if (key->month)
2405 *opts++ = 'M';
2406 if (key->numeric)
2407 *opts++ = 'n';
2408 if (key->random)
2409 *opts++ = 'R';
2410 if (key->reverse)
2411 *opts++ = 'r';
2412 if (key->version)
2413 *opts++ = 'V';
2414 *opts = '\0';
2417 /* Output data independent key warnings to stderr. */
2419 static void
2420 key_warnings (struct keyfield const *gkey, bool gkey_only)
2422 struct keyfield const *key;
2423 struct keyfield ugkey = *gkey;
2424 unsigned long keynum = 1;
2425 bool basic_numeric_field = false;
2426 bool general_numeric_field = false;
2427 bool basic_numeric_field_span = false;
2428 bool general_numeric_field_span = false;
2430 for (key = keylist; key; key = key->next, keynum++)
2432 if (key_numeric (key))
2434 if (key->general_numeric)
2435 general_numeric_field = true;
2436 else
2437 basic_numeric_field = true;
2440 if (key->traditional_used)
2442 size_t sword = key->sword;
2443 size_t eword = key->eword;
2444 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2445 /* obsolescent syntax +A.x -B.y is equivalent to:
2446 -k A+1.x+1,B.y (when y = 0)
2447 -k A+1.x+1,B+1.y (when y > 0) */
2448 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2449 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2450 char *po = obuf;
2451 char *pn = nbuf;
2453 if (sword == SIZE_MAX)
2454 sword++;
2456 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2457 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2458 if (key->eword != SIZE_MAX)
2460 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2461 stpcpy (stpcpy (pn, ","),
2462 umaxtostr (eword + 1
2463 + (key->echar == SIZE_MAX), tmp));
2465 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2466 quote_n (0, obuf), quote_n (1, nbuf));
2469 /* Warn about field specs that will never match. */
2470 bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword;
2471 if (zero_width)
2472 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2474 /* Warn about significant leading blanks. */
2475 bool implicit_skip = key_numeric (key) || key->month;
2476 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2477 if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
2478 && ((!key->skipsblanks && !implicit_skip)
2479 || (!key->skipsblanks && key->schar)
2480 || (!key->skipeblanks && key->echar)))
2481 error (0, 0, _("leading blanks are significant in key %lu; "
2482 "consider also specifying 'b'"), keynum);
2484 /* Warn about numeric comparisons spanning fields,
2485 as field delimiters could be interpreted as part
2486 of the number (maybe only in other locales). */
2487 if (!gkey_only && key_numeric (key))
2489 size_t sword = key->sword + 1;
2490 size_t eword = key->eword + 1;
2491 if (!sword)
2492 sword++;
2493 if (!eword || sword < eword)
2495 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2496 keynum);
2497 if (key->general_numeric)
2498 general_numeric_field_span = true;
2499 else
2500 basic_numeric_field_span = true;
2504 /* Flag global options not copied or specified in any key. */
2505 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2506 ugkey.ignore = nullptr;
2507 if (ugkey.translate && (ugkey.translate == key->translate))
2508 ugkey.translate = nullptr;
2509 ugkey.skipsblanks &= !key->skipsblanks;
2510 ugkey.skipeblanks &= !key->skipeblanks;
2511 ugkey.month &= !key->month;
2512 ugkey.numeric &= !key->numeric;
2513 ugkey.general_numeric &= !key->general_numeric;
2514 ugkey.human_numeric &= !key->human_numeric;
2515 ugkey.random &= !key->random;
2516 ugkey.version &= !key->version;
2517 ugkey.reverse &= !key->reverse;
2520 /* Explicitly warn if field delimiters in this locale
2521 don't constrain numbers. */
2522 bool number_locale_warned = false;
2523 if (basic_numeric_field_span)
2525 if (tab == TAB_DEFAULT
2526 ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))
2527 : tab == thousands_sep)
2529 error (0, 0,
2530 _("field separator %s is treated as a "
2531 "group separator in numbers"),
2532 quote (((char []) {thousands_sep, 0})));
2533 number_locale_warned = true;
2536 if (basic_numeric_field_span || general_numeric_field_span)
2538 if (tab == TAB_DEFAULT
2539 ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))
2540 : tab == decimal_point)
2542 error (0, 0,
2543 _("field separator %s is treated as a "
2544 "decimal point in numbers"),
2545 quote (((char []) {decimal_point, 0})));
2546 number_locale_warned = true;
2548 else if (tab == '-')
2550 error (0, 0,
2551 _("field separator %s is treated as a "
2552 "minus sign in numbers"),
2553 quote (((char []) {tab, 0})));
2555 else if (general_numeric_field_span && tab == '+')
2557 error (0, 0,
2558 _("field separator %s is treated as a "
2559 "plus sign in numbers"),
2560 quote (((char []) {tab, 0})));
2564 /* Explicitly indicate the decimal point used in this locale,
2565 as it suggests that robust scripts need to consider
2566 setting the locale when comparing numbers. */
2567 if ((basic_numeric_field || general_numeric_field) && ! number_locale_warned)
2569 error (0, 0,
2570 _("%snumbers use %s as a decimal point in this locale"),
2571 tab == decimal_point ? "" : _("note "),
2572 quote (((char []) {decimal_point, 0})));
2576 if (basic_numeric_field && thousands_sep_ignored)
2578 error (0, 0,
2579 _("the multi-byte number group separator "
2580 "in this locale is not supported"));
2583 /* Warn about ignored global options flagged above.
2584 This clears all flags if UGKEY is the only one in the list. */
2585 if (!default_key_compare (&ugkey)
2586 || (ugkey.reverse && (stable || unique) && keylist))
2588 bool ugkey_reverse = ugkey.reverse;
2589 if (!(stable || unique))
2590 ugkey.reverse = false;
2591 /* The following is too big, but guaranteed to be "big enough". */
2592 char opts[sizeof short_options];
2593 key_to_opts (&ugkey, opts);
2594 error (0, 0,
2595 ngettext ("option '-%s' is ignored",
2596 "options '-%s' are ignored",
2597 select_plural (strlen (opts))), opts);
2598 ugkey.reverse = ugkey_reverse;
2600 if (ugkey.reverse && !(stable || unique) && keylist)
2601 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2604 /* Return either the sense of DIFF or its reverse, depending on REVERSED.
2605 If REVERSED, do not simply negate DIFF as that can mishandle INT_MIN. */
2607 static int
2608 diff_reversed (int diff, bool reversed)
2610 return reversed ? (diff < 0) - (diff > 0) : diff;
2613 /* Compare two lines A and B trying every key in sequence until there
2614 are no more keys or a difference is found. */
2616 static int
2617 keycompare (struct line const *a, struct line const *b)
2619 struct keyfield *key = keylist;
2621 /* For the first iteration only, the key positions have been
2622 precomputed for us. */
2623 char *texta = a->keybeg;
2624 char *textb = b->keybeg;
2625 char *lima = a->keylim;
2626 char *limb = b->keylim;
2628 int diff;
2630 while (true)
2632 char const *translate = key->translate;
2633 bool const *ignore = key->ignore;
2635 /* Treat field ends before field starts as empty fields. */
2636 lima = MAX (texta, lima);
2637 limb = MAX (textb, limb);
2639 /* Find the lengths. */
2640 size_t lena = lima - texta;
2641 size_t lenb = limb - textb;
2643 if (hard_LC_COLLATE || key_numeric (key)
2644 || key->month || key->random || key->version)
2646 /* Ordinarily use the keys in-place, temporarily null-terminated. */
2647 char *ta = texta;
2648 char *tb = textb;
2649 size_t tlena = lena;
2650 size_t tlenb = lenb;
2651 char enda = ta[tlena];
2652 char endb = tb[tlenb];
2654 void *allocated = nullptr;
2655 char stackbuf[4000];
2657 if (ignore || translate)
2659 /* Compute with copies of the keys, which are the result of
2660 translating or ignoring characters, and which need their
2661 own storage. */
2663 size_t i;
2665 /* Allocate space for copies. */
2666 size_t size = lena + 1 + lenb + 1;
2667 if (size <= sizeof stackbuf)
2668 ta = stackbuf;
2669 else
2670 ta = allocated = xmalloc (size);
2671 tb = ta + lena + 1;
2673 /* Put into each copy a version of the key in which the
2674 requested characters are ignored or translated. */
2675 for (tlena = i = 0; i < lena; i++)
2676 if (! (ignore && ignore[to_uchar (texta[i])]))
2677 ta[tlena++] = (translate
2678 ? translate[to_uchar (texta[i])]
2679 : texta[i]);
2681 for (tlenb = i = 0; i < lenb; i++)
2682 if (! (ignore && ignore[to_uchar (textb[i])]))
2683 tb[tlenb++] = (translate
2684 ? translate[to_uchar (textb[i])]
2685 : textb[i]);
2688 ta[tlena] = '\0';
2689 tb[tlenb] = '\0';
2691 if (key->numeric)
2692 diff = numcompare (ta, tb);
2693 else if (key->general_numeric)
2694 diff = general_numcompare (ta, tb);
2695 else if (key->human_numeric)
2696 diff = human_numcompare (ta, tb);
2697 else if (key->month)
2698 diff = getmonth (ta, nullptr) - getmonth (tb, nullptr);
2699 else if (key->random)
2700 diff = compare_random (ta, tlena, tb, tlenb);
2701 else if (key->version)
2702 diff = filenvercmp (ta, tlena, tb, tlenb);
2703 else
2705 /* Locale-dependent string sorting. This is slower than
2706 C-locale sorting, which is implemented below. */
2707 if (tlena == 0)
2708 diff = - NONZERO (tlenb);
2709 else if (tlenb == 0)
2710 diff = 1;
2711 else
2712 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2715 ta[tlena] = enda;
2716 tb[tlenb] = endb;
2718 free (allocated);
2720 else if (ignore)
2722 #define CMP_WITH_IGNORE(A, B) \
2723 do \
2725 while (true) \
2727 while (texta < lima && ignore[to_uchar (*texta)]) \
2728 ++texta; \
2729 while (textb < limb && ignore[to_uchar (*textb)]) \
2730 ++textb; \
2731 if (! (texta < lima && textb < limb)) \
2733 diff = (texta < lima) - (textb < limb); \
2734 break; \
2736 diff = to_uchar (A) - to_uchar (B); \
2737 if (diff) \
2738 break; \
2739 ++texta; \
2740 ++textb; \
2744 while (0)
2746 if (translate)
2747 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2748 translate[to_uchar (*textb)]);
2749 else
2750 CMP_WITH_IGNORE (*texta, *textb);
2752 else
2754 size_t lenmin = MIN (lena, lenb);
2755 if (lenmin == 0)
2756 diff = 0;
2757 else if (translate)
2759 size_t i = 0;
2762 diff = (to_uchar (translate[to_uchar (texta[i])])
2763 - to_uchar (translate[to_uchar (textb[i])]));
2764 if (diff)
2765 break;
2766 i++;
2768 while (i < lenmin);
2770 else
2771 diff = memcmp (texta, textb, lenmin);
2773 if (! diff)
2774 diff = (lena > lenb) - (lena < lenb);
2777 if (diff)
2778 break;
2780 key = key->next;
2781 if (! key)
2782 return 0;
2784 /* Find the beginning and limit of the next field. */
2785 if (key->eword != SIZE_MAX)
2786 lima = limfield (a, key), limb = limfield (b, key);
2787 else
2788 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2790 if (key->sword != SIZE_MAX)
2791 texta = begfield (a, key), textb = begfield (b, key);
2792 else
2794 texta = a->text, textb = b->text;
2795 if (key->skipsblanks)
2797 while (texta < lima && blanks[to_uchar (*texta)])
2798 ++texta;
2799 while (textb < limb && blanks[to_uchar (*textb)])
2800 ++textb;
2805 return diff_reversed (diff, key->reverse);
2808 /* Compare two lines A and B, returning negative, zero, or positive
2809 depending on whether A compares less than, equal to, or greater than B. */
2811 static int
2812 compare (struct line const *a, struct line const *b)
2814 int diff;
2815 size_t alen, blen;
2817 /* First try to compare on the specified keys (if any).
2818 The only two cases with no key at all are unadorned sort,
2819 and unadorned sort -r. */
2820 if (keylist)
2822 diff = keycompare (a, b);
2823 if (diff || unique || stable)
2824 return diff;
2827 /* If the keys all compare equal (or no keys were specified)
2828 fall through to the default comparison. */
2829 alen = a->length - 1, blen = b->length - 1;
2831 if (alen == 0)
2832 diff = - NONZERO (blen);
2833 else if (blen == 0)
2834 diff = 1;
2835 else if (hard_LC_COLLATE)
2837 /* xmemcoll0 is a performance enhancement as
2838 it will not unconditionally write '\0' after the
2839 passed in buffers, which was seen to give around
2840 a 3% increase in performance for short lines. */
2841 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2843 else
2845 diff = memcmp (a->text, b->text, MIN (alen, blen));
2846 if (!diff)
2847 diff = (alen > blen) - (alen < blen);
2850 return diff_reversed (diff, reverse);
2853 /* Write LINE to output stream FP; the output file's name is
2854 OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
2855 otherwise. If debugging is enabled and FP is standard output,
2856 append some debugging information. */
2858 static void
2859 write_line (struct line const *line, FILE *fp, char const *output_file)
2861 char *buf = line->text;
2862 size_t n_bytes = line->length;
2863 char *ebuf = buf + n_bytes;
2865 if (!output_file && debug)
2867 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2868 char const *c = buf;
2870 while (c < ebuf)
2872 char wc = *c++;
2873 if (wc == '\t')
2874 wc = '>';
2875 else if (c == ebuf)
2876 wc = '\n';
2877 if (fputc (wc, fp) == EOF)
2878 sort_die (_("write failed"), output_file);
2881 debug_line (line);
2883 else
2885 ebuf[-1] = eolchar;
2886 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2887 sort_die (_("write failed"), output_file);
2888 ebuf[-1] = '\0';
2892 /* Check that the lines read from FILE_NAME come in order. Return
2893 true if they are in order. If CHECKONLY == 'c', also print a
2894 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2895 they are not in order. */
2897 static bool
2898 check (char const *file_name, char checkonly)
2900 FILE *fp = xfopen (file_name, "r");
2901 struct buffer buf; /* Input buffer. */
2902 struct line temp; /* Copy of previous line. */
2903 size_t alloc = 0;
2904 uintmax_t line_number = 0;
2905 struct keyfield const *key = keylist;
2906 bool nonunique = ! unique;
2907 bool ordered = true;
2909 initbuf (&buf, sizeof (struct line),
2910 MAX (merge_buffer_size, sort_size));
2911 temp.text = nullptr;
2913 while (fillbuf (&buf, fp, file_name))
2915 struct line const *line = buffer_linelim (&buf);
2916 struct line const *linebase = line - buf.nlines;
2918 /* Make sure the line saved from the old buffer contents is
2919 less than or equal to the first line of the new buffer. */
2920 if (alloc && nonunique <= compare (&temp, line - 1))
2922 found_disorder:
2924 if (checkonly == 'c')
2926 struct line const *disorder_line = line - 1;
2927 uintmax_t disorder_line_number =
2928 buffer_linelim (&buf) - disorder_line + line_number;
2929 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2930 fprintf (stderr, _("%s: %s:%s: disorder: "),
2931 program_name, file_name,
2932 umaxtostr (disorder_line_number, hr_buf));
2933 write_line (disorder_line, stderr, _("standard error"));
2936 ordered = false;
2937 break;
2941 /* Compare each line in the buffer with its successor. */
2942 while (linebase < --line)
2943 if (nonunique <= compare (line, line - 1))
2944 goto found_disorder;
2946 line_number += buf.nlines;
2948 /* Save the last line of the buffer. */
2949 if (alloc < line->length)
2953 alloc *= 2;
2954 if (! alloc)
2956 alloc = line->length;
2957 break;
2960 while (alloc < line->length);
2962 free (temp.text);
2963 temp.text = xmalloc (alloc);
2965 memcpy (temp.text, line->text, line->length);
2966 temp.length = line->length;
2967 if (key)
2969 temp.keybeg = temp.text + (line->keybeg - line->text);
2970 temp.keylim = temp.text + (line->keylim - line->text);
2974 xfclose (fp, file_name);
2975 free (buf.buf);
2976 free (temp.text);
2977 return ordered;
2980 /* Open FILES (there are NFILES of them) and store the resulting array
2981 of stream pointers into (*PFPS). Allocate the array. Return the
2982 number of successfully opened files, setting errno if this value is
2983 less than NFILES. */
2985 static size_t
2986 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2988 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2989 int i;
2991 /* Open as many input files as we can. */
2992 for (i = 0; i < nfiles; i++)
2994 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2995 ? open_temp (files[i].temp)
2996 : stream_open (files[i].name, "r"));
2997 if (!fps[i])
2998 break;
3001 return i;
3004 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3005 files (all of which are at the start of the FILES array), and
3006 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3007 FPS is the vector of open stream corresponding to the files.
3008 Close input and output streams before returning.
3009 OUTPUT_FILE gives the name of the output file. If it is null,
3010 the output file is standard output. */
3012 static void
3013 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
3014 FILE *ofp, char const *output_file, FILE **fps)
3016 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
3017 /* Input buffers for each file. */
3018 struct line saved; /* Saved line storage for unique check. */
3019 struct line const *savedline = nullptr;
3020 /* &saved if there is a saved line. */
3021 size_t savealloc = 0; /* Size allocated for the saved line. */
3022 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
3023 /* Current line in each line table. */
3024 struct line const **base = xnmalloc (nfiles, sizeof *base);
3025 /* Base of each line table. */
3026 size_t *ord = xnmalloc (nfiles, sizeof *ord);
3027 /* Table representing a permutation of fps,
3028 such that cur[ord[0]] is the smallest line
3029 and will be next output. */
3030 size_t i;
3031 size_t j;
3032 size_t t;
3033 struct keyfield const *key = keylist;
3034 saved.text = nullptr;
3036 /* Read initial lines from each input file. */
3037 for (i = 0; i < nfiles; )
3039 initbuf (&buffer[i], sizeof (struct line),
3040 MAX (merge_buffer_size, sort_size / nfiles));
3041 if (fillbuf (&buffer[i], fps[i], files[i].name))
3043 struct line const *linelim = buffer_linelim (&buffer[i]);
3044 cur[i] = linelim - 1;
3045 base[i] = linelim - buffer[i].nlines;
3046 i++;
3048 else
3050 /* fps[i] is empty; eliminate it from future consideration. */
3051 xfclose (fps[i], files[i].name);
3052 if (i < ntemps)
3054 ntemps--;
3055 zaptemp (files[i].name);
3057 free (buffer[i].buf);
3058 --nfiles;
3059 for (j = i; j < nfiles; ++j)
3061 files[j] = files[j + 1];
3062 fps[j] = fps[j + 1];
3067 /* Set up the ord table according to comparisons among input lines.
3068 Since this only reorders two items if one is strictly greater than
3069 the other, it is stable. */
3070 for (i = 0; i < nfiles; ++i)
3071 ord[i] = i;
3072 for (i = 1; i < nfiles; ++i)
3073 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
3074 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
3076 /* Repeatedly output the smallest line until no input remains. */
3077 while (nfiles)
3079 struct line const *smallest = cur[ord[0]];
3081 /* If uniquified output is turned on, output only the first of
3082 an identical series of lines. */
3083 if (unique)
3085 if (savedline && compare (savedline, smallest))
3087 savedline = nullptr;
3088 write_line (&saved, ofp, output_file);
3090 if (!savedline)
3092 savedline = &saved;
3093 if (savealloc < smallest->length)
3096 if (! savealloc)
3098 savealloc = smallest->length;
3099 break;
3101 while ((savealloc *= 2) < smallest->length);
3103 free (saved.text);
3104 saved.text = xmalloc (savealloc);
3106 saved.length = smallest->length;
3107 memcpy (saved.text, smallest->text, saved.length);
3108 if (key)
3110 saved.keybeg =
3111 saved.text + (smallest->keybeg - smallest->text);
3112 saved.keylim =
3113 saved.text + (smallest->keylim - smallest->text);
3117 else
3118 write_line (smallest, ofp, output_file);
3120 /* Check if we need to read more lines into memory. */
3121 if (base[ord[0]] < smallest)
3122 cur[ord[0]] = smallest - 1;
3123 else
3125 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
3127 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
3128 cur[ord[0]] = linelim - 1;
3129 base[ord[0]] = linelim - buffer[ord[0]].nlines;
3131 else
3133 /* We reached EOF on fps[ord[0]]. */
3134 for (i = 1; i < nfiles; ++i)
3135 if (ord[i] > ord[0])
3136 --ord[i];
3137 --nfiles;
3138 xfclose (fps[ord[0]], files[ord[0]].name);
3139 if (ord[0] < ntemps)
3141 ntemps--;
3142 zaptemp (files[ord[0]].name);
3144 free (buffer[ord[0]].buf);
3145 for (i = ord[0]; i < nfiles; ++i)
3147 fps[i] = fps[i + 1];
3148 files[i] = files[i + 1];
3149 buffer[i] = buffer[i + 1];
3150 cur[i] = cur[i + 1];
3151 base[i] = base[i + 1];
3153 for (i = 0; i < nfiles; ++i)
3154 ord[i] = ord[i + 1];
3155 continue;
3159 /* The new line just read in may be larger than other lines
3160 already in main memory; push it back in the queue until we
3161 encounter a line larger than it. Optimize for the common
3162 case where the new line is smallest. */
3164 size_t lo = 1;
3165 size_t hi = nfiles;
3166 size_t probe = lo;
3167 size_t ord0 = ord[0];
3168 size_t count_of_smaller_lines;
3170 while (lo < hi)
3172 int cmp = compare (cur[ord0], cur[ord[probe]]);
3173 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3174 hi = probe;
3175 else
3176 lo = probe + 1;
3177 probe = (lo + hi) / 2;
3180 count_of_smaller_lines = lo - 1;
3181 for (j = 0; j < count_of_smaller_lines; j++)
3182 ord[j] = ord[j + 1];
3183 ord[count_of_smaller_lines] = ord0;
3187 if (unique && savedline)
3189 write_line (&saved, ofp, output_file);
3190 free (saved.text);
3193 xfclose (ofp, output_file);
3194 free (fps);
3195 free (buffer);
3196 free (ord);
3197 free (base);
3198 free (cur);
3201 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3202 files (all of which are at the start of the FILES array), and
3203 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3204 Close input and output files before returning.
3205 OUTPUT_FILE gives the name of the output file.
3207 Return the number of files successfully merged. This number can be
3208 less than NFILES if we ran low on file descriptors, but in this
3209 case it is never less than 2. */
3211 static size_t
3212 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3213 FILE *ofp, char const *output_file)
3215 FILE **fps;
3216 size_t nopened = open_input_files (files, nfiles, &fps);
3217 if (nopened < nfiles && nopened < 2)
3218 sort_die (_("open failed"), files[nopened].name);
3219 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3220 return nopened;
3223 /* Merge into T (of size NLINES) the two sorted arrays of lines
3224 LO (with NLINES / 2 members), and
3225 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3226 T and LO point just past their respective arrays, and the arrays
3227 are in reverse order. NLINES must be at least 2. */
3229 static void
3230 mergelines (struct line *restrict t, size_t nlines,
3231 struct line const *restrict lo)
3233 size_t nlo = nlines / 2;
3234 size_t nhi = nlines - nlo;
3235 struct line *hi = t - nlo;
3237 while (true)
3238 if (compare (lo - 1, hi - 1) <= 0)
3240 *--t = *--lo;
3241 if (! --nlo)
3243 /* HI must equal T now, and there is no need to copy from
3244 HI to T. */
3245 return;
3248 else
3250 *--t = *--hi;
3251 if (! --nhi)
3254 *--t = *--lo;
3255 while (--nlo);
3257 return;
3262 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3263 Do this all within one thread. NLINES must be at least 2.
3264 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3265 Otherwise the sort is in-place and TEMP is half-sized.
3266 The input and output arrays are in reverse order, and LINES and
3267 TEMP point just past the end of their respective arrays.
3269 Use a recursive divide-and-conquer algorithm, in the style
3270 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3271 the optimization suggested by exercise 5.2.4-10; this requires room
3272 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3273 writes that this memory optimization was originally published by
3274 D. A. Bell, Comp J. 1 (1958), 75. */
3276 static void
3277 sequential_sort (struct line *restrict lines, size_t nlines,
3278 struct line *restrict temp, bool to_temp)
3280 if (nlines == 2)
3282 /* Declare 'swap' as int, not bool, to work around a bug
3283 <https://lists.gnu.org/r/bug-coreutils/2005-10/msg00086.html>
3284 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3285 int swap = (0 < compare (&lines[-1], &lines[-2]));
3286 if (to_temp)
3288 temp[-1] = lines[-1 - swap];
3289 temp[-2] = lines[-2 + swap];
3291 else if (swap)
3293 temp[-1] = lines[-1];
3294 lines[-1] = lines[-2];
3295 lines[-2] = temp[-1];
3298 else
3300 size_t nlo = nlines / 2;
3301 size_t nhi = nlines - nlo;
3302 struct line *lo = lines;
3303 struct line *hi = lines - nlo;
3305 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3306 if (1 < nlo)
3307 sequential_sort (lo, nlo, temp, !to_temp);
3308 else if (!to_temp)
3309 temp[-1] = lo[-1];
3311 struct line *dest;
3312 struct line const *sorted_lo;
3313 if (to_temp)
3315 dest = temp;
3316 sorted_lo = lines;
3318 else
3320 dest = lines;
3321 sorted_lo = temp;
3323 mergelines (dest, nlines, sorted_lo);
3327 static struct merge_node *init_node (struct merge_node *restrict,
3328 struct merge_node *restrict,
3329 struct line *, size_t, size_t, bool);
3332 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3333 lines, with destination DEST. */
3334 static struct merge_node *
3335 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3337 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3339 struct merge_node *root = merge_tree;
3340 root->lo = root->hi = root->end_lo = root->end_hi = nullptr;
3341 root->dest = nullptr;
3342 root->nlo = root->nhi = nlines;
3343 root->parent = nullptr;
3344 root->level = MERGE_END;
3345 root->queued = false;
3346 pthread_mutex_init (&root->lock, nullptr);
3348 init_node (root, root + 1, dest, nthreads, nlines, false);
3349 return merge_tree;
3352 /* Destroy the merge tree. */
3353 static void
3354 merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
3356 size_t n_nodes = nthreads * 2;
3357 struct merge_node *node = merge_tree;
3359 while (n_nodes--)
3361 pthread_mutex_destroy (&node->lock);
3362 node++;
3365 free (merge_tree);
3368 /* Initialize a merge tree node and its descendants. The node's
3369 parent is PARENT. The node and its descendants are taken from the
3370 array of nodes NODE_POOL. Their destination starts at DEST; they
3371 will consume NTHREADS threads. The total number of sort lines is
3372 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3373 its parent. */
3375 static struct merge_node *
3376 init_node (struct merge_node *restrict parent,
3377 struct merge_node *restrict node_pool,
3378 struct line *dest, size_t nthreads,
3379 size_t total_lines, bool is_lo_child)
3381 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3382 size_t nlo = nlines / 2;
3383 size_t nhi = nlines - nlo;
3384 struct line *lo = dest - total_lines;
3385 struct line *hi = lo - nlo;
3386 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3388 struct merge_node *node = node_pool++;
3389 node->lo = node->end_lo = lo;
3390 node->hi = node->end_hi = hi;
3391 node->dest = parent_end;
3392 node->nlo = nlo;
3393 node->nhi = nhi;
3394 node->parent = parent;
3395 node->level = parent->level + 1;
3396 node->queued = false;
3397 pthread_mutex_init (&node->lock, nullptr);
3399 if (nthreads > 1)
3401 size_t lo_threads = nthreads / 2;
3402 size_t hi_threads = nthreads - lo_threads;
3403 node->lo_child = node_pool;
3404 node_pool = init_node (node, node_pool, lo, lo_threads,
3405 total_lines, true);
3406 node->hi_child = node_pool;
3407 node_pool = init_node (node, node_pool, hi, hi_threads,
3408 total_lines, false);
3410 else
3412 node->lo_child = nullptr;
3413 node->hi_child = nullptr;
3415 return node_pool;
3419 /* Compare two merge nodes A and B for priority. */
3421 static int
3422 compare_nodes (void const *a, void const *b)
3424 struct merge_node const *nodea = a;
3425 struct merge_node const *nodeb = b;
3426 if (nodea->level == nodeb->level)
3427 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3428 return nodea->level < nodeb->level;
3431 /* Lock a merge tree NODE. */
3433 static inline void
3434 lock_node (struct merge_node *node)
3436 pthread_mutex_lock (&node->lock);
3439 /* Unlock a merge tree NODE. */
3441 static inline void
3442 unlock_node (struct merge_node *node)
3444 pthread_mutex_unlock (&node->lock);
3447 /* Destroy merge QUEUE. */
3449 static void
3450 queue_destroy (struct merge_node_queue *queue)
3452 heap_free (queue->priority_queue);
3453 pthread_cond_destroy (&queue->cond);
3454 pthread_mutex_destroy (&queue->mutex);
3457 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3458 NTHREADS threads. */
3460 static void
3461 queue_init (struct merge_node_queue *queue, size_t nthreads)
3463 /* Though it's highly unlikely all nodes are in the heap at the same
3464 time, the heap should accommodate all of them. Counting a null
3465 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3466 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3467 pthread_mutex_init (&queue->mutex, nullptr);
3468 pthread_cond_init (&queue->cond, nullptr);
3471 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3472 does not need to lock NODE. */
3474 static void
3475 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3477 pthread_mutex_lock (&queue->mutex);
3478 heap_insert (queue->priority_queue, node);
3479 node->queued = true;
3480 pthread_cond_signal (&queue->cond);
3481 pthread_mutex_unlock (&queue->mutex);
3484 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3486 static struct merge_node *
3487 queue_pop (struct merge_node_queue *queue)
3489 struct merge_node *node;
3490 pthread_mutex_lock (&queue->mutex);
3491 while (! (node = heap_remove_top (queue->priority_queue)))
3492 pthread_cond_wait (&queue->cond, &queue->mutex);
3493 pthread_mutex_unlock (&queue->mutex);
3494 lock_node (node);
3495 node->queued = false;
3496 return node;
3499 /* Output LINE to TFP, unless -u is specified and the line compares
3500 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3501 is null if TFP is standard output.
3503 This function does not save the line for comparison later, so it is
3504 appropriate only for internal sort. */
3506 static void
3507 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3509 if (unique)
3511 if (saved_line.text && ! compare (line, &saved_line))
3512 return;
3513 saved_line = *line;
3516 write_line (line, tfp, temp_output);
3519 /* Merge the lines currently available to a NODE in the binary
3520 merge tree. Merge a number of lines appropriate for this merge
3521 level, assuming TOTAL_LINES is the total number of lines.
3523 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3524 the name of TFP, or is null if TFP is standard output. */
3526 static void
3527 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3528 FILE *tfp, char const *temp_output)
3530 struct line *lo_orig = node->lo;
3531 struct line *hi_orig = node->hi;
3532 size_t to_merge = MAX_MERGE (total_lines, node->level);
3533 size_t merged_lo;
3534 size_t merged_hi;
3536 if (node->level > MERGE_ROOT)
3538 /* Merge to destination buffer. */
3539 struct line *dest = *node->dest;
3540 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3541 if (compare (node->lo - 1, node->hi - 1) <= 0)
3542 *--dest = *--node->lo;
3543 else
3544 *--dest = *--node->hi;
3546 merged_lo = lo_orig - node->lo;
3547 merged_hi = hi_orig - node->hi;
3549 if (node->nhi == merged_hi)
3550 while (node->lo != node->end_lo && to_merge--)
3551 *--dest = *--node->lo;
3552 else if (node->nlo == merged_lo)
3553 while (node->hi != node->end_hi && to_merge--)
3554 *--dest = *--node->hi;
3555 *node->dest = dest;
3557 else
3559 /* Merge directly to output. */
3560 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3562 if (compare (node->lo - 1, node->hi - 1) <= 0)
3563 write_unique (--node->lo, tfp, temp_output);
3564 else
3565 write_unique (--node->hi, tfp, temp_output);
3568 merged_lo = lo_orig - node->lo;
3569 merged_hi = hi_orig - node->hi;
3571 if (node->nhi == merged_hi)
3573 while (node->lo != node->end_lo && to_merge--)
3574 write_unique (--node->lo, tfp, temp_output);
3576 else if (node->nlo == merged_lo)
3578 while (node->hi != node->end_hi && to_merge--)
3579 write_unique (--node->hi, tfp, temp_output);
3583 /* Update NODE. */
3584 merged_lo = lo_orig - node->lo;
3585 merged_hi = hi_orig - node->hi;
3586 node->nlo -= merged_lo;
3587 node->nhi -= merged_hi;
3590 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3591 NODE's children has available lines and the other either has
3592 available lines or has exhausted its lines. */
3594 static void
3595 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3597 if (! node->queued)
3599 bool lo_avail = (node->lo - node->end_lo) != 0;
3600 bool hi_avail = (node->hi - node->end_hi) != 0;
3601 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3602 queue_insert (queue, node);
3606 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3608 static void
3609 queue_check_insert_parent (struct merge_node_queue *queue,
3610 struct merge_node *node)
3612 if (node->level > MERGE_ROOT)
3614 lock_node (node->parent);
3615 queue_check_insert (queue, node->parent);
3616 unlock_node (node->parent);
3618 else if (node->nlo + node->nhi == 0)
3620 /* If the MERGE_ROOT NODE has finished merging, insert the
3621 MERGE_END node. */
3622 queue_insert (queue, node->parent);
3626 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3627 some of those lines, until the MERGE_END node is popped.
3628 TOTAL_LINES is the total number of lines. If merging at the top
3629 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3630 null if TFP is standard output. */
3632 static void
3633 merge_loop (struct merge_node_queue *queue,
3634 size_t total_lines, FILE *tfp, char const *temp_output)
3636 while (true)
3638 struct merge_node *node = queue_pop (queue);
3640 if (node->level == MERGE_END)
3642 unlock_node (node);
3643 /* Reinsert so other threads can pop it. */
3644 queue_insert (queue, node);
3645 break;
3647 mergelines_node (node, total_lines, tfp, temp_output);
3648 queue_check_insert (queue, node);
3649 queue_check_insert_parent (queue, node);
3651 unlock_node (node);
3656 static void sortlines (struct line *restrict, size_t, size_t,
3657 struct merge_node *, struct merge_node_queue *,
3658 FILE *, char const *);
3660 /* Thread arguments for sortlines_thread. */
3662 struct thread_args
3664 /* Source, i.e., the array of lines to sort. This points just past
3665 the end of the array. */
3666 struct line *lines;
3668 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3669 size_t nthreads;
3671 /* Number of lines in LINES and DEST. */
3672 size_t const total_lines;
3674 /* Merge node. Lines from this node and this node's sibling will merged
3675 to this node's parent. */
3676 struct merge_node *const node;
3678 /* The priority queue controlling available work for the entire
3679 internal sort. */
3680 struct merge_node_queue *const queue;
3682 /* If at the top level, the file to output to, and the file's name.
3683 If the file is standard output, the file's name is null. */
3684 FILE *tfp;
3685 char const *output_temp;
3688 /* Like sortlines, except with a signature acceptable to pthread_create. */
3690 static void *
3691 sortlines_thread (void *data)
3693 struct thread_args const *args = data;
3694 sortlines (args->lines, args->nthreads, args->total_lines,
3695 args->node, args->queue, args->tfp,
3696 args->output_temp);
3697 return nullptr;
3700 /* Sort lines, possibly in parallel. The arguments are as in struct
3701 thread_args above.
3703 The algorithm has three phases: node creation, sequential sort,
3704 and binary merge.
3706 During node creation, sortlines recursively visits each node in the
3707 binary merge tree and creates a NODE structure corresponding to all the
3708 future line merging NODE is responsible for. For each call to
3709 sortlines, half the available threads are assigned to each recursive
3710 call, until a leaf node having only 1 available thread is reached.
3712 Each leaf node then performs two sequential sorts, one on each half of
3713 the lines it is responsible for. It records in its NODE structure that
3714 there are two sorted sublists available to merge from, and inserts its
3715 NODE into the priority queue.
3717 The binary merge phase then begins. Each thread drops into a loop
3718 where the thread retrieves a NODE from the priority queue, merges lines
3719 available to that NODE, and potentially insert NODE or its parent back
3720 into the queue if there are sufficient available lines for them to
3721 merge. This continues until all lines at all nodes of the merge tree
3722 have been merged. */
3724 static void
3725 sortlines (struct line *restrict lines, size_t nthreads,
3726 size_t total_lines, struct merge_node *node,
3727 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3729 size_t nlines = node->nlo + node->nhi;
3731 /* Calculate thread arguments. */
3732 size_t lo_threads = nthreads / 2;
3733 size_t hi_threads = nthreads - lo_threads;
3734 pthread_t thread;
3735 struct thread_args args = {lines, lo_threads, total_lines,
3736 node->lo_child, queue, tfp, temp_output};
3738 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3739 && pthread_create (&thread, nullptr, sortlines_thread, &args) == 0)
3741 sortlines (lines - node->nlo, hi_threads, total_lines,
3742 node->hi_child, queue, tfp, temp_output);
3743 pthread_join (thread, nullptr);
3745 else
3747 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3748 Sort with 1 thread. */
3749 size_t nlo = node->nlo;
3750 size_t nhi = node->nhi;
3751 struct line *temp = lines - total_lines;
3752 if (1 < nhi)
3753 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3754 if (1 < nlo)
3755 sequential_sort (lines, nlo, temp, false);
3757 /* Update merge NODE. No need to lock yet. */
3758 node->lo = lines;
3759 node->hi = lines - nlo;
3760 node->end_lo = lines - nlo;
3761 node->end_hi = lines - nlo - nhi;
3763 queue_insert (queue, node);
3764 merge_loop (queue, total_lines, tfp, temp_output);
3768 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3769 the same as OUTFILE. If found, replace each with the same
3770 temporary copy that can be merged into OUTFILE without destroying
3771 OUTFILE before it is completely read. This temporary copy does not
3772 count as a merge temp, so don't worry about incrementing NTEMPS in
3773 the caller; final cleanup will remove it, not zaptemp.
3775 This test ensures that an otherwise-erroneous use like
3776 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3777 It's not clear that POSIX requires this nicety.
3778 Detect common error cases, but don't try to catch obscure cases like
3779 "cat ... FILE ... | sort -m -o FILE"
3780 where traditional "sort" doesn't copy the input and where
3781 people should know that they're getting into trouble anyway.
3782 Catching these obscure cases would slow down performance in
3783 common cases. */
3785 static void
3786 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3787 size_t nfiles, char const *outfile)
3789 struct tempnode *tempcopy = nullptr;
3791 for (size_t i = ntemps; i < nfiles; i++)
3793 bool is_stdin = STREQ (files[i].name, "-");
3794 bool same;
3795 struct stat instat;
3797 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3798 same = true;
3799 else
3801 struct stat *outst = get_outstatus ();
3802 if (!outst)
3803 break;
3805 same = (((is_stdin
3806 ? fstat (STDIN_FILENO, &instat)
3807 : stat (files[i].name, &instat))
3808 == 0)
3809 && SAME_INODE (instat, *outst));
3812 if (same)
3814 if (! tempcopy)
3816 FILE *tftp;
3817 tempcopy = create_temp (&tftp);
3818 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3821 files[i].name = tempcopy->name;
3822 files[i].temp = tempcopy;
3827 /* Scan the input files to ensure all are accessible.
3828 Otherwise exit with a diagnostic.
3830 This will catch common issues with permissions etc.
3831 but will fail to notice issues where you can open but not read,
3832 like when a directory is specified on some systems.
3833 Catching these obscure cases could slow down performance in
3834 common cases. */
3836 static void
3837 check_inputs (char *const *files, size_t nfiles)
3839 for (size_t i = 0; i < nfiles; i++)
3841 if (STREQ (files[i], "-"))
3842 continue;
3844 if (euidaccess (files[i], R_OK) != 0)
3845 sort_die (_("cannot read"), files[i]);
3849 /* Ensure a specified output file can be created or written to,
3850 and point stdout to it. Do not truncate the file.
3851 Exit with a diagnostic on failure. */
3853 static void
3854 check_output (char const *outfile)
3856 if (outfile)
3858 int oflags = O_WRONLY | O_BINARY | O_CLOEXEC | O_CREAT;
3859 int outfd = open (outfile, oflags, MODE_RW_UGO);
3860 if (outfd < 0)
3861 sort_die (_("open failed"), outfile);
3862 move_fd (outfd, STDOUT_FILENO);
3866 /* Merge the input FILES. NTEMPS is the number of files at the
3867 start of FILES that are temporary; it is zero at the top level.
3868 NFILES is the total number of files. Put the output in
3869 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3871 static void
3872 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3873 char const *output_file)
3875 while (nmerge < nfiles)
3877 /* Number of input files processed so far. */
3878 size_t in;
3880 /* Number of output files generated so far. */
3881 size_t out;
3883 /* nfiles % NMERGE; this counts input files that are left over
3884 after all full-sized merges have been done. */
3885 size_t remainder;
3887 /* Number of easily-available slots at the next loop iteration. */
3888 size_t cheap_slots;
3890 /* Do as many NMERGE-size merges as possible. In the case that
3891 nmerge is bogus, increment by the maximum number of file
3892 descriptors allowed. */
3893 for (out = in = 0; nmerge <= nfiles - in; out++)
3895 FILE *tfp;
3896 struct tempnode *temp = create_temp (&tfp);
3897 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3898 nmerge, tfp, temp->name);
3899 ntemps -= MIN (ntemps, num_merged);
3900 files[out].name = temp->name;
3901 files[out].temp = temp;
3902 in += num_merged;
3905 remainder = nfiles - in;
3906 cheap_slots = nmerge - out % nmerge;
3908 if (cheap_slots < remainder)
3910 /* So many files remain that they can't all be put into the last
3911 NMERGE-sized output window. Do one more merge. Merge as few
3912 files as possible, to avoid needless I/O. */
3913 size_t nshortmerge = remainder - cheap_slots + 1;
3914 FILE *tfp;
3915 struct tempnode *temp = create_temp (&tfp);
3916 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3917 nshortmerge, tfp, temp->name);
3918 ntemps -= MIN (ntemps, num_merged);
3919 files[out].name = temp->name;
3920 files[out++].temp = temp;
3921 in += num_merged;
3924 /* Put the remaining input files into the last NMERGE-sized output
3925 window, so they will be merged in the next pass. */
3926 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3927 ntemps += out;
3928 nfiles -= in - out;
3931 avoid_trashing_input (files, ntemps, nfiles, output_file);
3933 /* We aren't guaranteed that this final mergefiles will work, therefore we
3934 try to merge into the output, and then merge as much as we can into a
3935 temp file if we can't. Repeat. */
3937 while (true)
3939 /* Merge directly into the output file if possible. */
3940 FILE **fps;
3941 size_t nopened = open_input_files (files, nfiles, &fps);
3943 if (nopened == nfiles)
3945 FILE *ofp = stream_open (output_file, "w");
3946 if (ofp)
3948 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3949 break;
3951 if (errno != EMFILE || nopened <= 2)
3952 sort_die (_("open failed"), output_file);
3954 else if (nopened <= 2)
3955 sort_die (_("open failed"), files[nopened].name);
3957 /* We ran out of file descriptors. Close one of the input
3958 files, to gain a file descriptor. Then create a temporary
3959 file with our spare file descriptor. Retry if that failed
3960 (e.g., some other process could open a file between the time
3961 we closed and tried to create). */
3962 FILE *tfp;
3963 struct tempnode *temp;
3966 nopened--;
3967 xfclose (fps[nopened], files[nopened].name);
3968 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3970 while (!temp);
3972 /* Merge into the newly allocated temporary. */
3973 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3974 fps);
3975 ntemps -= MIN (ntemps, nopened);
3976 files[0].name = temp->name;
3977 files[0].temp = temp;
3979 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3980 ntemps++;
3981 nfiles -= nopened - 1;
3985 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3987 static void
3988 sort (char *const *files, size_t nfiles, char const *output_file,
3989 size_t nthreads)
3991 struct buffer buf;
3992 size_t ntemps = 0;
3993 bool output_file_created = false;
3995 buf.alloc = 0;
3997 while (nfiles)
3999 char const *temp_output;
4000 char const *file = *files;
4001 FILE *fp = xfopen (file, "r");
4002 FILE *tfp;
4004 size_t bytes_per_line;
4005 if (nthreads > 1)
4007 /* Get log P. */
4008 size_t tmp = 1;
4009 size_t mult = 1;
4010 while (tmp < nthreads)
4012 tmp *= 2;
4013 mult++;
4015 bytes_per_line = (mult * sizeof (struct line));
4017 else
4018 bytes_per_line = sizeof (struct line) * 3 / 2;
4020 if (! buf.alloc)
4021 initbuf (&buf, bytes_per_line,
4022 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
4023 buf.eof = false;
4024 files++;
4025 nfiles--;
4027 while (fillbuf (&buf, fp, file))
4029 struct line *line;
4031 if (buf.eof && nfiles
4032 && (bytes_per_line + 1
4033 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
4035 /* End of file, but there is more input and buffer room.
4036 Concatenate the next input file; this is faster in
4037 the usual case. */
4038 buf.left = buf.used;
4039 break;
4042 saved_line.text = nullptr;
4043 line = buffer_linelim (&buf);
4044 if (buf.eof && !nfiles && !ntemps && !buf.left)
4046 xfclose (fp, file);
4047 tfp = xfopen (output_file, "w");
4048 temp_output = output_file;
4049 output_file_created = true;
4051 else
4053 ++ntemps;
4054 temp_output = create_temp (&tfp)->name;
4056 if (1 < buf.nlines)
4058 struct merge_node_queue queue;
4059 queue_init (&queue, nthreads);
4060 struct merge_node *merge_tree =
4061 merge_tree_init (nthreads, buf.nlines, line);
4063 sortlines (line, nthreads, buf.nlines, merge_tree + 1,
4064 &queue, tfp, temp_output);
4066 merge_tree_destroy (nthreads, merge_tree);
4067 queue_destroy (&queue);
4069 else
4070 write_unique (line - 1, tfp, temp_output);
4072 xfclose (tfp, temp_output);
4074 if (output_file_created)
4075 goto finish;
4077 xfclose (fp, file);
4080 finish:
4081 free (buf.buf);
4083 if (! output_file_created)
4085 struct tempnode *node = temphead;
4086 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
4087 for (size_t i = 0; node; i++)
4089 tempfiles[i].name = node->name;
4090 tempfiles[i].temp = node;
4091 node = node->next;
4093 merge (tempfiles, ntemps, ntemps, output_file);
4094 free (tempfiles);
4097 reap_all ();
4100 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
4102 static void
4103 insertkey (struct keyfield *key_arg)
4105 struct keyfield **p;
4106 struct keyfield *key = xmemdup (key_arg, sizeof *key);
4108 for (p = &keylist; *p; p = &(*p)->next)
4109 continue;
4110 *p = key;
4111 key->next = nullptr;
4114 /* Report a bad field specification SPEC, with extra info MSGID. */
4116 static void
4117 badfieldspec (char const *spec, char const *msgid)
4119 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
4120 _(msgid), quote (spec));
4123 /* Report incompatible options. */
4125 static void
4126 incompatible_options (char const *opts)
4128 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
4131 /* Check compatibility of ordering options. */
4133 static void
4134 check_ordering_compatibility (void)
4136 struct keyfield *key;
4138 for (key = keylist; key; key = key->next)
4139 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4140 + key->month + (key->version | key->random | !!key->ignore)))
4142 /* The following is too big, but guaranteed to be "big enough". */
4143 char opts[sizeof short_options];
4144 /* Clear flags we're not interested in. */
4145 key->skipsblanks = key->skipeblanks = key->reverse = false;
4146 key_to_opts (key, opts);
4147 incompatible_options (opts);
4151 /* Parse the leading integer in STRING and store the resulting value
4152 (which must fit into size_t) into *VAL. Return the address of the
4153 suffix after the integer. If the value is too large, silently
4154 substitute SIZE_MAX. If MSGID is null, return nullptr after
4155 failure; otherwise, report MSGID and exit on failure. */
4157 static char const *
4158 parse_field_count (char const *string, size_t *val, char const *msgid)
4160 char *suffix;
4161 uintmax_t n;
4163 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4165 case LONGINT_OK:
4166 case LONGINT_INVALID_SUFFIX_CHAR:
4167 *val = n;
4168 if (*val == n)
4169 break;
4170 FALLTHROUGH;
4171 case LONGINT_OVERFLOW:
4172 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4173 *val = SIZE_MAX;
4174 break;
4176 case LONGINT_INVALID:
4177 if (msgid)
4178 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4179 _(msgid), quote (string));
4180 return nullptr;
4183 return suffix;
4186 /* Handle interrupts and hangups. */
4188 static void
4189 sighandler (int sig)
4191 if (! SA_NOCLDSTOP)
4192 signal (sig, SIG_IGN);
4194 cleanup ();
4196 signal (sig, SIG_DFL);
4197 raise (sig);
4200 /* Set the ordering options for KEY specified in S.
4201 Return the address of the first character in S that
4202 is not a valid ordering option.
4203 BLANKTYPE is the kind of blanks that 'b' should skip. */
4205 static char *
4206 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4208 while (*s)
4210 switch (*s)
4212 case 'b':
4213 if (blanktype == bl_start || blanktype == bl_both)
4214 key->skipsblanks = true;
4215 if (blanktype == bl_end || blanktype == bl_both)
4216 key->skipeblanks = true;
4217 break;
4218 case 'd':
4219 key->ignore = nondictionary;
4220 break;
4221 case 'f':
4222 key->translate = fold_toupper;
4223 break;
4224 case 'g':
4225 key->general_numeric = true;
4226 break;
4227 case 'h':
4228 key->human_numeric = true;
4229 break;
4230 case 'i':
4231 /* Option order should not matter, so don't let -i override
4232 -d. -d implies -i, but -i does not imply -d. */
4233 if (! key->ignore)
4234 key->ignore = nonprinting;
4235 break;
4236 case 'M':
4237 key->month = true;
4238 break;
4239 case 'n':
4240 key->numeric = true;
4241 break;
4242 case 'R':
4243 key->random = true;
4244 break;
4245 case 'r':
4246 key->reverse = true;
4247 break;
4248 case 'V':
4249 key->version = true;
4250 break;
4251 default:
4252 return (char *) s;
4254 ++s;
4256 return (char *) s;
4259 /* Initialize KEY. */
4261 static struct keyfield *
4262 key_init (struct keyfield *key)
4264 memset (key, 0, sizeof *key);
4265 key->eword = SIZE_MAX;
4266 return key;
4270 main (int argc, char **argv)
4272 struct keyfield *key;
4273 struct keyfield key_buf;
4274 struct keyfield gkey;
4275 bool gkey_only = false;
4276 char const *s;
4277 int c = 0;
4278 char checkonly = 0;
4279 bool mergeonly = false;
4280 char *random_source = nullptr;
4281 bool need_random = false;
4282 size_t nthreads = 0;
4283 size_t nfiles = 0;
4284 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
4285 int posix_ver = posix2_version ();
4286 bool traditional_usage = ! (200112 <= posix_ver && posix_ver < 200809);
4287 char **files;
4288 char *files_from = nullptr;
4289 struct Tokens tok;
4290 char const *outfile = nullptr;
4291 bool locale_ok;
4293 initialize_main (&argc, &argv);
4294 set_program_name (argv[0]);
4295 locale_ok = !! setlocale (LC_ALL, "");
4296 bindtextdomain (PACKAGE, LOCALEDIR);
4297 textdomain (PACKAGE);
4299 initialize_exit_failure (SORT_FAILURE);
4301 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4302 #if HAVE_NL_LANGINFO
4303 hard_LC_TIME = hard_locale (LC_TIME);
4304 #endif
4306 /* Get locale's representation of the decimal point. */
4308 struct lconv const *locale = localeconv ();
4310 /* If the locale doesn't define a decimal point, or if the decimal
4311 point is multibyte, use the C locale's decimal point. FIXME:
4312 add support for multibyte decimal points. */
4313 decimal_point = locale->decimal_point[0];
4314 if (! decimal_point || locale->decimal_point[1])
4315 decimal_point = '.';
4317 /* FIXME: add support for multibyte thousands separators. */
4318 thousands_sep = locale->thousands_sep[0];
4319 if (thousands_sep && locale->thousands_sep[1])
4320 thousands_sep_ignored = true;
4321 if (! thousands_sep || locale->thousands_sep[1])
4322 thousands_sep = NON_CHAR;
4325 have_read_stdin = false;
4326 inittables ();
4329 size_t i;
4330 static int const sig[] =
4332 /* The usual suspects. */
4333 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4334 #ifdef SIGPOLL
4335 SIGPOLL,
4336 #endif
4337 #ifdef SIGPROF
4338 SIGPROF,
4339 #endif
4340 #ifdef SIGVTALRM
4341 SIGVTALRM,
4342 #endif
4343 #ifdef SIGXCPU
4344 SIGXCPU,
4345 #endif
4346 #ifdef SIGXFSZ
4347 SIGXFSZ,
4348 #endif
4350 enum { nsigs = ARRAY_CARDINALITY (sig) };
4352 #if SA_NOCLDSTOP
4353 struct sigaction act;
4355 sigemptyset (&caught_signals);
4356 for (i = 0; i < nsigs; i++)
4358 sigaction (sig[i], nullptr, &act);
4359 if (act.sa_handler != SIG_IGN)
4360 sigaddset (&caught_signals, sig[i]);
4363 act.sa_handler = sighandler;
4364 act.sa_mask = caught_signals;
4365 act.sa_flags = 0;
4367 for (i = 0; i < nsigs; i++)
4368 if (sigismember (&caught_signals, sig[i]))
4369 sigaction (sig[i], &act, nullptr);
4370 #else
4371 for (i = 0; i < nsigs; i++)
4372 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4374 signal (sig[i], sighandler);
4375 siginterrupt (sig[i], 1);
4377 #endif
4379 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4381 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4382 atexit (exit_cleanup);
4384 key_init (&gkey);
4385 gkey.sword = SIZE_MAX;
4387 files = xnmalloc (argc, sizeof *files);
4389 while (true)
4391 /* Parse an operand as a file after "--" was seen; or if
4392 pedantic and a file was seen, unless the POSIX version
4393 is not 1003.1-2001 and -c was not seen and the operand is
4394 "-o FILE" or "-oFILE". */
4395 int oi = -1;
4397 if (c == -1
4398 || (posixly_correct && nfiles != 0
4399 && ! (traditional_usage
4400 && ! checkonly
4401 && optind != argc
4402 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4403 && (argv[optind][2] || optind + 1 != argc)))
4404 || ((c = getopt_long (argc, argv, short_options,
4405 long_options, &oi))
4406 == -1))
4408 if (argc <= optind)
4409 break;
4410 files[nfiles++] = argv[optind++];
4412 else switch (c)
4414 case 1:
4415 key = nullptr;
4416 if (optarg[0] == '+')
4418 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4419 && ISDIGIT (argv[optind][1]));
4420 traditional_usage |= minus_pos_usage && !posixly_correct;
4421 if (traditional_usage)
4423 /* Treat +POS1 [-POS2] as a key if possible; but silently
4424 treat an operand as a file if it is not a valid +POS1. */
4425 key = key_init (&key_buf);
4426 s = parse_field_count (optarg + 1, &key->sword, nullptr);
4427 if (s && *s == '.')
4428 s = parse_field_count (s + 1, &key->schar, nullptr);
4429 if (! (key->sword || key->schar))
4430 key->sword = SIZE_MAX;
4431 if (! s || *set_ordering (s, key, bl_start))
4432 key = nullptr;
4433 else
4435 if (minus_pos_usage)
4437 char const *optarg1 = argv[optind++];
4438 s = parse_field_count (optarg1 + 1, &key->eword,
4439 N_("invalid number after '-'"));
4440 if (*s == '.')
4441 s = parse_field_count (s + 1, &key->echar,
4442 N_("invalid number after '.'"));
4443 if (!key->echar && key->eword)
4445 /* obsolescent syntax +A.x -B.y is equivalent to:
4446 -k A+1.x+1,B.y (when y = 0)
4447 -k A+1.x+1,B+1.y (when y > 0)
4448 So eword is decremented as in the -k case
4449 only when the end field (B) is specified and
4450 echar (y) is 0. */
4451 key->eword--;
4453 if (*set_ordering (s, key, bl_end))
4454 badfieldspec (optarg1,
4455 N_("stray character in field spec"));
4457 key->traditional_used = true;
4458 insertkey (key);
4462 if (! key)
4463 files[nfiles++] = optarg;
4464 break;
4466 case SORT_OPTION:
4467 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4468 FALLTHROUGH;
4469 case 'b':
4470 case 'd':
4471 case 'f':
4472 case 'g':
4473 case 'h':
4474 case 'i':
4475 case 'M':
4476 case 'n':
4477 case 'r':
4478 case 'R':
4479 case 'V':
4481 char str[2];
4482 str[0] = c;
4483 str[1] = '\0';
4484 set_ordering (str, &gkey, bl_both);
4486 break;
4488 case CHECK_OPTION:
4489 c = (optarg
4490 ? XARGMATCH ("--check", optarg, check_args, check_types)
4491 : 'c');
4492 FALLTHROUGH;
4493 case 'c':
4494 case 'C':
4495 if (checkonly && checkonly != c)
4496 incompatible_options ("cC");
4497 checkonly = c;
4498 break;
4500 case COMPRESS_PROGRAM_OPTION:
4501 if (compress_program && !STREQ (compress_program, optarg))
4502 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4503 compress_program = optarg;
4504 break;
4506 case DEBUG_PROGRAM_OPTION:
4507 debug = true;
4508 break;
4510 case FILES0_FROM_OPTION:
4511 files_from = optarg;
4512 break;
4514 case 'k':
4515 key = key_init (&key_buf);
4517 /* Get POS1. */
4518 s = parse_field_count (optarg, &key->sword,
4519 N_("invalid number at field start"));
4520 if (! key->sword--)
4522 /* Provoke with 'sort -k0' */
4523 badfieldspec (optarg, N_("field number is zero"));
4525 if (*s == '.')
4527 s = parse_field_count (s + 1, &key->schar,
4528 N_("invalid number after '.'"));
4529 if (! key->schar--)
4531 /* Provoke with 'sort -k1.0' */
4532 badfieldspec (optarg, N_("character offset is zero"));
4535 if (! (key->sword || key->schar))
4536 key->sword = SIZE_MAX;
4537 s = set_ordering (s, key, bl_start);
4538 if (*s != ',')
4540 key->eword = SIZE_MAX;
4541 key->echar = 0;
4543 else
4545 /* Get POS2. */
4546 s = parse_field_count (s + 1, &key->eword,
4547 N_("invalid number after ','"));
4548 if (! key->eword--)
4550 /* Provoke with 'sort -k1,0' */
4551 badfieldspec (optarg, N_("field number is zero"));
4553 if (*s == '.')
4555 s = parse_field_count (s + 1, &key->echar,
4556 N_("invalid number after '.'"));
4558 s = set_ordering (s, key, bl_end);
4560 if (*s)
4561 badfieldspec (optarg, N_("stray character in field spec"));
4562 insertkey (key);
4563 break;
4565 case 'm':
4566 mergeonly = true;
4567 break;
4569 case NMERGE_OPTION:
4570 specify_nmerge (oi, c, optarg);
4571 break;
4573 case 'o':
4574 if (outfile && !STREQ (outfile, optarg))
4575 error (SORT_FAILURE, 0, _("multiple output files specified"));
4576 outfile = optarg;
4577 break;
4579 case RANDOM_SOURCE_OPTION:
4580 if (random_source && !STREQ (random_source, optarg))
4581 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4582 random_source = optarg;
4583 break;
4585 case 's':
4586 stable = true;
4587 break;
4589 case 'S':
4590 specify_sort_size (oi, c, optarg);
4591 break;
4593 case 't':
4595 char newtab = optarg[0];
4596 if (! newtab)
4597 error (SORT_FAILURE, 0, _("empty tab"));
4598 if (optarg[1])
4600 if (STREQ (optarg, "\\0"))
4601 newtab = '\0';
4602 else
4604 /* Provoke with 'sort -txx'. Complain about
4605 "multi-character tab" instead of "multibyte tab", so
4606 that the diagnostic's wording does not need to be
4607 changed once multibyte characters are supported. */
4608 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4609 quote (optarg));
4612 if (tab != TAB_DEFAULT && tab != newtab)
4613 error (SORT_FAILURE, 0, _("incompatible tabs"));
4614 tab = newtab;
4616 break;
4618 case 'T':
4619 add_temp_dir (optarg);
4620 break;
4622 case PARALLEL_OPTION:
4623 nthreads = specify_nthreads (oi, c, optarg);
4624 break;
4626 case 'u':
4627 unique = true;
4628 break;
4630 case 'y':
4631 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4632 through Solaris 7. It is also accepted by many non-Solaris
4633 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4634 -y is marked as obsolete starting with Solaris 8 (1999), but is
4635 still accepted as of Solaris 10 prerelease (2004).
4637 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4638 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4639 and which in general ignores the argument after "-y" if it
4640 consists entirely of digits (it can even be empty). */
4641 if (optarg == argv[optind - 1])
4643 char const *p;
4644 for (p = optarg; ISDIGIT (*p); p++)
4645 continue;
4646 optind -= (*p != '\0');
4648 break;
4650 case 'z':
4651 eolchar = 0;
4652 break;
4654 case_GETOPT_HELP_CHAR;
4656 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4658 default:
4659 usage (SORT_FAILURE);
4663 if (files_from)
4665 /* When using --files0-from=F, you may not specify any files
4666 on the command-line. */
4667 if (nfiles)
4669 error (0, 0, _("extra operand %s"), quoteaf (files[0]));
4670 fprintf (stderr, "%s\n",
4671 _("file operands cannot be combined with --files0-from"));
4672 usage (SORT_FAILURE);
4675 FILE *stream = xfopen (files_from, "r");
4677 readtokens0_init (&tok);
4679 if (! readtokens0 (stream, &tok))
4680 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4681 quoteaf (files_from));
4682 xfclose (stream, files_from);
4684 if (tok.n_tok)
4686 free (files);
4687 files = tok.tok;
4688 nfiles = tok.n_tok;
4689 for (size_t i = 0; i < nfiles; i++)
4691 if (STREQ (files[i], "-"))
4692 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4693 "no file name of %s allowed"),
4694 quoteaf (files[i]));
4695 else if (files[i][0] == '\0')
4697 /* Using the standard 'filename:line-number:' prefix here is
4698 not totally appropriate, since NUL is the separator,
4699 not NL, but it might be better than nothing. */
4700 unsigned long int file_number = i + 1;
4701 error (SORT_FAILURE, 0,
4702 _("%s:%lu: invalid zero-length file name"),
4703 quotef (files_from), file_number);
4707 else
4708 error (SORT_FAILURE, 0, _("no input from %s"),
4709 quoteaf (files_from));
4712 /* Inheritance of global options to individual keys. */
4713 for (key = keylist; key; key = key->next)
4715 if (default_key_compare (key) && !key->reverse)
4717 key->ignore = gkey.ignore;
4718 key->translate = gkey.translate;
4719 key->skipsblanks = gkey.skipsblanks;
4720 key->skipeblanks = gkey.skipeblanks;
4721 key->month = gkey.month;
4722 key->numeric = gkey.numeric;
4723 key->general_numeric = gkey.general_numeric;
4724 key->human_numeric = gkey.human_numeric;
4725 key->version = gkey.version;
4726 key->random = gkey.random;
4727 key->reverse = gkey.reverse;
4730 need_random |= key->random;
4733 if (!keylist && !default_key_compare (&gkey))
4735 gkey_only = true;
4736 insertkey (&gkey);
4737 need_random |= gkey.random;
4740 check_ordering_compatibility ();
4742 if (debug)
4744 if (checkonly || outfile)
4746 static char opts[] = "X --debug";
4747 opts[0] = (checkonly ? checkonly : 'o');
4748 incompatible_options (opts);
4751 /* Always output the locale in debug mode, since this
4752 is such a common source of confusion. */
4754 /* OpenBSD can only set some categories with LC_ALL above,
4755 so set LC_COLLATE explicitly to flag errors. */
4756 if (locale_ok)
4757 locale_ok = !! setlocale (LC_COLLATE, "");
4758 if (! locale_ok)
4759 error (0, 0, "%s", _("failed to set locale"));
4760 if (hard_LC_COLLATE)
4761 error (0, 0, _("text ordering performed using %s sorting rules"),
4762 quote (setlocale (LC_COLLATE, nullptr)));
4763 else
4764 error (0, 0, "%s",
4765 _("text ordering performed using simple byte comparison"));
4767 key_warnings (&gkey, gkey_only);
4770 reverse = gkey.reverse;
4772 if (need_random)
4773 random_md5_state_init (random_source);
4775 if (temp_dir_count == 0)
4777 char const *tmp_dir = getenv ("TMPDIR");
4778 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4781 if (nfiles == 0)
4783 nfiles = 1;
4784 free (files);
4785 files = xmalloc (sizeof *files);
4786 *files = (char *) "-";
4789 /* Need to re-check that we meet the minimum requirement for memory
4790 usage with the final value for NMERGE. */
4791 if (0 < sort_size)
4792 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4794 if (checkonly)
4796 if (nfiles > 1)
4797 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4798 quoteaf (files[1]), checkonly);
4800 if (outfile)
4802 static char opts[] = {0, 'o', 0};
4803 opts[0] = checkonly;
4804 incompatible_options (opts);
4807 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4808 input is not properly sorted. */
4809 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4812 /* Check all inputs are accessible, or exit immediately. */
4813 check_inputs (files, nfiles);
4815 /* Check output is writable, or exit immediately. */
4816 check_output (outfile);
4818 if (mergeonly)
4820 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4822 for (size_t i = 0; i < nfiles; ++i)
4823 sortfiles[i].name = files[i];
4825 merge (sortfiles, 0, nfiles, outfile);
4827 else
4829 if (!nthreads)
4831 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4832 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4835 /* Avoid integer overflow later. */
4836 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4837 nthreads = MIN (nthreads, nthreads_max);
4839 sort (files, nfiles, outfile, nthreads);
4842 if (have_read_stdin && fclose (stdin) == EOF)
4843 sort_die (_("close failed"), "-");
4845 main_exit (EXIT_SUCCESS);