sort: make -h work with -k and blank used as thousands separator
[coreutils.git] / src / sort.c
blob038f6aee3270b8a0718926667deb7dd266e18f68
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/resource.h>
28 #include <sys/types.h>
29 #include <sys/wait.h>
30 #include <signal.h>
31 #include <assert.h>
32 #include "system.h"
33 #include "argmatch.h"
34 #include "error.h"
35 #include "fadvise.h"
36 #include "filevercmp.h"
37 #include "hard-locale.h"
38 #include "hash.h"
39 #include "heap.h"
40 #include "ignore-value.h"
41 #include "md5.h"
42 #include "mbswidth.h"
43 #include "nproc.h"
44 #include "physmem.h"
45 #include "posixver.h"
46 #include "quote.h"
47 #include "randread.h"
48 #include "readtokens0.h"
49 #include "stdio--.h"
50 #include "stdlib--.h"
51 #include "strnumcmp.h"
52 #include "xmemcoll.h"
53 #include "xnanosleep.h"
54 #include "xstrtol.h"
56 #ifndef RLIMIT_DATA
57 struct rlimit { size_t rlim_cur; };
58 # define getrlimit(Resource, Rlp) (-1)
59 #endif
61 /* The official name of this program (e.g., no 'g' prefix). */
62 #define PROGRAM_NAME "sort"
64 #define AUTHORS \
65 proper_name ("Mike Haertel"), \
66 proper_name ("Paul Eggert")
68 #if HAVE_LANGINFO_CODESET
69 # include <langinfo.h>
70 #endif
72 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
73 present. */
74 #ifndef SA_NOCLDSTOP
75 # define SA_NOCLDSTOP 0
76 /* No sigprocmask. Always 'return' zero. */
77 # define sigprocmask(How, Set, Oset) (0)
78 # define sigset_t int
79 # if ! HAVE_SIGINTERRUPT
80 # define siginterrupt(sig, flag) /* empty */
81 # endif
82 #endif
84 #if !defined OPEN_MAX && defined NR_OPEN
85 # define OPEN_MAX NR_OPEN
86 #endif
87 #if !defined OPEN_MAX
88 # define OPEN_MAX 20
89 #endif
91 #define UCHAR_LIM (UCHAR_MAX + 1)
93 #if HAVE_C99_STRTOLD
94 # define long_double long double
95 #else
96 # define long_double double
97 # undef strtold
98 # define strtold strtod
99 #endif
101 #ifndef DEFAULT_TMPDIR
102 # define DEFAULT_TMPDIR "/tmp"
103 #endif
105 /* Maximum number of lines to merge every time a NODE is taken from
106 the merge queue. Node is at LEVEL in the binary merge tree,
107 and is responsible for merging TOTAL lines. */
108 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
110 /* Heuristic value for the number of lines for which it is worth creating
111 a subthread, during an internal merge sort. I.e., it is a small number
112 of "average" lines for which sorting via two threads is faster than
113 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
114 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
115 lines of gensort -a output is sorted slightly faster with --parallel=2
116 than with --parallel=1. By contrast, using --parallel=1 is about 10%
117 faster than using --parallel=2 with a 64K-line input. */
118 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
119 verify (4 <= SUBTHREAD_LINES_HEURISTIC);
121 /* The number of threads after which there are
122 diminishing performance gains. */
123 enum { DEFAULT_MAX_THREADS = 8 };
125 /* Exit statuses. */
126 enum
128 /* POSIX says to exit with status 1 if invoked with -c and the
129 input is not properly sorted. */
130 SORT_OUT_OF_ORDER = 1,
132 /* POSIX says any other irregular exit must exit with a status
133 code greater than 1. */
134 SORT_FAILURE = 2
137 enum
139 /* The number of times we should try to fork a compression process
140 (we retry if the fork call fails). We don't _need_ to compress
141 temp files, this is just to reduce disk access, so this number
142 can be small. Each retry doubles in duration. */
143 MAX_FORK_TRIES_COMPRESS = 4,
145 /* The number of times we should try to fork a decompression process.
146 If we can't fork a decompression process, we can't sort, so this
147 number should be big. Each retry doubles in duration. */
148 MAX_FORK_TRIES_DECOMPRESS = 9
151 enum
153 /* Level of the end-of-merge node, one level above the root. */
154 MERGE_END = 0,
156 /* Level of the root node in merge tree. */
157 MERGE_ROOT = 1
160 /* The representation of the decimal point in the current locale. */
161 static int decimal_point;
163 /* Thousands separator; if -1, then there isn't one. */
164 static int thousands_sep;
166 /* Nonzero if the corresponding locales are hard. */
167 static bool hard_LC_COLLATE;
168 #if HAVE_NL_LANGINFO
169 static bool hard_LC_TIME;
170 #endif
172 #define NONZERO(x) ((x) != 0)
174 /* The kind of blanks for '-b' to skip in various options. */
175 enum blanktype { bl_start, bl_end, bl_both };
177 /* The character marking end of line. Default to \n. */
178 static char eolchar = '\n';
180 /* Lines are held in core as counted strings. */
181 struct line
183 char *text; /* Text of the line. */
184 size_t length; /* Length including final newline. */
185 char *keybeg; /* Start of first key. */
186 char *keylim; /* Limit of first key. */
189 /* Input buffers. */
190 struct buffer
192 char *buf; /* Dynamically allocated buffer,
193 partitioned into 3 regions:
194 - input data;
195 - unused area;
196 - an array of lines, in reverse order. */
197 size_t used; /* Number of bytes used for input data. */
198 size_t nlines; /* Number of lines in the line array. */
199 size_t alloc; /* Number of bytes allocated. */
200 size_t left; /* Number of bytes left from previous reads. */
201 size_t line_bytes; /* Number of bytes to reserve for each line. */
202 bool eof; /* An EOF has been read. */
205 /* Sort key. */
206 struct keyfield
208 size_t sword; /* Zero-origin 'word' to start at. */
209 size_t schar; /* Additional characters to skip. */
210 size_t eword; /* Zero-origin last 'word' of key. */
211 size_t echar; /* Additional characters in field. */
212 bool const *ignore; /* Boolean array of characters to ignore. */
213 char const *translate; /* Translation applied to characters. */
214 bool skipsblanks; /* Skip leading blanks when finding start. */
215 bool skipeblanks; /* Skip leading blanks when finding end. */
216 bool numeric; /* Flag for numeric comparison. Handle
217 strings of digits with optional decimal
218 point, but no exponential notation. */
219 bool random; /* Sort by random hash of key. */
220 bool general_numeric; /* Flag for general, numeric comparison.
221 Handle numbers in exponential notation. */
222 bool human_numeric; /* Flag for sorting by human readable
223 units with either SI or IEC prefixes. */
224 bool month; /* Flag for comparison by month name. */
225 bool reverse; /* Reverse the sense of comparison. */
226 bool version; /* sort by version number */
227 bool traditional_used; /* Traditional key option format is used. */
228 struct keyfield *next; /* Next keyfield to try. */
231 struct month
233 char const *name;
234 int val;
237 /* Binary merge tree node. */
238 struct merge_node
240 struct line *lo; /* Lines to merge from LO child node. */
241 struct line *hi; /* Lines to merge from HI child node. */
242 struct line *end_lo; /* End of available lines from LO. */
243 struct line *end_hi; /* End of available lines from HI. */
244 struct line **dest; /* Pointer to destination of merge. */
245 size_t nlo; /* Total Lines remaining from LO. */
246 size_t nhi; /* Total lines remaining from HI. */
247 struct merge_node *parent; /* Parent node. */
248 struct merge_node *lo_child; /* LO child node. */
249 struct merge_node *hi_child; /* HI child node. */
250 unsigned int level; /* Level in merge tree. */
251 bool queued; /* Node is already in heap. */
252 pthread_mutex_t lock; /* Lock for node operations. */
255 /* Priority queue of merge nodes. */
256 struct merge_node_queue
258 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
259 pthread_mutex_t mutex; /* Lock for queue operations. */
260 pthread_cond_t cond; /* Conditional wait for empty queue to populate
261 when popping. */
264 /* Used to implement --unique (-u). */
265 static struct line saved_line;
267 /* FIXME: None of these tables work with multibyte character sets.
268 Also, there are many other bugs when handling multibyte characters.
269 One way to fix this is to rewrite 'sort' to use wide characters
270 internally, but doing this with good performance is a bit
271 tricky. */
273 /* Table of blanks. */
274 static bool blanks[UCHAR_LIM];
276 /* Table of non-printing characters. */
277 static bool nonprinting[UCHAR_LIM];
279 /* Table of non-dictionary characters (not letters, digits, or blanks). */
280 static bool nondictionary[UCHAR_LIM];
282 /* Translation table folding lower case to upper. */
283 static char fold_toupper[UCHAR_LIM];
285 #define MONTHS_PER_YEAR 12
287 /* Table mapping month names to integers.
288 Alphabetic order allows binary search. */
289 static struct month monthtab[] =
291 {"APR", 4},
292 {"AUG", 8},
293 {"DEC", 12},
294 {"FEB", 2},
295 {"JAN", 1},
296 {"JUL", 7},
297 {"JUN", 6},
298 {"MAR", 3},
299 {"MAY", 5},
300 {"NOV", 11},
301 {"OCT", 10},
302 {"SEP", 9}
305 /* During the merge phase, the number of files to merge at once. */
306 #define NMERGE_DEFAULT 16
308 /* Minimum size for a merge or check buffer. */
309 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
311 /* Minimum sort size; the code might not work with smaller sizes. */
312 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
314 /* The number of bytes needed for a merge or check buffer, which can
315 function relatively efficiently even if it holds only one line. If
316 a longer line is seen, this value is increased. */
317 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
319 /* The approximate maximum number of bytes of main memory to use, as
320 specified by the user. Zero if the user has not specified a size. */
321 static size_t sort_size;
323 /* The initial allocation factor for non-regular files.
324 This is used, e.g., when reading from a pipe.
325 Don't make it too big, since it is multiplied by ~130 to
326 obtain the size of the actual buffer sort will allocate.
327 Also, there may be 8 threads all doing this at the same time. */
328 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
330 /* Array of directory names in which any temporary files are to be created. */
331 static char const **temp_dirs;
333 /* Number of temporary directory names used. */
334 static size_t temp_dir_count;
336 /* Number of allocated slots in temp_dirs. */
337 static size_t temp_dir_alloc;
339 /* Flag to reverse the order of all comparisons. */
340 static bool reverse;
342 /* Flag for stable sort. This turns off the last ditch bytewise
343 comparison of lines, and instead leaves lines in the same order
344 they were read if all keys compare equal. */
345 static bool stable;
347 /* If TAB has this value, blanks separate fields. */
348 enum { TAB_DEFAULT = CHAR_MAX + 1 };
350 /* Tab character separating fields. If TAB_DEFAULT, then fields are
351 separated by the empty string between a non-blank character and a blank
352 character. */
353 static int tab = TAB_DEFAULT;
355 /* Flag to remove consecutive duplicate lines from the output.
356 Only the last of a sequence of equal lines will be output. */
357 static bool unique;
359 /* Nonzero if any of the input files are the standard input. */
360 static bool have_read_stdin;
362 /* List of key field comparisons to be tried. */
363 static struct keyfield *keylist;
365 /* Program used to (de)compress temp files. Must accept -d. */
366 static char const *compress_program;
368 /* Annotate the output with extra info to aid the user. */
369 static bool debug;
371 /* Maximum number of files to merge in one go. If more than this
372 number are present, temp files will be used. */
373 static unsigned int nmerge = NMERGE_DEFAULT;
375 /* Output an error to stderr using async-signal-safe routines, and _exit().
376 This can be used safely from signal handlers,
377 and between fork() and exec() of multithreaded processes. */
379 static void async_safe_die (int, const char *) ATTRIBUTE_NORETURN;
380 static void
381 async_safe_die (int errnum, const char *errstr)
383 ignore_value (write (STDERR_FILENO, errstr, strlen (errstr)));
385 /* Even if defined HAVE_STRERROR_R, we can't use it,
386 as it may return a translated string etc. and even if not
387 may malloc() which is unsafe. We might improve this
388 by testing for sys_errlist and using that if available.
389 For now just report the error number. */
390 if (errnum)
392 char errbuf[INT_BUFSIZE_BOUND (errnum)];
393 char *p = inttostr (errnum, errbuf);
394 ignore_value (write (STDERR_FILENO, ": errno ", 8));
395 ignore_value (write (STDERR_FILENO, p, strlen (p)));
398 ignore_value (write (STDERR_FILENO, "\n", 1));
400 _exit (SORT_FAILURE);
403 /* Report MESSAGE for FILE, then clean up and exit.
404 If FILE is null, it represents standard output. */
406 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
407 static void
408 die (char const *message, char const *file)
410 error (0, errno, "%s: %s", message,
411 quotef (file ? file : _("standard output")));
412 exit (SORT_FAILURE);
415 void
416 usage (int status)
418 if (status != EXIT_SUCCESS)
419 emit_try_help ();
420 else
422 printf (_("\
423 Usage: %s [OPTION]... [FILE]...\n\
424 or: %s [OPTION]... --files0-from=F\n\
426 program_name, program_name);
427 fputs (_("\
428 Write sorted concatenation of all FILE(s) to standard output.\n\
429 "), stdout);
431 emit_stdin_note ();
432 emit_mandatory_arg_note ();
434 fputs (_("\
435 Ordering options:\n\
437 "), stdout);
438 fputs (_("\
439 -b, --ignore-leading-blanks ignore leading blanks\n\
440 -d, --dictionary-order consider only blanks and alphanumeric characters\
442 -f, --ignore-case fold lower case to upper case characters\n\
443 "), stdout);
444 fputs (_("\
445 -g, --general-numeric-sort compare according to general numerical value\n\
446 -i, --ignore-nonprinting consider only printable characters\n\
447 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
448 "), stdout);
449 fputs (_("\
450 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
451 "), stdout);
452 fputs (_("\
453 -n, --numeric-sort compare according to string numerical value\n\
454 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
455 --random-source=FILE get random bytes from FILE\n\
456 -r, --reverse reverse the result of comparisons\n\
457 "), stdout);
458 fputs (_("\
459 --sort=WORD sort according to WORD:\n\
460 general-numeric -g, human-numeric -h, month -M,\
462 numeric -n, random -R, version -V\n\
463 -V, --version-sort natural sort of (version) numbers within text\n\
465 "), stdout);
466 fputs (_("\
467 Other options:\n\
469 "), stdout);
470 fputs (_("\
471 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
472 for more use temp files\n\
473 "), stdout);
474 fputs (_("\
475 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
476 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
478 --compress-program=PROG compress temporaries with PROG;\n\
479 decompress them with PROG -d\n\
480 "), stdout);
481 fputs (_("\
482 --debug annotate the part of the line used to sort,\n\
483 and warn about questionable usage to stderr\n\
484 --files0-from=F read input from the files specified by\n\
485 NUL-terminated names in file F;\n\
486 If F is - then read names from standard input\n\
487 "), stdout);
488 fputs (_("\
489 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
490 -m, --merge merge already sorted files; do not sort\n\
491 "), stdout);
492 fputs (_("\
493 -o, --output=FILE write result to FILE instead of standard output\n\
494 -s, --stable stabilize sort by disabling last-resort comparison\
496 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
497 "), stdout);
498 printf (_("\
499 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
500 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
501 multiple options specify multiple directories\n\
502 --parallel=N change the number of sorts run concurrently to N\n\
503 -u, --unique with -c, check for strict ordering;\n\
504 without -c, output only the first of an equal run\
506 "), DEFAULT_TMPDIR);
507 fputs (_("\
508 -z, --zero-terminated line delimiter is NUL, not newline\n\
509 "), stdout);
510 fputs (HELP_OPTION_DESCRIPTION, stdout);
511 fputs (VERSION_OPTION_DESCRIPTION, stdout);
512 fputs (_("\
514 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
515 field number and C a character position in the field; both are origin 1, and\n\
516 the stop position defaults to the line's end. If neither -t nor -b is in\n\
517 effect, characters in a field are counted from the beginning of the preceding\n\
518 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
520 which override global ordering options for that key. If no key is given, use\n\
521 the entire line as the key. Use --debug to diagnose incorrect key usage.\n\
523 SIZE may be followed by the following multiplicative suffixes:\n\
524 "), stdout);
525 fputs (_("\
526 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
528 *** WARNING ***\n\
529 The locale specified by the environment affects sort order.\n\
530 Set LC_ALL=C to get the traditional sort order that uses\n\
531 native byte values.\n\
532 "), stdout );
533 emit_ancillary_info (PROGRAM_NAME);
536 exit (status);
539 /* For long options that have no equivalent short option, use a
540 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
541 enum
543 CHECK_OPTION = CHAR_MAX + 1,
544 COMPRESS_PROGRAM_OPTION,
545 DEBUG_PROGRAM_OPTION,
546 FILES0_FROM_OPTION,
547 NMERGE_OPTION,
548 RANDOM_SOURCE_OPTION,
549 SORT_OPTION,
550 PARALLEL_OPTION
553 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
555 static struct option const long_options[] =
557 {"ignore-leading-blanks", no_argument, NULL, 'b'},
558 {"check", optional_argument, NULL, CHECK_OPTION},
559 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
560 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
561 {"dictionary-order", no_argument, NULL, 'd'},
562 {"ignore-case", no_argument, NULL, 'f'},
563 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
564 {"general-numeric-sort", no_argument, NULL, 'g'},
565 {"ignore-nonprinting", no_argument, NULL, 'i'},
566 {"key", required_argument, NULL, 'k'},
567 {"merge", no_argument, NULL, 'm'},
568 {"month-sort", no_argument, NULL, 'M'},
569 {"numeric-sort", no_argument, NULL, 'n'},
570 {"human-numeric-sort", no_argument, NULL, 'h'},
571 {"version-sort", no_argument, NULL, 'V'},
572 {"random-sort", no_argument, NULL, 'R'},
573 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
574 {"sort", required_argument, NULL, SORT_OPTION},
575 {"output", required_argument, NULL, 'o'},
576 {"reverse", no_argument, NULL, 'r'},
577 {"stable", no_argument, NULL, 's'},
578 {"batch-size", required_argument, NULL, NMERGE_OPTION},
579 {"buffer-size", required_argument, NULL, 'S'},
580 {"field-separator", required_argument, NULL, 't'},
581 {"temporary-directory", required_argument, NULL, 'T'},
582 {"unique", no_argument, NULL, 'u'},
583 {"zero-terminated", no_argument, NULL, 'z'},
584 {"parallel", required_argument, NULL, PARALLEL_OPTION},
585 {GETOPT_HELP_OPTION_DECL},
586 {GETOPT_VERSION_OPTION_DECL},
587 {NULL, 0, NULL, 0},
590 #define CHECK_TABLE \
591 _ct_("quiet", 'C') \
592 _ct_("silent", 'C') \
593 _ct_("diagnose-first", 'c')
595 static char const *const check_args[] =
597 #define _ct_(_s, _c) _s,
598 CHECK_TABLE NULL
599 #undef _ct_
601 static char const check_types[] =
603 #define _ct_(_s, _c) _c,
604 CHECK_TABLE
605 #undef _ct_
608 #define SORT_TABLE \
609 _st_("general-numeric", 'g') \
610 _st_("human-numeric", 'h') \
611 _st_("month", 'M') \
612 _st_("numeric", 'n') \
613 _st_("random", 'R') \
614 _st_("version", 'V')
616 static char const *const sort_args[] =
618 #define _st_(_s, _c) _s,
619 SORT_TABLE NULL
620 #undef _st_
622 static char const sort_types[] =
624 #define _st_(_s, _c) _c,
625 SORT_TABLE
626 #undef _st_
629 /* The set of signals that are caught. */
630 static sigset_t caught_signals;
632 /* Critical section status. */
633 struct cs_status
635 bool valid;
636 sigset_t sigs;
639 /* Enter a critical section. */
640 static struct cs_status
641 cs_enter (void)
643 struct cs_status status;
644 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
645 return status;
648 /* Leave a critical section. */
649 static void
650 cs_leave (struct cs_status status)
652 if (status.valid)
654 /* Ignore failure when restoring the signal mask. */
655 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
659 /* Possible states for a temp file. If compressed, the file's status
660 is unreaped or reaped, depending on whether 'sort' has waited for
661 the subprocess to finish. */
662 enum { UNCOMPRESSED, UNREAPED, REAPED };
664 /* The list of temporary files. */
665 struct tempnode
667 struct tempnode *volatile next;
668 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
669 char state;
670 char name[1]; /* Actual size is 1 + file name length. */
672 static struct tempnode *volatile temphead;
673 static struct tempnode *volatile *temptail = &temphead;
675 /* A file to be sorted. */
676 struct sortfile
678 /* The file's name. */
679 char const *name;
681 /* Non-null if this is a temporary file, in which case NAME == TEMP->name. */
682 struct tempnode *temp;
685 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
686 static Hash_table *proctab;
688 enum { INIT_PROCTAB_SIZE = 47 };
690 static size_t
691 proctab_hasher (void const *entry, size_t tabsize)
693 struct tempnode const *node = entry;
694 return node->pid % tabsize;
697 static bool
698 proctab_comparator (void const *e1, void const *e2)
700 struct tempnode const *n1 = e1;
701 struct tempnode const *n2 = e2;
702 return n1->pid == n2->pid;
705 /* The number of unreaped child processes. */
706 static pid_t nprocs;
708 static bool delete_proc (pid_t);
710 /* If PID is positive, wait for the child process with that PID to
711 exit, and assume that PID has already been removed from the process
712 table. If PID is 0 or -1, clean up some child that has exited (by
713 waiting for it, and removing it from the proc table) and return the
714 child's process ID. However, if PID is 0 and no children have
715 exited, return 0 without waiting. */
717 static pid_t
718 reap (pid_t pid)
720 int status;
721 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
723 if (cpid < 0)
724 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
725 quoteaf (compress_program));
726 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
728 if (! WIFEXITED (status) || WEXITSTATUS (status))
729 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
730 quoteaf (compress_program));
731 --nprocs;
734 return cpid;
737 /* TEMP represents a new process; add it to the process table. Create
738 the process table the first time it's called. */
740 static void
741 register_proc (struct tempnode *temp)
743 if (! proctab)
745 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
746 proctab_hasher,
747 proctab_comparator,
748 NULL);
749 if (! proctab)
750 xalloc_die ();
753 temp->state = UNREAPED;
755 if (! hash_insert (proctab, temp))
756 xalloc_die ();
759 /* If PID is in the process table, remove it and return true.
760 Otherwise, return false. */
762 static bool
763 delete_proc (pid_t pid)
765 struct tempnode test;
767 test.pid = pid;
768 struct tempnode *node = hash_delete (proctab, &test);
769 if (! node)
770 return false;
771 node->state = REAPED;
772 return true;
775 /* Remove PID from the process table, and wait for it to exit if it
776 hasn't already. */
778 static void
779 wait_proc (pid_t pid)
781 if (delete_proc (pid))
782 reap (pid);
785 /* Reap any exited children. Do not block; reap only those that have
786 already exited. */
788 static void
789 reap_exited (void)
791 while (0 < nprocs && reap (0))
792 continue;
795 /* Reap at least one exited child, waiting if necessary. */
797 static void
798 reap_some (void)
800 reap (-1);
801 reap_exited ();
804 /* Reap all children, waiting if necessary. */
806 static void
807 reap_all (void)
809 while (0 < nprocs)
810 reap (-1);
813 /* Clean up any remaining temporary files. */
815 static void
816 cleanup (void)
818 struct tempnode const *node;
820 for (node = temphead; node; node = node->next)
821 unlink (node->name);
822 temphead = NULL;
825 /* Cleanup actions to take when exiting. */
827 static void
828 exit_cleanup (void)
830 if (temphead)
832 /* Clean up any remaining temporary files in a critical section so
833 that a signal handler does not try to clean them too. */
834 struct cs_status cs = cs_enter ();
835 cleanup ();
836 cs_leave (cs);
839 close_stdout ();
842 /* Create a new temporary file, returning its newly allocated tempnode.
843 Store into *PFD the file descriptor open for writing.
844 If the creation fails, return NULL and store -1 into *PFD if the
845 failure is due to file descriptor exhaustion and
846 SURVIVE_FD_EXHAUSTION; otherwise, die. */
848 static struct tempnode *
849 create_temp_file (int *pfd, bool survive_fd_exhaustion)
851 static char const slashbase[] = "/sortXXXXXX";
852 static size_t temp_dir_index;
853 int fd;
854 int saved_errno;
855 char const *temp_dir = temp_dirs[temp_dir_index];
856 size_t len = strlen (temp_dir);
857 struct tempnode *node =
858 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
859 char *file = node->name;
860 struct cs_status cs;
862 memcpy (file, temp_dir, len);
863 memcpy (file + len, slashbase, sizeof slashbase);
864 node->next = NULL;
865 if (++temp_dir_index == temp_dir_count)
866 temp_dir_index = 0;
868 /* Create the temporary file in a critical section, to avoid races. */
869 cs = cs_enter ();
870 fd = mkstemp (file);
871 if (0 <= fd)
873 *temptail = node;
874 temptail = &node->next;
876 saved_errno = errno;
877 cs_leave (cs);
878 errno = saved_errno;
880 if (fd < 0)
882 if (! (survive_fd_exhaustion && errno == EMFILE))
883 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
884 quoteaf (temp_dir));
885 free (node);
886 node = NULL;
889 *pfd = fd;
890 return node;
893 /* Return a stream for FILE, opened with mode HOW. A null FILE means
894 standard output; HOW should be "w". When opening for input, "-"
895 means standard input. To avoid confusion, do not return file
896 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
897 opening an ordinary FILE. Return NULL if unsuccessful.
899 fadvise() is used to specify an access pattern for input files.
900 There are a few hints we could possibly provide,
901 and after careful testing it was decided that
902 specifying POSIX_FADV_SEQUENTIAL was not detrimental
903 to any cases. On Linux 2.6.31, this option doubles
904 the size of read ahead performed and thus was seen to
905 benefit these cases:
906 Merging
907 Sorting with a smaller internal buffer
908 Reading from faster flash devices
910 In _addition_ one could also specify other hints...
912 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
913 at least uses that to _synchronously_ prepopulate the cache
914 with the specified range. While sort does need to
915 read all of its input before outputting, a synchronous
916 read of the whole file up front precludes any processing
917 that sort could do in parallel with the system doing
918 read ahead of the data. This was seen to have negative effects
919 in a couple of cases:
920 Merging
921 Sorting with a smaller internal buffer
922 Note this option was seen to shorten the runtime for sort
923 on a multicore system with lots of RAM and other processes
924 competing for CPU. It could be argued that more explicit
925 scheduling hints with 'nice' et. al. are more appropriate
926 for this situation.
928 POSIX_FADV_NOREUSE is a possibility as it could lower
929 the priority of input data in the cache as sort will
930 only need to process it once. However its functionality
931 has changed over Linux kernel versions and as of 2.6.31
932 it does nothing and thus we can't depend on what it might
933 do in future.
935 POSIX_FADV_DONTNEED is not appropriate for user specified
936 input files, but for temp files we do want to drop the
937 cache immediately after processing. This is done implicitly
938 however when the files are unlinked. */
940 static FILE *
941 stream_open (char const *file, char const *how)
943 FILE *fp;
945 if (*how == 'r')
947 if (STREQ (file, "-"))
949 have_read_stdin = true;
950 fp = stdin;
952 else
953 fp = fopen (file, how);
954 fadvise (fp, FADVISE_SEQUENTIAL);
956 else if (*how == 'w')
958 if (file && ftruncate (STDOUT_FILENO, 0) != 0)
959 error (SORT_FAILURE, errno, _("%s: error truncating"),
960 quotef (file));
961 fp = stdout;
963 else
964 assert (!"unexpected mode passed to stream_open");
966 return fp;
969 /* Same as stream_open, except always return a non-null value; die on
970 failure. */
972 static FILE *
973 xfopen (char const *file, char const *how)
975 FILE *fp = stream_open (file, how);
976 if (!fp)
977 die (_("open failed"), file);
978 return fp;
981 /* Close FP, whose name is FILE, and report any errors. */
983 static void
984 xfclose (FILE *fp, char const *file)
986 switch (fileno (fp))
988 case STDIN_FILENO:
989 /* Allow reading stdin from tty more than once. */
990 if (feof (fp))
991 clearerr (fp);
992 break;
994 case STDOUT_FILENO:
995 /* Don't close stdout just yet. close_stdout does that. */
996 if (fflush (fp) != 0)
997 die (_("fflush failed"), file);
998 break;
1000 default:
1001 if (fclose (fp) != 0)
1002 die (_("close failed"), file);
1003 break;
1007 static void
1008 move_fd_or_die (int oldfd, int newfd)
1010 if (oldfd != newfd)
1012 /* This should never fail for our usage. */
1013 dup2 (oldfd, newfd);
1014 close (oldfd);
1018 /* Fork a child process for piping to and do common cleanup. The
1019 TRIES parameter tells us how many times to try to fork before
1020 giving up. Return the PID of the child, or -1 (setting errno)
1021 on failure. */
1023 static pid_t
1024 pipe_fork (int pipefds[2], size_t tries)
1026 #if HAVE_WORKING_FORK
1027 struct tempnode *saved_temphead;
1028 int saved_errno;
1029 double wait_retry = 0.25;
1030 pid_t pid IF_LINT ( = -1);
1031 struct cs_status cs;
1033 if (pipe (pipefds) < 0)
1034 return -1;
1036 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1037 uncontrolled subprocess generation can hurt performance significantly.
1038 Allow at most NMERGE + 2 subprocesses, on the theory that there
1039 may be some useful parallelism by letting compression for the
1040 previous merge finish (1 subprocess) in parallel with the current
1041 merge (NMERGE + 1 subprocesses). */
1043 if (nmerge + 1 < nprocs)
1044 reap_some ();
1046 while (tries--)
1048 /* This is so the child process won't delete our temp files
1049 if it receives a signal before exec-ing. */
1050 cs = cs_enter ();
1051 saved_temphead = temphead;
1052 temphead = NULL;
1054 pid = fork ();
1055 saved_errno = errno;
1056 if (pid)
1057 temphead = saved_temphead;
1059 cs_leave (cs);
1060 errno = saved_errno;
1062 if (0 <= pid || errno != EAGAIN)
1063 break;
1064 else
1066 xnanosleep (wait_retry);
1067 wait_retry *= 2;
1068 reap_exited ();
1072 if (pid < 0)
1074 saved_errno = errno;
1075 close (pipefds[0]);
1076 close (pipefds[1]);
1077 errno = saved_errno;
1079 else if (pid == 0)
1081 close (STDIN_FILENO);
1082 close (STDOUT_FILENO);
1084 else
1085 ++nprocs;
1087 return pid;
1089 #else /* ! HAVE_WORKING_FORK */
1090 return -1;
1091 #endif
1094 /* Create a temporary file and, if asked for, start a compressor
1095 to that file. Set *PFP to the file handle and return
1096 the address of the new temp node. If the creation
1097 fails, return NULL if the failure is due to file descriptor
1098 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1100 static struct tempnode *
1101 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1103 int tempfd;
1104 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1105 if (! node)
1106 return NULL;
1108 node->state = UNCOMPRESSED;
1110 if (compress_program)
1112 int pipefds[2];
1114 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1115 if (0 < node->pid)
1117 close (tempfd);
1118 close (pipefds[0]);
1119 tempfd = pipefds[1];
1121 register_proc (node);
1123 else if (node->pid == 0)
1125 /* Being the child of a multithreaded program before exec(),
1126 we're restricted to calling async-signal-safe routines here. */
1127 close (pipefds[1]);
1128 move_fd_or_die (tempfd, STDOUT_FILENO);
1129 move_fd_or_die (pipefds[0], STDIN_FILENO);
1131 execlp (compress_program, compress_program, (char *) NULL);
1133 async_safe_die (errno, "couldn't execute compress program");
1137 *pfp = fdopen (tempfd, "w");
1138 if (! *pfp)
1139 die (_("couldn't create temporary file"), node->name);
1141 return node;
1144 /* Create a temporary file and, if asked for, start a compressor
1145 to that file. Set *PFP to the file handle and return the address
1146 of the new temp node. Die on failure. */
1148 static struct tempnode *
1149 create_temp (FILE **pfp)
1151 return maybe_create_temp (pfp, false);
1154 /* Open a compressed temp file and start a decompression process through
1155 which to filter the input. Return NULL (setting errno to
1156 EMFILE) if we ran out of file descriptors, and die on any other
1157 kind of failure. */
1159 static FILE *
1160 open_temp (struct tempnode *temp)
1162 int tempfd, pipefds[2];
1163 FILE *fp = NULL;
1165 if (temp->state == UNREAPED)
1166 wait_proc (temp->pid);
1168 tempfd = open (temp->name, O_RDONLY);
1169 if (tempfd < 0)
1170 return NULL;
1172 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1174 switch (child)
1176 case -1:
1177 if (errno != EMFILE)
1178 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1179 quoteaf (compress_program));
1180 close (tempfd);
1181 errno = EMFILE;
1182 break;
1184 case 0:
1185 /* Being the child of a multithreaded program before exec(),
1186 we're restricted to calling async-signal-safe routines here. */
1187 close (pipefds[0]);
1188 move_fd_or_die (tempfd, STDIN_FILENO);
1189 move_fd_or_die (pipefds[1], STDOUT_FILENO);
1191 execlp (compress_program, compress_program, "-d", (char *) NULL);
1193 async_safe_die (errno, "couldn't execute compress program (with -d)");
1195 default:
1196 temp->pid = child;
1197 register_proc (temp);
1198 close (tempfd);
1199 close (pipefds[1]);
1201 fp = fdopen (pipefds[0], "r");
1202 if (! fp)
1204 int saved_errno = errno;
1205 close (pipefds[0]);
1206 errno = saved_errno;
1208 break;
1211 return fp;
1214 /* Append DIR to the array of temporary directory names. */
1215 static void
1216 add_temp_dir (char const *dir)
1218 if (temp_dir_count == temp_dir_alloc)
1219 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1221 temp_dirs[temp_dir_count++] = dir;
1224 /* Remove NAME from the list of temporary files. */
1226 static void
1227 zaptemp (char const *name)
1229 struct tempnode *volatile *pnode;
1230 struct tempnode *node;
1231 struct tempnode *next;
1232 int unlink_status;
1233 int unlink_errno = 0;
1234 struct cs_status cs;
1236 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1237 continue;
1239 if (node->state == UNREAPED)
1240 wait_proc (node->pid);
1242 /* Unlink the temporary file in a critical section to avoid races. */
1243 next = node->next;
1244 cs = cs_enter ();
1245 unlink_status = unlink (name);
1246 unlink_errno = errno;
1247 *pnode = next;
1248 cs_leave (cs);
1250 if (unlink_status != 0)
1251 error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
1252 if (! next)
1253 temptail = pnode;
1254 free (node);
1257 #if HAVE_NL_LANGINFO
1259 static int
1260 struct_month_cmp (void const *m1, void const *m2)
1262 struct month const *month1 = m1;
1263 struct month const *month2 = m2;
1264 return strcmp (month1->name, month2->name);
1267 #endif
1269 /* Initialize the character class tables. */
1271 static void
1272 inittables (void)
1274 size_t i;
1276 for (i = 0; i < UCHAR_LIM; ++i)
1278 blanks[i] = field_sep (i);
1279 nonprinting[i] = ! isprint (i);
1280 nondictionary[i] = ! isalnum (i) && ! field_sep (i);
1281 fold_toupper[i] = toupper (i);
1284 #if HAVE_NL_LANGINFO
1285 /* If we're not in the "C" locale, read different names for months. */
1286 if (hard_LC_TIME)
1288 for (i = 0; i < MONTHS_PER_YEAR; i++)
1290 char const *s;
1291 size_t s_len;
1292 size_t j, k;
1293 char *name;
1295 s = nl_langinfo (ABMON_1 + i);
1296 s_len = strlen (s);
1297 monthtab[i].name = name = xmalloc (s_len + 1);
1298 monthtab[i].val = i + 1;
1300 for (j = k = 0; j < s_len; j++)
1301 if (! isblank (to_uchar (s[j])))
1302 name[k++] = fold_toupper[to_uchar (s[j])];
1303 name[k] = '\0';
1305 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1307 #endif
1310 /* Specify how many inputs may be merged at once.
1311 This may be set on the command-line with the
1312 --batch-size option. */
1313 static void
1314 specify_nmerge (int oi, char c, char const *s)
1316 uintmax_t n;
1317 struct rlimit rlimit;
1318 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1320 /* Try to find out how many file descriptors we'll be able
1321 to open. We need at least nmerge + 3 (STDIN_FILENO,
1322 STDOUT_FILENO and STDERR_FILENO). */
1323 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1324 ? rlimit.rlim_cur
1325 : OPEN_MAX)
1326 - 3);
1328 if (e == LONGINT_OK)
1330 nmerge = n;
1331 if (nmerge != n)
1332 e = LONGINT_OVERFLOW;
1333 else
1335 if (nmerge < 2)
1337 error (0, 0, _("invalid --%s argument %s"),
1338 long_options[oi].name, quote (s));
1339 error (SORT_FAILURE, 0,
1340 _("minimum --%s argument is %s"),
1341 long_options[oi].name, quote ("2"));
1343 else if (max_nmerge < nmerge)
1345 e = LONGINT_OVERFLOW;
1347 else
1348 return;
1352 if (e == LONGINT_OVERFLOW)
1354 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1355 error (0, 0, _("--%s argument %s too large"),
1356 long_options[oi].name, quote (s));
1357 error (SORT_FAILURE, 0,
1358 _("maximum --%s argument with current rlimit is %s"),
1359 long_options[oi].name,
1360 uinttostr (max_nmerge, max_nmerge_buf));
1362 else
1363 xstrtol_fatal (e, oi, c, long_options, s);
1366 /* Specify the amount of main memory to use when sorting. */
1367 static void
1368 specify_sort_size (int oi, char c, char const *s)
1370 uintmax_t n;
1371 char *suffix;
1372 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1374 /* The default unit is KiB. */
1375 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1377 if (n <= UINTMAX_MAX / 1024)
1378 n *= 1024;
1379 else
1380 e = LONGINT_OVERFLOW;
1383 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1384 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1385 switch (suffix[0])
1387 case 'b':
1388 e = LONGINT_OK;
1389 break;
1391 case '%':
1393 double mem = physmem_total () * n / 100;
1395 /* Use "<", not "<=", to avoid problems with rounding. */
1396 if (mem < UINTMAX_MAX)
1398 n = mem;
1399 e = LONGINT_OK;
1401 else
1402 e = LONGINT_OVERFLOW;
1404 break;
1407 if (e == LONGINT_OK)
1409 /* If multiple sort sizes are specified, take the maximum, so
1410 that option order does not matter. */
1411 if (n < sort_size)
1412 return;
1414 sort_size = n;
1415 if (sort_size == n)
1417 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1418 return;
1421 e = LONGINT_OVERFLOW;
1424 xstrtol_fatal (e, oi, c, long_options, s);
1427 /* Specify the number of threads to spawn during internal sort. */
1428 static size_t
1429 specify_nthreads (int oi, char c, char const *s)
1431 unsigned long int nthreads;
1432 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1433 if (e == LONGINT_OVERFLOW)
1434 return SIZE_MAX;
1435 if (e != LONGINT_OK)
1436 xstrtol_fatal (e, oi, c, long_options, s);
1437 if (SIZE_MAX < nthreads)
1438 nthreads = SIZE_MAX;
1439 if (nthreads == 0)
1440 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1441 return nthreads;
1444 /* Return the default sort size. */
1445 static size_t
1446 default_sort_size (void)
1448 /* Let SIZE be MEM, but no more than the maximum object size,
1449 total memory, or system resource limits. Don't bother to check
1450 for values like RLIM_INFINITY since in practice they are not much
1451 less than SIZE_MAX. */
1452 size_t size = SIZE_MAX;
1453 struct rlimit rlimit;
1454 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1455 size = rlimit.rlim_cur;
1456 #ifdef RLIMIT_AS
1457 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1458 size = rlimit.rlim_cur;
1459 #endif
1461 /* Leave a large safety margin for the above limits, as failure can
1462 occur when they are exceeded. */
1463 size /= 2;
1465 #ifdef RLIMIT_RSS
1466 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1467 Exceeding RSS is not fatal, but can be quite slow. */
1468 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1469 size = rlimit.rlim_cur / 16 * 15;
1470 #endif
1472 /* Let MEM be available memory or 1/8 of total memory, whichever
1473 is greater. */
1474 double avail = physmem_available ();
1475 double total = physmem_total ();
1476 double mem = MAX (avail, total / 8);
1478 /* Leave a 1/4 margin for physical memory. */
1479 if (total * 0.75 < size)
1480 size = total * 0.75;
1482 /* Return the minimum of MEM and SIZE, but no less than
1483 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1484 right when only one argument is floating point. */
1485 if (mem < size)
1486 size = mem;
1487 return MAX (size, MIN_SORT_SIZE);
1490 /* Return the sort buffer size to use with the input files identified
1491 by FPS and FILES, which are alternate names of the same files.
1492 NFILES gives the number of input files; NFPS may be less. Assume
1493 that each input line requires LINE_BYTES extra bytes' worth of line
1494 information. Do not exceed the size bound specified by the user
1495 (or a default size bound, if the user does not specify one). */
1497 static size_t
1498 sort_buffer_size (FILE *const *fps, size_t nfps,
1499 char *const *files, size_t nfiles,
1500 size_t line_bytes)
1502 /* A bound on the input size. If zero, the bound hasn't been
1503 determined yet. */
1504 static size_t size_bound;
1506 /* In the worst case, each input byte is a newline. */
1507 size_t worst_case_per_input_byte = line_bytes + 1;
1509 /* Keep enough room for one extra input line and an extra byte.
1510 This extra room might be needed when preparing to read EOF. */
1511 size_t size = worst_case_per_input_byte + 1;
1513 size_t i;
1515 for (i = 0; i < nfiles; i++)
1517 struct stat st;
1518 off_t file_size;
1519 size_t worst_case;
1521 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1522 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1523 : stat (files[i], &st))
1524 != 0)
1525 die (_("stat failed"), files[i]);
1527 if (S_ISREG (st.st_mode))
1528 file_size = st.st_size;
1529 else
1531 /* The file has unknown size. If the user specified a sort
1532 buffer size, use that; otherwise, guess the size. */
1533 if (sort_size)
1534 return sort_size;
1535 file_size = INPUT_FILE_SIZE_GUESS;
1538 if (! size_bound)
1540 size_bound = sort_size;
1541 if (! size_bound)
1542 size_bound = default_sort_size ();
1545 /* Add the amount of memory needed to represent the worst case
1546 where the input consists entirely of newlines followed by a
1547 single non-newline. Check for overflow. */
1548 worst_case = file_size * worst_case_per_input_byte + 1;
1549 if (file_size != worst_case / worst_case_per_input_byte
1550 || size_bound - size <= worst_case)
1551 return size_bound;
1552 size += worst_case;
1555 return size;
1558 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1559 must be at least sizeof (struct line). Allocate ALLOC bytes
1560 initially. */
1562 static void
1563 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1565 /* Ensure that the line array is properly aligned. If the desired
1566 size cannot be allocated, repeatedly halve it until allocation
1567 succeeds. The smaller allocation may hurt overall performance,
1568 but that's better than failing. */
1569 while (true)
1571 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1572 buf->buf = malloc (alloc);
1573 if (buf->buf)
1574 break;
1575 alloc /= 2;
1576 if (alloc <= line_bytes + 1)
1577 xalloc_die ();
1580 buf->line_bytes = line_bytes;
1581 buf->alloc = alloc;
1582 buf->used = buf->left = buf->nlines = 0;
1583 buf->eof = false;
1586 /* Return one past the limit of the line array. */
1588 static inline struct line *
1589 buffer_linelim (struct buffer const *buf)
1591 void *linelim = buf->buf + buf->alloc;
1592 return linelim;
1595 /* Return a pointer to the first character of the field specified
1596 by KEY in LINE. */
1598 static char *
1599 begfield (struct line const *line, struct keyfield const *key)
1601 char *ptr = line->text, *lim = ptr + line->length - 1;
1602 size_t sword = key->sword;
1603 size_t schar = key->schar;
1605 /* The leading field separator itself is included in a field when -t
1606 is absent. */
1608 if (tab != TAB_DEFAULT)
1609 while (ptr < lim && sword--)
1611 while (ptr < lim && *ptr != tab)
1612 ++ptr;
1613 if (ptr < lim)
1614 ++ptr;
1616 else
1617 while (ptr < lim && sword--)
1619 while (ptr < lim && blanks[to_uchar (*ptr)])
1620 ++ptr;
1621 while (ptr < lim && !blanks[to_uchar (*ptr)])
1622 ++ptr;
1625 /* If we're ignoring leading blanks when computing the Start
1626 of the field, skip past them here. */
1627 if (key->skipsblanks)
1628 while (ptr < lim && blanks[to_uchar (*ptr)])
1629 ++ptr;
1631 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1632 ptr = MIN (lim, ptr + schar);
1634 return ptr;
1637 /* Return the limit of (a pointer to the first character after) the field
1638 in LINE specified by KEY. */
1640 static char *
1641 limfield (struct line const *line, struct keyfield const *key)
1643 char *ptr = line->text, *lim = ptr + line->length - 1;
1644 size_t eword = key->eword, echar = key->echar;
1646 if (echar == 0)
1647 eword++; /* Skip all of end field. */
1649 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1650 whichever comes first. If there are more than EWORD fields, leave
1651 PTR pointing at the beginning of the field having zero-based index,
1652 EWORD. If a delimiter character was specified (via -t), then that
1653 'beginning' is the first character following the delimiting TAB.
1654 Otherwise, leave PTR pointing at the first 'blank' character after
1655 the preceding field. */
1656 if (tab != TAB_DEFAULT)
1657 while (ptr < lim && eword--)
1659 while (ptr < lim && *ptr != tab)
1660 ++ptr;
1661 if (ptr < lim && (eword || echar))
1662 ++ptr;
1664 else
1665 while (ptr < lim && eword--)
1667 while (ptr < lim && blanks[to_uchar (*ptr)])
1668 ++ptr;
1669 while (ptr < lim && !blanks[to_uchar (*ptr)])
1670 ++ptr;
1673 #ifdef POSIX_UNSPECIFIED
1674 /* The following block of code makes GNU sort incompatible with
1675 standard Unix sort, so it's ifdef'd out for now.
1676 The POSIX spec isn't clear on how to interpret this.
1677 FIXME: request clarification.
1679 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1680 Date: Thu, 30 May 96 12:20:41 -0400
1681 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1683 [...]I believe I've found another bug in 'sort'.
1685 $ cat /tmp/sort.in
1686 a b c 2 d
1687 pq rs 1 t
1688 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1689 a b c 2 d
1690 pq rs 1 t
1691 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1692 pq rs 1 t
1693 a b c 2 d
1695 Unix sort produced the answer I expected: sort on the single character
1696 in column 7. GNU sort produced different results, because it disagrees
1697 on the interpretation of the key-end spec "M.N". Unix sort reads this
1698 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1699 "skip M-1 fields, then either N-1 characters or the rest of the current
1700 field, whichever comes first". This extra clause applies only to
1701 key-ends, not key-starts.
1704 /* Make LIM point to the end of (one byte past) the current field. */
1705 if (tab != TAB_DEFAULT)
1707 char *newlim;
1708 newlim = memchr (ptr, tab, lim - ptr);
1709 if (newlim)
1710 lim = newlim;
1712 else
1714 char *newlim;
1715 newlim = ptr;
1716 while (newlim < lim && blanks[to_uchar (*newlim)])
1717 ++newlim;
1718 while (newlim < lim && !blanks[to_uchar (*newlim)])
1719 ++newlim;
1720 lim = newlim;
1722 #endif
1724 if (echar != 0) /* We need to skip over a portion of the end field. */
1726 /* If we're ignoring leading blanks when computing the End
1727 of the field, skip past them here. */
1728 if (key->skipeblanks)
1729 while (ptr < lim && blanks[to_uchar (*ptr)])
1730 ++ptr;
1732 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1733 ptr = MIN (lim, ptr + echar);
1736 return ptr;
1739 /* Fill BUF reading from FP, moving buf->left bytes from the end
1740 of buf->buf to the beginning first. If EOF is reached and the
1741 file wasn't terminated by a newline, supply one. Set up BUF's line
1742 table too. FILE is the name of the file corresponding to FP.
1743 Return true if some input was read. */
1745 static bool
1746 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1748 struct keyfield const *key = keylist;
1749 char eol = eolchar;
1750 size_t line_bytes = buf->line_bytes;
1751 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1753 if (buf->eof)
1754 return false;
1756 if (buf->used != buf->left)
1758 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1759 buf->used = buf->left;
1760 buf->nlines = 0;
1763 while (true)
1765 char *ptr = buf->buf + buf->used;
1766 struct line *linelim = buffer_linelim (buf);
1767 struct line *line = linelim - buf->nlines;
1768 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1769 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1771 while (line_bytes + 1 < avail)
1773 /* Read as many bytes as possible, but do not read so many
1774 bytes that there might not be enough room for the
1775 corresponding line array. The worst case is when the
1776 rest of the input file consists entirely of newlines,
1777 except that the last byte is not a newline. */
1778 size_t readsize = (avail - 1) / (line_bytes + 1);
1779 size_t bytes_read = fread (ptr, 1, readsize, fp);
1780 char *ptrlim = ptr + bytes_read;
1781 char *p;
1782 avail -= bytes_read;
1784 if (bytes_read != readsize)
1786 if (ferror (fp))
1787 die (_("read failed"), file);
1788 if (feof (fp))
1790 buf->eof = true;
1791 if (buf->buf == ptrlim)
1792 return false;
1793 if (line_start != ptrlim && ptrlim[-1] != eol)
1794 *ptrlim++ = eol;
1798 /* Find and record each line in the just-read input. */
1799 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1801 /* Delimit the line with NUL. This eliminates the need to
1802 temporarily replace the last byte with NUL when calling
1803 xmemcoll(), which increases performance. */
1804 *p = '\0';
1805 ptr = p + 1;
1806 line--;
1807 line->text = line_start;
1808 line->length = ptr - line_start;
1809 mergesize = MAX (mergesize, line->length);
1810 avail -= line_bytes;
1812 if (key)
1814 /* Precompute the position of the first key for
1815 efficiency. */
1816 line->keylim = (key->eword == SIZE_MAX
1818 : limfield (line, key));
1820 if (key->sword != SIZE_MAX)
1821 line->keybeg = begfield (line, key);
1822 else
1824 if (key->skipsblanks)
1825 while (blanks[to_uchar (*line_start)])
1826 line_start++;
1827 line->keybeg = line_start;
1831 line_start = ptr;
1834 ptr = ptrlim;
1835 if (buf->eof)
1836 break;
1839 buf->used = ptr - buf->buf;
1840 buf->nlines = buffer_linelim (buf) - line;
1841 if (buf->nlines != 0)
1843 buf->left = ptr - line_start;
1844 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1845 return true;
1849 /* The current input line is too long to fit in the buffer.
1850 Increase the buffer size and try again, keeping it properly
1851 aligned. */
1852 size_t line_alloc = buf->alloc / sizeof (struct line);
1853 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1854 buf->alloc = line_alloc * sizeof (struct line);
1859 /* Table that maps characters to order-of-magnitude values. */
1860 static char const unit_order[UCHAR_LIM] =
1862 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1863 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1864 /* This initializer syntax works on all C99 hosts. For now, use
1865 it only on non-ASCII hosts, to ease the pain of porting to
1866 pre-C99 ASCII hosts. */
1867 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1868 ['k']=1,
1869 #else
1870 /* Generate the following table with this command:
1871 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1872 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1873 |fmt */
1874 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1875 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1876 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1877 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1878 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1879 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1880 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1881 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1882 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1883 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1884 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1885 #endif
1888 /* Traverse number given as *number consisting of digits, thousands_sep, and
1889 decimal_point chars only. Returns the highest digit found in the number,
1890 or '\0' if no digit has been found. Upon return *number points at the
1891 character that immediately follows after the given number. */
1892 static unsigned char
1893 traverse_raw_number (char const **number)
1895 char const *p = *number;
1896 unsigned char ch;
1897 unsigned char max_digit = '\0';
1899 /* Scan to end of number.
1900 Decimals or separators not followed by digits stop the scan.
1901 Numbers ending in decimals or separators are thus considered
1902 to be lacking in units.
1903 FIXME: add support for multibyte thousands_sep and decimal_point. */
1905 while (ISDIGIT (ch = *p++))
1907 if (max_digit < ch)
1908 max_digit = ch;
1910 /* Allow to skip only one occurrence of thousands_sep to avoid finding
1911 the unit in the next column in case thousands_sep matches as blank
1912 and is used as column delimiter. */
1913 if (*p == thousands_sep)
1914 ++p;
1917 if (ch == decimal_point)
1918 while (ISDIGIT (ch = *p++))
1919 if (max_digit < ch)
1920 max_digit = ch;
1922 *number = p - 1;
1923 return max_digit;
1926 /* Return an integer that represents the order of magnitude of the
1927 unit following the number. The number may contain thousands
1928 separators and a decimal point, but it may not contain leading blanks.
1929 Negative numbers get negative orders; zero numbers have a zero order. */
1931 static int _GL_ATTRIBUTE_PURE
1932 find_unit_order (char const *number)
1934 bool minus_sign = (*number == '-');
1935 char const *p = number + minus_sign;
1936 unsigned char max_digit = traverse_raw_number (&p);
1937 if ('0' < max_digit)
1939 unsigned char ch = *p;
1940 int order = unit_order[ch];
1941 return (minus_sign ? -order : order);
1943 else
1944 return 0;
1947 /* Compare numbers A and B ending in units with SI or IEC prefixes
1948 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1950 static int
1951 human_numcompare (char const *a, char const *b)
1953 while (blanks[to_uchar (*a)])
1954 a++;
1955 while (blanks[to_uchar (*b)])
1956 b++;
1958 int diff = find_unit_order (a) - find_unit_order (b);
1959 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1962 /* Compare strings A and B as numbers without explicitly converting them to
1963 machine numbers. Comparatively slow for short strings, but asymptotically
1964 hideously fast. */
1966 static int
1967 numcompare (char const *a, char const *b)
1969 while (blanks[to_uchar (*a)])
1970 a++;
1971 while (blanks[to_uchar (*b)])
1972 b++;
1974 return strnumcmp (a, b, decimal_point, thousands_sep);
1977 /* Work around a problem whereby the long double value returned by glibc's
1978 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1979 A and B before calling strtold. FIXME: remove this function once
1980 gnulib guarantees that strtold's result is always well defined. */
1981 static int
1982 nan_compare (char const *sa, char const *sb)
1984 long_double a;
1985 memset (&a, 0, sizeof a);
1986 a = strtold (sa, NULL);
1988 long_double b;
1989 memset (&b, 0, sizeof b);
1990 b = strtold (sb, NULL);
1992 return memcmp (&a, &b, sizeof a);
1995 static int
1996 general_numcompare (char const *sa, char const *sb)
1998 /* FIXME: maybe add option to try expensive FP conversion
1999 only if A and B can't be compared more cheaply/accurately. */
2001 char *ea;
2002 char *eb;
2003 long_double a = strtold (sa, &ea);
2004 long_double b = strtold (sb, &eb);
2006 /* Put conversion errors at the start of the collating sequence. */
2007 if (sa == ea)
2008 return sb == eb ? 0 : -1;
2009 if (sb == eb)
2010 return 1;
2012 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
2013 conversion errors but before numbers; sort them by internal
2014 bit-pattern, for lack of a more portable alternative. */
2015 return (a < b ? -1
2016 : a > b ? 1
2017 : a == b ? 0
2018 : b == b ? -1
2019 : a == a ? 1
2020 : nan_compare (sa, sb));
2023 /* Return an integer in 1..12 of the month name MONTH.
2024 Return 0 if the name in S is not recognized. */
2026 static int
2027 getmonth (char const *month, char **ea)
2029 size_t lo = 0;
2030 size_t hi = MONTHS_PER_YEAR;
2032 while (blanks[to_uchar (*month)])
2033 month++;
2037 size_t ix = (lo + hi) / 2;
2038 char const *m = month;
2039 char const *n = monthtab[ix].name;
2041 for (;; m++, n++)
2043 if (!*n)
2045 if (ea)
2046 *ea = (char *) m;
2047 return monthtab[ix].val;
2049 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2051 hi = ix;
2052 break;
2054 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2056 lo = ix + 1;
2057 break;
2061 while (lo < hi);
2063 return 0;
2066 /* A randomly chosen MD5 state, used for random comparison. */
2067 static struct md5_ctx random_md5_state;
2069 /* Initialize the randomly chosen MD5 state. */
2071 static void
2072 random_md5_state_init (char const *random_source)
2074 unsigned char buf[MD5_DIGEST_SIZE];
2075 struct randread_source *r = randread_new (random_source, sizeof buf);
2076 if (! r)
2077 die (_("open failed"), random_source);
2078 randread (r, buf, sizeof buf);
2079 if (randread_free (r) != 0)
2080 die (_("close failed"), random_source);
2081 md5_init_ctx (&random_md5_state);
2082 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2085 /* This is like strxfrm, except it reports any error and exits. */
2087 static size_t
2088 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2090 errno = 0;
2091 size_t translated_size = strxfrm (dest, src, destsize);
2093 if (errno)
2095 error (0, errno, _("string transformation failed"));
2096 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2097 error (SORT_FAILURE, 0,
2098 _("the untransformed string was %s"),
2099 quotearg_n_style (0, locale_quoting_style, src));
2102 return translated_size;
2105 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2106 using one or more random hash functions. TEXTA[LENA] and
2107 TEXTB[LENB] must be zero. */
2109 static int
2110 compare_random (char *restrict texta, size_t lena,
2111 char *restrict textb, size_t lenb)
2113 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2114 data. This is used to break ties if there is a checksum
2115 collision, and this is good enough given the astronomically low
2116 probability of a collision. */
2117 int xfrm_diff = 0;
2119 char stackbuf[4000];
2120 char *buf = stackbuf;
2121 size_t bufsize = sizeof stackbuf;
2122 void *allocated = NULL;
2123 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2124 struct md5_ctx s[2];
2125 s[0] = s[1] = random_md5_state;
2127 if (hard_LC_COLLATE)
2129 char const *lima = texta + lena;
2130 char const *limb = textb + lenb;
2132 while (true)
2134 /* Transform the text into the basis of comparison, so that byte
2135 strings that would otherwise considered to be equal are
2136 considered equal here even if their bytes differ.
2138 Each time through this loop, transform one
2139 null-terminated string's worth from TEXTA or from TEXTB
2140 or both. That way, there's no need to store the
2141 transformation of the whole line, if it contains many
2142 null-terminated strings. */
2144 /* Store the transformed data into a big-enough buffer. */
2146 /* A 3X size guess avoids the overhead of calling strxfrm
2147 twice on typical implementations. Don't worry about
2148 size_t overflow, as the guess need not be correct. */
2149 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2150 if (bufsize < guess_bufsize)
2152 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2153 free (allocated);
2154 buf = allocated = malloc (bufsize);
2155 if (! buf)
2157 buf = stackbuf;
2158 bufsize = sizeof stackbuf;
2162 size_t sizea =
2163 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2164 bool a_fits = sizea <= bufsize;
2165 size_t sizeb =
2166 (textb < limb
2167 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2168 (a_fits ? bufsize - sizea : 0))
2169 + 1)
2170 : 0);
2172 if (! (a_fits && sizea + sizeb <= bufsize))
2174 bufsize = sizea + sizeb;
2175 if (bufsize < SIZE_MAX / 3)
2176 bufsize = bufsize * 3 / 2;
2177 free (allocated);
2178 buf = allocated = xmalloc (bufsize);
2179 if (texta < lima)
2180 strxfrm (buf, texta, sizea);
2181 if (textb < limb)
2182 strxfrm (buf + sizea, textb, sizeb);
2185 /* Advance past NULs to the next part of each input string,
2186 exiting the loop if both strings are exhausted. When
2187 exiting the loop, prepare to finish off the tiebreaker
2188 comparison properly. */
2189 if (texta < lima)
2190 texta += strlen (texta) + 1;
2191 if (textb < limb)
2192 textb += strlen (textb) + 1;
2193 if (! (texta < lima || textb < limb))
2195 lena = sizea; texta = buf;
2196 lenb = sizeb; textb = buf + sizea;
2197 break;
2200 /* Accumulate the transformed data in the corresponding
2201 checksums. */
2202 md5_process_bytes (buf, sizea, &s[0]);
2203 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2205 /* Update the tiebreaker comparison of the transformed data. */
2206 if (! xfrm_diff)
2208 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2209 if (! xfrm_diff)
2210 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2215 /* Compute and compare the checksums. */
2216 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2217 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2218 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2220 /* Fall back on the tiebreaker if the checksums collide. */
2221 if (! diff)
2223 if (! xfrm_diff)
2225 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2226 if (! xfrm_diff)
2227 xfrm_diff = (lena > lenb) - (lena < lenb);
2230 diff = xfrm_diff;
2233 free (allocated);
2235 return diff;
2238 /* Return the printable width of the block of memory starting at
2239 TEXT and ending just before LIM, counting each tab as one byte.
2240 FIXME: Should we generally be counting non printable chars? */
2242 static size_t
2243 debug_width (char const *text, char const *lim)
2245 size_t width = mbsnwidth (text, lim - text, 0);
2246 while (text < lim)
2247 width += (*text++ == '\t');
2248 return width;
2251 /* For debug mode, "underline" a key at the
2252 specified offset and screen width. */
2254 static void
2255 mark_key (size_t offset, size_t width)
2257 while (offset--)
2258 putchar (' ');
2260 if (!width)
2261 printf (_("^ no match for key\n"));
2262 else
2265 putchar ('_');
2266 while (--width);
2268 putchar ('\n');
2272 /* Return true if KEY is a numeric key. */
2274 static inline bool
2275 key_numeric (struct keyfield const *key)
2277 return key->numeric || key->general_numeric || key->human_numeric;
2280 /* For LINE, output a debugging line that underlines KEY in LINE.
2281 If KEY is null, underline the whole line. */
2283 static void
2284 debug_key (struct line const *line, struct keyfield const *key)
2286 char *text = line->text;
2287 char *beg = text;
2288 char *lim = text + line->length - 1;
2290 if (key)
2292 if (key->sword != SIZE_MAX)
2293 beg = begfield (line, key);
2294 if (key->eword != SIZE_MAX)
2295 lim = limfield (line, key);
2297 if ((key->skipsblanks && key->sword == SIZE_MAX)
2298 || key->month || key_numeric (key))
2300 char saved = *lim;
2301 *lim = '\0';
2303 while (blanks[to_uchar (*beg)])
2304 beg++;
2306 char *tighter_lim = beg;
2308 if (lim < beg)
2309 tighter_lim = lim;
2310 else if (key->month)
2311 getmonth (beg, &tighter_lim);
2312 else if (key->general_numeric)
2313 ignore_value (strtold (beg, &tighter_lim));
2314 else if (key->numeric || key->human_numeric)
2316 char const *p = beg + (beg < lim && *beg == '-');
2317 unsigned char max_digit = traverse_raw_number (&p);
2318 if ('0' <= max_digit)
2320 unsigned char ch = *p;
2321 tighter_lim = (char *) p
2322 + (key->human_numeric && unit_order[ch]);
2325 else
2326 tighter_lim = lim;
2328 *lim = saved;
2329 lim = tighter_lim;
2333 size_t offset = debug_width (text, beg);
2334 size_t width = debug_width (beg, lim);
2335 mark_key (offset, width);
2338 /* Debug LINE by underlining its keys. */
2340 static void
2341 debug_line (struct line const *line)
2343 struct keyfield const *key = keylist;
2346 debug_key (line, key);
2347 while (key && ((key = key->next) || ! (unique || stable)));
2350 /* Return whether sorting options specified for key. */
2352 static bool
2353 default_key_compare (struct keyfield const *key)
2355 return ! (key->ignore
2356 || key->translate
2357 || key->skipsblanks
2358 || key->skipeblanks
2359 || key_numeric (key)
2360 || key->month
2361 || key->version
2362 || key->random
2363 /* || key->reverse */
2367 /* Convert a key to the short options used to specify it. */
2369 static void
2370 key_to_opts (struct keyfield const *key, char *opts)
2372 if (key->skipsblanks || key->skipeblanks)
2373 *opts++ = 'b';/* either disables global -b */
2374 if (key->ignore == nondictionary)
2375 *opts++ = 'd';
2376 if (key->translate)
2377 *opts++ = 'f';
2378 if (key->general_numeric)
2379 *opts++ = 'g';
2380 if (key->human_numeric)
2381 *opts++ = 'h';
2382 if (key->ignore == nonprinting)
2383 *opts++ = 'i';
2384 if (key->month)
2385 *opts++ = 'M';
2386 if (key->numeric)
2387 *opts++ = 'n';
2388 if (key->random)
2389 *opts++ = 'R';
2390 if (key->reverse)
2391 *opts++ = 'r';
2392 if (key->version)
2393 *opts++ = 'V';
2394 *opts = '\0';
2397 /* Output data independent key warnings to stderr. */
2399 static void
2400 key_warnings (struct keyfield const *gkey, bool gkey_only)
2402 struct keyfield const *key;
2403 struct keyfield ugkey = *gkey;
2404 unsigned long keynum = 1;
2406 for (key = keylist; key; key = key->next, keynum++)
2408 if (key->traditional_used)
2410 size_t sword = key->sword;
2411 size_t eword = key->eword;
2412 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2413 /* obsolescent syntax +A.x -B.y is equivalent to:
2414 -k A+1.x+1,B.y (when y = 0)
2415 -k A+1.x+1,B+1.y (when y > 0) */
2416 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2417 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2418 char *po = obuf;
2419 char *pn = nbuf;
2421 if (sword == SIZE_MAX)
2422 sword++;
2424 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2425 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2426 if (key->eword != SIZE_MAX)
2428 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2429 stpcpy (stpcpy (pn, ","),
2430 umaxtostr (eword + 1
2431 + (key->echar == SIZE_MAX), tmp));
2433 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2434 quote_n (0, obuf), quote_n (1, nbuf));
2437 /* Warn about field specs that will never match. */
2438 bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword;
2439 if (zero_width)
2440 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2442 /* Warn about significant leading blanks. */
2443 bool implicit_skip = key_numeric (key) || key->month;
2444 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2445 if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
2446 && ((!key->skipsblanks && !implicit_skip)
2447 || (!key->skipsblanks && key->schar)
2448 || (!key->skipeblanks && key->echar)))
2449 error (0, 0, _("leading blanks are significant in key %lu; "
2450 "consider also specifying 'b'"), keynum);
2452 /* Warn about numeric comparisons spanning fields,
2453 as field delimiters could be interpreted as part
2454 of the number (maybe only in other locales). */
2455 if (!gkey_only && key_numeric (key))
2457 size_t sword = key->sword + 1;
2458 size_t eword = key->eword + 1;
2459 if (!sword)
2460 sword++;
2461 if (!eword || sword < eword)
2462 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2463 keynum);
2466 /* Flag global options not copied or specified in any key. */
2467 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2468 ugkey.ignore = NULL;
2469 if (ugkey.translate && (ugkey.translate == key->translate))
2470 ugkey.translate = NULL;
2471 ugkey.skipsblanks &= !key->skipsblanks;
2472 ugkey.skipeblanks &= !key->skipeblanks;
2473 ugkey.month &= !key->month;
2474 ugkey.numeric &= !key->numeric;
2475 ugkey.general_numeric &= !key->general_numeric;
2476 ugkey.human_numeric &= !key->human_numeric;
2477 ugkey.random &= !key->random;
2478 ugkey.version &= !key->version;
2479 ugkey.reverse &= !key->reverse;
2482 /* Warn about ignored global options flagged above.
2483 Note if gkey is the only one in the list, all flags are cleared. */
2484 if (!default_key_compare (&ugkey)
2485 || (ugkey.reverse && (stable || unique) && keylist))
2487 bool ugkey_reverse = ugkey.reverse;
2488 if (!(stable || unique))
2489 ugkey.reverse = false;
2490 /* The following is too big, but guaranteed to be "big enough". */
2491 char opts[sizeof short_options];
2492 key_to_opts (&ugkey, opts);
2493 error (0, 0,
2494 ngettext ("option '-%s' is ignored",
2495 "options '-%s' are ignored",
2496 select_plural (strlen (opts))), opts);
2497 ugkey.reverse = ugkey_reverse;
2499 if (ugkey.reverse && !(stable || unique) && keylist)
2500 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2503 /* Compare two lines A and B trying every key in sequence until there
2504 are no more keys or a difference is found. */
2506 static int
2507 keycompare (struct line const *a, struct line const *b)
2509 struct keyfield *key = keylist;
2511 /* For the first iteration only, the key positions have been
2512 precomputed for us. */
2513 char *texta = a->keybeg;
2514 char *textb = b->keybeg;
2515 char *lima = a->keylim;
2516 char *limb = b->keylim;
2518 int diff;
2520 while (true)
2522 char const *translate = key->translate;
2523 bool const *ignore = key->ignore;
2525 /* Treat field ends before field starts as empty fields. */
2526 lima = MAX (texta, lima);
2527 limb = MAX (textb, limb);
2529 /* Find the lengths. */
2530 size_t lena = lima - texta;
2531 size_t lenb = limb - textb;
2533 if (hard_LC_COLLATE || key_numeric (key)
2534 || key->month || key->random || key->version)
2536 char *ta;
2537 char *tb;
2538 size_t tlena;
2539 size_t tlenb;
2541 char enda IF_LINT (= 0);
2542 char endb IF_LINT (= 0);
2543 void *allocated IF_LINT (= NULL);
2544 char stackbuf[4000];
2546 if (ignore || translate)
2548 /* Compute with copies of the keys, which are the result of
2549 translating or ignoring characters, and which need their
2550 own storage. */
2552 size_t i;
2554 /* Allocate space for copies. */
2555 size_t size = lena + 1 + lenb + 1;
2556 if (size <= sizeof stackbuf)
2557 ta = stackbuf, allocated = NULL;
2558 else
2559 ta = allocated = xmalloc (size);
2560 tb = ta + lena + 1;
2562 /* Put into each copy a version of the key in which the
2563 requested characters are ignored or translated. */
2564 for (tlena = i = 0; i < lena; i++)
2565 if (! (ignore && ignore[to_uchar (texta[i])]))
2566 ta[tlena++] = (translate
2567 ? translate[to_uchar (texta[i])]
2568 : texta[i]);
2569 ta[tlena] = '\0';
2571 for (tlenb = i = 0; i < lenb; i++)
2572 if (! (ignore && ignore[to_uchar (textb[i])]))
2573 tb[tlenb++] = (translate
2574 ? translate[to_uchar (textb[i])]
2575 : textb[i]);
2576 tb[tlenb] = '\0';
2578 else
2580 /* Use the keys in-place, temporarily null-terminated. */
2581 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2582 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2585 if (key->numeric)
2586 diff = numcompare (ta, tb);
2587 else if (key->general_numeric)
2588 diff = general_numcompare (ta, tb);
2589 else if (key->human_numeric)
2590 diff = human_numcompare (ta, tb);
2591 else if (key->month)
2592 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2593 else if (key->random)
2594 diff = compare_random (ta, tlena, tb, tlenb);
2595 else if (key->version)
2596 diff = filevercmp (ta, tb);
2597 else
2599 /* Locale-dependent string sorting. This is slower than
2600 C-locale sorting, which is implemented below. */
2601 if (tlena == 0)
2602 diff = - NONZERO (tlenb);
2603 else if (tlenb == 0)
2604 diff = 1;
2605 else
2606 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2609 if (ignore || translate)
2610 free (allocated);
2611 else
2613 ta[tlena] = enda;
2614 tb[tlenb] = endb;
2617 else if (ignore)
2619 #define CMP_WITH_IGNORE(A, B) \
2620 do \
2622 while (true) \
2624 while (texta < lima && ignore[to_uchar (*texta)]) \
2625 ++texta; \
2626 while (textb < limb && ignore[to_uchar (*textb)]) \
2627 ++textb; \
2628 if (! (texta < lima && textb < limb)) \
2629 break; \
2630 diff = to_uchar (A) - to_uchar (B); \
2631 if (diff) \
2632 goto not_equal; \
2633 ++texta; \
2634 ++textb; \
2637 diff = (texta < lima) - (textb < limb); \
2639 while (0)
2641 if (translate)
2642 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2643 translate[to_uchar (*textb)]);
2644 else
2645 CMP_WITH_IGNORE (*texta, *textb);
2647 else if (lena == 0)
2648 diff = - NONZERO (lenb);
2649 else if (lenb == 0)
2650 goto greater;
2651 else
2653 if (translate)
2655 while (texta < lima && textb < limb)
2657 diff = (to_uchar (translate[to_uchar (*texta++)])
2658 - to_uchar (translate[to_uchar (*textb++)]));
2659 if (diff)
2660 goto not_equal;
2663 else
2665 diff = memcmp (texta, textb, MIN (lena, lenb));
2666 if (diff)
2667 goto not_equal;
2669 diff = lena < lenb ? -1 : lena != lenb;
2672 if (diff)
2673 goto not_equal;
2675 key = key->next;
2676 if (! key)
2677 break;
2679 /* Find the beginning and limit of the next field. */
2680 if (key->eword != SIZE_MAX)
2681 lima = limfield (a, key), limb = limfield (b, key);
2682 else
2683 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2685 if (key->sword != SIZE_MAX)
2686 texta = begfield (a, key), textb = begfield (b, key);
2687 else
2689 texta = a->text, textb = b->text;
2690 if (key->skipsblanks)
2692 while (texta < lima && blanks[to_uchar (*texta)])
2693 ++texta;
2694 while (textb < limb && blanks[to_uchar (*textb)])
2695 ++textb;
2700 return 0;
2702 greater:
2703 diff = 1;
2704 not_equal:
2705 return key->reverse ? -diff : diff;
2708 /* Compare two lines A and B, returning negative, zero, or positive
2709 depending on whether A compares less than, equal to, or greater than B. */
2711 static int
2712 compare (struct line const *a, struct line const *b)
2714 int diff;
2715 size_t alen, blen;
2717 /* First try to compare on the specified keys (if any).
2718 The only two cases with no key at all are unadorned sort,
2719 and unadorned sort -r. */
2720 if (keylist)
2722 diff = keycompare (a, b);
2723 if (diff || unique || stable)
2724 return diff;
2727 /* If the keys all compare equal (or no keys were specified)
2728 fall through to the default comparison. */
2729 alen = a->length - 1, blen = b->length - 1;
2731 if (alen == 0)
2732 diff = - NONZERO (blen);
2733 else if (blen == 0)
2734 diff = 1;
2735 else if (hard_LC_COLLATE)
2737 /* Note xmemcoll0 is a performance enhancement as
2738 it will not unconditionally write '\0' after the
2739 passed in buffers, which was seen to give around
2740 a 3% increase in performance for short lines. */
2741 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2743 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2744 diff = alen < blen ? -1 : alen != blen;
2746 return reverse ? -diff : diff;
2749 /* Write LINE to output stream FP; the output file's name is
2750 OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
2751 otherwise. If debugging is enabled and FP is standard output,
2752 append some debugging information. */
2754 static void
2755 write_line (struct line const *line, FILE *fp, char const *output_file)
2757 char *buf = line->text;
2758 size_t n_bytes = line->length;
2759 char *ebuf = buf + n_bytes;
2761 if (!output_file && debug)
2763 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2764 char const *c = buf;
2766 while (c < ebuf)
2768 char wc = *c++;
2769 if (wc == '\t')
2770 wc = '>';
2771 else if (c == ebuf)
2772 wc = '\n';
2773 if (fputc (wc, fp) == EOF)
2774 die (_("write failed"), output_file);
2777 debug_line (line);
2779 else
2781 ebuf[-1] = eolchar;
2782 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2783 die (_("write failed"), output_file);
2784 ebuf[-1] = '\0';
2788 /* Check that the lines read from FILE_NAME come in order. Return
2789 true if they are in order. If CHECKONLY == 'c', also print a
2790 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2791 they are not in order. */
2793 static bool
2794 check (char const *file_name, char checkonly)
2796 FILE *fp = xfopen (file_name, "r");
2797 struct buffer buf; /* Input buffer. */
2798 struct line temp; /* Copy of previous line. */
2799 size_t alloc = 0;
2800 uintmax_t line_number = 0;
2801 struct keyfield const *key = keylist;
2802 bool nonunique = ! unique;
2803 bool ordered = true;
2805 initbuf (&buf, sizeof (struct line),
2806 MAX (merge_buffer_size, sort_size));
2807 temp.text = NULL;
2809 while (fillbuf (&buf, fp, file_name))
2811 struct line const *line = buffer_linelim (&buf);
2812 struct line const *linebase = line - buf.nlines;
2814 /* Make sure the line saved from the old buffer contents is
2815 less than or equal to the first line of the new buffer. */
2816 if (alloc && nonunique <= compare (&temp, line - 1))
2818 found_disorder:
2820 if (checkonly == 'c')
2822 struct line const *disorder_line = line - 1;
2823 uintmax_t disorder_line_number =
2824 buffer_linelim (&buf) - disorder_line + line_number;
2825 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2826 fprintf (stderr, _("%s: %s:%s: disorder: "),
2827 program_name, file_name,
2828 umaxtostr (disorder_line_number, hr_buf));
2829 write_line (disorder_line, stderr, _("standard error"));
2832 ordered = false;
2833 break;
2837 /* Compare each line in the buffer with its successor. */
2838 while (linebase < --line)
2839 if (nonunique <= compare (line, line - 1))
2840 goto found_disorder;
2842 line_number += buf.nlines;
2844 /* Save the last line of the buffer. */
2845 if (alloc < line->length)
2849 alloc *= 2;
2850 if (! alloc)
2852 alloc = line->length;
2853 break;
2856 while (alloc < line->length);
2858 free (temp.text);
2859 temp.text = xmalloc (alloc);
2861 memcpy (temp.text, line->text, line->length);
2862 temp.length = line->length;
2863 if (key)
2865 temp.keybeg = temp.text + (line->keybeg - line->text);
2866 temp.keylim = temp.text + (line->keylim - line->text);
2870 xfclose (fp, file_name);
2871 free (buf.buf);
2872 free (temp.text);
2873 return ordered;
2876 /* Open FILES (there are NFILES of them) and store the resulting array
2877 of stream pointers into (*PFPS). Allocate the array. Return the
2878 number of successfully opened files, setting errno if this value is
2879 less than NFILES. */
2881 static size_t
2882 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2884 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2885 int i;
2887 /* Open as many input files as we can. */
2888 for (i = 0; i < nfiles; i++)
2890 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2891 ? open_temp (files[i].temp)
2892 : stream_open (files[i].name, "r"));
2893 if (!fps[i])
2894 break;
2897 return i;
2900 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2901 files (all of which are at the start of the FILES array), and
2902 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2903 FPS is the vector of open stream corresponding to the files.
2904 Close input and output streams before returning.
2905 OUTPUT_FILE gives the name of the output file. If it is NULL,
2906 the output file is standard output. */
2908 static void
2909 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2910 FILE *ofp, char const *output_file, FILE **fps)
2912 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2913 /* Input buffers for each file. */
2914 struct line saved; /* Saved line storage for unique check. */
2915 struct line const *savedline = NULL;
2916 /* &saved if there is a saved line. */
2917 size_t savealloc = 0; /* Size allocated for the saved line. */
2918 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2919 /* Current line in each line table. */
2920 struct line const **base = xnmalloc (nfiles, sizeof *base);
2921 /* Base of each line table. */
2922 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2923 /* Table representing a permutation of fps,
2924 such that cur[ord[0]] is the smallest line
2925 and will be next output. */
2926 size_t i;
2927 size_t j;
2928 size_t t;
2929 struct keyfield const *key = keylist;
2930 saved.text = NULL;
2932 /* Read initial lines from each input file. */
2933 for (i = 0; i < nfiles; )
2935 initbuf (&buffer[i], sizeof (struct line),
2936 MAX (merge_buffer_size, sort_size / nfiles));
2937 if (fillbuf (&buffer[i], fps[i], files[i].name))
2939 struct line const *linelim = buffer_linelim (&buffer[i]);
2940 cur[i] = linelim - 1;
2941 base[i] = linelim - buffer[i].nlines;
2942 i++;
2944 else
2946 /* fps[i] is empty; eliminate it from future consideration. */
2947 xfclose (fps[i], files[i].name);
2948 if (i < ntemps)
2950 ntemps--;
2951 zaptemp (files[i].name);
2953 free (buffer[i].buf);
2954 --nfiles;
2955 for (j = i; j < nfiles; ++j)
2957 files[j] = files[j + 1];
2958 fps[j] = fps[j + 1];
2963 /* Set up the ord table according to comparisons among input lines.
2964 Since this only reorders two items if one is strictly greater than
2965 the other, it is stable. */
2966 for (i = 0; i < nfiles; ++i)
2967 ord[i] = i;
2968 for (i = 1; i < nfiles; ++i)
2969 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2970 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2972 /* Repeatedly output the smallest line until no input remains. */
2973 while (nfiles)
2975 struct line const *smallest = cur[ord[0]];
2977 /* If uniquified output is turned on, output only the first of
2978 an identical series of lines. */
2979 if (unique)
2981 if (savedline && compare (savedline, smallest))
2983 savedline = NULL;
2984 write_line (&saved, ofp, output_file);
2986 if (!savedline)
2988 savedline = &saved;
2989 if (savealloc < smallest->length)
2992 if (! savealloc)
2994 savealloc = smallest->length;
2995 break;
2997 while ((savealloc *= 2) < smallest->length);
2999 free (saved.text);
3000 saved.text = xmalloc (savealloc);
3002 saved.length = smallest->length;
3003 memcpy (saved.text, smallest->text, saved.length);
3004 if (key)
3006 saved.keybeg =
3007 saved.text + (smallest->keybeg - smallest->text);
3008 saved.keylim =
3009 saved.text + (smallest->keylim - smallest->text);
3013 else
3014 write_line (smallest, ofp, output_file);
3016 /* Check if we need to read more lines into core. */
3017 if (base[ord[0]] < smallest)
3018 cur[ord[0]] = smallest - 1;
3019 else
3021 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
3023 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
3024 cur[ord[0]] = linelim - 1;
3025 base[ord[0]] = linelim - buffer[ord[0]].nlines;
3027 else
3029 /* We reached EOF on fps[ord[0]]. */
3030 for (i = 1; i < nfiles; ++i)
3031 if (ord[i] > ord[0])
3032 --ord[i];
3033 --nfiles;
3034 xfclose (fps[ord[0]], files[ord[0]].name);
3035 if (ord[0] < ntemps)
3037 ntemps--;
3038 zaptemp (files[ord[0]].name);
3040 free (buffer[ord[0]].buf);
3041 for (i = ord[0]; i < nfiles; ++i)
3043 fps[i] = fps[i + 1];
3044 files[i] = files[i + 1];
3045 buffer[i] = buffer[i + 1];
3046 cur[i] = cur[i + 1];
3047 base[i] = base[i + 1];
3049 for (i = 0; i < nfiles; ++i)
3050 ord[i] = ord[i + 1];
3051 continue;
3055 /* The new line just read in may be larger than other lines
3056 already in main memory; push it back in the queue until we
3057 encounter a line larger than it. Optimize for the common
3058 case where the new line is smallest. */
3060 size_t lo = 1;
3061 size_t hi = nfiles;
3062 size_t probe = lo;
3063 size_t ord0 = ord[0];
3064 size_t count_of_smaller_lines;
3066 while (lo < hi)
3068 int cmp = compare (cur[ord0], cur[ord[probe]]);
3069 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3070 hi = probe;
3071 else
3072 lo = probe + 1;
3073 probe = (lo + hi) / 2;
3076 count_of_smaller_lines = lo - 1;
3077 for (j = 0; j < count_of_smaller_lines; j++)
3078 ord[j] = ord[j + 1];
3079 ord[count_of_smaller_lines] = ord0;
3083 if (unique && savedline)
3085 write_line (&saved, ofp, output_file);
3086 free (saved.text);
3089 xfclose (ofp, output_file);
3090 free (fps);
3091 free (buffer);
3092 free (ord);
3093 free (base);
3094 free (cur);
3097 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3098 files (all of which are at the start of the FILES array), and
3099 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3100 Close input and output files before returning.
3101 OUTPUT_FILE gives the name of the output file.
3103 Return the number of files successfully merged. This number can be
3104 less than NFILES if we ran low on file descriptors, but in this
3105 case it is never less than 2. */
3107 static size_t
3108 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3109 FILE *ofp, char const *output_file)
3111 FILE **fps;
3112 size_t nopened = open_input_files (files, nfiles, &fps);
3113 if (nopened < nfiles && nopened < 2)
3114 die (_("open failed"), files[nopened].name);
3115 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3116 return nopened;
3119 /* Merge into T (of size NLINES) the two sorted arrays of lines
3120 LO (with NLINES / 2 members), and
3121 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3122 T and LO point just past their respective arrays, and the arrays
3123 are in reverse order. NLINES must be at least 2. */
3125 static void
3126 mergelines (struct line *restrict t, size_t nlines,
3127 struct line const *restrict lo)
3129 size_t nlo = nlines / 2;
3130 size_t nhi = nlines - nlo;
3131 struct line *hi = t - nlo;
3133 while (true)
3134 if (compare (lo - 1, hi - 1) <= 0)
3136 *--t = *--lo;
3137 if (! --nlo)
3139 /* HI must equal T now, and there is no need to copy from
3140 HI to T. */
3141 return;
3144 else
3146 *--t = *--hi;
3147 if (! --nhi)
3150 *--t = *--lo;
3151 while (--nlo);
3153 return;
3158 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3159 Do this all within one thread. NLINES must be at least 2.
3160 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3161 Otherwise the sort is in-place and TEMP is half-sized.
3162 The input and output arrays are in reverse order, and LINES and
3163 TEMP point just past the end of their respective arrays.
3165 Use a recursive divide-and-conquer algorithm, in the style
3166 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3167 the optimization suggested by exercise 5.2.4-10; this requires room
3168 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3169 writes that this memory optimization was originally published by
3170 D. A. Bell, Comp J. 1 (1958), 75. */
3172 static void
3173 sequential_sort (struct line *restrict lines, size_t nlines,
3174 struct line *restrict temp, bool to_temp)
3176 if (nlines == 2)
3178 /* Declare 'swap' as int, not bool, to work around a bug
3179 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3180 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3181 int swap = (0 < compare (&lines[-1], &lines[-2]));
3182 if (to_temp)
3184 temp[-1] = lines[-1 - swap];
3185 temp[-2] = lines[-2 + swap];
3187 else if (swap)
3189 temp[-1] = lines[-1];
3190 lines[-1] = lines[-2];
3191 lines[-2] = temp[-1];
3194 else
3196 size_t nlo = nlines / 2;
3197 size_t nhi = nlines - nlo;
3198 struct line *lo = lines;
3199 struct line *hi = lines - nlo;
3201 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3202 if (1 < nlo)
3203 sequential_sort (lo, nlo, temp, !to_temp);
3204 else if (!to_temp)
3205 temp[-1] = lo[-1];
3207 struct line *dest;
3208 struct line const *sorted_lo;
3209 if (to_temp)
3211 dest = temp;
3212 sorted_lo = lines;
3214 else
3216 dest = lines;
3217 sorted_lo = temp;
3219 mergelines (dest, nlines, sorted_lo);
3223 static struct merge_node *init_node (struct merge_node *restrict,
3224 struct merge_node *restrict,
3225 struct line *, size_t, size_t, bool);
3228 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3229 lines, with destination DEST. */
3230 static struct merge_node *
3231 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3233 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3235 struct merge_node *root = merge_tree;
3236 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3237 root->dest = NULL;
3238 root->nlo = root->nhi = nlines;
3239 root->parent = NULL;
3240 root->level = MERGE_END;
3241 root->queued = false;
3242 pthread_mutex_init (&root->lock, NULL);
3244 init_node (root, root + 1, dest, nthreads, nlines, false);
3245 return merge_tree;
3248 /* Destroy the merge tree. */
3249 static void
3250 merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
3252 size_t n_nodes = nthreads * 2;
3253 struct merge_node *node = merge_tree;
3255 while (n_nodes--)
3257 pthread_mutex_destroy (&node->lock);
3258 node++;
3261 free (merge_tree);
3264 /* Initialize a merge tree node and its descendants. The node's
3265 parent is PARENT. The node and its descendants are taken from the
3266 array of nodes NODE_POOL. Their destination starts at DEST; they
3267 will consume NTHREADS threads. The total number of sort lines is
3268 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3269 its parent. */
3271 static struct merge_node *
3272 init_node (struct merge_node *restrict parent,
3273 struct merge_node *restrict node_pool,
3274 struct line *dest, size_t nthreads,
3275 size_t total_lines, bool is_lo_child)
3277 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3278 size_t nlo = nlines / 2;
3279 size_t nhi = nlines - nlo;
3280 struct line *lo = dest - total_lines;
3281 struct line *hi = lo - nlo;
3282 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3284 struct merge_node *node = node_pool++;
3285 node->lo = node->end_lo = lo;
3286 node->hi = node->end_hi = hi;
3287 node->dest = parent_end;
3288 node->nlo = nlo;
3289 node->nhi = nhi;
3290 node->parent = parent;
3291 node->level = parent->level + 1;
3292 node->queued = false;
3293 pthread_mutex_init (&node->lock, NULL);
3295 if (nthreads > 1)
3297 size_t lo_threads = nthreads / 2;
3298 size_t hi_threads = nthreads - lo_threads;
3299 node->lo_child = node_pool;
3300 node_pool = init_node (node, node_pool, lo, lo_threads,
3301 total_lines, true);
3302 node->hi_child = node_pool;
3303 node_pool = init_node (node, node_pool, hi, hi_threads,
3304 total_lines, false);
3306 else
3308 node->lo_child = NULL;
3309 node->hi_child = NULL;
3311 return node_pool;
3315 /* Compare two merge nodes A and B for priority. */
3317 static int
3318 compare_nodes (void const *a, void const *b)
3320 struct merge_node const *nodea = a;
3321 struct merge_node const *nodeb = b;
3322 if (nodea->level == nodeb->level)
3323 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3324 return nodea->level < nodeb->level;
3327 /* Lock a merge tree NODE. */
3329 static inline void
3330 lock_node (struct merge_node *node)
3332 pthread_mutex_lock (&node->lock);
3335 /* Unlock a merge tree NODE. */
3337 static inline void
3338 unlock_node (struct merge_node *node)
3340 pthread_mutex_unlock (&node->lock);
3343 /* Destroy merge QUEUE. */
3345 static void
3346 queue_destroy (struct merge_node_queue *queue)
3348 heap_free (queue->priority_queue);
3349 pthread_cond_destroy (&queue->cond);
3350 pthread_mutex_destroy (&queue->mutex);
3353 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3354 NTHREADS threads. */
3356 static void
3357 queue_init (struct merge_node_queue *queue, size_t nthreads)
3359 /* Though it's highly unlikely all nodes are in the heap at the same
3360 time, the heap should accommodate all of them. Counting a NULL
3361 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3362 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3363 pthread_mutex_init (&queue->mutex, NULL);
3364 pthread_cond_init (&queue->cond, NULL);
3367 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3368 does not need to lock NODE. */
3370 static void
3371 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3373 pthread_mutex_lock (&queue->mutex);
3374 heap_insert (queue->priority_queue, node);
3375 node->queued = true;
3376 pthread_cond_signal (&queue->cond);
3377 pthread_mutex_unlock (&queue->mutex);
3380 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3382 static struct merge_node *
3383 queue_pop (struct merge_node_queue *queue)
3385 struct merge_node *node;
3386 pthread_mutex_lock (&queue->mutex);
3387 while (! (node = heap_remove_top (queue->priority_queue)))
3388 pthread_cond_wait (&queue->cond, &queue->mutex);
3389 pthread_mutex_unlock (&queue->mutex);
3390 lock_node (node);
3391 node->queued = false;
3392 return node;
3395 /* Output LINE to TFP, unless -u is specified and the line compares
3396 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3397 is null if TFP is standard output.
3399 This function does not save the line for comparison later, so it is
3400 appropriate only for internal sort. */
3402 static void
3403 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3405 if (unique)
3407 if (saved_line.text && ! compare (line, &saved_line))
3408 return;
3409 saved_line = *line;
3412 write_line (line, tfp, temp_output);
3415 /* Merge the lines currently available to a NODE in the binary
3416 merge tree. Merge a number of lines appropriate for this merge
3417 level, assuming TOTAL_LINES is the total number of lines.
3419 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3420 the name of TFP, or is null if TFP is standard output. */
3422 static void
3423 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3424 FILE *tfp, char const *temp_output)
3426 struct line *lo_orig = node->lo;
3427 struct line *hi_orig = node->hi;
3428 size_t to_merge = MAX_MERGE (total_lines, node->level);
3429 size_t merged_lo;
3430 size_t merged_hi;
3432 if (node->level > MERGE_ROOT)
3434 /* Merge to destination buffer. */
3435 struct line *dest = *node->dest;
3436 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3437 if (compare (node->lo - 1, node->hi - 1) <= 0)
3438 *--dest = *--node->lo;
3439 else
3440 *--dest = *--node->hi;
3442 merged_lo = lo_orig - node->lo;
3443 merged_hi = hi_orig - node->hi;
3445 if (node->nhi == merged_hi)
3446 while (node->lo != node->end_lo && to_merge--)
3447 *--dest = *--node->lo;
3448 else if (node->nlo == merged_lo)
3449 while (node->hi != node->end_hi && to_merge--)
3450 *--dest = *--node->hi;
3451 *node->dest = dest;
3453 else
3455 /* Merge directly to output. */
3456 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3458 if (compare (node->lo - 1, node->hi - 1) <= 0)
3459 write_unique (--node->lo, tfp, temp_output);
3460 else
3461 write_unique (--node->hi, tfp, temp_output);
3464 merged_lo = lo_orig - node->lo;
3465 merged_hi = hi_orig - node->hi;
3467 if (node->nhi == merged_hi)
3469 while (node->lo != node->end_lo && to_merge--)
3470 write_unique (--node->lo, tfp, temp_output);
3472 else if (node->nlo == merged_lo)
3474 while (node->hi != node->end_hi && to_merge--)
3475 write_unique (--node->hi, tfp, temp_output);
3479 /* Update NODE. */
3480 merged_lo = lo_orig - node->lo;
3481 merged_hi = hi_orig - node->hi;
3482 node->nlo -= merged_lo;
3483 node->nhi -= merged_hi;
3486 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3487 NODE's children has available lines and the other either has
3488 available lines or has exhausted its lines. */
3490 static void
3491 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3493 if (! node->queued)
3495 bool lo_avail = (node->lo - node->end_lo) != 0;
3496 bool hi_avail = (node->hi - node->end_hi) != 0;
3497 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3498 queue_insert (queue, node);
3502 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3504 static void
3505 queue_check_insert_parent (struct merge_node_queue *queue,
3506 struct merge_node *node)
3508 if (node->level > MERGE_ROOT)
3510 lock_node (node->parent);
3511 queue_check_insert (queue, node->parent);
3512 unlock_node (node->parent);
3514 else if (node->nlo + node->nhi == 0)
3516 /* If the MERGE_ROOT NODE has finished merging, insert the
3517 MERGE_END node. */
3518 queue_insert (queue, node->parent);
3522 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3523 some of those lines, until the MERGE_END node is popped.
3524 TOTAL_LINES is the total number of lines. If merging at the top
3525 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3526 null if TFP is standard output. */
3528 static void
3529 merge_loop (struct merge_node_queue *queue,
3530 size_t total_lines, FILE *tfp, char const *temp_output)
3532 while (1)
3534 struct merge_node *node = queue_pop (queue);
3536 if (node->level == MERGE_END)
3538 unlock_node (node);
3539 /* Reinsert so other threads can pop it. */
3540 queue_insert (queue, node);
3541 break;
3543 mergelines_node (node, total_lines, tfp, temp_output);
3544 queue_check_insert (queue, node);
3545 queue_check_insert_parent (queue, node);
3547 unlock_node (node);
3552 static void sortlines (struct line *restrict, size_t, size_t,
3553 struct merge_node *, struct merge_node_queue *,
3554 FILE *, char const *);
3556 /* Thread arguments for sortlines_thread. */
3558 struct thread_args
3560 /* Source, i.e., the array of lines to sort. This points just past
3561 the end of the array. */
3562 struct line *lines;
3564 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3565 size_t nthreads;
3567 /* Number of lines in LINES and DEST. */
3568 size_t const total_lines;
3570 /* Merge node. Lines from this node and this node's sibling will merged
3571 to this node's parent. */
3572 struct merge_node *const node;
3574 /* The priority queue controlling available work for the entire
3575 internal sort. */
3576 struct merge_node_queue *const queue;
3578 /* If at the top level, the file to output to, and the file's name.
3579 If the file is standard output, the file's name is null. */
3580 FILE *tfp;
3581 char const *output_temp;
3584 /* Like sortlines, except with a signature acceptable to pthread_create. */
3586 static void *
3587 sortlines_thread (void *data)
3589 struct thread_args const *args = data;
3590 sortlines (args->lines, args->nthreads, args->total_lines,
3591 args->node, args->queue, args->tfp,
3592 args->output_temp);
3593 return NULL;
3596 /* Sort lines, possibly in parallel. The arguments are as in struct
3597 thread_args above.
3599 The algorithm has three phases: node creation, sequential sort,
3600 and binary merge.
3602 During node creation, sortlines recursively visits each node in the
3603 binary merge tree and creates a NODE structure corresponding to all the
3604 future line merging NODE is responsible for. For each call to
3605 sortlines, half the available threads are assigned to each recursive
3606 call, until a leaf node having only 1 available thread is reached.
3608 Each leaf node then performs two sequential sorts, one on each half of
3609 the lines it is responsible for. It records in its NODE structure that
3610 there are two sorted sublists available to merge from, and inserts its
3611 NODE into the priority queue.
3613 The binary merge phase then begins. Each thread drops into a loop
3614 where the thread retrieves a NODE from the priority queue, merges lines
3615 available to that NODE, and potentially insert NODE or its parent back
3616 into the queue if there are sufficient available lines for them to
3617 merge. This continues until all lines at all nodes of the merge tree
3618 have been merged. */
3620 static void
3621 sortlines (struct line *restrict lines, size_t nthreads,
3622 size_t total_lines, struct merge_node *node,
3623 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3625 size_t nlines = node->nlo + node->nhi;
3627 /* Calculate thread arguments. */
3628 size_t lo_threads = nthreads / 2;
3629 size_t hi_threads = nthreads - lo_threads;
3630 pthread_t thread;
3631 struct thread_args args = {lines, lo_threads, total_lines,
3632 node->lo_child, queue, tfp, temp_output};
3634 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3635 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3637 sortlines (lines - node->nlo, hi_threads, total_lines,
3638 node->hi_child, queue, tfp, temp_output);
3639 pthread_join (thread, NULL);
3641 else
3643 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3644 Sort with 1 thread. */
3645 size_t nlo = node->nlo;
3646 size_t nhi = node->nhi;
3647 struct line *temp = lines - total_lines;
3648 if (1 < nhi)
3649 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3650 if (1 < nlo)
3651 sequential_sort (lines, nlo, temp, false);
3653 /* Update merge NODE. No need to lock yet. */
3654 node->lo = lines;
3655 node->hi = lines - nlo;
3656 node->end_lo = lines - nlo;
3657 node->end_hi = lines - nlo - nhi;
3659 queue_insert (queue, node);
3660 merge_loop (queue, total_lines, tfp, temp_output);
3664 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3665 the same as OUTFILE. If found, replace each with the same
3666 temporary copy that can be merged into OUTFILE without destroying
3667 OUTFILE before it is completely read. This temporary copy does not
3668 count as a merge temp, so don't worry about incrementing NTEMPS in
3669 the caller; final cleanup will remove it, not zaptemp.
3671 This test ensures that an otherwise-erroneous use like
3672 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3673 It's not clear that POSIX requires this nicety.
3674 Detect common error cases, but don't try to catch obscure cases like
3675 "cat ... FILE ... | sort -m -o FILE"
3676 where traditional "sort" doesn't copy the input and where
3677 people should know that they're getting into trouble anyway.
3678 Catching these obscure cases would slow down performance in
3679 common cases. */
3681 static void
3682 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3683 size_t nfiles, char const *outfile)
3685 size_t i;
3686 bool got_outstat = false;
3687 struct stat outstat;
3688 struct tempnode *tempcopy = NULL;
3690 for (i = ntemps; i < nfiles; i++)
3692 bool is_stdin = STREQ (files[i].name, "-");
3693 bool same;
3694 struct stat instat;
3696 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3697 same = true;
3698 else
3700 if (! got_outstat)
3702 if (fstat (STDOUT_FILENO, &outstat) != 0)
3703 break;
3704 got_outstat = true;
3707 same = (((is_stdin
3708 ? fstat (STDIN_FILENO, &instat)
3709 : stat (files[i].name, &instat))
3710 == 0)
3711 && SAME_INODE (instat, outstat));
3714 if (same)
3716 if (! tempcopy)
3718 FILE *tftp;
3719 tempcopy = create_temp (&tftp);
3720 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3723 files[i].name = tempcopy->name;
3724 files[i].temp = tempcopy;
3729 /* Scan the input files to ensure all are accessible.
3730 Otherwise exit with a diagnostic.
3732 Note this will catch common issues with permissions etc.
3733 but will fail to notice issues where you can open() but not read(),
3734 like when a directory is specified on some systems.
3735 Catching these obscure cases could slow down performance in
3736 common cases. */
3738 static void
3739 check_inputs (char *const *files, size_t nfiles)
3741 size_t i;
3742 for (i = 0; i < nfiles; i++)
3744 if (STREQ (files[i], "-"))
3745 continue;
3747 if (euidaccess (files[i], R_OK) != 0)
3748 die (_("cannot read"), files[i]);
3752 /* Ensure a specified output file can be created or written to,
3753 and point stdout to it. Do not truncate the file.
3754 Exit with a diagnostic on failure. */
3756 static void
3757 check_output (char const *outfile)
3759 if (outfile)
3761 int outfd = open (outfile, O_WRONLY | O_CREAT | O_BINARY, MODE_RW_UGO);
3762 if (outfd < 0)
3763 die (_("open failed"), outfile);
3764 move_fd_or_die (outfd, STDOUT_FILENO);
3768 /* Merge the input FILES. NTEMPS is the number of files at the
3769 start of FILES that are temporary; it is zero at the top level.
3770 NFILES is the total number of files. Put the output in
3771 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3773 static void
3774 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3775 char const *output_file)
3777 while (nmerge < nfiles)
3779 /* Number of input files processed so far. */
3780 size_t in;
3782 /* Number of output files generated so far. */
3783 size_t out;
3785 /* nfiles % NMERGE; this counts input files that are left over
3786 after all full-sized merges have been done. */
3787 size_t remainder;
3789 /* Number of easily-available slots at the next loop iteration. */
3790 size_t cheap_slots;
3792 /* Do as many NMERGE-size merges as possible. In the case that
3793 nmerge is bogus, increment by the maximum number of file
3794 descriptors allowed. */
3795 for (out = in = 0; nmerge <= nfiles - in; out++)
3797 FILE *tfp;
3798 struct tempnode *temp = create_temp (&tfp);
3799 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3800 nmerge, tfp, temp->name);
3801 ntemps -= MIN (ntemps, num_merged);
3802 files[out].name = temp->name;
3803 files[out].temp = temp;
3804 in += num_merged;
3807 remainder = nfiles - in;
3808 cheap_slots = nmerge - out % nmerge;
3810 if (cheap_slots < remainder)
3812 /* So many files remain that they can't all be put into the last
3813 NMERGE-sized output window. Do one more merge. Merge as few
3814 files as possible, to avoid needless I/O. */
3815 size_t nshortmerge = remainder - cheap_slots + 1;
3816 FILE *tfp;
3817 struct tempnode *temp = create_temp (&tfp);
3818 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3819 nshortmerge, tfp, temp->name);
3820 ntemps -= MIN (ntemps, num_merged);
3821 files[out].name = temp->name;
3822 files[out++].temp = temp;
3823 in += num_merged;
3826 /* Put the remaining input files into the last NMERGE-sized output
3827 window, so they will be merged in the next pass. */
3828 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3829 ntemps += out;
3830 nfiles -= in - out;
3833 avoid_trashing_input (files, ntemps, nfiles, output_file);
3835 /* We aren't guaranteed that this final mergefiles will work, therefore we
3836 try to merge into the output, and then merge as much as we can into a
3837 temp file if we can't. Repeat. */
3839 while (true)
3841 /* Merge directly into the output file if possible. */
3842 FILE **fps;
3843 size_t nopened = open_input_files (files, nfiles, &fps);
3845 if (nopened == nfiles)
3847 FILE *ofp = stream_open (output_file, "w");
3848 if (ofp)
3850 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3851 break;
3853 if (errno != EMFILE || nopened <= 2)
3854 die (_("open failed"), output_file);
3856 else if (nopened <= 2)
3857 die (_("open failed"), files[nopened].name);
3859 /* We ran out of file descriptors. Close one of the input
3860 files, to gain a file descriptor. Then create a temporary
3861 file with our spare file descriptor. Retry if that failed
3862 (e.g., some other process could open a file between the time
3863 we closed and tried to create). */
3864 FILE *tfp;
3865 struct tempnode *temp;
3868 nopened--;
3869 xfclose (fps[nopened], files[nopened].name);
3870 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3872 while (!temp);
3874 /* Merge into the newly allocated temporary. */
3875 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3876 fps);
3877 ntemps -= MIN (ntemps, nopened);
3878 files[0].name = temp->name;
3879 files[0].temp = temp;
3881 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3882 ntemps++;
3883 nfiles -= nopened - 1;
3887 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3889 static void
3890 sort (char *const *files, size_t nfiles, char const *output_file,
3891 size_t nthreads)
3893 struct buffer buf;
3894 IF_LINT (buf.buf = NULL);
3895 size_t ntemps = 0;
3896 bool output_file_created = false;
3898 buf.alloc = 0;
3900 while (nfiles)
3902 char const *temp_output;
3903 char const *file = *files;
3904 FILE *fp = xfopen (file, "r");
3905 FILE *tfp;
3907 size_t bytes_per_line;
3908 if (nthreads > 1)
3910 /* Get log P. */
3911 size_t tmp = 1;
3912 size_t mult = 1;
3913 while (tmp < nthreads)
3915 tmp *= 2;
3916 mult++;
3918 bytes_per_line = (mult * sizeof (struct line));
3920 else
3921 bytes_per_line = sizeof (struct line) * 3 / 2;
3923 if (! buf.alloc)
3924 initbuf (&buf, bytes_per_line,
3925 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3926 buf.eof = false;
3927 files++;
3928 nfiles--;
3930 while (fillbuf (&buf, fp, file))
3932 struct line *line;
3934 if (buf.eof && nfiles
3935 && (bytes_per_line + 1
3936 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3938 /* End of file, but there is more input and buffer room.
3939 Concatenate the next input file; this is faster in
3940 the usual case. */
3941 buf.left = buf.used;
3942 break;
3945 saved_line.text = NULL;
3946 line = buffer_linelim (&buf);
3947 if (buf.eof && !nfiles && !ntemps && !buf.left)
3949 xfclose (fp, file);
3950 tfp = xfopen (output_file, "w");
3951 temp_output = output_file;
3952 output_file_created = true;
3954 else
3956 ++ntemps;
3957 temp_output = create_temp (&tfp)->name;
3959 if (1 < buf.nlines)
3961 struct merge_node_queue queue;
3962 queue_init (&queue, nthreads);
3963 struct merge_node *merge_tree =
3964 merge_tree_init (nthreads, buf.nlines, line);
3966 sortlines (line, nthreads, buf.nlines, merge_tree + 1,
3967 &queue, tfp, temp_output);
3969 #ifdef lint
3970 merge_tree_destroy (nthreads, merge_tree);
3971 queue_destroy (&queue);
3972 #endif
3974 else
3975 write_unique (line - 1, tfp, temp_output);
3977 xfclose (tfp, temp_output);
3979 if (output_file_created)
3980 goto finish;
3982 xfclose (fp, file);
3985 finish:
3986 free (buf.buf);
3988 if (! output_file_created)
3990 size_t i;
3991 struct tempnode *node = temphead;
3992 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3993 for (i = 0; node; i++)
3995 tempfiles[i].name = node->name;
3996 tempfiles[i].temp = node;
3997 node = node->next;
3999 merge (tempfiles, ntemps, ntemps, output_file);
4000 free (tempfiles);
4003 reap_all ();
4006 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
4008 static void
4009 insertkey (struct keyfield *key_arg)
4011 struct keyfield **p;
4012 struct keyfield *key = xmemdup (key_arg, sizeof *key);
4014 for (p = &keylist; *p; p = &(*p)->next)
4015 continue;
4016 *p = key;
4017 key->next = NULL;
4020 /* Report a bad field specification SPEC, with extra info MSGID. */
4022 static void badfieldspec (char const *, char const *)
4023 ATTRIBUTE_NORETURN;
4024 static void
4025 badfieldspec (char const *spec, char const *msgid)
4027 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
4028 _(msgid), quote (spec));
4029 abort ();
4032 /* Report incompatible options. */
4034 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
4035 static void
4036 incompatible_options (char const *opts)
4038 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
4039 abort ();
4042 /* Check compatibility of ordering options. */
4044 static void
4045 check_ordering_compatibility (void)
4047 struct keyfield *key;
4049 for (key = keylist; key; key = key->next)
4050 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4051 + key->month + (key->version | key->random | !!key->ignore)))
4053 /* The following is too big, but guaranteed to be "big enough". */
4054 char opts[sizeof short_options];
4055 /* Clear flags we're not interested in. */
4056 key->skipsblanks = key->skipeblanks = key->reverse = false;
4057 key_to_opts (key, opts);
4058 incompatible_options (opts);
4062 /* Parse the leading integer in STRING and store the resulting value
4063 (which must fit into size_t) into *VAL. Return the address of the
4064 suffix after the integer. If the value is too large, silently
4065 substitute SIZE_MAX. If MSGID is NULL, return NULL after
4066 failure; otherwise, report MSGID and exit on failure. */
4068 static char const *
4069 parse_field_count (char const *string, size_t *val, char const *msgid)
4071 char *suffix;
4072 uintmax_t n;
4074 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4076 case LONGINT_OK:
4077 case LONGINT_INVALID_SUFFIX_CHAR:
4078 *val = n;
4079 if (*val == n)
4080 break;
4081 /* Fall through. */
4082 case LONGINT_OVERFLOW:
4083 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4084 *val = SIZE_MAX;
4085 break;
4087 case LONGINT_INVALID:
4088 if (msgid)
4089 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4090 _(msgid), quote (string));
4091 return NULL;
4094 return suffix;
4097 /* Handle interrupts and hangups. */
4099 static void
4100 sighandler (int sig)
4102 if (! SA_NOCLDSTOP)
4103 signal (sig, SIG_IGN);
4105 cleanup ();
4107 signal (sig, SIG_DFL);
4108 raise (sig);
4111 /* Set the ordering options for KEY specified in S.
4112 Return the address of the first character in S that
4113 is not a valid ordering option.
4114 BLANKTYPE is the kind of blanks that 'b' should skip. */
4116 static char *
4117 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4119 while (*s)
4121 switch (*s)
4123 case 'b':
4124 if (blanktype == bl_start || blanktype == bl_both)
4125 key->skipsblanks = true;
4126 if (blanktype == bl_end || blanktype == bl_both)
4127 key->skipeblanks = true;
4128 break;
4129 case 'd':
4130 key->ignore = nondictionary;
4131 break;
4132 case 'f':
4133 key->translate = fold_toupper;
4134 break;
4135 case 'g':
4136 key->general_numeric = true;
4137 break;
4138 case 'h':
4139 key->human_numeric = true;
4140 break;
4141 case 'i':
4142 /* Option order should not matter, so don't let -i override
4143 -d. -d implies -i, but -i does not imply -d. */
4144 if (! key->ignore)
4145 key->ignore = nonprinting;
4146 break;
4147 case 'M':
4148 key->month = true;
4149 break;
4150 case 'n':
4151 key->numeric = true;
4152 break;
4153 case 'R':
4154 key->random = true;
4155 break;
4156 case 'r':
4157 key->reverse = true;
4158 break;
4159 case 'V':
4160 key->version = true;
4161 break;
4162 default:
4163 return (char *) s;
4165 ++s;
4167 return (char *) s;
4170 /* Initialize KEY. */
4172 static struct keyfield *
4173 key_init (struct keyfield *key)
4175 memset (key, 0, sizeof *key);
4176 key->eword = SIZE_MAX;
4177 return key;
4181 main (int argc, char **argv)
4183 struct keyfield *key;
4184 struct keyfield key_buf;
4185 struct keyfield gkey;
4186 bool gkey_only = false;
4187 char const *s;
4188 int c = 0;
4189 char checkonly = 0;
4190 bool mergeonly = false;
4191 char *random_source = NULL;
4192 bool need_random = false;
4193 size_t nthreads = 0;
4194 size_t nfiles = 0;
4195 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4196 int posix_ver = posix2_version ();
4197 bool traditional_usage = ! (200112 <= posix_ver && posix_ver < 200809);
4198 char **files;
4199 char *files_from = NULL;
4200 struct Tokens tok;
4201 char const *outfile = NULL;
4202 bool locale_ok;
4204 initialize_main (&argc, &argv);
4205 set_program_name (argv[0]);
4206 locale_ok = !! setlocale (LC_ALL, "");
4207 bindtextdomain (PACKAGE, LOCALEDIR);
4208 textdomain (PACKAGE);
4210 initialize_exit_failure (SORT_FAILURE);
4212 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4213 #if HAVE_NL_LANGINFO
4214 hard_LC_TIME = hard_locale (LC_TIME);
4215 #endif
4217 /* Get locale's representation of the decimal point. */
4219 struct lconv const *locale = localeconv ();
4221 /* If the locale doesn't define a decimal point, or if the decimal
4222 point is multibyte, use the C locale's decimal point. FIXME:
4223 add support for multibyte decimal points. */
4224 decimal_point = to_uchar (locale->decimal_point[0]);
4225 if (! decimal_point || locale->decimal_point[1])
4226 decimal_point = '.';
4228 /* FIXME: add support for multibyte thousands separators. */
4229 thousands_sep = to_uchar (*locale->thousands_sep);
4230 if (! thousands_sep || locale->thousands_sep[1])
4231 thousands_sep = -1;
4234 have_read_stdin = false;
4235 inittables ();
4238 size_t i;
4239 static int const sig[] =
4241 /* The usual suspects. */
4242 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4243 #ifdef SIGPOLL
4244 SIGPOLL,
4245 #endif
4246 #ifdef SIGPROF
4247 SIGPROF,
4248 #endif
4249 #ifdef SIGVTALRM
4250 SIGVTALRM,
4251 #endif
4252 #ifdef SIGXCPU
4253 SIGXCPU,
4254 #endif
4255 #ifdef SIGXFSZ
4256 SIGXFSZ,
4257 #endif
4259 enum { nsigs = ARRAY_CARDINALITY (sig) };
4261 #if SA_NOCLDSTOP
4262 struct sigaction act;
4264 sigemptyset (&caught_signals);
4265 for (i = 0; i < nsigs; i++)
4267 sigaction (sig[i], NULL, &act);
4268 if (act.sa_handler != SIG_IGN)
4269 sigaddset (&caught_signals, sig[i]);
4272 act.sa_handler = sighandler;
4273 act.sa_mask = caught_signals;
4274 act.sa_flags = 0;
4276 for (i = 0; i < nsigs; i++)
4277 if (sigismember (&caught_signals, sig[i]))
4278 sigaction (sig[i], &act, NULL);
4279 #else
4280 for (i = 0; i < nsigs; i++)
4281 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4283 signal (sig[i], sighandler);
4284 siginterrupt (sig[i], 1);
4286 #endif
4288 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4290 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4291 atexit (exit_cleanup);
4293 key_init (&gkey);
4294 gkey.sword = SIZE_MAX;
4296 files = xnmalloc (argc, sizeof *files);
4298 while (true)
4300 /* Parse an operand as a file after "--" was seen; or if
4301 pedantic and a file was seen, unless the POSIX version
4302 is not 1003.1-2001 and -c was not seen and the operand is
4303 "-o FILE" or "-oFILE". */
4304 int oi = -1;
4306 if (c == -1
4307 || (posixly_correct && nfiles != 0
4308 && ! (traditional_usage
4309 && ! checkonly
4310 && optind != argc
4311 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4312 && (argv[optind][2] || optind + 1 != argc)))
4313 || ((c = getopt_long (argc, argv, short_options,
4314 long_options, &oi))
4315 == -1))
4317 if (argc <= optind)
4318 break;
4319 files[nfiles++] = argv[optind++];
4321 else switch (c)
4323 case 1:
4324 key = NULL;
4325 if (optarg[0] == '+')
4327 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4328 && ISDIGIT (argv[optind][1]));
4329 traditional_usage |= minus_pos_usage && !posixly_correct;
4330 if (traditional_usage)
4332 /* Treat +POS1 [-POS2] as a key if possible; but silently
4333 treat an operand as a file if it is not a valid +POS1. */
4334 key = key_init (&key_buf);
4335 s = parse_field_count (optarg + 1, &key->sword, NULL);
4336 if (s && *s == '.')
4337 s = parse_field_count (s + 1, &key->schar, NULL);
4338 if (! (key->sword || key->schar))
4339 key->sword = SIZE_MAX;
4340 if (! s || *set_ordering (s, key, bl_start))
4341 key = NULL;
4342 else
4344 if (minus_pos_usage)
4346 char const *optarg1 = argv[optind++];
4347 s = parse_field_count (optarg1 + 1, &key->eword,
4348 N_("invalid number after '-'"));
4349 /* When called with a non-NULL message ID,
4350 parse_field_count cannot return NULL. Tell static
4351 analysis tools that dereferencing S is safe. */
4352 assert (s);
4353 if (*s == '.')
4354 s = parse_field_count (s + 1, &key->echar,
4355 N_("invalid number after '.'"));
4356 if (!key->echar && key->eword)
4358 /* obsolescent syntax +A.x -B.y is equivalent to:
4359 -k A+1.x+1,B.y (when y = 0)
4360 -k A+1.x+1,B+1.y (when y > 0)
4361 So eword is decremented as in the -k case
4362 only when the end field (B) is specified and
4363 echar (y) is 0. */
4364 key->eword--;
4366 if (*set_ordering (s, key, bl_end))
4367 badfieldspec (optarg1,
4368 N_("stray character in field spec"));
4370 key->traditional_used = true;
4371 insertkey (key);
4375 if (! key)
4376 files[nfiles++] = optarg;
4377 break;
4379 case SORT_OPTION:
4380 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4381 /* Fall through. */
4382 case 'b':
4383 case 'd':
4384 case 'f':
4385 case 'g':
4386 case 'h':
4387 case 'i':
4388 case 'M':
4389 case 'n':
4390 case 'r':
4391 case 'R':
4392 case 'V':
4394 char str[2];
4395 str[0] = c;
4396 str[1] = '\0';
4397 set_ordering (str, &gkey, bl_both);
4399 break;
4401 case CHECK_OPTION:
4402 c = (optarg
4403 ? XARGMATCH ("--check", optarg, check_args, check_types)
4404 : 'c');
4405 /* Fall through. */
4406 case 'c':
4407 case 'C':
4408 if (checkonly && checkonly != c)
4409 incompatible_options ("cC");
4410 checkonly = c;
4411 break;
4413 case COMPRESS_PROGRAM_OPTION:
4414 if (compress_program && !STREQ (compress_program, optarg))
4415 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4416 compress_program = optarg;
4417 break;
4419 case DEBUG_PROGRAM_OPTION:
4420 debug = true;
4421 break;
4423 case FILES0_FROM_OPTION:
4424 files_from = optarg;
4425 break;
4427 case 'k':
4428 key = key_init (&key_buf);
4430 /* Get POS1. */
4431 s = parse_field_count (optarg, &key->sword,
4432 N_("invalid number at field start"));
4433 if (! key->sword--)
4435 /* Provoke with 'sort -k0' */
4436 badfieldspec (optarg, N_("field number is zero"));
4438 if (*s == '.')
4440 s = parse_field_count (s + 1, &key->schar,
4441 N_("invalid number after '.'"));
4442 if (! key->schar--)
4444 /* Provoke with 'sort -k1.0' */
4445 badfieldspec (optarg, N_("character offset is zero"));
4448 if (! (key->sword || key->schar))
4449 key->sword = SIZE_MAX;
4450 s = set_ordering (s, key, bl_start);
4451 if (*s != ',')
4453 key->eword = SIZE_MAX;
4454 key->echar = 0;
4456 else
4458 /* Get POS2. */
4459 s = parse_field_count (s + 1, &key->eword,
4460 N_("invalid number after ','"));
4461 if (! key->eword--)
4463 /* Provoke with 'sort -k1,0' */
4464 badfieldspec (optarg, N_("field number is zero"));
4466 if (*s == '.')
4468 s = parse_field_count (s + 1, &key->echar,
4469 N_("invalid number after '.'"));
4471 s = set_ordering (s, key, bl_end);
4473 if (*s)
4474 badfieldspec (optarg, N_("stray character in field spec"));
4475 insertkey (key);
4476 break;
4478 case 'm':
4479 mergeonly = true;
4480 break;
4482 case NMERGE_OPTION:
4483 specify_nmerge (oi, c, optarg);
4484 break;
4486 case 'o':
4487 if (outfile && !STREQ (outfile, optarg))
4488 error (SORT_FAILURE, 0, _("multiple output files specified"));
4489 outfile = optarg;
4490 break;
4492 case RANDOM_SOURCE_OPTION:
4493 if (random_source && !STREQ (random_source, optarg))
4494 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4495 random_source = optarg;
4496 break;
4498 case 's':
4499 stable = true;
4500 break;
4502 case 'S':
4503 specify_sort_size (oi, c, optarg);
4504 break;
4506 case 't':
4508 char newtab = optarg[0];
4509 if (! newtab)
4510 error (SORT_FAILURE, 0, _("empty tab"));
4511 if (optarg[1])
4513 if (STREQ (optarg, "\\0"))
4514 newtab = '\0';
4515 else
4517 /* Provoke with 'sort -txx'. Complain about
4518 "multi-character tab" instead of "multibyte tab", so
4519 that the diagnostic's wording does not need to be
4520 changed once multibyte characters are supported. */
4521 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4522 quote (optarg));
4525 if (tab != TAB_DEFAULT && tab != newtab)
4526 error (SORT_FAILURE, 0, _("incompatible tabs"));
4527 tab = newtab;
4529 break;
4531 case 'T':
4532 add_temp_dir (optarg);
4533 break;
4535 case PARALLEL_OPTION:
4536 nthreads = specify_nthreads (oi, c, optarg);
4537 break;
4539 case 'u':
4540 unique = true;
4541 break;
4543 case 'y':
4544 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4545 through Solaris 7. It is also accepted by many non-Solaris
4546 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4547 -y is marked as obsolete starting with Solaris 8 (1999), but is
4548 still accepted as of Solaris 10 prerelease (2004).
4550 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4551 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4552 and which in general ignores the argument after "-y" if it
4553 consists entirely of digits (it can even be empty). */
4554 if (optarg == argv[optind - 1])
4556 char const *p;
4557 for (p = optarg; ISDIGIT (*p); p++)
4558 continue;
4559 optind -= (*p != '\0');
4561 break;
4563 case 'z':
4564 eolchar = 0;
4565 break;
4567 case_GETOPT_HELP_CHAR;
4569 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4571 default:
4572 usage (SORT_FAILURE);
4576 if (files_from)
4578 FILE *stream;
4580 /* When using --files0-from=F, you may not specify any files
4581 on the command-line. */
4582 if (nfiles)
4584 error (0, 0, _("extra operand %s"), quoteaf (files[0]));
4585 fprintf (stderr, "%s\n",
4586 _("file operands cannot be combined with --files0-from"));
4587 usage (SORT_FAILURE);
4590 if (STREQ (files_from, "-"))
4591 stream = stdin;
4592 else
4594 stream = fopen (files_from, "r");
4595 if (stream == NULL)
4596 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4597 quoteaf (files_from));
4600 readtokens0_init (&tok);
4602 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4603 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4604 quoteaf (files_from));
4606 if (tok.n_tok)
4608 size_t i;
4609 free (files);
4610 files = tok.tok;
4611 nfiles = tok.n_tok;
4612 for (i = 0; i < nfiles; i++)
4614 if (STREQ (files[i], "-"))
4615 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4616 "no file name of %s allowed"),
4617 quoteaf (files[i]));
4618 else if (files[i][0] == '\0')
4620 /* Using the standard 'filename:line-number:' prefix here is
4621 not totally appropriate, since NUL is the separator,
4622 not NL, but it might be better than nothing. */
4623 unsigned long int file_number = i + 1;
4624 error (SORT_FAILURE, 0,
4625 _("%s:%lu: invalid zero-length file name"),
4626 quotef (files_from), file_number);
4630 else
4631 error (SORT_FAILURE, 0, _("no input from %s"),
4632 quoteaf (files_from));
4635 /* Inheritance of global options to individual keys. */
4636 for (key = keylist; key; key = key->next)
4638 if (default_key_compare (key) && !key->reverse)
4640 key->ignore = gkey.ignore;
4641 key->translate = gkey.translate;
4642 key->skipsblanks = gkey.skipsblanks;
4643 key->skipeblanks = gkey.skipeblanks;
4644 key->month = gkey.month;
4645 key->numeric = gkey.numeric;
4646 key->general_numeric = gkey.general_numeric;
4647 key->human_numeric = gkey.human_numeric;
4648 key->version = gkey.version;
4649 key->random = gkey.random;
4650 key->reverse = gkey.reverse;
4653 need_random |= key->random;
4656 if (!keylist && !default_key_compare (&gkey))
4658 gkey_only = true;
4659 insertkey (&gkey);
4660 need_random |= gkey.random;
4663 check_ordering_compatibility ();
4665 if (debug)
4667 if (checkonly || outfile)
4669 static char opts[] = "X --debug";
4670 opts[0] = (checkonly ? checkonly : 'o');
4671 incompatible_options (opts);
4674 /* Always output the locale in debug mode, since this
4675 is such a common source of confusion. */
4676 if (hard_LC_COLLATE)
4677 error (0, 0, _("using %s sorting rules"),
4678 quote (setlocale (LC_COLLATE, NULL)));
4679 else
4681 /* OpenBSD can only set some categories with LC_ALL above,
4682 so set LC_COLLATE explicitly to flag errors. */
4683 if (locale_ok)
4684 locale_ok = !! setlocale (LC_COLLATE, "");
4685 error (0, 0, "%s%s", locale_ok ? "" : _("failed to set locale; "),
4686 _("using simple byte comparison"));
4688 key_warnings (&gkey, gkey_only);
4691 reverse = gkey.reverse;
4693 if (need_random)
4694 random_md5_state_init (random_source);
4696 if (temp_dir_count == 0)
4698 char const *tmp_dir = getenv ("TMPDIR");
4699 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4702 if (nfiles == 0)
4704 nfiles = 1;
4705 free (files);
4706 files = xmalloc (sizeof *files);
4707 *files = (char *) "-";
4710 /* Need to re-check that we meet the minimum requirement for memory
4711 usage with the final value for NMERGE. */
4712 if (0 < sort_size)
4713 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4715 if (checkonly)
4717 if (nfiles > 1)
4718 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4719 quoteaf (files[1]), checkonly);
4721 if (outfile)
4723 static char opts[] = {0, 'o', 0};
4724 opts[0] = checkonly;
4725 incompatible_options (opts);
4728 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4729 input is not properly sorted. */
4730 return check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER;
4733 /* Check all inputs are accessible, or exit immediately. */
4734 check_inputs (files, nfiles);
4736 /* Check output is writable, or exit immediately. */
4737 check_output (outfile);
4739 if (mergeonly)
4741 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4742 size_t i;
4744 for (i = 0; i < nfiles; ++i)
4745 sortfiles[i].name = files[i];
4747 merge (sortfiles, 0, nfiles, outfile);
4748 IF_LINT (free (sortfiles));
4750 else
4752 if (!nthreads)
4754 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4755 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4758 /* Avoid integer overflow later. */
4759 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4760 nthreads = MIN (nthreads, nthreads_max);
4762 sort (files, nfiles, outfile, nthreads);
4765 #ifdef lint
4766 if (files_from)
4767 readtokens0_free (&tok);
4768 else
4769 free (files);
4770 #endif
4772 if (have_read_stdin && fclose (stdin) == EOF)
4773 die (_("close failed"), "-");
4775 return EXIT_SUCCESS;