doc: clarify the description of du --separate-dirs
[coreutils.git] / src / sort.c
blob062b5f946fd4cd08dd4ac04c7cc135a2b0f366cf
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/resource.h>
28 #include <sys/types.h>
29 #include <sys/wait.h>
30 #include <signal.h>
31 #include <assert.h>
32 #include "system.h"
33 #include "argmatch.h"
34 #include "error.h"
35 #include "fadvise.h"
36 #include "filevercmp.h"
37 #include "hard-locale.h"
38 #include "hash.h"
39 #include "heap.h"
40 #include "ignore-value.h"
41 #include "md5.h"
42 #include "mbswidth.h"
43 #include "nproc.h"
44 #include "physmem.h"
45 #include "posixver.h"
46 #include "quote.h"
47 #include "quotearg.h"
48 #include "randread.h"
49 #include "readtokens0.h"
50 #include "stdio--.h"
51 #include "stdlib--.h"
52 #include "strnumcmp.h"
53 #include "xmemcoll.h"
54 #include "xnanosleep.h"
55 #include "xstrtol.h"
57 #ifndef RLIMIT_DATA
58 struct rlimit { size_t rlim_cur; };
59 # define getrlimit(Resource, Rlp) (-1)
60 #endif
62 /* The official name of this program (e.g., no 'g' prefix). */
63 #define PROGRAM_NAME "sort"
65 #define AUTHORS \
66 proper_name ("Mike Haertel"), \
67 proper_name ("Paul Eggert")
69 #if HAVE_LANGINFO_CODESET
70 # include <langinfo.h>
71 #endif
73 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
74 present. */
75 #ifndef SA_NOCLDSTOP
76 # define SA_NOCLDSTOP 0
77 /* No sigprocmask. Always 'return' zero. */
78 # define sigprocmask(How, Set, Oset) (0)
79 # define sigset_t int
80 # if ! HAVE_SIGINTERRUPT
81 # define siginterrupt(sig, flag) /* empty */
82 # endif
83 #endif
85 #if !defined OPEN_MAX && defined NR_OPEN
86 # define OPEN_MAX NR_OPEN
87 #endif
88 #if !defined OPEN_MAX
89 # define OPEN_MAX 20
90 #endif
92 #define UCHAR_LIM (UCHAR_MAX + 1)
94 #if HAVE_C99_STRTOLD
95 # define long_double long double
96 #else
97 # define long_double double
98 # undef strtold
99 # define strtold strtod
100 #endif
102 #ifndef DEFAULT_TMPDIR
103 # define DEFAULT_TMPDIR "/tmp"
104 #endif
106 /* Maximum number of lines to merge every time a NODE is taken from
107 the merge queue. Node is at LEVEL in the binary merge tree,
108 and is responsible for merging TOTAL lines. */
109 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
111 /* Heuristic value for the number of lines for which it is worth creating
112 a subthread, during an internal merge sort. I.e., it is a small number
113 of "average" lines for which sorting via two threads is faster than
114 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
115 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
116 lines of gensort -a output is sorted slightly faster with --parallel=2
117 than with --parallel=1. By contrast, using --parallel=1 is about 10%
118 faster than using --parallel=2 with a 64K-line input. */
119 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
120 verify (4 <= SUBTHREAD_LINES_HEURISTIC);
122 /* The number of threads after which there are
123 diminishing performance gains. */
124 enum { DEFAULT_MAX_THREADS = 8 };
126 /* Exit statuses. */
127 enum
129 /* POSIX says to exit with status 1 if invoked with -c and the
130 input is not properly sorted. */
131 SORT_OUT_OF_ORDER = 1,
133 /* POSIX says any other irregular exit must exit with a status
134 code greater than 1. */
135 SORT_FAILURE = 2
138 enum
140 /* The number of times we should try to fork a compression process
141 (we retry if the fork call fails). We don't _need_ to compress
142 temp files, this is just to reduce disk access, so this number
143 can be small. Each retry doubles in duration. */
144 MAX_FORK_TRIES_COMPRESS = 4,
146 /* The number of times we should try to fork a decompression process.
147 If we can't fork a decompression process, we can't sort, so this
148 number should be big. Each retry doubles in duration. */
149 MAX_FORK_TRIES_DECOMPRESS = 9
152 enum
154 /* Level of the end-of-merge node, one level above the root. */
155 MERGE_END = 0,
157 /* Level of the root node in merge tree. */
158 MERGE_ROOT = 1
161 /* The representation of the decimal point in the current locale. */
162 static int decimal_point;
164 /* Thousands separator; if -1, then there isn't one. */
165 static int thousands_sep;
167 /* Nonzero if the corresponding locales are hard. */
168 static bool hard_LC_COLLATE;
169 #if HAVE_NL_LANGINFO
170 static bool hard_LC_TIME;
171 #endif
173 #define NONZERO(x) ((x) != 0)
175 /* The kind of blanks for '-b' to skip in various options. */
176 enum blanktype { bl_start, bl_end, bl_both };
178 /* The character marking end of line. Default to \n. */
179 static char eolchar = '\n';
181 /* Lines are held in core as counted strings. */
182 struct line
184 char *text; /* Text of the line. */
185 size_t length; /* Length including final newline. */
186 char *keybeg; /* Start of first key. */
187 char *keylim; /* Limit of first key. */
190 /* Input buffers. */
191 struct buffer
193 char *buf; /* Dynamically allocated buffer,
194 partitioned into 3 regions:
195 - input data;
196 - unused area;
197 - an array of lines, in reverse order. */
198 size_t used; /* Number of bytes used for input data. */
199 size_t nlines; /* Number of lines in the line array. */
200 size_t alloc; /* Number of bytes allocated. */
201 size_t left; /* Number of bytes left from previous reads. */
202 size_t line_bytes; /* Number of bytes to reserve for each line. */
203 bool eof; /* An EOF has been read. */
206 /* Sort key. */
207 struct keyfield
209 size_t sword; /* Zero-origin 'word' to start at. */
210 size_t schar; /* Additional characters to skip. */
211 size_t eword; /* Zero-origin last 'word' of key. */
212 size_t echar; /* Additional characters in field. */
213 bool const *ignore; /* Boolean array of characters to ignore. */
214 char const *translate; /* Translation applied to characters. */
215 bool skipsblanks; /* Skip leading blanks when finding start. */
216 bool skipeblanks; /* Skip leading blanks when finding end. */
217 bool numeric; /* Flag for numeric comparison. Handle
218 strings of digits with optional decimal
219 point, but no exponential notation. */
220 bool random; /* Sort by random hash of key. */
221 bool general_numeric; /* Flag for general, numeric comparison.
222 Handle numbers in exponential notation. */
223 bool human_numeric; /* Flag for sorting by human readable
224 units with either SI xor IEC prefixes. */
225 bool month; /* Flag for comparison by month name. */
226 bool reverse; /* Reverse the sense of comparison. */
227 bool version; /* sort by version number */
228 bool obsolete_used; /* obsolescent key option format is used. */
229 struct keyfield *next; /* Next keyfield to try. */
232 struct month
234 char const *name;
235 int val;
238 /* Binary merge tree node. */
239 struct merge_node
241 struct line *lo; /* Lines to merge from LO child node. */
242 struct line *hi; /* Lines to merge from HI child ndoe. */
243 struct line *end_lo; /* End of available lines from LO. */
244 struct line *end_hi; /* End of available lines from HI. */
245 struct line **dest; /* Pointer to destination of merge. */
246 size_t nlo; /* Total Lines remaining from LO. */
247 size_t nhi; /* Total lines remaining from HI. */
248 struct merge_node *parent; /* Parent node. */
249 struct merge_node *lo_child; /* LO child node. */
250 struct merge_node *hi_child; /* HI child node. */
251 unsigned int level; /* Level in merge tree. */
252 bool queued; /* Node is already in heap. */
253 pthread_mutex_t lock; /* Lock for node operations. */
256 /* Priority queue of merge nodes. */
257 struct merge_node_queue
259 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
260 pthread_mutex_t mutex; /* Lock for queue operations. */
261 pthread_cond_t cond; /* Conditional wait for empty queue to populate
262 when popping. */
265 /* Used to implement --unique (-u). */
266 static struct line saved_line;
268 /* FIXME: None of these tables work with multibyte character sets.
269 Also, there are many other bugs when handling multibyte characters.
270 One way to fix this is to rewrite 'sort' to use wide characters
271 internally, but doing this with good performance is a bit
272 tricky. */
274 /* Table of blanks. */
275 static bool blanks[UCHAR_LIM];
277 /* Table of non-printing characters. */
278 static bool nonprinting[UCHAR_LIM];
280 /* Table of non-dictionary characters (not letters, digits, or blanks). */
281 static bool nondictionary[UCHAR_LIM];
283 /* Translation table folding lower case to upper. */
284 static char fold_toupper[UCHAR_LIM];
286 #define MONTHS_PER_YEAR 12
288 /* Table mapping month names to integers.
289 Alphabetic order allows binary search. */
290 static struct month monthtab[] =
292 {"APR", 4},
293 {"AUG", 8},
294 {"DEC", 12},
295 {"FEB", 2},
296 {"JAN", 1},
297 {"JUL", 7},
298 {"JUN", 6},
299 {"MAR", 3},
300 {"MAY", 5},
301 {"NOV", 11},
302 {"OCT", 10},
303 {"SEP", 9}
306 /* During the merge phase, the number of files to merge at once. */
307 #define NMERGE_DEFAULT 16
309 /* Minimum size for a merge or check buffer. */
310 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
312 /* Minimum sort size; the code might not work with smaller sizes. */
313 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
315 /* The number of bytes needed for a merge or check buffer, which can
316 function relatively efficiently even if it holds only one line. If
317 a longer line is seen, this value is increased. */
318 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
320 /* The approximate maximum number of bytes of main memory to use, as
321 specified by the user. Zero if the user has not specified a size. */
322 static size_t sort_size;
324 /* The initial allocation factor for non-regular files.
325 This is used, e.g., when reading from a pipe.
326 Don't make it too big, since it is multiplied by ~130 to
327 obtain the size of the actual buffer sort will allocate.
328 Also, there may be 8 threads all doing this at the same time. */
329 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
331 /* Array of directory names in which any temporary files are to be created. */
332 static char const **temp_dirs;
334 /* Number of temporary directory names used. */
335 static size_t temp_dir_count;
337 /* Number of allocated slots in temp_dirs. */
338 static size_t temp_dir_alloc;
340 /* Flag to reverse the order of all comparisons. */
341 static bool reverse;
343 /* Flag for stable sort. This turns off the last ditch bytewise
344 comparison of lines, and instead leaves lines in the same order
345 they were read if all keys compare equal. */
346 static bool stable;
348 /* If TAB has this value, blanks separate fields. */
349 enum { TAB_DEFAULT = CHAR_MAX + 1 };
351 /* Tab character separating fields. If TAB_DEFAULT, then fields are
352 separated by the empty string between a non-blank character and a blank
353 character. */
354 static int tab = TAB_DEFAULT;
356 /* Flag to remove consecutive duplicate lines from the output.
357 Only the last of a sequence of equal lines will be output. */
358 static bool unique;
360 /* Nonzero if any of the input files are the standard input. */
361 static bool have_read_stdin;
363 /* List of key field comparisons to be tried. */
364 static struct keyfield *keylist;
366 /* Program used to (de)compress temp files. Must accept -d. */
367 static char const *compress_program;
369 /* Annotate the output with extra info to aid the user. */
370 static bool debug;
372 /* Maximum number of files to merge in one go. If more than this
373 number are present, temp files will be used. */
374 static unsigned int nmerge = NMERGE_DEFAULT;
376 /* Report MESSAGE for FILE, then clean up and exit.
377 If FILE is null, it represents standard output. */
379 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
380 static void
381 die (char const *message, char const *file)
383 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
384 exit (SORT_FAILURE);
387 void
388 usage (int status)
390 if (status != EXIT_SUCCESS)
391 emit_try_help ();
392 else
394 printf (_("\
395 Usage: %s [OPTION]... [FILE]...\n\
396 or: %s [OPTION]... --files0-from=F\n\
398 program_name, program_name);
399 fputs (_("\
400 Write sorted concatenation of all FILE(s) to standard output.\n\
401 "), stdout);
403 emit_mandatory_arg_note ();
405 fputs (_("\
406 Ordering options:\n\
408 "), stdout);
409 fputs (_("\
410 -b, --ignore-leading-blanks ignore leading blanks\n\
411 -d, --dictionary-order consider only blanks and alphanumeric characters\
413 -f, --ignore-case fold lower case to upper case characters\n\
414 "), stdout);
415 fputs (_("\
416 -g, --general-numeric-sort compare according to general numerical value\n\
417 -i, --ignore-nonprinting consider only printable characters\n\
418 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
419 "), stdout);
420 fputs (_("\
421 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
422 "), stdout);
423 fputs (_("\
424 -n, --numeric-sort compare according to string numerical value\n\
425 -R, --random-sort sort by random hash of keys\n\
426 --random-source=FILE get random bytes from FILE\n\
427 -r, --reverse reverse the result of comparisons\n\
428 "), stdout);
429 fputs (_("\
430 --sort=WORD sort according to WORD:\n\
431 general-numeric -g, human-numeric -h, month -M,\
433 numeric -n, random -R, version -V\n\
434 -V, --version-sort natural sort of (version) numbers within text\n\
436 "), stdout);
437 fputs (_("\
438 Other options:\n\
440 "), stdout);
441 fputs (_("\
442 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
443 for more use temp files\n\
444 "), stdout);
445 fputs (_("\
446 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
447 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
449 --compress-program=PROG compress temporaries with PROG;\n\
450 decompress them with PROG -d\n\
451 "), stdout);
452 fputs (_("\
453 --debug annotate the part of the line used to sort,\n\
454 and warn about questionable usage to stderr\n\
455 --files0-from=F read input from the files specified by\n\
456 NUL-terminated names in file F;\n\
457 If F is - then read names from standard input\n\
458 "), stdout);
459 fputs (_("\
460 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
461 -m, --merge merge already sorted files; do not sort\n\
462 "), stdout);
463 fputs (_("\
464 -o, --output=FILE write result to FILE instead of standard output\n\
465 -s, --stable stabilize sort by disabling last-resort comparison\
467 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
468 "), stdout);
469 printf (_("\
470 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
471 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
472 multiple options specify multiple directories\n\
473 --parallel=N change the number of sorts run concurrently to N\n\
474 -u, --unique with -c, check for strict ordering;\n\
475 without -c, output only the first of an equal run\
477 "), DEFAULT_TMPDIR);
478 fputs (_("\
479 -z, --zero-terminated end lines with 0 byte, not newline\n\
480 "), stdout);
481 fputs (HELP_OPTION_DESCRIPTION, stdout);
482 fputs (VERSION_OPTION_DESCRIPTION, stdout);
483 fputs (_("\
485 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
486 field number and C a character position in the field; both are origin 1, and\n\
487 the stop position defaults to the line's end. If neither -t nor -b is in\n\
488 effect, characters in a field are counted from the beginning of the preceding\n\
489 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
491 which override global ordering options for that key. If no key is given, use\n\
492 the entire line as the key.\n\
494 SIZE may be followed by the following multiplicative suffixes:\n\
495 "), stdout);
496 fputs (_("\
497 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
499 With no FILE, or when FILE is -, read standard input.\n\
501 *** WARNING ***\n\
502 The locale specified by the environment affects sort order.\n\
503 Set LC_ALL=C to get the traditional sort order that uses\n\
504 native byte values.\n\
505 "), stdout );
506 emit_ancillary_info ();
509 exit (status);
512 /* For long options that have no equivalent short option, use a
513 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
514 enum
516 CHECK_OPTION = CHAR_MAX + 1,
517 COMPRESS_PROGRAM_OPTION,
518 DEBUG_PROGRAM_OPTION,
519 FILES0_FROM_OPTION,
520 NMERGE_OPTION,
521 RANDOM_SOURCE_OPTION,
522 SORT_OPTION,
523 PARALLEL_OPTION
526 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
528 static struct option const long_options[] =
530 {"ignore-leading-blanks", no_argument, NULL, 'b'},
531 {"check", optional_argument, NULL, CHECK_OPTION},
532 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
533 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
534 {"dictionary-order", no_argument, NULL, 'd'},
535 {"ignore-case", no_argument, NULL, 'f'},
536 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
537 {"general-numeric-sort", no_argument, NULL, 'g'},
538 {"ignore-nonprinting", no_argument, NULL, 'i'},
539 {"key", required_argument, NULL, 'k'},
540 {"merge", no_argument, NULL, 'm'},
541 {"month-sort", no_argument, NULL, 'M'},
542 {"numeric-sort", no_argument, NULL, 'n'},
543 {"human-numeric-sort", no_argument, NULL, 'h'},
544 {"version-sort", no_argument, NULL, 'V'},
545 {"random-sort", no_argument, NULL, 'R'},
546 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
547 {"sort", required_argument, NULL, SORT_OPTION},
548 {"output", required_argument, NULL, 'o'},
549 {"reverse", no_argument, NULL, 'r'},
550 {"stable", no_argument, NULL, 's'},
551 {"batch-size", required_argument, NULL, NMERGE_OPTION},
552 {"buffer-size", required_argument, NULL, 'S'},
553 {"field-separator", required_argument, NULL, 't'},
554 {"temporary-directory", required_argument, NULL, 'T'},
555 {"unique", no_argument, NULL, 'u'},
556 {"zero-terminated", no_argument, NULL, 'z'},
557 {"parallel", required_argument, NULL, PARALLEL_OPTION},
558 {GETOPT_HELP_OPTION_DECL},
559 {GETOPT_VERSION_OPTION_DECL},
560 {NULL, 0, NULL, 0},
563 #define CHECK_TABLE \
564 _ct_("quiet", 'C') \
565 _ct_("silent", 'C') \
566 _ct_("diagnose-first", 'c')
568 static char const *const check_args[] =
570 #define _ct_(_s, _c) _s,
571 CHECK_TABLE NULL
572 #undef _ct_
574 static char const check_types[] =
576 #define _ct_(_s, _c) _c,
577 CHECK_TABLE
578 #undef _ct_
581 #define SORT_TABLE \
582 _st_("general-numeric", 'g') \
583 _st_("human-numeric", 'h') \
584 _st_("month", 'M') \
585 _st_("numeric", 'n') \
586 _st_("random", 'R') \
587 _st_("version", 'V')
589 static char const *const sort_args[] =
591 #define _st_(_s, _c) _s,
592 SORT_TABLE NULL
593 #undef _st_
595 static char const sort_types[] =
597 #define _st_(_s, _c) _c,
598 SORT_TABLE
599 #undef _st_
602 /* The set of signals that are caught. */
603 static sigset_t caught_signals;
605 /* Critical section status. */
606 struct cs_status
608 bool valid;
609 sigset_t sigs;
612 /* Enter a critical section. */
613 static struct cs_status
614 cs_enter (void)
616 struct cs_status status;
617 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
618 return status;
621 /* Leave a critical section. */
622 static void
623 cs_leave (struct cs_status status)
625 if (status.valid)
627 /* Ignore failure when restoring the signal mask. */
628 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
632 /* Possible states for a temp file. If compressed, the file's status
633 is unreaped or reaped, depending on whether 'sort' has waited for
634 the subprocess to finish. */
635 enum { UNCOMPRESSED, UNREAPED, REAPED };
637 /* The list of temporary files. */
638 struct tempnode
640 struct tempnode *volatile next;
641 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
642 char state;
643 char name[1]; /* Actual size is 1 + file name length. */
645 static struct tempnode *volatile temphead;
646 static struct tempnode *volatile *temptail = &temphead;
648 /* A file to be sorted. */
649 struct sortfile
651 /* The file's name. */
652 char const *name;
654 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
655 struct tempnode *temp;
658 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
659 static Hash_table *proctab;
661 enum { INIT_PROCTAB_SIZE = 47 };
663 static size_t
664 proctab_hasher (void const *entry, size_t tabsize)
666 struct tempnode const *node = entry;
667 return node->pid % tabsize;
670 static bool
671 proctab_comparator (void const *e1, void const *e2)
673 struct tempnode const *n1 = e1;
674 struct tempnode const *n2 = e2;
675 return n1->pid == n2->pid;
678 /* The number of unreaped child processes. */
679 static pid_t nprocs;
681 static bool delete_proc (pid_t);
683 /* If PID is positive, wait for the child process with that PID to
684 exit, and assume that PID has already been removed from the process
685 table. If PID is 0 or -1, clean up some child that has exited (by
686 waiting for it, and removing it from the proc table) and return the
687 child's process ID. However, if PID is 0 and no children have
688 exited, return 0 without waiting. */
690 static pid_t
691 reap (pid_t pid)
693 int status;
694 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
696 if (cpid < 0)
697 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
698 compress_program);
699 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
701 if (! WIFEXITED (status) || WEXITSTATUS (status))
702 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
703 compress_program);
704 --nprocs;
707 return cpid;
710 /* TEMP represents a new process; add it to the process table. Create
711 the process table the first time it's called. */
713 static void
714 register_proc (struct tempnode *temp)
716 if (! proctab)
718 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
719 proctab_hasher,
720 proctab_comparator,
721 NULL);
722 if (! proctab)
723 xalloc_die ();
726 temp->state = UNREAPED;
728 if (! hash_insert (proctab, temp))
729 xalloc_die ();
732 /* If PID is in the process table, remove it and return true.
733 Otherwise, return false. */
735 static bool
736 delete_proc (pid_t pid)
738 struct tempnode test;
740 test.pid = pid;
741 struct tempnode *node = hash_delete (proctab, &test);
742 if (! node)
743 return false;
744 node->state = REAPED;
745 return true;
748 /* Remove PID from the process table, and wait for it to exit if it
749 hasn't already. */
751 static void
752 wait_proc (pid_t pid)
754 if (delete_proc (pid))
755 reap (pid);
758 /* Reap any exited children. Do not block; reap only those that have
759 already exited. */
761 static void
762 reap_exited (void)
764 while (0 < nprocs && reap (0))
765 continue;
768 /* Reap at least one exited child, waiting if necessary. */
770 static void
771 reap_some (void)
773 reap (-1);
774 reap_exited ();
777 /* Reap all children, waiting if necessary. */
779 static void
780 reap_all (void)
782 while (0 < nprocs)
783 reap (-1);
786 /* Clean up any remaining temporary files. */
788 static void
789 cleanup (void)
791 struct tempnode const *node;
793 for (node = temphead; node; node = node->next)
794 unlink (node->name);
795 temphead = NULL;
798 /* Cleanup actions to take when exiting. */
800 static void
801 exit_cleanup (void)
803 if (temphead)
805 /* Clean up any remaining temporary files in a critical section so
806 that a signal handler does not try to clean them too. */
807 struct cs_status cs = cs_enter ();
808 cleanup ();
809 cs_leave (cs);
812 close_stdout ();
815 /* Create a new temporary file, returning its newly allocated tempnode.
816 Store into *PFD the file descriptor open for writing.
817 If the creation fails, return NULL and store -1 into *PFD if the
818 failure is due to file descriptor exhaustion and
819 SURVIVE_FD_EXHAUSTION; otherwise, die. */
821 static struct tempnode *
822 create_temp_file (int *pfd, bool survive_fd_exhaustion)
824 static char const slashbase[] = "/sortXXXXXX";
825 static size_t temp_dir_index;
826 int fd;
827 int saved_errno;
828 char const *temp_dir = temp_dirs[temp_dir_index];
829 size_t len = strlen (temp_dir);
830 struct tempnode *node =
831 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
832 char *file = node->name;
833 struct cs_status cs;
835 memcpy (file, temp_dir, len);
836 memcpy (file + len, slashbase, sizeof slashbase);
837 node->next = NULL;
838 if (++temp_dir_index == temp_dir_count)
839 temp_dir_index = 0;
841 /* Create the temporary file in a critical section, to avoid races. */
842 cs = cs_enter ();
843 fd = mkstemp (file);
844 if (0 <= fd)
846 *temptail = node;
847 temptail = &node->next;
849 saved_errno = errno;
850 cs_leave (cs);
851 errno = saved_errno;
853 if (fd < 0)
855 if (! (survive_fd_exhaustion && errno == EMFILE))
856 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
857 quote (temp_dir));
858 free (node);
859 node = NULL;
862 *pfd = fd;
863 return node;
866 /* Return a stream for FILE, opened with mode HOW. A null FILE means
867 standard output; HOW should be "w". When opening for input, "-"
868 means standard input. To avoid confusion, do not return file
869 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
870 opening an ordinary FILE. Return NULL if unsuccessful.
872 fadvise() is used to specify an access pattern for input files.
873 There are a few hints we could possibly provide,
874 and after careful testing it was decided that
875 specifying POSIX_FADV_SEQUENTIAL was not detrimental
876 to any cases. On Linux 2.6.31, this option doubles
877 the size of read ahead performed and thus was seen to
878 benefit these cases:
879 Merging
880 Sorting with a smaller internal buffer
881 Reading from faster flash devices
883 In _addition_ one could also specify other hints...
885 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
886 at least uses that to _synchronously_ prepopulate the cache
887 with the specified range. While sort does need to
888 read all of its input before outputting, a synchronous
889 read of the whole file up front precludes any processing
890 that sort could do in parallel with the system doing
891 read ahead of the data. This was seen to have negative effects
892 in a couple of cases:
893 Merging
894 Sorting with a smaller internal buffer
895 Note this option was seen to shorten the runtime for sort
896 on a multicore system with lots of RAM and other processes
897 competing for CPU. It could be argued that more explicit
898 scheduling hints with 'nice' et. al. are more appropriate
899 for this situation.
901 POSIX_FADV_NOREUSE is a possibility as it could lower
902 the priority of input data in the cache as sort will
903 only need to process it once. However its functionality
904 has changed over Linux kernel versions and as of 2.6.31
905 it does nothing and thus we can't depend on what it might
906 do in future.
908 POSIX_FADV_DONTNEED is not appropriate for user specified
909 input files, but for temp files we do want to drop the
910 cache immediately after processing. This is done implicitly
911 however when the files are unlinked. */
913 static FILE *
914 stream_open (char const *file, char const *how)
916 FILE *fp;
918 if (*how == 'r')
920 if (STREQ (file, "-"))
922 have_read_stdin = true;
923 fp = stdin;
925 else
926 fp = fopen (file, how);
927 fadvise (fp, FADVISE_SEQUENTIAL);
929 else if (*how == 'w')
931 if (file && ftruncate (STDOUT_FILENO, 0) != 0)
932 error (SORT_FAILURE, errno, _("%s: error truncating"),
933 quote (file));
934 fp = stdout;
936 else
937 assert (!"unexpected mode passed to stream_open");
939 return fp;
942 /* Same as stream_open, except always return a non-null value; die on
943 failure. */
945 static FILE *
946 xfopen (char const *file, char const *how)
948 FILE *fp = stream_open (file, how);
949 if (!fp)
950 die (_("open failed"), file);
951 return fp;
954 /* Close FP, whose name is FILE, and report any errors. */
956 static void
957 xfclose (FILE *fp, char const *file)
959 switch (fileno (fp))
961 case STDIN_FILENO:
962 /* Allow reading stdin from tty more than once. */
963 if (feof (fp))
964 clearerr (fp);
965 break;
967 case STDOUT_FILENO:
968 /* Don't close stdout just yet. close_stdout does that. */
969 if (fflush (fp) != 0)
970 die (_("fflush failed"), file);
971 break;
973 default:
974 if (fclose (fp) != 0)
975 die (_("close failed"), file);
976 break;
980 static void
981 move_fd_or_die (int oldfd, int newfd)
983 if (oldfd != newfd)
985 if (dup2 (oldfd, newfd) < 0)
986 error (SORT_FAILURE, errno, _("dup2 failed"));
987 close (oldfd);
991 /* Fork a child process for piping to and do common cleanup. The
992 TRIES parameter tells us how many times to try to fork before
993 giving up. Return the PID of the child, or -1 (setting errno)
994 on failure. */
996 static pid_t
997 pipe_fork (int pipefds[2], size_t tries)
999 #if HAVE_WORKING_FORK
1000 struct tempnode *saved_temphead;
1001 int saved_errno;
1002 double wait_retry = 0.25;
1003 pid_t pid IF_LINT ( = -1);
1004 struct cs_status cs;
1006 if (pipe (pipefds) < 0)
1007 return -1;
1009 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1010 uncontrolled subprocess generation can hurt performance significantly.
1011 Allow at most NMERGE + 2 subprocesses, on the theory that there
1012 may be some useful parallelism by letting compression for the
1013 previous merge finish (1 subprocess) in parallel with the current
1014 merge (NMERGE + 1 subprocesses). */
1016 if (nmerge + 1 < nprocs)
1017 reap_some ();
1019 while (tries--)
1021 /* This is so the child process won't delete our temp files
1022 if it receives a signal before exec-ing. */
1023 cs = cs_enter ();
1024 saved_temphead = temphead;
1025 temphead = NULL;
1027 pid = fork ();
1028 saved_errno = errno;
1029 if (pid)
1030 temphead = saved_temphead;
1032 cs_leave (cs);
1033 errno = saved_errno;
1035 if (0 <= pid || errno != EAGAIN)
1036 break;
1037 else
1039 xnanosleep (wait_retry);
1040 wait_retry *= 2;
1041 reap_exited ();
1045 if (pid < 0)
1047 saved_errno = errno;
1048 close (pipefds[0]);
1049 close (pipefds[1]);
1050 errno = saved_errno;
1052 else if (pid == 0)
1054 close (STDIN_FILENO);
1055 close (STDOUT_FILENO);
1057 else
1058 ++nprocs;
1060 return pid;
1062 #else /* ! HAVE_WORKING_FORK */
1063 return -1;
1064 #endif
1067 /* Create a temporary file and, if asked for, start a compressor
1068 to that file. Set *PFP to the file handle and return
1069 the address of the new temp node. If the creation
1070 fails, return NULL if the failure is due to file descriptor
1071 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1073 static struct tempnode *
1074 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1076 int tempfd;
1077 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1078 if (! node)
1079 return NULL;
1081 node->state = UNCOMPRESSED;
1083 if (compress_program)
1085 int pipefds[2];
1087 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1088 if (0 < node->pid)
1090 close (tempfd);
1091 close (pipefds[0]);
1092 tempfd = pipefds[1];
1094 register_proc (node);
1096 else if (node->pid == 0)
1098 close (pipefds[1]);
1099 move_fd_or_die (tempfd, STDOUT_FILENO);
1100 move_fd_or_die (pipefds[0], STDIN_FILENO);
1102 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1103 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1104 compress_program);
1108 *pfp = fdopen (tempfd, "w");
1109 if (! *pfp)
1110 die (_("couldn't create temporary file"), node->name);
1112 return node;
1115 /* Create a temporary file and, if asked for, start a compressor
1116 to that file. Set *PFP to the file handle and return the address
1117 of the new temp node. Die on failure. */
1119 static struct tempnode *
1120 create_temp (FILE **pfp)
1122 return maybe_create_temp (pfp, false);
1125 /* Open a compressed temp file and start a decompression process through
1126 which to filter the input. Return NULL (setting errno to
1127 EMFILE) if we ran out of file descriptors, and die on any other
1128 kind of failure. */
1130 static FILE *
1131 open_temp (struct tempnode *temp)
1133 int tempfd, pipefds[2];
1134 FILE *fp = NULL;
1136 if (temp->state == UNREAPED)
1137 wait_proc (temp->pid);
1139 tempfd = open (temp->name, O_RDONLY);
1140 if (tempfd < 0)
1141 return NULL;
1143 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1145 switch (child)
1147 case -1:
1148 if (errno != EMFILE)
1149 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1150 compress_program);
1151 close (tempfd);
1152 errno = EMFILE;
1153 break;
1155 case 0:
1156 close (pipefds[0]);
1157 move_fd_or_die (tempfd, STDIN_FILENO);
1158 move_fd_or_die (pipefds[1], STDOUT_FILENO);
1160 execlp (compress_program, compress_program, "-d", (char *) NULL);
1161 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1162 compress_program);
1164 default:
1165 temp->pid = child;
1166 register_proc (temp);
1167 close (tempfd);
1168 close (pipefds[1]);
1170 fp = fdopen (pipefds[0], "r");
1171 if (! fp)
1173 int saved_errno = errno;
1174 close (pipefds[0]);
1175 errno = saved_errno;
1177 break;
1180 return fp;
1183 /* Append DIR to the array of temporary directory names. */
1184 static void
1185 add_temp_dir (char const *dir)
1187 if (temp_dir_count == temp_dir_alloc)
1188 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1190 temp_dirs[temp_dir_count++] = dir;
1193 /* Remove NAME from the list of temporary files. */
1195 static void
1196 zaptemp (char const *name)
1198 struct tempnode *volatile *pnode;
1199 struct tempnode *node;
1200 struct tempnode *next;
1201 int unlink_status;
1202 int unlink_errno = 0;
1203 struct cs_status cs;
1205 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1206 continue;
1208 if (node->state == UNREAPED)
1209 wait_proc (node->pid);
1211 /* Unlink the temporary file in a critical section to avoid races. */
1212 next = node->next;
1213 cs = cs_enter ();
1214 unlink_status = unlink (name);
1215 unlink_errno = errno;
1216 *pnode = next;
1217 cs_leave (cs);
1219 if (unlink_status != 0)
1220 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1221 if (! next)
1222 temptail = pnode;
1223 free (node);
1226 #if HAVE_NL_LANGINFO
1228 static int
1229 struct_month_cmp (void const *m1, void const *m2)
1231 struct month const *month1 = m1;
1232 struct month const *month2 = m2;
1233 return strcmp (month1->name, month2->name);
1236 #endif
1238 /* Initialize the character class tables. */
1240 static void
1241 inittables (void)
1243 size_t i;
1245 for (i = 0; i < UCHAR_LIM; ++i)
1247 blanks[i] = !! isblank (i);
1248 nonprinting[i] = ! isprint (i);
1249 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1250 fold_toupper[i] = toupper (i);
1253 #if HAVE_NL_LANGINFO
1254 /* If we're not in the "C" locale, read different names for months. */
1255 if (hard_LC_TIME)
1257 for (i = 0; i < MONTHS_PER_YEAR; i++)
1259 char const *s;
1260 size_t s_len;
1261 size_t j, k;
1262 char *name;
1264 s = nl_langinfo (ABMON_1 + i);
1265 s_len = strlen (s);
1266 monthtab[i].name = name = xmalloc (s_len + 1);
1267 monthtab[i].val = i + 1;
1269 for (j = k = 0; j < s_len; j++)
1270 if (! isblank (to_uchar (s[j])))
1271 name[k++] = fold_toupper[to_uchar (s[j])];
1272 name[k] = '\0';
1274 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1276 #endif
1279 /* Specify how many inputs may be merged at once.
1280 This may be set on the command-line with the
1281 --batch-size option. */
1282 static void
1283 specify_nmerge (int oi, char c, char const *s)
1285 uintmax_t n;
1286 struct rlimit rlimit;
1287 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1289 /* Try to find out how many file descriptors we'll be able
1290 to open. We need at least nmerge + 3 (STDIN_FILENO,
1291 STDOUT_FILENO and STDERR_FILENO). */
1292 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1293 ? rlimit.rlim_cur
1294 : OPEN_MAX)
1295 - 3);
1297 if (e == LONGINT_OK)
1299 nmerge = n;
1300 if (nmerge != n)
1301 e = LONGINT_OVERFLOW;
1302 else
1304 if (nmerge < 2)
1306 error (0, 0, _("invalid --%s argument %s"),
1307 long_options[oi].name, quote (s));
1308 error (SORT_FAILURE, 0,
1309 _("minimum --%s argument is %s"),
1310 long_options[oi].name, quote ("2"));
1312 else if (max_nmerge < nmerge)
1314 e = LONGINT_OVERFLOW;
1316 else
1317 return;
1321 if (e == LONGINT_OVERFLOW)
1323 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1324 error (0, 0, _("--%s argument %s too large"),
1325 long_options[oi].name, quote (s));
1326 error (SORT_FAILURE, 0,
1327 _("maximum --%s argument with current rlimit is %s"),
1328 long_options[oi].name,
1329 uinttostr (max_nmerge, max_nmerge_buf));
1331 else
1332 xstrtol_fatal (e, oi, c, long_options, s);
1335 /* Specify the amount of main memory to use when sorting. */
1336 static void
1337 specify_sort_size (int oi, char c, char const *s)
1339 uintmax_t n;
1340 char *suffix;
1341 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1343 /* The default unit is KiB. */
1344 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1346 if (n <= UINTMAX_MAX / 1024)
1347 n *= 1024;
1348 else
1349 e = LONGINT_OVERFLOW;
1352 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1353 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1354 switch (suffix[0])
1356 case 'b':
1357 e = LONGINT_OK;
1358 break;
1360 case '%':
1362 double mem = physmem_total () * n / 100;
1364 /* Use "<", not "<=", to avoid problems with rounding. */
1365 if (mem < UINTMAX_MAX)
1367 n = mem;
1368 e = LONGINT_OK;
1370 else
1371 e = LONGINT_OVERFLOW;
1373 break;
1376 if (e == LONGINT_OK)
1378 /* If multiple sort sizes are specified, take the maximum, so
1379 that option order does not matter. */
1380 if (n < sort_size)
1381 return;
1383 sort_size = n;
1384 if (sort_size == n)
1386 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1387 return;
1390 e = LONGINT_OVERFLOW;
1393 xstrtol_fatal (e, oi, c, long_options, s);
1396 /* Specify the number of threads to spawn during internal sort. */
1397 static size_t
1398 specify_nthreads (int oi, char c, char const *s)
1400 unsigned long int nthreads;
1401 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1402 if (e == LONGINT_OVERFLOW)
1403 return SIZE_MAX;
1404 if (e != LONGINT_OK)
1405 xstrtol_fatal (e, oi, c, long_options, s);
1406 if (SIZE_MAX < nthreads)
1407 nthreads = SIZE_MAX;
1408 if (nthreads == 0)
1409 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1410 return nthreads;
1413 /* Return the default sort size. */
1414 static size_t
1415 default_sort_size (void)
1417 /* Let SIZE be MEM, but no more than the maximum object size,
1418 total memory, or system resource limits. Don't bother to check
1419 for values like RLIM_INFINITY since in practice they are not much
1420 less than SIZE_MAX. */
1421 size_t size = SIZE_MAX;
1422 struct rlimit rlimit;
1423 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1424 size = rlimit.rlim_cur;
1425 #ifdef RLIMIT_AS
1426 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1427 size = rlimit.rlim_cur;
1428 #endif
1430 /* Leave a large safety margin for the above limits, as failure can
1431 occur when they are exceeded. */
1432 size /= 2;
1434 #ifdef RLIMIT_RSS
1435 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1436 Exceeding RSS is not fatal, but can be quite slow. */
1437 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1438 size = rlimit.rlim_cur / 16 * 15;
1439 #endif
1441 /* Let MEM be available memory or 1/8 of total memory, whichever
1442 is greater. */
1443 double avail = physmem_available ();
1444 double total = physmem_total ();
1445 double mem = MAX (avail, total / 8);
1447 /* Leave a 1/4 margin for physical memory. */
1448 if (total * 0.75 < size)
1449 size = total * 0.75;
1451 /* Return the minimum of MEM and SIZE, but no less than
1452 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1453 right when only one argument is floating point. */
1454 if (mem < size)
1455 size = mem;
1456 return MAX (size, MIN_SORT_SIZE);
1459 /* Return the sort buffer size to use with the input files identified
1460 by FPS and FILES, which are alternate names of the same files.
1461 NFILES gives the number of input files; NFPS may be less. Assume
1462 that each input line requires LINE_BYTES extra bytes' worth of line
1463 information. Do not exceed the size bound specified by the user
1464 (or a default size bound, if the user does not specify one). */
1466 static size_t
1467 sort_buffer_size (FILE *const *fps, size_t nfps,
1468 char *const *files, size_t nfiles,
1469 size_t line_bytes)
1471 /* A bound on the input size. If zero, the bound hasn't been
1472 determined yet. */
1473 static size_t size_bound;
1475 /* In the worst case, each input byte is a newline. */
1476 size_t worst_case_per_input_byte = line_bytes + 1;
1478 /* Keep enough room for one extra input line and an extra byte.
1479 This extra room might be needed when preparing to read EOF. */
1480 size_t size = worst_case_per_input_byte + 1;
1482 size_t i;
1484 for (i = 0; i < nfiles; i++)
1486 struct stat st;
1487 off_t file_size;
1488 size_t worst_case;
1490 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1491 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1492 : stat (files[i], &st))
1493 != 0)
1494 die (_("stat failed"), files[i]);
1496 if (S_ISREG (st.st_mode))
1497 file_size = st.st_size;
1498 else
1500 /* The file has unknown size. If the user specified a sort
1501 buffer size, use that; otherwise, guess the size. */
1502 if (sort_size)
1503 return sort_size;
1504 file_size = INPUT_FILE_SIZE_GUESS;
1507 if (! size_bound)
1509 size_bound = sort_size;
1510 if (! size_bound)
1511 size_bound = default_sort_size ();
1514 /* Add the amount of memory needed to represent the worst case
1515 where the input consists entirely of newlines followed by a
1516 single non-newline. Check for overflow. */
1517 worst_case = file_size * worst_case_per_input_byte + 1;
1518 if (file_size != worst_case / worst_case_per_input_byte
1519 || size_bound - size <= worst_case)
1520 return size_bound;
1521 size += worst_case;
1524 return size;
1527 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1528 must be at least sizeof (struct line). Allocate ALLOC bytes
1529 initially. */
1531 static void
1532 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1534 /* Ensure that the line array is properly aligned. If the desired
1535 size cannot be allocated, repeatedly halve it until allocation
1536 succeeds. The smaller allocation may hurt overall performance,
1537 but that's better than failing. */
1538 while (true)
1540 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1541 buf->buf = malloc (alloc);
1542 if (buf->buf)
1543 break;
1544 alloc /= 2;
1545 if (alloc <= line_bytes + 1)
1546 xalloc_die ();
1549 buf->line_bytes = line_bytes;
1550 buf->alloc = alloc;
1551 buf->used = buf->left = buf->nlines = 0;
1552 buf->eof = false;
1555 /* Return one past the limit of the line array. */
1557 static inline struct line *
1558 buffer_linelim (struct buffer const *buf)
1560 void *linelim = buf->buf + buf->alloc;
1561 return linelim;
1564 /* Return a pointer to the first character of the field specified
1565 by KEY in LINE. */
1567 static char *
1568 begfield (struct line const *line, struct keyfield const *key)
1570 char *ptr = line->text, *lim = ptr + line->length - 1;
1571 size_t sword = key->sword;
1572 size_t schar = key->schar;
1574 /* The leading field separator itself is included in a field when -t
1575 is absent. */
1577 if (tab != TAB_DEFAULT)
1578 while (ptr < lim && sword--)
1580 while (ptr < lim && *ptr != tab)
1581 ++ptr;
1582 if (ptr < lim)
1583 ++ptr;
1585 else
1586 while (ptr < lim && sword--)
1588 while (ptr < lim && blanks[to_uchar (*ptr)])
1589 ++ptr;
1590 while (ptr < lim && !blanks[to_uchar (*ptr)])
1591 ++ptr;
1594 /* If we're ignoring leading blanks when computing the Start
1595 of the field, skip past them here. */
1596 if (key->skipsblanks)
1597 while (ptr < lim && blanks[to_uchar (*ptr)])
1598 ++ptr;
1600 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1601 ptr = MIN (lim, ptr + schar);
1603 return ptr;
1606 /* Return the limit of (a pointer to the first character after) the field
1607 in LINE specified by KEY. */
1609 static char *
1610 limfield (struct line const *line, struct keyfield const *key)
1612 char *ptr = line->text, *lim = ptr + line->length - 1;
1613 size_t eword = key->eword, echar = key->echar;
1615 if (echar == 0)
1616 eword++; /* Skip all of end field. */
1618 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1619 whichever comes first. If there are more than EWORD fields, leave
1620 PTR pointing at the beginning of the field having zero-based index,
1621 EWORD. If a delimiter character was specified (via -t), then that
1622 'beginning' is the first character following the delimiting TAB.
1623 Otherwise, leave PTR pointing at the first 'blank' character after
1624 the preceding field. */
1625 if (tab != TAB_DEFAULT)
1626 while (ptr < lim && eword--)
1628 while (ptr < lim && *ptr != tab)
1629 ++ptr;
1630 if (ptr < lim && (eword || echar))
1631 ++ptr;
1633 else
1634 while (ptr < lim && eword--)
1636 while (ptr < lim && blanks[to_uchar (*ptr)])
1637 ++ptr;
1638 while (ptr < lim && !blanks[to_uchar (*ptr)])
1639 ++ptr;
1642 #ifdef POSIX_UNSPECIFIED
1643 /* The following block of code makes GNU sort incompatible with
1644 standard Unix sort, so it's ifdef'd out for now.
1645 The POSIX spec isn't clear on how to interpret this.
1646 FIXME: request clarification.
1648 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1649 Date: Thu, 30 May 96 12:20:41 -0400
1650 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1652 [...]I believe I've found another bug in 'sort'.
1654 $ cat /tmp/sort.in
1655 a b c 2 d
1656 pq rs 1 t
1657 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1658 a b c 2 d
1659 pq rs 1 t
1660 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1661 pq rs 1 t
1662 a b c 2 d
1664 Unix sort produced the answer I expected: sort on the single character
1665 in column 7. GNU sort produced different results, because it disagrees
1666 on the interpretation of the key-end spec "M.N". Unix sort reads this
1667 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1668 "skip M-1 fields, then either N-1 characters or the rest of the current
1669 field, whichever comes first". This extra clause applies only to
1670 key-ends, not key-starts.
1673 /* Make LIM point to the end of (one byte past) the current field. */
1674 if (tab != TAB_DEFAULT)
1676 char *newlim;
1677 newlim = memchr (ptr, tab, lim - ptr);
1678 if (newlim)
1679 lim = newlim;
1681 else
1683 char *newlim;
1684 newlim = ptr;
1685 while (newlim < lim && blanks[to_uchar (*newlim)])
1686 ++newlim;
1687 while (newlim < lim && !blanks[to_uchar (*newlim)])
1688 ++newlim;
1689 lim = newlim;
1691 #endif
1693 if (echar != 0) /* We need to skip over a portion of the end field. */
1695 /* If we're ignoring leading blanks when computing the End
1696 of the field, skip past them here. */
1697 if (key->skipeblanks)
1698 while (ptr < lim && blanks[to_uchar (*ptr)])
1699 ++ptr;
1701 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1702 ptr = MIN (lim, ptr + echar);
1705 return ptr;
1708 /* Fill BUF reading from FP, moving buf->left bytes from the end
1709 of buf->buf to the beginning first. If EOF is reached and the
1710 file wasn't terminated by a newline, supply one. Set up BUF's line
1711 table too. FILE is the name of the file corresponding to FP.
1712 Return true if some input was read. */
1714 static bool
1715 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1717 struct keyfield const *key = keylist;
1718 char eol = eolchar;
1719 size_t line_bytes = buf->line_bytes;
1720 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1722 if (buf->eof)
1723 return false;
1725 if (buf->used != buf->left)
1727 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1728 buf->used = buf->left;
1729 buf->nlines = 0;
1732 while (true)
1734 char *ptr = buf->buf + buf->used;
1735 struct line *linelim = buffer_linelim (buf);
1736 struct line *line = linelim - buf->nlines;
1737 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1738 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1740 while (line_bytes + 1 < avail)
1742 /* Read as many bytes as possible, but do not read so many
1743 bytes that there might not be enough room for the
1744 corresponding line array. The worst case is when the
1745 rest of the input file consists entirely of newlines,
1746 except that the last byte is not a newline. */
1747 size_t readsize = (avail - 1) / (line_bytes + 1);
1748 size_t bytes_read = fread (ptr, 1, readsize, fp);
1749 char *ptrlim = ptr + bytes_read;
1750 char *p;
1751 avail -= bytes_read;
1753 if (bytes_read != readsize)
1755 if (ferror (fp))
1756 die (_("read failed"), file);
1757 if (feof (fp))
1759 buf->eof = true;
1760 if (buf->buf == ptrlim)
1761 return false;
1762 if (line_start != ptrlim && ptrlim[-1] != eol)
1763 *ptrlim++ = eol;
1767 /* Find and record each line in the just-read input. */
1768 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1770 /* Delimit the line with NUL. This eliminates the need to
1771 temporarily replace the last byte with NUL when calling
1772 xmemcoll(), which increases performance. */
1773 *p = '\0';
1774 ptr = p + 1;
1775 line--;
1776 line->text = line_start;
1777 line->length = ptr - line_start;
1778 mergesize = MAX (mergesize, line->length);
1779 avail -= line_bytes;
1781 if (key)
1783 /* Precompute the position of the first key for
1784 efficiency. */
1785 line->keylim = (key->eword == SIZE_MAX
1787 : limfield (line, key));
1789 if (key->sword != SIZE_MAX)
1790 line->keybeg = begfield (line, key);
1791 else
1793 if (key->skipsblanks)
1794 while (blanks[to_uchar (*line_start)])
1795 line_start++;
1796 line->keybeg = line_start;
1800 line_start = ptr;
1803 ptr = ptrlim;
1804 if (buf->eof)
1805 break;
1808 buf->used = ptr - buf->buf;
1809 buf->nlines = buffer_linelim (buf) - line;
1810 if (buf->nlines != 0)
1812 buf->left = ptr - line_start;
1813 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1814 return true;
1818 /* The current input line is too long to fit in the buffer.
1819 Increase the buffer size and try again, keeping it properly
1820 aligned. */
1821 size_t line_alloc = buf->alloc / sizeof (struct line);
1822 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1823 buf->alloc = line_alloc * sizeof (struct line);
1828 /* Table that maps characters to order-of-magnitude values. */
1829 static char const unit_order[UCHAR_LIM] =
1831 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1832 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1833 /* This initializer syntax works on all C99 hosts. For now, use
1834 it only on non-ASCII hosts, to ease the pain of porting to
1835 pre-C99 ASCII hosts. */
1836 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1837 ['k']=1,
1838 #else
1839 /* Generate the following table with this command:
1840 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1841 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1842 |fmt */
1843 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1844 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1845 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1846 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1847 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1848 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1849 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1851 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1852 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1853 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1854 #endif
1857 /* Return an integer that represents the order of magnitude of the
1858 unit following the number. The number may contain thousands
1859 separators and a decimal point, but it may not contain leading blanks.
1860 Negative numbers get negative orders; zero numbers have a zero order. */
1862 static int _GL_ATTRIBUTE_PURE
1863 find_unit_order (char const *number)
1865 bool minus_sign = (*number == '-');
1866 char const *p = number + minus_sign;
1867 int nonzero = 0;
1868 unsigned char ch;
1870 /* Scan to end of number.
1871 Decimals or separators not followed by digits stop the scan.
1872 Numbers ending in decimals or separators are thus considered
1873 to be lacking in units.
1874 FIXME: add support for multibyte thousands_sep and decimal_point. */
1878 while (ISDIGIT (ch = *p++))
1879 nonzero |= ch - '0';
1881 while (ch == thousands_sep);
1883 if (ch == decimal_point)
1884 while (ISDIGIT (ch = *p++))
1885 nonzero |= ch - '0';
1887 if (nonzero)
1889 int order = unit_order[ch];
1890 return (minus_sign ? -order : order);
1892 else
1893 return 0;
1896 /* Compare numbers A and B ending in units with SI or IEC prefixes
1897 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1899 static int
1900 human_numcompare (char const *a, char const *b)
1902 while (blanks[to_uchar (*a)])
1903 a++;
1904 while (blanks[to_uchar (*b)])
1905 b++;
1907 int diff = find_unit_order (a) - find_unit_order (b);
1908 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1911 /* Compare strings A and B as numbers without explicitly converting them to
1912 machine numbers. Comparatively slow for short strings, but asymptotically
1913 hideously fast. */
1915 static int
1916 numcompare (char const *a, char const *b)
1918 while (blanks[to_uchar (*a)])
1919 a++;
1920 while (blanks[to_uchar (*b)])
1921 b++;
1923 return strnumcmp (a, b, decimal_point, thousands_sep);
1926 /* Work around a problem whereby the long double value returned by glibc's
1927 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1928 A and B before calling strtold. FIXME: remove this function once
1929 gnulib guarantees that strtold's result is always well defined. */
1930 static int
1931 nan_compare (char const *sa, char const *sb)
1933 long_double a;
1934 memset (&a, 0, sizeof a);
1935 a = strtold (sa, NULL);
1937 long_double b;
1938 memset (&b, 0, sizeof b);
1939 b = strtold (sb, NULL);
1941 return memcmp (&a, &b, sizeof a);
1944 static int
1945 general_numcompare (char const *sa, char const *sb)
1947 /* FIXME: maybe add option to try expensive FP conversion
1948 only if A and B can't be compared more cheaply/accurately. */
1950 char *ea;
1951 char *eb;
1952 long_double a = strtold (sa, &ea);
1953 long_double b = strtold (sb, &eb);
1955 /* Put conversion errors at the start of the collating sequence. */
1956 if (sa == ea)
1957 return sb == eb ? 0 : -1;
1958 if (sb == eb)
1959 return 1;
1961 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1962 conversion errors but before numbers; sort them by internal
1963 bit-pattern, for lack of a more portable alternative. */
1964 return (a < b ? -1
1965 : a > b ? 1
1966 : a == b ? 0
1967 : b == b ? -1
1968 : a == a ? 1
1969 : nan_compare (sa, sb));
1972 /* Return an integer in 1..12 of the month name MONTH.
1973 Return 0 if the name in S is not recognized. */
1975 static int
1976 getmonth (char const *month, char **ea)
1978 size_t lo = 0;
1979 size_t hi = MONTHS_PER_YEAR;
1981 while (blanks[to_uchar (*month)])
1982 month++;
1986 size_t ix = (lo + hi) / 2;
1987 char const *m = month;
1988 char const *n = monthtab[ix].name;
1990 for (;; m++, n++)
1992 if (!*n)
1994 if (ea)
1995 *ea = (char *) m;
1996 return monthtab[ix].val;
1998 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2000 hi = ix;
2001 break;
2003 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2005 lo = ix + 1;
2006 break;
2010 while (lo < hi);
2012 return 0;
2015 /* A randomly chosen MD5 state, used for random comparison. */
2016 static struct md5_ctx random_md5_state;
2018 /* Initialize the randomly chosen MD5 state. */
2020 static void
2021 random_md5_state_init (char const *random_source)
2023 unsigned char buf[MD5_DIGEST_SIZE];
2024 struct randread_source *r = randread_new (random_source, sizeof buf);
2025 if (! r)
2026 die (_("open failed"), random_source);
2027 randread (r, buf, sizeof buf);
2028 if (randread_free (r) != 0)
2029 die (_("close failed"), random_source);
2030 md5_init_ctx (&random_md5_state);
2031 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2034 /* This is like strxfrm, except it reports any error and exits. */
2036 static size_t
2037 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2039 errno = 0;
2040 size_t translated_size = strxfrm (dest, src, destsize);
2042 if (errno)
2044 error (0, errno, _("string transformation failed"));
2045 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2046 error (SORT_FAILURE, 0,
2047 _("the untransformed string was %s"),
2048 quotearg_n_style (0, locale_quoting_style, src));
2051 return translated_size;
2054 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2055 using one or more random hash functions. TEXTA[LENA] and
2056 TEXTB[LENB] must be zero. */
2058 static int
2059 compare_random (char *restrict texta, size_t lena,
2060 char *restrict textb, size_t lenb)
2062 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2063 data. This is used to break ties if there is a checksum
2064 collision, and this is good enough given the astronomically low
2065 probability of a collision. */
2066 int xfrm_diff = 0;
2068 char stackbuf[4000];
2069 char *buf = stackbuf;
2070 size_t bufsize = sizeof stackbuf;
2071 void *allocated = NULL;
2072 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2073 struct md5_ctx s[2];
2074 s[0] = s[1] = random_md5_state;
2076 if (hard_LC_COLLATE)
2078 char const *lima = texta + lena;
2079 char const *limb = textb + lenb;
2081 while (true)
2083 /* Transform the text into the basis of comparison, so that byte
2084 strings that would otherwise considered to be equal are
2085 considered equal here even if their bytes differ.
2087 Each time through this loop, transform one
2088 null-terminated string's worth from TEXTA or from TEXTB
2089 or both. That way, there's no need to store the
2090 transformation of the whole line, if it contains many
2091 null-terminated strings. */
2093 /* Store the transformed data into a big-enough buffer. */
2095 /* A 3X size guess avoids the overhead of calling strxfrm
2096 twice on typical implementations. Don't worry about
2097 size_t overflow, as the guess need not be correct. */
2098 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2099 if (bufsize < guess_bufsize)
2101 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2102 free (allocated);
2103 buf = allocated = malloc (bufsize);
2104 if (! buf)
2106 buf = stackbuf;
2107 bufsize = sizeof stackbuf;
2111 size_t sizea =
2112 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2113 bool a_fits = sizea <= bufsize;
2114 size_t sizeb =
2115 (textb < limb
2116 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2117 (a_fits ? bufsize - sizea : 0))
2118 + 1)
2119 : 0);
2121 if (! (a_fits && sizea + sizeb <= bufsize))
2123 bufsize = sizea + sizeb;
2124 if (bufsize < SIZE_MAX / 3)
2125 bufsize = bufsize * 3 / 2;
2126 free (allocated);
2127 buf = allocated = xmalloc (bufsize);
2128 if (texta < lima)
2129 strxfrm (buf, texta, sizea);
2130 if (textb < limb)
2131 strxfrm (buf + sizea, textb, sizeb);
2134 /* Advance past NULs to the next part of each input string,
2135 exiting the loop if both strings are exhausted. When
2136 exiting the loop, prepare to finish off the tiebreaker
2137 comparison properly. */
2138 if (texta < lima)
2139 texta += strlen (texta) + 1;
2140 if (textb < limb)
2141 textb += strlen (textb) + 1;
2142 if (! (texta < lima || textb < limb))
2144 lena = sizea; texta = buf;
2145 lenb = sizeb; textb = buf + sizea;
2146 break;
2149 /* Accumulate the transformed data in the corresponding
2150 checksums. */
2151 md5_process_bytes (buf, sizea, &s[0]);
2152 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2154 /* Update the tiebreaker comparison of the transformed data. */
2155 if (! xfrm_diff)
2157 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2158 if (! xfrm_diff)
2159 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2164 /* Compute and compare the checksums. */
2165 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2166 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2167 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2169 /* Fall back on the tiebreaker if the checksums collide. */
2170 if (! diff)
2172 if (! xfrm_diff)
2174 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2175 if (! xfrm_diff)
2176 xfrm_diff = (lena > lenb) - (lena < lenb);
2179 diff = xfrm_diff;
2182 free (allocated);
2184 return diff;
2187 /* Return the printable width of the block of memory starting at
2188 TEXT and ending just before LIM, counting each tab as one byte.
2189 FIXME: Should we generally be counting non printable chars? */
2191 static size_t
2192 debug_width (char const *text, char const *lim)
2194 size_t width = mbsnwidth (text, lim - text, 0);
2195 while (text < lim)
2196 width += (*text++ == '\t');
2197 return width;
2200 /* For debug mode, "underline" a key at the
2201 specified offset and screen width. */
2203 static void
2204 mark_key (size_t offset, size_t width)
2206 while (offset--)
2207 putchar (' ');
2209 if (!width)
2210 printf (_("^ no match for key\n"));
2211 else
2214 putchar ('_');
2215 while (--width);
2217 putchar ('\n');
2221 /* Return true if KEY is a numeric key. */
2223 static inline bool
2224 key_numeric (struct keyfield const *key)
2226 return key->numeric || key->general_numeric || key->human_numeric;
2229 /* For LINE, output a debugging line that underlines KEY in LINE.
2230 If KEY is null, underline the whole line. */
2232 static void
2233 debug_key (struct line const *line, struct keyfield const *key)
2235 char *text = line->text;
2236 char *beg = text;
2237 char *lim = text + line->length - 1;
2239 if (key)
2241 if (key->sword != SIZE_MAX)
2242 beg = begfield (line, key);
2243 if (key->eword != SIZE_MAX)
2244 lim = limfield (line, key);
2246 if (key->skipsblanks || key->month || key_numeric (key))
2248 char saved = *lim;
2249 *lim = '\0';
2251 while (blanks[to_uchar (*beg)])
2252 beg++;
2254 char *tighter_lim = beg;
2256 if (lim < beg)
2257 tighter_lim = lim;
2258 else if (key->month)
2259 getmonth (beg, &tighter_lim);
2260 else if (key->general_numeric)
2261 ignore_value (strtold (beg, &tighter_lim));
2262 else if (key->numeric || key->human_numeric)
2264 char *p = beg + (beg < lim && *beg == '-');
2265 bool found_digit = false;
2266 unsigned char ch;
2270 while (ISDIGIT (ch = *p++))
2271 found_digit = true;
2273 while (ch == thousands_sep);
2275 if (ch == decimal_point)
2276 while (ISDIGIT (ch = *p++))
2277 found_digit = true;
2279 if (found_digit)
2280 tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
2282 else
2283 tighter_lim = lim;
2285 *lim = saved;
2286 lim = tighter_lim;
2290 size_t offset = debug_width (text, beg);
2291 size_t width = debug_width (beg, lim);
2292 mark_key (offset, width);
2295 /* Debug LINE by underlining its keys. */
2297 static void
2298 debug_line (struct line const *line)
2300 struct keyfield const *key = keylist;
2303 debug_key (line, key);
2304 while (key && ((key = key->next) || ! (unique || stable)));
2307 /* Return whether sorting options specified for key. */
2309 static bool
2310 default_key_compare (struct keyfield const *key)
2312 return ! (key->ignore
2313 || key->translate
2314 || key->skipsblanks
2315 || key->skipeblanks
2316 || key_numeric (key)
2317 || key->month
2318 || key->version
2319 || key->random
2320 /* || key->reverse */
2324 /* Convert a key to the short options used to specify it. */
2326 static void
2327 key_to_opts (struct keyfield const *key, char *opts)
2329 if (key->skipsblanks || key->skipeblanks)
2330 *opts++ = 'b';/* either disables global -b */
2331 if (key->ignore == nondictionary)
2332 *opts++ = 'd';
2333 if (key->translate)
2334 *opts++ = 'f';
2335 if (key->general_numeric)
2336 *opts++ = 'g';
2337 if (key->human_numeric)
2338 *opts++ = 'h';
2339 if (key->ignore == nonprinting)
2340 *opts++ = 'i';
2341 if (key->month)
2342 *opts++ = 'M';
2343 if (key->numeric)
2344 *opts++ = 'n';
2345 if (key->random)
2346 *opts++ = 'R';
2347 if (key->reverse)
2348 *opts++ = 'r';
2349 if (key->version)
2350 *opts++ = 'V';
2351 *opts = '\0';
2354 /* Output data independent key warnings to stderr. */
2356 static void
2357 key_warnings (struct keyfield const *gkey, bool gkey_only)
2359 struct keyfield const *key;
2360 struct keyfield ugkey = *gkey;
2361 unsigned long keynum = 1;
2363 for (key = keylist; key; key = key->next, keynum++)
2365 if (key->obsolete_used)
2367 size_t sword = key->sword;
2368 size_t eword = key->eword;
2369 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2370 /* obsolescent syntax +A.x -B.y is equivalent to:
2371 -k A+1.x+1,B.y (when y = 0)
2372 -k A+1.x+1,B+1.y (when y > 0) */
2373 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2374 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2375 char *po = obuf;
2376 char *pn = nbuf;
2378 if (sword == SIZE_MAX)
2379 sword++;
2381 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2382 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2383 if (key->eword != SIZE_MAX)
2385 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2386 stpcpy (stpcpy (pn, ","),
2387 umaxtostr (eword + 1
2388 + (key->echar == SIZE_MAX), tmp));
2390 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2391 quote_n (0, obuf), quote_n (1, nbuf));
2394 /* Warn about field specs that will never match. */
2395 if (key->sword != SIZE_MAX && key->eword < key->sword)
2396 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2398 /* Warn about significant leading blanks. */
2399 bool implicit_skip = key_numeric (key) || key->month;
2400 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2401 && !(key->schar || key->echar);
2402 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2403 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2404 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2405 || (!key->skipsblanks && key->schar)
2406 || (!key->skipeblanks && key->echar)))
2407 error (0, 0, _("leading blanks are significant in key %lu; "
2408 "consider also specifying 'b'"), keynum);
2410 /* Warn about numeric comparisons spanning fields,
2411 as field delimiters could be interpreted as part
2412 of the number (maybe only in other locales). */
2413 if (!gkey_only && key_numeric (key))
2415 size_t sword = key->sword + 1;
2416 size_t eword = key->eword + 1;
2417 if (!sword)
2418 sword++;
2419 if (!eword || sword < eword)
2420 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2421 keynum);
2424 /* Flag global options not copied or specified in any key. */
2425 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2426 ugkey.ignore = NULL;
2427 if (ugkey.translate && (ugkey.translate == key->translate))
2428 ugkey.translate = NULL;
2429 ugkey.skipsblanks &= !key->skipsblanks;
2430 ugkey.skipeblanks &= !key->skipeblanks;
2431 ugkey.month &= !key->month;
2432 ugkey.numeric &= !key->numeric;
2433 ugkey.general_numeric &= !key->general_numeric;
2434 ugkey.human_numeric &= !key->human_numeric;
2435 ugkey.random &= !key->random;
2436 ugkey.version &= !key->version;
2437 ugkey.reverse &= !key->reverse;
2440 /* Warn about ignored global options flagged above.
2441 Note if gkey is the only one in the list, all flags are cleared. */
2442 if (!default_key_compare (&ugkey)
2443 || (ugkey.reverse && (stable || unique) && keylist))
2445 bool ugkey_reverse = ugkey.reverse;
2446 if (!(stable || unique))
2447 ugkey.reverse = false;
2448 /* The following is too big, but guaranteed to be "big enough". */
2449 char opts[sizeof short_options];
2450 key_to_opts (&ugkey, opts);
2451 error (0, 0,
2452 ngettext ("option '-%s' is ignored",
2453 "options '-%s' are ignored",
2454 select_plural (strlen (opts))), opts);
2455 ugkey.reverse = ugkey_reverse;
2457 if (ugkey.reverse && !(stable || unique) && keylist)
2458 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2461 /* Compare two lines A and B trying every key in sequence until there
2462 are no more keys or a difference is found. */
2464 static int
2465 keycompare (struct line const *a, struct line const *b)
2467 struct keyfield *key = keylist;
2469 /* For the first iteration only, the key positions have been
2470 precomputed for us. */
2471 char *texta = a->keybeg;
2472 char *textb = b->keybeg;
2473 char *lima = a->keylim;
2474 char *limb = b->keylim;
2476 int diff;
2478 while (true)
2480 char const *translate = key->translate;
2481 bool const *ignore = key->ignore;
2483 /* Treat field ends before field starts as empty fields. */
2484 lima = MAX (texta, lima);
2485 limb = MAX (textb, limb);
2487 /* Find the lengths. */
2488 size_t lena = lima - texta;
2489 size_t lenb = limb - textb;
2491 if (hard_LC_COLLATE || key_numeric (key)
2492 || key->month || key->random || key->version)
2494 char *ta;
2495 char *tb;
2496 size_t tlena;
2497 size_t tlenb;
2499 char enda IF_LINT (= 0);
2500 char endb IF_LINT (= 0);
2501 void *allocated IF_LINT (= NULL);
2502 char stackbuf[4000];
2504 if (ignore || translate)
2506 /* Compute with copies of the keys, which are the result of
2507 translating or ignoring characters, and which need their
2508 own storage. */
2510 size_t i;
2512 /* Allocate space for copies. */
2513 size_t size = lena + 1 + lenb + 1;
2514 if (size <= sizeof stackbuf)
2515 ta = stackbuf, allocated = NULL;
2516 else
2517 ta = allocated = xmalloc (size);
2518 tb = ta + lena + 1;
2520 /* Put into each copy a version of the key in which the
2521 requested characters are ignored or translated. */
2522 for (tlena = i = 0; i < lena; i++)
2523 if (! (ignore && ignore[to_uchar (texta[i])]))
2524 ta[tlena++] = (translate
2525 ? translate[to_uchar (texta[i])]
2526 : texta[i]);
2527 ta[tlena] = '\0';
2529 for (tlenb = i = 0; i < lenb; i++)
2530 if (! (ignore && ignore[to_uchar (textb[i])]))
2531 tb[tlenb++] = (translate
2532 ? translate[to_uchar (textb[i])]
2533 : textb[i]);
2534 tb[tlenb] = '\0';
2536 else
2538 /* Use the keys in-place, temporarily null-terminated. */
2539 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2540 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2543 if (key->numeric)
2544 diff = numcompare (ta, tb);
2545 else if (key->general_numeric)
2546 diff = general_numcompare (ta, tb);
2547 else if (key->human_numeric)
2548 diff = human_numcompare (ta, tb);
2549 else if (key->month)
2550 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2551 else if (key->random)
2552 diff = compare_random (ta, tlena, tb, tlenb);
2553 else if (key->version)
2554 diff = filevercmp (ta, tb);
2555 else
2557 /* Locale-dependent string sorting. This is slower than
2558 C-locale sorting, which is implemented below. */
2559 if (tlena == 0)
2560 diff = - NONZERO (tlenb);
2561 else if (tlenb == 0)
2562 diff = 1;
2563 else
2564 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2567 if (ignore || translate)
2568 free (allocated);
2569 else
2571 ta[tlena] = enda;
2572 tb[tlenb] = endb;
2575 else if (ignore)
2577 #define CMP_WITH_IGNORE(A, B) \
2578 do \
2580 while (true) \
2582 while (texta < lima && ignore[to_uchar (*texta)]) \
2583 ++texta; \
2584 while (textb < limb && ignore[to_uchar (*textb)]) \
2585 ++textb; \
2586 if (! (texta < lima && textb < limb)) \
2587 break; \
2588 diff = to_uchar (A) - to_uchar (B); \
2589 if (diff) \
2590 goto not_equal; \
2591 ++texta; \
2592 ++textb; \
2595 diff = (texta < lima) - (textb < limb); \
2597 while (0)
2599 if (translate)
2600 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2601 translate[to_uchar (*textb)]);
2602 else
2603 CMP_WITH_IGNORE (*texta, *textb);
2605 else if (lena == 0)
2606 diff = - NONZERO (lenb);
2607 else if (lenb == 0)
2608 goto greater;
2609 else
2611 if (translate)
2613 while (texta < lima && textb < limb)
2615 diff = (to_uchar (translate[to_uchar (*texta++)])
2616 - to_uchar (translate[to_uchar (*textb++)]));
2617 if (diff)
2618 goto not_equal;
2621 else
2623 diff = memcmp (texta, textb, MIN (lena, lenb));
2624 if (diff)
2625 goto not_equal;
2627 diff = lena < lenb ? -1 : lena != lenb;
2630 if (diff)
2631 goto not_equal;
2633 key = key->next;
2634 if (! key)
2635 break;
2637 /* Find the beginning and limit of the next field. */
2638 if (key->eword != SIZE_MAX)
2639 lima = limfield (a, key), limb = limfield (b, key);
2640 else
2641 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2643 if (key->sword != SIZE_MAX)
2644 texta = begfield (a, key), textb = begfield (b, key);
2645 else
2647 texta = a->text, textb = b->text;
2648 if (key->skipsblanks)
2650 while (texta < lima && blanks[to_uchar (*texta)])
2651 ++texta;
2652 while (textb < limb && blanks[to_uchar (*textb)])
2653 ++textb;
2658 return 0;
2660 greater:
2661 diff = 1;
2662 not_equal:
2663 return key->reverse ? -diff : diff;
2666 /* Compare two lines A and B, returning negative, zero, or positive
2667 depending on whether A compares less than, equal to, or greater than B. */
2669 static int
2670 compare (struct line const *a, struct line const *b)
2672 int diff;
2673 size_t alen, blen;
2675 /* First try to compare on the specified keys (if any).
2676 The only two cases with no key at all are unadorned sort,
2677 and unadorned sort -r. */
2678 if (keylist)
2680 diff = keycompare (a, b);
2681 if (diff || unique || stable)
2682 return diff;
2685 /* If the keys all compare equal (or no keys were specified)
2686 fall through to the default comparison. */
2687 alen = a->length - 1, blen = b->length - 1;
2689 if (alen == 0)
2690 diff = - NONZERO (blen);
2691 else if (blen == 0)
2692 diff = 1;
2693 else if (hard_LC_COLLATE)
2695 /* Note xmemcoll0 is a performance enhancement as
2696 it will not unconditionally write '\0' after the
2697 passed in buffers, which was seen to give around
2698 a 3% increase in performance for short lines. */
2699 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2701 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2702 diff = alen < blen ? -1 : alen != blen;
2704 return reverse ? -diff : diff;
2707 /* Write LINE to output stream FP; the output file's name is
2708 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2709 otherwise. If debugging is enabled and FP is standard output,
2710 append some debugging information. */
2712 static void
2713 write_line (struct line const *line, FILE *fp, char const *output_file)
2715 char *buf = line->text;
2716 size_t n_bytes = line->length;
2717 char *ebuf = buf + n_bytes;
2719 if (!output_file && debug)
2721 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2722 char const *c = buf;
2724 while (c < ebuf)
2726 char wc = *c++;
2727 if (wc == '\t')
2728 wc = '>';
2729 else if (c == ebuf)
2730 wc = '\n';
2731 if (fputc (wc, fp) == EOF)
2732 die (_("write failed"), output_file);
2735 debug_line (line);
2737 else
2739 ebuf[-1] = eolchar;
2740 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2741 die (_("write failed"), output_file);
2742 ebuf[-1] = '\0';
2746 /* Check that the lines read from FILE_NAME come in order. Return
2747 true if they are in order. If CHECKONLY == 'c', also print a
2748 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2749 they are not in order. */
2751 static bool
2752 check (char const *file_name, char checkonly)
2754 FILE *fp = xfopen (file_name, "r");
2755 struct buffer buf; /* Input buffer. */
2756 struct line temp; /* Copy of previous line. */
2757 size_t alloc = 0;
2758 uintmax_t line_number = 0;
2759 struct keyfield const *key = keylist;
2760 bool nonunique = ! unique;
2761 bool ordered = true;
2763 initbuf (&buf, sizeof (struct line),
2764 MAX (merge_buffer_size, sort_size));
2765 temp.text = NULL;
2767 while (fillbuf (&buf, fp, file_name))
2769 struct line const *line = buffer_linelim (&buf);
2770 struct line const *linebase = line - buf.nlines;
2772 /* Make sure the line saved from the old buffer contents is
2773 less than or equal to the first line of the new buffer. */
2774 if (alloc && nonunique <= compare (&temp, line - 1))
2776 found_disorder:
2778 if (checkonly == 'c')
2780 struct line const *disorder_line = line - 1;
2781 uintmax_t disorder_line_number =
2782 buffer_linelim (&buf) - disorder_line + line_number;
2783 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2784 fprintf (stderr, _("%s: %s:%s: disorder: "),
2785 program_name, file_name,
2786 umaxtostr (disorder_line_number, hr_buf));
2787 write_line (disorder_line, stderr, _("standard error"));
2790 ordered = false;
2791 break;
2795 /* Compare each line in the buffer with its successor. */
2796 while (linebase < --line)
2797 if (nonunique <= compare (line, line - 1))
2798 goto found_disorder;
2800 line_number += buf.nlines;
2802 /* Save the last line of the buffer. */
2803 if (alloc < line->length)
2807 alloc *= 2;
2808 if (! alloc)
2810 alloc = line->length;
2811 break;
2814 while (alloc < line->length);
2816 free (temp.text);
2817 temp.text = xmalloc (alloc);
2819 memcpy (temp.text, line->text, line->length);
2820 temp.length = line->length;
2821 if (key)
2823 temp.keybeg = temp.text + (line->keybeg - line->text);
2824 temp.keylim = temp.text + (line->keylim - line->text);
2828 xfclose (fp, file_name);
2829 free (buf.buf);
2830 free (temp.text);
2831 return ordered;
2834 /* Open FILES (there are NFILES of them) and store the resulting array
2835 of stream pointers into (*PFPS). Allocate the array. Return the
2836 number of successfully opened files, setting errno if this value is
2837 less than NFILES. */
2839 static size_t
2840 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2842 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2843 int i;
2845 /* Open as many input files as we can. */
2846 for (i = 0; i < nfiles; i++)
2848 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2849 ? open_temp (files[i].temp)
2850 : stream_open (files[i].name, "r"));
2851 if (!fps[i])
2852 break;
2855 return i;
2858 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2859 files (all of which are at the start of the FILES array), and
2860 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2861 FPS is the vector of open stream corresponding to the files.
2862 Close input and output streams before returning.
2863 OUTPUT_FILE gives the name of the output file. If it is NULL,
2864 the output file is standard output. */
2866 static void
2867 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2868 FILE *ofp, char const *output_file, FILE **fps)
2870 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2871 /* Input buffers for each file. */
2872 struct line saved; /* Saved line storage for unique check. */
2873 struct line const *savedline = NULL;
2874 /* &saved if there is a saved line. */
2875 size_t savealloc = 0; /* Size allocated for the saved line. */
2876 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2877 /* Current line in each line table. */
2878 struct line const **base = xnmalloc (nfiles, sizeof *base);
2879 /* Base of each line table. */
2880 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2881 /* Table representing a permutation of fps,
2882 such that cur[ord[0]] is the smallest line
2883 and will be next output. */
2884 size_t i;
2885 size_t j;
2886 size_t t;
2887 struct keyfield const *key = keylist;
2888 saved.text = NULL;
2890 /* Read initial lines from each input file. */
2891 for (i = 0; i < nfiles; )
2893 initbuf (&buffer[i], sizeof (struct line),
2894 MAX (merge_buffer_size, sort_size / nfiles));
2895 if (fillbuf (&buffer[i], fps[i], files[i].name))
2897 struct line const *linelim = buffer_linelim (&buffer[i]);
2898 cur[i] = linelim - 1;
2899 base[i] = linelim - buffer[i].nlines;
2900 i++;
2902 else
2904 /* fps[i] is empty; eliminate it from future consideration. */
2905 xfclose (fps[i], files[i].name);
2906 if (i < ntemps)
2908 ntemps--;
2909 zaptemp (files[i].name);
2911 free (buffer[i].buf);
2912 --nfiles;
2913 for (j = i; j < nfiles; ++j)
2915 files[j] = files[j + 1];
2916 fps[j] = fps[j + 1];
2921 /* Set up the ord table according to comparisons among input lines.
2922 Since this only reorders two items if one is strictly greater than
2923 the other, it is stable. */
2924 for (i = 0; i < nfiles; ++i)
2925 ord[i] = i;
2926 for (i = 1; i < nfiles; ++i)
2927 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2928 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2930 /* Repeatedly output the smallest line until no input remains. */
2931 while (nfiles)
2933 struct line const *smallest = cur[ord[0]];
2935 /* If uniquified output is turned on, output only the first of
2936 an identical series of lines. */
2937 if (unique)
2939 if (savedline && compare (savedline, smallest))
2941 savedline = NULL;
2942 write_line (&saved, ofp, output_file);
2944 if (!savedline)
2946 savedline = &saved;
2947 if (savealloc < smallest->length)
2950 if (! savealloc)
2952 savealloc = smallest->length;
2953 break;
2955 while ((savealloc *= 2) < smallest->length);
2957 free (saved.text);
2958 saved.text = xmalloc (savealloc);
2960 saved.length = smallest->length;
2961 memcpy (saved.text, smallest->text, saved.length);
2962 if (key)
2964 saved.keybeg =
2965 saved.text + (smallest->keybeg - smallest->text);
2966 saved.keylim =
2967 saved.text + (smallest->keylim - smallest->text);
2971 else
2972 write_line (smallest, ofp, output_file);
2974 /* Check if we need to read more lines into core. */
2975 if (base[ord[0]] < smallest)
2976 cur[ord[0]] = smallest - 1;
2977 else
2979 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2981 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2982 cur[ord[0]] = linelim - 1;
2983 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2985 else
2987 /* We reached EOF on fps[ord[0]]. */
2988 for (i = 1; i < nfiles; ++i)
2989 if (ord[i] > ord[0])
2990 --ord[i];
2991 --nfiles;
2992 xfclose (fps[ord[0]], files[ord[0]].name);
2993 if (ord[0] < ntemps)
2995 ntemps--;
2996 zaptemp (files[ord[0]].name);
2998 free (buffer[ord[0]].buf);
2999 for (i = ord[0]; i < nfiles; ++i)
3001 fps[i] = fps[i + 1];
3002 files[i] = files[i + 1];
3003 buffer[i] = buffer[i + 1];
3004 cur[i] = cur[i + 1];
3005 base[i] = base[i + 1];
3007 for (i = 0; i < nfiles; ++i)
3008 ord[i] = ord[i + 1];
3009 continue;
3013 /* The new line just read in may be larger than other lines
3014 already in main memory; push it back in the queue until we
3015 encounter a line larger than it. Optimize for the common
3016 case where the new line is smallest. */
3018 size_t lo = 1;
3019 size_t hi = nfiles;
3020 size_t probe = lo;
3021 size_t ord0 = ord[0];
3022 size_t count_of_smaller_lines;
3024 while (lo < hi)
3026 int cmp = compare (cur[ord0], cur[ord[probe]]);
3027 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3028 hi = probe;
3029 else
3030 lo = probe + 1;
3031 probe = (lo + hi) / 2;
3034 count_of_smaller_lines = lo - 1;
3035 for (j = 0; j < count_of_smaller_lines; j++)
3036 ord[j] = ord[j + 1];
3037 ord[count_of_smaller_lines] = ord0;
3041 if (unique && savedline)
3043 write_line (&saved, ofp, output_file);
3044 free (saved.text);
3047 xfclose (ofp, output_file);
3048 free (fps);
3049 free (buffer);
3050 free (ord);
3051 free (base);
3052 free (cur);
3055 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3056 files (all of which are at the start of the FILES array), and
3057 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3058 Close input and output files before returning.
3059 OUTPUT_FILE gives the name of the output file.
3061 Return the number of files successfully merged. This number can be
3062 less than NFILES if we ran low on file descriptors, but in this
3063 case it is never less than 2. */
3065 static size_t
3066 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3067 FILE *ofp, char const *output_file)
3069 FILE **fps;
3070 size_t nopened = open_input_files (files, nfiles, &fps);
3071 if (nopened < nfiles && nopened < 2)
3072 die (_("open failed"), files[nopened].name);
3073 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3074 return nopened;
3077 /* Merge into T (of size NLINES) the two sorted arrays of lines
3078 LO (with NLINES / 2 members), and
3079 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3080 T and LO point just past their respective arrays, and the arrays
3081 are in reverse order. NLINES must be at least 2. */
3083 static void
3084 mergelines (struct line *restrict t, size_t nlines,
3085 struct line const *restrict lo)
3087 size_t nlo = nlines / 2;
3088 size_t nhi = nlines - nlo;
3089 struct line *hi = t - nlo;
3091 while (true)
3092 if (compare (lo - 1, hi - 1) <= 0)
3094 *--t = *--lo;
3095 if (! --nlo)
3097 /* HI must equal T now, and there is no need to copy from
3098 HI to T. */
3099 return;
3102 else
3104 *--t = *--hi;
3105 if (! --nhi)
3108 *--t = *--lo;
3109 while (--nlo);
3111 return;
3116 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3117 Do this all within one thread. NLINES must be at least 2.
3118 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3119 Otherwise the sort is in-place and TEMP is half-sized.
3120 The input and output arrays are in reverse order, and LINES and
3121 TEMP point just past the end of their respective arrays.
3123 Use a recursive divide-and-conquer algorithm, in the style
3124 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3125 the optimization suggested by exercise 5.2.4-10; this requires room
3126 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3127 writes that this memory optimization was originally published by
3128 D. A. Bell, Comp J. 1 (1958), 75. */
3130 static void
3131 sequential_sort (struct line *restrict lines, size_t nlines,
3132 struct line *restrict temp, bool to_temp)
3134 if (nlines == 2)
3136 /* Declare 'swap' as int, not bool, to work around a bug
3137 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3138 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3139 int swap = (0 < compare (&lines[-1], &lines[-2]));
3140 if (to_temp)
3142 temp[-1] = lines[-1 - swap];
3143 temp[-2] = lines[-2 + swap];
3145 else if (swap)
3147 temp[-1] = lines[-1];
3148 lines[-1] = lines[-2];
3149 lines[-2] = temp[-1];
3152 else
3154 size_t nlo = nlines / 2;
3155 size_t nhi = nlines - nlo;
3156 struct line *lo = lines;
3157 struct line *hi = lines - nlo;
3159 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3160 if (1 < nlo)
3161 sequential_sort (lo, nlo, temp, !to_temp);
3162 else if (!to_temp)
3163 temp[-1] = lo[-1];
3165 struct line *dest;
3166 struct line const *sorted_lo;
3167 if (to_temp)
3169 dest = temp;
3170 sorted_lo = lines;
3172 else
3174 dest = lines;
3175 sorted_lo = temp;
3177 mergelines (dest, nlines, sorted_lo);
3181 static struct merge_node *init_node (struct merge_node *restrict,
3182 struct merge_node *restrict,
3183 struct line *, size_t, size_t, bool);
3186 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3187 lines, with destination DEST. */
3188 static struct merge_node *
3189 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3191 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3193 struct merge_node *root = merge_tree;
3194 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3195 root->dest = NULL;
3196 root->nlo = root->nhi = nlines;
3197 root->parent = NULL;
3198 root->level = MERGE_END;
3199 root->queued = false;
3200 pthread_mutex_init (&root->lock, NULL);
3202 init_node (root, root + 1, dest, nthreads, nlines, false);
3203 return merge_tree;
3206 /* Destroy the merge tree. */
3207 static void
3208 merge_tree_destroy (struct merge_node *merge_tree)
3210 free (merge_tree);
3213 /* Initialize a merge tree node and its descendants. The node's
3214 parent is PARENT. The node and its descendants are taken from the
3215 array of nodes NODE_POOL. Their destination starts at DEST; they
3216 will consume NTHREADS threads. The total number of sort lines is
3217 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3218 its parent. */
3220 static struct merge_node *
3221 init_node (struct merge_node *restrict parent,
3222 struct merge_node *restrict node_pool,
3223 struct line *dest, size_t nthreads,
3224 size_t total_lines, bool is_lo_child)
3226 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3227 size_t nlo = nlines / 2;
3228 size_t nhi = nlines - nlo;
3229 struct line *lo = dest - total_lines;
3230 struct line *hi = lo - nlo;
3231 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3233 struct merge_node *node = node_pool++;
3234 node->lo = node->end_lo = lo;
3235 node->hi = node->end_hi = hi;
3236 node->dest = parent_end;
3237 node->nlo = nlo;
3238 node->nhi = nhi;
3239 node->parent = parent;
3240 node->level = parent->level + 1;
3241 node->queued = false;
3242 pthread_mutex_init (&node->lock, NULL);
3244 if (nthreads > 1)
3246 size_t lo_threads = nthreads / 2;
3247 size_t hi_threads = nthreads - lo_threads;
3248 node->lo_child = node_pool;
3249 node_pool = init_node (node, node_pool, lo, lo_threads,
3250 total_lines, true);
3251 node->hi_child = node_pool;
3252 node_pool = init_node (node, node_pool, hi, hi_threads,
3253 total_lines, false);
3255 else
3257 node->lo_child = NULL;
3258 node->hi_child = NULL;
3260 return node_pool;
3264 /* Compare two merge nodes A and B for priority. */
3266 static int
3267 compare_nodes (void const *a, void const *b)
3269 struct merge_node const *nodea = a;
3270 struct merge_node const *nodeb = b;
3271 if (nodea->level == nodeb->level)
3272 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3273 return nodea->level < nodeb->level;
3276 /* Lock a merge tree NODE. */
3278 static inline void
3279 lock_node (struct merge_node *node)
3281 pthread_mutex_lock (&node->lock);
3284 /* Unlock a merge tree NODE. */
3286 static inline void
3287 unlock_node (struct merge_node *node)
3289 pthread_mutex_unlock (&node->lock);
3292 /* Destroy merge QUEUE. */
3294 static void
3295 queue_destroy (struct merge_node_queue *queue)
3297 heap_free (queue->priority_queue);
3298 pthread_cond_destroy (&queue->cond);
3299 pthread_mutex_destroy (&queue->mutex);
3302 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3303 NTHREADS threads. */
3305 static void
3306 queue_init (struct merge_node_queue *queue, size_t nthreads)
3308 /* Though it's highly unlikely all nodes are in the heap at the same
3309 time, the heap should accommodate all of them. Counting a NULL
3310 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3311 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3312 pthread_mutex_init (&queue->mutex, NULL);
3313 pthread_cond_init (&queue->cond, NULL);
3316 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3317 does not need to lock NODE. */
3319 static void
3320 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3322 pthread_mutex_lock (&queue->mutex);
3323 heap_insert (queue->priority_queue, node);
3324 node->queued = true;
3325 pthread_mutex_unlock (&queue->mutex);
3326 pthread_cond_signal (&queue->cond);
3329 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3331 static struct merge_node *
3332 queue_pop (struct merge_node_queue *queue)
3334 struct merge_node *node;
3335 pthread_mutex_lock (&queue->mutex);
3336 while (! (node = heap_remove_top (queue->priority_queue)))
3337 pthread_cond_wait (&queue->cond, &queue->mutex);
3338 pthread_mutex_unlock (&queue->mutex);
3339 lock_node (node);
3340 node->queued = false;
3341 return node;
3344 /* Output LINE to TFP, unless -u is specified and the line compares
3345 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3346 is null if TFP is standard output.
3348 This function does not save the line for comparison later, so it is
3349 appropriate only for internal sort. */
3351 static void
3352 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3354 if (unique)
3356 if (saved_line.text && ! compare (line, &saved_line))
3357 return;
3358 saved_line = *line;
3361 write_line (line, tfp, temp_output);
3364 /* Merge the lines currently available to a NODE in the binary
3365 merge tree. Merge a number of lines appropriate for this merge
3366 level, assuming TOTAL_LINES is the total number of lines.
3368 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3369 the name of TFP, or is null if TFP is standard output. */
3371 static void
3372 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3373 FILE *tfp, char const *temp_output)
3375 struct line *lo_orig = node->lo;
3376 struct line *hi_orig = node->hi;
3377 size_t to_merge = MAX_MERGE (total_lines, node->level);
3378 size_t merged_lo;
3379 size_t merged_hi;
3381 if (node->level > MERGE_ROOT)
3383 /* Merge to destination buffer. */
3384 struct line *dest = *node->dest;
3385 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3386 if (compare (node->lo - 1, node->hi - 1) <= 0)
3387 *--dest = *--node->lo;
3388 else
3389 *--dest = *--node->hi;
3391 merged_lo = lo_orig - node->lo;
3392 merged_hi = hi_orig - node->hi;
3394 if (node->nhi == merged_hi)
3395 while (node->lo != node->end_lo && to_merge--)
3396 *--dest = *--node->lo;
3397 else if (node->nlo == merged_lo)
3398 while (node->hi != node->end_hi && to_merge--)
3399 *--dest = *--node->hi;
3400 *node->dest = dest;
3402 else
3404 /* Merge directly to output. */
3405 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3407 if (compare (node->lo - 1, node->hi - 1) <= 0)
3408 write_unique (--node->lo, tfp, temp_output);
3409 else
3410 write_unique (--node->hi, tfp, temp_output);
3413 merged_lo = lo_orig - node->lo;
3414 merged_hi = hi_orig - node->hi;
3416 if (node->nhi == merged_hi)
3418 while (node->lo != node->end_lo && to_merge--)
3419 write_unique (--node->lo, tfp, temp_output);
3421 else if (node->nlo == merged_lo)
3423 while (node->hi != node->end_hi && to_merge--)
3424 write_unique (--node->hi, tfp, temp_output);
3428 /* Update NODE. */
3429 merged_lo = lo_orig - node->lo;
3430 merged_hi = hi_orig - node->hi;
3431 node->nlo -= merged_lo;
3432 node->nhi -= merged_hi;
3435 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3436 NODE's children has available lines and the other either has
3437 available lines or has exhausted its lines. */
3439 static void
3440 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3442 if (! node->queued)
3444 bool lo_avail = (node->lo - node->end_lo) != 0;
3445 bool hi_avail = (node->hi - node->end_hi) != 0;
3446 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3447 queue_insert (queue, node);
3451 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3453 static void
3454 queue_check_insert_parent (struct merge_node_queue *queue,
3455 struct merge_node *node)
3457 if (node->level > MERGE_ROOT)
3459 lock_node (node->parent);
3460 queue_check_insert (queue, node->parent);
3461 unlock_node (node->parent);
3463 else if (node->nlo + node->nhi == 0)
3465 /* If the MERGE_ROOT NODE has finished merging, insert the
3466 MERGE_END node. */
3467 queue_insert (queue, node->parent);
3471 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3472 some of those lines, until the MERGE_END node is popped.
3473 TOTAL_LINES is the total number of lines. If merging at the top
3474 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3475 null if TFP is standard output. */
3477 static void
3478 merge_loop (struct merge_node_queue *queue,
3479 size_t total_lines, FILE *tfp, char const *temp_output)
3481 while (1)
3483 struct merge_node *node = queue_pop (queue);
3485 if (node->level == MERGE_END)
3487 unlock_node (node);
3488 /* Reinsert so other threads can pop it. */
3489 queue_insert (queue, node);
3490 break;
3492 mergelines_node (node, total_lines, tfp, temp_output);
3493 queue_check_insert (queue, node);
3494 queue_check_insert_parent (queue, node);
3496 unlock_node (node);
3501 static void sortlines (struct line *restrict, size_t, size_t,
3502 struct merge_node *, struct merge_node_queue *,
3503 FILE *, char const *);
3505 /* Thread arguments for sortlines_thread. */
3507 struct thread_args
3509 /* Source, i.e., the array of lines to sort. This points just past
3510 the end of the array. */
3511 struct line *lines;
3513 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3514 size_t nthreads;
3516 /* Number of lines in LINES and DEST. */
3517 size_t const total_lines;
3519 /* Merge node. Lines from this node and this node's sibling will merged
3520 to this node's parent. */
3521 struct merge_node *const node;
3523 /* The priority queue controlling available work for the entire
3524 internal sort. */
3525 struct merge_node_queue *const queue;
3527 /* If at the top level, the file to output to, and the file's name.
3528 If the file is standard output, the file's name is null. */
3529 FILE *tfp;
3530 char const *output_temp;
3533 /* Like sortlines, except with a signature acceptable to pthread_create. */
3535 static void *
3536 sortlines_thread (void *data)
3538 struct thread_args const *args = data;
3539 sortlines (args->lines, args->nthreads, args->total_lines,
3540 args->node, args->queue, args->tfp,
3541 args->output_temp);
3542 return NULL;
3545 /* Sort lines, possibly in parallel. The arguments are as in struct
3546 thread_args above.
3548 The algorithm has three phases: node creation, sequential sort,
3549 and binary merge.
3551 During node creation, sortlines recursively visits each node in the
3552 binary merge tree and creates a NODE structure corresponding to all the
3553 future line merging NODE is responsible for. For each call to
3554 sortlines, half the available threads are assigned to each recursive
3555 call, until a leaf node having only 1 available thread is reached.
3557 Each leaf node then performs two sequential sorts, one on each half of
3558 the lines it is responsible for. It records in its NODE structure that
3559 there are two sorted sublists available to merge from, and inserts its
3560 NODE into the priority queue.
3562 The binary merge phase then begins. Each thread drops into a loop
3563 where the thread retrieves a NODE from the priority queue, merges lines
3564 available to that NODE, and potentially insert NODE or its parent back
3565 into the queue if there are sufficient available lines for them to
3566 merge. This continues until all lines at all nodes of the merge tree
3567 have been merged. */
3569 static void
3570 sortlines (struct line *restrict lines, size_t nthreads,
3571 size_t total_lines, struct merge_node *node,
3572 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3574 size_t nlines = node->nlo + node->nhi;
3576 /* Calculate thread arguments. */
3577 size_t lo_threads = nthreads / 2;
3578 size_t hi_threads = nthreads - lo_threads;
3579 pthread_t thread;
3580 struct thread_args args = {lines, lo_threads, total_lines,
3581 node->lo_child, queue, tfp, temp_output};
3583 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3584 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3586 sortlines (lines - node->nlo, hi_threads, total_lines,
3587 node->hi_child, queue, tfp, temp_output);
3588 pthread_join (thread, NULL);
3590 else
3592 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3593 Sort with 1 thread. */
3594 size_t nlo = node->nlo;
3595 size_t nhi = node->nhi;
3596 struct line *temp = lines - total_lines;
3597 if (1 < nhi)
3598 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3599 if (1 < nlo)
3600 sequential_sort (lines, nlo, temp, false);
3602 /* Update merge NODE. No need to lock yet. */
3603 node->lo = lines;
3604 node->hi = lines - nlo;
3605 node->end_lo = lines - nlo;
3606 node->end_hi = lines - nlo - nhi;
3608 queue_insert (queue, node);
3609 merge_loop (queue, total_lines, tfp, temp_output);
3612 pthread_mutex_destroy (&node->lock);
3615 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3616 the same as OUTFILE. If found, replace each with the same
3617 temporary copy that can be merged into OUTFILE without destroying
3618 OUTFILE before it is completely read. This temporary copy does not
3619 count as a merge temp, so don't worry about incrementing NTEMPS in
3620 the caller; final cleanup will remove it, not zaptemp.
3622 This test ensures that an otherwise-erroneous use like
3623 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3624 It's not clear that POSIX requires this nicety.
3625 Detect common error cases, but don't try to catch obscure cases like
3626 "cat ... FILE ... | sort -m -o FILE"
3627 where traditional "sort" doesn't copy the input and where
3628 people should know that they're getting into trouble anyway.
3629 Catching these obscure cases would slow down performance in
3630 common cases. */
3632 static void
3633 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3634 size_t nfiles, char const *outfile)
3636 size_t i;
3637 bool got_outstat = false;
3638 struct stat outstat;
3639 struct tempnode *tempcopy = NULL;
3641 for (i = ntemps; i < nfiles; i++)
3643 bool is_stdin = STREQ (files[i].name, "-");
3644 bool same;
3645 struct stat instat;
3647 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3648 same = true;
3649 else
3651 if (! got_outstat)
3653 if (fstat (STDOUT_FILENO, &outstat) != 0)
3654 break;
3655 got_outstat = true;
3658 same = (((is_stdin
3659 ? fstat (STDIN_FILENO, &instat)
3660 : stat (files[i].name, &instat))
3661 == 0)
3662 && SAME_INODE (instat, outstat));
3665 if (same)
3667 if (! tempcopy)
3669 FILE *tftp;
3670 tempcopy = create_temp (&tftp);
3671 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3674 files[i].name = tempcopy->name;
3675 files[i].temp = tempcopy;
3680 /* Scan the input files to ensure all are accessible.
3681 Otherwise exit with a diagnostic.
3683 Note this will catch common issues with permissions etc.
3684 but will fail to notice issues where you can open() but not read(),
3685 like when a directory is specified on some systems.
3686 Catching these obscure cases could slow down performance in
3687 common cases. */
3689 static void
3690 check_inputs (char *const *files, size_t nfiles)
3692 size_t i;
3693 for (i = 0; i < nfiles; i++)
3695 if (STREQ (files[i], "-"))
3696 continue;
3698 if (euidaccess (files[i], R_OK) != 0)
3699 die (_("cannot read"), files[i]);
3703 /* Ensure a specified output file can be created or written to,
3704 and point stdout to it. Do not truncate the file.
3705 Exit with a diagnostic on failure. */
3707 static void
3708 check_output (char const *outfile)
3710 if (outfile)
3712 int outfd = open (outfile, O_WRONLY | O_CREAT | O_BINARY, MODE_RW_UGO);
3713 if (outfd < 0)
3714 die (_("open failed"), outfile);
3715 move_fd_or_die (outfd, STDOUT_FILENO);
3719 /* Merge the input FILES. NTEMPS is the number of files at the
3720 start of FILES that are temporary; it is zero at the top level.
3721 NFILES is the total number of files. Put the output in
3722 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3724 static void
3725 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3726 char const *output_file)
3728 while (nmerge < nfiles)
3730 /* Number of input files processed so far. */
3731 size_t in;
3733 /* Number of output files generated so far. */
3734 size_t out;
3736 /* nfiles % NMERGE; this counts input files that are left over
3737 after all full-sized merges have been done. */
3738 size_t remainder;
3740 /* Number of easily-available slots at the next loop iteration. */
3741 size_t cheap_slots;
3743 /* Do as many NMERGE-size merges as possible. In the case that
3744 nmerge is bogus, increment by the maximum number of file
3745 descriptors allowed. */
3746 for (out = in = 0; nmerge <= nfiles - in; out++)
3748 FILE *tfp;
3749 struct tempnode *temp = create_temp (&tfp);
3750 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3751 nmerge, tfp, temp->name);
3752 ntemps -= MIN (ntemps, num_merged);
3753 files[out].name = temp->name;
3754 files[out].temp = temp;
3755 in += num_merged;
3758 remainder = nfiles - in;
3759 cheap_slots = nmerge - out % nmerge;
3761 if (cheap_slots < remainder)
3763 /* So many files remain that they can't all be put into the last
3764 NMERGE-sized output window. Do one more merge. Merge as few
3765 files as possible, to avoid needless I/O. */
3766 size_t nshortmerge = remainder - cheap_slots + 1;
3767 FILE *tfp;
3768 struct tempnode *temp = create_temp (&tfp);
3769 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3770 nshortmerge, tfp, temp->name);
3771 ntemps -= MIN (ntemps, num_merged);
3772 files[out].name = temp->name;
3773 files[out++].temp = temp;
3774 in += num_merged;
3777 /* Put the remaining input files into the last NMERGE-sized output
3778 window, so they will be merged in the next pass. */
3779 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3780 ntemps += out;
3781 nfiles -= in - out;
3784 avoid_trashing_input (files, ntemps, nfiles, output_file);
3786 /* We aren't guaranteed that this final mergefiles will work, therefore we
3787 try to merge into the output, and then merge as much as we can into a
3788 temp file if we can't. Repeat. */
3790 while (true)
3792 /* Merge directly into the output file if possible. */
3793 FILE **fps;
3794 size_t nopened = open_input_files (files, nfiles, &fps);
3796 if (nopened == nfiles)
3798 FILE *ofp = stream_open (output_file, "w");
3799 if (ofp)
3801 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3802 break;
3804 if (errno != EMFILE || nopened <= 2)
3805 die (_("open failed"), output_file);
3807 else if (nopened <= 2)
3808 die (_("open failed"), files[nopened].name);
3810 /* We ran out of file descriptors. Close one of the input
3811 files, to gain a file descriptor. Then create a temporary
3812 file with our spare file descriptor. Retry if that failed
3813 (e.g., some other process could open a file between the time
3814 we closed and tried to create). */
3815 FILE *tfp;
3816 struct tempnode *temp;
3819 nopened--;
3820 xfclose (fps[nopened], files[nopened].name);
3821 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3823 while (!temp);
3825 /* Merge into the newly allocated temporary. */
3826 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3827 fps);
3828 ntemps -= MIN (ntemps, nopened);
3829 files[0].name = temp->name;
3830 files[0].temp = temp;
3832 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3833 ntemps++;
3834 nfiles -= nopened - 1;
3838 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3840 static void
3841 sort (char *const *files, size_t nfiles, char const *output_file,
3842 size_t nthreads)
3844 struct buffer buf;
3845 IF_LINT (buf.buf = NULL);
3846 size_t ntemps = 0;
3847 bool output_file_created = false;
3849 buf.alloc = 0;
3851 while (nfiles)
3853 char const *temp_output;
3854 char const *file = *files;
3855 FILE *fp = xfopen (file, "r");
3856 FILE *tfp;
3858 size_t bytes_per_line;
3859 if (nthreads > 1)
3861 /* Get log P. */
3862 size_t tmp = 1;
3863 size_t mult = 1;
3864 while (tmp < nthreads)
3866 tmp *= 2;
3867 mult++;
3869 bytes_per_line = (mult * sizeof (struct line));
3871 else
3872 bytes_per_line = sizeof (struct line) * 3 / 2;
3874 if (! buf.alloc)
3875 initbuf (&buf, bytes_per_line,
3876 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3877 buf.eof = false;
3878 files++;
3879 nfiles--;
3881 while (fillbuf (&buf, fp, file))
3883 struct line *line;
3885 if (buf.eof && nfiles
3886 && (bytes_per_line + 1
3887 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3889 /* End of file, but there is more input and buffer room.
3890 Concatenate the next input file; this is faster in
3891 the usual case. */
3892 buf.left = buf.used;
3893 break;
3896 saved_line.text = NULL;
3897 line = buffer_linelim (&buf);
3898 if (buf.eof && !nfiles && !ntemps && !buf.left)
3900 xfclose (fp, file);
3901 tfp = xfopen (output_file, "w");
3902 temp_output = output_file;
3903 output_file_created = true;
3905 else
3907 ++ntemps;
3908 temp_output = create_temp (&tfp)->name;
3910 if (1 < buf.nlines)
3912 struct merge_node_queue queue;
3913 queue_init (&queue, nthreads);
3914 struct merge_node *merge_tree =
3915 merge_tree_init (nthreads, buf.nlines, line);
3916 struct merge_node *root = merge_tree + 1;
3918 sortlines (line, nthreads, buf.nlines, root,
3919 &queue, tfp, temp_output);
3920 queue_destroy (&queue);
3921 pthread_mutex_destroy (&root->lock);
3922 merge_tree_destroy (merge_tree);
3924 else
3925 write_unique (line - 1, tfp, temp_output);
3927 xfclose (tfp, temp_output);
3929 if (output_file_created)
3930 goto finish;
3932 xfclose (fp, file);
3935 finish:
3936 free (buf.buf);
3938 if (! output_file_created)
3940 size_t i;
3941 struct tempnode *node = temphead;
3942 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3943 for (i = 0; node; i++)
3945 tempfiles[i].name = node->name;
3946 tempfiles[i].temp = node;
3947 node = node->next;
3949 merge (tempfiles, ntemps, ntemps, output_file);
3950 free (tempfiles);
3953 reap_all ();
3956 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3958 static void
3959 insertkey (struct keyfield *key_arg)
3961 struct keyfield **p;
3962 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3964 for (p = &keylist; *p; p = &(*p)->next)
3965 continue;
3966 *p = key;
3967 key->next = NULL;
3970 /* Report a bad field specification SPEC, with extra info MSGID. */
3972 static void badfieldspec (char const *, char const *)
3973 ATTRIBUTE_NORETURN;
3974 static void
3975 badfieldspec (char const *spec, char const *msgid)
3977 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3978 _(msgid), quote (spec));
3979 abort ();
3982 /* Report incompatible options. */
3984 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3985 static void
3986 incompatible_options (char const *opts)
3988 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), opts);
3989 abort ();
3992 /* Check compatibility of ordering options. */
3994 static void
3995 check_ordering_compatibility (void)
3997 struct keyfield *key;
3999 for (key = keylist; key; key = key->next)
4000 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4001 + key->month + (key->version | key->random | !!key->ignore)))
4003 /* The following is too big, but guaranteed to be "big enough". */
4004 char opts[sizeof short_options];
4005 /* Clear flags we're not interested in. */
4006 key->skipsblanks = key->skipeblanks = key->reverse = false;
4007 key_to_opts (key, opts);
4008 incompatible_options (opts);
4012 /* Parse the leading integer in STRING and store the resulting value
4013 (which must fit into size_t) into *VAL. Return the address of the
4014 suffix after the integer. If the value is too large, silently
4015 substitute SIZE_MAX. If MSGID is NULL, return NULL after
4016 failure; otherwise, report MSGID and exit on failure. */
4018 static char const *
4019 parse_field_count (char const *string, size_t *val, char const *msgid)
4021 char *suffix;
4022 uintmax_t n;
4024 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4026 case LONGINT_OK:
4027 case LONGINT_INVALID_SUFFIX_CHAR:
4028 *val = n;
4029 if (*val == n)
4030 break;
4031 /* Fall through. */
4032 case LONGINT_OVERFLOW:
4033 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4034 *val = SIZE_MAX;
4035 break;
4037 case LONGINT_INVALID:
4038 if (msgid)
4039 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4040 _(msgid), quote (string));
4041 return NULL;
4044 return suffix;
4047 /* Handle interrupts and hangups. */
4049 static void
4050 sighandler (int sig)
4052 if (! SA_NOCLDSTOP)
4053 signal (sig, SIG_IGN);
4055 cleanup ();
4057 signal (sig, SIG_DFL);
4058 raise (sig);
4061 /* Set the ordering options for KEY specified in S.
4062 Return the address of the first character in S that
4063 is not a valid ordering option.
4064 BLANKTYPE is the kind of blanks that 'b' should skip. */
4066 static char *
4067 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4069 while (*s)
4071 switch (*s)
4073 case 'b':
4074 if (blanktype == bl_start || blanktype == bl_both)
4075 key->skipsblanks = true;
4076 if (blanktype == bl_end || blanktype == bl_both)
4077 key->skipeblanks = true;
4078 break;
4079 case 'd':
4080 key->ignore = nondictionary;
4081 break;
4082 case 'f':
4083 key->translate = fold_toupper;
4084 break;
4085 case 'g':
4086 key->general_numeric = true;
4087 break;
4088 case 'h':
4089 key->human_numeric = true;
4090 break;
4091 case 'i':
4092 /* Option order should not matter, so don't let -i override
4093 -d. -d implies -i, but -i does not imply -d. */
4094 if (! key->ignore)
4095 key->ignore = nonprinting;
4096 break;
4097 case 'M':
4098 key->month = true;
4099 break;
4100 case 'n':
4101 key->numeric = true;
4102 break;
4103 case 'R':
4104 key->random = true;
4105 break;
4106 case 'r':
4107 key->reverse = true;
4108 break;
4109 case 'V':
4110 key->version = true;
4111 break;
4112 default:
4113 return (char *) s;
4115 ++s;
4117 return (char *) s;
4120 /* Initialize KEY. */
4122 static struct keyfield *
4123 key_init (struct keyfield *key)
4125 memset (key, 0, sizeof *key);
4126 key->eword = SIZE_MAX;
4127 return key;
4131 main (int argc, char **argv)
4133 struct keyfield *key;
4134 struct keyfield key_buf;
4135 struct keyfield gkey;
4136 bool gkey_only = false;
4137 char const *s;
4138 int c = 0;
4139 char checkonly = 0;
4140 bool mergeonly = false;
4141 char *random_source = NULL;
4142 bool need_random = false;
4143 size_t nthreads = 0;
4144 size_t nfiles = 0;
4145 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4146 bool obsolete_usage = (posix2_version () < 200112);
4147 char **files;
4148 char *files_from = NULL;
4149 struct Tokens tok;
4150 char const *outfile = NULL;
4152 initialize_main (&argc, &argv);
4153 set_program_name (argv[0]);
4154 setlocale (LC_ALL, "");
4155 bindtextdomain (PACKAGE, LOCALEDIR);
4156 textdomain (PACKAGE);
4158 initialize_exit_failure (SORT_FAILURE);
4160 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4161 #if HAVE_NL_LANGINFO
4162 hard_LC_TIME = hard_locale (LC_TIME);
4163 #endif
4165 /* Get locale's representation of the decimal point. */
4167 struct lconv const *locale = localeconv ();
4169 /* If the locale doesn't define a decimal point, or if the decimal
4170 point is multibyte, use the C locale's decimal point. FIXME:
4171 add support for multibyte decimal points. */
4172 decimal_point = to_uchar (locale->decimal_point[0]);
4173 if (! decimal_point || locale->decimal_point[1])
4174 decimal_point = '.';
4176 /* FIXME: add support for multibyte thousands separators. */
4177 thousands_sep = to_uchar (*locale->thousands_sep);
4178 if (! thousands_sep || locale->thousands_sep[1])
4179 thousands_sep = -1;
4182 have_read_stdin = false;
4183 inittables ();
4186 size_t i;
4187 static int const sig[] =
4189 /* The usual suspects. */
4190 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4191 #ifdef SIGPOLL
4192 SIGPOLL,
4193 #endif
4194 #ifdef SIGPROF
4195 SIGPROF,
4196 #endif
4197 #ifdef SIGVTALRM
4198 SIGVTALRM,
4199 #endif
4200 #ifdef SIGXCPU
4201 SIGXCPU,
4202 #endif
4203 #ifdef SIGXFSZ
4204 SIGXFSZ,
4205 #endif
4207 enum { nsigs = ARRAY_CARDINALITY (sig) };
4209 #if SA_NOCLDSTOP
4210 struct sigaction act;
4212 sigemptyset (&caught_signals);
4213 for (i = 0; i < nsigs; i++)
4215 sigaction (sig[i], NULL, &act);
4216 if (act.sa_handler != SIG_IGN)
4217 sigaddset (&caught_signals, sig[i]);
4220 act.sa_handler = sighandler;
4221 act.sa_mask = caught_signals;
4222 act.sa_flags = 0;
4224 for (i = 0; i < nsigs; i++)
4225 if (sigismember (&caught_signals, sig[i]))
4226 sigaction (sig[i], &act, NULL);
4227 #else
4228 for (i = 0; i < nsigs; i++)
4229 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4231 signal (sig[i], sighandler);
4232 siginterrupt (sig[i], 1);
4234 #endif
4236 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4238 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4239 atexit (exit_cleanup);
4241 key_init (&gkey);
4242 gkey.sword = SIZE_MAX;
4244 files = xnmalloc (argc, sizeof *files);
4246 while (true)
4248 /* Parse an operand as a file after "--" was seen; or if
4249 pedantic and a file was seen, unless the POSIX version
4250 predates 1003.1-2001 and -c was not seen and the operand is
4251 "-o FILE" or "-oFILE". */
4252 int oi = -1;
4254 if (c == -1
4255 || (posixly_correct && nfiles != 0
4256 && ! (obsolete_usage
4257 && ! checkonly
4258 && optind != argc
4259 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4260 && (argv[optind][2] || optind + 1 != argc)))
4261 || ((c = getopt_long (argc, argv, short_options,
4262 long_options, &oi))
4263 == -1))
4265 if (argc <= optind)
4266 break;
4267 files[nfiles++] = argv[optind++];
4269 else switch (c)
4271 case 1:
4272 key = NULL;
4273 if (optarg[0] == '+')
4275 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4276 && ISDIGIT (argv[optind][1]));
4277 obsolete_usage |= minus_pos_usage && !posixly_correct;
4278 if (obsolete_usage)
4280 /* Treat +POS1 [-POS2] as a key if possible; but silently
4281 treat an operand as a file if it is not a valid +POS1. */
4282 key = key_init (&key_buf);
4283 s = parse_field_count (optarg + 1, &key->sword, NULL);
4284 if (s && *s == '.')
4285 s = parse_field_count (s + 1, &key->schar, NULL);
4286 if (! (key->sword || key->schar))
4287 key->sword = SIZE_MAX;
4288 if (! s || *set_ordering (s, key, bl_start))
4289 key = NULL;
4290 else
4292 if (minus_pos_usage)
4294 char const *optarg1 = argv[optind++];
4295 s = parse_field_count (optarg1 + 1, &key->eword,
4296 N_("invalid number after '-'"));
4297 /* When called with a non-NULL message ID,
4298 parse_field_count cannot return NULL. Tell static
4299 analysis tools that dereferencing S is safe. */
4300 assert (s);
4301 if (*s == '.')
4302 s = parse_field_count (s + 1, &key->echar,
4303 N_("invalid number after '.'"));
4304 if (!key->echar && key->eword)
4306 /* obsolescent syntax +A.x -B.y is equivalent to:
4307 -k A+1.x+1,B.y (when y = 0)
4308 -k A+1.x+1,B+1.y (when y > 0)
4309 So eword is decremented as in the -k case
4310 only when the end field (B) is specified and
4311 echar (y) is 0. */
4312 key->eword--;
4314 if (*set_ordering (s, key, bl_end))
4315 badfieldspec (optarg1,
4316 N_("stray character in field spec"));
4318 key->obsolete_used = true;
4319 insertkey (key);
4323 if (! key)
4324 files[nfiles++] = optarg;
4325 break;
4327 case SORT_OPTION:
4328 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4329 /* Fall through. */
4330 case 'b':
4331 case 'd':
4332 case 'f':
4333 case 'g':
4334 case 'h':
4335 case 'i':
4336 case 'M':
4337 case 'n':
4338 case 'r':
4339 case 'R':
4340 case 'V':
4342 char str[2];
4343 str[0] = c;
4344 str[1] = '\0';
4345 set_ordering (str, &gkey, bl_both);
4347 break;
4349 case CHECK_OPTION:
4350 c = (optarg
4351 ? XARGMATCH ("--check", optarg, check_args, check_types)
4352 : 'c');
4353 /* Fall through. */
4354 case 'c':
4355 case 'C':
4356 if (checkonly && checkonly != c)
4357 incompatible_options ("cC");
4358 checkonly = c;
4359 break;
4361 case COMPRESS_PROGRAM_OPTION:
4362 if (compress_program && !STREQ (compress_program, optarg))
4363 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4364 compress_program = optarg;
4365 break;
4367 case DEBUG_PROGRAM_OPTION:
4368 debug = true;
4369 break;
4371 case FILES0_FROM_OPTION:
4372 files_from = optarg;
4373 break;
4375 case 'k':
4376 key = key_init (&key_buf);
4378 /* Get POS1. */
4379 s = parse_field_count (optarg, &key->sword,
4380 N_("invalid number at field start"));
4381 if (! key->sword--)
4383 /* Provoke with 'sort -k0' */
4384 badfieldspec (optarg, N_("field number is zero"));
4386 if (*s == '.')
4388 s = parse_field_count (s + 1, &key->schar,
4389 N_("invalid number after '.'"));
4390 if (! key->schar--)
4392 /* Provoke with 'sort -k1.0' */
4393 badfieldspec (optarg, N_("character offset is zero"));
4396 if (! (key->sword || key->schar))
4397 key->sword = SIZE_MAX;
4398 s = set_ordering (s, key, bl_start);
4399 if (*s != ',')
4401 key->eword = SIZE_MAX;
4402 key->echar = 0;
4404 else
4406 /* Get POS2. */
4407 s = parse_field_count (s + 1, &key->eword,
4408 N_("invalid number after ','"));
4409 if (! key->eword--)
4411 /* Provoke with 'sort -k1,0' */
4412 badfieldspec (optarg, N_("field number is zero"));
4414 if (*s == '.')
4416 s = parse_field_count (s + 1, &key->echar,
4417 N_("invalid number after '.'"));
4419 s = set_ordering (s, key, bl_end);
4421 if (*s)
4422 badfieldspec (optarg, N_("stray character in field spec"));
4423 insertkey (key);
4424 break;
4426 case 'm':
4427 mergeonly = true;
4428 break;
4430 case NMERGE_OPTION:
4431 specify_nmerge (oi, c, optarg);
4432 break;
4434 case 'o':
4435 if (outfile && !STREQ (outfile, optarg))
4436 error (SORT_FAILURE, 0, _("multiple output files specified"));
4437 outfile = optarg;
4438 break;
4440 case RANDOM_SOURCE_OPTION:
4441 if (random_source && !STREQ (random_source, optarg))
4442 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4443 random_source = optarg;
4444 break;
4446 case 's':
4447 stable = true;
4448 break;
4450 case 'S':
4451 specify_sort_size (oi, c, optarg);
4452 break;
4454 case 't':
4456 char newtab = optarg[0];
4457 if (! newtab)
4458 error (SORT_FAILURE, 0, _("empty tab"));
4459 if (optarg[1])
4461 if (STREQ (optarg, "\\0"))
4462 newtab = '\0';
4463 else
4465 /* Provoke with 'sort -txx'. Complain about
4466 "multi-character tab" instead of "multibyte tab", so
4467 that the diagnostic's wording does not need to be
4468 changed once multibyte characters are supported. */
4469 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4470 quote (optarg));
4473 if (tab != TAB_DEFAULT && tab != newtab)
4474 error (SORT_FAILURE, 0, _("incompatible tabs"));
4475 tab = newtab;
4477 break;
4479 case 'T':
4480 add_temp_dir (optarg);
4481 break;
4483 case PARALLEL_OPTION:
4484 nthreads = specify_nthreads (oi, c, optarg);
4485 break;
4487 case 'u':
4488 unique = true;
4489 break;
4491 case 'y':
4492 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4493 through Solaris 7. It is also accepted by many non-Solaris
4494 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4495 -y is marked as obsolete starting with Solaris 8 (1999), but is
4496 still accepted as of Solaris 10 prerelease (2004).
4498 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4499 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4500 and which in general ignores the argument after "-y" if it
4501 consists entirely of digits (it can even be empty). */
4502 if (optarg == argv[optind - 1])
4504 char const *p;
4505 for (p = optarg; ISDIGIT (*p); p++)
4506 continue;
4507 optind -= (*p != '\0');
4509 break;
4511 case 'z':
4512 eolchar = 0;
4513 break;
4515 case_GETOPT_HELP_CHAR;
4517 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4519 default:
4520 usage (SORT_FAILURE);
4524 if (files_from)
4526 FILE *stream;
4528 /* When using --files0-from=F, you may not specify any files
4529 on the command-line. */
4530 if (nfiles)
4532 error (0, 0, _("extra operand %s"), quote (files[0]));
4533 fprintf (stderr, "%s\n",
4534 _("file operands cannot be combined with --files0-from"));
4535 usage (SORT_FAILURE);
4538 if (STREQ (files_from, "-"))
4539 stream = stdin;
4540 else
4542 stream = fopen (files_from, "r");
4543 if (stream == NULL)
4544 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4545 quote (files_from));
4548 readtokens0_init (&tok);
4550 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4551 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4552 quote (files_from));
4554 if (tok.n_tok)
4556 size_t i;
4557 free (files);
4558 files = tok.tok;
4559 nfiles = tok.n_tok;
4560 for (i = 0; i < nfiles; i++)
4562 if (STREQ (files[i], "-"))
4563 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4564 "no file name of %s allowed"),
4565 quote (files[i]));
4566 else if (files[i][0] == '\0')
4568 /* Using the standard 'filename:line-number:' prefix here is
4569 not totally appropriate, since NUL is the separator,
4570 not NL, but it might be better than nothing. */
4571 unsigned long int file_number = i + 1;
4572 error (SORT_FAILURE, 0,
4573 _("%s:%lu: invalid zero-length file name"),
4574 quotearg_colon (files_from), file_number);
4578 else
4579 error (SORT_FAILURE, 0, _("no input from %s"),
4580 quote (files_from));
4583 /* Inheritance of global options to individual keys. */
4584 for (key = keylist; key; key = key->next)
4586 if (default_key_compare (key) && !key->reverse)
4588 key->ignore = gkey.ignore;
4589 key->translate = gkey.translate;
4590 key->skipsblanks = gkey.skipsblanks;
4591 key->skipeblanks = gkey.skipeblanks;
4592 key->month = gkey.month;
4593 key->numeric = gkey.numeric;
4594 key->general_numeric = gkey.general_numeric;
4595 key->human_numeric = gkey.human_numeric;
4596 key->version = gkey.version;
4597 key->random = gkey.random;
4598 key->reverse = gkey.reverse;
4601 need_random |= key->random;
4604 if (!keylist && !default_key_compare (&gkey))
4606 gkey_only = true;
4607 insertkey (&gkey);
4608 need_random |= gkey.random;
4611 check_ordering_compatibility ();
4613 if (debug)
4615 if (checkonly || outfile)
4617 static char opts[] = "X --debug";
4618 opts[0] = (checkonly ? checkonly : 'o');
4619 incompatible_options (opts);
4622 /* Always output the locale in debug mode, since this
4623 is such a common source of confusion. */
4624 if (hard_LC_COLLATE)
4625 error (0, 0, _("using %s sorting rules"),
4626 quote (setlocale (LC_COLLATE, NULL)));
4627 else
4628 error (0, 0, _("using simple byte comparison"));
4629 key_warnings (&gkey, gkey_only);
4632 reverse = gkey.reverse;
4634 if (need_random)
4635 random_md5_state_init (random_source);
4637 if (temp_dir_count == 0)
4639 char const *tmp_dir = getenv ("TMPDIR");
4640 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4643 if (nfiles == 0)
4645 static char *minus = (char *) "-";
4646 nfiles = 1;
4647 free (files);
4648 files = &minus;
4651 /* Need to re-check that we meet the minimum requirement for memory
4652 usage with the final value for NMERGE. */
4653 if (0 < sort_size)
4654 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4656 if (checkonly)
4658 if (nfiles > 1)
4659 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4660 quote (files[1]), checkonly);
4662 if (outfile)
4664 static char opts[] = {0, 'o', 0};
4665 opts[0] = checkonly;
4666 incompatible_options (opts);
4669 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4670 input is not properly sorted. */
4671 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4674 /* Check all inputs are accessible, or exit immediately. */
4675 check_inputs (files, nfiles);
4677 /* Check output is writable, or exit immediately. */
4678 check_output (outfile);
4680 if (mergeonly)
4682 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4683 size_t i;
4685 for (i = 0; i < nfiles; ++i)
4686 sortfiles[i].name = files[i];
4688 merge (sortfiles, 0, nfiles, outfile);
4689 IF_LINT (free (sortfiles));
4691 else
4693 if (!nthreads)
4695 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4696 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4699 /* Avoid integer overflow later. */
4700 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4701 nthreads = MIN (nthreads, nthreads_max);
4703 sort (files, nfiles, outfile, nthreads);
4706 if (have_read_stdin && fclose (stdin) == EOF)
4707 die (_("close failed"), "-");
4709 exit (EXIT_SUCCESS);