maint: sort: remove the last uses of "'%s'" in diagnostics
[coreutils/ericb.git] / src / sort.c
blob6875a6a93d7f1dfd3cabca2987458eb63981a0fd
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/types.h>
28 #include <sys/wait.h>
29 #include <signal.h>
30 #include "system.h"
31 #include "argmatch.h"
32 #include "error.h"
33 #include "fadvise.h"
34 #include "filevercmp.h"
35 #include "hard-locale.h"
36 #include "hash.h"
37 #include "heap.h"
38 #include "ignore-value.h"
39 #include "md5.h"
40 #include "mbswidth.h"
41 #include "nproc.h"
42 #include "physmem.h"
43 #include "posixver.h"
44 #include "quote.h"
45 #include "quotearg.h"
46 #include "randread.h"
47 #include "readtokens0.h"
48 #include "stdio--.h"
49 #include "stdlib--.h"
50 #include "strnumcmp.h"
51 #include "xmemcoll.h"
52 #include "xnanosleep.h"
53 #include "xstrtol.h"
55 #if HAVE_SYS_RESOURCE_H
56 # include <sys/resource.h>
57 #endif
58 #ifndef RLIMIT_DATA
59 struct rlimit { size_t rlim_cur; };
60 # define getrlimit(Resource, Rlp) (-1)
61 #endif
63 /* The official name of this program (e.g., no 'g' prefix). */
64 #define PROGRAM_NAME "sort"
66 #define AUTHORS \
67 proper_name ("Mike Haertel"), \
68 proper_name ("Paul Eggert")
70 #if HAVE_LANGINFO_CODESET
71 # include <langinfo.h>
72 #endif
74 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
75 present. */
76 #ifndef SA_NOCLDSTOP
77 # define SA_NOCLDSTOP 0
78 /* No sigprocmask. Always 'return' zero. */
79 # define sigprocmask(How, Set, Oset) (0)
80 # define sigset_t int
81 # if ! HAVE_SIGINTERRUPT
82 # define siginterrupt(sig, flag) /* empty */
83 # endif
84 #endif
86 #if !defined OPEN_MAX && defined NR_OPEN
87 # define OPEN_MAX NR_OPEN
88 #endif
89 #if !defined OPEN_MAX
90 # define OPEN_MAX 20
91 #endif
93 #define UCHAR_LIM (UCHAR_MAX + 1)
95 #if HAVE_C99_STRTOLD
96 # define long_double long double
97 #else
98 # define long_double double
99 # undef strtold
100 # define strtold strtod
101 #endif
103 #ifndef DEFAULT_TMPDIR
104 # define DEFAULT_TMPDIR "/tmp"
105 #endif
107 /* Maximum number of lines to merge every time a NODE is taken from
108 the merge queue. Node is at LEVEL in the binary merge tree,
109 and is responsible for merging TOTAL lines. */
110 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
112 /* Heuristic value for the number of lines for which it is worth creating
113 a subthread, during an internal merge sort. I.e., it is a small number
114 of "average" lines for which sorting via two threads is faster than
115 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
116 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
117 lines of gensort -a output is sorted slightly faster with --parallel=2
118 than with --parallel=1. By contrast, using --parallel=1 is about 10%
119 faster than using --parallel=2 with a 64K-line input. */
120 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
121 verify (4 <= SUBTHREAD_LINES_HEURISTIC);
123 /* The number of threads after which there are
124 diminishing performance gains. */
125 enum { DEFAULT_MAX_THREADS = 8 };
127 /* Exit statuses. */
128 enum
130 /* POSIX says to exit with status 1 if invoked with -c and the
131 input is not properly sorted. */
132 SORT_OUT_OF_ORDER = 1,
134 /* POSIX says any other irregular exit must exit with a status
135 code greater than 1. */
136 SORT_FAILURE = 2
139 enum
141 /* The number of times we should try to fork a compression process
142 (we retry if the fork call fails). We don't _need_ to compress
143 temp files, this is just to reduce disk access, so this number
144 can be small. Each retry doubles in duration. */
145 MAX_FORK_TRIES_COMPRESS = 4,
147 /* The number of times we should try to fork a decompression process.
148 If we can't fork a decompression process, we can't sort, so this
149 number should be big. Each retry doubles in duration. */
150 MAX_FORK_TRIES_DECOMPRESS = 9
153 enum
155 /* Level of the end-of-merge node, one level above the root. */
156 MERGE_END = 0,
158 /* Level of the root node in merge tree. */
159 MERGE_ROOT = 1
162 /* The representation of the decimal point in the current locale. */
163 static int decimal_point;
165 /* Thousands separator; if -1, then there isn't one. */
166 static int thousands_sep;
168 /* Nonzero if the corresponding locales are hard. */
169 static bool hard_LC_COLLATE;
170 #if HAVE_NL_LANGINFO
171 static bool hard_LC_TIME;
172 #endif
174 #define NONZERO(x) ((x) != 0)
176 /* The kind of blanks for '-b' to skip in various options. */
177 enum blanktype { bl_start, bl_end, bl_both };
179 /* The character marking end of line. Default to \n. */
180 static char eolchar = '\n';
182 /* Lines are held in core as counted strings. */
183 struct line
185 char *text; /* Text of the line. */
186 size_t length; /* Length including final newline. */
187 char *keybeg; /* Start of first key. */
188 char *keylim; /* Limit of first key. */
191 /* Input buffers. */
192 struct buffer
194 char *buf; /* Dynamically allocated buffer,
195 partitioned into 3 regions:
196 - input data;
197 - unused area;
198 - an array of lines, in reverse order. */
199 size_t used; /* Number of bytes used for input data. */
200 size_t nlines; /* Number of lines in the line array. */
201 size_t alloc; /* Number of bytes allocated. */
202 size_t left; /* Number of bytes left from previous reads. */
203 size_t line_bytes; /* Number of bytes to reserve for each line. */
204 bool eof; /* An EOF has been read. */
207 /* Sort key. */
208 struct keyfield
210 size_t sword; /* Zero-origin 'word' to start at. */
211 size_t schar; /* Additional characters to skip. */
212 size_t eword; /* Zero-origin last 'word' of key. */
213 size_t echar; /* Additional characters in field. */
214 bool const *ignore; /* Boolean array of characters to ignore. */
215 char const *translate; /* Translation applied to characters. */
216 bool skipsblanks; /* Skip leading blanks when finding start. */
217 bool skipeblanks; /* Skip leading blanks when finding end. */
218 bool numeric; /* Flag for numeric comparison. Handle
219 strings of digits with optional decimal
220 point, but no exponential notation. */
221 bool random; /* Sort by random hash of key. */
222 bool general_numeric; /* Flag for general, numeric comparison.
223 Handle numbers in exponential notation. */
224 bool human_numeric; /* Flag for sorting by human readable
225 units with either SI xor IEC prefixes. */
226 bool month; /* Flag for comparison by month name. */
227 bool reverse; /* Reverse the sense of comparison. */
228 bool version; /* sort by version number */
229 bool obsolete_used; /* obsolescent key option format is used. */
230 struct keyfield *next; /* Next keyfield to try. */
233 struct month
235 char const *name;
236 int val;
239 /* Binary merge tree node. */
240 struct merge_node
242 struct line *lo; /* Lines to merge from LO child node. */
243 struct line *hi; /* Lines to merge from HI child ndoe. */
244 struct line *end_lo; /* End of available lines from LO. */
245 struct line *end_hi; /* End of available lines from HI. */
246 struct line **dest; /* Pointer to destination of merge. */
247 size_t nlo; /* Total Lines remaining from LO. */
248 size_t nhi; /* Total lines remaining from HI. */
249 struct merge_node *parent; /* Parent node. */
250 struct merge_node *lo_child; /* LO child node. */
251 struct merge_node *hi_child; /* HI child node. */
252 unsigned int level; /* Level in merge tree. */
253 bool queued; /* Node is already in heap. */
254 pthread_mutex_t lock; /* Lock for node operations. */
257 /* Priority queue of merge nodes. */
258 struct merge_node_queue
260 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
261 pthread_mutex_t mutex; /* Lock for queue operations. */
262 pthread_cond_t cond; /* Conditional wait for empty queue to populate
263 when popping. */
266 /* FIXME: None of these tables work with multibyte character sets.
267 Also, there are many other bugs when handling multibyte characters.
268 One way to fix this is to rewrite 'sort' to use wide characters
269 internally, but doing this with good performance is a bit
270 tricky. */
272 /* Table of blanks. */
273 static bool blanks[UCHAR_LIM];
275 /* Table of non-printing characters. */
276 static bool nonprinting[UCHAR_LIM];
278 /* Table of non-dictionary characters (not letters, digits, or blanks). */
279 static bool nondictionary[UCHAR_LIM];
281 /* Translation table folding lower case to upper. */
282 static char fold_toupper[UCHAR_LIM];
284 #define MONTHS_PER_YEAR 12
286 /* Table mapping month names to integers.
287 Alphabetic order allows binary search. */
288 static struct month monthtab[] =
290 {"APR", 4},
291 {"AUG", 8},
292 {"DEC", 12},
293 {"FEB", 2},
294 {"JAN", 1},
295 {"JUL", 7},
296 {"JUN", 6},
297 {"MAR", 3},
298 {"MAY", 5},
299 {"NOV", 11},
300 {"OCT", 10},
301 {"SEP", 9}
304 /* During the merge phase, the number of files to merge at once. */
305 #define NMERGE_DEFAULT 16
307 /* Minimum size for a merge or check buffer. */
308 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
310 /* Minimum sort size; the code might not work with smaller sizes. */
311 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
313 /* The number of bytes needed for a merge or check buffer, which can
314 function relatively efficiently even if it holds only one line. If
315 a longer line is seen, this value is increased. */
316 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
318 /* The approximate maximum number of bytes of main memory to use, as
319 specified by the user. Zero if the user has not specified a size. */
320 static size_t sort_size;
322 /* The initial allocation factor for non-regular files.
323 This is used, e.g., when reading from a pipe.
324 Don't make it too big, since it is multiplied by ~130 to
325 obtain the size of the actual buffer sort will allocate.
326 Also, there may be 8 threads all doing this at the same time. */
327 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
329 /* Array of directory names in which any temporary files are to be created. */
330 static char const **temp_dirs;
332 /* Number of temporary directory names used. */
333 static size_t temp_dir_count;
335 /* Number of allocated slots in temp_dirs. */
336 static size_t temp_dir_alloc;
338 /* Flag to reverse the order of all comparisons. */
339 static bool reverse;
341 /* Flag for stable sort. This turns off the last ditch bytewise
342 comparison of lines, and instead leaves lines in the same order
343 they were read if all keys compare equal. */
344 static bool stable;
346 /* If TAB has this value, blanks separate fields. */
347 enum { TAB_DEFAULT = CHAR_MAX + 1 };
349 /* Tab character separating fields. If TAB_DEFAULT, then fields are
350 separated by the empty string between a non-blank character and a blank
351 character. */
352 static int tab = TAB_DEFAULT;
354 /* Flag to remove consecutive duplicate lines from the output.
355 Only the last of a sequence of equal lines will be output. */
356 static bool unique;
358 /* Nonzero if any of the input files are the standard input. */
359 static bool have_read_stdin;
361 /* List of key field comparisons to be tried. */
362 static struct keyfield *keylist;
364 /* Program used to (de)compress temp files. Must accept -d. */
365 static char const *compress_program;
367 /* Annotate the output with extra info to aid the user. */
368 static bool debug;
370 /* Maximum number of files to merge in one go. If more than this
371 number are present, temp files will be used. */
372 static unsigned int nmerge = NMERGE_DEFAULT;
374 /* Report MESSAGE for FILE, then clean up and exit.
375 If FILE is null, it represents standard output. */
377 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
378 static void
379 die (char const *message, char const *file)
381 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
382 exit (SORT_FAILURE);
385 void
386 usage (int status)
388 if (status != EXIT_SUCCESS)
389 emit_try_help ();
390 else
392 printf (_("\
393 Usage: %s [OPTION]... [FILE]...\n\
394 or: %s [OPTION]... --files0-from=F\n\
396 program_name, program_name);
397 fputs (_("\
398 Write sorted concatenation of all FILE(s) to standard output.\n\
400 "), stdout);
401 fputs (_("\
402 Mandatory arguments to long options are mandatory for short options too.\n\
403 "), stdout);
404 fputs (_("\
405 Ordering options:\n\
407 "), stdout);
408 fputs (_("\
409 -b, --ignore-leading-blanks ignore leading blanks\n\
410 -d, --dictionary-order consider only blanks and alphanumeric characters\
412 -f, --ignore-case fold lower case to upper case characters\n\
413 "), stdout);
414 fputs (_("\
415 -g, --general-numeric-sort compare according to general numerical value\n\
416 -i, --ignore-nonprinting consider only printable characters\n\
417 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
418 "), stdout);
419 fputs (_("\
420 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
421 "), stdout);
422 fputs (_("\
423 -n, --numeric-sort compare according to string numerical value\n\
424 -R, --random-sort sort by random hash of keys\n\
425 --random-source=FILE get random bytes from FILE\n\
426 -r, --reverse reverse the result of comparisons\n\
427 "), stdout);
428 fputs (_("\
429 --sort=WORD sort according to WORD:\n\
430 general-numeric -g, human-numeric -h, month -M,\
432 numeric -n, random -R, version -V\n\
433 -V, --version-sort natural sort of (version) numbers within text\n\
435 "), stdout);
436 fputs (_("\
437 Other options:\n\
439 "), stdout);
440 fputs (_("\
441 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
442 for more use temp files\n\
443 "), stdout);
444 fputs (_("\
445 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
446 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
448 --compress-program=PROG compress temporaries with PROG;\n\
449 decompress them with PROG -d\n\
450 "), stdout);
451 fputs (_("\
452 --debug annotate the part of the line used to sort,\n\
453 and warn about questionable usage to stderr\n\
454 --files0-from=F read input from the files specified by\n\
455 NUL-terminated names in file F;\n\
456 If F is - then read names from standard input\n\
457 "), stdout);
458 fputs (_("\
459 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
460 -m, --merge merge already sorted files; do not sort\n\
461 "), stdout);
462 fputs (_("\
463 -o, --output=FILE write result to FILE instead of standard output\n\
464 -s, --stable stabilize sort by disabling last-resort comparison\
466 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
467 "), stdout);
468 printf (_("\
469 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
470 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
471 multiple options specify multiple directories\n\
472 --parallel=N change the number of sorts run concurrently to N\n\
473 -u, --unique with -c, check for strict ordering;\n\
474 without -c, output only the first of an equal run\
476 "), DEFAULT_TMPDIR);
477 fputs (_("\
478 -z, --zero-terminated end lines with 0 byte, not newline\n\
479 "), stdout);
480 fputs (HELP_OPTION_DESCRIPTION, stdout);
481 fputs (VERSION_OPTION_DESCRIPTION, stdout);
482 fputs (_("\
484 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
485 field number and C a character position in the field; both are origin 1, and\n\
486 the stop position defaults to the line's end. If neither -t nor -b is in\n\
487 effect, characters in a field are counted from the beginning of the preceding\n\
488 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
490 which override global ordering options for that key. If no key is given, use\n\
491 the entire line as the key.\n\
493 SIZE may be followed by the following multiplicative suffixes:\n\
494 "), stdout);
495 fputs (_("\
496 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
498 With no FILE, or when FILE is -, read standard input.\n\
500 *** WARNING ***\n\
501 The locale specified by the environment affects sort order.\n\
502 Set LC_ALL=C to get the traditional sort order that uses\n\
503 native byte values.\n\
504 "), stdout );
505 emit_ancillary_info ();
508 exit (status);
511 /* For long options that have no equivalent short option, use a
512 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
513 enum
515 CHECK_OPTION = CHAR_MAX + 1,
516 COMPRESS_PROGRAM_OPTION,
517 DEBUG_PROGRAM_OPTION,
518 FILES0_FROM_OPTION,
519 NMERGE_OPTION,
520 RANDOM_SOURCE_OPTION,
521 SORT_OPTION,
522 PARALLEL_OPTION
525 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
527 static struct option const long_options[] =
529 {"ignore-leading-blanks", no_argument, NULL, 'b'},
530 {"check", optional_argument, NULL, CHECK_OPTION},
531 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
532 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
533 {"dictionary-order", no_argument, NULL, 'd'},
534 {"ignore-case", no_argument, NULL, 'f'},
535 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
536 {"general-numeric-sort", no_argument, NULL, 'g'},
537 {"ignore-nonprinting", no_argument, NULL, 'i'},
538 {"key", required_argument, NULL, 'k'},
539 {"merge", no_argument, NULL, 'm'},
540 {"month-sort", no_argument, NULL, 'M'},
541 {"numeric-sort", no_argument, NULL, 'n'},
542 {"human-numeric-sort", no_argument, NULL, 'h'},
543 {"version-sort", no_argument, NULL, 'V'},
544 {"random-sort", no_argument, NULL, 'R'},
545 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
546 {"sort", required_argument, NULL, SORT_OPTION},
547 {"output", required_argument, NULL, 'o'},
548 {"reverse", no_argument, NULL, 'r'},
549 {"stable", no_argument, NULL, 's'},
550 {"batch-size", required_argument, NULL, NMERGE_OPTION},
551 {"buffer-size", required_argument, NULL, 'S'},
552 {"field-separator", required_argument, NULL, 't'},
553 {"temporary-directory", required_argument, NULL, 'T'},
554 {"unique", no_argument, NULL, 'u'},
555 {"zero-terminated", no_argument, NULL, 'z'},
556 {"parallel", required_argument, NULL, PARALLEL_OPTION},
557 {GETOPT_HELP_OPTION_DECL},
558 {GETOPT_VERSION_OPTION_DECL},
559 {NULL, 0, NULL, 0},
562 #define CHECK_TABLE \
563 _ct_("quiet", 'C') \
564 _ct_("silent", 'C') \
565 _ct_("diagnose-first", 'c')
567 static char const *const check_args[] =
569 #define _ct_(_s, _c) _s,
570 CHECK_TABLE NULL
571 #undef _ct_
573 static char const check_types[] =
575 #define _ct_(_s, _c) _c,
576 CHECK_TABLE
577 #undef _ct_
580 #define SORT_TABLE \
581 _st_("general-numeric", 'g') \
582 _st_("human-numeric", 'h') \
583 _st_("month", 'M') \
584 _st_("numeric", 'n') \
585 _st_("random", 'R') \
586 _st_("version", 'V')
588 static char const *const sort_args[] =
590 #define _st_(_s, _c) _s,
591 SORT_TABLE NULL
592 #undef _st_
594 static char const sort_types[] =
596 #define _st_(_s, _c) _c,
597 SORT_TABLE
598 #undef _st_
601 /* The set of signals that are caught. */
602 static sigset_t caught_signals;
604 /* Critical section status. */
605 struct cs_status
607 bool valid;
608 sigset_t sigs;
611 /* Enter a critical section. */
612 static struct cs_status
613 cs_enter (void)
615 struct cs_status status;
616 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
617 return status;
620 /* Leave a critical section. */
621 static void
622 cs_leave (struct cs_status status)
624 if (status.valid)
626 /* Ignore failure when restoring the signal mask. */
627 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
631 /* Possible states for a temp file. If compressed, the file's status
632 is unreaped or reaped, depending on whether 'sort' has waited for
633 the subprocess to finish. */
634 enum { UNCOMPRESSED, UNREAPED, REAPED };
636 /* The list of temporary files. */
637 struct tempnode
639 struct tempnode *volatile next;
640 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
641 char state;
642 char name[1]; /* Actual size is 1 + file name length. */
644 static struct tempnode *volatile temphead;
645 static struct tempnode *volatile *temptail = &temphead;
647 /* A file to be sorted. */
648 struct sortfile
650 /* The file's name. */
651 char const *name;
653 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
654 struct tempnode *temp;
657 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
658 static Hash_table *proctab;
660 enum { INIT_PROCTAB_SIZE = 47 };
662 static size_t
663 proctab_hasher (void const *entry, size_t tabsize)
665 struct tempnode const *node = entry;
666 return node->pid % tabsize;
669 static bool
670 proctab_comparator (void const *e1, void const *e2)
672 struct tempnode const *n1 = e1;
673 struct tempnode const *n2 = e2;
674 return n1->pid == n2->pid;
677 /* The number of unreaped child processes. */
678 static pid_t nprocs;
680 static bool delete_proc (pid_t);
682 /* If PID is positive, wait for the child process with that PID to
683 exit, and assume that PID has already been removed from the process
684 table. If PID is 0 or -1, clean up some child that has exited (by
685 waiting for it, and removing it from the proc table) and return the
686 child's process ID. However, if PID is 0 and no children have
687 exited, return 0 without waiting. */
689 static pid_t
690 reap (pid_t pid)
692 int status;
693 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
695 if (cpid < 0)
696 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
697 compress_program);
698 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
700 if (! WIFEXITED (status) || WEXITSTATUS (status))
701 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
702 compress_program);
703 --nprocs;
706 return cpid;
709 /* TEMP represents a new process; add it to the process table. Create
710 the process table the first time it's called. */
712 static void
713 register_proc (struct tempnode *temp)
715 if (! proctab)
717 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
718 proctab_hasher,
719 proctab_comparator,
720 NULL);
721 if (! proctab)
722 xalloc_die ();
725 temp->state = UNREAPED;
727 if (! hash_insert (proctab, temp))
728 xalloc_die ();
731 /* If PID is in the process table, remove it and return true.
732 Otherwise, return false. */
734 static bool
735 delete_proc (pid_t pid)
737 struct tempnode test;
739 test.pid = pid;
740 struct tempnode *node = hash_delete (proctab, &test);
741 if (! node)
742 return false;
743 node->state = REAPED;
744 return true;
747 /* Remove PID from the process table, and wait for it to exit if it
748 hasn't already. */
750 static void
751 wait_proc (pid_t pid)
753 if (delete_proc (pid))
754 reap (pid);
757 /* Reap any exited children. Do not block; reap only those that have
758 already exited. */
760 static void
761 reap_exited (void)
763 while (0 < nprocs && reap (0))
764 continue;
767 /* Reap at least one exited child, waiting if necessary. */
769 static void
770 reap_some (void)
772 reap (-1);
773 reap_exited ();
776 /* Reap all children, waiting if necessary. */
778 static void
779 reap_all (void)
781 while (0 < nprocs)
782 reap (-1);
785 /* Clean up any remaining temporary files. */
787 static void
788 cleanup (void)
790 struct tempnode const *node;
792 for (node = temphead; node; node = node->next)
793 unlink (node->name);
794 temphead = NULL;
797 /* Cleanup actions to take when exiting. */
799 static void
800 exit_cleanup (void)
802 if (temphead)
804 /* Clean up any remaining temporary files in a critical section so
805 that a signal handler does not try to clean them too. */
806 struct cs_status cs = cs_enter ();
807 cleanup ();
808 cs_leave (cs);
811 close_stdout ();
814 /* Create a new temporary file, returning its newly allocated tempnode.
815 Store into *PFD the file descriptor open for writing.
816 If the creation fails, return NULL and store -1 into *PFD if the
817 failure is due to file descriptor exhaustion and
818 SURVIVE_FD_EXHAUSTION; otherwise, die. */
820 static struct tempnode *
821 create_temp_file (int *pfd, bool survive_fd_exhaustion)
823 static char const slashbase[] = "/sortXXXXXX";
824 static size_t temp_dir_index;
825 int fd;
826 int saved_errno;
827 char const *temp_dir = temp_dirs[temp_dir_index];
828 size_t len = strlen (temp_dir);
829 struct tempnode *node =
830 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
831 char *file = node->name;
832 struct cs_status cs;
834 memcpy (file, temp_dir, len);
835 memcpy (file + len, slashbase, sizeof slashbase);
836 node->next = NULL;
837 if (++temp_dir_index == temp_dir_count)
838 temp_dir_index = 0;
840 /* Create the temporary file in a critical section, to avoid races. */
841 cs = cs_enter ();
842 fd = mkstemp (file);
843 if (0 <= fd)
845 *temptail = node;
846 temptail = &node->next;
848 saved_errno = errno;
849 cs_leave (cs);
850 errno = saved_errno;
852 if (fd < 0)
854 if (! (survive_fd_exhaustion && errno == EMFILE))
855 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
856 quote (temp_dir));
857 free (node);
858 node = NULL;
861 *pfd = fd;
862 return node;
865 /* Return a stream for FILE, opened with mode HOW. A null FILE means
866 standard output; HOW should be "w". When opening for input, "-"
867 means standard input. To avoid confusion, do not return file
868 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
869 opening an ordinary FILE. Return NULL if unsuccessful.
871 fadvise() is used to specify an access pattern for input files.
872 There are a few hints we could possibly provide,
873 and after careful testing it was decided that
874 specifying POSIX_FADV_SEQUENTIAL was not detrimental
875 to any cases. On Linux 2.6.31, this option doubles
876 the size of read ahead performed and thus was seen to
877 benefit these cases:
878 Merging
879 Sorting with a smaller internal buffer
880 Reading from faster flash devices
882 In _addition_ one could also specify other hints...
884 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
885 at least uses that to _synchronously_ prepopulate the cache
886 with the specified range. While sort does need to
887 read all of its input before outputting, a synchronous
888 read of the whole file up front precludes any processing
889 that sort could do in parallel with the system doing
890 read ahead of the data. This was seen to have negative effects
891 in a couple of cases:
892 Merging
893 Sorting with a smaller internal buffer
894 Note this option was seen to shorten the runtime for sort
895 on a multicore system with lots of RAM and other processes
896 competing for CPU. It could be argued that more explicit
897 scheduling hints with 'nice' et. al. are more appropriate
898 for this situation.
900 POSIX_FADV_NOREUSE is a possibility as it could lower
901 the priority of input data in the cache as sort will
902 only need to process it once. However its functionality
903 has changed over Linux kernel versions and as of 2.6.31
904 it does nothing and thus we can't depend on what it might
905 do in future.
907 POSIX_FADV_DONTNEED is not appropriate for user specified
908 input files, but for temp files we do want to drop the
909 cache immediately after processing. This is done implicitly
910 however when the files are unlinked. */
912 static FILE *
913 stream_open (char const *file, char const *how)
915 if (!file)
916 return stdout;
917 if (*how == 'r')
919 FILE *fp;
920 if (STREQ (file, "-"))
922 have_read_stdin = true;
923 fp = stdin;
925 else
926 fp = fopen (file, how);
927 fadvise (fp, FADVISE_SEQUENTIAL);
928 return fp;
930 return fopen (file, how);
933 /* Same as stream_open, except always return a non-null value; die on
934 failure. */
936 static FILE *
937 xfopen (char const *file, char const *how)
939 FILE *fp = stream_open (file, how);
940 if (!fp)
941 die (_("open failed"), file);
942 return fp;
945 /* Close FP, whose name is FILE, and report any errors. */
947 static void
948 xfclose (FILE *fp, char const *file)
950 switch (fileno (fp))
952 case STDIN_FILENO:
953 /* Allow reading stdin from tty more than once. */
954 if (feof (fp))
955 clearerr (fp);
956 break;
958 case STDOUT_FILENO:
959 /* Don't close stdout just yet. close_stdout does that. */
960 if (fflush (fp) != 0)
961 die (_("fflush failed"), file);
962 break;
964 default:
965 if (fclose (fp) != 0)
966 die (_("close failed"), file);
967 break;
971 static void
972 dup2_or_die (int oldfd, int newfd)
974 if (dup2 (oldfd, newfd) < 0)
975 error (SORT_FAILURE, errno, _("dup2 failed"));
978 /* Fork a child process for piping to and do common cleanup. The
979 TRIES parameter tells us how many times to try to fork before
980 giving up. Return the PID of the child, or -1 (setting errno)
981 on failure. */
983 static pid_t
984 pipe_fork (int pipefds[2], size_t tries)
986 #if HAVE_WORKING_FORK
987 struct tempnode *saved_temphead;
988 int saved_errno;
989 double wait_retry = 0.25;
990 pid_t pid IF_LINT ( = -1);
991 struct cs_status cs;
993 if (pipe (pipefds) < 0)
994 return -1;
996 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
997 uncontrolled subprocess generation can hurt performance significantly.
998 Allow at most NMERGE + 2 subprocesses, on the theory that there
999 may be some useful parallelism by letting compression for the
1000 previous merge finish (1 subprocess) in parallel with the current
1001 merge (NMERGE + 1 subprocesses). */
1003 if (nmerge + 1 < nprocs)
1004 reap_some ();
1006 while (tries--)
1008 /* This is so the child process won't delete our temp files
1009 if it receives a signal before exec-ing. */
1010 cs = cs_enter ();
1011 saved_temphead = temphead;
1012 temphead = NULL;
1014 pid = fork ();
1015 saved_errno = errno;
1016 if (pid)
1017 temphead = saved_temphead;
1019 cs_leave (cs);
1020 errno = saved_errno;
1022 if (0 <= pid || errno != EAGAIN)
1023 break;
1024 else
1026 xnanosleep (wait_retry);
1027 wait_retry *= 2;
1028 reap_exited ();
1032 if (pid < 0)
1034 saved_errno = errno;
1035 close (pipefds[0]);
1036 close (pipefds[1]);
1037 errno = saved_errno;
1039 else if (pid == 0)
1041 close (STDIN_FILENO);
1042 close (STDOUT_FILENO);
1044 else
1045 ++nprocs;
1047 return pid;
1049 #else /* ! HAVE_WORKING_FORK */
1050 return -1;
1051 #endif
1054 /* Create a temporary file and, if asked for, start a compressor
1055 to that file. Set *PFP to the file handle and return
1056 the address of the new temp node. If the creation
1057 fails, return NULL if the failure is due to file descriptor
1058 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1060 static struct tempnode *
1061 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1063 int tempfd;
1064 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1065 if (! node)
1066 return NULL;
1068 node->state = UNCOMPRESSED;
1070 if (compress_program)
1072 int pipefds[2];
1074 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1075 if (0 < node->pid)
1077 close (tempfd);
1078 close (pipefds[0]);
1079 tempfd = pipefds[1];
1081 register_proc (node);
1083 else if (node->pid == 0)
1085 close (pipefds[1]);
1086 dup2_or_die (tempfd, STDOUT_FILENO);
1087 close (tempfd);
1088 dup2_or_die (pipefds[0], STDIN_FILENO);
1089 close (pipefds[0]);
1091 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1092 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1093 compress_program);
1097 *pfp = fdopen (tempfd, "w");
1098 if (! *pfp)
1099 die (_("couldn't create temporary file"), node->name);
1101 return node;
1104 /* Create a temporary file and, if asked for, start a compressor
1105 to that file. Set *PFP to the file handle and return the address
1106 of the new temp node. Die on failure. */
1108 static struct tempnode *
1109 create_temp (FILE **pfp)
1111 return maybe_create_temp (pfp, false);
1114 /* Open a compressed temp file and start a decompression process through
1115 which to filter the input. Return NULL (setting errno to
1116 EMFILE) if we ran out of file descriptors, and die on any other
1117 kind of failure. */
1119 static FILE *
1120 open_temp (struct tempnode *temp)
1122 int tempfd, pipefds[2];
1123 FILE *fp = NULL;
1125 if (temp->state == UNREAPED)
1126 wait_proc (temp->pid);
1128 tempfd = open (temp->name, O_RDONLY);
1129 if (tempfd < 0)
1130 return NULL;
1132 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1134 switch (child)
1136 case -1:
1137 if (errno != EMFILE)
1138 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1139 compress_program);
1140 close (tempfd);
1141 errno = EMFILE;
1142 break;
1144 case 0:
1145 close (pipefds[0]);
1146 dup2_or_die (tempfd, STDIN_FILENO);
1147 close (tempfd);
1148 dup2_or_die (pipefds[1], STDOUT_FILENO);
1149 close (pipefds[1]);
1151 execlp (compress_program, compress_program, "-d", (char *) NULL);
1152 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1153 compress_program);
1155 default:
1156 temp->pid = child;
1157 register_proc (temp);
1158 close (tempfd);
1159 close (pipefds[1]);
1161 fp = fdopen (pipefds[0], "r");
1162 if (! fp)
1164 int saved_errno = errno;
1165 close (pipefds[0]);
1166 errno = saved_errno;
1168 break;
1171 return fp;
1174 /* Append DIR to the array of temporary directory names. */
1175 static void
1176 add_temp_dir (char const *dir)
1178 if (temp_dir_count == temp_dir_alloc)
1179 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1181 temp_dirs[temp_dir_count++] = dir;
1184 /* Remove NAME from the list of temporary files. */
1186 static void
1187 zaptemp (char const *name)
1189 struct tempnode *volatile *pnode;
1190 struct tempnode *node;
1191 struct tempnode *next;
1192 int unlink_status;
1193 int unlink_errno = 0;
1194 struct cs_status cs;
1196 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1197 continue;
1199 if (node->state == UNREAPED)
1200 wait_proc (node->pid);
1202 /* Unlink the temporary file in a critical section to avoid races. */
1203 next = node->next;
1204 cs = cs_enter ();
1205 unlink_status = unlink (name);
1206 unlink_errno = errno;
1207 *pnode = next;
1208 cs_leave (cs);
1210 if (unlink_status != 0)
1211 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1212 if (! next)
1213 temptail = pnode;
1214 free (node);
1217 #if HAVE_NL_LANGINFO
1219 static int
1220 struct_month_cmp (void const *m1, void const *m2)
1222 struct month const *month1 = m1;
1223 struct month const *month2 = m2;
1224 return strcmp (month1->name, month2->name);
1227 #endif
1229 /* Initialize the character class tables. */
1231 static void
1232 inittables (void)
1234 size_t i;
1236 for (i = 0; i < UCHAR_LIM; ++i)
1238 blanks[i] = !! isblank (i);
1239 nonprinting[i] = ! isprint (i);
1240 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1241 fold_toupper[i] = toupper (i);
1244 #if HAVE_NL_LANGINFO
1245 /* If we're not in the "C" locale, read different names for months. */
1246 if (hard_LC_TIME)
1248 for (i = 0; i < MONTHS_PER_YEAR; i++)
1250 char const *s;
1251 size_t s_len;
1252 size_t j, k;
1253 char *name;
1255 s = nl_langinfo (ABMON_1 + i);
1256 s_len = strlen (s);
1257 monthtab[i].name = name = xmalloc (s_len + 1);
1258 monthtab[i].val = i + 1;
1260 for (j = k = 0; j < s_len; j++)
1261 if (! isblank (to_uchar (s[j])))
1262 name[k++] = fold_toupper[to_uchar (s[j])];
1263 name[k] = '\0';
1265 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1267 #endif
1270 /* Specify how many inputs may be merged at once.
1271 This may be set on the command-line with the
1272 --batch-size option. */
1273 static void
1274 specify_nmerge (int oi, char c, char const *s)
1276 uintmax_t n;
1277 struct rlimit rlimit;
1278 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1280 /* Try to find out how many file descriptors we'll be able
1281 to open. We need at least nmerge + 3 (STDIN_FILENO,
1282 STDOUT_FILENO and STDERR_FILENO). */
1283 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1284 ? rlimit.rlim_cur
1285 : OPEN_MAX)
1286 - 3);
1288 if (e == LONGINT_OK)
1290 nmerge = n;
1291 if (nmerge != n)
1292 e = LONGINT_OVERFLOW;
1293 else
1295 if (nmerge < 2)
1297 error (0, 0, _("invalid --%s argument %s"),
1298 long_options[oi].name, quote (s));
1299 error (SORT_FAILURE, 0,
1300 _("minimum --%s argument is %s"),
1301 long_options[oi].name, quote ("2"));
1303 else if (max_nmerge < nmerge)
1305 e = LONGINT_OVERFLOW;
1307 else
1308 return;
1312 if (e == LONGINT_OVERFLOW)
1314 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1315 error (0, 0, _("--%s argument %s too large"),
1316 long_options[oi].name, quote (s));
1317 error (SORT_FAILURE, 0,
1318 _("maximum --%s argument with current rlimit is %s"),
1319 long_options[oi].name,
1320 uinttostr (max_nmerge, max_nmerge_buf));
1322 else
1323 xstrtol_fatal (e, oi, c, long_options, s);
1326 /* Specify the amount of main memory to use when sorting. */
1327 static void
1328 specify_sort_size (int oi, char c, char const *s)
1330 uintmax_t n;
1331 char *suffix;
1332 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1334 /* The default unit is KiB. */
1335 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1337 if (n <= UINTMAX_MAX / 1024)
1338 n *= 1024;
1339 else
1340 e = LONGINT_OVERFLOW;
1343 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1344 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1345 switch (suffix[0])
1347 case 'b':
1348 e = LONGINT_OK;
1349 break;
1351 case '%':
1353 double mem = physmem_total () * n / 100;
1355 /* Use "<", not "<=", to avoid problems with rounding. */
1356 if (mem < UINTMAX_MAX)
1358 n = mem;
1359 e = LONGINT_OK;
1361 else
1362 e = LONGINT_OVERFLOW;
1364 break;
1367 if (e == LONGINT_OK)
1369 /* If multiple sort sizes are specified, take the maximum, so
1370 that option order does not matter. */
1371 if (n < sort_size)
1372 return;
1374 sort_size = n;
1375 if (sort_size == n)
1377 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1378 return;
1381 e = LONGINT_OVERFLOW;
1384 xstrtol_fatal (e, oi, c, long_options, s);
1387 /* Specify the number of threads to spawn during internal sort. */
1388 static size_t
1389 specify_nthreads (int oi, char c, char const *s)
1391 unsigned long int nthreads;
1392 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1393 if (e == LONGINT_OVERFLOW)
1394 return SIZE_MAX;
1395 if (e != LONGINT_OK)
1396 xstrtol_fatal (e, oi, c, long_options, s);
1397 if (SIZE_MAX < nthreads)
1398 nthreads = SIZE_MAX;
1399 if (nthreads == 0)
1400 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1401 return nthreads;
1405 /* Return the default sort size. */
1406 static size_t
1407 default_sort_size (void)
1409 /* Let MEM be available memory or 1/8 of total memory, whichever
1410 is greater. */
1411 double avail = physmem_available ();
1412 double total = physmem_total ();
1413 double mem = MAX (avail, total / 8);
1414 struct rlimit rlimit;
1416 /* Let SIZE be MEM, but no more than the maximum object size or
1417 system resource limits. Avoid the MIN macro here, as it is not
1418 quite right when only one argument is floating point. Don't
1419 bother to check for values like RLIM_INFINITY since in practice
1420 they are not much less than SIZE_MAX. */
1421 size_t size = SIZE_MAX;
1422 if (mem < size)
1423 size = mem;
1424 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1425 size = rlimit.rlim_cur;
1426 #ifdef RLIMIT_AS
1427 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1428 size = rlimit.rlim_cur;
1429 #endif
1431 /* Leave a large safety margin for the above limits, as failure can
1432 occur when they are exceeded. */
1433 size /= 2;
1435 #ifdef RLIMIT_RSS
1436 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1437 Exceeding RSS is not fatal, but can be quite slow. */
1438 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1439 size = rlimit.rlim_cur / 16 * 15;
1440 #endif
1442 /* Use no less than the minimum. */
1443 return MAX (size, MIN_SORT_SIZE);
1446 /* Return the sort buffer size to use with the input files identified
1447 by FPS and FILES, which are alternate names of the same files.
1448 NFILES gives the number of input files; NFPS may be less. Assume
1449 that each input line requires LINE_BYTES extra bytes' worth of line
1450 information. Do not exceed the size bound specified by the user
1451 (or a default size bound, if the user does not specify one). */
1453 static size_t
1454 sort_buffer_size (FILE *const *fps, size_t nfps,
1455 char *const *files, size_t nfiles,
1456 size_t line_bytes)
1458 /* A bound on the input size. If zero, the bound hasn't been
1459 determined yet. */
1460 static size_t size_bound;
1462 /* In the worst case, each input byte is a newline. */
1463 size_t worst_case_per_input_byte = line_bytes + 1;
1465 /* Keep enough room for one extra input line and an extra byte.
1466 This extra room might be needed when preparing to read EOF. */
1467 size_t size = worst_case_per_input_byte + 1;
1469 size_t i;
1471 for (i = 0; i < nfiles; i++)
1473 struct stat st;
1474 off_t file_size;
1475 size_t worst_case;
1477 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1478 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1479 : stat (files[i], &st))
1480 != 0)
1481 die (_("stat failed"), files[i]);
1483 if (S_ISREG (st.st_mode))
1484 file_size = st.st_size;
1485 else
1487 /* The file has unknown size. If the user specified a sort
1488 buffer size, use that; otherwise, guess the size. */
1489 if (sort_size)
1490 return sort_size;
1491 file_size = INPUT_FILE_SIZE_GUESS;
1494 if (! size_bound)
1496 size_bound = sort_size;
1497 if (! size_bound)
1498 size_bound = default_sort_size ();
1501 /* Add the amount of memory needed to represent the worst case
1502 where the input consists entirely of newlines followed by a
1503 single non-newline. Check for overflow. */
1504 worst_case = file_size * worst_case_per_input_byte + 1;
1505 if (file_size != worst_case / worst_case_per_input_byte
1506 || size_bound - size <= worst_case)
1507 return size_bound;
1508 size += worst_case;
1511 return size;
1514 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1515 must be at least sizeof (struct line). Allocate ALLOC bytes
1516 initially. */
1518 static void
1519 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1521 /* Ensure that the line array is properly aligned. If the desired
1522 size cannot be allocated, repeatedly halve it until allocation
1523 succeeds. The smaller allocation may hurt overall performance,
1524 but that's better than failing. */
1525 while (true)
1527 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1528 buf->buf = malloc (alloc);
1529 if (buf->buf)
1530 break;
1531 alloc /= 2;
1532 if (alloc <= line_bytes + 1)
1533 xalloc_die ();
1536 buf->line_bytes = line_bytes;
1537 buf->alloc = alloc;
1538 buf->used = buf->left = buf->nlines = 0;
1539 buf->eof = false;
1542 /* Return one past the limit of the line array. */
1544 static inline struct line *
1545 buffer_linelim (struct buffer const *buf)
1547 return (struct line *) (buf->buf + buf->alloc);
1550 /* Return a pointer to the first character of the field specified
1551 by KEY in LINE. */
1553 static char *
1554 begfield (struct line const *line, struct keyfield const *key)
1556 char *ptr = line->text, *lim = ptr + line->length - 1;
1557 size_t sword = key->sword;
1558 size_t schar = key->schar;
1560 /* The leading field separator itself is included in a field when -t
1561 is absent. */
1563 if (tab != TAB_DEFAULT)
1564 while (ptr < lim && sword--)
1566 while (ptr < lim && *ptr != tab)
1567 ++ptr;
1568 if (ptr < lim)
1569 ++ptr;
1571 else
1572 while (ptr < lim && sword--)
1574 while (ptr < lim && blanks[to_uchar (*ptr)])
1575 ++ptr;
1576 while (ptr < lim && !blanks[to_uchar (*ptr)])
1577 ++ptr;
1580 /* If we're ignoring leading blanks when computing the Start
1581 of the field, skip past them here. */
1582 if (key->skipsblanks)
1583 while (ptr < lim && blanks[to_uchar (*ptr)])
1584 ++ptr;
1586 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1587 ptr = MIN (lim, ptr + schar);
1589 return ptr;
1592 /* Return the limit of (a pointer to the first character after) the field
1593 in LINE specified by KEY. */
1595 static char *
1596 limfield (struct line const *line, struct keyfield const *key)
1598 char *ptr = line->text, *lim = ptr + line->length - 1;
1599 size_t eword = key->eword, echar = key->echar;
1601 if (echar == 0)
1602 eword++; /* Skip all of end field. */
1604 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1605 whichever comes first. If there are more than EWORD fields, leave
1606 PTR pointing at the beginning of the field having zero-based index,
1607 EWORD. If a delimiter character was specified (via -t), then that
1608 'beginning' is the first character following the delimiting TAB.
1609 Otherwise, leave PTR pointing at the first 'blank' character after
1610 the preceding field. */
1611 if (tab != TAB_DEFAULT)
1612 while (ptr < lim && eword--)
1614 while (ptr < lim && *ptr != tab)
1615 ++ptr;
1616 if (ptr < lim && (eword || echar))
1617 ++ptr;
1619 else
1620 while (ptr < lim && eword--)
1622 while (ptr < lim && blanks[to_uchar (*ptr)])
1623 ++ptr;
1624 while (ptr < lim && !blanks[to_uchar (*ptr)])
1625 ++ptr;
1628 #ifdef POSIX_UNSPECIFIED
1629 /* The following block of code makes GNU sort incompatible with
1630 standard Unix sort, so it's ifdef'd out for now.
1631 The POSIX spec isn't clear on how to interpret this.
1632 FIXME: request clarification.
1634 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1635 Date: Thu, 30 May 96 12:20:41 -0400
1636 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1638 [...]I believe I've found another bug in 'sort'.
1640 $ cat /tmp/sort.in
1641 a b c 2 d
1642 pq rs 1 t
1643 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1644 a b c 2 d
1645 pq rs 1 t
1646 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1647 pq rs 1 t
1648 a b c 2 d
1650 Unix sort produced the answer I expected: sort on the single character
1651 in column 7. GNU sort produced different results, because it disagrees
1652 on the interpretation of the key-end spec "M.N". Unix sort reads this
1653 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1654 "skip M-1 fields, then either N-1 characters or the rest of the current
1655 field, whichever comes first". This extra clause applies only to
1656 key-ends, not key-starts.
1659 /* Make LIM point to the end of (one byte past) the current field. */
1660 if (tab != TAB_DEFAULT)
1662 char *newlim;
1663 newlim = memchr (ptr, tab, lim - ptr);
1664 if (newlim)
1665 lim = newlim;
1667 else
1669 char *newlim;
1670 newlim = ptr;
1671 while (newlim < lim && blanks[to_uchar (*newlim)])
1672 ++newlim;
1673 while (newlim < lim && !blanks[to_uchar (*newlim)])
1674 ++newlim;
1675 lim = newlim;
1677 #endif
1679 if (echar != 0) /* We need to skip over a portion of the end field. */
1681 /* If we're ignoring leading blanks when computing the End
1682 of the field, skip past them here. */
1683 if (key->skipeblanks)
1684 while (ptr < lim && blanks[to_uchar (*ptr)])
1685 ++ptr;
1687 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1688 ptr = MIN (lim, ptr + echar);
1691 return ptr;
1694 /* Fill BUF reading from FP, moving buf->left bytes from the end
1695 of buf->buf to the beginning first. If EOF is reached and the
1696 file wasn't terminated by a newline, supply one. Set up BUF's line
1697 table too. FILE is the name of the file corresponding to FP.
1698 Return true if some input was read. */
1700 static bool
1701 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1703 struct keyfield const *key = keylist;
1704 char eol = eolchar;
1705 size_t line_bytes = buf->line_bytes;
1706 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1708 if (buf->eof)
1709 return false;
1711 if (buf->used != buf->left)
1713 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1714 buf->used = buf->left;
1715 buf->nlines = 0;
1718 while (true)
1720 char *ptr = buf->buf + buf->used;
1721 struct line *linelim = buffer_linelim (buf);
1722 struct line *line = linelim - buf->nlines;
1723 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1724 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1726 while (line_bytes + 1 < avail)
1728 /* Read as many bytes as possible, but do not read so many
1729 bytes that there might not be enough room for the
1730 corresponding line array. The worst case is when the
1731 rest of the input file consists entirely of newlines,
1732 except that the last byte is not a newline. */
1733 size_t readsize = (avail - 1) / (line_bytes + 1);
1734 size_t bytes_read = fread (ptr, 1, readsize, fp);
1735 char *ptrlim = ptr + bytes_read;
1736 char *p;
1737 avail -= bytes_read;
1739 if (bytes_read != readsize)
1741 if (ferror (fp))
1742 die (_("read failed"), file);
1743 if (feof (fp))
1745 buf->eof = true;
1746 if (buf->buf == ptrlim)
1747 return false;
1748 if (line_start != ptrlim && ptrlim[-1] != eol)
1749 *ptrlim++ = eol;
1753 /* Find and record each line in the just-read input. */
1754 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1756 /* Delimit the line with NUL. This eliminates the need to
1757 temporarily replace the last byte with NUL when calling
1758 xmemcoll(), which increases performance. */
1759 *p = '\0';
1760 ptr = p + 1;
1761 line--;
1762 line->text = line_start;
1763 line->length = ptr - line_start;
1764 mergesize = MAX (mergesize, line->length);
1765 avail -= line_bytes;
1767 if (key)
1769 /* Precompute the position of the first key for
1770 efficiency. */
1771 line->keylim = (key->eword == SIZE_MAX
1773 : limfield (line, key));
1775 if (key->sword != SIZE_MAX)
1776 line->keybeg = begfield (line, key);
1777 else
1779 if (key->skipsblanks)
1780 while (blanks[to_uchar (*line_start)])
1781 line_start++;
1782 line->keybeg = line_start;
1786 line_start = ptr;
1789 ptr = ptrlim;
1790 if (buf->eof)
1791 break;
1794 buf->used = ptr - buf->buf;
1795 buf->nlines = buffer_linelim (buf) - line;
1796 if (buf->nlines != 0)
1798 buf->left = ptr - line_start;
1799 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1800 return true;
1804 /* The current input line is too long to fit in the buffer.
1805 Double the buffer size and try again, keeping it properly
1806 aligned. */
1807 size_t line_alloc = buf->alloc / sizeof (struct line);
1808 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1809 buf->alloc = line_alloc * sizeof (struct line);
1814 /* Table that maps characters to order-of-magnitude values. */
1815 static char const unit_order[UCHAR_LIM] =
1817 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1818 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1819 /* This initializer syntax works on all C99 hosts. For now, use
1820 it only on non-ASCII hosts, to ease the pain of porting to
1821 pre-C99 ASCII hosts. */
1822 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1823 ['k']=1,
1824 #else
1825 /* Generate the following table with this command:
1826 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1827 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1828 |fmt */
1829 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1830 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1832 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1833 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1834 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1835 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1837 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1838 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1839 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1840 #endif
1843 /* Return an integer that represents the order of magnitude of the
1844 unit following the number. The number may contain thousands
1845 separators and a decimal point, but it may not contain leading blanks.
1846 Negative numbers get negative orders; zero numbers have a zero order. */
1848 static int _GL_ATTRIBUTE_PURE
1849 find_unit_order (char const *number)
1851 bool minus_sign = (*number == '-');
1852 char const *p = number + minus_sign;
1853 int nonzero = 0;
1854 unsigned char ch;
1856 /* Scan to end of number.
1857 Decimals or separators not followed by digits stop the scan.
1858 Numbers ending in decimals or separators are thus considered
1859 to be lacking in units.
1860 FIXME: add support for multibyte thousands_sep and decimal_point. */
1864 while (ISDIGIT (ch = *p++))
1865 nonzero |= ch - '0';
1867 while (ch == thousands_sep);
1869 if (ch == decimal_point)
1870 while (ISDIGIT (ch = *p++))
1871 nonzero |= ch - '0';
1873 if (nonzero)
1875 int order = unit_order[ch];
1876 return (minus_sign ? -order : order);
1878 else
1879 return 0;
1882 /* Compare numbers A and B ending in units with SI or IEC prefixes
1883 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1885 static int
1886 human_numcompare (char const *a, char const *b)
1888 while (blanks[to_uchar (*a)])
1889 a++;
1890 while (blanks[to_uchar (*b)])
1891 b++;
1893 int diff = find_unit_order (a) - find_unit_order (b);
1894 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1897 /* Compare strings A and B as numbers without explicitly converting them to
1898 machine numbers. Comparatively slow for short strings, but asymptotically
1899 hideously fast. */
1901 static int
1902 numcompare (char const *a, char const *b)
1904 while (blanks[to_uchar (*a)])
1905 a++;
1906 while (blanks[to_uchar (*b)])
1907 b++;
1909 return strnumcmp (a, b, decimal_point, thousands_sep);
1912 /* Work around a problem whereby the long double value returned by glibc's
1913 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1914 A and B before calling strtold. FIXME: remove this function once
1915 gnulib guarantees that strtold's result is always well defined. */
1916 static int
1917 nan_compare (char const *sa, char const *sb)
1919 long_double a;
1920 memset (&a, 0, sizeof a);
1921 a = strtold (sa, NULL);
1923 long_double b;
1924 memset (&b, 0, sizeof b);
1925 b = strtold (sb, NULL);
1927 return memcmp (&a, &b, sizeof a);
1930 static int
1931 general_numcompare (char const *sa, char const *sb)
1933 /* FIXME: maybe add option to try expensive FP conversion
1934 only if A and B can't be compared more cheaply/accurately. */
1936 char *ea;
1937 char *eb;
1938 long_double a = strtold (sa, &ea);
1939 long_double b = strtold (sb, &eb);
1941 /* Put conversion errors at the start of the collating sequence. */
1942 if (sa == ea)
1943 return sb == eb ? 0 : -1;
1944 if (sb == eb)
1945 return 1;
1947 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1948 conversion errors but before numbers; sort them by internal
1949 bit-pattern, for lack of a more portable alternative. */
1950 return (a < b ? -1
1951 : a > b ? 1
1952 : a == b ? 0
1953 : b == b ? -1
1954 : a == a ? 1
1955 : nan_compare (sa, sb));
1958 /* Return an integer in 1..12 of the month name MONTH.
1959 Return 0 if the name in S is not recognized. */
1961 static int
1962 getmonth (char const *month, char **ea)
1964 size_t lo = 0;
1965 size_t hi = MONTHS_PER_YEAR;
1967 while (blanks[to_uchar (*month)])
1968 month++;
1972 size_t ix = (lo + hi) / 2;
1973 char const *m = month;
1974 char const *n = monthtab[ix].name;
1976 for (;; m++, n++)
1978 if (!*n)
1980 if (ea)
1981 *ea = (char *) m;
1982 return monthtab[ix].val;
1984 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
1986 hi = ix;
1987 break;
1989 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
1991 lo = ix + 1;
1992 break;
1996 while (lo < hi);
1998 return 0;
2001 /* A randomly chosen MD5 state, used for random comparison. */
2002 static struct md5_ctx random_md5_state;
2004 /* Initialize the randomly chosen MD5 state. */
2006 static void
2007 random_md5_state_init (char const *random_source)
2009 unsigned char buf[MD5_DIGEST_SIZE];
2010 struct randread_source *r = randread_new (random_source, sizeof buf);
2011 if (! r)
2012 die (_("open failed"), random_source);
2013 randread (r, buf, sizeof buf);
2014 if (randread_free (r) != 0)
2015 die (_("close failed"), random_source);
2016 md5_init_ctx (&random_md5_state);
2017 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2020 /* This is like strxfrm, except it reports any error and exits. */
2022 static size_t
2023 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2025 errno = 0;
2026 size_t translated_size = strxfrm (dest, src, destsize);
2028 if (errno)
2030 error (0, errno, _("string transformation failed"));
2031 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2032 error (SORT_FAILURE, 0,
2033 _("the untransformed string was %s"),
2034 quotearg_n_style (0, locale_quoting_style, src));
2037 return translated_size;
2040 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2041 using one or more random hash functions. TEXTA[LENA] and
2042 TEXTB[LENB] must be zero. */
2044 static int
2045 compare_random (char *restrict texta, size_t lena,
2046 char *restrict textb, size_t lenb)
2048 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2049 data. This is used to break ties if there is a checksum
2050 collision, and this is good enough given the astronomically low
2051 probability of a collision. */
2052 int xfrm_diff = 0;
2054 char stackbuf[4000];
2055 char *buf = stackbuf;
2056 size_t bufsize = sizeof stackbuf;
2057 void *allocated = NULL;
2058 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2059 struct md5_ctx s[2];
2060 s[0] = s[1] = random_md5_state;
2062 if (hard_LC_COLLATE)
2064 char const *lima = texta + lena;
2065 char const *limb = textb + lenb;
2067 while (true)
2069 /* Transform the text into the basis of comparison, so that byte
2070 strings that would otherwise considered to be equal are
2071 considered equal here even if their bytes differ.
2073 Each time through this loop, transform one
2074 null-terminated string's worth from TEXTA or from TEXTB
2075 or both. That way, there's no need to store the
2076 transformation of the whole line, if it contains many
2077 null-terminated strings. */
2079 /* Store the transformed data into a big-enough buffer. */
2081 /* A 3X size guess avoids the overhead of calling strxfrm
2082 twice on typical implementations. Don't worry about
2083 size_t overflow, as the guess need not be correct. */
2084 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2085 if (bufsize < guess_bufsize)
2087 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2088 free (allocated);
2089 buf = allocated = malloc (bufsize);
2090 if (! buf)
2092 buf = stackbuf;
2093 bufsize = sizeof stackbuf;
2097 size_t sizea =
2098 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2099 bool a_fits = sizea <= bufsize;
2100 size_t sizeb =
2101 (textb < limb
2102 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2103 (a_fits ? bufsize - sizea : 0))
2104 + 1)
2105 : 0);
2107 if (! (a_fits && sizea + sizeb <= bufsize))
2109 bufsize = sizea + sizeb;
2110 if (bufsize < SIZE_MAX / 3)
2111 bufsize = bufsize * 3 / 2;
2112 free (allocated);
2113 buf = allocated = xmalloc (bufsize);
2114 if (texta < lima)
2115 strxfrm (buf, texta, sizea);
2116 if (textb < limb)
2117 strxfrm (buf + sizea, textb, sizeb);
2120 /* Advance past NULs to the next part of each input string,
2121 exiting the loop if both strings are exhausted. When
2122 exiting the loop, prepare to finish off the tiebreaker
2123 comparison properly. */
2124 if (texta < lima)
2125 texta += strlen (texta) + 1;
2126 if (textb < limb)
2127 textb += strlen (textb) + 1;
2128 if (! (texta < lima || textb < limb))
2130 lena = sizea; texta = buf;
2131 lenb = sizeb; textb = buf + sizea;
2132 break;
2135 /* Accumulate the transformed data in the corresponding
2136 checksums. */
2137 md5_process_bytes (buf, sizea, &s[0]);
2138 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2140 /* Update the tiebreaker comparison of the transformed data. */
2141 if (! xfrm_diff)
2143 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2144 if (! xfrm_diff)
2145 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2150 /* Compute and compare the checksums. */
2151 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2152 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2153 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2155 /* Fall back on the tiebreaker if the checksums collide. */
2156 if (! diff)
2158 if (! xfrm_diff)
2160 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2161 if (! xfrm_diff)
2162 xfrm_diff = (lena > lenb) - (lena < lenb);
2165 diff = xfrm_diff;
2168 free (allocated);
2170 return diff;
2173 /* Return the printable width of the block of memory starting at
2174 TEXT and ending just before LIM, counting each tab as one byte.
2175 FIXME: Should we generally be counting non printable chars? */
2177 static size_t
2178 debug_width (char const *text, char const *lim)
2180 size_t width = mbsnwidth (text, lim - text, 0);
2181 while (text < lim)
2182 width += (*text++ == '\t');
2183 return width;
2186 /* For debug mode, "underline" a key at the
2187 specified offset and screen width. */
2189 static void
2190 mark_key (size_t offset, size_t width)
2192 while (offset--)
2193 putchar (' ');
2195 if (!width)
2196 printf (_("^ no match for key\n"));
2197 else
2200 putchar ('_');
2201 while (--width);
2203 putchar ('\n');
2207 /* Return true if KEY is a numeric key. */
2209 static inline bool
2210 key_numeric (struct keyfield const *key)
2212 return key->numeric || key->general_numeric || key->human_numeric;
2215 /* For LINE, output a debugging line that underlines KEY in LINE.
2216 If KEY is null, underline the whole line. */
2218 static void
2219 debug_key (struct line const *line, struct keyfield const *key)
2221 char *text = line->text;
2222 char *beg = text;
2223 char *lim = text + line->length - 1;
2225 if (key)
2227 if (key->sword != SIZE_MAX)
2228 beg = begfield (line, key);
2229 if (key->eword != SIZE_MAX)
2230 lim = limfield (line, key);
2232 if (key->skipsblanks || key->month || key_numeric (key))
2234 char saved = *lim;
2235 *lim = '\0';
2237 while (blanks[to_uchar (*beg)])
2238 beg++;
2240 char *tighter_lim = beg;
2242 if (lim < beg)
2243 tighter_lim = lim;
2244 else if (key->month)
2245 getmonth (beg, &tighter_lim);
2246 else if (key->general_numeric)
2247 ignore_value (strtold (beg, &tighter_lim));
2248 else if (key->numeric || key->human_numeric)
2250 char *p = beg + (beg < lim && *beg == '-');
2251 bool found_digit = false;
2252 unsigned char ch;
2256 while (ISDIGIT (ch = *p++))
2257 found_digit = true;
2259 while (ch == thousands_sep);
2261 if (ch == decimal_point)
2262 while (ISDIGIT (ch = *p++))
2263 found_digit = true;
2265 if (found_digit)
2266 tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
2268 else
2269 tighter_lim = lim;
2271 *lim = saved;
2272 lim = tighter_lim;
2276 size_t offset = debug_width (text, beg);
2277 size_t width = debug_width (beg, lim);
2278 mark_key (offset, width);
2281 /* Debug LINE by underlining its keys. */
2283 static void
2284 debug_line (struct line const *line)
2286 struct keyfield const *key = keylist;
2289 debug_key (line, key);
2290 while (key && ((key = key->next) || ! (unique || stable)));
2293 /* Return whether sorting options specified for key. */
2295 static bool
2296 default_key_compare (struct keyfield const *key)
2298 return ! (key->ignore
2299 || key->translate
2300 || key->skipsblanks
2301 || key->skipeblanks
2302 || key_numeric (key)
2303 || key->month
2304 || key->version
2305 || key->random
2306 /* || key->reverse */
2310 /* Convert a key to the short options used to specify it. */
2312 static void
2313 key_to_opts (struct keyfield const *key, char *opts)
2315 if (key->skipsblanks || key->skipeblanks)
2316 *opts++ = 'b';/* either disables global -b */
2317 if (key->ignore == nondictionary)
2318 *opts++ = 'd';
2319 if (key->translate)
2320 *opts++ = 'f';
2321 if (key->general_numeric)
2322 *opts++ = 'g';
2323 if (key->human_numeric)
2324 *opts++ = 'h';
2325 if (key->ignore == nonprinting)
2326 *opts++ = 'i';
2327 if (key->month)
2328 *opts++ = 'M';
2329 if (key->numeric)
2330 *opts++ = 'n';
2331 if (key->random)
2332 *opts++ = 'R';
2333 if (key->reverse)
2334 *opts++ = 'r';
2335 if (key->version)
2336 *opts++ = 'V';
2337 *opts = '\0';
2340 /* Output data independent key warnings to stderr. */
2342 static void
2343 key_warnings (struct keyfield const *gkey, bool gkey_only)
2345 struct keyfield const *key;
2346 struct keyfield ugkey = *gkey;
2347 unsigned long keynum = 1;
2349 for (key = keylist; key; key = key->next, keynum++)
2351 if (key->obsolete_used)
2353 size_t sword = key->sword;
2354 size_t eword = key->eword;
2355 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2356 /* obsolescent syntax +A.x -B.y is equivalent to:
2357 -k A+1.x+1,B.y (when y = 0)
2358 -k A+1.x+1,B+1.y (when y > 0) */
2359 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2360 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2361 char *po = obuf;
2362 char *pn = nbuf;
2364 if (sword == SIZE_MAX)
2365 sword++;
2367 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2368 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2369 if (key->eword != SIZE_MAX)
2371 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2372 stpcpy (stpcpy (pn, ","),
2373 umaxtostr (eword + 1
2374 + (key->echar == SIZE_MAX), tmp));
2376 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2377 quote_n (0, obuf), quote_n (1, nbuf));
2380 /* Warn about field specs that will never match. */
2381 if (key->sword != SIZE_MAX && key->eword < key->sword)
2382 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2384 /* Warn about significant leading blanks. */
2385 bool implicit_skip = key_numeric (key) || key->month;
2386 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2387 && !(key->schar || key->echar);
2388 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2389 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2390 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2391 || (!key->skipsblanks && key->schar)
2392 || (!key->skipeblanks && key->echar)))
2393 error (0, 0, _("leading blanks are significant in key %lu; "
2394 "consider also specifying 'b'"), keynum);
2396 /* Warn about numeric comparisons spanning fields,
2397 as field delimiters could be interpreted as part
2398 of the number (maybe only in other locales). */
2399 if (!gkey_only && key_numeric (key))
2401 size_t sword = key->sword + 1;
2402 size_t eword = key->eword + 1;
2403 if (!sword)
2404 sword++;
2405 if (!eword || sword < eword)
2406 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2407 keynum);
2410 /* Flag global options not copied or specified in any key. */
2411 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2412 ugkey.ignore = NULL;
2413 if (ugkey.translate && (ugkey.translate == key->translate))
2414 ugkey.translate = NULL;
2415 ugkey.skipsblanks &= !key->skipsblanks;
2416 ugkey.skipeblanks &= !key->skipeblanks;
2417 ugkey.month &= !key->month;
2418 ugkey.numeric &= !key->numeric;
2419 ugkey.general_numeric &= !key->general_numeric;
2420 ugkey.human_numeric &= !key->human_numeric;
2421 ugkey.random &= !key->random;
2422 ugkey.version &= !key->version;
2423 ugkey.reverse &= !key->reverse;
2426 /* Warn about ignored global options flagged above.
2427 Note if gkey is the only one in the list, all flags are cleared. */
2428 if (!default_key_compare (&ugkey)
2429 || (ugkey.reverse && (stable || unique) && keylist))
2431 bool ugkey_reverse = ugkey.reverse;
2432 if (!(stable || unique))
2433 ugkey.reverse = false;
2434 /* The following is too big, but guaranteed to be "big enough". */
2435 char opts[sizeof short_options];
2436 key_to_opts (&ugkey, opts);
2437 error (0, 0,
2438 ngettext ("option '-%s' is ignored",
2439 "options '-%s' are ignored",
2440 select_plural (strlen (opts))), opts);
2441 ugkey.reverse = ugkey_reverse;
2443 if (ugkey.reverse && !(stable || unique) && keylist)
2444 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2447 /* Compare two lines A and B trying every key in sequence until there
2448 are no more keys or a difference is found. */
2450 static int
2451 keycompare (struct line const *a, struct line const *b)
2453 struct keyfield *key = keylist;
2455 /* For the first iteration only, the key positions have been
2456 precomputed for us. */
2457 char *texta = a->keybeg;
2458 char *textb = b->keybeg;
2459 char *lima = a->keylim;
2460 char *limb = b->keylim;
2462 int diff;
2464 while (true)
2466 char const *translate = key->translate;
2467 bool const *ignore = key->ignore;
2469 /* Treat field ends before field starts as empty fields. */
2470 lima = MAX (texta, lima);
2471 limb = MAX (textb, limb);
2473 /* Find the lengths. */
2474 size_t lena = lima - texta;
2475 size_t lenb = limb - textb;
2477 if (hard_LC_COLLATE || key_numeric (key)
2478 || key->month || key->random || key->version)
2480 char *ta;
2481 char *tb;
2482 size_t tlena;
2483 size_t tlenb;
2485 char enda IF_LINT (= 0);
2486 char endb IF_LINT (= 0);
2487 void *allocated IF_LINT (= NULL);
2488 char stackbuf[4000];
2490 if (ignore || translate)
2492 /* Compute with copies of the keys, which are the result of
2493 translating or ignoring characters, and which need their
2494 own storage. */
2496 size_t i;
2498 /* Allocate space for copies. */
2499 size_t size = lena + 1 + lenb + 1;
2500 if (size <= sizeof stackbuf)
2501 ta = stackbuf, allocated = NULL;
2502 else
2503 ta = allocated = xmalloc (size);
2504 tb = ta + lena + 1;
2506 /* Put into each copy a version of the key in which the
2507 requested characters are ignored or translated. */
2508 for (tlena = i = 0; i < lena; i++)
2509 if (! (ignore && ignore[to_uchar (texta[i])]))
2510 ta[tlena++] = (translate
2511 ? translate[to_uchar (texta[i])]
2512 : texta[i]);
2513 ta[tlena] = '\0';
2515 for (tlenb = i = 0; i < lenb; i++)
2516 if (! (ignore && ignore[to_uchar (textb[i])]))
2517 tb[tlenb++] = (translate
2518 ? translate[to_uchar (textb[i])]
2519 : textb[i]);
2520 tb[tlenb] = '\0';
2522 else
2524 /* Use the keys in-place, temporarily null-terminated. */
2525 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2526 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2529 if (key->numeric)
2530 diff = numcompare (ta, tb);
2531 else if (key->general_numeric)
2532 diff = general_numcompare (ta, tb);
2533 else if (key->human_numeric)
2534 diff = human_numcompare (ta, tb);
2535 else if (key->month)
2536 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2537 else if (key->random)
2538 diff = compare_random (ta, tlena, tb, tlenb);
2539 else if (key->version)
2540 diff = filevercmp (ta, tb);
2541 else
2543 /* Locale-dependent string sorting. This is slower than
2544 C-locale sorting, which is implemented below. */
2545 if (tlena == 0)
2546 diff = - NONZERO (tlenb);
2547 else if (tlenb == 0)
2548 diff = 1;
2549 else
2550 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2553 if (ignore || translate)
2554 free (allocated);
2555 else
2557 ta[tlena] = enda;
2558 tb[tlenb] = endb;
2561 else if (ignore)
2563 #define CMP_WITH_IGNORE(A, B) \
2564 do \
2566 while (true) \
2568 while (texta < lima && ignore[to_uchar (*texta)]) \
2569 ++texta; \
2570 while (textb < limb && ignore[to_uchar (*textb)]) \
2571 ++textb; \
2572 if (! (texta < lima && textb < limb)) \
2573 break; \
2574 diff = to_uchar (A) - to_uchar (B); \
2575 if (diff) \
2576 goto not_equal; \
2577 ++texta; \
2578 ++textb; \
2581 diff = (texta < lima) - (textb < limb); \
2583 while (0)
2585 if (translate)
2586 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2587 translate[to_uchar (*textb)]);
2588 else
2589 CMP_WITH_IGNORE (*texta, *textb);
2591 else if (lena == 0)
2592 diff = - NONZERO (lenb);
2593 else if (lenb == 0)
2594 goto greater;
2595 else
2597 if (translate)
2599 while (texta < lima && textb < limb)
2601 diff = (to_uchar (translate[to_uchar (*texta++)])
2602 - to_uchar (translate[to_uchar (*textb++)]));
2603 if (diff)
2604 goto not_equal;
2607 else
2609 diff = memcmp (texta, textb, MIN (lena, lenb));
2610 if (diff)
2611 goto not_equal;
2613 diff = lena < lenb ? -1 : lena != lenb;
2616 if (diff)
2617 goto not_equal;
2619 key = key->next;
2620 if (! key)
2621 break;
2623 /* Find the beginning and limit of the next field. */
2624 if (key->eword != SIZE_MAX)
2625 lima = limfield (a, key), limb = limfield (b, key);
2626 else
2627 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2629 if (key->sword != SIZE_MAX)
2630 texta = begfield (a, key), textb = begfield (b, key);
2631 else
2633 texta = a->text, textb = b->text;
2634 if (key->skipsblanks)
2636 while (texta < lima && blanks[to_uchar (*texta)])
2637 ++texta;
2638 while (textb < limb && blanks[to_uchar (*textb)])
2639 ++textb;
2644 return 0;
2646 greater:
2647 diff = 1;
2648 not_equal:
2649 return key->reverse ? -diff : diff;
2652 /* Compare two lines A and B, returning negative, zero, or positive
2653 depending on whether A compares less than, equal to, or greater than B. */
2655 static int
2656 compare (struct line const *a, struct line const *b)
2658 int diff;
2659 size_t alen, blen;
2661 /* First try to compare on the specified keys (if any).
2662 The only two cases with no key at all are unadorned sort,
2663 and unadorned sort -r. */
2664 if (keylist)
2666 diff = keycompare (a, b);
2667 if (diff || unique || stable)
2668 return diff;
2671 /* If the keys all compare equal (or no keys were specified)
2672 fall through to the default comparison. */
2673 alen = a->length - 1, blen = b->length - 1;
2675 if (alen == 0)
2676 diff = - NONZERO (blen);
2677 else if (blen == 0)
2678 diff = 1;
2679 else if (hard_LC_COLLATE)
2681 /* Note xmemcoll0 is a performance enhancement as
2682 it will not unconditionally write '\0' after the
2683 passed in buffers, which was seen to give around
2684 a 3% increase in performance for short lines. */
2685 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2687 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2688 diff = alen < blen ? -1 : alen != blen;
2690 return reverse ? -diff : diff;
2693 /* Write LINE to output stream FP; the output file's name is
2694 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2695 otherwise. If debugging is enabled and FP is standard output,
2696 append some debugging information. */
2698 static void
2699 write_line (struct line const *line, FILE *fp, char const *output_file)
2701 char *buf = line->text;
2702 size_t n_bytes = line->length;
2703 char *ebuf = buf + n_bytes;
2705 if (!output_file && debug)
2707 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2708 char const *c = buf;
2710 while (c < ebuf)
2712 char wc = *c++;
2713 if (wc == '\t')
2714 wc = '>';
2715 else if (c == ebuf)
2716 wc = '\n';
2717 if (fputc (wc, fp) == EOF)
2718 die (_("write failed"), output_file);
2721 debug_line (line);
2723 else
2725 ebuf[-1] = eolchar;
2726 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2727 die (_("write failed"), output_file);
2728 ebuf[-1] = '\0';
2732 /* Check that the lines read from FILE_NAME come in order. Return
2733 true if they are in order. If CHECKONLY == 'c', also print a
2734 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2735 they are not in order. */
2737 static bool
2738 check (char const *file_name, char checkonly)
2740 FILE *fp = xfopen (file_name, "r");
2741 struct buffer buf; /* Input buffer. */
2742 struct line temp; /* Copy of previous line. */
2743 size_t alloc = 0;
2744 uintmax_t line_number = 0;
2745 struct keyfield const *key = keylist;
2746 bool nonunique = ! unique;
2747 bool ordered = true;
2749 initbuf (&buf, sizeof (struct line),
2750 MAX (merge_buffer_size, sort_size));
2751 temp.text = NULL;
2753 while (fillbuf (&buf, fp, file_name))
2755 struct line const *line = buffer_linelim (&buf);
2756 struct line const *linebase = line - buf.nlines;
2758 /* Make sure the line saved from the old buffer contents is
2759 less than or equal to the first line of the new buffer. */
2760 if (alloc && nonunique <= compare (&temp, line - 1))
2762 found_disorder:
2764 if (checkonly == 'c')
2766 struct line const *disorder_line = line - 1;
2767 uintmax_t disorder_line_number =
2768 buffer_linelim (&buf) - disorder_line + line_number;
2769 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2770 fprintf (stderr, _("%s: %s:%s: disorder: "),
2771 program_name, file_name,
2772 umaxtostr (disorder_line_number, hr_buf));
2773 write_line (disorder_line, stderr, _("standard error"));
2776 ordered = false;
2777 break;
2781 /* Compare each line in the buffer with its successor. */
2782 while (linebase < --line)
2783 if (nonunique <= compare (line, line - 1))
2784 goto found_disorder;
2786 line_number += buf.nlines;
2788 /* Save the last line of the buffer. */
2789 if (alloc < line->length)
2793 alloc *= 2;
2794 if (! alloc)
2796 alloc = line->length;
2797 break;
2800 while (alloc < line->length);
2802 free (temp.text);
2803 temp.text = xmalloc (alloc);
2805 memcpy (temp.text, line->text, line->length);
2806 temp.length = line->length;
2807 if (key)
2809 temp.keybeg = temp.text + (line->keybeg - line->text);
2810 temp.keylim = temp.text + (line->keylim - line->text);
2814 xfclose (fp, file_name);
2815 free (buf.buf);
2816 free (temp.text);
2817 return ordered;
2820 /* Open FILES (there are NFILES of them) and store the resulting array
2821 of stream pointers into (*PFPS). Allocate the array. Return the
2822 number of successfully opened files, setting errno if this value is
2823 less than NFILES. */
2825 static size_t
2826 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2828 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2829 int i;
2831 /* Open as many input files as we can. */
2832 for (i = 0; i < nfiles; i++)
2834 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2835 ? open_temp (files[i].temp)
2836 : stream_open (files[i].name, "r"));
2837 if (!fps[i])
2838 break;
2841 return i;
2844 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2845 files (all of which are at the start of the FILES array), and
2846 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2847 FPS is the vector of open stream corresponding to the files.
2848 Close input and output streams before returning.
2849 OUTPUT_FILE gives the name of the output file. If it is NULL,
2850 the output file is standard output. */
2852 static void
2853 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2854 FILE *ofp, char const *output_file, FILE **fps)
2856 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2857 /* Input buffers for each file. */
2858 struct line saved; /* Saved line storage for unique check. */
2859 struct line const *savedline = NULL;
2860 /* &saved if there is a saved line. */
2861 size_t savealloc = 0; /* Size allocated for the saved line. */
2862 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2863 /* Current line in each line table. */
2864 struct line const **base = xnmalloc (nfiles, sizeof *base);
2865 /* Base of each line table. */
2866 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2867 /* Table representing a permutation of fps,
2868 such that cur[ord[0]] is the smallest line
2869 and will be next output. */
2870 size_t i;
2871 size_t j;
2872 size_t t;
2873 struct keyfield const *key = keylist;
2874 saved.text = NULL;
2876 /* Read initial lines from each input file. */
2877 for (i = 0; i < nfiles; )
2879 initbuf (&buffer[i], sizeof (struct line),
2880 MAX (merge_buffer_size, sort_size / nfiles));
2881 if (fillbuf (&buffer[i], fps[i], files[i].name))
2883 struct line const *linelim = buffer_linelim (&buffer[i]);
2884 cur[i] = linelim - 1;
2885 base[i] = linelim - buffer[i].nlines;
2886 i++;
2888 else
2890 /* fps[i] is empty; eliminate it from future consideration. */
2891 xfclose (fps[i], files[i].name);
2892 if (i < ntemps)
2894 ntemps--;
2895 zaptemp (files[i].name);
2897 free (buffer[i].buf);
2898 --nfiles;
2899 for (j = i; j < nfiles; ++j)
2901 files[j] = files[j + 1];
2902 fps[j] = fps[j + 1];
2907 /* Set up the ord table according to comparisons among input lines.
2908 Since this only reorders two items if one is strictly greater than
2909 the other, it is stable. */
2910 for (i = 0; i < nfiles; ++i)
2911 ord[i] = i;
2912 for (i = 1; i < nfiles; ++i)
2913 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2914 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2916 /* Repeatedly output the smallest line until no input remains. */
2917 while (nfiles)
2919 struct line const *smallest = cur[ord[0]];
2921 /* If uniquified output is turned on, output only the first of
2922 an identical series of lines. */
2923 if (unique)
2925 if (savedline && compare (savedline, smallest))
2927 savedline = NULL;
2928 write_line (&saved, ofp, output_file);
2930 if (!savedline)
2932 savedline = &saved;
2933 if (savealloc < smallest->length)
2936 if (! savealloc)
2938 savealloc = smallest->length;
2939 break;
2941 while ((savealloc *= 2) < smallest->length);
2943 free (saved.text);
2944 saved.text = xmalloc (savealloc);
2946 saved.length = smallest->length;
2947 memcpy (saved.text, smallest->text, saved.length);
2948 if (key)
2950 saved.keybeg =
2951 saved.text + (smallest->keybeg - smallest->text);
2952 saved.keylim =
2953 saved.text + (smallest->keylim - smallest->text);
2957 else
2958 write_line (smallest, ofp, output_file);
2960 /* Check if we need to read more lines into core. */
2961 if (base[ord[0]] < smallest)
2962 cur[ord[0]] = smallest - 1;
2963 else
2965 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2967 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2968 cur[ord[0]] = linelim - 1;
2969 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2971 else
2973 /* We reached EOF on fps[ord[0]]. */
2974 for (i = 1; i < nfiles; ++i)
2975 if (ord[i] > ord[0])
2976 --ord[i];
2977 --nfiles;
2978 xfclose (fps[ord[0]], files[ord[0]].name);
2979 if (ord[0] < ntemps)
2981 ntemps--;
2982 zaptemp (files[ord[0]].name);
2984 free (buffer[ord[0]].buf);
2985 for (i = ord[0]; i < nfiles; ++i)
2987 fps[i] = fps[i + 1];
2988 files[i] = files[i + 1];
2989 buffer[i] = buffer[i + 1];
2990 cur[i] = cur[i + 1];
2991 base[i] = base[i + 1];
2993 for (i = 0; i < nfiles; ++i)
2994 ord[i] = ord[i + 1];
2995 continue;
2999 /* The new line just read in may be larger than other lines
3000 already in main memory; push it back in the queue until we
3001 encounter a line larger than it. Optimize for the common
3002 case where the new line is smallest. */
3004 size_t lo = 1;
3005 size_t hi = nfiles;
3006 size_t probe = lo;
3007 size_t ord0 = ord[0];
3008 size_t count_of_smaller_lines;
3010 while (lo < hi)
3012 int cmp = compare (cur[ord0], cur[ord[probe]]);
3013 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3014 hi = probe;
3015 else
3016 lo = probe + 1;
3017 probe = (lo + hi) / 2;
3020 count_of_smaller_lines = lo - 1;
3021 for (j = 0; j < count_of_smaller_lines; j++)
3022 ord[j] = ord[j + 1];
3023 ord[count_of_smaller_lines] = ord0;
3027 if (unique && savedline)
3029 write_line (&saved, ofp, output_file);
3030 free (saved.text);
3033 xfclose (ofp, output_file);
3034 free (fps);
3035 free (buffer);
3036 free (ord);
3037 free (base);
3038 free (cur);
3041 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3042 files (all of which are at the start of the FILES array), and
3043 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3044 Close input and output files before returning.
3045 OUTPUT_FILE gives the name of the output file.
3047 Return the number of files successfully merged. This number can be
3048 less than NFILES if we ran low on file descriptors, but in this
3049 case it is never less than 2. */
3051 static size_t
3052 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3053 FILE *ofp, char const *output_file)
3055 FILE **fps;
3056 size_t nopened = open_input_files (files, nfiles, &fps);
3057 if (nopened < nfiles && nopened < 2)
3058 die (_("open failed"), files[nopened].name);
3059 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3060 return nopened;
3063 /* Merge into T (of size NLINES) the two sorted arrays of lines
3064 LO (with NLINES / 2 members), and
3065 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3066 T and LO point just past their respective arrays, and the arrays
3067 are in reverse order. NLINES must be at least 2. */
3069 static void
3070 mergelines (struct line *restrict t, size_t nlines,
3071 struct line const *restrict lo)
3073 size_t nlo = nlines / 2;
3074 size_t nhi = nlines - nlo;
3075 struct line *hi = t - nlo;
3077 while (true)
3078 if (compare (lo - 1, hi - 1) <= 0)
3080 *--t = *--lo;
3081 if (! --nlo)
3083 /* HI must equal T now, and there is no need to copy from
3084 HI to T. */
3085 return;
3088 else
3090 *--t = *--hi;
3091 if (! --nhi)
3094 *--t = *--lo;
3095 while (--nlo);
3097 return;
3102 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3103 Do this all within one thread. NLINES must be at least 2.
3104 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3105 Otherwise the sort is in-place and TEMP is half-sized.
3106 The input and output arrays are in reverse order, and LINES and
3107 TEMP point just past the end of their respective arrays.
3109 Use a recursive divide-and-conquer algorithm, in the style
3110 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3111 the optimization suggested by exercise 5.2.4-10; this requires room
3112 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3113 writes that this memory optimization was originally published by
3114 D. A. Bell, Comp J. 1 (1958), 75. */
3116 static void
3117 sequential_sort (struct line *restrict lines, size_t nlines,
3118 struct line *restrict temp, bool to_temp)
3120 if (nlines == 2)
3122 /* Declare 'swap' as int, not bool, to work around a bug
3123 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3124 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3125 int swap = (0 < compare (&lines[-1], &lines[-2]));
3126 if (to_temp)
3128 temp[-1] = lines[-1 - swap];
3129 temp[-2] = lines[-2 + swap];
3131 else if (swap)
3133 temp[-1] = lines[-1];
3134 lines[-1] = lines[-2];
3135 lines[-2] = temp[-1];
3138 else
3140 size_t nlo = nlines / 2;
3141 size_t nhi = nlines - nlo;
3142 struct line *lo = lines;
3143 struct line *hi = lines - nlo;
3145 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3146 if (1 < nlo)
3147 sequential_sort (lo, nlo, temp, !to_temp);
3148 else if (!to_temp)
3149 temp[-1] = lo[-1];
3151 struct line *dest;
3152 struct line const *sorted_lo;
3153 if (to_temp)
3155 dest = temp;
3156 sorted_lo = lines;
3158 else
3160 dest = lines;
3161 sorted_lo = temp;
3163 mergelines (dest, nlines, sorted_lo);
3167 static struct merge_node *init_node (struct merge_node *restrict,
3168 struct merge_node *restrict,
3169 struct line *, size_t, size_t, bool);
3172 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3173 lines, with destination DEST. */
3174 static struct merge_node *
3175 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3177 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3179 struct merge_node *root = merge_tree;
3180 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3181 root->dest = NULL;
3182 root->nlo = root->nhi = nlines;
3183 root->parent = NULL;
3184 root->level = MERGE_END;
3185 root->queued = false;
3186 pthread_mutex_init (&root->lock, NULL);
3188 init_node (root, root + 1, dest, nthreads, nlines, false);
3189 return merge_tree;
3192 /* Destroy the merge tree. */
3193 static void
3194 merge_tree_destroy (struct merge_node *merge_tree)
3196 free (merge_tree);
3199 /* Initialize a merge tree node and its descendants. The node's
3200 parent is PARENT. The node and its descendants are taken from the
3201 array of nodes NODE_POOL. Their destination starts at DEST; they
3202 will consume NTHREADS threads. The total number of sort lines is
3203 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3204 its parent. */
3206 static struct merge_node *
3207 init_node (struct merge_node *restrict parent,
3208 struct merge_node *restrict node_pool,
3209 struct line *dest, size_t nthreads,
3210 size_t total_lines, bool is_lo_child)
3212 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3213 size_t nlo = nlines / 2;
3214 size_t nhi = nlines - nlo;
3215 struct line *lo = dest - total_lines;
3216 struct line *hi = lo - nlo;
3217 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3219 struct merge_node *node = node_pool++;
3220 node->lo = node->end_lo = lo;
3221 node->hi = node->end_hi = hi;
3222 node->dest = parent_end;
3223 node->nlo = nlo;
3224 node->nhi = nhi;
3225 node->parent = parent;
3226 node->level = parent->level + 1;
3227 node->queued = false;
3228 pthread_mutex_init (&node->lock, NULL);
3230 if (nthreads > 1)
3232 size_t lo_threads = nthreads / 2;
3233 size_t hi_threads = nthreads - lo_threads;
3234 node->lo_child = node_pool;
3235 node_pool = init_node (node, node_pool, lo, lo_threads,
3236 total_lines, true);
3237 node->hi_child = node_pool;
3238 node_pool = init_node (node, node_pool, hi, hi_threads,
3239 total_lines, false);
3241 else
3243 node->lo_child = NULL;
3244 node->hi_child = NULL;
3246 return node_pool;
3250 /* Compare two merge nodes A and B for priority. */
3252 static int
3253 compare_nodes (void const *a, void const *b)
3255 struct merge_node const *nodea = a;
3256 struct merge_node const *nodeb = b;
3257 if (nodea->level == nodeb->level)
3258 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3259 return nodea->level < nodeb->level;
3262 /* Lock a merge tree NODE. */
3264 static inline void
3265 lock_node (struct merge_node *node)
3267 pthread_mutex_lock (&node->lock);
3270 /* Unlock a merge tree NODE. */
3272 static inline void
3273 unlock_node (struct merge_node *node)
3275 pthread_mutex_unlock (&node->lock);
3278 /* Destroy merge QUEUE. */
3280 static void
3281 queue_destroy (struct merge_node_queue *queue)
3283 heap_free (queue->priority_queue);
3284 pthread_cond_destroy (&queue->cond);
3285 pthread_mutex_destroy (&queue->mutex);
3288 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3289 NTHREADS threads. */
3291 static void
3292 queue_init (struct merge_node_queue *queue, size_t nthreads)
3294 /* Though it's highly unlikely all nodes are in the heap at the same
3295 time, the heap should accommodate all of them. Counting a NULL
3296 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3297 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3298 pthread_mutex_init (&queue->mutex, NULL);
3299 pthread_cond_init (&queue->cond, NULL);
3302 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3303 does not need to lock NODE. */
3305 static void
3306 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3308 pthread_mutex_lock (&queue->mutex);
3309 heap_insert (queue->priority_queue, node);
3310 node->queued = true;
3311 pthread_mutex_unlock (&queue->mutex);
3312 pthread_cond_signal (&queue->cond);
3315 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3317 static struct merge_node *
3318 queue_pop (struct merge_node_queue *queue)
3320 struct merge_node *node;
3321 pthread_mutex_lock (&queue->mutex);
3322 while (! (node = heap_remove_top (queue->priority_queue)))
3323 pthread_cond_wait (&queue->cond, &queue->mutex);
3324 pthread_mutex_unlock (&queue->mutex);
3325 lock_node (node);
3326 node->queued = false;
3327 return node;
3330 /* Output LINE to TFP, unless -u is specified and the line compares
3331 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3332 is null if TFP is standard output.
3334 This function does not save the line for comparison later, so it is
3335 appropriate only for internal sort. */
3337 static void
3338 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3340 static struct line saved;
3342 if (unique)
3344 if (saved.text && ! compare (line, &saved))
3345 return;
3346 saved = *line;
3349 write_line (line, tfp, temp_output);
3352 /* Merge the lines currently available to a NODE in the binary
3353 merge tree. Merge a number of lines appropriate for this merge
3354 level, assuming TOTAL_LINES is the total number of lines.
3356 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3357 the name of TFP, or is null if TFP is standard output. */
3359 static void
3360 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3361 FILE *tfp, char const *temp_output)
3363 struct line *lo_orig = node->lo;
3364 struct line *hi_orig = node->hi;
3365 size_t to_merge = MAX_MERGE (total_lines, node->level);
3366 size_t merged_lo;
3367 size_t merged_hi;
3369 if (node->level > MERGE_ROOT)
3371 /* Merge to destination buffer. */
3372 struct line *dest = *node->dest;
3373 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3374 if (compare (node->lo - 1, node->hi - 1) <= 0)
3375 *--dest = *--node->lo;
3376 else
3377 *--dest = *--node->hi;
3379 merged_lo = lo_orig - node->lo;
3380 merged_hi = hi_orig - node->hi;
3382 if (node->nhi == merged_hi)
3383 while (node->lo != node->end_lo && to_merge--)
3384 *--dest = *--node->lo;
3385 else if (node->nlo == merged_lo)
3386 while (node->hi != node->end_hi && to_merge--)
3387 *--dest = *--node->hi;
3388 *node->dest = dest;
3390 else
3392 /* Merge directly to output. */
3393 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3395 if (compare (node->lo - 1, node->hi - 1) <= 0)
3396 write_unique (--node->lo, tfp, temp_output);
3397 else
3398 write_unique (--node->hi, tfp, temp_output);
3401 merged_lo = lo_orig - node->lo;
3402 merged_hi = hi_orig - node->hi;
3404 if (node->nhi == merged_hi)
3406 while (node->lo != node->end_lo && to_merge--)
3407 write_unique (--node->lo, tfp, temp_output);
3409 else if (node->nlo == merged_lo)
3411 while (node->hi != node->end_hi && to_merge--)
3412 write_unique (--node->hi, tfp, temp_output);
3416 /* Update NODE. */
3417 merged_lo = lo_orig - node->lo;
3418 merged_hi = hi_orig - node->hi;
3419 node->nlo -= merged_lo;
3420 node->nhi -= merged_hi;
3423 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3424 NODE's children has available lines and the other either has
3425 available lines or has exhausted its lines. */
3427 static void
3428 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3430 if (! node->queued)
3432 bool lo_avail = (node->lo - node->end_lo) != 0;
3433 bool hi_avail = (node->hi - node->end_hi) != 0;
3434 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3435 queue_insert (queue, node);
3439 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3441 static void
3442 queue_check_insert_parent (struct merge_node_queue *queue,
3443 struct merge_node *node)
3445 if (node->level > MERGE_ROOT)
3447 lock_node (node->parent);
3448 queue_check_insert (queue, node->parent);
3449 unlock_node (node->parent);
3451 else if (node->nlo + node->nhi == 0)
3453 /* If the MERGE_ROOT NODE has finished merging, insert the
3454 MERGE_END node. */
3455 queue_insert (queue, node->parent);
3459 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3460 some of those lines, until the MERGE_END node is popped.
3461 TOTAL_LINES is the total number of lines. If merging at the top
3462 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3463 null if TFP is standard output. */
3465 static void
3466 merge_loop (struct merge_node_queue *queue,
3467 size_t total_lines, FILE *tfp, char const *temp_output)
3469 while (1)
3471 struct merge_node *node = queue_pop (queue);
3473 if (node->level == MERGE_END)
3475 unlock_node (node);
3476 /* Reinsert so other threads can pop it. */
3477 queue_insert (queue, node);
3478 break;
3480 mergelines_node (node, total_lines, tfp, temp_output);
3481 queue_check_insert (queue, node);
3482 queue_check_insert_parent (queue, node);
3484 unlock_node (node);
3489 static void sortlines (struct line *restrict, size_t, size_t,
3490 struct merge_node *, struct merge_node_queue *,
3491 FILE *, char const *);
3493 /* Thread arguments for sortlines_thread. */
3495 struct thread_args
3497 /* Source, i.e., the array of lines to sort. This points just past
3498 the end of the array. */
3499 struct line *lines;
3501 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3502 size_t nthreads;
3504 /* Number of lines in LINES and DEST. */
3505 size_t const total_lines;
3507 /* Merge node. Lines from this node and this node's sibling will merged
3508 to this node's parent. */
3509 struct merge_node *const node;
3511 /* The priority queue controlling available work for the entire
3512 internal sort. */
3513 struct merge_node_queue *const queue;
3515 /* If at the top level, the file to output to, and the file's name.
3516 If the file is standard output, the file's name is null. */
3517 FILE *tfp;
3518 char const *output_temp;
3521 /* Like sortlines, except with a signature acceptable to pthread_create. */
3523 static void *
3524 sortlines_thread (void *data)
3526 struct thread_args const *args = data;
3527 sortlines (args->lines, args->nthreads, args->total_lines,
3528 args->node, args->queue, args->tfp,
3529 args->output_temp);
3530 return NULL;
3533 /* Sort lines, possibly in parallel. The arguments are as in struct
3534 thread_args above.
3536 The algorithm has three phases: node creation, sequential sort,
3537 and binary merge.
3539 During node creation, sortlines recursively visits each node in the
3540 binary merge tree and creates a NODE structure corresponding to all the
3541 future line merging NODE is responsible for. For each call to
3542 sortlines, half the available threads are assigned to each recursive
3543 call, until a leaf node having only 1 available thread is reached.
3545 Each leaf node then performs two sequential sorts, one on each half of
3546 the lines it is responsible for. It records in its NODE structure that
3547 there are two sorted sublists available to merge from, and inserts its
3548 NODE into the priority queue.
3550 The binary merge phase then begins. Each thread drops into a loop
3551 where the thread retrieves a NODE from the priority queue, merges lines
3552 available to that NODE, and potentially insert NODE or its parent back
3553 into the queue if there are sufficient available lines for them to
3554 merge. This continues until all lines at all nodes of the merge tree
3555 have been merged. */
3557 static void
3558 sortlines (struct line *restrict lines, size_t nthreads,
3559 size_t total_lines, struct merge_node *node,
3560 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3562 size_t nlines = node->nlo + node->nhi;
3564 /* Calculate thread arguments. */
3565 size_t lo_threads = nthreads / 2;
3566 size_t hi_threads = nthreads - lo_threads;
3567 pthread_t thread;
3568 struct thread_args args = {lines, lo_threads, total_lines,
3569 node->lo_child, queue, tfp, temp_output};
3571 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3572 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3574 sortlines (lines - node->nlo, hi_threads, total_lines,
3575 node->hi_child, queue, tfp, temp_output);
3576 pthread_join (thread, NULL);
3578 else
3580 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3581 Sort with 1 thread. */
3582 size_t nlo = node->nlo;
3583 size_t nhi = node->nhi;
3584 struct line *temp = lines - total_lines;
3585 if (1 < nhi)
3586 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3587 if (1 < nlo)
3588 sequential_sort (lines, nlo, temp, false);
3590 /* Update merge NODE. No need to lock yet. */
3591 node->lo = lines;
3592 node->hi = lines - nlo;
3593 node->end_lo = lines - nlo;
3594 node->end_hi = lines - nlo - nhi;
3596 queue_insert (queue, node);
3597 merge_loop (queue, total_lines, tfp, temp_output);
3600 pthread_mutex_destroy (&node->lock);
3603 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3604 the same as OUTFILE. If found, replace each with the same
3605 temporary copy that can be merged into OUTFILE without destroying
3606 OUTFILE before it is completely read. This temporary copy does not
3607 count as a merge temp, so don't worry about incrementing NTEMPS in
3608 the caller; final cleanup will remove it, not zaptemp.
3610 This test ensures that an otherwise-erroneous use like
3611 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3612 It's not clear that POSIX requires this nicety.
3613 Detect common error cases, but don't try to catch obscure cases like
3614 "cat ... FILE ... | sort -m -o FILE"
3615 where traditional "sort" doesn't copy the input and where
3616 people should know that they're getting into trouble anyway.
3617 Catching these obscure cases would slow down performance in
3618 common cases. */
3620 static void
3621 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3622 size_t nfiles, char const *outfile)
3624 size_t i;
3625 bool got_outstat = false;
3626 struct stat outstat;
3627 struct tempnode *tempcopy = NULL;
3629 for (i = ntemps; i < nfiles; i++)
3631 bool is_stdin = STREQ (files[i].name, "-");
3632 bool same;
3633 struct stat instat;
3635 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3636 same = true;
3637 else
3639 if (! got_outstat)
3641 if ((outfile
3642 ? stat (outfile, &outstat)
3643 : fstat (STDOUT_FILENO, &outstat))
3644 != 0)
3645 break;
3646 got_outstat = true;
3649 same = (((is_stdin
3650 ? fstat (STDIN_FILENO, &instat)
3651 : stat (files[i].name, &instat))
3652 == 0)
3653 && SAME_INODE (instat, outstat));
3656 if (same)
3658 if (! tempcopy)
3660 FILE *tftp;
3661 tempcopy = create_temp (&tftp);
3662 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3665 files[i].name = tempcopy->name;
3666 files[i].temp = tempcopy;
3671 /* Merge the input FILES. NTEMPS is the number of files at the
3672 start of FILES that are temporary; it is zero at the top level.
3673 NFILES is the total number of files. Put the output in
3674 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3676 static void
3677 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3678 char const *output_file)
3680 while (nmerge < nfiles)
3682 /* Number of input files processed so far. */
3683 size_t in;
3685 /* Number of output files generated so far. */
3686 size_t out;
3688 /* nfiles % NMERGE; this counts input files that are left over
3689 after all full-sized merges have been done. */
3690 size_t remainder;
3692 /* Number of easily-available slots at the next loop iteration. */
3693 size_t cheap_slots;
3695 /* Do as many NMERGE-size merges as possible. In the case that
3696 nmerge is bogus, increment by the maximum number of file
3697 descriptors allowed. */
3698 for (out = in = 0; nmerge <= nfiles - in; out++)
3700 FILE *tfp;
3701 struct tempnode *temp = create_temp (&tfp);
3702 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3703 nmerge, tfp, temp->name);
3704 ntemps -= MIN (ntemps, num_merged);
3705 files[out].name = temp->name;
3706 files[out].temp = temp;
3707 in += num_merged;
3710 remainder = nfiles - in;
3711 cheap_slots = nmerge - out % nmerge;
3713 if (cheap_slots < remainder)
3715 /* So many files remain that they can't all be put into the last
3716 NMERGE-sized output window. Do one more merge. Merge as few
3717 files as possible, to avoid needless I/O. */
3718 size_t nshortmerge = remainder - cheap_slots + 1;
3719 FILE *tfp;
3720 struct tempnode *temp = create_temp (&tfp);
3721 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3722 nshortmerge, tfp, temp->name);
3723 ntemps -= MIN (ntemps, num_merged);
3724 files[out].name = temp->name;
3725 files[out++].temp = temp;
3726 in += num_merged;
3729 /* Put the remaining input files into the last NMERGE-sized output
3730 window, so they will be merged in the next pass. */
3731 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3732 ntemps += out;
3733 nfiles -= in - out;
3736 avoid_trashing_input (files, ntemps, nfiles, output_file);
3738 /* We aren't guaranteed that this final mergefiles will work, therefore we
3739 try to merge into the output, and then merge as much as we can into a
3740 temp file if we can't. Repeat. */
3742 while (true)
3744 /* Merge directly into the output file if possible. */
3745 FILE **fps;
3746 size_t nopened = open_input_files (files, nfiles, &fps);
3748 if (nopened == nfiles)
3750 FILE *ofp = stream_open (output_file, "w");
3751 if (ofp)
3753 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3754 break;
3756 if (errno != EMFILE || nopened <= 2)
3757 die (_("open failed"), output_file);
3759 else if (nopened <= 2)
3760 die (_("open failed"), files[nopened].name);
3762 /* We ran out of file descriptors. Close one of the input
3763 files, to gain a file descriptor. Then create a temporary
3764 file with our spare file descriptor. Retry if that failed
3765 (e.g., some other process could open a file between the time
3766 we closed and tried to create). */
3767 FILE *tfp;
3768 struct tempnode *temp;
3771 nopened--;
3772 xfclose (fps[nopened], files[nopened].name);
3773 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3775 while (!temp);
3777 /* Merge into the newly allocated temporary. */
3778 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3779 fps);
3780 ntemps -= MIN (ntemps, nopened);
3781 files[0].name = temp->name;
3782 files[0].temp = temp;
3784 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3785 ntemps++;
3786 nfiles -= nopened - 1;
3790 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3792 static void
3793 sort (char *const *files, size_t nfiles, char const *output_file,
3794 size_t nthreads)
3796 struct buffer buf;
3797 IF_LINT (buf.buf = NULL);
3798 size_t ntemps = 0;
3799 bool output_file_created = false;
3801 buf.alloc = 0;
3803 while (nfiles)
3805 char const *temp_output;
3806 char const *file = *files;
3807 FILE *fp = xfopen (file, "r");
3808 FILE *tfp;
3810 size_t bytes_per_line;
3811 if (nthreads > 1)
3813 /* Get log P. */
3814 size_t tmp = 1;
3815 size_t mult = 1;
3816 while (tmp < nthreads)
3818 tmp *= 2;
3819 mult++;
3821 bytes_per_line = (mult * sizeof (struct line));
3823 else
3824 bytes_per_line = sizeof (struct line) * 3 / 2;
3826 if (! buf.alloc)
3827 initbuf (&buf, bytes_per_line,
3828 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3829 buf.eof = false;
3830 files++;
3831 nfiles--;
3833 while (fillbuf (&buf, fp, file))
3835 struct line *line;
3837 if (buf.eof && nfiles
3838 && (bytes_per_line + 1
3839 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3841 /* End of file, but there is more input and buffer room.
3842 Concatenate the next input file; this is faster in
3843 the usual case. */
3844 buf.left = buf.used;
3845 break;
3848 line = buffer_linelim (&buf);
3849 if (buf.eof && !nfiles && !ntemps && !buf.left)
3851 xfclose (fp, file);
3852 tfp = xfopen (output_file, "w");
3853 temp_output = output_file;
3854 output_file_created = true;
3856 else
3858 ++ntemps;
3859 temp_output = create_temp (&tfp)->name;
3861 if (1 < buf.nlines)
3863 struct merge_node_queue queue;
3864 queue_init (&queue, nthreads);
3865 struct merge_node *merge_tree =
3866 merge_tree_init (nthreads, buf.nlines, line);
3867 struct merge_node *root = merge_tree + 1;
3869 sortlines (line, nthreads, buf.nlines, root,
3870 &queue, tfp, temp_output);
3871 queue_destroy (&queue);
3872 pthread_mutex_destroy (&root->lock);
3873 merge_tree_destroy (merge_tree);
3875 else
3876 write_unique (line - 1, tfp, temp_output);
3878 xfclose (tfp, temp_output);
3880 if (output_file_created)
3881 goto finish;
3883 xfclose (fp, file);
3886 finish:
3887 free (buf.buf);
3889 if (! output_file_created)
3891 size_t i;
3892 struct tempnode *node = temphead;
3893 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3894 for (i = 0; node; i++)
3896 tempfiles[i].name = node->name;
3897 tempfiles[i].temp = node;
3898 node = node->next;
3900 merge (tempfiles, ntemps, ntemps, output_file);
3901 free (tempfiles);
3904 reap_all ();
3907 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3909 static void
3910 insertkey (struct keyfield *key_arg)
3912 struct keyfield **p;
3913 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3915 for (p = &keylist; *p; p = &(*p)->next)
3916 continue;
3917 *p = key;
3918 key->next = NULL;
3921 /* Report a bad field specification SPEC, with extra info MSGID. */
3923 static void badfieldspec (char const *, char const *)
3924 ATTRIBUTE_NORETURN;
3925 static void
3926 badfieldspec (char const *spec, char const *msgid)
3928 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3929 _(msgid), quote (spec));
3930 abort ();
3933 /* Report incompatible options. */
3935 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3936 static void
3937 incompatible_options (char const *opts)
3939 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), opts);
3940 abort ();
3943 /* Check compatibility of ordering options. */
3945 static void
3946 check_ordering_compatibility (void)
3948 struct keyfield *key;
3950 for (key = keylist; key; key = key->next)
3951 if (1 < (key->numeric + key->general_numeric + key->human_numeric
3952 + key->month + (key->version | key->random | !!key->ignore)))
3954 /* The following is too big, but guaranteed to be "big enough". */
3955 char opts[sizeof short_options];
3956 /* Clear flags we're not interested in. */
3957 key->skipsblanks = key->skipeblanks = key->reverse = false;
3958 key_to_opts (key, opts);
3959 incompatible_options (opts);
3963 /* Parse the leading integer in STRING and store the resulting value
3964 (which must fit into size_t) into *VAL. Return the address of the
3965 suffix after the integer. If the value is too large, silently
3966 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3967 failure; otherwise, report MSGID and exit on failure. */
3969 static char const *
3970 parse_field_count (char const *string, size_t *val, char const *msgid)
3972 char *suffix;
3973 uintmax_t n;
3975 switch (xstrtoumax (string, &suffix, 10, &n, ""))
3977 case LONGINT_OK:
3978 case LONGINT_INVALID_SUFFIX_CHAR:
3979 *val = n;
3980 if (*val == n)
3981 break;
3982 /* Fall through. */
3983 case LONGINT_OVERFLOW:
3984 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3985 *val = SIZE_MAX;
3986 break;
3988 case LONGINT_INVALID:
3989 if (msgid)
3990 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3991 _(msgid), quote (string));
3992 return NULL;
3995 return suffix;
3998 /* Handle interrupts and hangups. */
4000 static void
4001 sighandler (int sig)
4003 if (! SA_NOCLDSTOP)
4004 signal (sig, SIG_IGN);
4006 cleanup ();
4008 signal (sig, SIG_DFL);
4009 raise (sig);
4012 /* Set the ordering options for KEY specified in S.
4013 Return the address of the first character in S that
4014 is not a valid ordering option.
4015 BLANKTYPE is the kind of blanks that 'b' should skip. */
4017 static char *
4018 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4020 while (*s)
4022 switch (*s)
4024 case 'b':
4025 if (blanktype == bl_start || blanktype == bl_both)
4026 key->skipsblanks = true;
4027 if (blanktype == bl_end || blanktype == bl_both)
4028 key->skipeblanks = true;
4029 break;
4030 case 'd':
4031 key->ignore = nondictionary;
4032 break;
4033 case 'f':
4034 key->translate = fold_toupper;
4035 break;
4036 case 'g':
4037 key->general_numeric = true;
4038 break;
4039 case 'h':
4040 key->human_numeric = true;
4041 break;
4042 case 'i':
4043 /* Option order should not matter, so don't let -i override
4044 -d. -d implies -i, but -i does not imply -d. */
4045 if (! key->ignore)
4046 key->ignore = nonprinting;
4047 break;
4048 case 'M':
4049 key->month = true;
4050 break;
4051 case 'n':
4052 key->numeric = true;
4053 break;
4054 case 'R':
4055 key->random = true;
4056 break;
4057 case 'r':
4058 key->reverse = true;
4059 break;
4060 case 'V':
4061 key->version = true;
4062 break;
4063 default:
4064 return (char *) s;
4066 ++s;
4068 return (char *) s;
4071 /* Initialize KEY. */
4073 static struct keyfield *
4074 key_init (struct keyfield *key)
4076 memset (key, 0, sizeof *key);
4077 key->eword = SIZE_MAX;
4078 return key;
4082 main (int argc, char **argv)
4084 struct keyfield *key;
4085 struct keyfield key_buf;
4086 struct keyfield gkey;
4087 bool gkey_only = false;
4088 char const *s;
4089 int c = 0;
4090 char checkonly = 0;
4091 bool mergeonly = false;
4092 char *random_source = NULL;
4093 bool need_random = false;
4094 size_t nthreads = 0;
4095 size_t nfiles = 0;
4096 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4097 bool obsolete_usage = (posix2_version () < 200112);
4098 char **files;
4099 char *files_from = NULL;
4100 struct Tokens tok;
4101 char const *outfile = NULL;
4103 initialize_main (&argc, &argv);
4104 set_program_name (argv[0]);
4105 setlocale (LC_ALL, "");
4106 bindtextdomain (PACKAGE, LOCALEDIR);
4107 textdomain (PACKAGE);
4109 initialize_exit_failure (SORT_FAILURE);
4111 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4112 #if HAVE_NL_LANGINFO
4113 hard_LC_TIME = hard_locale (LC_TIME);
4114 #endif
4116 /* Get locale's representation of the decimal point. */
4118 struct lconv const *locale = localeconv ();
4120 /* If the locale doesn't define a decimal point, or if the decimal
4121 point is multibyte, use the C locale's decimal point. FIXME:
4122 add support for multibyte decimal points. */
4123 decimal_point = to_uchar (locale->decimal_point[0]);
4124 if (! decimal_point || locale->decimal_point[1])
4125 decimal_point = '.';
4127 /* FIXME: add support for multibyte thousands separators. */
4128 thousands_sep = to_uchar (*locale->thousands_sep);
4129 if (! thousands_sep || locale->thousands_sep[1])
4130 thousands_sep = -1;
4133 have_read_stdin = false;
4134 inittables ();
4137 size_t i;
4138 static int const sig[] =
4140 /* The usual suspects. */
4141 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4142 #ifdef SIGPOLL
4143 SIGPOLL,
4144 #endif
4145 #ifdef SIGPROF
4146 SIGPROF,
4147 #endif
4148 #ifdef SIGVTALRM
4149 SIGVTALRM,
4150 #endif
4151 #ifdef SIGXCPU
4152 SIGXCPU,
4153 #endif
4154 #ifdef SIGXFSZ
4155 SIGXFSZ,
4156 #endif
4158 enum { nsigs = ARRAY_CARDINALITY (sig) };
4160 #if SA_NOCLDSTOP
4161 struct sigaction act;
4163 sigemptyset (&caught_signals);
4164 for (i = 0; i < nsigs; i++)
4166 sigaction (sig[i], NULL, &act);
4167 if (act.sa_handler != SIG_IGN)
4168 sigaddset (&caught_signals, sig[i]);
4171 act.sa_handler = sighandler;
4172 act.sa_mask = caught_signals;
4173 act.sa_flags = 0;
4175 for (i = 0; i < nsigs; i++)
4176 if (sigismember (&caught_signals, sig[i]))
4177 sigaction (sig[i], &act, NULL);
4178 #else
4179 for (i = 0; i < nsigs; i++)
4180 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4182 signal (sig[i], sighandler);
4183 siginterrupt (sig[i], 1);
4185 #endif
4187 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4189 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4190 atexit (exit_cleanup);
4192 key_init (&gkey);
4193 gkey.sword = SIZE_MAX;
4195 files = xnmalloc (argc, sizeof *files);
4197 while (true)
4199 /* Parse an operand as a file after "--" was seen; or if
4200 pedantic and a file was seen, unless the POSIX version
4201 predates 1003.1-2001 and -c was not seen and the operand is
4202 "-o FILE" or "-oFILE". */
4203 int oi = -1;
4205 if (c == -1
4206 || (posixly_correct && nfiles != 0
4207 && ! (obsolete_usage
4208 && ! checkonly
4209 && optind != argc
4210 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4211 && (argv[optind][2] || optind + 1 != argc)))
4212 || ((c = getopt_long (argc, argv, short_options,
4213 long_options, &oi))
4214 == -1))
4216 if (argc <= optind)
4217 break;
4218 files[nfiles++] = argv[optind++];
4220 else switch (c)
4222 case 1:
4223 key = NULL;
4224 if (optarg[0] == '+')
4226 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4227 && ISDIGIT (argv[optind][1]));
4228 obsolete_usage |= minus_pos_usage && !posixly_correct;
4229 if (obsolete_usage)
4231 /* Treat +POS1 [-POS2] as a key if possible; but silently
4232 treat an operand as a file if it is not a valid +POS1. */
4233 key = key_init (&key_buf);
4234 s = parse_field_count (optarg + 1, &key->sword, NULL);
4235 if (s && *s == '.')
4236 s = parse_field_count (s + 1, &key->schar, NULL);
4237 if (! (key->sword || key->schar))
4238 key->sword = SIZE_MAX;
4239 if (! s || *set_ordering (s, key, bl_start))
4240 key = NULL;
4241 else
4243 if (minus_pos_usage)
4245 char const *optarg1 = argv[optind++];
4246 s = parse_field_count (optarg1 + 1, &key->eword,
4247 N_("invalid number after '-'"));
4248 if (*s == '.')
4249 s = parse_field_count (s + 1, &key->echar,
4250 N_("invalid number after '.'"));
4251 if (!key->echar && key->eword)
4253 /* obsolescent syntax +A.x -B.y is equivalent to:
4254 -k A+1.x+1,B.y (when y = 0)
4255 -k A+1.x+1,B+1.y (when y > 0)
4256 So eword is decremented as in the -k case
4257 only when the end field (B) is specified and
4258 echar (y) is 0. */
4259 key->eword--;
4261 if (*set_ordering (s, key, bl_end))
4262 badfieldspec (optarg1,
4263 N_("stray character in field spec"));
4265 key->obsolete_used = true;
4266 insertkey (key);
4270 if (! key)
4271 files[nfiles++] = optarg;
4272 break;
4274 case SORT_OPTION:
4275 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4276 /* Fall through. */
4277 case 'b':
4278 case 'd':
4279 case 'f':
4280 case 'g':
4281 case 'h':
4282 case 'i':
4283 case 'M':
4284 case 'n':
4285 case 'r':
4286 case 'R':
4287 case 'V':
4289 char str[2];
4290 str[0] = c;
4291 str[1] = '\0';
4292 set_ordering (str, &gkey, bl_both);
4294 break;
4296 case CHECK_OPTION:
4297 c = (optarg
4298 ? XARGMATCH ("--check", optarg, check_args, check_types)
4299 : 'c');
4300 /* Fall through. */
4301 case 'c':
4302 case 'C':
4303 if (checkonly && checkonly != c)
4304 incompatible_options ("cC");
4305 checkonly = c;
4306 break;
4308 case COMPRESS_PROGRAM_OPTION:
4309 if (compress_program && !STREQ (compress_program, optarg))
4310 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4311 compress_program = optarg;
4312 break;
4314 case DEBUG_PROGRAM_OPTION:
4315 debug = true;
4316 break;
4318 case FILES0_FROM_OPTION:
4319 files_from = optarg;
4320 break;
4322 case 'k':
4323 key = key_init (&key_buf);
4325 /* Get POS1. */
4326 s = parse_field_count (optarg, &key->sword,
4327 N_("invalid number at field start"));
4328 if (! key->sword--)
4330 /* Provoke with 'sort -k0' */
4331 badfieldspec (optarg, N_("field number is zero"));
4333 if (*s == '.')
4335 s = parse_field_count (s + 1, &key->schar,
4336 N_("invalid number after '.'"));
4337 if (! key->schar--)
4339 /* Provoke with 'sort -k1.0' */
4340 badfieldspec (optarg, N_("character offset is zero"));
4343 if (! (key->sword || key->schar))
4344 key->sword = SIZE_MAX;
4345 s = set_ordering (s, key, bl_start);
4346 if (*s != ',')
4348 key->eword = SIZE_MAX;
4349 key->echar = 0;
4351 else
4353 /* Get POS2. */
4354 s = parse_field_count (s + 1, &key->eword,
4355 N_("invalid number after ','"));
4356 if (! key->eword--)
4358 /* Provoke with 'sort -k1,0' */
4359 badfieldspec (optarg, N_("field number is zero"));
4361 if (*s == '.')
4363 s = parse_field_count (s + 1, &key->echar,
4364 N_("invalid number after '.'"));
4366 s = set_ordering (s, key, bl_end);
4368 if (*s)
4369 badfieldspec (optarg, N_("stray character in field spec"));
4370 insertkey (key);
4371 break;
4373 case 'm':
4374 mergeonly = true;
4375 break;
4377 case NMERGE_OPTION:
4378 specify_nmerge (oi, c, optarg);
4379 break;
4381 case 'o':
4382 if (outfile && !STREQ (outfile, optarg))
4383 error (SORT_FAILURE, 0, _("multiple output files specified"));
4384 outfile = optarg;
4385 break;
4387 case RANDOM_SOURCE_OPTION:
4388 if (random_source && !STREQ (random_source, optarg))
4389 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4390 random_source = optarg;
4391 break;
4393 case 's':
4394 stable = true;
4395 break;
4397 case 'S':
4398 specify_sort_size (oi, c, optarg);
4399 break;
4401 case 't':
4403 char newtab = optarg[0];
4404 if (! newtab)
4405 error (SORT_FAILURE, 0, _("empty tab"));
4406 if (optarg[1])
4408 if (STREQ (optarg, "\\0"))
4409 newtab = '\0';
4410 else
4412 /* Provoke with 'sort -txx'. Complain about
4413 "multi-character tab" instead of "multibyte tab", so
4414 that the diagnostic's wording does not need to be
4415 changed once multibyte characters are supported. */
4416 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4417 quote (optarg));
4420 if (tab != TAB_DEFAULT && tab != newtab)
4421 error (SORT_FAILURE, 0, _("incompatible tabs"));
4422 tab = newtab;
4424 break;
4426 case 'T':
4427 add_temp_dir (optarg);
4428 break;
4430 case PARALLEL_OPTION:
4431 nthreads = specify_nthreads (oi, c, optarg);
4432 break;
4434 case 'u':
4435 unique = true;
4436 break;
4438 case 'y':
4439 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4440 through Solaris 7. It is also accepted by many non-Solaris
4441 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4442 -y is marked as obsolete starting with Solaris 8 (1999), but is
4443 still accepted as of Solaris 10 prerelease (2004).
4445 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4446 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4447 and which in general ignores the argument after "-y" if it
4448 consists entirely of digits (it can even be empty). */
4449 if (optarg == argv[optind - 1])
4451 char const *p;
4452 for (p = optarg; ISDIGIT (*p); p++)
4453 continue;
4454 optind -= (*p != '\0');
4456 break;
4458 case 'z':
4459 eolchar = 0;
4460 break;
4462 case_GETOPT_HELP_CHAR;
4464 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4466 default:
4467 usage (SORT_FAILURE);
4471 if (files_from)
4473 FILE *stream;
4475 /* When using --files0-from=F, you may not specify any files
4476 on the command-line. */
4477 if (nfiles)
4479 error (0, 0, _("extra operand %s"), quote (files[0]));
4480 fprintf (stderr, "%s\n",
4481 _("file operands cannot be combined with --files0-from"));
4482 usage (SORT_FAILURE);
4485 if (STREQ (files_from, "-"))
4486 stream = stdin;
4487 else
4489 stream = fopen (files_from, "r");
4490 if (stream == NULL)
4491 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4492 quote (files_from));
4495 readtokens0_init (&tok);
4497 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4498 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4499 quote (files_from));
4501 if (tok.n_tok)
4503 size_t i;
4504 free (files);
4505 files = tok.tok;
4506 nfiles = tok.n_tok;
4507 for (i = 0; i < nfiles; i++)
4509 if (STREQ (files[i], "-"))
4510 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4511 "no file name of %s allowed"),
4512 quote (files[i]));
4513 else if (files[i][0] == '\0')
4515 /* Using the standard 'filename:line-number:' prefix here is
4516 not totally appropriate, since NUL is the separator,
4517 not NL, but it might be better than nothing. */
4518 unsigned long int file_number = i + 1;
4519 error (SORT_FAILURE, 0,
4520 _("%s:%lu: invalid zero-length file name"),
4521 quotearg_colon (files_from), file_number);
4525 else
4526 error (SORT_FAILURE, 0, _("no input from %s"),
4527 quote (files_from));
4530 /* Inheritance of global options to individual keys. */
4531 for (key = keylist; key; key = key->next)
4533 if (default_key_compare (key) && !key->reverse)
4535 key->ignore = gkey.ignore;
4536 key->translate = gkey.translate;
4537 key->skipsblanks = gkey.skipsblanks;
4538 key->skipeblanks = gkey.skipeblanks;
4539 key->month = gkey.month;
4540 key->numeric = gkey.numeric;
4541 key->general_numeric = gkey.general_numeric;
4542 key->human_numeric = gkey.human_numeric;
4543 key->version = gkey.version;
4544 key->random = gkey.random;
4545 key->reverse = gkey.reverse;
4548 need_random |= key->random;
4551 if (!keylist && !default_key_compare (&gkey))
4553 gkey_only = true;
4554 insertkey (&gkey);
4555 need_random |= gkey.random;
4558 check_ordering_compatibility ();
4560 if (debug)
4562 if (checkonly || outfile)
4564 static char opts[] = "X --debug";
4565 opts[0] = (checkonly ? checkonly : 'o');
4566 incompatible_options (opts);
4569 /* Always output the locale in debug mode, since this
4570 is such a common source of confusion. */
4571 if (hard_LC_COLLATE)
4572 error (0, 0, _("using %s sorting rules"),
4573 quote (setlocale (LC_COLLATE, NULL)));
4574 else
4575 error (0, 0, _("using simple byte comparison"));
4576 key_warnings (&gkey, gkey_only);
4579 reverse = gkey.reverse;
4581 if (need_random)
4582 random_md5_state_init (random_source);
4584 if (temp_dir_count == 0)
4586 char const *tmp_dir = getenv ("TMPDIR");
4587 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4590 if (nfiles == 0)
4592 static char *minus = (char *) "-";
4593 nfiles = 1;
4594 free (files);
4595 files = &minus;
4598 /* Need to re-check that we meet the minimum requirement for memory
4599 usage with the final value for NMERGE. */
4600 if (0 < sort_size)
4601 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4603 if (checkonly)
4605 if (nfiles > 1)
4606 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4607 quote (files[1]), checkonly);
4609 if (outfile)
4611 static char opts[] = {0, 'o', 0};
4612 opts[0] = checkonly;
4613 incompatible_options (opts);
4616 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4617 input is not properly sorted. */
4618 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4621 if (mergeonly)
4623 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4624 size_t i;
4626 for (i = 0; i < nfiles; ++i)
4627 sortfiles[i].name = files[i];
4629 merge (sortfiles, 0, nfiles, outfile);
4630 IF_LINT (free (sortfiles));
4632 else
4634 if (!nthreads)
4636 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4637 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4640 /* Avoid integer overflow later. */
4641 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4642 nthreads = MIN (nthreads, nthreads_max);
4644 sort (files, nfiles, outfile, nthreads);
4647 if (have_read_stdin && fclose (stdin) == EOF)
4648 die (_("close failed"), "-");
4650 exit (EXIT_SUCCESS);