build: support fully excluding stdbuf from the build
[coreutils/ericb.git] / src / sort.c
blobafa45a8b3801d0ba2d0693799cef65517eebb0fb
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/types.h>
28 #include <sys/wait.h>
29 #include <signal.h>
30 #include "system.h"
31 #include "argmatch.h"
32 #include "error.h"
33 #include "filevercmp.h"
34 #include "hard-locale.h"
35 #include "hash.h"
36 #include "heap.h"
37 #include "ignore-value.h"
38 #include "md5.h"
39 #include "mbswidth.h"
40 #include "nproc.h"
41 #include "physmem.h"
42 #include "posixver.h"
43 #include "quote.h"
44 #include "quotearg.h"
45 #include "randread.h"
46 #include "readtokens0.h"
47 #include "stdio--.h"
48 #include "stdlib--.h"
49 #include "strnumcmp.h"
50 #include "xmemcoll.h"
51 #include "xmemxfrm.h"
52 #include "xnanosleep.h"
53 #include "xstrtol.h"
55 #if HAVE_SYS_RESOURCE_H
56 # include <sys/resource.h>
57 #endif
58 #ifndef RLIMIT_DATA
59 struct rlimit { size_t rlim_cur; };
60 # define getrlimit(Resource, Rlp) (-1)
61 #endif
63 /* The official name of this program (e.g., no `g' prefix). */
64 #define PROGRAM_NAME "sort"
66 #define AUTHORS \
67 proper_name ("Mike Haertel"), \
68 proper_name ("Paul Eggert")
70 #if HAVE_LANGINFO_CODESET
71 # include <langinfo.h>
72 #endif
74 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
75 present. */
76 #ifndef SA_NOCLDSTOP
77 # define SA_NOCLDSTOP 0
78 /* No sigprocmask. Always 'return' zero. */
79 # define sigprocmask(How, Set, Oset) (0)
80 # define sigset_t int
81 # if ! HAVE_SIGINTERRUPT
82 # define siginterrupt(sig, flag) /* empty */
83 # endif
84 #endif
86 #if !defined OPEN_MAX && defined NR_OPEN
87 # define OPEN_MAX NR_OPEN
88 #endif
89 #if !defined OPEN_MAX
90 # define OPEN_MAX 20
91 #endif
93 #define UCHAR_LIM (UCHAR_MAX + 1)
95 #ifndef DEFAULT_TMPDIR
96 # define DEFAULT_TMPDIR "/tmp"
97 #endif
99 /* Maximum number of lines to merge every time a NODE is taken from
100 the MERGE_QUEUE. Node is at LEVEL in the binary merge tree,
101 and is responsible for merging TOTAL lines. */
102 #define MAX_MERGE(total, level) ((total) / ((2 << level) * (2 << level)) + 1)
104 /* Heuristic value for the number of lines for which it is worth
105 creating a subthread, during an internal merge sort, on a machine
106 that has processors galore. Currently this number is just a guess.
107 This value must be at least 4. We don't know of any machine where
108 this number has any practical effect. */
109 enum { SUBTHREAD_LINES_HEURISTIC = 4 };
111 /* Exit statuses. */
112 enum
114 /* POSIX says to exit with status 1 if invoked with -c and the
115 input is not properly sorted. */
116 SORT_OUT_OF_ORDER = 1,
118 /* POSIX says any other irregular exit must exit with a status
119 code greater than 1. */
120 SORT_FAILURE = 2
123 enum
125 /* The number of times we should try to fork a compression process
126 (we retry if the fork call fails). We don't _need_ to compress
127 temp files, this is just to reduce disk access, so this number
128 can be small. Each retry doubles in duration. */
129 MAX_FORK_TRIES_COMPRESS = 4,
131 /* The number of times we should try to fork a decompression process.
132 If we can't fork a decompression process, we can't sort, so this
133 number should be big. Each retry doubles in duration. */
134 MAX_FORK_TRIES_DECOMPRESS = 9
137 enum
139 /* Level of the end-of-merge node, one level above the root. */
140 MERGE_END = 0,
142 /* Level of the root node in merge tree. */
143 MERGE_ROOT = 1
146 /* The representation of the decimal point in the current locale. */
147 static int decimal_point;
149 /* Thousands separator; if -1, then there isn't one. */
150 static int thousands_sep;
152 /* Nonzero if the corresponding locales are hard. */
153 static bool hard_LC_COLLATE;
154 #if HAVE_NL_LANGINFO
155 static bool hard_LC_TIME;
156 #endif
158 #define NONZERO(x) ((x) != 0)
160 /* The kind of blanks for '-b' to skip in various options. */
161 enum blanktype { bl_start, bl_end, bl_both };
163 /* The character marking end of line. Default to \n. */
164 static char eolchar = '\n';
166 /* Lines are held in core as counted strings. */
167 struct line
169 char *text; /* Text of the line. */
170 size_t length; /* Length including final newline. */
171 char *keybeg; /* Start of first key. */
172 char *keylim; /* Limit of first key. */
175 /* Input buffers. */
176 struct buffer
178 char *buf; /* Dynamically allocated buffer,
179 partitioned into 3 regions:
180 - input data;
181 - unused area;
182 - an array of lines, in reverse order. */
183 size_t used; /* Number of bytes used for input data. */
184 size_t nlines; /* Number of lines in the line array. */
185 size_t alloc; /* Number of bytes allocated. */
186 size_t left; /* Number of bytes left from previous reads. */
187 size_t line_bytes; /* Number of bytes to reserve for each line. */
188 bool eof; /* An EOF has been read. */
191 struct keyfield
193 size_t sword; /* Zero-origin 'word' to start at. */
194 size_t schar; /* Additional characters to skip. */
195 size_t eword; /* Zero-origin last 'word' of key. */
196 size_t echar; /* Additional characters in field. */
197 bool const *ignore; /* Boolean array of characters to ignore. */
198 char const *translate; /* Translation applied to characters. */
199 bool skipsblanks; /* Skip leading blanks when finding start. */
200 bool skipeblanks; /* Skip leading blanks when finding end. */
201 bool numeric; /* Flag for numeric comparison. Handle
202 strings of digits with optional decimal
203 point, but no exponential notation. */
204 bool random; /* Sort by random hash of key. */
205 bool general_numeric; /* Flag for general, numeric comparison.
206 Handle numbers in exponential notation. */
207 bool human_numeric; /* Flag for sorting by human readable
208 units with either SI xor IEC prefixes. */
209 int iec_present; /* Flag for checking for mixed SI and IEC. */
210 bool month; /* Flag for comparison by month name. */
211 bool reverse; /* Reverse the sense of comparison. */
212 bool version; /* sort by version number */
213 bool obsolete_used; /* obsolescent key option format is used. */
214 struct keyfield *next; /* Next keyfield to try. */
217 struct month
219 char const *name;
220 int val;
223 /* Binary merge tree node. */
224 struct merge_node
226 struct line *lo; /* Lines to merge from LO child node. */
227 struct line *hi; /* Lines to merge from HI child ndoe. */
228 struct line *end_lo; /* End of available lines from LO. */
229 struct line *end_hi; /* End of available lines from HI. */
230 struct line **dest; /* Pointer to destination of merge. */
231 size_t nlo; /* Total Lines remaining from LO. */
232 size_t nhi; /* Total lines remaining from HI. */
233 size_t level; /* Level in merge tree. */
234 struct merge_node *parent; /* Parent node. */
235 bool queued; /* Node is already in heap. */
236 pthread_spinlock_t *lock; /* Lock for node operations. */
239 /* Priority queue of merge nodes. */
240 struct merge_node_queue
242 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
243 pthread_mutex_t mutex; /* Lock for queue operations. */
244 pthread_cond_t cond; /* Conditional wait for empty queue to populate
245 when popping. */
248 /* FIXME: None of these tables work with multibyte character sets.
249 Also, there are many other bugs when handling multibyte characters.
250 One way to fix this is to rewrite `sort' to use wide characters
251 internally, but doing this with good performance is a bit
252 tricky. */
254 /* Table of blanks. */
255 static bool blanks[UCHAR_LIM];
257 /* Table of non-printing characters. */
258 static bool nonprinting[UCHAR_LIM];
260 /* Table of non-dictionary characters (not letters, digits, or blanks). */
261 static bool nondictionary[UCHAR_LIM];
263 /* Translation table folding lower case to upper. */
264 static unsigned char fold_toupper[UCHAR_LIM];
266 #define MONTHS_PER_YEAR 12
268 /* Table mapping month names to integers.
269 Alphabetic order allows binary search. */
270 static struct month monthtab[] =
272 {"APR", 4},
273 {"AUG", 8},
274 {"DEC", 12},
275 {"FEB", 2},
276 {"JAN", 1},
277 {"JUL", 7},
278 {"JUN", 6},
279 {"MAR", 3},
280 {"MAY", 5},
281 {"NOV", 11},
282 {"OCT", 10},
283 {"SEP", 9}
286 /* During the merge phase, the number of files to merge at once. */
287 #define NMERGE_DEFAULT 16
289 /* Minimum size for a merge or check buffer. */
290 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
292 /* Minimum sort size; the code might not work with smaller sizes. */
293 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
295 /* The number of bytes needed for a merge or check buffer, which can
296 function relatively efficiently even if it holds only one line. If
297 a longer line is seen, this value is increased. */
298 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
300 /* The approximate maximum number of bytes of main memory to use, as
301 specified by the user. Zero if the user has not specified a size. */
302 static size_t sort_size;
304 /* The guessed size for non-regular files. */
305 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
307 /* Array of directory names in which any temporary files are to be created. */
308 static char const **temp_dirs;
310 /* Number of temporary directory names used. */
311 static size_t temp_dir_count;
313 /* Number of allocated slots in temp_dirs. */
314 static size_t temp_dir_alloc;
316 /* Flag to reverse the order of all comparisons. */
317 static bool reverse;
319 /* Flag for stable sort. This turns off the last ditch bytewise
320 comparison of lines, and instead leaves lines in the same order
321 they were read if all keys compare equal. */
322 static bool stable;
324 /* If TAB has this value, blanks separate fields. */
325 enum { TAB_DEFAULT = CHAR_MAX + 1 };
327 /* Tab character separating fields. If TAB_DEFAULT, then fields are
328 separated by the empty string between a non-blank character and a blank
329 character. */
330 static int tab = TAB_DEFAULT;
332 /* Flag to remove consecutive duplicate lines from the output.
333 Only the last of a sequence of equal lines will be output. */
334 static bool unique;
336 /* Nonzero if any of the input files are the standard input. */
337 static bool have_read_stdin;
339 /* List of key field comparisons to be tried. */
340 static struct keyfield *keylist;
342 /* Program used to (de)compress temp files. Must accept -d. */
343 static char const *compress_program;
345 /* Annotate the output with extra info to aid the user. */
346 static bool debug;
348 /* Maximum number of files to merge in one go. If more than this
349 number are present, temp files will be used. */
350 static unsigned int nmerge = NMERGE_DEFAULT;
352 /* Report MESSAGE for FILE, then clean up and exit.
353 If FILE is null, it represents standard output. */
355 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
356 static void
357 die (char const *message, char const *file)
359 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
360 exit (SORT_FAILURE);
363 void
364 usage (int status)
366 if (status != EXIT_SUCCESS)
367 fprintf (stderr, _("Try `%s --help' for more information.\n"),
368 program_name);
369 else
371 printf (_("\
372 Usage: %s [OPTION]... [FILE]...\n\
373 or: %s [OPTION]... --files0-from=F\n\
375 program_name, program_name);
376 fputs (_("\
377 Write sorted concatenation of all FILE(s) to standard output.\n\
379 "), stdout);
380 fputs (_("\
381 Mandatory arguments to long options are mandatory for short options too.\n\
382 "), stdout);
383 fputs (_("\
384 Ordering options:\n\
386 "), stdout);
387 fputs (_("\
388 -b, --ignore-leading-blanks ignore leading blanks\n\
389 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
390 -f, --ignore-case fold lower case to upper case characters\n\
391 "), stdout);
392 fputs (_("\
393 -g, --general-numeric-sort compare according to general numerical value\n\
394 -i, --ignore-nonprinting consider only printable characters\n\
395 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
396 "), stdout);
397 fputs (_("\
398 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
399 "), stdout);
400 fputs (_("\
401 -n, --numeric-sort compare according to string numerical value\n\
402 -R, --random-sort sort by random hash of keys\n\
403 --random-source=FILE get random bytes from FILE\n\
404 -r, --reverse reverse the result of comparisons\n\
405 "), stdout);
406 fputs (_("\
407 --sort=WORD sort according to WORD:\n\
408 general-numeric -g, human-numeric -h, month -M,\n\
409 numeric -n, random -R, version -V\n\
410 -V, --version-sort natural sort of (version) numbers within text\n\
412 "), stdout);
413 fputs (_("\
414 Other options:\n\
416 "), stdout);
417 fputs (_("\
418 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
419 for more use temp files\n\
420 "), stdout);
421 fputs (_("\
422 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
423 -C, --check=quiet, --check=silent like -c, but do not report first bad line\n\
424 --compress-program=PROG compress temporaries with PROG;\n\
425 decompress them with PROG -d\n\
426 "), stdout);
427 fputs (_("\
428 --debug annotate the part of the line used to sort,\n\
429 and warn about questionable usage to stderr\n\
430 --files0-from=F read input from the files specified by\n\
431 NUL-terminated names in file F;\n\
432 If F is - then read names from standard input\n\
433 "), stdout);
434 fputs (_("\
435 -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n\
436 (default end of line). See POS syntax below\n\
437 -m, --merge merge already sorted files; do not sort\n\
438 "), stdout);
439 fputs (_("\
440 -o, --output=FILE write result to FILE instead of standard output\n\
441 -s, --stable stabilize sort by disabling last-resort comparison\n\
442 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
443 "), stdout);
444 printf (_("\
445 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
446 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
447 multiple options specify multiple directories\n\
448 --parallel=N limit the number of sorts run concurrently to N\n\
449 -u, --unique with -c, check for strict ordering;\n\
450 without -c, output only the first of an equal run\n\
451 "), DEFAULT_TMPDIR);
452 fputs (_("\
453 -z, --zero-terminated end lines with 0 byte, not newline\n\
454 "), stdout);
455 fputs (HELP_OPTION_DESCRIPTION, stdout);
456 fputs (VERSION_OPTION_DESCRIPTION, stdout);
457 fputs (_("\
459 POS is F[.C][OPTS], where F is the field number and C the character position\n\
460 in the field; both are origin 1. If neither -t nor -b is in effect, characters\n\
461 in a field are counted from the beginning of the preceding whitespace. OPTS is\n\
462 one or more single-letter ordering options, which override global ordering\n\
463 options for that key. If no key is given, use the entire line as the key.\n\
465 SIZE may be followed by the following multiplicative suffixes:\n\
466 "), stdout);
467 fputs (_("\
468 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
470 With no FILE, or when FILE is -, read standard input.\n\
472 *** WARNING ***\n\
473 The locale specified by the environment affects sort order.\n\
474 Set LC_ALL=C to get the traditional sort order that uses\n\
475 native byte values.\n\
476 "), stdout );
477 emit_ancillary_info ();
480 exit (status);
483 /* For long options that have no equivalent short option, use a
484 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
485 enum
487 CHECK_OPTION = CHAR_MAX + 1,
488 COMPRESS_PROGRAM_OPTION,
489 DEBUG_PROGRAM_OPTION,
490 FILES0_FROM_OPTION,
491 NMERGE_OPTION,
492 RANDOM_SOURCE_OPTION,
493 SORT_OPTION,
494 PARALLEL_OPTION
497 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
499 static struct option const long_options[] =
501 {"ignore-leading-blanks", no_argument, NULL, 'b'},
502 {"check", optional_argument, NULL, CHECK_OPTION},
503 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
504 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
505 {"dictionary-order", no_argument, NULL, 'd'},
506 {"ignore-case", no_argument, NULL, 'f'},
507 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
508 {"general-numeric-sort", no_argument, NULL, 'g'},
509 {"ignore-nonprinting", no_argument, NULL, 'i'},
510 {"key", required_argument, NULL, 'k'},
511 {"merge", no_argument, NULL, 'm'},
512 {"month-sort", no_argument, NULL, 'M'},
513 {"numeric-sort", no_argument, NULL, 'n'},
514 {"human-numeric-sort", no_argument, NULL, 'h'},
515 {"version-sort", no_argument, NULL, 'V'},
516 {"random-sort", no_argument, NULL, 'R'},
517 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
518 {"sort", required_argument, NULL, SORT_OPTION},
519 {"output", required_argument, NULL, 'o'},
520 {"reverse", no_argument, NULL, 'r'},
521 {"stable", no_argument, NULL, 's'},
522 {"batch-size", required_argument, NULL, NMERGE_OPTION},
523 {"buffer-size", required_argument, NULL, 'S'},
524 {"field-separator", required_argument, NULL, 't'},
525 {"temporary-directory", required_argument, NULL, 'T'},
526 {"unique", no_argument, NULL, 'u'},
527 {"zero-terminated", no_argument, NULL, 'z'},
528 {"parallel", required_argument, NULL, PARALLEL_OPTION},
529 {GETOPT_HELP_OPTION_DECL},
530 {GETOPT_VERSION_OPTION_DECL},
531 {NULL, 0, NULL, 0},
534 #define CHECK_TABLE \
535 _ct_("quiet", 'C') \
536 _ct_("silent", 'C') \
537 _ct_("diagnose-first", 'c')
539 static char const *const check_args[] =
541 #define _ct_(_s, _c) _s,
542 CHECK_TABLE NULL
543 #undef _ct_
545 static char const check_types[] =
547 #define _ct_(_s, _c) _c,
548 CHECK_TABLE
549 #undef _ct_
552 #define SORT_TABLE \
553 _st_("general-numeric", 'g') \
554 _st_("human-numeric", 'h') \
555 _st_("month", 'M') \
556 _st_("numeric", 'n') \
557 _st_("random", 'R') \
558 _st_("version", 'V')
560 static char const *const sort_args[] =
562 #define _st_(_s, _c) _s,
563 SORT_TABLE NULL
564 #undef _st_
566 static char const sort_types[] =
568 #define _st_(_s, _c) _c,
569 SORT_TABLE
570 #undef _st_
573 /* The set of signals that are caught. */
574 static sigset_t caught_signals;
576 /* Critical section status. */
577 struct cs_status
579 bool valid;
580 sigset_t sigs;
583 /* Enter a critical section. */
584 static struct cs_status
585 cs_enter (void)
587 struct cs_status status;
588 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
589 return status;
592 /* Leave a critical section. */
593 static void
594 cs_leave (struct cs_status status)
596 if (status.valid)
598 /* Ignore failure when restoring the signal mask. */
599 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
603 /* The list of temporary files. */
604 struct tempnode
606 struct tempnode *volatile next;
607 pid_t pid; /* If compressed, the pid of compressor, else zero */
608 char name[1]; /* Actual size is 1 + file name length. */
610 static struct tempnode *volatile temphead;
611 static struct tempnode *volatile *temptail = &temphead;
613 struct sortfile
615 char const *name;
616 pid_t pid; /* If compressed, the pid of compressor, else zero */
619 /* A table where we store compression process states. We clean up all
620 processes in a timely manner so as not to exhaust system resources,
621 so we store the info on whether the process is still running, or has
622 been reaped here. */
623 static Hash_table *proctab;
625 enum { INIT_PROCTAB_SIZE = 47 };
627 enum procstate { ALIVE, ZOMBIE };
629 /* A proctab entry. The COUNT field is there in case we fork a new
630 compression process that has the same PID as an old zombie process
631 that is still in the table (because the process to decompress the
632 temp file it was associated with hasn't started yet). */
633 struct procnode
635 pid_t pid;
636 enum procstate state;
637 size_t count;
640 static size_t
641 proctab_hasher (const void *entry, size_t tabsize)
643 const struct procnode *node = entry;
644 return node->pid % tabsize;
647 static bool
648 proctab_comparator (const void *e1, const void *e2)
650 const struct procnode *n1 = e1, *n2 = e2;
651 return n1->pid == n2->pid;
654 /* The total number of forked processes (compressors and decompressors)
655 that have not been reaped yet. */
656 static size_t nprocs;
658 /* The number of child processes we'll allow before we try to reap some. */
659 enum { MAX_PROCS_BEFORE_REAP = 2 };
661 /* If 0 < PID, wait for the child process with that PID to exit.
662 If PID is -1, clean up a random child process which has finished and
663 return the process ID of that child. If PID is -1 and no processes
664 have quit yet, return 0 without waiting. */
666 static pid_t
667 reap (pid_t pid)
669 int status;
670 pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
672 if (cpid < 0)
673 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
674 compress_program);
675 else if (0 < cpid)
677 if (! WIFEXITED (status) || WEXITSTATUS (status))
678 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
679 compress_program);
680 --nprocs;
683 return cpid;
686 /* Add the PID of a running compression process to proctab, or update
687 the entry COUNT and STATE fields if it's already there. This also
688 creates the table for us the first time it's called. */
690 static void
691 register_proc (pid_t pid)
693 struct procnode test, *node;
695 if (! proctab)
697 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
698 proctab_hasher,
699 proctab_comparator,
700 free);
701 if (! proctab)
702 xalloc_die ();
705 test.pid = pid;
706 node = hash_lookup (proctab, &test);
707 if (node)
709 node->state = ALIVE;
710 ++node->count;
712 else
714 node = xmalloc (sizeof *node);
715 node->pid = pid;
716 node->state = ALIVE;
717 node->count = 1;
718 if (hash_insert (proctab, node) == NULL)
719 xalloc_die ();
723 /* This is called when we reap a random process. We don't know
724 whether we have reaped a compression process or a decompression
725 process until we look in the table. If there's an ALIVE entry for
726 it, then we have reaped a compression process, so change the state
727 to ZOMBIE. Otherwise, it's a decompression processes, so ignore it. */
729 static void
730 update_proc (pid_t pid)
732 struct procnode test, *node;
734 test.pid = pid;
735 node = hash_lookup (proctab, &test);
736 if (node)
737 node->state = ZOMBIE;
740 /* This is for when we need to wait for a compression process to exit.
741 If it has a ZOMBIE entry in the table then it's already dead and has
742 been reaped. Note that if there's an ALIVE entry for it, it still may
743 already have died and been reaped if a second process was created with
744 the same PID. This is probably exceedingly rare, but to be on the safe
745 side we will have to wait for any compression process with this PID. */
747 static void
748 wait_proc (pid_t pid)
750 struct procnode test, *node;
752 test.pid = pid;
753 node = hash_lookup (proctab, &test);
754 if (node->state == ALIVE)
755 reap (pid);
757 node->state = ZOMBIE;
758 if (! --node->count)
760 hash_delete (proctab, node);
761 free (node);
765 /* Keep reaping finished children as long as there are more to reap.
766 This doesn't block waiting for any of them, it only reaps those
767 that are already dead. */
769 static void
770 reap_some (void)
772 pid_t pid;
774 while (0 < nprocs && (pid = reap (-1)))
775 update_proc (pid);
778 /* Clean up any remaining temporary files. */
780 static void
781 cleanup (void)
783 struct tempnode const *node;
785 for (node = temphead; node; node = node->next)
786 unlink (node->name);
787 temphead = NULL;
790 /* Cleanup actions to take when exiting. */
792 static void
793 exit_cleanup (void)
795 if (temphead)
797 /* Clean up any remaining temporary files in a critical section so
798 that a signal handler does not try to clean them too. */
799 struct cs_status cs = cs_enter ();
800 cleanup ();
801 cs_leave (cs);
804 close_stdout ();
807 /* Create a new temporary file, returning its newly allocated tempnode.
808 Store into *PFD the file descriptor open for writing.
809 If the creation fails, return NULL and store -1 into *PFD if the
810 failure is due to file descriptor exhaustion and
811 SURVIVE_FD_EXHAUSTION; otherwise, die. */
813 static struct tempnode *
814 create_temp_file (int *pfd, bool survive_fd_exhaustion)
816 static char const slashbase[] = "/sortXXXXXX";
817 static size_t temp_dir_index;
818 int fd;
819 int saved_errno;
820 char const *temp_dir = temp_dirs[temp_dir_index];
821 size_t len = strlen (temp_dir);
822 struct tempnode *node =
823 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
824 char *file = node->name;
825 struct cs_status cs;
827 memcpy (file, temp_dir, len);
828 memcpy (file + len, slashbase, sizeof slashbase);
829 node->next = NULL;
830 node->pid = 0;
831 if (++temp_dir_index == temp_dir_count)
832 temp_dir_index = 0;
834 /* Create the temporary file in a critical section, to avoid races. */
835 cs = cs_enter ();
836 fd = mkstemp (file);
837 if (0 <= fd)
839 *temptail = node;
840 temptail = &node->next;
842 saved_errno = errno;
843 cs_leave (cs);
844 errno = saved_errno;
846 if (fd < 0)
848 if (! (survive_fd_exhaustion && errno == EMFILE))
849 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
850 quote (temp_dir));
851 free (node);
852 node = NULL;
855 *pfd = fd;
856 return node;
859 /* Predeclare an access pattern for input files.
860 Ignore any errors -- this is only advisory.
862 There are a few hints we could possibly provide,
863 and after careful testing it was decided that
864 specifying POSIX_FADV_SEQUENTIAL was not detrimental
865 to any cases. On Linux 2.6.31, this option doubles
866 the size of read ahead performed and thus was seen to
867 benefit these cases:
868 Merging
869 Sorting with a smaller internal buffer
870 Reading from faster flash devices
872 In _addition_ one could also specify other hints...
874 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
875 at least uses that to _synchronously_ prepopulate the cache
876 with the specified range. While sort does need to
877 read all of its input before outputting, a synchronous
878 read of the whole file up front precludes any processing
879 that sort could do in parallel with the system doing
880 read ahead of the data. This was seen to have negative effects
881 in a couple of cases:
882 Merging
883 Sorting with a smaller internal buffer
884 Note this option was seen to shorten the runtime for sort
885 on a multicore system with lots of RAM and other processes
886 competing for CPU. It could be argued that more explicit
887 scheduling hints with `nice` et. al. are more appropriate
888 for this situation.
890 POSIX_FADV_NOREUSE is a possibility as it could lower
891 the priority of input data in the cache as sort will
892 only need to process it once. However its functionality
893 has changed over Linux kernel versions and as of 2.6.31
894 it does nothing and thus we can't depend on what it might
895 do in future.
897 POSIX_FADV_DONTNEED is not appropriate for user specified
898 input files, but for temp files we do want to drop the
899 cache immediately after processing. This is done implicitly
900 however when the files are unlinked. */
902 static void
903 fadvise_input (FILE *fp)
905 #if HAVE_POSIX_FADVISE
906 if (fp)
908 int fd = fileno (fp);
909 ignore_value (posix_fadvise (fd, 0, 0, POSIX_FADV_SEQUENTIAL));
911 #endif
914 /* Return a stream for FILE, opened with mode HOW. A null FILE means
915 standard output; HOW should be "w". When opening for input, "-"
916 means standard input. To avoid confusion, do not return file
917 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
918 opening an ordinary FILE. Return NULL if unsuccessful. */
920 static FILE *
921 stream_open (const char *file, const char *how)
923 if (!file)
924 return stdout;
925 if (*how == 'r')
927 FILE *fp;
928 if (STREQ (file, "-"))
930 have_read_stdin = true;
931 fp = stdin;
933 else
934 fp = fopen (file, how);
935 fadvise_input (fp);
936 return fp;
938 return fopen (file, how);
941 /* Same as stream_open, except always return a non-null value; die on
942 failure. */
944 static FILE *
945 xfopen (const char *file, const char *how)
947 FILE *fp = stream_open (file, how);
948 if (!fp)
949 die (_("open failed"), file);
950 return fp;
953 /* Close FP, whose name is FILE, and report any errors. */
955 static void
956 xfclose (FILE *fp, char const *file)
958 switch (fileno (fp))
960 case STDIN_FILENO:
961 /* Allow reading stdin from tty more than once. */
962 if (feof (fp))
963 clearerr (fp);
964 break;
966 case STDOUT_FILENO:
967 /* Don't close stdout just yet. close_stdout does that. */
968 if (fflush (fp) != 0)
969 die (_("fflush failed"), file);
970 break;
972 default:
973 if (fclose (fp) != 0)
974 die (_("close failed"), file);
975 break;
979 static void
980 dup2_or_die (int oldfd, int newfd)
982 if (dup2 (oldfd, newfd) < 0)
983 error (SORT_FAILURE, errno, _("dup2 failed"));
986 /* Fork a child process for piping to and do common cleanup. The
987 TRIES parameter tells us how many times to try to fork before
988 giving up. Return the PID of the child, or -1 (setting errno)
989 on failure. */
991 static pid_t
992 pipe_fork (int pipefds[2], size_t tries)
994 #if HAVE_WORKING_FORK
995 struct tempnode *saved_temphead;
996 int saved_errno;
997 double wait_retry = 0.25;
998 pid_t pid IF_LINT ( = -1);
999 struct cs_status cs;
1001 if (pipe (pipefds) < 0)
1002 return -1;
1004 while (tries--)
1006 /* This is so the child process won't delete our temp files
1007 if it receives a signal before exec-ing. */
1008 cs = cs_enter ();
1009 saved_temphead = temphead;
1010 temphead = NULL;
1012 pid = fork ();
1013 saved_errno = errno;
1014 if (pid)
1015 temphead = saved_temphead;
1017 cs_leave (cs);
1018 errno = saved_errno;
1020 if (0 <= pid || errno != EAGAIN)
1021 break;
1022 else
1024 xnanosleep (wait_retry);
1025 wait_retry *= 2;
1026 reap_some ();
1030 if (pid < 0)
1032 saved_errno = errno;
1033 close (pipefds[0]);
1034 close (pipefds[1]);
1035 errno = saved_errno;
1037 else if (pid == 0)
1039 close (STDIN_FILENO);
1040 close (STDOUT_FILENO);
1042 else
1043 ++nprocs;
1045 return pid;
1047 #else /* ! HAVE_WORKING_FORK */
1048 return -1;
1049 #endif
1052 /* Create a temporary file and start a compression program to filter output
1053 to that file. Set *PFP to the file handle and if PPID is non-NULL,
1054 set *PPID to the PID of the newly-created process. If the creation
1055 fails, return NULL if the failure is due to file descriptor
1056 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1058 static char *
1059 maybe_create_temp (FILE **pfp, pid_t *ppid, bool survive_fd_exhaustion)
1061 int tempfd;
1062 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1063 char *name;
1065 if (! node)
1066 return NULL;
1068 name = node->name;
1070 if (compress_program)
1072 int pipefds[2];
1074 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1075 if (0 < node->pid)
1077 close (tempfd);
1078 close (pipefds[0]);
1079 tempfd = pipefds[1];
1081 register_proc (node->pid);
1083 else if (node->pid == 0)
1085 close (pipefds[1]);
1086 dup2_or_die (tempfd, STDOUT_FILENO);
1087 close (tempfd);
1088 dup2_or_die (pipefds[0], STDIN_FILENO);
1089 close (pipefds[0]);
1091 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1092 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1093 compress_program);
1095 else
1096 node->pid = 0;
1099 *pfp = fdopen (tempfd, "w");
1100 if (! *pfp)
1101 die (_("couldn't create temporary file"), name);
1103 if (ppid)
1104 *ppid = node->pid;
1106 return name;
1109 /* Create a temporary file and start a compression program to filter output
1110 to that file. Set *PFP to the file handle and if *PPID is non-NULL,
1111 set it to the PID of the newly-created process. Die on failure. */
1113 static char *
1114 create_temp (FILE **pfp, pid_t *ppid)
1116 return maybe_create_temp (pfp, ppid, false);
1119 /* Open a compressed temp file and start a decompression process through
1120 which to filter the input. PID must be the valid processes ID of the
1121 process used to compress the file. Return NULL (setting errno to
1122 EMFILE) if we ran out of file descriptors, and die on any other
1123 kind of failure. */
1125 static FILE *
1126 open_temp (const char *name, pid_t pid)
1128 int tempfd, pipefds[2];
1129 FILE *fp = NULL;
1131 wait_proc (pid);
1133 tempfd = open (name, O_RDONLY);
1134 if (tempfd < 0)
1135 return NULL;
1137 switch (pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS))
1139 case -1:
1140 if (errno != EMFILE)
1141 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1142 compress_program);
1143 close (tempfd);
1144 errno = EMFILE;
1145 break;
1147 case 0:
1148 close (pipefds[0]);
1149 dup2_or_die (tempfd, STDIN_FILENO);
1150 close (tempfd);
1151 dup2_or_die (pipefds[1], STDOUT_FILENO);
1152 close (pipefds[1]);
1154 execlp (compress_program, compress_program, "-d", (char *) NULL);
1155 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1156 compress_program);
1158 default:
1159 close (tempfd);
1160 close (pipefds[1]);
1162 fp = fdopen (pipefds[0], "r");
1163 if (! fp)
1165 int saved_errno = errno;
1166 close (pipefds[0]);
1167 errno = saved_errno;
1169 break;
1172 return fp;
1175 /* Append DIR to the array of temporary directory names. */
1176 static void
1177 add_temp_dir (char const *dir)
1179 if (temp_dir_count == temp_dir_alloc)
1180 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1182 temp_dirs[temp_dir_count++] = dir;
1185 /* Remove NAME from the list of temporary files. */
1187 static void
1188 zaptemp (const char *name)
1190 struct tempnode *volatile *pnode;
1191 struct tempnode *node;
1192 struct tempnode *next;
1193 int unlink_status;
1194 int unlink_errno = 0;
1195 struct cs_status cs;
1197 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1198 continue;
1200 /* Unlink the temporary file in a critical section to avoid races. */
1201 next = node->next;
1202 cs = cs_enter ();
1203 unlink_status = unlink (name);
1204 unlink_errno = errno;
1205 *pnode = next;
1206 cs_leave (cs);
1208 if (unlink_status != 0)
1209 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1210 if (! next)
1211 temptail = pnode;
1212 free (node);
1215 #if HAVE_NL_LANGINFO
1217 static int
1218 struct_month_cmp (const void *m1, const void *m2)
1220 struct month const *month1 = m1;
1221 struct month const *month2 = m2;
1222 return strcmp (month1->name, month2->name);
1225 #endif
1227 /* Initialize the character class tables. */
1229 static void
1230 inittables (void)
1232 size_t i;
1234 for (i = 0; i < UCHAR_LIM; ++i)
1236 blanks[i] = !! isblank (i);
1237 nonprinting[i] = ! isprint (i);
1238 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1239 fold_toupper[i] = toupper (i);
1242 #if HAVE_NL_LANGINFO
1243 /* If we're not in the "C" locale, read different names for months. */
1244 if (hard_LC_TIME)
1246 for (i = 0; i < MONTHS_PER_YEAR; i++)
1248 char const *s;
1249 size_t s_len;
1250 size_t j, k;
1251 char *name;
1253 s = (char *) nl_langinfo (ABMON_1 + i);
1254 s_len = strlen (s);
1255 monthtab[i].name = name = xmalloc (s_len + 1);
1256 monthtab[i].val = i + 1;
1258 for (j = k = 0; j < s_len; j++)
1259 if (! isblank (to_uchar (s[j])))
1260 name[k++] = fold_toupper[to_uchar (s[j])];
1261 name[k] = '\0';
1263 qsort ((void *) monthtab, MONTHS_PER_YEAR,
1264 sizeof *monthtab, struct_month_cmp);
1266 #endif
1269 /* Specify how many inputs may be merged at once.
1270 This may be set on the command-line with the
1271 --batch-size option. */
1272 static void
1273 specify_nmerge (int oi, char c, char const *s)
1275 uintmax_t n;
1276 struct rlimit rlimit;
1277 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1279 /* Try to find out how many file descriptors we'll be able
1280 to open. We need at least nmerge + 3 (STDIN_FILENO,
1281 STDOUT_FILENO and STDERR_FILENO). */
1282 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1283 ? rlimit.rlim_cur
1284 : OPEN_MAX)
1285 - 3);
1287 if (e == LONGINT_OK)
1289 nmerge = n;
1290 if (nmerge != n)
1291 e = LONGINT_OVERFLOW;
1292 else
1294 if (nmerge < 2)
1296 error (0, 0, _("invalid --%s argument %s"),
1297 long_options[oi].name, quote (s));
1298 error (SORT_FAILURE, 0,
1299 _("minimum --%s argument is %s"),
1300 long_options[oi].name, quote ("2"));
1302 else if (max_nmerge < nmerge)
1304 e = LONGINT_OVERFLOW;
1306 else
1307 return;
1311 if (e == LONGINT_OVERFLOW)
1313 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1314 error (0, 0, _("--%s argument %s too large"),
1315 long_options[oi].name, quote (s));
1316 error (SORT_FAILURE, 0,
1317 _("maximum --%s argument with current rlimit is %s"),
1318 long_options[oi].name,
1319 uinttostr (max_nmerge, max_nmerge_buf));
1321 else
1322 xstrtol_fatal (e, oi, c, long_options, s);
1325 /* Specify the amount of main memory to use when sorting. */
1326 static void
1327 specify_sort_size (int oi, char c, char const *s)
1329 uintmax_t n;
1330 char *suffix;
1331 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1333 /* The default unit is KiB. */
1334 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1336 if (n <= UINTMAX_MAX / 1024)
1337 n *= 1024;
1338 else
1339 e = LONGINT_OVERFLOW;
1342 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1343 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1344 switch (suffix[0])
1346 case 'b':
1347 e = LONGINT_OK;
1348 break;
1350 case '%':
1352 double mem = physmem_total () * n / 100;
1354 /* Use "<", not "<=", to avoid problems with rounding. */
1355 if (mem < UINTMAX_MAX)
1357 n = mem;
1358 e = LONGINT_OK;
1360 else
1361 e = LONGINT_OVERFLOW;
1363 break;
1366 if (e == LONGINT_OK)
1368 /* If multiple sort sizes are specified, take the maximum, so
1369 that option order does not matter. */
1370 if (n < sort_size)
1371 return;
1373 sort_size = n;
1374 if (sort_size == n)
1376 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1377 return;
1380 e = LONGINT_OVERFLOW;
1383 xstrtol_fatal (e, oi, c, long_options, s);
1386 /* Specify the number of threads to spawn during internal sort. */
1387 static unsigned long int
1388 specify_nthreads (int oi, char c, char const *s)
1390 unsigned long int nthreads;
1391 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1392 if (e == LONGINT_OVERFLOW)
1393 return ULONG_MAX;
1394 if (e != LONGINT_OK)
1395 xstrtol_fatal (e, oi, c, long_options, s);
1396 if (nthreads == 0)
1397 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1398 return nthreads;
1402 /* Return the default sort size. */
1403 static size_t
1404 default_sort_size (void)
1406 /* Let MEM be available memory or 1/8 of total memory, whichever
1407 is greater. */
1408 double avail = physmem_available ();
1409 double total = physmem_total ();
1410 double mem = MAX (avail, total / 8);
1411 struct rlimit rlimit;
1413 /* Let SIZE be MEM, but no more than the maximum object size or
1414 system resource limits. Avoid the MIN macro here, as it is not
1415 quite right when only one argument is floating point. Don't
1416 bother to check for values like RLIM_INFINITY since in practice
1417 they are not much less than SIZE_MAX. */
1418 size_t size = SIZE_MAX;
1419 if (mem < size)
1420 size = mem;
1421 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1422 size = rlimit.rlim_cur;
1423 #ifdef RLIMIT_AS
1424 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1425 size = rlimit.rlim_cur;
1426 #endif
1428 /* Leave a large safety margin for the above limits, as failure can
1429 occur when they are exceeded. */
1430 size /= 2;
1432 #ifdef RLIMIT_RSS
1433 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1434 Exceeding RSS is not fatal, but can be quite slow. */
1435 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1436 size = rlimit.rlim_cur / 16 * 15;
1437 #endif
1439 /* Use no less than the minimum. */
1440 return MAX (size, MIN_SORT_SIZE);
1443 /* Return the sort buffer size to use with the input files identified
1444 by FPS and FILES, which are alternate names of the same files.
1445 NFILES gives the number of input files; NFPS may be less. Assume
1446 that each input line requires LINE_BYTES extra bytes' worth of line
1447 information. Do not exceed the size bound specified by the user
1448 (or a default size bound, if the user does not specify one). */
1450 static size_t
1451 sort_buffer_size (FILE *const *fps, size_t nfps,
1452 char *const *files, size_t nfiles,
1453 size_t line_bytes)
1455 /* A bound on the input size. If zero, the bound hasn't been
1456 determined yet. */
1457 static size_t size_bound;
1459 /* In the worst case, each input byte is a newline. */
1460 size_t worst_case_per_input_byte = line_bytes + 1;
1462 /* Keep enough room for one extra input line and an extra byte.
1463 This extra room might be needed when preparing to read EOF. */
1464 size_t size = worst_case_per_input_byte + 1;
1466 size_t i;
1468 for (i = 0; i < nfiles; i++)
1470 struct stat st;
1471 off_t file_size;
1472 size_t worst_case;
1474 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1475 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1476 : stat (files[i], &st))
1477 != 0)
1478 die (_("stat failed"), files[i]);
1480 if (S_ISREG (st.st_mode))
1481 file_size = st.st_size;
1482 else
1484 /* The file has unknown size. If the user specified a sort
1485 buffer size, use that; otherwise, guess the size. */
1486 if (sort_size)
1487 return sort_size;
1488 file_size = INPUT_FILE_SIZE_GUESS;
1491 if (! size_bound)
1493 size_bound = sort_size;
1494 if (! size_bound)
1495 size_bound = default_sort_size ();
1498 /* Add the amount of memory needed to represent the worst case
1499 where the input consists entirely of newlines followed by a
1500 single non-newline. Check for overflow. */
1501 worst_case = file_size * worst_case_per_input_byte + 1;
1502 if (file_size != worst_case / worst_case_per_input_byte
1503 || size_bound - size <= worst_case)
1504 return size_bound;
1505 size += worst_case;
1508 return size;
1511 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1512 must be at least sizeof (struct line). Allocate ALLOC bytes
1513 initially. */
1515 static void
1516 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1518 /* Ensure that the line array is properly aligned. If the desired
1519 size cannot be allocated, repeatedly halve it until allocation
1520 succeeds. The smaller allocation may hurt overall performance,
1521 but that's better than failing. */
1522 while (true)
1524 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1525 buf->buf = malloc (alloc);
1526 if (buf->buf)
1527 break;
1528 alloc /= 2;
1529 if (alloc <= line_bytes + 1)
1530 xalloc_die ();
1533 buf->line_bytes = line_bytes;
1534 buf->alloc = alloc;
1535 buf->used = buf->left = buf->nlines = 0;
1536 buf->eof = false;
1539 /* Return one past the limit of the line array. */
1541 static inline struct line *
1542 buffer_linelim (struct buffer const *buf)
1544 return (struct line *) (buf->buf + buf->alloc);
1547 /* Return a pointer to the first character of the field specified
1548 by KEY in LINE. */
1550 static char *
1551 begfield (const struct line *line, const struct keyfield *key)
1553 char *ptr = line->text, *lim = ptr + line->length - 1;
1554 size_t sword = key->sword;
1555 size_t schar = key->schar;
1557 /* The leading field separator itself is included in a field when -t
1558 is absent. */
1560 if (tab != TAB_DEFAULT)
1561 while (ptr < lim && sword--)
1563 while (ptr < lim && *ptr != tab)
1564 ++ptr;
1565 if (ptr < lim)
1566 ++ptr;
1568 else
1569 while (ptr < lim && sword--)
1571 while (ptr < lim && blanks[to_uchar (*ptr)])
1572 ++ptr;
1573 while (ptr < lim && !blanks[to_uchar (*ptr)])
1574 ++ptr;
1577 /* If we're ignoring leading blanks when computing the Start
1578 of the field, skip past them here. */
1579 if (key->skipsblanks)
1580 while (ptr < lim && blanks[to_uchar (*ptr)])
1581 ++ptr;
1583 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1584 ptr = MIN (lim, ptr + schar);
1586 return ptr;
1589 /* Return the limit of (a pointer to the first character after) the field
1590 in LINE specified by KEY. */
1592 static char *
1593 limfield (const struct line *line, const struct keyfield *key)
1595 char *ptr = line->text, *lim = ptr + line->length - 1;
1596 size_t eword = key->eword, echar = key->echar;
1598 if (echar == 0)
1599 eword++; /* Skip all of end field. */
1601 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1602 whichever comes first. If there are more than EWORD fields, leave
1603 PTR pointing at the beginning of the field having zero-based index,
1604 EWORD. If a delimiter character was specified (via -t), then that
1605 `beginning' is the first character following the delimiting TAB.
1606 Otherwise, leave PTR pointing at the first `blank' character after
1607 the preceding field. */
1608 if (tab != TAB_DEFAULT)
1609 while (ptr < lim && eword--)
1611 while (ptr < lim && *ptr != tab)
1612 ++ptr;
1613 if (ptr < lim && (eword || echar))
1614 ++ptr;
1616 else
1617 while (ptr < lim && eword--)
1619 while (ptr < lim && blanks[to_uchar (*ptr)])
1620 ++ptr;
1621 while (ptr < lim && !blanks[to_uchar (*ptr)])
1622 ++ptr;
1625 #ifdef POSIX_UNSPECIFIED
1626 /* The following block of code makes GNU sort incompatible with
1627 standard Unix sort, so it's ifdef'd out for now.
1628 The POSIX spec isn't clear on how to interpret this.
1629 FIXME: request clarification.
1631 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1632 Date: Thu, 30 May 96 12:20:41 -0400
1633 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1635 [...]I believe I've found another bug in `sort'.
1637 $ cat /tmp/sort.in
1638 a b c 2 d
1639 pq rs 1 t
1640 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1641 a b c 2 d
1642 pq rs 1 t
1643 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1644 pq rs 1 t
1645 a b c 2 d
1647 Unix sort produced the answer I expected: sort on the single character
1648 in column 7. GNU sort produced different results, because it disagrees
1649 on the interpretation of the key-end spec "M.N". Unix sort reads this
1650 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1651 "skip M-1 fields, then either N-1 characters or the rest of the current
1652 field, whichever comes first". This extra clause applies only to
1653 key-ends, not key-starts.
1656 /* Make LIM point to the end of (one byte past) the current field. */
1657 if (tab != TAB_DEFAULT)
1659 char *newlim;
1660 newlim = memchr (ptr, tab, lim - ptr);
1661 if (newlim)
1662 lim = newlim;
1664 else
1666 char *newlim;
1667 newlim = ptr;
1668 while (newlim < lim && blanks[to_uchar (*newlim)])
1669 ++newlim;
1670 while (newlim < lim && !blanks[to_uchar (*newlim)])
1671 ++newlim;
1672 lim = newlim;
1674 #endif
1676 if (echar != 0) /* We need to skip over a portion of the end field. */
1678 /* If we're ignoring leading blanks when computing the End
1679 of the field, skip past them here. */
1680 if (key->skipeblanks)
1681 while (ptr < lim && blanks[to_uchar (*ptr)])
1682 ++ptr;
1684 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1685 ptr = MIN (lim, ptr + echar);
1688 return ptr;
1691 /* Fill BUF reading from FP, moving buf->left bytes from the end
1692 of buf->buf to the beginning first. If EOF is reached and the
1693 file wasn't terminated by a newline, supply one. Set up BUF's line
1694 table too. FILE is the name of the file corresponding to FP.
1695 Return true if some input was read. */
1697 static bool
1698 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1700 struct keyfield const *key = keylist;
1701 char eol = eolchar;
1702 size_t line_bytes = buf->line_bytes;
1703 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1705 if (buf->eof)
1706 return false;
1708 if (buf->used != buf->left)
1710 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1711 buf->used = buf->left;
1712 buf->nlines = 0;
1715 while (true)
1717 char *ptr = buf->buf + buf->used;
1718 struct line *linelim = buffer_linelim (buf);
1719 struct line *line = linelim - buf->nlines;
1720 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1721 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1723 while (line_bytes + 1 < avail)
1725 /* Read as many bytes as possible, but do not read so many
1726 bytes that there might not be enough room for the
1727 corresponding line array. The worst case is when the
1728 rest of the input file consists entirely of newlines,
1729 except that the last byte is not a newline. */
1730 size_t readsize = (avail - 1) / (line_bytes + 1);
1731 size_t bytes_read = fread (ptr, 1, readsize, fp);
1732 char *ptrlim = ptr + bytes_read;
1733 char *p;
1734 avail -= bytes_read;
1736 if (bytes_read != readsize)
1738 if (ferror (fp))
1739 die (_("read failed"), file);
1740 if (feof (fp))
1742 buf->eof = true;
1743 if (buf->buf == ptrlim)
1744 return false;
1745 if (ptrlim[-1] != eol)
1746 *ptrlim++ = eol;
1750 /* Find and record each line in the just-read input. */
1751 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1753 /* Delimit the line with NUL. This eliminates the need to
1754 temporarily replace the last byte with NUL when calling
1755 xmemcoll(), which increases performance. */
1756 *p = '\0';
1757 ptr = p + 1;
1758 line--;
1759 line->text = line_start;
1760 line->length = ptr - line_start;
1761 mergesize = MAX (mergesize, line->length);
1762 avail -= line_bytes;
1764 if (key)
1766 /* Precompute the position of the first key for
1767 efficiency. */
1768 line->keylim = (key->eword == SIZE_MAX
1770 : limfield (line, key));
1772 if (key->sword != SIZE_MAX)
1773 line->keybeg = begfield (line, key);
1774 else
1776 if (key->skipsblanks)
1777 while (blanks[to_uchar (*line_start)])
1778 line_start++;
1779 line->keybeg = line_start;
1783 line_start = ptr;
1786 ptr = ptrlim;
1787 if (buf->eof)
1788 break;
1791 buf->used = ptr - buf->buf;
1792 buf->nlines = buffer_linelim (buf) - line;
1793 if (buf->nlines != 0)
1795 buf->left = ptr - line_start;
1796 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1797 return true;
1801 /* The current input line is too long to fit in the buffer.
1802 Double the buffer size and try again, keeping it properly
1803 aligned. */
1804 size_t line_alloc = buf->alloc / sizeof (struct line);
1805 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1806 buf->alloc = line_alloc * sizeof (struct line);
1811 /* Exit with an error if a mixture of SI and IEC units detected. */
1813 static bool
1814 check_mixed_SI_IEC (char prefix, struct keyfield *key)
1816 int iec_present = prefix == 'i';
1817 if (key)
1819 if (key->iec_present != -1 && iec_present != key->iec_present)
1820 error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
1821 key->iec_present = iec_present;
1823 return iec_present;
1826 /* Return an integer which represents the order of magnitude of
1827 the unit following the number. NUMBER can contain thousands separators
1828 or a decimal point, but not have preceeding blanks.
1829 Negative numbers return a negative unit order. */
1831 static int
1832 find_unit_order (const char *number, struct keyfield *key, char const **endptr)
1834 static const char orders [UCHAR_LIM] =
1836 #if SOME_DAY_WE_WILL_REQUIRE_C99
1837 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1838 ['k']=1,
1839 #else
1840 /* Generate the following table with this command:
1841 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1842 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1843 |fmt */
1844 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1845 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1846 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1847 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1848 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1849 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1851 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1852 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1853 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1854 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1855 #endif
1858 const unsigned char *p = number;
1860 int sign = 1;
1862 if (*p == '-')
1864 sign = -1;
1865 p++;
1868 /* Scan to end of number.
1869 Decimals or separators not followed by digits stop the scan.
1870 Numbers ending in decimals or separators are thus considered
1871 to be lacking in units.
1872 FIXME: add support for multibyte thousands_sep and decimal_point. */
1874 while (ISDIGIT (*p))
1876 p++;
1878 if (*p == decimal_point && ISDIGIT (*(p + 1)))
1879 p += 2;
1880 else if (*p == thousands_sep && ISDIGIT (*(p + 1)))
1881 p += 2;
1884 int order = orders[*p];
1886 /* For valid units check for MiB vs MB etc. */
1887 if (order)
1889 p++;
1890 p += check_mixed_SI_IEC (*p, key);
1893 if (endptr)
1894 *endptr = p;
1896 return sign * order;
1899 /* Compare numbers ending in units with SI xor IEC prefixes
1900 <none/unknown> < K/k < M < G < T < P < E < Z < Y
1901 Assume that numbers are properly abbreviated.
1902 i.e. input will never have both 6000K and 5M. */
1904 static int
1905 human_numcompare (const char *a, const char *b, struct keyfield *key,
1906 char const **ea)
1908 while (blanks[to_uchar (*a)])
1909 a++;
1910 while (blanks[to_uchar (*b)])
1911 b++;
1913 int order_a = find_unit_order (a, key, ea);
1914 int order_b = find_unit_order (b, key, NULL);
1916 return (order_a > order_b ? 1
1917 : order_a < order_b ? -1
1918 : strnumcmp (a, b, decimal_point, thousands_sep));
1921 /* Compare strings A and B as numbers without explicitly converting them to
1922 machine numbers. Comparatively slow for short strings, but asymptotically
1923 hideously fast. */
1925 static int
1926 numcompare (const char *a, const char *b, char const **ea)
1928 while (blanks[to_uchar (*a)])
1929 a++;
1930 while (blanks[to_uchar (*b)])
1931 b++;
1933 if (debug)
1935 /* Approximate strnumcmp extents with find_unit_order. */
1936 if (find_unit_order (a, NULL, ea))
1938 *ea -= 1; /* ignore the order letter */
1939 *ea -= (**ea == 'i'); /* and IEC prefix */
1943 return strnumcmp (a, b, decimal_point, thousands_sep);
1946 static int
1947 general_numcompare (const char *sa, const char *sb, char const **ea)
1949 /* FIXME: maybe add option to try expensive FP conversion
1950 only if A and B can't be compared more cheaply/accurately. */
1952 #if HAVE_C99_STRTOLD /* provided by c-strtold module. */
1953 # define long_double long double
1954 #else
1955 # define long_double double
1956 # undef strtold
1957 # define strtold strtod
1958 #endif
1960 char *eb;
1961 long_double a = strtold (sa, (char **) ea);
1962 long_double b = strtold (sb, &eb);
1964 /* Put conversion errors at the start of the collating sequence. */
1965 if (sa == *ea)
1966 return sb == eb ? 0 : -1;
1967 if (sb == eb)
1968 return 1;
1970 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1971 conversion errors but before numbers; sort them by internal
1972 bit-pattern, for lack of a more portable alternative. */
1973 return (a < b ? -1
1974 : a > b ? 1
1975 : a == b ? 0
1976 : b == b ? -1
1977 : a == a ? 1
1978 : memcmp ((char *) &a, (char *) &b, sizeof a));
1981 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1982 Return 0 if the name in S is not recognized. */
1984 static int
1985 getmonth (char const *month, size_t len, char const **ea)
1987 size_t lo = 0;
1988 size_t hi = MONTHS_PER_YEAR;
1989 char const *monthlim = month + len;
1991 while (true)
1993 if (month == monthlim)
1994 return 0;
1995 if (!blanks[to_uchar (*month)])
1996 break;
1997 ++month;
2002 size_t ix = (lo + hi) / 2;
2003 char const *m = month;
2004 char const *n = monthtab[ix].name;
2006 for (;; m++, n++)
2008 if (!*n)
2010 if (ea)
2011 *ea = m;
2012 return monthtab[ix].val;
2014 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
2016 hi = ix;
2017 break;
2019 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
2021 lo = ix + 1;
2022 break;
2026 while (lo < hi);
2028 return 0;
2031 /* A randomly chosen MD5 state, used for random comparison. */
2032 static struct md5_ctx random_md5_state;
2034 /* Initialize the randomly chosen MD5 state. */
2036 static void
2037 random_md5_state_init (char const *random_source)
2039 unsigned char buf[MD5_DIGEST_SIZE];
2040 struct randread_source *r = randread_new (random_source, sizeof buf);
2041 if (! r)
2042 die (_("open failed"), random_source);
2043 randread (r, buf, sizeof buf);
2044 if (randread_free (r) != 0)
2045 die (_("close failed"), random_source);
2046 md5_init_ctx (&random_md5_state);
2047 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2050 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
2051 with length LENGTHB. Return negative if less, zero if equal,
2052 positive if greater. */
2054 static int
2055 cmp_hashes (char const *texta, size_t lena,
2056 char const *textb, size_t lenb)
2058 /* Try random hashes until a pair of hashes disagree. But if the
2059 first pair of random hashes agree, check whether the keys are
2060 identical and if so report no difference. */
2061 int diff;
2062 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2063 struct md5_ctx s[2];
2064 s[0] = s[1] = random_md5_state;
2065 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2066 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2067 diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2068 if (! diff)
2070 /* Break ties with memcmp. This is good enough given the
2071 astronomically low probability of an MD5 collision. */
2072 diff = memcmp (texta, textb, MIN (lena, lenb));
2073 if (! diff)
2074 diff = lena < lenb ? -1 : lena != lenb;
2077 return diff;
2080 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2081 using one or more random hash functions. */
2083 static int
2084 compare_random (char *restrict texta, size_t lena,
2085 char *restrict textb, size_t lenb)
2087 int diff;
2089 if (! hard_LC_COLLATE)
2090 diff = cmp_hashes (texta, lena, textb, lenb);
2091 else
2093 /* Transform the text into the basis of comparison, so that byte
2094 strings that would otherwise considered to be equal are
2095 considered equal here even if their bytes differ. */
2097 char *buf = NULL;
2098 char stackbuf[4000];
2099 size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
2100 bool a_fits = tlena <= sizeof stackbuf;
2101 size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
2102 (a_fits ? sizeof stackbuf - tlena : 0),
2103 textb, lenb);
2105 if (a_fits && tlena + tlenb <= sizeof stackbuf)
2106 buf = stackbuf;
2107 else
2109 /* Adding 1 to the buffer size lets xmemxfrm run a bit
2110 faster by avoiding the need for an extra buffer copy. */
2111 buf = xmalloc (tlena + tlenb + 1);
2112 xmemxfrm (buf, tlena + 1, texta, lena);
2113 xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
2116 diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
2118 if (buf != stackbuf)
2119 free (buf);
2122 return diff;
2125 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2126 using filevercmp. See lib/filevercmp.h for function description. */
2128 static int
2129 compare_version (char *restrict texta, size_t lena,
2130 char *restrict textb, size_t lenb)
2132 int diff;
2134 /* It is necessary to save the character after the end of the field.
2135 "filevercmp" works with NUL terminated strings. Our blocks of
2136 text are not necessarily terminated with a NUL byte. */
2137 char sv_a = texta[lena];
2138 char sv_b = textb[lenb];
2140 texta[lena] = '\0';
2141 textb[lenb] = '\0';
2143 diff = filevercmp (texta, textb);
2145 texta[lena] = sv_a;
2146 textb[lenb] = sv_b;
2148 return diff;
2151 /* For debug mode, count tabs in the passed string
2152 so we can adjust the widths returned by mbswidth.
2153 FIXME: Should we generally be counting non printable chars? */
2155 static size_t
2156 count_tabs (char const *text, const size_t len)
2158 size_t tabs = 0;
2159 size_t tlen = strnlen (text, len);
2161 while (tlen--)
2163 if (*text++ == '\t')
2164 tabs++;
2167 return tabs;
2170 /* For debug mode, "underline" a key at the
2171 specified offset and screen width. */
2173 static void
2174 mark_key (size_t offset, size_t width)
2176 printf ("%*s", (int) offset, "");
2178 if (!width)
2179 printf (_("^ no match for key\n"));
2180 else
2182 while (width--)
2183 putchar ('_');
2184 putchar ('\n');
2188 /* For debug mode, determine the screen offset and width
2189 to highlight for a key, and then output the highlight. */
2191 static void
2192 debug_key (char const *sline, char const *sfield, char const *efield,
2193 size_t flen, bool skipb)
2195 char const *sa = sfield;
2197 if (skipb) /* This key type implicitly skips leading blanks. */
2199 while (sa < efield && blanks[to_uchar (*sa)])
2201 sa++;
2202 if (flen)
2203 flen--; /* This assumes TABs same width as SPACEs. */
2207 size_t offset = mbsnwidth (sline, sfield - sline, 0) + (sa - sfield);
2208 offset += count_tabs (sline, sfield - sline);
2210 size_t width = mbsnwidth (sa, flen, 0);
2211 width += count_tabs (sa, flen);
2213 mark_key (offset, width);
2216 /* Testing if a key is numeric is done in various places. */
2218 static inline bool
2219 key_numeric (struct keyfield const *key)
2221 return key->numeric || key->general_numeric || key->human_numeric;
2224 /* Return whether sorting options specified for key. */
2226 static bool
2227 default_key_compare (struct keyfield const *key)
2229 return ! (key->ignore
2230 || key->translate
2231 || key->skipsblanks
2232 || key->skipeblanks
2233 || key_numeric (key)
2234 || key->month
2235 || key->version
2236 || key->random
2237 /* || key->reverse */
2241 /* Convert a key to the short options used to specify it. */
2243 static void
2244 key_to_opts (struct keyfield const *key, char *opts)
2246 if (key->skipsblanks || key->skipeblanks)
2247 *opts++ = 'b';/* either disables global -b */
2248 if (key->ignore == nondictionary)
2249 *opts++ = 'd';
2250 if (key->translate)
2251 *opts++ = 'f';
2252 if (key->general_numeric)
2253 *opts++ = 'g';
2254 if (key->human_numeric)
2255 *opts++ = 'h';
2256 if (key->ignore == nonprinting)
2257 *opts++ = 'i';
2258 if (key->month)
2259 *opts++ = 'M';
2260 if (key->numeric)
2261 *opts++ = 'n';
2262 if (key->random)
2263 *opts++ = 'R';
2264 if (key->reverse)
2265 *opts++ = 'r';
2266 if (key->version)
2267 *opts++ = 'V';
2268 *opts = '\0';
2271 /* Output data independent key warnings to stderr. */
2273 static void
2274 key_warnings (struct keyfield const *gkey, bool gkey_only)
2276 struct keyfield const *key;
2277 struct keyfield ugkey = *gkey;
2278 unsigned long keynum = 1;
2280 for (key = keylist; key; key = key->next, keynum++)
2282 if (key->obsolete_used)
2284 size_t sword = key->sword;
2285 size_t eword = key->eword;
2286 char tmp[INT_BUFSIZE_BOUND (sword)];
2287 /* obsolescent syntax +A.x -B.y is equivalent to:
2288 -k A+1.x+1,B.y (when y = 0)
2289 -k A+1.x+1,B+1.y (when y > 0) */
2290 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2291 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2292 char *po = obuf;
2293 char *pn = nbuf;
2295 if (sword == SIZE_MAX)
2296 sword++;
2298 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2299 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2300 if (key->eword != SIZE_MAX)
2302 po = stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2303 pn = stpcpy (stpcpy (pn, ","),
2304 umaxtostr (eword + 1
2305 + (key->echar == SIZE_MAX), tmp));
2307 error (0, 0, _("obsolescent key `%s' used; consider `%s' instead"),
2308 obuf, nbuf);
2311 /* Warn about field specs that will never match. */
2312 if (key->sword != SIZE_MAX && key->eword < key->sword)
2313 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2315 /* Warn about significant leading blanks. */
2316 bool implicit_skip = key_numeric (key) || key->month;
2317 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2318 && !(key->schar || key->echar);
2319 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2320 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2321 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2322 || (!key->skipsblanks && key->schar)
2323 || (!key->skipeblanks && key->echar)))
2324 error (0, 0, _("leading blanks are significant in key %lu; "
2325 "consider also specifying `b'"), keynum);
2327 /* Warn about numeric comparisons spanning fields,
2328 as field delimiters could be interpreted as part
2329 of the number (maybe only in other locales). */
2330 if (!gkey_only && key_numeric (key))
2332 size_t sword = key->sword + 1;
2333 size_t eword = key->eword + 1;
2334 if (!sword)
2335 sword++;
2336 if (sword != eword)
2337 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2338 keynum);
2341 /* Flag global options not copied or specified in any key. */
2342 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2343 ugkey.ignore = NULL;
2344 if (ugkey.translate && (ugkey.translate == key->translate))
2345 ugkey.translate = NULL;
2346 ugkey.skipsblanks &= !key->skipsblanks;
2347 ugkey.skipeblanks &= !key->skipeblanks;
2348 ugkey.month &= !key->month;
2349 ugkey.numeric &= !key->numeric;
2350 ugkey.general_numeric &= !key->general_numeric;
2351 ugkey.human_numeric &= !key->human_numeric;
2352 ugkey.random &= !key->random;
2353 ugkey.version &= !key->version;
2354 ugkey.reverse &= !key->reverse;
2357 /* Warn about ignored global options flagged above.
2358 Note if gkey is the only one in the list, all flags are cleared. */
2359 if (!default_key_compare (&ugkey)
2360 || (ugkey.reverse && (stable || unique) && keylist))
2362 bool ugkey_reverse = ugkey.reverse;
2363 if (!(stable || unique))
2364 ugkey.reverse = false;
2365 /* The following is too big, but guaranteed to be "big enough". */
2366 char opts[sizeof short_options];
2367 key_to_opts (&ugkey, opts);
2368 error (0, 0,
2369 ngettext ("option `-%s' is ignored",
2370 "options `-%s' are ignored",
2371 select_plural (strlen (opts))), opts);
2372 ugkey.reverse = ugkey_reverse;
2374 if (ugkey.reverse && !(stable || unique) && keylist)
2375 error (0, 0, _("option `-r' only applies to last-resort comparison"));
2378 /* Compare two lines A and B trying every key in sequence until there
2379 are no more keys or a difference is found. */
2381 static int
2382 keycompare (const struct line *a, const struct line *b, bool show_debug)
2384 struct keyfield *key = keylist;
2386 /* For the first iteration only, the key positions have been
2387 precomputed for us. */
2388 char *texta = a->keybeg;
2389 char *textb = b->keybeg;
2390 char *lima = a->keylim;
2391 char *limb = b->keylim;
2393 int diff;
2395 while (true)
2397 char const *translate = key->translate;
2398 bool const *ignore = key->ignore;
2399 bool skipb = false; /* Whether key type auto skips leading blanks. */
2401 /* Treat field ends before field starts as empty fields. */
2402 lima = MAX (texta, lima);
2403 limb = MAX (textb, limb);
2405 /* Find the lengths. */
2406 size_t lena = lima - texta;
2407 size_t lenb = limb - textb;
2409 /* Actually compare the fields. */
2411 if (key->random)
2412 diff = compare_random (texta, lena, textb, lenb);
2413 else if (key_numeric (key))
2415 char savea = *lima, saveb = *limb;
2416 char const* ea = lima;
2418 *lima = *limb = '\0';
2419 diff = (key->numeric ? numcompare (texta, textb, &ea)
2420 : key->general_numeric ? general_numcompare (texta, textb,
2421 &ea)
2422 : human_numcompare (texta, textb, key, &ea));
2423 if (show_debug)
2425 lena = ea - texta;
2426 skipb = true;
2428 *lima = savea, *limb = saveb;
2430 else if (key->version)
2431 diff = compare_version (texta, lena, textb, lenb);
2432 else if (key->month)
2434 char const *ea = lima;
2436 int amon = getmonth (texta, lena, &ea);
2437 diff = amon - getmonth (textb, lenb, NULL);
2439 if (show_debug)
2441 lena = amon ? ea - texta : 0;
2442 skipb = true;
2445 /* Sorting like this may become slow, so in a simple locale the user
2446 can select a faster sort that is similar to ascii sort. */
2447 else if (hard_LC_COLLATE)
2449 /* FIXME: for debug, account for skipped chars, while handling mb chars.
2450 Generally perhaps xmemfrm could be used to determine chars that are
2451 excluded from the collating order? */
2452 if (ignore || translate)
2454 char buf[4000];
2455 size_t size = lena + 1 + lenb + 1;
2456 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
2457 char *copy_b = copy_a + lena + 1;
2458 size_t new_len_a, new_len_b, i;
2460 /* Ignore and/or translate chars before comparing. */
2461 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
2463 if (i < lena)
2465 copy_a[new_len_a] = (translate
2466 ? translate[to_uchar (texta[i])]
2467 : texta[i]);
2468 if (!ignore || !ignore[to_uchar (texta[i])])
2469 ++new_len_a;
2471 if (i < lenb)
2473 copy_b[new_len_b] = (translate
2474 ? translate[to_uchar (textb[i])]
2475 : textb [i]);
2476 if (!ignore || !ignore[to_uchar (textb[i])])
2477 ++new_len_b;
2481 copy_a[new_len_a++] = '\0';
2482 copy_b[new_len_b++] = '\0';
2483 diff = xmemcoll0 (copy_a, new_len_a, copy_b, new_len_b);
2485 if (sizeof buf < size)
2486 free (copy_a);
2488 else if (lena == 0)
2489 diff = - NONZERO (lenb);
2490 else if (lenb == 0)
2491 goto greater;
2492 else
2493 diff = xmemcoll (texta, lena, textb, lenb);
2495 else if (ignore)
2497 char *savea = texta;
2499 #define CMP_WITH_IGNORE(A, B) \
2500 do \
2502 while (true) \
2504 while (texta < lima && ignore[to_uchar (*texta)]) \
2505 ++texta; \
2506 while (textb < limb && ignore[to_uchar (*textb)]) \
2507 ++textb; \
2508 if (! (texta < lima && textb < limb)) \
2509 break; \
2510 diff = to_uchar (A) - to_uchar (B); \
2511 if (diff) \
2512 goto not_equal; \
2513 ++texta; \
2514 ++textb; \
2517 diff = (texta < lima) - (textb < limb); \
2519 while (0)
2521 if (translate)
2522 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2523 translate[to_uchar (*textb)]);
2524 else
2525 CMP_WITH_IGNORE (*texta, *textb);
2527 /* We only need to restore this for debug_key
2528 in which case the keys being compared are equal. */
2529 texta = savea;
2531 else if (lena == 0)
2532 diff = - NONZERO (lenb);
2533 else if (lenb == 0)
2534 goto greater;
2535 else
2537 if (translate)
2539 char *savea = texta;
2541 while (texta < lima && textb < limb)
2543 diff = (to_uchar (translate[to_uchar (*texta++)])
2544 - to_uchar (translate[to_uchar (*textb++)]));
2545 if (diff)
2546 goto not_equal;
2549 /* We only need to restore this for debug_key
2550 in which case the keys being compared are equal. */
2551 texta = savea;
2553 else
2555 diff = memcmp (texta, textb, MIN (lena, lenb));
2556 if (diff)
2557 goto not_equal;
2559 diff = lena < lenb ? -1 : lena != lenb;
2562 if (diff)
2563 goto not_equal;
2565 if (show_debug)
2566 debug_key (a->text, texta, lima, lena, skipb);
2568 key = key->next;
2569 if (! key)
2570 break;
2572 /* Find the beginning and limit of the next field. */
2573 if (key->eword != SIZE_MAX)
2574 lima = limfield (a, key), limb = limfield (b, key);
2575 else
2576 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2578 if (key->sword != SIZE_MAX)
2579 texta = begfield (a, key), textb = begfield (b, key);
2580 else
2582 texta = a->text, textb = b->text;
2583 if (key->skipsblanks)
2585 while (texta < lima && blanks[to_uchar (*texta)])
2586 ++texta;
2587 while (textb < limb && blanks[to_uchar (*textb)])
2588 ++textb;
2593 return 0;
2595 greater:
2596 diff = 1;
2597 not_equal:
2598 return key->reverse ? -diff : diff;
2601 /* Compare two lines A and B, returning negative, zero, or positive
2602 depending on whether A compares less than, equal to, or greater than B. */
2604 static int
2605 compare (const struct line *a, const struct line *b, bool show_debug)
2607 int diff;
2608 size_t alen, blen;
2610 /* First try to compare on the specified keys (if any).
2611 The only two cases with no key at all are unadorned sort,
2612 and unadorned sort -r. */
2613 if (keylist)
2615 diff = keycompare (a, b, show_debug);
2616 if (diff || unique || stable)
2617 return diff;
2620 /* If the keys all compare equal (or no keys were specified)
2621 fall through to the default comparison. */
2622 alen = a->length - 1, blen = b->length - 1;
2624 if (show_debug)
2625 debug_key (a->text, a->text, a->text + alen, alen, false);
2627 if (alen == 0)
2628 diff = - NONZERO (blen);
2629 else if (blen == 0)
2630 diff = 1;
2631 else if (hard_LC_COLLATE)
2633 /* Note xmemcoll0 is a performance enhancement as
2634 it will not unconditionally write '\0' after the
2635 passed in buffers, which was seen to give around
2636 a 3% increase in performance for short lines. */
2637 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2639 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2640 diff = alen < blen ? -1 : alen != blen;
2642 return reverse ? -diff : diff;
2645 static void
2646 write_bytes (struct line const *line, FILE *fp, char const *output_file)
2648 char *buf = line->text;
2649 size_t n_bytes = line->length;
2650 char *ebuf = buf + n_bytes;
2652 /* Convert TABs to '>' and \0 to \n when -z specified. */
2653 if (debug && fp == stdout)
2655 char const *c = buf;
2657 while (c < ebuf)
2659 char wc = *c++;
2660 if (wc == '\t')
2661 wc = '>';
2662 else if (c == ebuf)
2663 wc = '\n';
2664 if (fputc (wc, fp) == EOF)
2665 die (_("write failed"), output_file);
2668 compare (line, line, true);
2670 else
2672 ebuf[-1] = eolchar;
2673 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2674 die (_("write failed"), output_file);
2675 ebuf[-1] = '\0';
2679 /* Check that the lines read from FILE_NAME come in order. Return
2680 true if they are in order. If CHECKONLY == 'c', also print a
2681 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2682 they are not in order. */
2684 static bool
2685 check (char const *file_name, char checkonly)
2687 FILE *fp = xfopen (file_name, "r");
2688 struct buffer buf; /* Input buffer. */
2689 struct line temp; /* Copy of previous line. */
2690 size_t alloc = 0;
2691 uintmax_t line_number = 0;
2692 struct keyfield const *key = keylist;
2693 bool nonunique = ! unique;
2694 bool ordered = true;
2696 initbuf (&buf, sizeof (struct line),
2697 MAX (merge_buffer_size, sort_size));
2698 temp.text = NULL;
2700 while (fillbuf (&buf, fp, file_name))
2702 struct line const *line = buffer_linelim (&buf);
2703 struct line const *linebase = line - buf.nlines;
2705 /* Make sure the line saved from the old buffer contents is
2706 less than or equal to the first line of the new buffer. */
2707 if (alloc && nonunique <= compare (&temp, line - 1, false))
2709 found_disorder:
2711 if (checkonly == 'c')
2713 struct line const *disorder_line = line - 1;
2714 uintmax_t disorder_line_number =
2715 buffer_linelim (&buf) - disorder_line + line_number;
2716 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2717 fprintf (stderr, _("%s: %s:%s: disorder: "),
2718 program_name, file_name,
2719 umaxtostr (disorder_line_number, hr_buf));
2720 if (debug)
2721 fputc ('\n', stderr);
2722 write_bytes (disorder_line, debug ? stdout : stderr,
2723 debug ? _("standard out") : _("standard error"));
2726 ordered = false;
2727 break;
2731 /* Compare each line in the buffer with its successor. */
2732 while (linebase < --line)
2733 if (nonunique <= compare (line, line - 1, false))
2734 goto found_disorder;
2736 line_number += buf.nlines;
2738 /* Save the last line of the buffer. */
2739 if (alloc < line->length)
2743 alloc *= 2;
2744 if (! alloc)
2746 alloc = line->length;
2747 break;
2750 while (alloc < line->length);
2752 temp.text = xrealloc (temp.text, alloc);
2754 memcpy (temp.text, line->text, line->length);
2755 temp.length = line->length;
2756 if (key)
2758 temp.keybeg = temp.text + (line->keybeg - line->text);
2759 temp.keylim = temp.text + (line->keylim - line->text);
2763 xfclose (fp, file_name);
2764 free (buf.buf);
2765 free (temp.text);
2766 return ordered;
2769 /* Open FILES (there are NFILES of them) and store the resulting array
2770 of stream pointers into (*PFPS). Allocate the array. Return the
2771 number of successfully opened files, setting errno if this value is
2772 less than NFILES. */
2774 static size_t
2775 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2777 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2778 int i;
2780 /* Open as many input files as we can. */
2781 for (i = 0; i < nfiles; i++)
2783 fps[i] = (files[i].pid
2784 ? open_temp (files[i].name, files[i].pid)
2785 : stream_open (files[i].name, "r"));
2786 if (!fps[i])
2787 break;
2790 return i;
2793 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2794 files (all of which are at the start of the FILES array), and
2795 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2796 FPS is the vector of open stream corresponding to the files.
2797 Close input and output streams before returning.
2798 OUTPUT_FILE gives the name of the output file. If it is NULL,
2799 the output file is standard output. */
2801 static void
2802 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2803 FILE *ofp, char const *output_file, FILE **fps)
2805 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2806 /* Input buffers for each file. */
2807 struct line saved; /* Saved line storage for unique check. */
2808 struct line const *savedline = NULL;
2809 /* &saved if there is a saved line. */
2810 size_t savealloc = 0; /* Size allocated for the saved line. */
2811 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2812 /* Current line in each line table. */
2813 struct line const **base = xnmalloc (nfiles, sizeof *base);
2814 /* Base of each line table. */
2815 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2816 /* Table representing a permutation of fps,
2817 such that cur[ord[0]] is the smallest line
2818 and will be next output. */
2819 size_t i;
2820 size_t j;
2821 size_t t;
2822 struct keyfield const *key = keylist;
2823 saved.text = NULL;
2825 /* Read initial lines from each input file. */
2826 for (i = 0; i < nfiles; )
2828 initbuf (&buffer[i], sizeof (struct line),
2829 MAX (merge_buffer_size, sort_size / nfiles));
2830 if (fillbuf (&buffer[i], fps[i], files[i].name))
2832 struct line const *linelim = buffer_linelim (&buffer[i]);
2833 cur[i] = linelim - 1;
2834 base[i] = linelim - buffer[i].nlines;
2835 i++;
2837 else
2839 /* fps[i] is empty; eliminate it from future consideration. */
2840 xfclose (fps[i], files[i].name);
2841 if (i < ntemps)
2843 ntemps--;
2844 zaptemp (files[i].name);
2846 free (buffer[i].buf);
2847 --nfiles;
2848 for (j = i; j < nfiles; ++j)
2850 files[j] = files[j + 1];
2851 fps[j] = fps[j + 1];
2856 /* Set up the ord table according to comparisons among input lines.
2857 Since this only reorders two items if one is strictly greater than
2858 the other, it is stable. */
2859 for (i = 0; i < nfiles; ++i)
2860 ord[i] = i;
2861 for (i = 1; i < nfiles; ++i)
2862 if (0 < compare (cur[ord[i - 1]], cur[ord[i]], false))
2863 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2865 /* Repeatedly output the smallest line until no input remains. */
2866 while (nfiles)
2868 struct line const *smallest = cur[ord[0]];
2870 /* If uniquified output is turned on, output only the first of
2871 an identical series of lines. */
2872 if (unique)
2874 if (savedline && compare (savedline, smallest, false))
2876 savedline = NULL;
2877 write_bytes (&saved, ofp, output_file);
2879 if (!savedline)
2881 savedline = &saved;
2882 if (savealloc < smallest->length)
2885 if (! savealloc)
2887 savealloc = smallest->length;
2888 break;
2890 while ((savealloc *= 2) < smallest->length);
2892 saved.text = xrealloc (saved.text, savealloc);
2894 saved.length = smallest->length;
2895 memcpy (saved.text, smallest->text, saved.length);
2896 if (key)
2898 saved.keybeg =
2899 saved.text + (smallest->keybeg - smallest->text);
2900 saved.keylim =
2901 saved.text + (smallest->keylim - smallest->text);
2905 else
2906 write_bytes (smallest, ofp, output_file);
2908 /* Check if we need to read more lines into core. */
2909 if (base[ord[0]] < smallest)
2910 cur[ord[0]] = smallest - 1;
2911 else
2913 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2915 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2916 cur[ord[0]] = linelim - 1;
2917 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2919 else
2921 /* We reached EOF on fps[ord[0]]. */
2922 for (i = 1; i < nfiles; ++i)
2923 if (ord[i] > ord[0])
2924 --ord[i];
2925 --nfiles;
2926 xfclose (fps[ord[0]], files[ord[0]].name);
2927 if (ord[0] < ntemps)
2929 ntemps--;
2930 zaptemp (files[ord[0]].name);
2932 free (buffer[ord[0]].buf);
2933 for (i = ord[0]; i < nfiles; ++i)
2935 fps[i] = fps[i + 1];
2936 files[i] = files[i + 1];
2937 buffer[i] = buffer[i + 1];
2938 cur[i] = cur[i + 1];
2939 base[i] = base[i + 1];
2941 for (i = 0; i < nfiles; ++i)
2942 ord[i] = ord[i + 1];
2943 continue;
2947 /* The new line just read in may be larger than other lines
2948 already in main memory; push it back in the queue until we
2949 encounter a line larger than it. Optimize for the common
2950 case where the new line is smallest. */
2952 size_t lo = 1;
2953 size_t hi = nfiles;
2954 size_t probe = lo;
2955 size_t ord0 = ord[0];
2956 size_t count_of_smaller_lines;
2958 while (lo < hi)
2960 int cmp = compare (cur[ord0], cur[ord[probe]], false);
2961 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2962 hi = probe;
2963 else
2964 lo = probe + 1;
2965 probe = (lo + hi) / 2;
2968 count_of_smaller_lines = lo - 1;
2969 for (j = 0; j < count_of_smaller_lines; j++)
2970 ord[j] = ord[j + 1];
2971 ord[count_of_smaller_lines] = ord0;
2974 /* Free up some resources every once in a while. */
2975 if (MAX_PROCS_BEFORE_REAP < nprocs)
2976 reap_some ();
2979 if (unique && savedline)
2981 write_bytes (&saved, ofp, output_file);
2982 free (saved.text);
2985 xfclose (ofp, output_file);
2986 free (fps);
2987 free (buffer);
2988 free (ord);
2989 free (base);
2990 free (cur);
2993 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2994 files (all of which are at the start of the FILES array), and
2995 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2996 Close input and output files before returning.
2997 OUTPUT_FILE gives the name of the output file.
2999 Return the number of files successfully merged. This number can be
3000 less than NFILES if we ran low on file descriptors, but in this
3001 case it is never less than 2. */
3003 static size_t
3004 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3005 FILE *ofp, char const *output_file)
3007 FILE **fps;
3008 size_t nopened = open_input_files (files, nfiles, &fps);
3009 if (nopened < nfiles && nopened < 2)
3010 die (_("open failed"), files[nopened].name);
3011 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3012 return nopened;
3015 /* Merge into T (of size NLINES) the two sorted arrays of lines
3016 LO (with NLINES / 2 members), and
3017 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3018 T and LO point just past their respective arrays, and the arrays
3019 are in reverse order. NLINES must be at least 2. */
3021 static inline void
3022 mergelines (struct line *restrict t, size_t nlines,
3023 struct line const *restrict lo)
3025 size_t nlo = nlines / 2;
3026 size_t nhi = nlines - nlo;
3027 struct line *hi = t - nlo;
3029 while (true)
3030 if (compare (lo - 1, hi - 1, false) <= 0)
3032 *--t = *--lo;
3033 if (! --nlo)
3035 /* HI must equal T now, and there is no need to copy from
3036 HI to T. */
3037 return;
3040 else
3042 *--t = *--hi;
3043 if (! --nhi)
3046 *--t = *--lo;
3047 while (--nlo);
3049 return;
3054 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3055 NLINES must be at least 2.
3056 The input and output arrays are in reverse order, and LINES and
3057 TEMP point just past the end of their respective arrays.
3059 Use a recursive divide-and-conquer algorithm, in the style
3060 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3061 the optimization suggested by exercise 5.2.4-10; this requires room
3062 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3063 writes that this memory optimization was originally published by
3064 D. A. Bell, Comp J. 1 (1958), 75. */
3066 static void
3067 sequential_sort (struct line *restrict lines, size_t nlines,
3068 struct line *restrict temp, bool to_temp)
3070 if (nlines == 2)
3072 /* Declare `swap' as int, not bool, to work around a bug
3073 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3074 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3075 int swap = (0 < compare (&lines[-1], &lines[-2], false));
3076 if (to_temp)
3078 temp[-1] = lines[-1 - swap];
3079 temp[-2] = lines[-2 + swap];
3081 else if (swap)
3083 temp[-1] = lines[-1];
3084 lines[-1] = lines[-2];
3085 lines[-2] = temp[-1];
3088 else
3090 size_t nlo = nlines / 2;
3091 size_t nhi = nlines - nlo;
3092 struct line *lo = lines;
3093 struct line *hi = lines - nlo;
3095 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3096 if (1 < nlo)
3097 sequential_sort (lo, nlo, temp, !to_temp);
3098 else if (!to_temp)
3099 temp[-1] = lo[-1];
3101 struct line *dest;
3102 struct line const *sorted_lo;
3103 if (to_temp)
3105 dest = temp;
3106 sorted_lo = lines;
3108 else
3110 dest = lines;
3111 sorted_lo = temp;
3113 mergelines (dest, nlines, sorted_lo);
3117 /* Compare two NODEs for priority. The NODE with the higher (numerically
3118 lower) level has priority. If tie, the NODE with the most remaining
3119 lines has priority. */
3121 static int
3122 compare_nodes (const void *a, const void *b)
3124 const struct merge_node *nodea = (const struct merge_node *) a;
3125 const struct merge_node *nodeb = (const struct merge_node *) b;
3126 if (nodea->level == nodeb->level)
3127 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3128 return nodea->level < nodeb->level;
3131 /* Lock a merge tree NODE.
3132 Note spin locks were seen to perform better than mutexes
3133 as long as the number of threads is limited to the
3134 number of processors. */
3136 static inline void
3137 lock_node (struct merge_node *const restrict node)
3139 pthread_spin_lock (node->lock);
3142 /* Unlock a merge tree NODE. */
3144 static inline void
3145 unlock_node (struct merge_node *const restrict node)
3147 pthread_spin_unlock (node->lock);
3150 /* Destroy merge QUEUE. */
3152 static inline void
3153 queue_destroy (struct merge_node_queue *const restrict queue)
3155 heap_free (queue->priority_queue);
3156 pthread_cond_destroy (&queue->cond);
3157 pthread_mutex_destroy (&queue->mutex);
3160 /* Initialize merge QUEUE, allocating space for a maximum of RESERVE nodes.
3161 Though it's highly unlikely all nodes are in the heap at the same time,
3162 RESERVE should accommodate all of them. Counting a NULL dummy head for the
3163 heap, RESERVE should be 2 * NTHREADS. */
3165 static inline void
3166 queue_init (struct merge_node_queue *const restrict queue, size_t reserve)
3168 queue->priority_queue = (struct heap *) heap_alloc (compare_nodes, reserve);
3169 pthread_mutex_init (&queue->mutex, NULL);
3170 pthread_cond_init (&queue->cond, NULL);
3173 /* Insert NODE into priority QUEUE. Assume caller either holds lock on NODE
3174 or does not need to lock NODE. */
3176 static inline void
3177 queue_insert (struct merge_node_queue *const restrict queue,
3178 struct merge_node *const restrict node)
3180 pthread_mutex_lock (&queue->mutex);
3181 heap_insert (queue->priority_queue, node);
3182 node->queued = true;
3183 pthread_mutex_unlock (&queue->mutex);
3184 pthread_cond_signal (&queue->cond);
3187 /* Pop NODE off priority QUEUE. Guarantee a non-null, spinlocked NODE. */
3189 static inline struct merge_node *
3190 queue_pop (struct merge_node_queue *const restrict queue)
3192 struct merge_node *node = NULL;
3194 while (!node)
3196 pthread_mutex_lock (&queue->mutex);
3197 if (queue->priority_queue->count)
3198 node = (struct merge_node *) heap_remove_top (queue->priority_queue);
3199 else
3201 /* Go into conditional wait if no NODE is immediately available. */
3202 pthread_cond_wait (&queue->cond, &queue->mutex);
3204 pthread_mutex_unlock (&queue->mutex);
3206 lock_node (node);
3207 node->queued = false;
3208 return node;
3211 /* If UNQIUE is set, checks to make sure line isn't a duplicate before
3212 outputting. If UNIQUE is not set, output the passed in line. Note that
3213 this function does not actually save the line, nor any key information,
3214 thus is only appropriate for internal sort. */
3216 static inline void
3217 write_unique (struct line *const restrict line, FILE *tfp,
3218 const char *temp_output)
3220 static struct line *saved = NULL;
3222 if (!unique)
3223 write_bytes (line, tfp, temp_output);
3224 else if (!saved || compare (line, saved, false))
3226 saved = line;
3227 write_bytes (line, tfp, temp_output);
3231 /* Merge the lines currently available to a NODE in the binary
3232 merge tree, up to a maximum specified by MAX_MERGE. */
3234 static inline size_t
3235 mergelines_node (struct merge_node *const restrict node, size_t total_lines,
3236 FILE *tfp, const char *temp_output)
3238 struct line *lo_orig = node->lo;
3239 struct line *hi_orig = node->hi;
3240 size_t to_merge = MAX_MERGE (total_lines, node->level);
3241 size_t merged_lo;
3242 size_t merged_hi;
3244 if (node->level > MERGE_ROOT)
3246 /* Merge to destination buffer. */
3247 struct line *dest = *node->dest;
3248 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3249 if (compare (node->lo - 1, node->hi - 1, false) <= 0)
3250 *--dest = *--node->lo;
3251 else
3252 *--dest = *--node->hi;
3254 merged_lo = lo_orig - node->lo;
3255 merged_hi = hi_orig - node->hi;
3257 if (node->nhi == merged_hi)
3258 while (node->lo != node->end_lo && to_merge--)
3259 *--dest = *--node->lo;
3260 else if (node->nlo == merged_lo)
3261 while (node->hi != node->end_hi && to_merge--)
3262 *--dest = *--node->hi;
3264 else
3266 /* Merge directly to output. */
3267 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3269 if (compare (node->lo - 1, node->hi - 1, false) <= 0)
3270 write_unique (--node->lo, tfp, temp_output);
3271 else
3272 write_unique (--node->hi, tfp, temp_output);
3275 merged_lo = lo_orig - node->lo;
3276 merged_hi = hi_orig - node->hi;
3278 if (node->nhi == merged_hi)
3280 while (node->lo != node->end_lo && to_merge--)
3281 write_unique (--node->lo, tfp, temp_output);
3283 else if (node->nlo == merged_lo)
3285 while (node->hi != node->end_hi && to_merge--)
3286 write_unique (--node->hi, tfp, temp_output);
3288 node->dest -= lo_orig - node->lo + hi_orig - node->hi;
3291 /* Update NODE. */
3292 merged_lo = lo_orig - node->lo;
3293 merged_hi = hi_orig - node->hi;
3294 node->nlo -= merged_lo;
3295 node->nhi -= merged_hi;
3296 return merged_lo + merged_hi;
3299 /* Insert NODE into QUEUE if it passes insertion checks. */
3301 static inline void
3302 check_insert (struct merge_node *node,
3303 struct merge_node_queue *const restrict queue)
3305 size_t lo_avail = node->lo - node->end_lo;
3306 size_t hi_avail = node->hi - node->end_hi;
3308 /* Conditions for insertion:
3309 1. NODE is not already in heap.
3310 2. NODE has available lines from both it's children, OR one child has
3311 available lines, but the other has exhausted all its lines. */
3312 if ((!node->queued)
3313 && ((lo_avail && (hi_avail || !(node->nhi)))
3314 || (hi_avail && !(node->nlo))))
3316 queue_insert (queue, node);
3320 /* Update parent merge tree NODE. */
3322 static inline void
3323 update_parent (struct merge_node *const restrict node, size_t merged,
3324 struct merge_node_queue *const restrict queue)
3326 if (node->level > MERGE_ROOT)
3328 lock_node (node->parent);
3329 *node->dest -= merged;
3330 check_insert (node->parent, queue);
3331 unlock_node (node->parent);
3333 else if (node->nlo + node->nhi == 0)
3335 /* If the MERGE_ROOT NODE has finished merging, insert the
3336 MERGE_END node. */
3337 queue_insert (queue, node->parent);
3341 /* Repeatedly pop QUEUE for a NODE with lines to merge, and merge at least
3342 some of those lines, until the MERGE_END node is popped. */
3344 static void
3345 merge_loop (struct merge_node_queue *const restrict queue,
3346 const size_t total_lines, FILE *tfp, const char *temp_output)
3348 while (1)
3350 struct merge_node *node = queue_pop (queue);
3352 if (node->level == MERGE_END)
3354 unlock_node (node);
3355 /* Reinsert so other threads can pop it. */
3356 queue_insert (queue, node);
3357 break;
3359 size_t merged_lines = mergelines_node (node, total_lines, tfp,
3360 temp_output);
3361 check_insert (node, queue);
3362 update_parent (node, merged_lines, queue);
3364 unlock_node (node);
3369 static void sortlines (struct line *restrict, struct line *restrict,
3370 unsigned long int, const size_t,
3371 struct merge_node *const restrict, bool,
3372 struct merge_node_queue *const restrict,
3373 FILE *, const char *);
3375 /* Thread arguments for sortlines_thread. */
3377 struct thread_args
3379 struct line *lines;
3380 struct line *dest;
3381 unsigned long int nthreads;
3382 const size_t total_lines;
3383 struct merge_node *const restrict parent;
3384 bool lo_child;
3385 struct merge_node_queue *const restrict merge_queue;
3386 FILE *tfp;
3387 const char *output_temp;
3390 /* Like sortlines, except with a signature acceptable to pthread_create. */
3392 static void *
3393 sortlines_thread (void *data)
3395 struct thread_args const *args = data;
3396 sortlines (args->lines, args->dest, args->nthreads, args->total_lines,
3397 args->parent, args->lo_child, args->merge_queue,
3398 args->tfp, args->output_temp);
3399 return NULL;
3402 /* There are three phases to the algorithm: node creation, sequential sort,
3403 and binary merge.
3405 During node creation, sortlines recursively visits each node in the
3406 binary merge tree and creates a NODE structure corresponding to all the
3407 future line merging NODE is responsible for. For each call to
3408 sortlines, half the available threads are assigned to each recursive
3409 call, until a leaf node having only 1 available thread is reached.
3411 Each leaf node then performs two sequential sorts, one on each half of
3412 the lines it is responsible for. It records in its NODE structure that
3413 there are two sorted sublists available to merge from, and inserts its
3414 NODE into the priority queue.
3416 The binary merge phase then begins. Each thread drops into a loop
3417 where the thread retrieves a NODE from the priority queue, merges lines
3418 available to that NODE, and potentially insert NODE or its parent back
3419 into the queue if there are sufficient available lines for them to
3420 merge. This continues until all lines at all nodes of the merge tree
3421 have been merged. */
3423 static void
3424 sortlines (struct line *restrict lines, struct line *restrict dest,
3425 unsigned long int nthreads, const size_t total_lines,
3426 struct merge_node *const restrict parent, bool lo_child,
3427 struct merge_node_queue *const restrict merge_queue,
3428 FILE *tfp, const char *temp_output)
3430 /* Create merge tree NODE. */
3431 size_t nlines = (lo_child)? parent->nlo : parent->nhi;
3432 size_t nlo = nlines / 2;
3433 size_t nhi = nlines - nlo;
3434 struct line *lo = dest - total_lines;
3435 struct line *hi = lo - nlo;
3436 struct line **parent_end = (lo_child)? &parent->end_lo : &parent->end_hi;
3437 pthread_spinlock_t lock;
3438 pthread_spin_init (&lock, PTHREAD_PROCESS_PRIVATE);
3439 struct merge_node node = {lo, hi, lo, hi, parent_end, nlo, nhi,
3440 parent->level + 1, parent, false, &lock};
3442 /* Calculate thread arguments. */
3443 unsigned long int lo_threads = nthreads / 2;
3444 unsigned long int hi_threads = nthreads - lo_threads;
3445 pthread_t thread;
3446 struct thread_args args = {lines, lo, lo_threads, total_lines, &node,
3447 true, merge_queue, tfp, temp_output};
3449 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3450 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3452 sortlines (lines - nlo, hi, hi_threads, total_lines, &node, false,
3453 merge_queue, tfp, temp_output);
3454 pthread_join (thread, NULL);
3456 else
3458 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3459 Sort with 1 thread. */
3460 struct line *temp = lines - total_lines;
3461 if (1 < nhi)
3462 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3463 if (1 < nlo)
3464 sequential_sort (lines, nlo, temp, false);
3466 /* Update merge NODE. No need to lock yet. */
3467 node.lo = lines;
3468 node.hi = lines - nlo;
3469 node.end_lo = lines - nlo;
3470 node.end_hi = lines - nlo - nhi;
3472 queue_insert (merge_queue, &node);
3473 merge_loop (merge_queue, total_lines, tfp, temp_output);
3477 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
3478 the same as OUTFILE. If found, merge the found instances (and perhaps
3479 some other files) into a temporary file so that it can in turn be
3480 merged into OUTFILE without destroying OUTFILE before it is completely
3481 read. Return the new value of NFILES, which differs from the old if
3482 some merging occurred.
3484 This test ensures that an otherwise-erroneous use like
3485 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3486 It's not clear that POSIX requires this nicety.
3487 Detect common error cases, but don't try to catch obscure cases like
3488 "cat ... FILE ... | sort -m -o FILE"
3489 where traditional "sort" doesn't copy the input and where
3490 people should know that they're getting into trouble anyway.
3491 Catching these obscure cases would slow down performance in
3492 common cases. */
3494 static size_t
3495 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3496 size_t nfiles, char const *outfile)
3498 size_t i;
3499 bool got_outstat = false;
3500 struct stat outstat;
3502 for (i = ntemps; i < nfiles; i++)
3504 bool is_stdin = STREQ (files[i].name, "-");
3505 bool same;
3506 struct stat instat;
3508 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3509 same = true;
3510 else
3512 if (! got_outstat)
3514 if ((outfile
3515 ? stat (outfile, &outstat)
3516 : fstat (STDOUT_FILENO, &outstat))
3517 != 0)
3518 break;
3519 got_outstat = true;
3522 same = (((is_stdin
3523 ? fstat (STDIN_FILENO, &instat)
3524 : stat (files[i].name, &instat))
3525 == 0)
3526 && SAME_INODE (instat, outstat));
3529 if (same)
3531 FILE *tftp;
3532 pid_t pid;
3533 char *temp = create_temp (&tftp, &pid);
3534 size_t num_merged = 0;
3537 num_merged += mergefiles (&files[i], 0, nfiles - i, tftp, temp);
3538 files[i].name = temp;
3539 files[i].pid = pid;
3541 if (i + num_merged < nfiles)
3542 memmove (&files[i + 1], &files[i + num_merged],
3543 num_merged * sizeof *files);
3544 ntemps += 1;
3545 nfiles -= num_merged - 1;;
3546 i += num_merged;
3548 while (i < nfiles);
3552 return nfiles;
3555 /* Merge the input FILES. NTEMPS is the number of files at the
3556 start of FILES that are temporary; it is zero at the top level.
3557 NFILES is the total number of files. Put the output in
3558 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3560 static void
3561 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3562 char const *output_file)
3564 while (nmerge < nfiles)
3566 /* Number of input files processed so far. */
3567 size_t in;
3569 /* Number of output files generated so far. */
3570 size_t out;
3572 /* nfiles % NMERGE; this counts input files that are left over
3573 after all full-sized merges have been done. */
3574 size_t remainder;
3576 /* Number of easily-available slots at the next loop iteration. */
3577 size_t cheap_slots;
3579 /* Do as many NMERGE-size merges as possible. In the case that
3580 nmerge is bogus, increment by the maximum number of file
3581 descriptors allowed. */
3582 for (out = in = 0; nmerge <= nfiles - in; out++)
3584 FILE *tfp;
3585 pid_t pid;
3586 char *temp = create_temp (&tfp, &pid);
3587 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3588 nmerge, tfp, temp);
3589 ntemps -= MIN (ntemps, num_merged);
3590 files[out].name = temp;
3591 files[out].pid = pid;
3592 in += num_merged;
3595 remainder = nfiles - in;
3596 cheap_slots = nmerge - out % nmerge;
3598 if (cheap_slots < remainder)
3600 /* So many files remain that they can't all be put into the last
3601 NMERGE-sized output window. Do one more merge. Merge as few
3602 files as possible, to avoid needless I/O. */
3603 size_t nshortmerge = remainder - cheap_slots + 1;
3604 FILE *tfp;
3605 pid_t pid;
3606 char *temp = create_temp (&tfp, &pid);
3607 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3608 nshortmerge, tfp, temp);
3609 ntemps -= MIN (ntemps, num_merged);
3610 files[out].name = temp;
3611 files[out++].pid = pid;
3612 in += num_merged;
3615 /* Put the remaining input files into the last NMERGE-sized output
3616 window, so they will be merged in the next pass. */
3617 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3618 ntemps += out;
3619 nfiles -= in - out;
3622 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
3624 /* We aren't guaranteed that this final mergefiles will work, therefore we
3625 try to merge into the output, and then merge as much as we can into a
3626 temp file if we can't. Repeat. */
3628 while (true)
3630 /* Merge directly into the output file if possible. */
3631 FILE **fps;
3632 size_t nopened = open_input_files (files, nfiles, &fps);
3634 if (nopened == nfiles)
3636 FILE *ofp = stream_open (output_file, "w");
3637 if (ofp)
3639 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3640 break;
3642 if (errno != EMFILE || nopened <= 2)
3643 die (_("open failed"), output_file);
3645 else if (nopened <= 2)
3646 die (_("open failed"), files[nopened].name);
3648 /* We ran out of file descriptors. Close one of the input
3649 files, to gain a file descriptor. Then create a temporary
3650 file with our spare file descriptor. Retry if that failed
3651 (e.g., some other process could open a file between the time
3652 we closed and tried to create). */
3653 FILE *tfp;
3654 pid_t pid;
3655 char *temp;
3658 nopened--;
3659 xfclose (fps[nopened], files[nopened].name);
3660 temp = maybe_create_temp (&tfp, &pid, ! (nopened <= 2));
3662 while (!temp);
3664 /* Merge into the newly allocated temporary. */
3665 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp, fps);
3666 ntemps -= MIN (ntemps, nopened);
3667 files[0].name = temp;
3668 files[0].pid = pid;
3670 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3671 ntemps++;
3672 nfiles -= nopened - 1;
3676 /* Sort NFILES FILES onto OUTPUT_FILE. */
3678 static void
3679 sort (char * const *files, size_t nfiles, char const *output_file,
3680 unsigned long int nthreads)
3682 struct buffer buf;
3683 size_t ntemps = 0;
3684 bool output_file_created = false;
3686 buf.alloc = 0;
3688 while (nfiles)
3690 char const *temp_output;
3691 char const *file = *files;
3692 FILE *fp = xfopen (file, "r");
3693 FILE *tfp;
3695 size_t bytes_per_line;
3696 if (nthreads > 1)
3698 /* Get log P. */
3699 unsigned long int tmp = 1;
3700 size_t mult = 1;
3701 while (tmp < nthreads)
3703 tmp *= 2;
3704 mult++;
3706 bytes_per_line = (mult * sizeof (struct line));
3708 else
3709 bytes_per_line = sizeof (struct line) * 3 / 2;
3711 if (! buf.alloc)
3712 initbuf (&buf, bytes_per_line,
3713 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3714 buf.eof = false;
3715 files++;
3716 nfiles--;
3718 while (fillbuf (&buf, fp, file))
3720 struct line *line;
3722 if (buf.eof && nfiles
3723 && (bytes_per_line + 1
3724 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3726 /* End of file, but there is more input and buffer room.
3727 Concatenate the next input file; this is faster in
3728 the usual case. */
3729 buf.left = buf.used;
3730 break;
3733 line = buffer_linelim (&buf);
3734 if (buf.eof && !nfiles && !ntemps && !buf.left)
3736 xfclose (fp, file);
3737 tfp = xfopen (output_file, "w");
3738 temp_output = output_file;
3739 output_file_created = true;
3741 else
3743 ++ntemps;
3744 temp_output = create_temp (&tfp, NULL);
3746 if (1 < buf.nlines)
3748 struct merge_node_queue merge_queue;
3749 queue_init (&merge_queue, 2 * nthreads);
3751 pthread_spinlock_t lock;
3752 pthread_spin_init (&lock, PTHREAD_PROCESS_PRIVATE);
3753 struct merge_node node =
3754 {NULL, NULL, NULL, NULL, NULL, buf.nlines,
3755 buf.nlines, MERGE_END, NULL, false, &lock};
3757 sortlines (line, line, nthreads, buf.nlines, &node, true,
3758 &merge_queue, tfp, temp_output);
3759 queue_destroy (&merge_queue);
3761 else
3762 write_unique (line - 1, tfp, temp_output);
3764 xfclose (tfp, temp_output);
3766 /* Free up some resources every once in a while. */
3767 if (MAX_PROCS_BEFORE_REAP < nprocs)
3768 reap_some ();
3770 if (output_file_created)
3771 goto finish;
3773 xfclose (fp, file);
3776 finish:
3777 free (buf.buf);
3779 if (! output_file_created)
3781 size_t i;
3782 struct tempnode *node = temphead;
3783 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3784 for (i = 0; node; i++)
3786 tempfiles[i].name = node->name;
3787 tempfiles[i].pid = node->pid;
3788 node = node->next;
3790 merge (tempfiles, ntemps, ntemps, output_file);
3791 free (tempfiles);
3795 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3797 static void
3798 insertkey (struct keyfield *key_arg)
3800 struct keyfield **p;
3801 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3803 for (p = &keylist; *p; p = &(*p)->next)
3804 continue;
3805 *p = key;
3806 key->next = NULL;
3809 /* Report a bad field specification SPEC, with extra info MSGID. */
3811 static void badfieldspec (char const *, char const *)
3812 ATTRIBUTE_NORETURN;
3813 static void
3814 badfieldspec (char const *spec, char const *msgid)
3816 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3817 _(msgid), quote (spec));
3818 abort ();
3821 /* Report incompatible options. */
3823 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3824 static void
3825 incompatible_options (char const *opts)
3827 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
3828 abort ();
3831 /* Check compatibility of ordering options. */
3833 static void
3834 check_ordering_compatibility (void)
3836 struct keyfield *key;
3838 for (key = keylist; key; key = key->next)
3839 if ((1 < (key->random + key->numeric + key->general_numeric + key->month
3840 + key->version + !!key->ignore + key->human_numeric))
3841 || (key->random && key->translate))
3843 /* The following is too big, but guaranteed to be "big enough". */
3844 char opts[sizeof short_options];
3845 /* Clear flags we're not interested in. */
3846 key->skipsblanks = key->skipeblanks = key->reverse = false;
3847 key_to_opts (key, opts);
3848 incompatible_options (opts);
3852 /* Parse the leading integer in STRING and store the resulting value
3853 (which must fit into size_t) into *VAL. Return the address of the
3854 suffix after the integer. If the value is too large, silently
3855 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3856 failure; otherwise, report MSGID and exit on failure. */
3858 static char const *
3859 parse_field_count (char const *string, size_t *val, char const *msgid)
3861 char *suffix;
3862 uintmax_t n;
3864 switch (xstrtoumax (string, &suffix, 10, &n, ""))
3866 case LONGINT_OK:
3867 case LONGINT_INVALID_SUFFIX_CHAR:
3868 *val = n;
3869 if (*val == n)
3870 break;
3871 /* Fall through. */
3872 case LONGINT_OVERFLOW:
3873 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3874 *val = SIZE_MAX;
3875 break;
3877 case LONGINT_INVALID:
3878 if (msgid)
3879 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3880 _(msgid), quote (string));
3881 return NULL;
3884 return suffix;
3887 /* Handle interrupts and hangups. */
3889 static void
3890 sighandler (int sig)
3892 if (! SA_NOCLDSTOP)
3893 signal (sig, SIG_IGN);
3895 cleanup ();
3897 signal (sig, SIG_DFL);
3898 raise (sig);
3901 /* Set the ordering options for KEY specified in S.
3902 Return the address of the first character in S that
3903 is not a valid ordering option.
3904 BLANKTYPE is the kind of blanks that 'b' should skip. */
3906 static char *
3907 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
3909 while (*s)
3911 switch (*s)
3913 case 'b':
3914 if (blanktype == bl_start || blanktype == bl_both)
3915 key->skipsblanks = true;
3916 if (blanktype == bl_end || blanktype == bl_both)
3917 key->skipeblanks = true;
3918 break;
3919 case 'd':
3920 key->ignore = nondictionary;
3921 break;
3922 case 'f':
3923 key->translate = fold_toupper;
3924 break;
3925 case 'g':
3926 key->general_numeric = true;
3927 break;
3928 case 'h':
3929 key->human_numeric = true;
3930 break;
3931 case 'i':
3932 /* Option order should not matter, so don't let -i override
3933 -d. -d implies -i, but -i does not imply -d. */
3934 if (! key->ignore)
3935 key->ignore = nonprinting;
3936 break;
3937 case 'M':
3938 key->month = true;
3939 break;
3940 case 'n':
3941 key->numeric = true;
3942 break;
3943 case 'R':
3944 key->random = true;
3945 break;
3946 case 'r':
3947 key->reverse = true;
3948 break;
3949 case 'V':
3950 key->version = true;
3951 break;
3952 default:
3953 return (char *) s;
3955 ++s;
3957 return (char *) s;
3960 static struct keyfield *
3961 key_init (struct keyfield *key)
3963 memset (key, 0, sizeof *key);
3964 key->eword = SIZE_MAX;
3965 key->iec_present = -1;
3966 return key;
3970 main (int argc, char **argv)
3972 struct keyfield *key;
3973 struct keyfield key_buf;
3974 struct keyfield gkey;
3975 bool gkey_only = false;
3976 char const *s;
3977 int c = 0;
3978 char checkonly = 0;
3979 bool mergeonly = false;
3980 char *random_source = NULL;
3981 bool need_random = false;
3982 unsigned long int nthreads = 0;
3983 size_t nfiles = 0;
3984 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
3985 bool obsolete_usage = (posix2_version () < 200112);
3986 char **files;
3987 char *files_from = NULL;
3988 struct Tokens tok;
3989 char const *outfile = NULL;
3991 initialize_main (&argc, &argv);
3992 set_program_name (argv[0]);
3993 setlocale (LC_ALL, "");
3994 bindtextdomain (PACKAGE, LOCALEDIR);
3995 textdomain (PACKAGE);
3997 initialize_exit_failure (SORT_FAILURE);
3999 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4000 #if HAVE_NL_LANGINFO
4001 hard_LC_TIME = hard_locale (LC_TIME);
4002 #endif
4004 /* Get locale's representation of the decimal point. */
4006 struct lconv const *locale = localeconv ();
4008 /* If the locale doesn't define a decimal point, or if the decimal
4009 point is multibyte, use the C locale's decimal point. FIXME:
4010 add support for multibyte decimal points. */
4011 decimal_point = to_uchar (locale->decimal_point[0]);
4012 if (! decimal_point || locale->decimal_point[1])
4013 decimal_point = '.';
4015 /* FIXME: add support for multibyte thousands separators. */
4016 thousands_sep = to_uchar (*locale->thousands_sep);
4017 if (! thousands_sep || locale->thousands_sep[1])
4018 thousands_sep = -1;
4021 have_read_stdin = false;
4022 inittables ();
4025 size_t i;
4026 static int const sig[] =
4028 /* The usual suspects. */
4029 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4030 #ifdef SIGPOLL
4031 SIGPOLL,
4032 #endif
4033 #ifdef SIGPROF
4034 SIGPROF,
4035 #endif
4036 #ifdef SIGVTALRM
4037 SIGVTALRM,
4038 #endif
4039 #ifdef SIGXCPU
4040 SIGXCPU,
4041 #endif
4042 #ifdef SIGXFSZ
4043 SIGXFSZ,
4044 #endif
4046 enum { nsigs = ARRAY_CARDINALITY (sig) };
4048 #if SA_NOCLDSTOP
4049 struct sigaction act;
4051 sigemptyset (&caught_signals);
4052 for (i = 0; i < nsigs; i++)
4054 sigaction (sig[i], NULL, &act);
4055 if (act.sa_handler != SIG_IGN)
4056 sigaddset (&caught_signals, sig[i]);
4059 act.sa_handler = sighandler;
4060 act.sa_mask = caught_signals;
4061 act.sa_flags = 0;
4063 for (i = 0; i < nsigs; i++)
4064 if (sigismember (&caught_signals, sig[i]))
4065 sigaction (sig[i], &act, NULL);
4066 #else
4067 for (i = 0; i < nsigs; i++)
4068 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4070 signal (sig[i], sighandler);
4071 siginterrupt (sig[i], 1);
4073 #endif
4075 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4077 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4078 atexit (exit_cleanup);
4080 key_init (&gkey);
4081 gkey.sword = SIZE_MAX;
4083 files = xnmalloc (argc, sizeof *files);
4085 while (true)
4087 /* Parse an operand as a file after "--" was seen; or if
4088 pedantic and a file was seen, unless the POSIX version
4089 predates 1003.1-2001 and -c was not seen and the operand is
4090 "-o FILE" or "-oFILE". */
4091 int oi = -1;
4093 if (c == -1
4094 || (posixly_correct && nfiles != 0
4095 && ! (obsolete_usage
4096 && ! checkonly
4097 && optind != argc
4098 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4099 && (argv[optind][2] || optind + 1 != argc)))
4100 || ((c = getopt_long (argc, argv, short_options,
4101 long_options, &oi))
4102 == -1))
4104 if (argc <= optind)
4105 break;
4106 files[nfiles++] = argv[optind++];
4108 else switch (c)
4110 case 1:
4111 key = NULL;
4112 if (optarg[0] == '+')
4114 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4115 && ISDIGIT (argv[optind][1]));
4116 obsolete_usage |= minus_pos_usage && !posixly_correct;
4117 if (obsolete_usage)
4119 /* Treat +POS1 [-POS2] as a key if possible; but silently
4120 treat an operand as a file if it is not a valid +POS1. */
4121 key = key_init (&key_buf);
4122 s = parse_field_count (optarg + 1, &key->sword, NULL);
4123 if (s && *s == '.')
4124 s = parse_field_count (s + 1, &key->schar, NULL);
4125 if (! (key->sword || key->schar))
4126 key->sword = SIZE_MAX;
4127 if (! s || *set_ordering (s, key, bl_start))
4128 key = NULL;
4129 else
4131 if (minus_pos_usage)
4133 char const *optarg1 = argv[optind++];
4134 s = parse_field_count (optarg1 + 1, &key->eword,
4135 N_("invalid number after `-'"));
4136 if (*s == '.')
4137 s = parse_field_count (s + 1, &key->echar,
4138 N_("invalid number after `.'"));
4139 if (!key->echar && key->eword)
4141 /* obsolescent syntax +A.x -B.y is equivalent to:
4142 -k A+1.x+1,B.y (when y = 0)
4143 -k A+1.x+1,B+1.y (when y > 0)
4144 So eword is decremented as in the -k case
4145 only when the end field (B) is specified and
4146 echar (y) is 0. */
4147 key->eword--;
4149 if (*set_ordering (s, key, bl_end))
4150 badfieldspec (optarg1,
4151 N_("stray character in field spec"));
4153 key->obsolete_used = true;
4154 insertkey (key);
4158 if (! key)
4159 files[nfiles++] = optarg;
4160 break;
4162 case SORT_OPTION:
4163 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4164 /* Fall through. */
4165 case 'b':
4166 case 'd':
4167 case 'f':
4168 case 'g':
4169 case 'h':
4170 case 'i':
4171 case 'M':
4172 case 'n':
4173 case 'r':
4174 case 'R':
4175 case 'V':
4177 char str[2];
4178 str[0] = c;
4179 str[1] = '\0';
4180 set_ordering (str, &gkey, bl_both);
4182 break;
4184 case CHECK_OPTION:
4185 c = (optarg
4186 ? XARGMATCH ("--check", optarg, check_args, check_types)
4187 : 'c');
4188 /* Fall through. */
4189 case 'c':
4190 case 'C':
4191 if (checkonly && checkonly != c)
4192 incompatible_options ("cC");
4193 checkonly = c;
4194 break;
4196 case COMPRESS_PROGRAM_OPTION:
4197 if (compress_program && !STREQ (compress_program, optarg))
4198 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4199 compress_program = optarg;
4200 break;
4202 case DEBUG_PROGRAM_OPTION:
4203 debug = true;
4204 break;
4206 case FILES0_FROM_OPTION:
4207 files_from = optarg;
4208 break;
4210 case 'k':
4211 key = key_init (&key_buf);
4213 /* Get POS1. */
4214 s = parse_field_count (optarg, &key->sword,
4215 N_("invalid number at field start"));
4216 if (! key->sword--)
4218 /* Provoke with `sort -k0' */
4219 badfieldspec (optarg, N_("field number is zero"));
4221 if (*s == '.')
4223 s = parse_field_count (s + 1, &key->schar,
4224 N_("invalid number after `.'"));
4225 if (! key->schar--)
4227 /* Provoke with `sort -k1.0' */
4228 badfieldspec (optarg, N_("character offset is zero"));
4231 if (! (key->sword || key->schar))
4232 key->sword = SIZE_MAX;
4233 s = set_ordering (s, key, bl_start);
4234 if (*s != ',')
4236 key->eword = SIZE_MAX;
4237 key->echar = 0;
4239 else
4241 /* Get POS2. */
4242 s = parse_field_count (s + 1, &key->eword,
4243 N_("invalid number after `,'"));
4244 if (! key->eword--)
4246 /* Provoke with `sort -k1,0' */
4247 badfieldspec (optarg, N_("field number is zero"));
4249 if (*s == '.')
4251 s = parse_field_count (s + 1, &key->echar,
4252 N_("invalid number after `.'"));
4254 s = set_ordering (s, key, bl_end);
4256 if (*s)
4257 badfieldspec (optarg, N_("stray character in field spec"));
4258 insertkey (key);
4259 break;
4261 case 'm':
4262 mergeonly = true;
4263 break;
4265 case NMERGE_OPTION:
4266 specify_nmerge (oi, c, optarg);
4267 break;
4269 case 'o':
4270 if (outfile && !STREQ (outfile, optarg))
4271 error (SORT_FAILURE, 0, _("multiple output files specified"));
4272 outfile = optarg;
4273 break;
4275 case RANDOM_SOURCE_OPTION:
4276 if (random_source && !STREQ (random_source, optarg))
4277 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4278 random_source = optarg;
4279 break;
4281 case 's':
4282 stable = true;
4283 break;
4285 case 'S':
4286 specify_sort_size (oi, c, optarg);
4287 break;
4289 case 't':
4291 char newtab = optarg[0];
4292 if (! newtab)
4293 error (SORT_FAILURE, 0, _("empty tab"));
4294 if (optarg[1])
4296 if (STREQ (optarg, "\\0"))
4297 newtab = '\0';
4298 else
4300 /* Provoke with `sort -txx'. Complain about
4301 "multi-character tab" instead of "multibyte tab", so
4302 that the diagnostic's wording does not need to be
4303 changed once multibyte characters are supported. */
4304 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4305 quote (optarg));
4308 if (tab != TAB_DEFAULT && tab != newtab)
4309 error (SORT_FAILURE, 0, _("incompatible tabs"));
4310 tab = newtab;
4312 break;
4314 case 'T':
4315 add_temp_dir (optarg);
4316 break;
4318 case PARALLEL_OPTION:
4319 nthreads = specify_nthreads (oi, c, optarg);
4320 break;
4322 case 'u':
4323 unique = true;
4324 break;
4326 case 'y':
4327 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4328 through Solaris 7. It is also accepted by many non-Solaris
4329 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4330 -y is marked as obsolete starting with Solaris 8 (1999), but is
4331 still accepted as of Solaris 10 prerelease (2004).
4333 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4334 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4335 and which in general ignores the argument after "-y" if it
4336 consists entirely of digits (it can even be empty). */
4337 if (optarg == argv[optind - 1])
4339 char const *p;
4340 for (p = optarg; ISDIGIT (*p); p++)
4341 continue;
4342 optind -= (*p != '\0');
4344 break;
4346 case 'z':
4347 eolchar = 0;
4348 break;
4350 case_GETOPT_HELP_CHAR;
4352 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4354 default:
4355 usage (SORT_FAILURE);
4359 if (files_from)
4361 FILE *stream;
4363 /* When using --files0-from=F, you may not specify any files
4364 on the command-line. */
4365 if (nfiles)
4367 error (0, 0, _("extra operand %s"), quote (files[0]));
4368 fprintf (stderr, "%s\n",
4369 _("file operands cannot be combined with --files0-from"));
4370 usage (SORT_FAILURE);
4373 if (STREQ (files_from, "-"))
4374 stream = stdin;
4375 else
4377 stream = fopen (files_from, "r");
4378 if (stream == NULL)
4379 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4380 quote (files_from));
4383 readtokens0_init (&tok);
4385 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4386 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4387 quote (files_from));
4389 if (tok.n_tok)
4391 size_t i;
4392 free (files);
4393 files = tok.tok;
4394 nfiles = tok.n_tok;
4395 for (i = 0; i < nfiles; i++)
4397 if (STREQ (files[i], "-"))
4398 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4399 "no file name of %s allowed"),
4400 quote (files[i]));
4401 else if (files[i][0] == '\0')
4403 /* Using the standard `filename:line-number:' prefix here is
4404 not totally appropriate, since NUL is the separator, not NL,
4405 but it might be better than nothing. */
4406 unsigned long int file_number = i + 1;
4407 error (SORT_FAILURE, 0,
4408 _("%s:%lu: invalid zero-length file name"),
4409 quotearg_colon (files_from), file_number);
4413 else
4414 error (SORT_FAILURE, 0, _("no input from %s"),
4415 quote (files_from));
4418 /* Inheritance of global options to individual keys. */
4419 for (key = keylist; key; key = key->next)
4421 if (default_key_compare (key) && !key->reverse)
4423 key->ignore = gkey.ignore;
4424 key->translate = gkey.translate;
4425 key->skipsblanks = gkey.skipsblanks;
4426 key->skipeblanks = gkey.skipeblanks;
4427 key->month = gkey.month;
4428 key->numeric = gkey.numeric;
4429 key->general_numeric = gkey.general_numeric;
4430 key->human_numeric = gkey.human_numeric;
4431 key->version = gkey.version;
4432 key->random = gkey.random;
4433 key->reverse = gkey.reverse;
4436 need_random |= key->random;
4439 if (!keylist && !default_key_compare (&gkey))
4441 gkey_only = true;
4442 insertkey (&gkey);
4443 need_random |= gkey.random;
4446 check_ordering_compatibility ();
4448 /* Disable this combination so that users are less likely
4449 to inadvertantly update a file with debugging enabled.
4450 Also it simplifies the code for handling temp files. */
4451 if (debug && outfile)
4452 error (SORT_FAILURE, 0, _("options -o and --debug are incompatible"));
4454 if (debug)
4456 /* Always output the locale in debug mode, since this
4457 is such a common source of confusion. */
4458 if (hard_LC_COLLATE)
4459 error (0, 0, _("using %s sorting rules"),
4460 quote (setlocale (LC_COLLATE, NULL)));
4461 else
4462 error (0, 0, _("using simple byte comparison"));
4463 key_warnings (&gkey, gkey_only);
4466 reverse = gkey.reverse;
4468 if (need_random)
4469 random_md5_state_init (random_source);
4471 if (temp_dir_count == 0)
4473 char const *tmp_dir = getenv ("TMPDIR");
4474 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4477 if (nfiles == 0)
4479 static char *minus = (char *) "-";
4480 nfiles = 1;
4481 free (files);
4482 files = &minus;
4485 /* Need to re-check that we meet the minimum requirement for memory
4486 usage with the final value for NMERGE. */
4487 if (0 < sort_size)
4488 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4490 if (checkonly)
4492 if (nfiles > 1)
4493 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4494 quote (files[1]), checkonly);
4496 if (outfile)
4498 static char opts[] = {0, 'o', 0};
4499 opts[0] = checkonly;
4500 incompatible_options (opts);
4503 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4504 input is not properly sorted. */
4505 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4508 if (mergeonly)
4510 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4511 size_t i;
4513 for (i = 0; i < nfiles; ++i)
4514 sortfiles[i].name = files[i];
4516 merge (sortfiles, 0, nfiles, outfile);
4517 IF_LINT (free (sortfiles));
4519 else
4521 /* If NTHREADS > number of cores on the machine, spinlocking
4522 could be wasteful. */
4523 unsigned long int np2 = num_processors (NPROC_CURRENT_OVERRIDABLE);
4524 if (!nthreads || nthreads > np2)
4525 nthreads = np2;
4527 sort (files, nfiles, outfile, nthreads);
4530 if (have_read_stdin && fclose (stdin) == EOF)
4531 die (_("close failed"), "-");
4533 exit (EXIT_SUCCESS);