sort: -R now uses less memory on long lines with internal NULs
[coreutils.git] / src / sort.c
blob54382631e93078402b588dfa2079d9ecfd3be714
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/types.h>
28 #include <sys/wait.h>
29 #include <signal.h>
30 #include "system.h"
31 #include "argmatch.h"
32 #include "error.h"
33 #include "fadvise.h"
34 #include "filevercmp.h"
35 #include "hard-locale.h"
36 #include "hash.h"
37 #include "heap.h"
38 #include "md5.h"
39 #include "mbswidth.h"
40 #include "nproc.h"
41 #include "physmem.h"
42 #include "posixver.h"
43 #include "quote.h"
44 #include "quotearg.h"
45 #include "randread.h"
46 #include "readtokens0.h"
47 #include "stdio--.h"
48 #include "stdlib--.h"
49 #include "strnumcmp.h"
50 #include "xmemcoll.h"
51 #include "xnanosleep.h"
52 #include "xstrtol.h"
54 #if HAVE_SYS_RESOURCE_H
55 # include <sys/resource.h>
56 #endif
57 #ifndef RLIMIT_DATA
58 struct rlimit { size_t rlim_cur; };
59 # define getrlimit(Resource, Rlp) (-1)
60 #endif
62 /* The official name of this program (e.g., no `g' prefix). */
63 #define PROGRAM_NAME "sort"
65 #define AUTHORS \
66 proper_name ("Mike Haertel"), \
67 proper_name ("Paul Eggert")
69 #if HAVE_LANGINFO_CODESET
70 # include <langinfo.h>
71 #endif
73 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
74 present. */
75 #ifndef SA_NOCLDSTOP
76 # define SA_NOCLDSTOP 0
77 /* No sigprocmask. Always 'return' zero. */
78 # define sigprocmask(How, Set, Oset) (0)
79 # define sigset_t int
80 # if ! HAVE_SIGINTERRUPT
81 # define siginterrupt(sig, flag) /* empty */
82 # endif
83 #endif
85 #if !defined OPEN_MAX && defined NR_OPEN
86 # define OPEN_MAX NR_OPEN
87 #endif
88 #if !defined OPEN_MAX
89 # define OPEN_MAX 20
90 #endif
92 #define UCHAR_LIM (UCHAR_MAX + 1)
94 #ifndef DEFAULT_TMPDIR
95 # define DEFAULT_TMPDIR "/tmp"
96 #endif
98 /* Maximum number of lines to merge every time a NODE is taken from
99 the MERGE_QUEUE. Node is at LEVEL in the binary merge tree,
100 and is responsible for merging TOTAL lines. */
101 #define MAX_MERGE(total, level) ((total) / ((2 << level) * (2 << level)) + 1)
103 /* Heuristic value for the number of lines for which it is worth
104 creating a subthread, during an internal merge sort, on a machine
105 that has processors galore. Currently this number is just a guess.
106 This value must be at least 4. We don't know of any machine where
107 this number has any practical effect. */
108 enum { SUBTHREAD_LINES_HEURISTIC = 4 };
110 /* Exit statuses. */
111 enum
113 /* POSIX says to exit with status 1 if invoked with -c and the
114 input is not properly sorted. */
115 SORT_OUT_OF_ORDER = 1,
117 /* POSIX says any other irregular exit must exit with a status
118 code greater than 1. */
119 SORT_FAILURE = 2
122 enum
124 /* The number of times we should try to fork a compression process
125 (we retry if the fork call fails). We don't _need_ to compress
126 temp files, this is just to reduce disk access, so this number
127 can be small. Each retry doubles in duration. */
128 MAX_FORK_TRIES_COMPRESS = 4,
130 /* The number of times we should try to fork a decompression process.
131 If we can't fork a decompression process, we can't sort, so this
132 number should be big. Each retry doubles in duration. */
133 MAX_FORK_TRIES_DECOMPRESS = 9
136 enum
138 /* Level of the end-of-merge node, one level above the root. */
139 MERGE_END = 0,
141 /* Level of the root node in merge tree. */
142 MERGE_ROOT = 1
145 /* The representation of the decimal point in the current locale. */
146 static int decimal_point;
148 /* Thousands separator; if -1, then there isn't one. */
149 static int thousands_sep;
151 /* Nonzero if the corresponding locales are hard. */
152 static bool hard_LC_COLLATE;
153 #if HAVE_NL_LANGINFO
154 static bool hard_LC_TIME;
155 #endif
157 #define NONZERO(x) ((x) != 0)
159 /* The kind of blanks for '-b' to skip in various options. */
160 enum blanktype { bl_start, bl_end, bl_both };
162 /* The character marking end of line. Default to \n. */
163 static char eolchar = '\n';
165 /* Lines are held in core as counted strings. */
166 struct line
168 char *text; /* Text of the line. */
169 size_t length; /* Length including final newline. */
170 char *keybeg; /* Start of first key. */
171 char *keylim; /* Limit of first key. */
174 /* Input buffers. */
175 struct buffer
177 char *buf; /* Dynamically allocated buffer,
178 partitioned into 3 regions:
179 - input data;
180 - unused area;
181 - an array of lines, in reverse order. */
182 size_t used; /* Number of bytes used for input data. */
183 size_t nlines; /* Number of lines in the line array. */
184 size_t alloc; /* Number of bytes allocated. */
185 size_t left; /* Number of bytes left from previous reads. */
186 size_t line_bytes; /* Number of bytes to reserve for each line. */
187 bool eof; /* An EOF has been read. */
190 struct keyfield
192 size_t sword; /* Zero-origin 'word' to start at. */
193 size_t schar; /* Additional characters to skip. */
194 size_t eword; /* Zero-origin last 'word' of key. */
195 size_t echar; /* Additional characters in field. */
196 bool const *ignore; /* Boolean array of characters to ignore. */
197 char const *translate; /* Translation applied to characters. */
198 bool skipsblanks; /* Skip leading blanks when finding start. */
199 bool skipeblanks; /* Skip leading blanks when finding end. */
200 bool numeric; /* Flag for numeric comparison. Handle
201 strings of digits with optional decimal
202 point, but no exponential notation. */
203 bool random; /* Sort by random hash of key. */
204 bool general_numeric; /* Flag for general, numeric comparison.
205 Handle numbers in exponential notation. */
206 bool human_numeric; /* Flag for sorting by human readable
207 units with either SI xor IEC prefixes. */
208 bool month; /* Flag for comparison by month name. */
209 bool reverse; /* Reverse the sense of comparison. */
210 bool version; /* sort by version number */
211 bool obsolete_used; /* obsolescent key option format is used. */
212 struct keyfield *next; /* Next keyfield to try. */
215 struct month
217 char const *name;
218 int val;
221 /* Binary merge tree node. */
222 struct merge_node
224 struct line *lo; /* Lines to merge from LO child node. */
225 struct line *hi; /* Lines to merge from HI child ndoe. */
226 struct line *end_lo; /* End of available lines from LO. */
227 struct line *end_hi; /* End of available lines from HI. */
228 struct line **dest; /* Pointer to destination of merge. */
229 size_t nlo; /* Total Lines remaining from LO. */
230 size_t nhi; /* Total lines remaining from HI. */
231 size_t level; /* Level in merge tree. */
232 struct merge_node *parent; /* Parent node. */
233 bool queued; /* Node is already in heap. */
234 pthread_spinlock_t *lock; /* Lock for node operations. */
237 /* Priority queue of merge nodes. */
238 struct merge_node_queue
240 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
241 pthread_mutex_t mutex; /* Lock for queue operations. */
242 pthread_cond_t cond; /* Conditional wait for empty queue to populate
243 when popping. */
246 /* FIXME: None of these tables work with multibyte character sets.
247 Also, there are many other bugs when handling multibyte characters.
248 One way to fix this is to rewrite `sort' to use wide characters
249 internally, but doing this with good performance is a bit
250 tricky. */
252 /* Table of blanks. */
253 static bool blanks[UCHAR_LIM];
255 /* Table of non-printing characters. */
256 static bool nonprinting[UCHAR_LIM];
258 /* Table of non-dictionary characters (not letters, digits, or blanks). */
259 static bool nondictionary[UCHAR_LIM];
261 /* Translation table folding lower case to upper. */
262 static unsigned char fold_toupper[UCHAR_LIM];
264 #define MONTHS_PER_YEAR 12
266 /* Table mapping month names to integers.
267 Alphabetic order allows binary search. */
268 static struct month monthtab[] =
270 {"APR", 4},
271 {"AUG", 8},
272 {"DEC", 12},
273 {"FEB", 2},
274 {"JAN", 1},
275 {"JUL", 7},
276 {"JUN", 6},
277 {"MAR", 3},
278 {"MAY", 5},
279 {"NOV", 11},
280 {"OCT", 10},
281 {"SEP", 9}
284 /* During the merge phase, the number of files to merge at once. */
285 #define NMERGE_DEFAULT 16
287 /* Minimum size for a merge or check buffer. */
288 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
290 /* Minimum sort size; the code might not work with smaller sizes. */
291 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
293 /* The number of bytes needed for a merge or check buffer, which can
294 function relatively efficiently even if it holds only one line. If
295 a longer line is seen, this value is increased. */
296 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
298 /* The approximate maximum number of bytes of main memory to use, as
299 specified by the user. Zero if the user has not specified a size. */
300 static size_t sort_size;
302 /* The guessed size for non-regular files. */
303 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
305 /* Array of directory names in which any temporary files are to be created. */
306 static char const **temp_dirs;
308 /* Number of temporary directory names used. */
309 static size_t temp_dir_count;
311 /* Number of allocated slots in temp_dirs. */
312 static size_t temp_dir_alloc;
314 /* Flag to reverse the order of all comparisons. */
315 static bool reverse;
317 /* Flag for stable sort. This turns off the last ditch bytewise
318 comparison of lines, and instead leaves lines in the same order
319 they were read if all keys compare equal. */
320 static bool stable;
322 /* If TAB has this value, blanks separate fields. */
323 enum { TAB_DEFAULT = CHAR_MAX + 1 };
325 /* Tab character separating fields. If TAB_DEFAULT, then fields are
326 separated by the empty string between a non-blank character and a blank
327 character. */
328 static int tab = TAB_DEFAULT;
330 /* Flag to remove consecutive duplicate lines from the output.
331 Only the last of a sequence of equal lines will be output. */
332 static bool unique;
334 /* Nonzero if any of the input files are the standard input. */
335 static bool have_read_stdin;
337 /* List of key field comparisons to be tried. */
338 static struct keyfield *keylist;
340 /* Program used to (de)compress temp files. Must accept -d. */
341 static char const *compress_program;
343 /* Annotate the output with extra info to aid the user. */
344 static bool debug;
346 /* Maximum number of files to merge in one go. If more than this
347 number are present, temp files will be used. */
348 static unsigned int nmerge = NMERGE_DEFAULT;
350 /* Report MESSAGE for FILE, then clean up and exit.
351 If FILE is null, it represents standard output. */
353 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
354 static void
355 die (char const *message, char const *file)
357 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
358 exit (SORT_FAILURE);
361 void
362 usage (int status)
364 if (status != EXIT_SUCCESS)
365 fprintf (stderr, _("Try `%s --help' for more information.\n"),
366 program_name);
367 else
369 printf (_("\
370 Usage: %s [OPTION]... [FILE]...\n\
371 or: %s [OPTION]... --files0-from=F\n\
373 program_name, program_name);
374 fputs (_("\
375 Write sorted concatenation of all FILE(s) to standard output.\n\
377 "), stdout);
378 fputs (_("\
379 Mandatory arguments to long options are mandatory for short options too.\n\
380 "), stdout);
381 fputs (_("\
382 Ordering options:\n\
384 "), stdout);
385 fputs (_("\
386 -b, --ignore-leading-blanks ignore leading blanks\n\
387 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
388 -f, --ignore-case fold lower case to upper case characters\n\
389 "), stdout);
390 fputs (_("\
391 -g, --general-numeric-sort compare according to general numerical value\n\
392 -i, --ignore-nonprinting consider only printable characters\n\
393 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
394 "), stdout);
395 fputs (_("\
396 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
397 "), stdout);
398 fputs (_("\
399 -n, --numeric-sort compare according to string numerical value\n\
400 -R, --random-sort sort by random hash of keys\n\
401 --random-source=FILE get random bytes from FILE\n\
402 -r, --reverse reverse the result of comparisons\n\
403 "), stdout);
404 fputs (_("\
405 --sort=WORD sort according to WORD:\n\
406 general-numeric -g, human-numeric -h, month -M,\n\
407 numeric -n, random -R, version -V\n\
408 -V, --version-sort natural sort of (version) numbers within text\n\
410 "), stdout);
411 fputs (_("\
412 Other options:\n\
414 "), stdout);
415 fputs (_("\
416 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
417 for more use temp files\n\
418 "), stdout);
419 fputs (_("\
420 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
421 -C, --check=quiet, --check=silent like -c, but do not report first bad line\n\
422 --compress-program=PROG compress temporaries with PROG;\n\
423 decompress them with PROG -d\n\
424 "), stdout);
425 fputs (_("\
426 --debug annotate the part of the line used to sort,\n\
427 and warn about questionable usage to stderr\n\
428 --files0-from=F read input from the files specified by\n\
429 NUL-terminated names in file F;\n\
430 If F is - then read names from standard input\n\
431 "), stdout);
432 fputs (_("\
433 -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n\
434 (default end of line). See POS syntax below\n\
435 -m, --merge merge already sorted files; do not sort\n\
436 "), stdout);
437 fputs (_("\
438 -o, --output=FILE write result to FILE instead of standard output\n\
439 -s, --stable stabilize sort by disabling last-resort comparison\n\
440 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
441 "), stdout);
442 printf (_("\
443 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
444 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
445 multiple options specify multiple directories\n\
446 --parallel=N limit the number of sorts run concurrently to N\n\
447 -u, --unique with -c, check for strict ordering;\n\
448 without -c, output only the first of an equal run\n\
449 "), DEFAULT_TMPDIR);
450 fputs (_("\
451 -z, --zero-terminated end lines with 0 byte, not newline\n\
452 "), stdout);
453 fputs (HELP_OPTION_DESCRIPTION, stdout);
454 fputs (VERSION_OPTION_DESCRIPTION, stdout);
455 fputs (_("\
457 POS is F[.C][OPTS], where F is the field number and C the character position\n\
458 in the field; both are origin 1. If neither -t nor -b is in effect, characters\n\
459 in a field are counted from the beginning of the preceding whitespace. OPTS is\n\
460 one or more single-letter ordering options, which override global ordering\n\
461 options for that key. If no key is given, use the entire line as the key.\n\
463 SIZE may be followed by the following multiplicative suffixes:\n\
464 "), stdout);
465 fputs (_("\
466 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
468 With no FILE, or when FILE is -, read standard input.\n\
470 *** WARNING ***\n\
471 The locale specified by the environment affects sort order.\n\
472 Set LC_ALL=C to get the traditional sort order that uses\n\
473 native byte values.\n\
474 "), stdout );
475 emit_ancillary_info ();
478 exit (status);
481 /* For long options that have no equivalent short option, use a
482 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
483 enum
485 CHECK_OPTION = CHAR_MAX + 1,
486 COMPRESS_PROGRAM_OPTION,
487 DEBUG_PROGRAM_OPTION,
488 FILES0_FROM_OPTION,
489 NMERGE_OPTION,
490 RANDOM_SOURCE_OPTION,
491 SORT_OPTION,
492 PARALLEL_OPTION
495 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
497 static struct option const long_options[] =
499 {"ignore-leading-blanks", no_argument, NULL, 'b'},
500 {"check", optional_argument, NULL, CHECK_OPTION},
501 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
502 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
503 {"dictionary-order", no_argument, NULL, 'd'},
504 {"ignore-case", no_argument, NULL, 'f'},
505 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
506 {"general-numeric-sort", no_argument, NULL, 'g'},
507 {"ignore-nonprinting", no_argument, NULL, 'i'},
508 {"key", required_argument, NULL, 'k'},
509 {"merge", no_argument, NULL, 'm'},
510 {"month-sort", no_argument, NULL, 'M'},
511 {"numeric-sort", no_argument, NULL, 'n'},
512 {"human-numeric-sort", no_argument, NULL, 'h'},
513 {"version-sort", no_argument, NULL, 'V'},
514 {"random-sort", no_argument, NULL, 'R'},
515 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
516 {"sort", required_argument, NULL, SORT_OPTION},
517 {"output", required_argument, NULL, 'o'},
518 {"reverse", no_argument, NULL, 'r'},
519 {"stable", no_argument, NULL, 's'},
520 {"batch-size", required_argument, NULL, NMERGE_OPTION},
521 {"buffer-size", required_argument, NULL, 'S'},
522 {"field-separator", required_argument, NULL, 't'},
523 {"temporary-directory", required_argument, NULL, 'T'},
524 {"unique", no_argument, NULL, 'u'},
525 {"zero-terminated", no_argument, NULL, 'z'},
526 {"parallel", required_argument, NULL, PARALLEL_OPTION},
527 {GETOPT_HELP_OPTION_DECL},
528 {GETOPT_VERSION_OPTION_DECL},
529 {NULL, 0, NULL, 0},
532 #define CHECK_TABLE \
533 _ct_("quiet", 'C') \
534 _ct_("silent", 'C') \
535 _ct_("diagnose-first", 'c')
537 static char const *const check_args[] =
539 #define _ct_(_s, _c) _s,
540 CHECK_TABLE NULL
541 #undef _ct_
543 static char const check_types[] =
545 #define _ct_(_s, _c) _c,
546 CHECK_TABLE
547 #undef _ct_
550 #define SORT_TABLE \
551 _st_("general-numeric", 'g') \
552 _st_("human-numeric", 'h') \
553 _st_("month", 'M') \
554 _st_("numeric", 'n') \
555 _st_("random", 'R') \
556 _st_("version", 'V')
558 static char const *const sort_args[] =
560 #define _st_(_s, _c) _s,
561 SORT_TABLE NULL
562 #undef _st_
564 static char const sort_types[] =
566 #define _st_(_s, _c) _c,
567 SORT_TABLE
568 #undef _st_
571 /* The set of signals that are caught. */
572 static sigset_t caught_signals;
574 /* Critical section status. */
575 struct cs_status
577 bool valid;
578 sigset_t sigs;
581 /* Enter a critical section. */
582 static struct cs_status
583 cs_enter (void)
585 struct cs_status status;
586 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
587 return status;
590 /* Leave a critical section. */
591 static void
592 cs_leave (struct cs_status status)
594 if (status.valid)
596 /* Ignore failure when restoring the signal mask. */
597 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
601 /* The list of temporary files. */
602 struct tempnode
604 struct tempnode *volatile next;
605 pid_t pid; /* If compressed, the pid of compressor, else zero */
606 char name[1]; /* Actual size is 1 + file name length. */
608 static struct tempnode *volatile temphead;
609 static struct tempnode *volatile *temptail = &temphead;
611 struct sortfile
613 char const *name;
614 pid_t pid; /* If compressed, the pid of compressor, else zero */
617 /* A table where we store compression process states. We clean up all
618 processes in a timely manner so as not to exhaust system resources,
619 so we store the info on whether the process is still running, or has
620 been reaped here. */
621 static Hash_table *proctab;
623 enum { INIT_PROCTAB_SIZE = 47 };
625 enum procstate { ALIVE, ZOMBIE };
627 /* A proctab entry. The COUNT field is there in case we fork a new
628 compression process that has the same PID as an old zombie process
629 that is still in the table (because the process to decompress the
630 temp file it was associated with hasn't started yet). */
631 struct procnode
633 pid_t pid;
634 enum procstate state;
635 size_t count;
638 static size_t
639 proctab_hasher (void const *entry, size_t tabsize)
641 struct procnode const *node = entry;
642 return node->pid % tabsize;
645 static bool
646 proctab_comparator (void const *e1, void const *e2)
648 struct procnode const *n1 = e1, *n2 = e2;
649 return n1->pid == n2->pid;
652 /* The total number of forked processes (compressors and decompressors)
653 that have not been reaped yet. */
654 static size_t nprocs;
656 /* The number of child processes we'll allow before we try to reap some. */
657 enum { MAX_PROCS_BEFORE_REAP = 2 };
659 /* If 0 < PID, wait for the child process with that PID to exit.
660 If PID is -1, clean up a random child process which has finished and
661 return the process ID of that child. If PID is -1 and no processes
662 have quit yet, return 0 without waiting. */
664 static pid_t
665 reap (pid_t pid)
667 int status;
668 pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
670 if (cpid < 0)
671 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
672 compress_program);
673 else if (0 < cpid)
675 if (! WIFEXITED (status) || WEXITSTATUS (status))
676 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
677 compress_program);
678 --nprocs;
681 return cpid;
684 /* Add the PID of a running compression process to proctab, or update
685 the entry COUNT and STATE fields if it's already there. This also
686 creates the table for us the first time it's called. */
688 static void
689 register_proc (pid_t pid)
691 struct procnode test, *node;
693 if (! proctab)
695 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
696 proctab_hasher,
697 proctab_comparator,
698 free);
699 if (! proctab)
700 xalloc_die ();
703 test.pid = pid;
704 node = hash_lookup (proctab, &test);
705 if (node)
707 node->state = ALIVE;
708 ++node->count;
710 else
712 node = xmalloc (sizeof *node);
713 node->pid = pid;
714 node->state = ALIVE;
715 node->count = 1;
716 if (hash_insert (proctab, node) == NULL)
717 xalloc_die ();
721 /* This is called when we reap a random process. We don't know
722 whether we have reaped a compression process or a decompression
723 process until we look in the table. If there's an ALIVE entry for
724 it, then we have reaped a compression process, so change the state
725 to ZOMBIE. Otherwise, it's a decompression processes, so ignore it. */
727 static void
728 update_proc (pid_t pid)
730 struct procnode test, *node;
732 test.pid = pid;
733 node = hash_lookup (proctab, &test);
734 if (node)
735 node->state = ZOMBIE;
738 /* This is for when we need to wait for a compression process to exit.
739 If it has a ZOMBIE entry in the table then it's already dead and has
740 been reaped. Note that if there's an ALIVE entry for it, it still may
741 already have died and been reaped if a second process was created with
742 the same PID. This is probably exceedingly rare, but to be on the safe
743 side we will have to wait for any compression process with this PID. */
745 static void
746 wait_proc (pid_t pid)
748 struct procnode test, *node;
750 test.pid = pid;
751 node = hash_lookup (proctab, &test);
752 if (node->state == ALIVE)
753 reap (pid);
755 node->state = ZOMBIE;
756 if (! --node->count)
758 hash_delete (proctab, node);
759 free (node);
763 /* Keep reaping finished children as long as there are more to reap.
764 This doesn't block waiting for any of them, it only reaps those
765 that are already dead. */
767 static void
768 reap_some (void)
770 pid_t pid;
772 while (0 < nprocs && (pid = reap (-1)))
773 update_proc (pid);
776 /* Clean up any remaining temporary files. */
778 static void
779 cleanup (void)
781 struct tempnode const *node;
783 for (node = temphead; node; node = node->next)
784 unlink (node->name);
785 temphead = NULL;
788 /* Cleanup actions to take when exiting. */
790 static void
791 exit_cleanup (void)
793 if (temphead)
795 /* Clean up any remaining temporary files in a critical section so
796 that a signal handler does not try to clean them too. */
797 struct cs_status cs = cs_enter ();
798 cleanup ();
799 cs_leave (cs);
802 close_stdout ();
805 /* Create a new temporary file, returning its newly allocated tempnode.
806 Store into *PFD the file descriptor open for writing.
807 If the creation fails, return NULL and store -1 into *PFD if the
808 failure is due to file descriptor exhaustion and
809 SURVIVE_FD_EXHAUSTION; otherwise, die. */
811 static struct tempnode *
812 create_temp_file (int *pfd, bool survive_fd_exhaustion)
814 static char const slashbase[] = "/sortXXXXXX";
815 static size_t temp_dir_index;
816 int fd;
817 int saved_errno;
818 char const *temp_dir = temp_dirs[temp_dir_index];
819 size_t len = strlen (temp_dir);
820 struct tempnode *node =
821 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
822 char *file = node->name;
823 struct cs_status cs;
825 memcpy (file, temp_dir, len);
826 memcpy (file + len, slashbase, sizeof slashbase);
827 node->next = NULL;
828 node->pid = 0;
829 if (++temp_dir_index == temp_dir_count)
830 temp_dir_index = 0;
832 /* Create the temporary file in a critical section, to avoid races. */
833 cs = cs_enter ();
834 fd = mkstemp (file);
835 if (0 <= fd)
837 *temptail = node;
838 temptail = &node->next;
840 saved_errno = errno;
841 cs_leave (cs);
842 errno = saved_errno;
844 if (fd < 0)
846 if (! (survive_fd_exhaustion && errno == EMFILE))
847 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
848 quote (temp_dir));
849 free (node);
850 node = NULL;
853 *pfd = fd;
854 return node;
857 /* Return a stream for FILE, opened with mode HOW. A null FILE means
858 standard output; HOW should be "w". When opening for input, "-"
859 means standard input. To avoid confusion, do not return file
860 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
861 opening an ordinary FILE. Return NULL if unsuccessful.
863 fadvise() is used to specify an access pattern for input files.
864 There are a few hints we could possibly provide,
865 and after careful testing it was decided that
866 specifying POSIX_FADV_SEQUENTIAL was not detrimental
867 to any cases. On Linux 2.6.31, this option doubles
868 the size of read ahead performed and thus was seen to
869 benefit these cases:
870 Merging
871 Sorting with a smaller internal buffer
872 Reading from faster flash devices
874 In _addition_ one could also specify other hints...
876 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
877 at least uses that to _synchronously_ prepopulate the cache
878 with the specified range. While sort does need to
879 read all of its input before outputting, a synchronous
880 read of the whole file up front precludes any processing
881 that sort could do in parallel with the system doing
882 read ahead of the data. This was seen to have negative effects
883 in a couple of cases:
884 Merging
885 Sorting with a smaller internal buffer
886 Note this option was seen to shorten the runtime for sort
887 on a multicore system with lots of RAM and other processes
888 competing for CPU. It could be argued that more explicit
889 scheduling hints with `nice` et. al. are more appropriate
890 for this situation.
892 POSIX_FADV_NOREUSE is a possibility as it could lower
893 the priority of input data in the cache as sort will
894 only need to process it once. However its functionality
895 has changed over Linux kernel versions and as of 2.6.31
896 it does nothing and thus we can't depend on what it might
897 do in future.
899 POSIX_FADV_DONTNEED is not appropriate for user specified
900 input files, but for temp files we do want to drop the
901 cache immediately after processing. This is done implicitly
902 however when the files are unlinked. */
904 static FILE *
905 stream_open (char const *file, char const *how)
907 if (!file)
908 return stdout;
909 if (*how == 'r')
911 FILE *fp;
912 if (STREQ (file, "-"))
914 have_read_stdin = true;
915 fp = stdin;
917 else
918 fp = fopen (file, how);
919 fadvise (fp, FADVISE_SEQUENTIAL);
920 return fp;
922 return fopen (file, how);
925 /* Same as stream_open, except always return a non-null value; die on
926 failure. */
928 static FILE *
929 xfopen (char const *file, char const *how)
931 FILE *fp = stream_open (file, how);
932 if (!fp)
933 die (_("open failed"), file);
934 return fp;
937 /* Close FP, whose name is FILE, and report any errors. */
939 static void
940 xfclose (FILE *fp, char const *file)
942 switch (fileno (fp))
944 case STDIN_FILENO:
945 /* Allow reading stdin from tty more than once. */
946 if (feof (fp))
947 clearerr (fp);
948 break;
950 case STDOUT_FILENO:
951 /* Don't close stdout just yet. close_stdout does that. */
952 if (fflush (fp) != 0)
953 die (_("fflush failed"), file);
954 break;
956 default:
957 if (fclose (fp) != 0)
958 die (_("close failed"), file);
959 break;
963 static void
964 dup2_or_die (int oldfd, int newfd)
966 if (dup2 (oldfd, newfd) < 0)
967 error (SORT_FAILURE, errno, _("dup2 failed"));
970 /* Fork a child process for piping to and do common cleanup. The
971 TRIES parameter tells us how many times to try to fork before
972 giving up. Return the PID of the child, or -1 (setting errno)
973 on failure. */
975 static pid_t
976 pipe_fork (int pipefds[2], size_t tries)
978 #if HAVE_WORKING_FORK
979 struct tempnode *saved_temphead;
980 int saved_errno;
981 double wait_retry = 0.25;
982 pid_t pid IF_LINT ( = -1);
983 struct cs_status cs;
985 if (pipe (pipefds) < 0)
986 return -1;
988 while (tries--)
990 /* This is so the child process won't delete our temp files
991 if it receives a signal before exec-ing. */
992 cs = cs_enter ();
993 saved_temphead = temphead;
994 temphead = NULL;
996 pid = fork ();
997 saved_errno = errno;
998 if (pid)
999 temphead = saved_temphead;
1001 cs_leave (cs);
1002 errno = saved_errno;
1004 if (0 <= pid || errno != EAGAIN)
1005 break;
1006 else
1008 xnanosleep (wait_retry);
1009 wait_retry *= 2;
1010 reap_some ();
1014 if (pid < 0)
1016 saved_errno = errno;
1017 close (pipefds[0]);
1018 close (pipefds[1]);
1019 errno = saved_errno;
1021 else if (pid == 0)
1023 close (STDIN_FILENO);
1024 close (STDOUT_FILENO);
1026 else
1027 ++nprocs;
1029 return pid;
1031 #else /* ! HAVE_WORKING_FORK */
1032 return -1;
1033 #endif
1036 /* Create a temporary file and start a compression program to filter output
1037 to that file. Set *PFP to the file handle and if PPID is non-NULL,
1038 set *PPID to the PID of the newly-created process. If the creation
1039 fails, return NULL if the failure is due to file descriptor
1040 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1042 static char *
1043 maybe_create_temp (FILE **pfp, pid_t *ppid, bool survive_fd_exhaustion)
1045 int tempfd;
1046 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1047 char *name;
1049 if (! node)
1050 return NULL;
1052 name = node->name;
1054 if (compress_program)
1056 int pipefds[2];
1058 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1059 if (0 < node->pid)
1061 close (tempfd);
1062 close (pipefds[0]);
1063 tempfd = pipefds[1];
1065 register_proc (node->pid);
1067 else if (node->pid == 0)
1069 close (pipefds[1]);
1070 dup2_or_die (tempfd, STDOUT_FILENO);
1071 close (tempfd);
1072 dup2_or_die (pipefds[0], STDIN_FILENO);
1073 close (pipefds[0]);
1075 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1076 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1077 compress_program);
1079 else
1080 node->pid = 0;
1083 *pfp = fdopen (tempfd, "w");
1084 if (! *pfp)
1085 die (_("couldn't create temporary file"), name);
1087 if (ppid)
1088 *ppid = node->pid;
1090 return name;
1093 /* Create a temporary file and start a compression program to filter output
1094 to that file. Set *PFP to the file handle and if *PPID is non-NULL,
1095 set it to the PID of the newly-created process. Die on failure. */
1097 static char *
1098 create_temp (FILE **pfp, pid_t *ppid)
1100 return maybe_create_temp (pfp, ppid, false);
1103 /* Open a compressed temp file and start a decompression process through
1104 which to filter the input. PID must be the valid processes ID of the
1105 process used to compress the file. Return NULL (setting errno to
1106 EMFILE) if we ran out of file descriptors, and die on any other
1107 kind of failure. */
1109 static FILE *
1110 open_temp (char const *name, pid_t pid)
1112 int tempfd, pipefds[2];
1113 FILE *fp = NULL;
1115 wait_proc (pid);
1117 tempfd = open (name, O_RDONLY);
1118 if (tempfd < 0)
1119 return NULL;
1121 switch (pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS))
1123 case -1:
1124 if (errno != EMFILE)
1125 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1126 compress_program);
1127 close (tempfd);
1128 errno = EMFILE;
1129 break;
1131 case 0:
1132 close (pipefds[0]);
1133 dup2_or_die (tempfd, STDIN_FILENO);
1134 close (tempfd);
1135 dup2_or_die (pipefds[1], STDOUT_FILENO);
1136 close (pipefds[1]);
1138 execlp (compress_program, compress_program, "-d", (char *) NULL);
1139 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1140 compress_program);
1142 default:
1143 close (tempfd);
1144 close (pipefds[1]);
1146 fp = fdopen (pipefds[0], "r");
1147 if (! fp)
1149 int saved_errno = errno;
1150 close (pipefds[0]);
1151 errno = saved_errno;
1153 break;
1156 return fp;
1159 /* Append DIR to the array of temporary directory names. */
1160 static void
1161 add_temp_dir (char const *dir)
1163 if (temp_dir_count == temp_dir_alloc)
1164 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1166 temp_dirs[temp_dir_count++] = dir;
1169 /* Remove NAME from the list of temporary files. */
1171 static void
1172 zaptemp (char const *name)
1174 struct tempnode *volatile *pnode;
1175 struct tempnode *node;
1176 struct tempnode *next;
1177 int unlink_status;
1178 int unlink_errno = 0;
1179 struct cs_status cs;
1181 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1182 continue;
1184 /* Unlink the temporary file in a critical section to avoid races. */
1185 next = node->next;
1186 cs = cs_enter ();
1187 unlink_status = unlink (name);
1188 unlink_errno = errno;
1189 *pnode = next;
1190 cs_leave (cs);
1192 if (unlink_status != 0)
1193 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1194 if (! next)
1195 temptail = pnode;
1196 free (node);
1199 #if HAVE_NL_LANGINFO
1201 static int
1202 struct_month_cmp (void const *m1, void const *m2)
1204 struct month const *month1 = m1;
1205 struct month const *month2 = m2;
1206 return strcmp (month1->name, month2->name);
1209 #endif
1211 /* Initialize the character class tables. */
1213 static void
1214 inittables (void)
1216 size_t i;
1218 for (i = 0; i < UCHAR_LIM; ++i)
1220 blanks[i] = !! isblank (i);
1221 nonprinting[i] = ! isprint (i);
1222 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1223 fold_toupper[i] = toupper (i);
1226 #if HAVE_NL_LANGINFO
1227 /* If we're not in the "C" locale, read different names for months. */
1228 if (hard_LC_TIME)
1230 for (i = 0; i < MONTHS_PER_YEAR; i++)
1232 char const *s;
1233 size_t s_len;
1234 size_t j, k;
1235 char *name;
1237 s = nl_langinfo (ABMON_1 + i);
1238 s_len = strlen (s);
1239 monthtab[i].name = name = xmalloc (s_len + 1);
1240 monthtab[i].val = i + 1;
1242 for (j = k = 0; j < s_len; j++)
1243 if (! isblank (to_uchar (s[j])))
1244 name[k++] = fold_toupper[to_uchar (s[j])];
1245 name[k] = '\0';
1247 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1249 #endif
1252 /* Specify how many inputs may be merged at once.
1253 This may be set on the command-line with the
1254 --batch-size option. */
1255 static void
1256 specify_nmerge (int oi, char c, char const *s)
1258 uintmax_t n;
1259 struct rlimit rlimit;
1260 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1262 /* Try to find out how many file descriptors we'll be able
1263 to open. We need at least nmerge + 3 (STDIN_FILENO,
1264 STDOUT_FILENO and STDERR_FILENO). */
1265 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1266 ? rlimit.rlim_cur
1267 : OPEN_MAX)
1268 - 3);
1270 if (e == LONGINT_OK)
1272 nmerge = n;
1273 if (nmerge != n)
1274 e = LONGINT_OVERFLOW;
1275 else
1277 if (nmerge < 2)
1279 error (0, 0, _("invalid --%s argument %s"),
1280 long_options[oi].name, quote (s));
1281 error (SORT_FAILURE, 0,
1282 _("minimum --%s argument is %s"),
1283 long_options[oi].name, quote ("2"));
1285 else if (max_nmerge < nmerge)
1287 e = LONGINT_OVERFLOW;
1289 else
1290 return;
1294 if (e == LONGINT_OVERFLOW)
1296 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1297 error (0, 0, _("--%s argument %s too large"),
1298 long_options[oi].name, quote (s));
1299 error (SORT_FAILURE, 0,
1300 _("maximum --%s argument with current rlimit is %s"),
1301 long_options[oi].name,
1302 uinttostr (max_nmerge, max_nmerge_buf));
1304 else
1305 xstrtol_fatal (e, oi, c, long_options, s);
1308 /* Specify the amount of main memory to use when sorting. */
1309 static void
1310 specify_sort_size (int oi, char c, char const *s)
1312 uintmax_t n;
1313 char *suffix;
1314 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1316 /* The default unit is KiB. */
1317 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1319 if (n <= UINTMAX_MAX / 1024)
1320 n *= 1024;
1321 else
1322 e = LONGINT_OVERFLOW;
1325 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1326 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1327 switch (suffix[0])
1329 case 'b':
1330 e = LONGINT_OK;
1331 break;
1333 case '%':
1335 double mem = physmem_total () * n / 100;
1337 /* Use "<", not "<=", to avoid problems with rounding. */
1338 if (mem < UINTMAX_MAX)
1340 n = mem;
1341 e = LONGINT_OK;
1343 else
1344 e = LONGINT_OVERFLOW;
1346 break;
1349 if (e == LONGINT_OK)
1351 /* If multiple sort sizes are specified, take the maximum, so
1352 that option order does not matter. */
1353 if (n < sort_size)
1354 return;
1356 sort_size = n;
1357 if (sort_size == n)
1359 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1360 return;
1363 e = LONGINT_OVERFLOW;
1366 xstrtol_fatal (e, oi, c, long_options, s);
1369 /* Specify the number of threads to spawn during internal sort. */
1370 static unsigned long int
1371 specify_nthreads (int oi, char c, char const *s)
1373 unsigned long int nthreads;
1374 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1375 if (e == LONGINT_OVERFLOW)
1376 return ULONG_MAX;
1377 if (e != LONGINT_OK)
1378 xstrtol_fatal (e, oi, c, long_options, s);
1379 if (nthreads == 0)
1380 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1381 return nthreads;
1385 /* Return the default sort size. */
1386 static size_t
1387 default_sort_size (void)
1389 /* Let MEM be available memory or 1/8 of total memory, whichever
1390 is greater. */
1391 double avail = physmem_available ();
1392 double total = physmem_total ();
1393 double mem = MAX (avail, total / 8);
1394 struct rlimit rlimit;
1396 /* Let SIZE be MEM, but no more than the maximum object size or
1397 system resource limits. Avoid the MIN macro here, as it is not
1398 quite right when only one argument is floating point. Don't
1399 bother to check for values like RLIM_INFINITY since in practice
1400 they are not much less than SIZE_MAX. */
1401 size_t size = SIZE_MAX;
1402 if (mem < size)
1403 size = mem;
1404 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1405 size = rlimit.rlim_cur;
1406 #ifdef RLIMIT_AS
1407 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1408 size = rlimit.rlim_cur;
1409 #endif
1411 /* Leave a large safety margin for the above limits, as failure can
1412 occur when they are exceeded. */
1413 size /= 2;
1415 #ifdef RLIMIT_RSS
1416 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1417 Exceeding RSS is not fatal, but can be quite slow. */
1418 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1419 size = rlimit.rlim_cur / 16 * 15;
1420 #endif
1422 /* Use no less than the minimum. */
1423 return MAX (size, MIN_SORT_SIZE);
1426 /* Return the sort buffer size to use with the input files identified
1427 by FPS and FILES, which are alternate names of the same files.
1428 NFILES gives the number of input files; NFPS may be less. Assume
1429 that each input line requires LINE_BYTES extra bytes' worth of line
1430 information. Do not exceed the size bound specified by the user
1431 (or a default size bound, if the user does not specify one). */
1433 static size_t
1434 sort_buffer_size (FILE *const *fps, size_t nfps,
1435 char *const *files, size_t nfiles,
1436 size_t line_bytes)
1438 /* A bound on the input size. If zero, the bound hasn't been
1439 determined yet. */
1440 static size_t size_bound;
1442 /* In the worst case, each input byte is a newline. */
1443 size_t worst_case_per_input_byte = line_bytes + 1;
1445 /* Keep enough room for one extra input line and an extra byte.
1446 This extra room might be needed when preparing to read EOF. */
1447 size_t size = worst_case_per_input_byte + 1;
1449 size_t i;
1451 for (i = 0; i < nfiles; i++)
1453 struct stat st;
1454 off_t file_size;
1455 size_t worst_case;
1457 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1458 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1459 : stat (files[i], &st))
1460 != 0)
1461 die (_("stat failed"), files[i]);
1463 if (S_ISREG (st.st_mode))
1464 file_size = st.st_size;
1465 else
1467 /* The file has unknown size. If the user specified a sort
1468 buffer size, use that; otherwise, guess the size. */
1469 if (sort_size)
1470 return sort_size;
1471 file_size = INPUT_FILE_SIZE_GUESS;
1474 if (! size_bound)
1476 size_bound = sort_size;
1477 if (! size_bound)
1478 size_bound = default_sort_size ();
1481 /* Add the amount of memory needed to represent the worst case
1482 where the input consists entirely of newlines followed by a
1483 single non-newline. Check for overflow. */
1484 worst_case = file_size * worst_case_per_input_byte + 1;
1485 if (file_size != worst_case / worst_case_per_input_byte
1486 || size_bound - size <= worst_case)
1487 return size_bound;
1488 size += worst_case;
1491 return size;
1494 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1495 must be at least sizeof (struct line). Allocate ALLOC bytes
1496 initially. */
1498 static void
1499 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1501 /* Ensure that the line array is properly aligned. If the desired
1502 size cannot be allocated, repeatedly halve it until allocation
1503 succeeds. The smaller allocation may hurt overall performance,
1504 but that's better than failing. */
1505 while (true)
1507 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1508 buf->buf = malloc (alloc);
1509 if (buf->buf)
1510 break;
1511 alloc /= 2;
1512 if (alloc <= line_bytes + 1)
1513 xalloc_die ();
1516 buf->line_bytes = line_bytes;
1517 buf->alloc = alloc;
1518 buf->used = buf->left = buf->nlines = 0;
1519 buf->eof = false;
1522 /* Return one past the limit of the line array. */
1524 static inline struct line *
1525 buffer_linelim (struct buffer const *buf)
1527 return (struct line *) (buf->buf + buf->alloc);
1530 /* Return a pointer to the first character of the field specified
1531 by KEY in LINE. */
1533 static char *
1534 begfield (struct line const *line, struct keyfield const *key)
1536 char *ptr = line->text, *lim = ptr + line->length - 1;
1537 size_t sword = key->sword;
1538 size_t schar = key->schar;
1540 /* The leading field separator itself is included in a field when -t
1541 is absent. */
1543 if (tab != TAB_DEFAULT)
1544 while (ptr < lim && sword--)
1546 while (ptr < lim && *ptr != tab)
1547 ++ptr;
1548 if (ptr < lim)
1549 ++ptr;
1551 else
1552 while (ptr < lim && sword--)
1554 while (ptr < lim && blanks[to_uchar (*ptr)])
1555 ++ptr;
1556 while (ptr < lim && !blanks[to_uchar (*ptr)])
1557 ++ptr;
1560 /* If we're ignoring leading blanks when computing the Start
1561 of the field, skip past them here. */
1562 if (key->skipsblanks)
1563 while (ptr < lim && blanks[to_uchar (*ptr)])
1564 ++ptr;
1566 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1567 ptr = MIN (lim, ptr + schar);
1569 return ptr;
1572 /* Return the limit of (a pointer to the first character after) the field
1573 in LINE specified by KEY. */
1575 static char *
1576 limfield (struct line const *line, struct keyfield const *key)
1578 char *ptr = line->text, *lim = ptr + line->length - 1;
1579 size_t eword = key->eword, echar = key->echar;
1581 if (echar == 0)
1582 eword++; /* Skip all of end field. */
1584 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1585 whichever comes first. If there are more than EWORD fields, leave
1586 PTR pointing at the beginning of the field having zero-based index,
1587 EWORD. If a delimiter character was specified (via -t), then that
1588 `beginning' is the first character following the delimiting TAB.
1589 Otherwise, leave PTR pointing at the first `blank' character after
1590 the preceding field. */
1591 if (tab != TAB_DEFAULT)
1592 while (ptr < lim && eword--)
1594 while (ptr < lim && *ptr != tab)
1595 ++ptr;
1596 if (ptr < lim && (eword || echar))
1597 ++ptr;
1599 else
1600 while (ptr < lim && eword--)
1602 while (ptr < lim && blanks[to_uchar (*ptr)])
1603 ++ptr;
1604 while (ptr < lim && !blanks[to_uchar (*ptr)])
1605 ++ptr;
1608 #ifdef POSIX_UNSPECIFIED
1609 /* The following block of code makes GNU sort incompatible with
1610 standard Unix sort, so it's ifdef'd out for now.
1611 The POSIX spec isn't clear on how to interpret this.
1612 FIXME: request clarification.
1614 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1615 Date: Thu, 30 May 96 12:20:41 -0400
1616 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1618 [...]I believe I've found another bug in `sort'.
1620 $ cat /tmp/sort.in
1621 a b c 2 d
1622 pq rs 1 t
1623 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1624 a b c 2 d
1625 pq rs 1 t
1626 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1627 pq rs 1 t
1628 a b c 2 d
1630 Unix sort produced the answer I expected: sort on the single character
1631 in column 7. GNU sort produced different results, because it disagrees
1632 on the interpretation of the key-end spec "M.N". Unix sort reads this
1633 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1634 "skip M-1 fields, then either N-1 characters or the rest of the current
1635 field, whichever comes first". This extra clause applies only to
1636 key-ends, not key-starts.
1639 /* Make LIM point to the end of (one byte past) the current field. */
1640 if (tab != TAB_DEFAULT)
1642 char *newlim;
1643 newlim = memchr (ptr, tab, lim - ptr);
1644 if (newlim)
1645 lim = newlim;
1647 else
1649 char *newlim;
1650 newlim = ptr;
1651 while (newlim < lim && blanks[to_uchar (*newlim)])
1652 ++newlim;
1653 while (newlim < lim && !blanks[to_uchar (*newlim)])
1654 ++newlim;
1655 lim = newlim;
1657 #endif
1659 if (echar != 0) /* We need to skip over a portion of the end field. */
1661 /* If we're ignoring leading blanks when computing the End
1662 of the field, skip past them here. */
1663 if (key->skipeblanks)
1664 while (ptr < lim && blanks[to_uchar (*ptr)])
1665 ++ptr;
1667 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1668 ptr = MIN (lim, ptr + echar);
1671 return ptr;
1674 /* Fill BUF reading from FP, moving buf->left bytes from the end
1675 of buf->buf to the beginning first. If EOF is reached and the
1676 file wasn't terminated by a newline, supply one. Set up BUF's line
1677 table too. FILE is the name of the file corresponding to FP.
1678 Return true if some input was read. */
1680 static bool
1681 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1683 struct keyfield const *key = keylist;
1684 char eol = eolchar;
1685 size_t line_bytes = buf->line_bytes;
1686 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1688 if (buf->eof)
1689 return false;
1691 if (buf->used != buf->left)
1693 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1694 buf->used = buf->left;
1695 buf->nlines = 0;
1698 while (true)
1700 char *ptr = buf->buf + buf->used;
1701 struct line *linelim = buffer_linelim (buf);
1702 struct line *line = linelim - buf->nlines;
1703 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1704 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1706 while (line_bytes + 1 < avail)
1708 /* Read as many bytes as possible, but do not read so many
1709 bytes that there might not be enough room for the
1710 corresponding line array. The worst case is when the
1711 rest of the input file consists entirely of newlines,
1712 except that the last byte is not a newline. */
1713 size_t readsize = (avail - 1) / (line_bytes + 1);
1714 size_t bytes_read = fread (ptr, 1, readsize, fp);
1715 char *ptrlim = ptr + bytes_read;
1716 char *p;
1717 avail -= bytes_read;
1719 if (bytes_read != readsize)
1721 if (ferror (fp))
1722 die (_("read failed"), file);
1723 if (feof (fp))
1725 buf->eof = true;
1726 if (buf->buf == ptrlim)
1727 return false;
1728 if (line_start != ptrlim && ptrlim[-1] != eol)
1729 *ptrlim++ = eol;
1733 /* Find and record each line in the just-read input. */
1734 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1736 /* Delimit the line with NUL. This eliminates the need to
1737 temporarily replace the last byte with NUL when calling
1738 xmemcoll(), which increases performance. */
1739 *p = '\0';
1740 ptr = p + 1;
1741 line--;
1742 line->text = line_start;
1743 line->length = ptr - line_start;
1744 mergesize = MAX (mergesize, line->length);
1745 avail -= line_bytes;
1747 if (key)
1749 /* Precompute the position of the first key for
1750 efficiency. */
1751 line->keylim = (key->eword == SIZE_MAX
1753 : limfield (line, key));
1755 if (key->sword != SIZE_MAX)
1756 line->keybeg = begfield (line, key);
1757 else
1759 if (key->skipsblanks)
1760 while (blanks[to_uchar (*line_start)])
1761 line_start++;
1762 line->keybeg = line_start;
1766 line_start = ptr;
1769 ptr = ptrlim;
1770 if (buf->eof)
1771 break;
1774 buf->used = ptr - buf->buf;
1775 buf->nlines = buffer_linelim (buf) - line;
1776 if (buf->nlines != 0)
1778 buf->left = ptr - line_start;
1779 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1780 return true;
1784 /* The current input line is too long to fit in the buffer.
1785 Double the buffer size and try again, keeping it properly
1786 aligned. */
1787 size_t line_alloc = buf->alloc / sizeof (struct line);
1788 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1789 buf->alloc = line_alloc * sizeof (struct line);
1794 /* Return an integer that represents the order of magnitude of the
1795 unit following the number. The number may contain thousands
1796 separators and a decimal point, but it may not contain leading blanks.
1797 Negative numbers get negative orders; zero numbers have a zero order.
1798 Store the address of the end of the number into *ENDPTR. */
1800 static int
1801 find_unit_order (char const *number, char **endptr)
1803 static char const orders[UCHAR_LIM] =
1805 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1806 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1807 /* This initializer syntax works on all C99 hosts. For now, use
1808 it only on non-ASCII hosts, to ease the pain of porting to
1809 pre-C99 ASCII hosts. */
1810 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1811 ['k']=1,
1812 #else
1813 /* Generate the following table with this command:
1814 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1815 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1816 |fmt */
1817 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1818 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1819 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1820 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1821 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1822 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1823 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1824 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1825 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1826 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1827 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1828 #endif
1831 bool minus_sign = (*number == '-');
1832 char const *p = number + minus_sign;
1833 int nonzero = 0;
1834 unsigned char ch;
1836 /* Scan to end of number.
1837 Decimals or separators not followed by digits stop the scan.
1838 Numbers ending in decimals or separators are thus considered
1839 to be lacking in units.
1840 FIXME: add support for multibyte thousands_sep and decimal_point. */
1844 while (ISDIGIT (ch = *p++))
1845 nonzero |= ch - '0';
1847 while (ch == thousands_sep);
1849 if (ch == decimal_point)
1850 while (ISDIGIT (ch = *p++))
1851 nonzero |= ch - '0';
1853 int order = (nonzero ? orders[ch] : 0);
1854 *endptr = (char *) p - !order;
1855 return (minus_sign ? -order : order);
1858 /* Compare numbers A and B ending in units with SI or IEC prefixes
1859 <none/unknown> < K/k < M < G < T < P < E < Z < Y
1860 This may temporarily modify the strings. Store into *EA the end
1861 of the string A. */
1863 static int
1864 human_numcompare (char *a, char *b, char **ea)
1866 char *endb;
1868 while (blanks[to_uchar (*a)])
1869 a++;
1870 while (blanks[to_uchar (*b)])
1871 b++;
1873 int diff = find_unit_order (a, ea) - find_unit_order (b, &endb);
1874 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1877 /* Compare strings A and B as numbers without explicitly converting them to
1878 machine numbers. Comparatively slow for short strings, but asymptotically
1879 hideously fast. */
1881 static int
1882 numcompare (char const *a, char const *b, char **ea)
1884 while (blanks[to_uchar (*a)])
1885 a++;
1886 while (blanks[to_uchar (*b)])
1887 b++;
1889 if (debug)
1891 /* Approximate strnumcmp extents with find_unit_order. */
1892 int order = find_unit_order (a, ea);
1893 *ea -= !!order;
1896 return strnumcmp (a, b, decimal_point, thousands_sep);
1899 static int
1900 general_numcompare (char const *sa, char const *sb, char **ea)
1902 /* FIXME: maybe add option to try expensive FP conversion
1903 only if A and B can't be compared more cheaply/accurately. */
1905 #if HAVE_C99_STRTOLD
1906 # define long_double long double
1907 #else
1908 # define long_double double
1909 # undef strtold
1910 # define strtold strtod
1911 #endif
1913 char *eb;
1914 long_double a = strtold (sa, ea);
1915 long_double b = strtold (sb, &eb);
1917 /* Put conversion errors at the start of the collating sequence. */
1918 if (sa == *ea)
1919 return sb == eb ? 0 : -1;
1920 if (sb == eb)
1921 return 1;
1923 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1924 conversion errors but before numbers; sort them by internal
1925 bit-pattern, for lack of a more portable alternative. */
1926 return (a < b ? -1
1927 : a > b ? 1
1928 : a == b ? 0
1929 : b == b ? -1
1930 : a == a ? 1
1931 : memcmp (&a, &b, sizeof a));
1934 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1935 Return 0 if the name in S is not recognized. */
1937 static int
1938 getmonth (char const *month, size_t len, char const **ea)
1940 size_t lo = 0;
1941 size_t hi = MONTHS_PER_YEAR;
1942 char const *monthlim = month + len;
1944 while (true)
1946 if (month == monthlim)
1947 return 0;
1948 if (!blanks[to_uchar (*month)])
1949 break;
1950 ++month;
1955 size_t ix = (lo + hi) / 2;
1956 char const *m = month;
1957 char const *n = monthtab[ix].name;
1959 for (;; m++, n++)
1961 if (!*n)
1963 if (ea)
1964 *ea = m;
1965 return monthtab[ix].val;
1967 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1969 hi = ix;
1970 break;
1972 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1974 lo = ix + 1;
1975 break;
1979 while (lo < hi);
1981 return 0;
1984 /* A randomly chosen MD5 state, used for random comparison. */
1985 static struct md5_ctx random_md5_state;
1987 /* Initialize the randomly chosen MD5 state. */
1989 static void
1990 random_md5_state_init (char const *random_source)
1992 unsigned char buf[MD5_DIGEST_SIZE];
1993 struct randread_source *r = randread_new (random_source, sizeof buf);
1994 if (! r)
1995 die (_("open failed"), random_source);
1996 randread (r, buf, sizeof buf);
1997 if (randread_free (r) != 0)
1998 die (_("close failed"), random_source);
1999 md5_init_ctx (&random_md5_state);
2000 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2003 /* This is like strxfrm, except it reports any error and exits. */
2005 static size_t
2006 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2008 errno = 0;
2009 size_t translated_size = strxfrm (dest, src, destsize);
2011 if (errno)
2013 error (0, errno, _("string transformation failed"));
2014 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2015 error (SORT_FAILURE, 0,
2016 _("the untransformed string was %s"),
2017 quotearg_n_style (0, locale_quoting_style, src));
2020 return translated_size;
2023 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2024 using one or more random hash functions. */
2026 static int
2027 compare_random (char *restrict texta, size_t lena,
2028 char *restrict textb, size_t lenb)
2030 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2031 data. This is used to break ties if there is an checksum
2032 collision, and this is good enough given the astronomically low
2033 probability of a collision. */
2034 int xfrm_diff = 0;
2036 char stackbuf[4000];
2037 char *buf = stackbuf;
2038 size_t bufsize = sizeof stackbuf;
2039 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2040 struct md5_ctx s[2];
2041 s[0] = s[1] = random_md5_state;
2043 if (hard_LC_COLLATE)
2045 /* Null-terminate the keys so that strxfrm knows where to stop. */
2046 char *lima = texta + lena; char enda = *lima; *lima = '\0';
2047 char *limb = textb + lenb; char endb = *limb; *limb = '\0';
2049 while (true)
2051 /* Transform the text into the basis of comparison, so that byte
2052 strings that would otherwise considered to be equal are
2053 considered equal here even if their bytes differ.
2055 Each time through this loop, transform one
2056 null-terminated string's worth from TEXTA or from TEXTB
2057 or both. That way, there's no need to store the
2058 transformation of the whole line, if it contains many
2059 null-terminated strings. */
2061 /* Store the transformed data into a big-enough buffer. */
2063 size_t sizea =
2064 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2065 bool a_fits = sizea <= bufsize;
2066 size_t sizeb =
2067 (textb < limb
2068 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2069 (a_fits ? bufsize - sizea : 0))
2070 + 1)
2071 : 0);
2073 if (! (a_fits && sizea + sizeb <= bufsize))
2075 bufsize = sizea + sizeb;
2076 if (bufsize < SIZE_MAX / 3)
2077 bufsize = bufsize * 3 / 2;
2078 buf = (buf == stackbuf
2079 ? xmalloc (bufsize)
2080 : xrealloc (buf, bufsize));
2081 if (texta < lima)
2082 strxfrm (buf, texta, sizea);
2083 if (textb < limb)
2084 strxfrm (buf + sizea, textb, sizeb);
2087 /* Advance past NULs to the next part of each input string,
2088 exiting the loop if both strings are exhausted. When
2089 exiting the loop, prepare to finish off the tiebreaker
2090 comparison properly. */
2091 if (texta < lima)
2092 texta += strlen (texta) + 1;
2093 if (textb < limb)
2094 textb += strlen (textb) + 1;
2095 if (! (texta < lima || textb < limb))
2097 lena = sizea; texta = buf;
2098 lenb = sizeb; textb = buf + sizea;
2099 break;
2102 /* Accumulate the transformed data in the corresponding
2103 checksums. */
2104 md5_process_bytes (buf, sizea, &s[0]);
2105 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2107 /* Update the tiebreaker comparison of the transformed data. */
2108 if (! xfrm_diff)
2110 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2111 if (! xfrm_diff)
2112 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2116 *lima = enda;
2117 *limb = endb;
2120 /* Compute and compare the checksums. */
2121 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2122 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2123 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2125 /* Fall back on the tiebreaker if the checksums collide. */
2126 if (! diff)
2128 if (! xfrm_diff)
2130 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2131 if (! xfrm_diff)
2132 xfrm_diff = (lena > lenb) - (lena < lenb);
2135 diff = xfrm_diff;
2138 if (buf != stackbuf)
2139 free (buf);
2141 return diff;
2144 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2145 using filevercmp. See lib/filevercmp.h for function description. */
2147 static int
2148 compare_version (char *restrict texta, size_t lena,
2149 char *restrict textb, size_t lenb)
2151 int diff;
2153 /* It is necessary to save the character after the end of the field.
2154 "filevercmp" works with NUL terminated strings. Our blocks of
2155 text are not necessarily terminated with a NUL byte. */
2156 char sv_a = texta[lena];
2157 char sv_b = textb[lenb];
2159 texta[lena] = '\0';
2160 textb[lenb] = '\0';
2162 diff = filevercmp (texta, textb);
2164 texta[lena] = sv_a;
2165 textb[lenb] = sv_b;
2167 return diff;
2170 /* Return the printable width of the block of memory starting at
2171 TEXT and ending just before LIM, counting each tab as one byte.
2172 FIXME: Should we generally be counting non printable chars? */
2174 static size_t
2175 debug_width (char const *text, char const *lim)
2177 size_t width = mbsnwidth (text, lim - text, 0);
2178 while (text < lim)
2179 width += (*text++ == '\t');
2180 return width;
2183 /* For debug mode, "underline" a key at the
2184 specified offset and screen width. */
2186 static void
2187 mark_key (size_t offset, size_t width)
2189 while (offset--)
2190 putchar (' ');
2192 if (!width)
2193 printf (_("^ no match for key\n"));
2194 else
2197 putchar ('_');
2198 while (--width);
2200 putchar ('\n');
2204 /* For debug mode, determine the screen offset and width
2205 to highlight for a key, and then output the highlight. */
2207 static void
2208 debug_key (char const *sline, char const *sfield, char const *efield,
2209 size_t flen, bool skipb)
2211 char const *sa = sfield;
2213 if (skipb) /* This key type implicitly skips leading blanks. */
2215 while (sa < efield && blanks[to_uchar (*sa)])
2217 sa++;
2218 if (flen)
2219 flen--; /* This assumes TABs same width as SPACEs. */
2223 size_t offset = debug_width (sline, sfield) + (sa - sfield);
2224 size_t width = debug_width (sa, sa + flen);
2225 mark_key (offset, width);
2228 /* Testing if a key is numeric is done in various places. */
2230 static inline bool
2231 key_numeric (struct keyfield const *key)
2233 return key->numeric || key->general_numeric || key->human_numeric;
2236 /* Return whether sorting options specified for key. */
2238 static bool
2239 default_key_compare (struct keyfield const *key)
2241 return ! (key->ignore
2242 || key->translate
2243 || key->skipsblanks
2244 || key->skipeblanks
2245 || key_numeric (key)
2246 || key->month
2247 || key->version
2248 || key->random
2249 /* || key->reverse */
2253 /* Convert a key to the short options used to specify it. */
2255 static void
2256 key_to_opts (struct keyfield const *key, char *opts)
2258 if (key->skipsblanks || key->skipeblanks)
2259 *opts++ = 'b';/* either disables global -b */
2260 if (key->ignore == nondictionary)
2261 *opts++ = 'd';
2262 if (key->translate)
2263 *opts++ = 'f';
2264 if (key->general_numeric)
2265 *opts++ = 'g';
2266 if (key->human_numeric)
2267 *opts++ = 'h';
2268 if (key->ignore == nonprinting)
2269 *opts++ = 'i';
2270 if (key->month)
2271 *opts++ = 'M';
2272 if (key->numeric)
2273 *opts++ = 'n';
2274 if (key->random)
2275 *opts++ = 'R';
2276 if (key->reverse)
2277 *opts++ = 'r';
2278 if (key->version)
2279 *opts++ = 'V';
2280 *opts = '\0';
2283 /* Output data independent key warnings to stderr. */
2285 static void
2286 key_warnings (struct keyfield const *gkey, bool gkey_only)
2288 struct keyfield const *key;
2289 struct keyfield ugkey = *gkey;
2290 unsigned long keynum = 1;
2292 for (key = keylist; key; key = key->next, keynum++)
2294 if (key->obsolete_used)
2296 size_t sword = key->sword;
2297 size_t eword = key->eword;
2298 char tmp[INT_BUFSIZE_BOUND (sword)];
2299 /* obsolescent syntax +A.x -B.y is equivalent to:
2300 -k A+1.x+1,B.y (when y = 0)
2301 -k A+1.x+1,B+1.y (when y > 0) */
2302 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2303 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2304 char *po = obuf;
2305 char *pn = nbuf;
2307 if (sword == SIZE_MAX)
2308 sword++;
2310 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2311 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2312 if (key->eword != SIZE_MAX)
2314 po = stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2315 pn = stpcpy (stpcpy (pn, ","),
2316 umaxtostr (eword + 1
2317 + (key->echar == SIZE_MAX), tmp));
2319 error (0, 0, _("obsolescent key `%s' used; consider `%s' instead"),
2320 obuf, nbuf);
2323 /* Warn about field specs that will never match. */
2324 if (key->sword != SIZE_MAX && key->eword < key->sword)
2325 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2327 /* Warn about significant leading blanks. */
2328 bool implicit_skip = key_numeric (key) || key->month;
2329 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2330 && !(key->schar || key->echar);
2331 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2332 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2333 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2334 || (!key->skipsblanks && key->schar)
2335 || (!key->skipeblanks && key->echar)))
2336 error (0, 0, _("leading blanks are significant in key %lu; "
2337 "consider also specifying `b'"), keynum);
2339 /* Warn about numeric comparisons spanning fields,
2340 as field delimiters could be interpreted as part
2341 of the number (maybe only in other locales). */
2342 if (!gkey_only && key_numeric (key))
2344 size_t sword = key->sword + 1;
2345 size_t eword = key->eword + 1;
2346 if (!sword)
2347 sword++;
2348 if (sword != eword)
2349 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2350 keynum);
2353 /* Flag global options not copied or specified in any key. */
2354 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2355 ugkey.ignore = NULL;
2356 if (ugkey.translate && (ugkey.translate == key->translate))
2357 ugkey.translate = NULL;
2358 ugkey.skipsblanks &= !key->skipsblanks;
2359 ugkey.skipeblanks &= !key->skipeblanks;
2360 ugkey.month &= !key->month;
2361 ugkey.numeric &= !key->numeric;
2362 ugkey.general_numeric &= !key->general_numeric;
2363 ugkey.human_numeric &= !key->human_numeric;
2364 ugkey.random &= !key->random;
2365 ugkey.version &= !key->version;
2366 ugkey.reverse &= !key->reverse;
2369 /* Warn about ignored global options flagged above.
2370 Note if gkey is the only one in the list, all flags are cleared. */
2371 if (!default_key_compare (&ugkey)
2372 || (ugkey.reverse && (stable || unique) && keylist))
2374 bool ugkey_reverse = ugkey.reverse;
2375 if (!(stable || unique))
2376 ugkey.reverse = false;
2377 /* The following is too big, but guaranteed to be "big enough". */
2378 char opts[sizeof short_options];
2379 key_to_opts (&ugkey, opts);
2380 error (0, 0,
2381 ngettext ("option `-%s' is ignored",
2382 "options `-%s' are ignored",
2383 select_plural (strlen (opts))), opts);
2384 ugkey.reverse = ugkey_reverse;
2386 if (ugkey.reverse && !(stable || unique) && keylist)
2387 error (0, 0, _("option `-r' only applies to last-resort comparison"));
2390 /* Compare two lines A and B trying every key in sequence until there
2391 are no more keys or a difference is found. */
2393 static int
2394 keycompare (struct line const *a, struct line const *b, bool show_debug)
2396 struct keyfield *key = keylist;
2398 /* For the first iteration only, the key positions have been
2399 precomputed for us. */
2400 char *texta = a->keybeg;
2401 char *textb = b->keybeg;
2402 char *lima = a->keylim;
2403 char *limb = b->keylim;
2405 int diff;
2407 while (true)
2409 char const *translate = key->translate;
2410 bool const *ignore = key->ignore;
2411 bool skipb = false; /* Whether key type auto skips leading blanks. */
2413 /* Treat field ends before field starts as empty fields. */
2414 lima = MAX (texta, lima);
2415 limb = MAX (textb, limb);
2417 /* Find the lengths. */
2418 size_t lena = lima - texta;
2419 size_t lenb = limb - textb;
2421 /* Actually compare the fields. */
2423 if (key->random)
2424 diff = compare_random (texta, lena, textb, lenb);
2425 else if (key_numeric (key))
2427 char savea = *lima, saveb = *limb;
2428 char *ea = lima;
2430 *lima = *limb = '\0';
2431 diff = (key->numeric ? numcompare (texta, textb, &ea)
2432 : key->general_numeric ? general_numcompare (texta, textb,
2433 &ea)
2434 : human_numcompare (texta, textb, &ea));
2435 if (show_debug)
2437 lena = ea - texta;
2438 skipb = true;
2440 *lima = savea, *limb = saveb;
2442 else if (key->version)
2443 diff = compare_version (texta, lena, textb, lenb);
2444 else if (key->month)
2446 char const *ea = lima;
2448 int amon = getmonth (texta, lena, &ea);
2449 diff = amon - getmonth (textb, lenb, NULL);
2451 if (show_debug)
2453 lena = amon ? ea - texta : 0;
2454 skipb = true;
2457 /* Sorting like this may become slow, so in a simple locale the user
2458 can select a faster sort that is similar to ascii sort. */
2459 else if (hard_LC_COLLATE)
2461 /* FIXME: for debug, account for skipped chars, while handling mb chars.
2462 Generally perhaps xmemfrm could be used to determine chars that are
2463 excluded from the collating order? */
2464 if (ignore || translate)
2466 char buf[4000];
2467 size_t size = lena + 1 + lenb + 1;
2468 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
2469 char *copy_b = copy_a + lena + 1;
2470 size_t new_len_a, new_len_b, i;
2472 /* Ignore and/or translate chars before comparing. */
2473 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
2475 if (i < lena)
2477 copy_a[new_len_a] = (translate
2478 ? translate[to_uchar (texta[i])]
2479 : texta[i]);
2480 if (!ignore || !ignore[to_uchar (texta[i])])
2481 ++new_len_a;
2483 if (i < lenb)
2485 copy_b[new_len_b] = (translate
2486 ? translate[to_uchar (textb[i])]
2487 : textb [i]);
2488 if (!ignore || !ignore[to_uchar (textb[i])])
2489 ++new_len_b;
2493 copy_a[new_len_a++] = '\0';
2494 copy_b[new_len_b++] = '\0';
2495 diff = xmemcoll0 (copy_a, new_len_a, copy_b, new_len_b);
2497 if (sizeof buf < size)
2498 free (copy_a);
2500 else if (lena == 0)
2501 diff = - NONZERO (lenb);
2502 else if (lenb == 0)
2503 goto greater;
2504 else
2505 diff = xmemcoll (texta, lena, textb, lenb);
2507 else if (ignore)
2509 char *savea = texta;
2511 #define CMP_WITH_IGNORE(A, B) \
2512 do \
2514 while (true) \
2516 while (texta < lima && ignore[to_uchar (*texta)]) \
2517 ++texta; \
2518 while (textb < limb && ignore[to_uchar (*textb)]) \
2519 ++textb; \
2520 if (! (texta < lima && textb < limb)) \
2521 break; \
2522 diff = to_uchar (A) - to_uchar (B); \
2523 if (diff) \
2524 goto not_equal; \
2525 ++texta; \
2526 ++textb; \
2529 diff = (texta < lima) - (textb < limb); \
2531 while (0)
2533 if (translate)
2534 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2535 translate[to_uchar (*textb)]);
2536 else
2537 CMP_WITH_IGNORE (*texta, *textb);
2539 /* We only need to restore this for debug_key
2540 in which case the keys being compared are equal. */
2541 texta = savea;
2543 else if (lena == 0)
2544 diff = - NONZERO (lenb);
2545 else if (lenb == 0)
2546 goto greater;
2547 else
2549 if (translate)
2551 char *savea = texta;
2553 while (texta < lima && textb < limb)
2555 diff = (to_uchar (translate[to_uchar (*texta++)])
2556 - to_uchar (translate[to_uchar (*textb++)]));
2557 if (diff)
2558 goto not_equal;
2561 /* We only need to restore this for debug_key
2562 in which case the keys being compared are equal. */
2563 texta = savea;
2565 else
2567 diff = memcmp (texta, textb, MIN (lena, lenb));
2568 if (diff)
2569 goto not_equal;
2571 diff = lena < lenb ? -1 : lena != lenb;
2574 if (diff)
2575 goto not_equal;
2577 if (show_debug)
2578 debug_key (a->text, texta, lima, lena, skipb);
2580 key = key->next;
2581 if (! key)
2582 break;
2584 /* Find the beginning and limit of the next field. */
2585 if (key->eword != SIZE_MAX)
2586 lima = limfield (a, key), limb = limfield (b, key);
2587 else
2588 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2590 if (key->sword != SIZE_MAX)
2591 texta = begfield (a, key), textb = begfield (b, key);
2592 else
2594 texta = a->text, textb = b->text;
2595 if (key->skipsblanks)
2597 while (texta < lima && blanks[to_uchar (*texta)])
2598 ++texta;
2599 while (textb < limb && blanks[to_uchar (*textb)])
2600 ++textb;
2605 return 0;
2607 greater:
2608 diff = 1;
2609 not_equal:
2610 return key->reverse ? -diff : diff;
2613 /* Compare two lines A and B, returning negative, zero, or positive
2614 depending on whether A compares less than, equal to, or greater than B. */
2616 static int
2617 compare (struct line const *a, struct line const *b, bool show_debug)
2619 int diff;
2620 size_t alen, blen;
2622 /* First try to compare on the specified keys (if any).
2623 The only two cases with no key at all are unadorned sort,
2624 and unadorned sort -r. */
2625 if (keylist)
2627 diff = keycompare (a, b, show_debug);
2628 if (diff || unique || stable)
2629 return diff;
2632 /* If the keys all compare equal (or no keys were specified)
2633 fall through to the default comparison. */
2634 alen = a->length - 1, blen = b->length - 1;
2636 if (show_debug)
2637 debug_key (a->text, a->text, a->text + alen, alen, false);
2639 if (alen == 0)
2640 diff = - NONZERO (blen);
2641 else if (blen == 0)
2642 diff = 1;
2643 else if (hard_LC_COLLATE)
2645 /* Note xmemcoll0 is a performance enhancement as
2646 it will not unconditionally write '\0' after the
2647 passed in buffers, which was seen to give around
2648 a 3% increase in performance for short lines. */
2649 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2651 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2652 diff = alen < blen ? -1 : alen != blen;
2654 return reverse ? -diff : diff;
2657 static void
2658 write_bytes (struct line const *line, FILE *fp, char const *output_file)
2660 char *buf = line->text;
2661 size_t n_bytes = line->length;
2662 char *ebuf = buf + n_bytes;
2664 /* Convert TABs to '>' and \0 to \n when -z specified. */
2665 if (debug && fp == stdout)
2667 char const *c = buf;
2669 while (c < ebuf)
2671 char wc = *c++;
2672 if (wc == '\t')
2673 wc = '>';
2674 else if (c == ebuf)
2675 wc = '\n';
2676 if (fputc (wc, fp) == EOF)
2677 die (_("write failed"), output_file);
2680 compare (line, line, true);
2682 else
2684 ebuf[-1] = eolchar;
2685 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2686 die (_("write failed"), output_file);
2687 ebuf[-1] = '\0';
2691 /* Check that the lines read from FILE_NAME come in order. Return
2692 true if they are in order. If CHECKONLY == 'c', also print a
2693 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2694 they are not in order. */
2696 static bool
2697 check (char const *file_name, char checkonly)
2699 FILE *fp = xfopen (file_name, "r");
2700 struct buffer buf; /* Input buffer. */
2701 struct line temp; /* Copy of previous line. */
2702 size_t alloc = 0;
2703 uintmax_t line_number = 0;
2704 struct keyfield const *key = keylist;
2705 bool nonunique = ! unique;
2706 bool ordered = true;
2708 initbuf (&buf, sizeof (struct line),
2709 MAX (merge_buffer_size, sort_size));
2710 temp.text = NULL;
2712 while (fillbuf (&buf, fp, file_name))
2714 struct line const *line = buffer_linelim (&buf);
2715 struct line const *linebase = line - buf.nlines;
2717 /* Make sure the line saved from the old buffer contents is
2718 less than or equal to the first line of the new buffer. */
2719 if (alloc && nonunique <= compare (&temp, line - 1, false))
2721 found_disorder:
2723 if (checkonly == 'c')
2725 struct line const *disorder_line = line - 1;
2726 uintmax_t disorder_line_number =
2727 buffer_linelim (&buf) - disorder_line + line_number;
2728 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2729 fprintf (stderr, _("%s: %s:%s: disorder: "),
2730 program_name, file_name,
2731 umaxtostr (disorder_line_number, hr_buf));
2732 if (debug)
2733 fputc ('\n', stderr);
2734 write_bytes (disorder_line, debug ? stdout : stderr,
2735 debug ? _("standard out") : _("standard error"));
2738 ordered = false;
2739 break;
2743 /* Compare each line in the buffer with its successor. */
2744 while (linebase < --line)
2745 if (nonunique <= compare (line, line - 1, false))
2746 goto found_disorder;
2748 line_number += buf.nlines;
2750 /* Save the last line of the buffer. */
2751 if (alloc < line->length)
2755 alloc *= 2;
2756 if (! alloc)
2758 alloc = line->length;
2759 break;
2762 while (alloc < line->length);
2764 temp.text = xrealloc (temp.text, alloc);
2766 memcpy (temp.text, line->text, line->length);
2767 temp.length = line->length;
2768 if (key)
2770 temp.keybeg = temp.text + (line->keybeg - line->text);
2771 temp.keylim = temp.text + (line->keylim - line->text);
2775 xfclose (fp, file_name);
2776 free (buf.buf);
2777 free (temp.text);
2778 return ordered;
2781 /* Open FILES (there are NFILES of them) and store the resulting array
2782 of stream pointers into (*PFPS). Allocate the array. Return the
2783 number of successfully opened files, setting errno if this value is
2784 less than NFILES. */
2786 static size_t
2787 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2789 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2790 int i;
2792 /* Open as many input files as we can. */
2793 for (i = 0; i < nfiles; i++)
2795 fps[i] = (files[i].pid
2796 ? open_temp (files[i].name, files[i].pid)
2797 : stream_open (files[i].name, "r"));
2798 if (!fps[i])
2799 break;
2802 return i;
2805 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2806 files (all of which are at the start of the FILES array), and
2807 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2808 FPS is the vector of open stream corresponding to the files.
2809 Close input and output streams before returning.
2810 OUTPUT_FILE gives the name of the output file. If it is NULL,
2811 the output file is standard output. */
2813 static void
2814 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2815 FILE *ofp, char const *output_file, FILE **fps)
2817 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2818 /* Input buffers for each file. */
2819 struct line saved; /* Saved line storage for unique check. */
2820 struct line const *savedline = NULL;
2821 /* &saved if there is a saved line. */
2822 size_t savealloc = 0; /* Size allocated for the saved line. */
2823 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2824 /* Current line in each line table. */
2825 struct line const **base = xnmalloc (nfiles, sizeof *base);
2826 /* Base of each line table. */
2827 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2828 /* Table representing a permutation of fps,
2829 such that cur[ord[0]] is the smallest line
2830 and will be next output. */
2831 size_t i;
2832 size_t j;
2833 size_t t;
2834 struct keyfield const *key = keylist;
2835 saved.text = NULL;
2837 /* Read initial lines from each input file. */
2838 for (i = 0; i < nfiles; )
2840 initbuf (&buffer[i], sizeof (struct line),
2841 MAX (merge_buffer_size, sort_size / nfiles));
2842 if (fillbuf (&buffer[i], fps[i], files[i].name))
2844 struct line const *linelim = buffer_linelim (&buffer[i]);
2845 cur[i] = linelim - 1;
2846 base[i] = linelim - buffer[i].nlines;
2847 i++;
2849 else
2851 /* fps[i] is empty; eliminate it from future consideration. */
2852 xfclose (fps[i], files[i].name);
2853 if (i < ntemps)
2855 ntemps--;
2856 zaptemp (files[i].name);
2858 free (buffer[i].buf);
2859 --nfiles;
2860 for (j = i; j < nfiles; ++j)
2862 files[j] = files[j + 1];
2863 fps[j] = fps[j + 1];
2868 /* Set up the ord table according to comparisons among input lines.
2869 Since this only reorders two items if one is strictly greater than
2870 the other, it is stable. */
2871 for (i = 0; i < nfiles; ++i)
2872 ord[i] = i;
2873 for (i = 1; i < nfiles; ++i)
2874 if (0 < compare (cur[ord[i - 1]], cur[ord[i]], false))
2875 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2877 /* Repeatedly output the smallest line until no input remains. */
2878 while (nfiles)
2880 struct line const *smallest = cur[ord[0]];
2882 /* If uniquified output is turned on, output only the first of
2883 an identical series of lines. */
2884 if (unique)
2886 if (savedline && compare (savedline, smallest, false))
2888 savedline = NULL;
2889 write_bytes (&saved, ofp, output_file);
2891 if (!savedline)
2893 savedline = &saved;
2894 if (savealloc < smallest->length)
2897 if (! savealloc)
2899 savealloc = smallest->length;
2900 break;
2902 while ((savealloc *= 2) < smallest->length);
2904 saved.text = xrealloc (saved.text, savealloc);
2906 saved.length = smallest->length;
2907 memcpy (saved.text, smallest->text, saved.length);
2908 if (key)
2910 saved.keybeg =
2911 saved.text + (smallest->keybeg - smallest->text);
2912 saved.keylim =
2913 saved.text + (smallest->keylim - smallest->text);
2917 else
2918 write_bytes (smallest, ofp, output_file);
2920 /* Check if we need to read more lines into core. */
2921 if (base[ord[0]] < smallest)
2922 cur[ord[0]] = smallest - 1;
2923 else
2925 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2927 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2928 cur[ord[0]] = linelim - 1;
2929 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2931 else
2933 /* We reached EOF on fps[ord[0]]. */
2934 for (i = 1; i < nfiles; ++i)
2935 if (ord[i] > ord[0])
2936 --ord[i];
2937 --nfiles;
2938 xfclose (fps[ord[0]], files[ord[0]].name);
2939 if (ord[0] < ntemps)
2941 ntemps--;
2942 zaptemp (files[ord[0]].name);
2944 free (buffer[ord[0]].buf);
2945 for (i = ord[0]; i < nfiles; ++i)
2947 fps[i] = fps[i + 1];
2948 files[i] = files[i + 1];
2949 buffer[i] = buffer[i + 1];
2950 cur[i] = cur[i + 1];
2951 base[i] = base[i + 1];
2953 for (i = 0; i < nfiles; ++i)
2954 ord[i] = ord[i + 1];
2955 continue;
2959 /* The new line just read in may be larger than other lines
2960 already in main memory; push it back in the queue until we
2961 encounter a line larger than it. Optimize for the common
2962 case where the new line is smallest. */
2964 size_t lo = 1;
2965 size_t hi = nfiles;
2966 size_t probe = lo;
2967 size_t ord0 = ord[0];
2968 size_t count_of_smaller_lines;
2970 while (lo < hi)
2972 int cmp = compare (cur[ord0], cur[ord[probe]], false);
2973 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2974 hi = probe;
2975 else
2976 lo = probe + 1;
2977 probe = (lo + hi) / 2;
2980 count_of_smaller_lines = lo - 1;
2981 for (j = 0; j < count_of_smaller_lines; j++)
2982 ord[j] = ord[j + 1];
2983 ord[count_of_smaller_lines] = ord0;
2986 /* Free up some resources every once in a while. */
2987 if (MAX_PROCS_BEFORE_REAP < nprocs)
2988 reap_some ();
2991 if (unique && savedline)
2993 write_bytes (&saved, ofp, output_file);
2994 free (saved.text);
2997 xfclose (ofp, output_file);
2998 free (fps);
2999 free (buffer);
3000 free (ord);
3001 free (base);
3002 free (cur);
3005 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3006 files (all of which are at the start of the FILES array), and
3007 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3008 Close input and output files before returning.
3009 OUTPUT_FILE gives the name of the output file.
3011 Return the number of files successfully merged. This number can be
3012 less than NFILES if we ran low on file descriptors, but in this
3013 case it is never less than 2. */
3015 static size_t
3016 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3017 FILE *ofp, char const *output_file)
3019 FILE **fps;
3020 size_t nopened = open_input_files (files, nfiles, &fps);
3021 if (nopened < nfiles && nopened < 2)
3022 die (_("open failed"), files[nopened].name);
3023 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3024 return nopened;
3027 /* Merge into T (of size NLINES) the two sorted arrays of lines
3028 LO (with NLINES / 2 members), and
3029 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3030 T and LO point just past their respective arrays, and the arrays
3031 are in reverse order. NLINES must be at least 2. */
3033 static void
3034 mergelines (struct line *restrict t, size_t nlines,
3035 struct line const *restrict lo)
3037 size_t nlo = nlines / 2;
3038 size_t nhi = nlines - nlo;
3039 struct line *hi = t - nlo;
3041 while (true)
3042 if (compare (lo - 1, hi - 1, false) <= 0)
3044 *--t = *--lo;
3045 if (! --nlo)
3047 /* HI must equal T now, and there is no need to copy from
3048 HI to T. */
3049 return;
3052 else
3054 *--t = *--hi;
3055 if (! --nhi)
3058 *--t = *--lo;
3059 while (--nlo);
3061 return;
3066 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3067 NLINES must be at least 2.
3068 The input and output arrays are in reverse order, and LINES and
3069 TEMP point just past the end of their respective arrays.
3071 Use a recursive divide-and-conquer algorithm, in the style
3072 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3073 the optimization suggested by exercise 5.2.4-10; this requires room
3074 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3075 writes that this memory optimization was originally published by
3076 D. A. Bell, Comp J. 1 (1958), 75. */
3078 static void
3079 sequential_sort (struct line *restrict lines, size_t nlines,
3080 struct line *restrict temp, bool to_temp)
3082 if (nlines == 2)
3084 /* Declare `swap' as int, not bool, to work around a bug
3085 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3086 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3087 int swap = (0 < compare (&lines[-1], &lines[-2], false));
3088 if (to_temp)
3090 temp[-1] = lines[-1 - swap];
3091 temp[-2] = lines[-2 + swap];
3093 else if (swap)
3095 temp[-1] = lines[-1];
3096 lines[-1] = lines[-2];
3097 lines[-2] = temp[-1];
3100 else
3102 size_t nlo = nlines / 2;
3103 size_t nhi = nlines - nlo;
3104 struct line *lo = lines;
3105 struct line *hi = lines - nlo;
3107 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3108 if (1 < nlo)
3109 sequential_sort (lo, nlo, temp, !to_temp);
3110 else if (!to_temp)
3111 temp[-1] = lo[-1];
3113 struct line *dest;
3114 struct line const *sorted_lo;
3115 if (to_temp)
3117 dest = temp;
3118 sorted_lo = lines;
3120 else
3122 dest = lines;
3123 sorted_lo = temp;
3125 mergelines (dest, nlines, sorted_lo);
3129 /* Compare two NODEs for priority. The NODE with the higher (numerically
3130 lower) level has priority. If tie, the NODE with the most remaining
3131 lines has priority. */
3133 static int
3134 compare_nodes (void const *a, void const *b)
3136 struct merge_node const *nodea = a;
3137 struct merge_node const *nodeb = b;
3138 if (nodea->level == nodeb->level)
3139 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3140 return nodea->level < nodeb->level;
3143 /* Lock a merge tree NODE.
3144 Note spin locks were seen to perform better than mutexes
3145 as long as the number of threads is limited to the
3146 number of processors. */
3148 static inline void
3149 lock_node (struct merge_node *node)
3151 pthread_spin_lock (node->lock);
3154 /* Unlock a merge tree NODE. */
3156 static inline void
3157 unlock_node (struct merge_node *node)
3159 pthread_spin_unlock (node->lock);
3162 /* Destroy merge QUEUE. */
3164 static void
3165 queue_destroy (struct merge_node_queue *queue)
3167 heap_free (queue->priority_queue);
3168 pthread_cond_destroy (&queue->cond);
3169 pthread_mutex_destroy (&queue->mutex);
3172 /* Initialize merge QUEUE, allocating space for a maximum of RESERVE nodes.
3173 Though it's highly unlikely all nodes are in the heap at the same time,
3174 RESERVE should accommodate all of them. Counting a NULL dummy head for the
3175 heap, RESERVE should be 2 * NTHREADS. */
3177 static void
3178 queue_init (struct merge_node_queue *queue, size_t reserve)
3180 queue->priority_queue = heap_alloc (compare_nodes, reserve);
3181 pthread_mutex_init (&queue->mutex, NULL);
3182 pthread_cond_init (&queue->cond, NULL);
3185 /* Insert NODE into priority QUEUE. Assume caller either holds lock on NODE
3186 or does not need to lock NODE. */
3188 static void
3189 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3191 pthread_mutex_lock (&queue->mutex);
3192 heap_insert (queue->priority_queue, node);
3193 node->queued = true;
3194 pthread_mutex_unlock (&queue->mutex);
3195 pthread_cond_signal (&queue->cond);
3198 /* Pop NODE off priority QUEUE. Guarantee a non-null, spinlocked NODE. */
3200 static struct merge_node *
3201 queue_pop (struct merge_node_queue *queue)
3203 struct merge_node *node;
3204 pthread_mutex_lock (&queue->mutex);
3205 while (! (node = heap_remove_top (queue->priority_queue)))
3206 pthread_cond_wait (&queue->cond, &queue->mutex);
3207 pthread_mutex_unlock (&queue->mutex);
3208 lock_node (node);
3209 node->queued = false;
3210 return node;
3213 /* If UNQIUE is set, checks to make sure line isn't a duplicate before
3214 outputting. If UNIQUE is not set, output the passed in line. Note that
3215 this function does not actually save the line, nor any key information,
3216 thus is only appropriate for internal sort. */
3218 static void
3219 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3221 static struct line const *saved = NULL;
3223 if (!unique)
3224 write_bytes (line, tfp, temp_output);
3225 else if (!saved || compare (line, saved, false))
3227 saved = line;
3228 write_bytes (line, tfp, temp_output);
3232 /* Merge the lines currently available to a NODE in the binary
3233 merge tree, up to a maximum specified by MAX_MERGE. */
3235 static size_t
3236 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3237 FILE *tfp, char const *temp_output)
3239 struct line *lo_orig = node->lo;
3240 struct line *hi_orig = node->hi;
3241 size_t to_merge = MAX_MERGE (total_lines, node->level);
3242 size_t merged_lo;
3243 size_t merged_hi;
3245 if (node->level > MERGE_ROOT)
3247 /* Merge to destination buffer. */
3248 struct line *dest = *node->dest;
3249 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3250 if (compare (node->lo - 1, node->hi - 1, false) <= 0)
3251 *--dest = *--node->lo;
3252 else
3253 *--dest = *--node->hi;
3255 merged_lo = lo_orig - node->lo;
3256 merged_hi = hi_orig - node->hi;
3258 if (node->nhi == merged_hi)
3259 while (node->lo != node->end_lo && to_merge--)
3260 *--dest = *--node->lo;
3261 else if (node->nlo == merged_lo)
3262 while (node->hi != node->end_hi && to_merge--)
3263 *--dest = *--node->hi;
3265 else
3267 /* Merge directly to output. */
3268 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3270 if (compare (node->lo - 1, node->hi - 1, false) <= 0)
3271 write_unique (--node->lo, tfp, temp_output);
3272 else
3273 write_unique (--node->hi, tfp, temp_output);
3276 merged_lo = lo_orig - node->lo;
3277 merged_hi = hi_orig - node->hi;
3279 if (node->nhi == merged_hi)
3281 while (node->lo != node->end_lo && to_merge--)
3282 write_unique (--node->lo, tfp, temp_output);
3284 else if (node->nlo == merged_lo)
3286 while (node->hi != node->end_hi && to_merge--)
3287 write_unique (--node->hi, tfp, temp_output);
3289 node->dest -= lo_orig - node->lo + hi_orig - node->hi;
3292 /* Update NODE. */
3293 merged_lo = lo_orig - node->lo;
3294 merged_hi = hi_orig - node->hi;
3295 node->nlo -= merged_lo;
3296 node->nhi -= merged_hi;
3297 return merged_lo + merged_hi;
3300 /* Insert NODE into QUEUE if it passes insertion checks. */
3302 static void
3303 check_insert (struct merge_node *node, struct merge_node_queue *queue)
3305 size_t lo_avail = node->lo - node->end_lo;
3306 size_t hi_avail = node->hi - node->end_hi;
3308 /* Conditions for insertion:
3309 1. NODE is not already in heap.
3310 2. NODE has available lines from both it's children, OR one child has
3311 available lines, but the other has exhausted all its lines. */
3312 if ((!node->queued)
3313 && ((lo_avail && (hi_avail || !(node->nhi)))
3314 || (hi_avail && !(node->nlo))))
3316 queue_insert (queue, node);
3320 /* Update parent merge tree NODE. */
3322 static void
3323 update_parent (struct merge_node *node, size_t merged,
3324 struct merge_node_queue *queue)
3326 if (node->level > MERGE_ROOT)
3328 lock_node (node->parent);
3329 *node->dest -= merged;
3330 check_insert (node->parent, queue);
3331 unlock_node (node->parent);
3333 else if (node->nlo + node->nhi == 0)
3335 /* If the MERGE_ROOT NODE has finished merging, insert the
3336 MERGE_END node. */
3337 queue_insert (queue, node->parent);
3341 /* Repeatedly pop QUEUE for a NODE with lines to merge, and merge at least
3342 some of those lines, until the MERGE_END node is popped. */
3344 static void
3345 merge_loop (struct merge_node_queue *queue,
3346 size_t total_lines, FILE *tfp, char const *temp_output)
3348 while (1)
3350 struct merge_node *node = queue_pop (queue);
3352 if (node->level == MERGE_END)
3354 unlock_node (node);
3355 /* Reinsert so other threads can pop it. */
3356 queue_insert (queue, node);
3357 break;
3359 size_t merged_lines = mergelines_node (node, total_lines, tfp,
3360 temp_output);
3361 check_insert (node, queue);
3362 update_parent (node, merged_lines, queue);
3364 unlock_node (node);
3369 static void sortlines (struct line *restrict, struct line *restrict,
3370 unsigned long int, size_t,
3371 struct merge_node *, bool,
3372 struct merge_node_queue *,
3373 FILE *, char const *);
3375 /* Thread arguments for sortlines_thread. */
3377 struct thread_args
3379 struct line *lines;
3380 struct line *dest;
3381 unsigned long int nthreads;
3382 size_t const total_lines;
3383 struct merge_node *const parent;
3384 bool lo_child;
3385 struct merge_node_queue *const merge_queue;
3386 FILE *tfp;
3387 char const *output_temp;
3390 /* Like sortlines, except with a signature acceptable to pthread_create. */
3392 static void *
3393 sortlines_thread (void *data)
3395 struct thread_args const *args = data;
3396 sortlines (args->lines, args->dest, args->nthreads, args->total_lines,
3397 args->parent, args->lo_child, args->merge_queue,
3398 args->tfp, args->output_temp);
3399 return NULL;
3402 /* There are three phases to the algorithm: node creation, sequential sort,
3403 and binary merge.
3405 During node creation, sortlines recursively visits each node in the
3406 binary merge tree and creates a NODE structure corresponding to all the
3407 future line merging NODE is responsible for. For each call to
3408 sortlines, half the available threads are assigned to each recursive
3409 call, until a leaf node having only 1 available thread is reached.
3411 Each leaf node then performs two sequential sorts, one on each half of
3412 the lines it is responsible for. It records in its NODE structure that
3413 there are two sorted sublists available to merge from, and inserts its
3414 NODE into the priority queue.
3416 The binary merge phase then begins. Each thread drops into a loop
3417 where the thread retrieves a NODE from the priority queue, merges lines
3418 available to that NODE, and potentially insert NODE or its parent back
3419 into the queue if there are sufficient available lines for them to
3420 merge. This continues until all lines at all nodes of the merge tree
3421 have been merged. */
3423 static void
3424 sortlines (struct line *restrict lines, struct line *restrict dest,
3425 unsigned long int nthreads, size_t total_lines,
3426 struct merge_node *parent, bool lo_child,
3427 struct merge_node_queue *merge_queue,
3428 FILE *tfp, char const *temp_output)
3430 /* Create merge tree NODE. */
3431 size_t nlines = (lo_child)? parent->nlo : parent->nhi;
3432 size_t nlo = nlines / 2;
3433 size_t nhi = nlines - nlo;
3434 struct line *lo = dest - total_lines;
3435 struct line *hi = lo - nlo;
3436 struct line **parent_end = (lo_child)? &parent->end_lo : &parent->end_hi;
3437 pthread_spinlock_t lock;
3438 pthread_spin_init (&lock, PTHREAD_PROCESS_PRIVATE);
3439 struct merge_node node = {lo, hi, lo, hi, parent_end, nlo, nhi,
3440 parent->level + 1, parent, false, &lock};
3442 /* Calculate thread arguments. */
3443 unsigned long int lo_threads = nthreads / 2;
3444 unsigned long int hi_threads = nthreads - lo_threads;
3445 pthread_t thread;
3446 struct thread_args args = {lines, lo, lo_threads, total_lines, &node,
3447 true, merge_queue, tfp, temp_output};
3449 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3450 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3452 sortlines (lines - nlo, hi, hi_threads, total_lines, &node, false,
3453 merge_queue, tfp, temp_output);
3454 pthread_join (thread, NULL);
3456 else
3458 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3459 Sort with 1 thread. */
3460 struct line *temp = lines - total_lines;
3461 if (1 < nhi)
3462 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3463 if (1 < nlo)
3464 sequential_sort (lines, nlo, temp, false);
3466 /* Update merge NODE. No need to lock yet. */
3467 node.lo = lines;
3468 node.hi = lines - nlo;
3469 node.end_lo = lines - nlo;
3470 node.end_hi = lines - nlo - nhi;
3472 queue_insert (merge_queue, &node);
3473 merge_loop (merge_queue, total_lines, tfp, temp_output);
3477 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
3478 the same as OUTFILE. If found, merge the found instances (and perhaps
3479 some other files) into a temporary file so that it can in turn be
3480 merged into OUTFILE without destroying OUTFILE before it is completely
3481 read. Return the new value of NFILES, which differs from the old if
3482 some merging occurred.
3484 This test ensures that an otherwise-erroneous use like
3485 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3486 It's not clear that POSIX requires this nicety.
3487 Detect common error cases, but don't try to catch obscure cases like
3488 "cat ... FILE ... | sort -m -o FILE"
3489 where traditional "sort" doesn't copy the input and where
3490 people should know that they're getting into trouble anyway.
3491 Catching these obscure cases would slow down performance in
3492 common cases. */
3494 static size_t
3495 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3496 size_t nfiles, char const *outfile)
3498 size_t i;
3499 bool got_outstat = false;
3500 struct stat outstat;
3502 for (i = ntemps; i < nfiles; i++)
3504 bool is_stdin = STREQ (files[i].name, "-");
3505 bool same;
3506 struct stat instat;
3508 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3509 same = true;
3510 else
3512 if (! got_outstat)
3514 if ((outfile
3515 ? stat (outfile, &outstat)
3516 : fstat (STDOUT_FILENO, &outstat))
3517 != 0)
3518 break;
3519 got_outstat = true;
3522 same = (((is_stdin
3523 ? fstat (STDIN_FILENO, &instat)
3524 : stat (files[i].name, &instat))
3525 == 0)
3526 && SAME_INODE (instat, outstat));
3529 if (same)
3531 FILE *tftp;
3532 pid_t pid;
3533 char *temp = create_temp (&tftp, &pid);
3534 size_t num_merged = 0;
3537 num_merged += mergefiles (&files[i], 0, nfiles - i, tftp, temp);
3538 files[i].name = temp;
3539 files[i].pid = pid;
3541 if (i + num_merged < nfiles)
3542 memmove (&files[i + 1], &files[i + num_merged],
3543 num_merged * sizeof *files);
3544 ntemps += 1;
3545 nfiles -= num_merged - 1;;
3546 i += num_merged;
3548 while (i < nfiles);
3552 return nfiles;
3555 /* Merge the input FILES. NTEMPS is the number of files at the
3556 start of FILES that are temporary; it is zero at the top level.
3557 NFILES is the total number of files. Put the output in
3558 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3560 static void
3561 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3562 char const *output_file)
3564 while (nmerge < nfiles)
3566 /* Number of input files processed so far. */
3567 size_t in;
3569 /* Number of output files generated so far. */
3570 size_t out;
3572 /* nfiles % NMERGE; this counts input files that are left over
3573 after all full-sized merges have been done. */
3574 size_t remainder;
3576 /* Number of easily-available slots at the next loop iteration. */
3577 size_t cheap_slots;
3579 /* Do as many NMERGE-size merges as possible. In the case that
3580 nmerge is bogus, increment by the maximum number of file
3581 descriptors allowed. */
3582 for (out = in = 0; nmerge <= nfiles - in; out++)
3584 FILE *tfp;
3585 pid_t pid;
3586 char *temp = create_temp (&tfp, &pid);
3587 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3588 nmerge, tfp, temp);
3589 ntemps -= MIN (ntemps, num_merged);
3590 files[out].name = temp;
3591 files[out].pid = pid;
3592 in += num_merged;
3595 remainder = nfiles - in;
3596 cheap_slots = nmerge - out % nmerge;
3598 if (cheap_slots < remainder)
3600 /* So many files remain that they can't all be put into the last
3601 NMERGE-sized output window. Do one more merge. Merge as few
3602 files as possible, to avoid needless I/O. */
3603 size_t nshortmerge = remainder - cheap_slots + 1;
3604 FILE *tfp;
3605 pid_t pid;
3606 char *temp = create_temp (&tfp, &pid);
3607 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3608 nshortmerge, tfp, temp);
3609 ntemps -= MIN (ntemps, num_merged);
3610 files[out].name = temp;
3611 files[out++].pid = pid;
3612 in += num_merged;
3615 /* Put the remaining input files into the last NMERGE-sized output
3616 window, so they will be merged in the next pass. */
3617 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3618 ntemps += out;
3619 nfiles -= in - out;
3622 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
3624 /* We aren't guaranteed that this final mergefiles will work, therefore we
3625 try to merge into the output, and then merge as much as we can into a
3626 temp file if we can't. Repeat. */
3628 while (true)
3630 /* Merge directly into the output file if possible. */
3631 FILE **fps;
3632 size_t nopened = open_input_files (files, nfiles, &fps);
3634 if (nopened == nfiles)
3636 FILE *ofp = stream_open (output_file, "w");
3637 if (ofp)
3639 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3640 break;
3642 if (errno != EMFILE || nopened <= 2)
3643 die (_("open failed"), output_file);
3645 else if (nopened <= 2)
3646 die (_("open failed"), files[nopened].name);
3648 /* We ran out of file descriptors. Close one of the input
3649 files, to gain a file descriptor. Then create a temporary
3650 file with our spare file descriptor. Retry if that failed
3651 (e.g., some other process could open a file between the time
3652 we closed and tried to create). */
3653 FILE *tfp;
3654 pid_t pid;
3655 char *temp;
3658 nopened--;
3659 xfclose (fps[nopened], files[nopened].name);
3660 temp = maybe_create_temp (&tfp, &pid, ! (nopened <= 2));
3662 while (!temp);
3664 /* Merge into the newly allocated temporary. */
3665 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp, fps);
3666 ntemps -= MIN (ntemps, nopened);
3667 files[0].name = temp;
3668 files[0].pid = pid;
3670 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3671 ntemps++;
3672 nfiles -= nopened - 1;
3676 /* Sort NFILES FILES onto OUTPUT_FILE. */
3678 static void
3679 sort (char * const *files, size_t nfiles, char const *output_file,
3680 unsigned long int nthreads)
3682 struct buffer buf;
3683 size_t ntemps = 0;
3684 bool output_file_created = false;
3686 buf.alloc = 0;
3688 while (nfiles)
3690 char const *temp_output;
3691 char const *file = *files;
3692 FILE *fp = xfopen (file, "r");
3693 FILE *tfp;
3695 size_t bytes_per_line;
3696 if (nthreads > 1)
3698 /* Get log P. */
3699 unsigned long int tmp = 1;
3700 size_t mult = 1;
3701 while (tmp < nthreads)
3703 tmp *= 2;
3704 mult++;
3706 bytes_per_line = (mult * sizeof (struct line));
3708 else
3709 bytes_per_line = sizeof (struct line) * 3 / 2;
3711 if (! buf.alloc)
3712 initbuf (&buf, bytes_per_line,
3713 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3714 buf.eof = false;
3715 files++;
3716 nfiles--;
3718 while (fillbuf (&buf, fp, file))
3720 struct line *line;
3722 if (buf.eof && nfiles
3723 && (bytes_per_line + 1
3724 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3726 /* End of file, but there is more input and buffer room.
3727 Concatenate the next input file; this is faster in
3728 the usual case. */
3729 buf.left = buf.used;
3730 break;
3733 line = buffer_linelim (&buf);
3734 if (buf.eof && !nfiles && !ntemps && !buf.left)
3736 xfclose (fp, file);
3737 tfp = xfopen (output_file, "w");
3738 temp_output = output_file;
3739 output_file_created = true;
3741 else
3743 ++ntemps;
3744 temp_output = create_temp (&tfp, NULL);
3746 if (1 < buf.nlines)
3748 struct merge_node_queue merge_queue;
3749 queue_init (&merge_queue, 2 * nthreads);
3751 pthread_spinlock_t lock;
3752 pthread_spin_init (&lock, PTHREAD_PROCESS_PRIVATE);
3753 struct merge_node node =
3754 {NULL, NULL, NULL, NULL, NULL, buf.nlines,
3755 buf.nlines, MERGE_END, NULL, false, &lock};
3757 sortlines (line, line, nthreads, buf.nlines, &node, true,
3758 &merge_queue, tfp, temp_output);
3759 queue_destroy (&merge_queue);
3761 else
3762 write_unique (line - 1, tfp, temp_output);
3764 xfclose (tfp, temp_output);
3766 /* Free up some resources every once in a while. */
3767 if (MAX_PROCS_BEFORE_REAP < nprocs)
3768 reap_some ();
3770 if (output_file_created)
3771 goto finish;
3773 xfclose (fp, file);
3776 finish:
3777 free (buf.buf);
3779 if (! output_file_created)
3781 size_t i;
3782 struct tempnode *node = temphead;
3783 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3784 for (i = 0; node; i++)
3786 tempfiles[i].name = node->name;
3787 tempfiles[i].pid = node->pid;
3788 node = node->next;
3790 merge (tempfiles, ntemps, ntemps, output_file);
3791 free (tempfiles);
3795 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3797 static void
3798 insertkey (struct keyfield *key_arg)
3800 struct keyfield **p;
3801 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3803 for (p = &keylist; *p; p = &(*p)->next)
3804 continue;
3805 *p = key;
3806 key->next = NULL;
3809 /* Report a bad field specification SPEC, with extra info MSGID. */
3811 static void badfieldspec (char const *, char const *)
3812 ATTRIBUTE_NORETURN;
3813 static void
3814 badfieldspec (char const *spec, char const *msgid)
3816 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3817 _(msgid), quote (spec));
3818 abort ();
3821 /* Report incompatible options. */
3823 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3824 static void
3825 incompatible_options (char const *opts)
3827 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
3828 abort ();
3831 /* Check compatibility of ordering options. */
3833 static void
3834 check_ordering_compatibility (void)
3836 struct keyfield *key;
3838 for (key = keylist; key; key = key->next)
3839 if ((1 < (key->random + key->numeric + key->general_numeric + key->month
3840 + key->version + !!key->ignore + key->human_numeric))
3841 || (key->random && key->translate))
3843 /* The following is too big, but guaranteed to be "big enough". */
3844 char opts[sizeof short_options];
3845 /* Clear flags we're not interested in. */
3846 key->skipsblanks = key->skipeblanks = key->reverse = false;
3847 key_to_opts (key, opts);
3848 incompatible_options (opts);
3852 /* Parse the leading integer in STRING and store the resulting value
3853 (which must fit into size_t) into *VAL. Return the address of the
3854 suffix after the integer. If the value is too large, silently
3855 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3856 failure; otherwise, report MSGID and exit on failure. */
3858 static char const *
3859 parse_field_count (char const *string, size_t *val, char const *msgid)
3861 char *suffix;
3862 uintmax_t n;
3864 switch (xstrtoumax (string, &suffix, 10, &n, ""))
3866 case LONGINT_OK:
3867 case LONGINT_INVALID_SUFFIX_CHAR:
3868 *val = n;
3869 if (*val == n)
3870 break;
3871 /* Fall through. */
3872 case LONGINT_OVERFLOW:
3873 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3874 *val = SIZE_MAX;
3875 break;
3877 case LONGINT_INVALID:
3878 if (msgid)
3879 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3880 _(msgid), quote (string));
3881 return NULL;
3884 return suffix;
3887 /* Handle interrupts and hangups. */
3889 static void
3890 sighandler (int sig)
3892 if (! SA_NOCLDSTOP)
3893 signal (sig, SIG_IGN);
3895 cleanup ();
3897 signal (sig, SIG_DFL);
3898 raise (sig);
3901 /* Set the ordering options for KEY specified in S.
3902 Return the address of the first character in S that
3903 is not a valid ordering option.
3904 BLANKTYPE is the kind of blanks that 'b' should skip. */
3906 static char *
3907 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
3909 while (*s)
3911 switch (*s)
3913 case 'b':
3914 if (blanktype == bl_start || blanktype == bl_both)
3915 key->skipsblanks = true;
3916 if (blanktype == bl_end || blanktype == bl_both)
3917 key->skipeblanks = true;
3918 break;
3919 case 'd':
3920 key->ignore = nondictionary;
3921 break;
3922 case 'f':
3923 key->translate = fold_toupper;
3924 break;
3925 case 'g':
3926 key->general_numeric = true;
3927 break;
3928 case 'h':
3929 key->human_numeric = true;
3930 break;
3931 case 'i':
3932 /* Option order should not matter, so don't let -i override
3933 -d. -d implies -i, but -i does not imply -d. */
3934 if (! key->ignore)
3935 key->ignore = nonprinting;
3936 break;
3937 case 'M':
3938 key->month = true;
3939 break;
3940 case 'n':
3941 key->numeric = true;
3942 break;
3943 case 'R':
3944 key->random = true;
3945 break;
3946 case 'r':
3947 key->reverse = true;
3948 break;
3949 case 'V':
3950 key->version = true;
3951 break;
3952 default:
3953 return (char *) s;
3955 ++s;
3957 return (char *) s;
3960 static struct keyfield *
3961 key_init (struct keyfield *key)
3963 memset (key, 0, sizeof *key);
3964 key->eword = SIZE_MAX;
3965 return key;
3969 main (int argc, char **argv)
3971 struct keyfield *key;
3972 struct keyfield key_buf;
3973 struct keyfield gkey;
3974 bool gkey_only = false;
3975 char const *s;
3976 int c = 0;
3977 char checkonly = 0;
3978 bool mergeonly = false;
3979 char *random_source = NULL;
3980 bool need_random = false;
3981 unsigned long int nthreads = 0;
3982 size_t nfiles = 0;
3983 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
3984 bool obsolete_usage = (posix2_version () < 200112);
3985 char **files;
3986 char *files_from = NULL;
3987 struct Tokens tok;
3988 char const *outfile = NULL;
3990 initialize_main (&argc, &argv);
3991 set_program_name (argv[0]);
3992 setlocale (LC_ALL, "");
3993 bindtextdomain (PACKAGE, LOCALEDIR);
3994 textdomain (PACKAGE);
3996 initialize_exit_failure (SORT_FAILURE);
3998 hard_LC_COLLATE = hard_locale (LC_COLLATE);
3999 #if HAVE_NL_LANGINFO
4000 hard_LC_TIME = hard_locale (LC_TIME);
4001 #endif
4003 /* Get locale's representation of the decimal point. */
4005 struct lconv const *locale = localeconv ();
4007 /* If the locale doesn't define a decimal point, or if the decimal
4008 point is multibyte, use the C locale's decimal point. FIXME:
4009 add support for multibyte decimal points. */
4010 decimal_point = to_uchar (locale->decimal_point[0]);
4011 if (! decimal_point || locale->decimal_point[1])
4012 decimal_point = '.';
4014 /* FIXME: add support for multibyte thousands separators. */
4015 thousands_sep = to_uchar (*locale->thousands_sep);
4016 if (! thousands_sep || locale->thousands_sep[1])
4017 thousands_sep = -1;
4020 have_read_stdin = false;
4021 inittables ();
4024 size_t i;
4025 static int const sig[] =
4027 /* The usual suspects. */
4028 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4029 #ifdef SIGPOLL
4030 SIGPOLL,
4031 #endif
4032 #ifdef SIGPROF
4033 SIGPROF,
4034 #endif
4035 #ifdef SIGVTALRM
4036 SIGVTALRM,
4037 #endif
4038 #ifdef SIGXCPU
4039 SIGXCPU,
4040 #endif
4041 #ifdef SIGXFSZ
4042 SIGXFSZ,
4043 #endif
4045 enum { nsigs = ARRAY_CARDINALITY (sig) };
4047 #if SA_NOCLDSTOP
4048 struct sigaction act;
4050 sigemptyset (&caught_signals);
4051 for (i = 0; i < nsigs; i++)
4053 sigaction (sig[i], NULL, &act);
4054 if (act.sa_handler != SIG_IGN)
4055 sigaddset (&caught_signals, sig[i]);
4058 act.sa_handler = sighandler;
4059 act.sa_mask = caught_signals;
4060 act.sa_flags = 0;
4062 for (i = 0; i < nsigs; i++)
4063 if (sigismember (&caught_signals, sig[i]))
4064 sigaction (sig[i], &act, NULL);
4065 #else
4066 for (i = 0; i < nsigs; i++)
4067 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4069 signal (sig[i], sighandler);
4070 siginterrupt (sig[i], 1);
4072 #endif
4074 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4076 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4077 atexit (exit_cleanup);
4079 key_init (&gkey);
4080 gkey.sword = SIZE_MAX;
4082 files = xnmalloc (argc, sizeof *files);
4084 while (true)
4086 /* Parse an operand as a file after "--" was seen; or if
4087 pedantic and a file was seen, unless the POSIX version
4088 predates 1003.1-2001 and -c was not seen and the operand is
4089 "-o FILE" or "-oFILE". */
4090 int oi = -1;
4092 if (c == -1
4093 || (posixly_correct && nfiles != 0
4094 && ! (obsolete_usage
4095 && ! checkonly
4096 && optind != argc
4097 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4098 && (argv[optind][2] || optind + 1 != argc)))
4099 || ((c = getopt_long (argc, argv, short_options,
4100 long_options, &oi))
4101 == -1))
4103 if (argc <= optind)
4104 break;
4105 files[nfiles++] = argv[optind++];
4107 else switch (c)
4109 case 1:
4110 key = NULL;
4111 if (optarg[0] == '+')
4113 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4114 && ISDIGIT (argv[optind][1]));
4115 obsolete_usage |= minus_pos_usage && !posixly_correct;
4116 if (obsolete_usage)
4118 /* Treat +POS1 [-POS2] as a key if possible; but silently
4119 treat an operand as a file if it is not a valid +POS1. */
4120 key = key_init (&key_buf);
4121 s = parse_field_count (optarg + 1, &key->sword, NULL);
4122 if (s && *s == '.')
4123 s = parse_field_count (s + 1, &key->schar, NULL);
4124 if (! (key->sword || key->schar))
4125 key->sword = SIZE_MAX;
4126 if (! s || *set_ordering (s, key, bl_start))
4127 key = NULL;
4128 else
4130 if (minus_pos_usage)
4132 char const *optarg1 = argv[optind++];
4133 s = parse_field_count (optarg1 + 1, &key->eword,
4134 N_("invalid number after `-'"));
4135 if (*s == '.')
4136 s = parse_field_count (s + 1, &key->echar,
4137 N_("invalid number after `.'"));
4138 if (!key->echar && key->eword)
4140 /* obsolescent syntax +A.x -B.y is equivalent to:
4141 -k A+1.x+1,B.y (when y = 0)
4142 -k A+1.x+1,B+1.y (when y > 0)
4143 So eword is decremented as in the -k case
4144 only when the end field (B) is specified and
4145 echar (y) is 0. */
4146 key->eword--;
4148 if (*set_ordering (s, key, bl_end))
4149 badfieldspec (optarg1,
4150 N_("stray character in field spec"));
4152 key->obsolete_used = true;
4153 insertkey (key);
4157 if (! key)
4158 files[nfiles++] = optarg;
4159 break;
4161 case SORT_OPTION:
4162 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4163 /* Fall through. */
4164 case 'b':
4165 case 'd':
4166 case 'f':
4167 case 'g':
4168 case 'h':
4169 case 'i':
4170 case 'M':
4171 case 'n':
4172 case 'r':
4173 case 'R':
4174 case 'V':
4176 char str[2];
4177 str[0] = c;
4178 str[1] = '\0';
4179 set_ordering (str, &gkey, bl_both);
4181 break;
4183 case CHECK_OPTION:
4184 c = (optarg
4185 ? XARGMATCH ("--check", optarg, check_args, check_types)
4186 : 'c');
4187 /* Fall through. */
4188 case 'c':
4189 case 'C':
4190 if (checkonly && checkonly != c)
4191 incompatible_options ("cC");
4192 checkonly = c;
4193 break;
4195 case COMPRESS_PROGRAM_OPTION:
4196 if (compress_program && !STREQ (compress_program, optarg))
4197 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4198 compress_program = optarg;
4199 break;
4201 case DEBUG_PROGRAM_OPTION:
4202 debug = true;
4203 break;
4205 case FILES0_FROM_OPTION:
4206 files_from = optarg;
4207 break;
4209 case 'k':
4210 key = key_init (&key_buf);
4212 /* Get POS1. */
4213 s = parse_field_count (optarg, &key->sword,
4214 N_("invalid number at field start"));
4215 if (! key->sword--)
4217 /* Provoke with `sort -k0' */
4218 badfieldspec (optarg, N_("field number is zero"));
4220 if (*s == '.')
4222 s = parse_field_count (s + 1, &key->schar,
4223 N_("invalid number after `.'"));
4224 if (! key->schar--)
4226 /* Provoke with `sort -k1.0' */
4227 badfieldspec (optarg, N_("character offset is zero"));
4230 if (! (key->sword || key->schar))
4231 key->sword = SIZE_MAX;
4232 s = set_ordering (s, key, bl_start);
4233 if (*s != ',')
4235 key->eword = SIZE_MAX;
4236 key->echar = 0;
4238 else
4240 /* Get POS2. */
4241 s = parse_field_count (s + 1, &key->eword,
4242 N_("invalid number after `,'"));
4243 if (! key->eword--)
4245 /* Provoke with `sort -k1,0' */
4246 badfieldspec (optarg, N_("field number is zero"));
4248 if (*s == '.')
4250 s = parse_field_count (s + 1, &key->echar,
4251 N_("invalid number after `.'"));
4253 s = set_ordering (s, key, bl_end);
4255 if (*s)
4256 badfieldspec (optarg, N_("stray character in field spec"));
4257 insertkey (key);
4258 break;
4260 case 'm':
4261 mergeonly = true;
4262 break;
4264 case NMERGE_OPTION:
4265 specify_nmerge (oi, c, optarg);
4266 break;
4268 case 'o':
4269 if (outfile && !STREQ (outfile, optarg))
4270 error (SORT_FAILURE, 0, _("multiple output files specified"));
4271 outfile = optarg;
4272 break;
4274 case RANDOM_SOURCE_OPTION:
4275 if (random_source && !STREQ (random_source, optarg))
4276 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4277 random_source = optarg;
4278 break;
4280 case 's':
4281 stable = true;
4282 break;
4284 case 'S':
4285 specify_sort_size (oi, c, optarg);
4286 break;
4288 case 't':
4290 char newtab = optarg[0];
4291 if (! newtab)
4292 error (SORT_FAILURE, 0, _("empty tab"));
4293 if (optarg[1])
4295 if (STREQ (optarg, "\\0"))
4296 newtab = '\0';
4297 else
4299 /* Provoke with `sort -txx'. Complain about
4300 "multi-character tab" instead of "multibyte tab", so
4301 that the diagnostic's wording does not need to be
4302 changed once multibyte characters are supported. */
4303 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4304 quote (optarg));
4307 if (tab != TAB_DEFAULT && tab != newtab)
4308 error (SORT_FAILURE, 0, _("incompatible tabs"));
4309 tab = newtab;
4311 break;
4313 case 'T':
4314 add_temp_dir (optarg);
4315 break;
4317 case PARALLEL_OPTION:
4318 nthreads = specify_nthreads (oi, c, optarg);
4319 break;
4321 case 'u':
4322 unique = true;
4323 break;
4325 case 'y':
4326 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4327 through Solaris 7. It is also accepted by many non-Solaris
4328 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4329 -y is marked as obsolete starting with Solaris 8 (1999), but is
4330 still accepted as of Solaris 10 prerelease (2004).
4332 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4333 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4334 and which in general ignores the argument after "-y" if it
4335 consists entirely of digits (it can even be empty). */
4336 if (optarg == argv[optind - 1])
4338 char const *p;
4339 for (p = optarg; ISDIGIT (*p); p++)
4340 continue;
4341 optind -= (*p != '\0');
4343 break;
4345 case 'z':
4346 eolchar = 0;
4347 break;
4349 case_GETOPT_HELP_CHAR;
4351 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4353 default:
4354 usage (SORT_FAILURE);
4358 if (files_from)
4360 FILE *stream;
4362 /* When using --files0-from=F, you may not specify any files
4363 on the command-line. */
4364 if (nfiles)
4366 error (0, 0, _("extra operand %s"), quote (files[0]));
4367 fprintf (stderr, "%s\n",
4368 _("file operands cannot be combined with --files0-from"));
4369 usage (SORT_FAILURE);
4372 if (STREQ (files_from, "-"))
4373 stream = stdin;
4374 else
4376 stream = fopen (files_from, "r");
4377 if (stream == NULL)
4378 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4379 quote (files_from));
4382 readtokens0_init (&tok);
4384 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4385 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4386 quote (files_from));
4388 if (tok.n_tok)
4390 size_t i;
4391 free (files);
4392 files = tok.tok;
4393 nfiles = tok.n_tok;
4394 for (i = 0; i < nfiles; i++)
4396 if (STREQ (files[i], "-"))
4397 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4398 "no file name of %s allowed"),
4399 quote (files[i]));
4400 else if (files[i][0] == '\0')
4402 /* Using the standard `filename:line-number:' prefix here is
4403 not totally appropriate, since NUL is the separator, not NL,
4404 but it might be better than nothing. */
4405 unsigned long int file_number = i + 1;
4406 error (SORT_FAILURE, 0,
4407 _("%s:%lu: invalid zero-length file name"),
4408 quotearg_colon (files_from), file_number);
4412 else
4413 error (SORT_FAILURE, 0, _("no input from %s"),
4414 quote (files_from));
4417 /* Inheritance of global options to individual keys. */
4418 for (key = keylist; key; key = key->next)
4420 if (default_key_compare (key) && !key->reverse)
4422 key->ignore = gkey.ignore;
4423 key->translate = gkey.translate;
4424 key->skipsblanks = gkey.skipsblanks;
4425 key->skipeblanks = gkey.skipeblanks;
4426 key->month = gkey.month;
4427 key->numeric = gkey.numeric;
4428 key->general_numeric = gkey.general_numeric;
4429 key->human_numeric = gkey.human_numeric;
4430 key->version = gkey.version;
4431 key->random = gkey.random;
4432 key->reverse = gkey.reverse;
4435 need_random |= key->random;
4438 if (!keylist && !default_key_compare (&gkey))
4440 gkey_only = true;
4441 insertkey (&gkey);
4442 need_random |= gkey.random;
4445 check_ordering_compatibility ();
4447 /* Disable this combination so that users are less likely
4448 to inadvertantly update a file with debugging enabled.
4449 Also it simplifies the code for handling temp files. */
4450 if (debug && outfile)
4451 error (SORT_FAILURE, 0, _("options -o and --debug are incompatible"));
4453 if (debug)
4455 /* Always output the locale in debug mode, since this
4456 is such a common source of confusion. */
4457 if (hard_LC_COLLATE)
4458 error (0, 0, _("using %s sorting rules"),
4459 quote (setlocale (LC_COLLATE, NULL)));
4460 else
4461 error (0, 0, _("using simple byte comparison"));
4462 key_warnings (&gkey, gkey_only);
4465 reverse = gkey.reverse;
4467 if (need_random)
4468 random_md5_state_init (random_source);
4470 if (temp_dir_count == 0)
4472 char const *tmp_dir = getenv ("TMPDIR");
4473 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4476 if (nfiles == 0)
4478 static char *minus = (char *) "-";
4479 nfiles = 1;
4480 free (files);
4481 files = &minus;
4484 /* Need to re-check that we meet the minimum requirement for memory
4485 usage with the final value for NMERGE. */
4486 if (0 < sort_size)
4487 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4489 if (checkonly)
4491 if (nfiles > 1)
4492 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4493 quote (files[1]), checkonly);
4495 if (outfile)
4497 static char opts[] = {0, 'o', 0};
4498 opts[0] = checkonly;
4499 incompatible_options (opts);
4502 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4503 input is not properly sorted. */
4504 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4507 if (mergeonly)
4509 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4510 size_t i;
4512 for (i = 0; i < nfiles; ++i)
4513 sortfiles[i].name = files[i];
4515 merge (sortfiles, 0, nfiles, outfile);
4516 IF_LINT (free (sortfiles));
4518 else
4520 /* If NTHREADS > number of cores on the machine, spinlocking
4521 could be wasteful. */
4522 unsigned long int np2 = num_processors (NPROC_CURRENT_OVERRIDABLE);
4523 if (!nthreads || nthreads > np2)
4524 nthreads = np2;
4526 sort (files, nfiles, outfile, nthreads);
4529 if (have_read_stdin && fclose (stdin) == EOF)
4530 die (_("close failed"), "-");
4532 exit (EXIT_SUCCESS);