timeout: add the --kill-after option
[coreutils/ericb.git] / src / sort.c
blob5a937dd31f9fd3af499f6bdfecc8f358b022a935
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <sys/types.h>
27 #include <sys/wait.h>
28 #include <signal.h>
29 #include "system.h"
30 #include "argmatch.h"
31 #include "error.h"
32 #include "filevercmp.h"
33 #include "hard-locale.h"
34 #include "hash.h"
35 #include "ignore-value.h"
36 #include "md5.h"
37 #include "physmem.h"
38 #include "posixver.h"
39 #include "quote.h"
40 #include "quotearg.h"
41 #include "randread.h"
42 #include "readtokens0.h"
43 #include "stdio--.h"
44 #include "stdlib--.h"
45 #include "strnumcmp.h"
46 #include "xmemcoll.h"
47 #include "xmemxfrm.h"
48 #include "xnanosleep.h"
49 #include "xstrtol.h"
51 #if HAVE_SYS_RESOURCE_H
52 # include <sys/resource.h>
53 #endif
54 #ifndef RLIMIT_DATA
55 struct rlimit { size_t rlim_cur; };
56 # define getrlimit(Resource, Rlp) (-1)
57 #endif
59 /* The official name of this program (e.g., no `g' prefix). */
60 #define PROGRAM_NAME "sort"
62 #define AUTHORS \
63 proper_name ("Mike Haertel"), \
64 proper_name ("Paul Eggert")
66 #if HAVE_LANGINFO_CODESET
67 # include <langinfo.h>
68 #endif
70 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
71 present. */
72 #ifndef SA_NOCLDSTOP
73 # define SA_NOCLDSTOP 0
74 /* No sigprocmask. Always 'return' zero. */
75 # define sigprocmask(How, Set, Oset) (0)
76 # define sigset_t int
77 # if ! HAVE_SIGINTERRUPT
78 # define siginterrupt(sig, flag) /* empty */
79 # endif
80 #endif
82 #if !defined OPEN_MAX && defined NR_OPEN
83 # define OPEN_MAX NR_OPEN
84 #endif
85 #if !defined OPEN_MAX
86 # define OPEN_MAX 20
87 #endif
89 #define UCHAR_LIM (UCHAR_MAX + 1)
91 #ifndef DEFAULT_TMPDIR
92 # define DEFAULT_TMPDIR "/tmp"
93 #endif
95 /* Exit statuses. */
96 enum
98 /* POSIX says to exit with status 1 if invoked with -c and the
99 input is not properly sorted. */
100 SORT_OUT_OF_ORDER = 1,
102 /* POSIX says any other irregular exit must exit with a status
103 code greater than 1. */
104 SORT_FAILURE = 2
107 enum
109 /* The number of times we should try to fork a compression process
110 (we retry if the fork call fails). We don't _need_ to compress
111 temp files, this is just to reduce disk access, so this number
112 can be small. Each retry doubles in duration. */
113 MAX_FORK_TRIES_COMPRESS = 4,
115 /* The number of times we should try to fork a decompression process.
116 If we can't fork a decompression process, we can't sort, so this
117 number should be big. Each retry doubles in duration. */
118 MAX_FORK_TRIES_DECOMPRESS = 9
121 /* The representation of the decimal point in the current locale. */
122 static int decimal_point;
124 /* Thousands separator; if -1, then there isn't one. */
125 static int thousands_sep;
127 /* Nonzero if the corresponding locales are hard. */
128 static bool hard_LC_COLLATE;
129 #if HAVE_NL_LANGINFO
130 static bool hard_LC_TIME;
131 #endif
133 #define NONZERO(x) ((x) != 0)
135 /* The kind of blanks for '-b' to skip in various options. */
136 enum blanktype { bl_start, bl_end, bl_both };
138 /* The character marking end of line. Default to \n. */
139 static char eolchar = '\n';
141 /* Lines are held in core as counted strings. */
142 struct line
144 char *text; /* Text of the line. */
145 size_t length; /* Length including final newline. */
146 char *keybeg; /* Start of first key. */
147 char *keylim; /* Limit of first key. */
150 /* Input buffers. */
151 struct buffer
153 char *buf; /* Dynamically allocated buffer,
154 partitioned into 3 regions:
155 - input data;
156 - unused area;
157 - an array of lines, in reverse order. */
158 size_t used; /* Number of bytes used for input data. */
159 size_t nlines; /* Number of lines in the line array. */
160 size_t alloc; /* Number of bytes allocated. */
161 size_t left; /* Number of bytes left from previous reads. */
162 size_t line_bytes; /* Number of bytes to reserve for each line. */
163 bool eof; /* An EOF has been read. */
166 struct keyfield
168 size_t sword; /* Zero-origin 'word' to start at. */
169 size_t schar; /* Additional characters to skip. */
170 size_t eword; /* Zero-origin first word after field. */
171 size_t echar; /* Additional characters in field. */
172 bool const *ignore; /* Boolean array of characters to ignore. */
173 char const *translate; /* Translation applied to characters. */
174 bool skipsblanks; /* Skip leading blanks when finding start. */
175 bool skipeblanks; /* Skip leading blanks when finding end. */
176 bool numeric; /* Flag for numeric comparison. Handle
177 strings of digits with optional decimal
178 point, but no exponential notation. */
179 bool random; /* Sort by random hash of key. */
180 bool general_numeric; /* Flag for general, numeric comparison.
181 Handle numbers in exponential notation. */
182 bool human_numeric; /* Flag for sorting by human readable
183 units with either SI xor IEC prefixes. */
184 int iec_present; /* Flag for checking for mixed SI and IEC. */
185 bool month; /* Flag for comparison by month name. */
186 bool reverse; /* Reverse the sense of comparison. */
187 bool version; /* sort by version number */
188 struct keyfield *next; /* Next keyfield to try. */
191 struct month
193 char const *name;
194 int val;
197 /* FIXME: None of these tables work with multibyte character sets.
198 Also, there are many other bugs when handling multibyte characters.
199 One way to fix this is to rewrite `sort' to use wide characters
200 internally, but doing this with good performance is a bit
201 tricky. */
203 /* Table of blanks. */
204 static bool blanks[UCHAR_LIM];
206 /* Table of non-printing characters. */
207 static bool nonprinting[UCHAR_LIM];
209 /* Table of non-dictionary characters (not letters, digits, or blanks). */
210 static bool nondictionary[UCHAR_LIM];
212 /* Translation table folding lower case to upper. */
213 static unsigned char fold_toupper[UCHAR_LIM];
215 #define MONTHS_PER_YEAR 12
217 /* Table mapping month names to integers.
218 Alphabetic order allows binary search. */
219 static struct month monthtab[] =
221 {"APR", 4},
222 {"AUG", 8},
223 {"DEC", 12},
224 {"FEB", 2},
225 {"JAN", 1},
226 {"JUL", 7},
227 {"JUN", 6},
228 {"MAR", 3},
229 {"MAY", 5},
230 {"NOV", 11},
231 {"OCT", 10},
232 {"SEP", 9}
235 /* During the merge phase, the number of files to merge at once. */
236 #define NMERGE_DEFAULT 16
238 /* Minimum size for a merge or check buffer. */
239 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
241 /* Minimum sort size; the code might not work with smaller sizes. */
242 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
244 /* The number of bytes needed for a merge or check buffer, which can
245 function relatively efficiently even if it holds only one line. If
246 a longer line is seen, this value is increased. */
247 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
249 /* The approximate maximum number of bytes of main memory to use, as
250 specified by the user. Zero if the user has not specified a size. */
251 static size_t sort_size;
253 /* The guessed size for non-regular files. */
254 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
256 /* Array of directory names in which any temporary files are to be created. */
257 static char const **temp_dirs;
259 /* Number of temporary directory names used. */
260 static size_t temp_dir_count;
262 /* Number of allocated slots in temp_dirs. */
263 static size_t temp_dir_alloc;
265 /* Flag to reverse the order of all comparisons. */
266 static bool reverse;
268 /* Flag for stable sort. This turns off the last ditch bytewise
269 comparison of lines, and instead leaves lines in the same order
270 they were read if all keys compare equal. */
271 static bool stable;
273 /* If TAB has this value, blanks separate fields. */
274 enum { TAB_DEFAULT = CHAR_MAX + 1 };
276 /* Tab character separating fields. If TAB_DEFAULT, then fields are
277 separated by the empty string between a non-blank character and a blank
278 character. */
279 static int tab = TAB_DEFAULT;
281 /* Flag to remove consecutive duplicate lines from the output.
282 Only the last of a sequence of equal lines will be output. */
283 static bool unique;
285 /* Nonzero if any of the input files are the standard input. */
286 static bool have_read_stdin;
288 /* List of key field comparisons to be tried. */
289 static struct keyfield *keylist;
291 /* Program used to (de)compress temp files. Must accept -d. */
292 static char const *compress_program;
294 /* Maximum number of files to merge in one go. If more than this
295 number are present, temp files will be used. */
296 static unsigned int nmerge = NMERGE_DEFAULT;
298 static void sortlines_temp (struct line *, size_t, struct line *);
300 /* Report MESSAGE for FILE, then clean up and exit.
301 If FILE is null, it represents standard output. */
303 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
304 static void
305 die (char const *message, char const *file)
307 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
308 exit (SORT_FAILURE);
311 void
312 usage (int status)
314 if (status != EXIT_SUCCESS)
315 fprintf (stderr, _("Try `%s --help' for more information.\n"),
316 program_name);
317 else
319 printf (_("\
320 Usage: %s [OPTION]... [FILE]...\n\
321 or: %s [OPTION]... --files0-from=F\n\
323 program_name, program_name);
324 fputs (_("\
325 Write sorted concatenation of all FILE(s) to standard output.\n\
327 "), stdout);
328 fputs (_("\
329 Mandatory arguments to long options are mandatory for short options too.\n\
330 "), stdout);
331 fputs (_("\
332 Ordering options:\n\
334 "), stdout);
335 fputs (_("\
336 -b, --ignore-leading-blanks ignore leading blanks\n\
337 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
338 -f, --ignore-case fold lower case to upper case characters\n\
339 "), stdout);
340 fputs (_("\
341 -g, --general-numeric-sort compare according to general numerical value\n\
342 -i, --ignore-nonprinting consider only printable characters\n\
343 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
344 "), stdout);
345 fputs (_("\
346 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
347 "), stdout);
348 fputs (_("\
349 -n, --numeric-sort compare according to string numerical value\n\
350 -R, --random-sort sort by random hash of keys\n\
351 --random-source=FILE get random bytes from FILE\n\
352 -r, --reverse reverse the result of comparisons\n\
353 "), stdout);
354 fputs (_("\
355 --sort=WORD sort according to WORD:\n\
356 general-numeric -g, human-numeric -h, month -M,\n\
357 numeric -n, random -R, version -V\n\
358 -V, --version-sort natural sort of (version) numbers within text\n\
360 "), stdout);
361 fputs (_("\
362 Other options:\n\
364 "), stdout);
365 fputs (_("\
366 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
367 for more use temp files\n\
368 "), stdout);
369 fputs (_("\
370 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
371 -C, --check=quiet, --check=silent like -c, but do not report first bad line\n\
372 --compress-program=PROG compress temporaries with PROG;\n\
373 decompress them with PROG -d\n\
374 --files0-from=F read input from the files specified by\n\
375 NUL-terminated names in file F;\n\
376 If F is - then read names from standard input\n\
377 "), stdout);
378 fputs (_("\
379 -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n\
380 (default end of line)\n\
381 -m, --merge merge already sorted files; do not sort\n\
382 "), stdout);
383 fputs (_("\
384 -o, --output=FILE write result to FILE instead of standard output\n\
385 -s, --stable stabilize sort by disabling last-resort comparison\n\
386 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
387 "), stdout);
388 printf (_("\
389 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
390 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
391 multiple options specify multiple directories\n\
392 -u, --unique with -c, check for strict ordering;\n\
393 without -c, output only the first of an equal run\n\
394 "), DEFAULT_TMPDIR);
395 fputs (_("\
396 -z, --zero-terminated end lines with 0 byte, not newline\n\
397 "), stdout);
398 fputs (HELP_OPTION_DESCRIPTION, stdout);
399 fputs (VERSION_OPTION_DESCRIPTION, stdout);
400 fputs (_("\
402 POS is F[.C][OPTS], where F is the field number and C the character position\n\
403 in the field; both are origin 1. If neither -t nor -b is in effect, characters\n\
404 in a field are counted from the beginning of the preceding whitespace. OPTS is\n\
405 one or more single-letter ordering options, which override global ordering\n\
406 options for that key. If no key is given, use the entire line as the key.\n\
408 SIZE may be followed by the following multiplicative suffixes:\n\
409 "), stdout);
410 fputs (_("\
411 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
413 With no FILE, or when FILE is -, read standard input.\n\
415 *** WARNING ***\n\
416 The locale specified by the environment affects sort order.\n\
417 Set LC_ALL=C to get the traditional sort order that uses\n\
418 native byte values.\n\
419 "), stdout );
420 emit_ancillary_info ();
423 exit (status);
426 /* For long options that have no equivalent short option, use a
427 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
428 enum
430 CHECK_OPTION = CHAR_MAX + 1,
431 COMPRESS_PROGRAM_OPTION,
432 FILES0_FROM_OPTION,
433 NMERGE_OPTION,
434 RANDOM_SOURCE_OPTION,
435 SORT_OPTION
438 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
440 static struct option const long_options[] =
442 {"ignore-leading-blanks", no_argument, NULL, 'b'},
443 {"check", optional_argument, NULL, CHECK_OPTION},
444 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
445 {"dictionary-order", no_argument, NULL, 'd'},
446 {"ignore-case", no_argument, NULL, 'f'},
447 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
448 {"general-numeric-sort", no_argument, NULL, 'g'},
449 {"ignore-nonprinting", no_argument, NULL, 'i'},
450 {"key", required_argument, NULL, 'k'},
451 {"merge", no_argument, NULL, 'm'},
452 {"month-sort", no_argument, NULL, 'M'},
453 {"numeric-sort", no_argument, NULL, 'n'},
454 {"human-numeric-sort", no_argument, NULL, 'h'},
455 {"version-sort", no_argument, NULL, 'V'},
456 {"random-sort", no_argument, NULL, 'R'},
457 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
458 {"sort", required_argument, NULL, SORT_OPTION},
459 {"output", required_argument, NULL, 'o'},
460 {"reverse", no_argument, NULL, 'r'},
461 {"stable", no_argument, NULL, 's'},
462 {"batch-size", required_argument, NULL, NMERGE_OPTION},
463 {"buffer-size", required_argument, NULL, 'S'},
464 {"field-separator", required_argument, NULL, 't'},
465 {"temporary-directory", required_argument, NULL, 'T'},
466 {"unique", no_argument, NULL, 'u'},
467 {"zero-terminated", no_argument, NULL, 'z'},
468 {GETOPT_HELP_OPTION_DECL},
469 {GETOPT_VERSION_OPTION_DECL},
470 {NULL, 0, NULL, 0},
473 #define CHECK_TABLE \
474 _ct_("quiet", 'C') \
475 _ct_("silent", 'C') \
476 _ct_("diagnose-first", 'c')
478 static char const *const check_args[] =
480 #define _ct_(_s, _c) _s,
481 CHECK_TABLE NULL
482 #undef _ct_
484 static char const check_types[] =
486 #define _ct_(_s, _c) _c,
487 CHECK_TABLE
488 #undef _ct_
491 #define SORT_TABLE \
492 _st_("general-numeric", 'g') \
493 _st_("human-numeric", 'h') \
494 _st_("month", 'M') \
495 _st_("numeric", 'n') \
496 _st_("random", 'R') \
497 _st_("version", 'V')
499 static char const *const sort_args[] =
501 #define _st_(_s, _c) _s,
502 SORT_TABLE NULL
503 #undef _st_
505 static char const sort_types[] =
507 #define _st_(_s, _c) _c,
508 SORT_TABLE
509 #undef _st_
512 /* The set of signals that are caught. */
513 static sigset_t caught_signals;
515 /* Critical section status. */
516 struct cs_status
518 bool valid;
519 sigset_t sigs;
522 /* Enter a critical section. */
523 static struct cs_status
524 cs_enter (void)
526 struct cs_status status;
527 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
528 return status;
531 /* Leave a critical section. */
532 static void
533 cs_leave (struct cs_status status)
535 if (status.valid)
537 /* Ignore failure when restoring the signal mask. */
538 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
542 /* The list of temporary files. */
543 struct tempnode
545 struct tempnode *volatile next;
546 pid_t pid; /* If compressed, the pid of compressor, else zero */
547 char name[1]; /* Actual size is 1 + file name length. */
549 static struct tempnode *volatile temphead;
550 static struct tempnode *volatile *temptail = &temphead;
552 struct sortfile
554 char const *name;
555 pid_t pid; /* If compressed, the pid of compressor, else zero */
558 /* A table where we store compression process states. We clean up all
559 processes in a timely manner so as not to exhaust system resources,
560 so we store the info on whether the process is still running, or has
561 been reaped here. */
562 static Hash_table *proctab;
564 enum { INIT_PROCTAB_SIZE = 47 };
566 enum procstate { ALIVE, ZOMBIE };
568 /* A proctab entry. The COUNT field is there in case we fork a new
569 compression process that has the same PID as an old zombie process
570 that is still in the table (because the process to decompress the
571 temp file it was associated with hasn't started yet). */
572 struct procnode
574 pid_t pid;
575 enum procstate state;
576 size_t count;
579 static size_t
580 proctab_hasher (const void *entry, size_t tabsize)
582 const struct procnode *node = entry;
583 return node->pid % tabsize;
586 static bool
587 proctab_comparator (const void *e1, const void *e2)
589 const struct procnode *n1 = e1, *n2 = e2;
590 return n1->pid == n2->pid;
593 /* The total number of forked processes (compressors and decompressors)
594 that have not been reaped yet. */
595 static size_t nprocs;
597 /* The number of child processes we'll allow before we try to reap some. */
598 enum { MAX_PROCS_BEFORE_REAP = 2 };
600 /* If 0 < PID, wait for the child process with that PID to exit.
601 If PID is -1, clean up a random child process which has finished and
602 return the process ID of that child. If PID is -1 and no processes
603 have quit yet, return 0 without waiting. */
605 static pid_t
606 reap (pid_t pid)
608 int status;
609 pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
611 if (cpid < 0)
612 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
613 compress_program);
614 else if (0 < cpid)
616 if (! WIFEXITED (status) || WEXITSTATUS (status))
617 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
618 compress_program);
619 --nprocs;
622 return cpid;
625 /* Add the PID of a running compression process to proctab, or update
626 the entry COUNT and STATE fields if it's already there. This also
627 creates the table for us the first time it's called. */
629 static void
630 register_proc (pid_t pid)
632 struct procnode test, *node;
634 if (! proctab)
636 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
637 proctab_hasher,
638 proctab_comparator,
639 free);
640 if (! proctab)
641 xalloc_die ();
644 test.pid = pid;
645 node = hash_lookup (proctab, &test);
646 if (node)
648 node->state = ALIVE;
649 ++node->count;
651 else
653 node = xmalloc (sizeof *node);
654 node->pid = pid;
655 node->state = ALIVE;
656 node->count = 1;
657 if (hash_insert (proctab, node) == NULL)
658 xalloc_die ();
662 /* This is called when we reap a random process. We don't know
663 whether we have reaped a compression process or a decompression
664 process until we look in the table. If there's an ALIVE entry for
665 it, then we have reaped a compression process, so change the state
666 to ZOMBIE. Otherwise, it's a decompression processes, so ignore it. */
668 static void
669 update_proc (pid_t pid)
671 struct procnode test, *node;
673 test.pid = pid;
674 node = hash_lookup (proctab, &test);
675 if (node)
676 node->state = ZOMBIE;
679 /* This is for when we need to wait for a compression process to exit.
680 If it has a ZOMBIE entry in the table then it's already dead and has
681 been reaped. Note that if there's an ALIVE entry for it, it still may
682 already have died and been reaped if a second process was created with
683 the same PID. This is probably exceedingly rare, but to be on the safe
684 side we will have to wait for any compression process with this PID. */
686 static void
687 wait_proc (pid_t pid)
689 struct procnode test, *node;
691 test.pid = pid;
692 node = hash_lookup (proctab, &test);
693 if (node->state == ALIVE)
694 reap (pid);
696 node->state = ZOMBIE;
697 if (! --node->count)
699 hash_delete (proctab, node);
700 free (node);
704 /* Keep reaping finished children as long as there are more to reap.
705 This doesn't block waiting for any of them, it only reaps those
706 that are already dead. */
708 static void
709 reap_some (void)
711 pid_t pid;
713 while (0 < nprocs && (pid = reap (-1)))
714 update_proc (pid);
717 /* Clean up any remaining temporary files. */
719 static void
720 cleanup (void)
722 struct tempnode const *node;
724 for (node = temphead; node; node = node->next)
725 unlink (node->name);
726 temphead = NULL;
729 /* Cleanup actions to take when exiting. */
731 static void
732 exit_cleanup (void)
734 if (temphead)
736 /* Clean up any remaining temporary files in a critical section so
737 that a signal handler does not try to clean them too. */
738 struct cs_status cs = cs_enter ();
739 cleanup ();
740 cs_leave (cs);
743 close_stdout ();
746 /* Create a new temporary file, returning its newly allocated tempnode.
747 Store into *PFD the file descriptor open for writing.
748 If the creation fails, return NULL and store -1 into *PFD if the
749 failure is due to file descriptor exhaustion and
750 SURVIVE_FD_EXHAUSTION; otherwise, die. */
752 static struct tempnode *
753 create_temp_file (int *pfd, bool survive_fd_exhaustion)
755 static char const slashbase[] = "/sortXXXXXX";
756 static size_t temp_dir_index;
757 int fd;
758 int saved_errno;
759 char const *temp_dir = temp_dirs[temp_dir_index];
760 size_t len = strlen (temp_dir);
761 struct tempnode *node =
762 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
763 char *file = node->name;
764 struct cs_status cs;
766 memcpy (file, temp_dir, len);
767 memcpy (file + len, slashbase, sizeof slashbase);
768 node->next = NULL;
769 node->pid = 0;
770 if (++temp_dir_index == temp_dir_count)
771 temp_dir_index = 0;
773 /* Create the temporary file in a critical section, to avoid races. */
774 cs = cs_enter ();
775 fd = mkstemp (file);
776 if (0 <= fd)
778 *temptail = node;
779 temptail = &node->next;
781 saved_errno = errno;
782 cs_leave (cs);
783 errno = saved_errno;
785 if (fd < 0)
787 if (! (survive_fd_exhaustion && errno == EMFILE))
788 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
789 quote (temp_dir));
790 free (node);
791 node = NULL;
794 *pfd = fd;
795 return node;
798 /* Predeclare an access pattern for input files.
799 Ignore any errors -- this is only advisory.
801 There are a few hints we could possibly provide,
802 and after careful testing it was decided that
803 specifying POSIX_FADV_SEQUENTIAL was not detrimental
804 to any cases. On Linux 2.6.31, this option doubles
805 the size of read ahead performed and thus was seen to
806 benefit these cases:
807 Merging
808 Sorting with a smaller internal buffer
809 Reading from faster flash devices
811 In _addition_ one could also specify other hints...
813 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
814 at least uses that to _synchronously_ prepopulate the cache
815 with the specified range. While sort does need to
816 read all of its input before outputting, a synchronous
817 read of the whole file up front precludes any processing
818 that sort could do in parallel with the system doing
819 read ahead of the data. This was seen to have negative effects
820 in a couple of cases:
821 Merging
822 Sorting with a smaller internal buffer
823 Note this option was seen to shorten the runtime for sort
824 on a multicore system with lots of RAM and other processes
825 competing for CPU. It could be argued that more explicit
826 scheduling hints with `nice` et. al. are more appropriate
827 for this situation.
829 POSIX_FADV_NOREUSE is a possibility as it could lower
830 the priority of input data in the cache as sort will
831 only need to process it once. However its functionality
832 has changed over Linux kernel versions and as of 2.6.31
833 it does nothing and thus we can't depend on what it might
834 do in future.
836 POSIX_FADV_DONTNEED is not appropriate for user specified
837 input files, but for temp files we do want to drop the
838 cache immediately after processing. This is done implicitly
839 however when the files are unlinked. */
841 static void
842 fadvise_input (FILE *fp)
844 #if HAVE_POSIX_FADVISE
845 if (fp)
847 int fd = fileno (fp);
848 ignore_value (posix_fadvise (fd, 0, 0, POSIX_FADV_SEQUENTIAL));
850 #endif
853 /* Return a stream for FILE, opened with mode HOW. A null FILE means
854 standard output; HOW should be "w". When opening for input, "-"
855 means standard input. To avoid confusion, do not return file
856 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
857 opening an ordinary FILE. Return NULL if unsuccessful. */
859 static FILE *
860 stream_open (const char *file, const char *how)
862 if (!file)
863 return stdout;
864 if (*how == 'r')
866 FILE *fp;
867 if (STREQ (file, "-"))
869 have_read_stdin = true;
870 fp = stdin;
872 else
873 fp = fopen (file, how);
874 fadvise_input (fp);
875 return fp;
877 return fopen (file, how);
880 /* Same as stream_open, except always return a non-null value; die on
881 failure. */
883 static FILE *
884 xfopen (const char *file, const char *how)
886 FILE *fp = stream_open (file, how);
887 if (!fp)
888 die (_("open failed"), file);
889 return fp;
892 /* Close FP, whose name is FILE, and report any errors. */
894 static void
895 xfclose (FILE *fp, char const *file)
897 switch (fileno (fp))
899 case STDIN_FILENO:
900 /* Allow reading stdin from tty more than once. */
901 if (feof (fp))
902 clearerr (fp);
903 break;
905 case STDOUT_FILENO:
906 /* Don't close stdout just yet. close_stdout does that. */
907 if (fflush (fp) != 0)
908 die (_("fflush failed"), file);
909 break;
911 default:
912 if (fclose (fp) != 0)
913 die (_("close failed"), file);
914 break;
918 static void
919 dup2_or_die (int oldfd, int newfd)
921 if (dup2 (oldfd, newfd) < 0)
922 error (SORT_FAILURE, errno, _("dup2 failed"));
925 /* Fork a child process for piping to and do common cleanup. The
926 TRIES parameter tells us how many times to try to fork before
927 giving up. Return the PID of the child, or -1 (setting errno)
928 on failure. */
930 static pid_t
931 pipe_fork (int pipefds[2], size_t tries)
933 #if HAVE_WORKING_FORK
934 struct tempnode *saved_temphead;
935 int saved_errno;
936 double wait_retry = 0.25;
937 pid_t pid IF_LINT (= -1);
938 struct cs_status cs;
940 if (pipe (pipefds) < 0)
941 return -1;
943 while (tries--)
945 /* This is so the child process won't delete our temp files
946 if it receives a signal before exec-ing. */
947 cs = cs_enter ();
948 saved_temphead = temphead;
949 temphead = NULL;
951 pid = fork ();
952 saved_errno = errno;
953 if (pid)
954 temphead = saved_temphead;
956 cs_leave (cs);
957 errno = saved_errno;
959 if (0 <= pid || errno != EAGAIN)
960 break;
961 else
963 xnanosleep (wait_retry);
964 wait_retry *= 2;
965 reap_some ();
969 if (pid < 0)
971 saved_errno = errno;
972 close (pipefds[0]);
973 close (pipefds[1]);
974 errno = saved_errno;
976 else if (pid == 0)
978 close (STDIN_FILENO);
979 close (STDOUT_FILENO);
981 else
982 ++nprocs;
984 return pid;
986 #else /* ! HAVE_WORKING_FORK */
987 return -1;
988 #endif
991 /* Create a temporary file and start a compression program to filter output
992 to that file. Set *PFP to the file handle and if PPID is non-NULL,
993 set *PPID to the PID of the newly-created process. If the creation
994 fails, return NULL if the failure is due to file descriptor
995 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
997 static char *
998 maybe_create_temp (FILE **pfp, pid_t *ppid, bool survive_fd_exhaustion)
1000 int tempfd;
1001 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1002 char *name;
1004 if (! node)
1005 return NULL;
1007 name = node->name;
1009 if (compress_program)
1011 int pipefds[2];
1013 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1014 if (0 < node->pid)
1016 close (tempfd);
1017 close (pipefds[0]);
1018 tempfd = pipefds[1];
1020 register_proc (node->pid);
1022 else if (node->pid == 0)
1024 close (pipefds[1]);
1025 dup2_or_die (tempfd, STDOUT_FILENO);
1026 close (tempfd);
1027 dup2_or_die (pipefds[0], STDIN_FILENO);
1028 close (pipefds[0]);
1030 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1031 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1032 compress_program);
1034 else
1035 node->pid = 0;
1038 *pfp = fdopen (tempfd, "w");
1039 if (! *pfp)
1040 die (_("couldn't create temporary file"), name);
1042 if (ppid)
1043 *ppid = node->pid;
1045 return name;
1048 /* Create a temporary file and start a compression program to filter output
1049 to that file. Set *PFP to the file handle and if *PPID is non-NULL,
1050 set it to the PID of the newly-created process. Die on failure. */
1052 static char *
1053 create_temp (FILE **pfp, pid_t *ppid)
1055 return maybe_create_temp (pfp, ppid, false);
1058 /* Open a compressed temp file and start a decompression process through
1059 which to filter the input. PID must be the valid processes ID of the
1060 process used to compress the file. Return NULL (setting errno to
1061 EMFILE) if we ran out of file descriptors, and die on any other
1062 kind of failure. */
1064 static FILE *
1065 open_temp (const char *name, pid_t pid)
1067 int tempfd, pipefds[2];
1068 FILE *fp = NULL;
1070 wait_proc (pid);
1072 tempfd = open (name, O_RDONLY);
1073 if (tempfd < 0)
1074 return NULL;
1076 switch (pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS))
1078 case -1:
1079 if (errno != EMFILE)
1080 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1081 compress_program);
1082 close (tempfd);
1083 errno = EMFILE;
1084 break;
1086 case 0:
1087 close (pipefds[0]);
1088 dup2_or_die (tempfd, STDIN_FILENO);
1089 close (tempfd);
1090 dup2_or_die (pipefds[1], STDOUT_FILENO);
1091 close (pipefds[1]);
1093 execlp (compress_program, compress_program, "-d", (char *) NULL);
1094 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1095 compress_program);
1097 default:
1098 close (tempfd);
1099 close (pipefds[1]);
1101 fp = fdopen (pipefds[0], "r");
1102 if (! fp)
1104 int saved_errno = errno;
1105 close (pipefds[0]);
1106 errno = saved_errno;
1108 break;
1111 return fp;
1114 static void
1115 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
1117 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
1118 die (_("write failed"), output_file);
1121 /* Append DIR to the array of temporary directory names. */
1122 static void
1123 add_temp_dir (char const *dir)
1125 if (temp_dir_count == temp_dir_alloc)
1126 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1128 temp_dirs[temp_dir_count++] = dir;
1131 /* Remove NAME from the list of temporary files. */
1133 static void
1134 zaptemp (const char *name)
1136 struct tempnode *volatile *pnode;
1137 struct tempnode *node;
1138 struct tempnode *next;
1139 int unlink_status;
1140 int unlink_errno = 0;
1141 struct cs_status cs;
1143 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1144 continue;
1146 /* Unlink the temporary file in a critical section to avoid races. */
1147 next = node->next;
1148 cs = cs_enter ();
1149 unlink_status = unlink (name);
1150 unlink_errno = errno;
1151 *pnode = next;
1152 cs_leave (cs);
1154 if (unlink_status != 0)
1155 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1156 if (! next)
1157 temptail = pnode;
1158 free (node);
1161 #if HAVE_NL_LANGINFO
1163 static int
1164 struct_month_cmp (const void *m1, const void *m2)
1166 struct month const *month1 = m1;
1167 struct month const *month2 = m2;
1168 return strcmp (month1->name, month2->name);
1171 #endif
1173 /* Initialize the character class tables. */
1175 static void
1176 inittables (void)
1178 size_t i;
1180 for (i = 0; i < UCHAR_LIM; ++i)
1182 blanks[i] = !! isblank (i);
1183 nonprinting[i] = ! isprint (i);
1184 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1185 fold_toupper[i] = toupper (i);
1188 #if HAVE_NL_LANGINFO
1189 /* If we're not in the "C" locale, read different names for months. */
1190 if (hard_LC_TIME)
1192 for (i = 0; i < MONTHS_PER_YEAR; i++)
1194 char const *s;
1195 size_t s_len;
1196 size_t j, k;
1197 char *name;
1199 s = (char *) nl_langinfo (ABMON_1 + i);
1200 s_len = strlen (s);
1201 monthtab[i].name = name = xmalloc (s_len + 1);
1202 monthtab[i].val = i + 1;
1204 for (j = k = 0; j < s_len; j++)
1205 if (! isblank (to_uchar (s[j])))
1206 name[k++] = fold_toupper[to_uchar (s[j])];
1207 name[k] = '\0';
1209 qsort ((void *) monthtab, MONTHS_PER_YEAR,
1210 sizeof *monthtab, struct_month_cmp);
1212 #endif
1215 /* Specify how many inputs may be merged at once.
1216 This may be set on the command-line with the
1217 --batch-size option. */
1218 static void
1219 specify_nmerge (int oi, char c, char const *s)
1221 uintmax_t n;
1222 struct rlimit rlimit;
1223 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1225 /* Try to find out how many file descriptors we'll be able
1226 to open. We need at least nmerge + 3 (STDIN_FILENO,
1227 STDOUT_FILENO and STDERR_FILENO). */
1228 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1229 ? rlimit.rlim_cur
1230 : OPEN_MAX)
1231 - 3);
1233 if (e == LONGINT_OK)
1235 nmerge = n;
1236 if (nmerge != n)
1237 e = LONGINT_OVERFLOW;
1238 else
1240 if (nmerge < 2)
1242 error (0, 0, _("invalid --%s argument %s"),
1243 long_options[oi].name, quote(s));
1244 error (SORT_FAILURE, 0,
1245 _("minimum --%s argument is %s"),
1246 long_options[oi].name, quote("2"));
1248 else if (max_nmerge < nmerge)
1250 e = LONGINT_OVERFLOW;
1252 else
1253 return;
1257 if (e == LONGINT_OVERFLOW)
1259 char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
1260 error (0, 0, _("--%s argument %s too large"),
1261 long_options[oi].name, quote(s));
1262 error (SORT_FAILURE, 0,
1263 _("maximum --%s argument with current rlimit is %s"),
1264 long_options[oi].name,
1265 uinttostr (max_nmerge, max_nmerge_buf));
1267 else
1268 xstrtol_fatal (e, oi, c, long_options, s);
1271 /* Specify the amount of main memory to use when sorting. */
1272 static void
1273 specify_sort_size (int oi, char c, char const *s)
1275 uintmax_t n;
1276 char *suffix;
1277 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1279 /* The default unit is KiB. */
1280 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1282 if (n <= UINTMAX_MAX / 1024)
1283 n *= 1024;
1284 else
1285 e = LONGINT_OVERFLOW;
1288 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1289 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1290 switch (suffix[0])
1292 case 'b':
1293 e = LONGINT_OK;
1294 break;
1296 case '%':
1298 double mem = physmem_total () * n / 100;
1300 /* Use "<", not "<=", to avoid problems with rounding. */
1301 if (mem < UINTMAX_MAX)
1303 n = mem;
1304 e = LONGINT_OK;
1306 else
1307 e = LONGINT_OVERFLOW;
1309 break;
1312 if (e == LONGINT_OK)
1314 /* If multiple sort sizes are specified, take the maximum, so
1315 that option order does not matter. */
1316 if (n < sort_size)
1317 return;
1319 sort_size = n;
1320 if (sort_size == n)
1322 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1323 return;
1326 e = LONGINT_OVERFLOW;
1329 xstrtol_fatal (e, oi, c, long_options, s);
1332 /* Return the default sort size. */
1333 static size_t
1334 default_sort_size (void)
1336 /* Let MEM be available memory or 1/8 of total memory, whichever
1337 is greater. */
1338 double avail = physmem_available ();
1339 double total = physmem_total ();
1340 double mem = MAX (avail, total / 8);
1341 struct rlimit rlimit;
1343 /* Let SIZE be MEM, but no more than the maximum object size or
1344 system resource limits. Avoid the MIN macro here, as it is not
1345 quite right when only one argument is floating point. Don't
1346 bother to check for values like RLIM_INFINITY since in practice
1347 they are not much less than SIZE_MAX. */
1348 size_t size = SIZE_MAX;
1349 if (mem < size)
1350 size = mem;
1351 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1352 size = rlimit.rlim_cur;
1353 #ifdef RLIMIT_AS
1354 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1355 size = rlimit.rlim_cur;
1356 #endif
1358 /* Leave a large safety margin for the above limits, as failure can
1359 occur when they are exceeded. */
1360 size /= 2;
1362 #ifdef RLIMIT_RSS
1363 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1364 Exceeding RSS is not fatal, but can be quite slow. */
1365 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1366 size = rlimit.rlim_cur / 16 * 15;
1367 #endif
1369 /* Use no less than the minimum. */
1370 return MAX (size, MIN_SORT_SIZE);
1373 /* Return the sort buffer size to use with the input files identified
1374 by FPS and FILES, which are alternate names of the same files.
1375 NFILES gives the number of input files; NFPS may be less. Assume
1376 that each input line requires LINE_BYTES extra bytes' worth of line
1377 information. Do not exceed the size bound specified by the user
1378 (or a default size bound, if the user does not specify one). */
1380 static size_t
1381 sort_buffer_size (FILE *const *fps, size_t nfps,
1382 char *const *files, size_t nfiles,
1383 size_t line_bytes)
1385 /* A bound on the input size. If zero, the bound hasn't been
1386 determined yet. */
1387 static size_t size_bound;
1389 /* In the worst case, each input byte is a newline. */
1390 size_t worst_case_per_input_byte = line_bytes + 1;
1392 /* Keep enough room for one extra input line and an extra byte.
1393 This extra room might be needed when preparing to read EOF. */
1394 size_t size = worst_case_per_input_byte + 1;
1396 size_t i;
1398 for (i = 0; i < nfiles; i++)
1400 struct stat st;
1401 off_t file_size;
1402 size_t worst_case;
1404 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1405 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1406 : stat (files[i], &st))
1407 != 0)
1408 die (_("stat failed"), files[i]);
1410 if (S_ISREG (st.st_mode))
1411 file_size = st.st_size;
1412 else
1414 /* The file has unknown size. If the user specified a sort
1415 buffer size, use that; otherwise, guess the size. */
1416 if (sort_size)
1417 return sort_size;
1418 file_size = INPUT_FILE_SIZE_GUESS;
1421 if (! size_bound)
1423 size_bound = sort_size;
1424 if (! size_bound)
1425 size_bound = default_sort_size ();
1428 /* Add the amount of memory needed to represent the worst case
1429 where the input consists entirely of newlines followed by a
1430 single non-newline. Check for overflow. */
1431 worst_case = file_size * worst_case_per_input_byte + 1;
1432 if (file_size != worst_case / worst_case_per_input_byte
1433 || size_bound - size <= worst_case)
1434 return size_bound;
1435 size += worst_case;
1438 return size;
1441 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1442 must be at least sizeof (struct line). Allocate ALLOC bytes
1443 initially. */
1445 static void
1446 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1448 /* Ensure that the line array is properly aligned. If the desired
1449 size cannot be allocated, repeatedly halve it until allocation
1450 succeeds. The smaller allocation may hurt overall performance,
1451 but that's better than failing. */
1452 for (;;)
1454 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1455 buf->buf = malloc (alloc);
1456 if (buf->buf)
1457 break;
1458 alloc /= 2;
1459 if (alloc <= line_bytes + 1)
1460 xalloc_die ();
1463 buf->line_bytes = line_bytes;
1464 buf->alloc = alloc;
1465 buf->used = buf->left = buf->nlines = 0;
1466 buf->eof = false;
1469 /* Return one past the limit of the line array. */
1471 static inline struct line *
1472 buffer_linelim (struct buffer const *buf)
1474 return (struct line *) (buf->buf + buf->alloc);
1477 /* Return a pointer to the first character of the field specified
1478 by KEY in LINE. */
1480 static char *
1481 begfield (const struct line *line, const struct keyfield *key)
1483 char *ptr = line->text, *lim = ptr + line->length - 1;
1484 size_t sword = key->sword;
1485 size_t schar = key->schar;
1487 /* The leading field separator itself is included in a field when -t
1488 is absent. */
1490 if (tab != TAB_DEFAULT)
1491 while (ptr < lim && sword--)
1493 while (ptr < lim && *ptr != tab)
1494 ++ptr;
1495 if (ptr < lim)
1496 ++ptr;
1498 else
1499 while (ptr < lim && sword--)
1501 while (ptr < lim && blanks[to_uchar (*ptr)])
1502 ++ptr;
1503 while (ptr < lim && !blanks[to_uchar (*ptr)])
1504 ++ptr;
1507 /* If we're ignoring leading blanks when computing the Start
1508 of the field, skip past them here. */
1509 if (key->skipsblanks)
1510 while (ptr < lim && blanks[to_uchar (*ptr)])
1511 ++ptr;
1513 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1514 ptr = MIN (lim, ptr + schar);
1516 return ptr;
1519 /* Return the limit of (a pointer to the first character after) the field
1520 in LINE specified by KEY. */
1522 static char *
1523 limfield (const struct line *line, const struct keyfield *key)
1525 char *ptr = line->text, *lim = ptr + line->length - 1;
1526 size_t eword = key->eword, echar = key->echar;
1528 if (echar == 0)
1529 eword++; /* Skip all of end field. */
1531 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1532 whichever comes first. If there are more than EWORD fields, leave
1533 PTR pointing at the beginning of the field having zero-based index,
1534 EWORD. If a delimiter character was specified (via -t), then that
1535 `beginning' is the first character following the delimiting TAB.
1536 Otherwise, leave PTR pointing at the first `blank' character after
1537 the preceding field. */
1538 if (tab != TAB_DEFAULT)
1539 while (ptr < lim && eword--)
1541 while (ptr < lim && *ptr != tab)
1542 ++ptr;
1543 if (ptr < lim && (eword || echar))
1544 ++ptr;
1546 else
1547 while (ptr < lim && eword--)
1549 while (ptr < lim && blanks[to_uchar (*ptr)])
1550 ++ptr;
1551 while (ptr < lim && !blanks[to_uchar (*ptr)])
1552 ++ptr;
1555 #ifdef POSIX_UNSPECIFIED
1556 /* The following block of code makes GNU sort incompatible with
1557 standard Unix sort, so it's ifdef'd out for now.
1558 The POSIX spec isn't clear on how to interpret this.
1559 FIXME: request clarification.
1561 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1562 Date: Thu, 30 May 96 12:20:41 -0400
1563 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1565 [...]I believe I've found another bug in `sort'.
1567 $ cat /tmp/sort.in
1568 a b c 2 d
1569 pq rs 1 t
1570 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1571 a b c 2 d
1572 pq rs 1 t
1573 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1574 pq rs 1 t
1575 a b c 2 d
1577 Unix sort produced the answer I expected: sort on the single character
1578 in column 7. GNU sort produced different results, because it disagrees
1579 on the interpretation of the key-end spec "M.N". Unix sort reads this
1580 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1581 "skip M-1 fields, then either N-1 characters or the rest of the current
1582 field, whichever comes first". This extra clause applies only to
1583 key-ends, not key-starts.
1586 /* Make LIM point to the end of (one byte past) the current field. */
1587 if (tab != TAB_DEFAULT)
1589 char *newlim;
1590 newlim = memchr (ptr, tab, lim - ptr);
1591 if (newlim)
1592 lim = newlim;
1594 else
1596 char *newlim;
1597 newlim = ptr;
1598 while (newlim < lim && blanks[to_uchar (*newlim)])
1599 ++newlim;
1600 while (newlim < lim && !blanks[to_uchar (*newlim)])
1601 ++newlim;
1602 lim = newlim;
1604 #endif
1606 if (echar != 0) /* We need to skip over a portion of the end field. */
1608 /* If we're ignoring leading blanks when computing the End
1609 of the field, skip past them here. */
1610 if (key->skipeblanks)
1611 while (ptr < lim && blanks[to_uchar (*ptr)])
1612 ++ptr;
1614 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1615 ptr = MIN (lim, ptr + echar);
1618 return ptr;
1621 /* Fill BUF reading from FP, moving buf->left bytes from the end
1622 of buf->buf to the beginning first. If EOF is reached and the
1623 file wasn't terminated by a newline, supply one. Set up BUF's line
1624 table too. FILE is the name of the file corresponding to FP.
1625 Return true if some input was read. */
1627 static bool
1628 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1630 struct keyfield const *key = keylist;
1631 char eol = eolchar;
1632 size_t line_bytes = buf->line_bytes;
1633 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1635 if (buf->eof)
1636 return false;
1638 if (buf->used != buf->left)
1640 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1641 buf->used = buf->left;
1642 buf->nlines = 0;
1645 for (;;)
1647 char *ptr = buf->buf + buf->used;
1648 struct line *linelim = buffer_linelim (buf);
1649 struct line *line = linelim - buf->nlines;
1650 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1651 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1653 while (line_bytes + 1 < avail)
1655 /* Read as many bytes as possible, but do not read so many
1656 bytes that there might not be enough room for the
1657 corresponding line array. The worst case is when the
1658 rest of the input file consists entirely of newlines,
1659 except that the last byte is not a newline. */
1660 size_t readsize = (avail - 1) / (line_bytes + 1);
1661 size_t bytes_read = fread (ptr, 1, readsize, fp);
1662 char *ptrlim = ptr + bytes_read;
1663 char *p;
1664 avail -= bytes_read;
1666 if (bytes_read != readsize)
1668 if (ferror (fp))
1669 die (_("read failed"), file);
1670 if (feof (fp))
1672 buf->eof = true;
1673 if (buf->buf == ptrlim)
1674 return false;
1675 if (ptrlim[-1] != eol)
1676 *ptrlim++ = eol;
1680 /* Find and record each line in the just-read input. */
1681 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1683 ptr = p + 1;
1684 line--;
1685 line->text = line_start;
1686 line->length = ptr - line_start;
1687 mergesize = MAX (mergesize, line->length);
1688 avail -= line_bytes;
1690 if (key)
1692 /* Precompute the position of the first key for
1693 efficiency. */
1694 line->keylim = (key->eword == SIZE_MAX
1696 : limfield (line, key));
1698 if (key->sword != SIZE_MAX)
1699 line->keybeg = begfield (line, key);
1700 else
1702 if (key->skipsblanks)
1703 while (blanks[to_uchar (*line_start)])
1704 line_start++;
1705 line->keybeg = line_start;
1709 line_start = ptr;
1712 ptr = ptrlim;
1713 if (buf->eof)
1714 break;
1717 buf->used = ptr - buf->buf;
1718 buf->nlines = buffer_linelim (buf) - line;
1719 if (buf->nlines != 0)
1721 buf->left = ptr - line_start;
1722 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1723 return true;
1727 /* The current input line is too long to fit in the buffer.
1728 Double the buffer size and try again, keeping it properly
1729 aligned. */
1730 size_t line_alloc = buf->alloc / sizeof (struct line);
1731 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1732 buf->alloc = line_alloc * sizeof (struct line);
1737 /* Compare strings A and B as numbers without explicitly converting them to
1738 machine numbers. Comparatively slow for short strings, but asymptotically
1739 hideously fast. */
1741 static int
1742 numcompare (const char *a, const char *b)
1744 while (blanks[to_uchar (*a)])
1745 a++;
1746 while (blanks[to_uchar (*b)])
1747 b++;
1749 return strnumcmp (a, b, decimal_point, thousands_sep);
1752 /* Exit with an error if a mixture of SI and IEC units detected. */
1754 static void
1755 check_mixed_SI_IEC (char prefix, struct keyfield *key)
1757 int iec_present = prefix == 'i';
1758 if (key->iec_present != -1 && iec_present != key->iec_present)
1759 error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
1760 key->iec_present = iec_present;
1763 /* Return an integer which represents the order of magnitude of
1764 the unit following the number. NUMBER can contain thousands separators
1765 or a decimal point, but not have preceeding blanks.
1766 Negative numbers return a negative unit order. */
1768 static int
1769 find_unit_order (const char *number, struct keyfield *key)
1771 static const char orders [UCHAR_LIM] =
1773 #if SOME_DAY_WE_WILL_REQUIRE_C99
1774 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1775 ['k']=1,
1776 #else
1777 /* Generate the following table with this command:
1778 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1779 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1780 |fmt */
1781 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1782 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1783 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1784 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1785 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1786 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1787 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1788 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1789 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1790 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1791 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1792 #endif
1795 const unsigned char *p = number;
1797 int sign = 1;
1799 if (*p == '-')
1801 sign = -1;
1802 p++;
1805 /* Scan to end of number.
1806 Decimals or separators not followed by digits stop the scan.
1807 Numbers ending in decimals or separators are thus considered
1808 to be lacking in units.
1809 FIXME: add support for multibyte thousands_sep and decimal_point. */
1811 while (ISDIGIT (*p))
1813 p++;
1815 if (*p == decimal_point && ISDIGIT (*(p + 1)))
1816 p += 2;
1817 else if (*p == thousands_sep && ISDIGIT (*(p + 1)))
1818 p += 2;
1821 int order = orders[*p];
1823 /* For valid units check for MiB vs MB etc. */
1824 if (order)
1825 check_mixed_SI_IEC (*(p + 1), key);
1827 return sign * order;
1830 /* Compare numbers ending in units with SI xor IEC prefixes
1831 <none/unknown> < K/k < M < G < T < P < E < Z < Y
1832 Assume that numbers are properly abbreviated.
1833 i.e. input will never have both 6000K and 5M. */
1835 static int
1836 human_numcompare (const char *a, const char *b, struct keyfield *key)
1838 while (blanks[to_uchar (*a)])
1839 a++;
1840 while (blanks[to_uchar (*b)])
1841 b++;
1843 int order_a = find_unit_order (a, key);
1844 int order_b = find_unit_order (b, key);
1846 return (order_a > order_b ? 1
1847 : order_a < order_b ? -1
1848 : strnumcmp (a, b, decimal_point, thousands_sep));
1851 static int
1852 general_numcompare (const char *sa, const char *sb)
1854 /* FIXME: add option to warn about failed conversions. */
1855 /* FIXME: maybe add option to try expensive FP conversion
1856 only if A and B can't be compared more cheaply/accurately. */
1858 char *ea;
1859 char *eb;
1860 double a = strtod (sa, &ea);
1861 double b = strtod (sb, &eb);
1863 /* Put conversion errors at the start of the collating sequence. */
1864 if (sa == ea)
1865 return sb == eb ? 0 : -1;
1866 if (sb == eb)
1867 return 1;
1869 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1870 conversion errors but before numbers; sort them by internal
1871 bit-pattern, for lack of a more portable alternative. */
1872 return (a < b ? -1
1873 : a > b ? 1
1874 : a == b ? 0
1875 : b == b ? -1
1876 : a == a ? 1
1877 : memcmp ((char *) &a, (char *) &b, sizeof a));
1880 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1881 Return 0 if the name in S is not recognized. */
1883 static int
1884 getmonth (char const *month, size_t len)
1886 size_t lo = 0;
1887 size_t hi = MONTHS_PER_YEAR;
1888 char const *monthlim = month + len;
1890 for (;;)
1892 if (month == monthlim)
1893 return 0;
1894 if (!blanks[to_uchar (*month)])
1895 break;
1896 ++month;
1901 size_t ix = (lo + hi) / 2;
1902 char const *m = month;
1903 char const *n = monthtab[ix].name;
1905 for (;; m++, n++)
1907 if (!*n)
1908 return monthtab[ix].val;
1909 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1911 hi = ix;
1912 break;
1914 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1916 lo = ix + 1;
1917 break;
1921 while (lo < hi);
1923 return 0;
1926 /* A source of random data. */
1927 static struct randread_source *randread_source;
1929 /* Return the Ith randomly-generated state. The caller must invoke
1930 random_state (H) for all H less than I before invoking random_state
1931 (I). */
1933 static struct md5_ctx
1934 random_state (size_t i)
1936 /* An array of states resulting from the random data, and counts of
1937 its used and allocated members. */
1938 static struct md5_ctx *state;
1939 static size_t used;
1940 static size_t allocated;
1942 struct md5_ctx *s = &state[i];
1944 if (used <= i)
1946 unsigned char buf[MD5_DIGEST_SIZE];
1948 used++;
1950 if (allocated <= i)
1952 state = X2NREALLOC (state, &allocated);
1953 s = &state[i];
1956 randread (randread_source, buf, sizeof buf);
1957 md5_init_ctx (s);
1958 md5_process_bytes (buf, sizeof buf, s);
1961 return *s;
1964 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1965 with length LENGTHB. Return negative if less, zero if equal,
1966 positive if greater. */
1968 static int
1969 cmp_hashes (char const *texta, size_t lena,
1970 char const *textb, size_t lenb)
1972 /* Try random hashes until a pair of hashes disagree. But if the
1973 first pair of random hashes agree, check whether the keys are
1974 identical and if so report no difference. */
1975 int diff;
1976 size_t i;
1977 for (i = 0; ; i++)
1979 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1980 struct md5_ctx s[2];
1981 s[0] = s[1] = random_state (i);
1982 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
1983 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
1984 diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1985 if (diff != 0)
1986 break;
1987 if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1988 break;
1991 return diff;
1994 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1995 using one or more random hash functions. */
1997 static int
1998 compare_random (char *restrict texta, size_t lena,
1999 char *restrict textb, size_t lenb)
2001 int diff;
2003 if (! hard_LC_COLLATE)
2004 diff = cmp_hashes (texta, lena, textb, lenb);
2005 else
2007 /* Transform the text into the basis of comparison, so that byte
2008 strings that would otherwise considered to be equal are
2009 considered equal here even if their bytes differ. */
2011 char *buf = NULL;
2012 char stackbuf[4000];
2013 size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
2014 bool a_fits = tlena <= sizeof stackbuf;
2015 size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
2016 (a_fits ? sizeof stackbuf - tlena : 0),
2017 textb, lenb);
2019 if (a_fits && tlena + tlenb <= sizeof stackbuf)
2020 buf = stackbuf;
2021 else
2023 /* Adding 1 to the buffer size lets xmemxfrm run a bit
2024 faster by avoiding the need for an extra buffer copy. */
2025 buf = xmalloc (tlena + tlenb + 1);
2026 xmemxfrm (buf, tlena + 1, texta, lena);
2027 xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
2030 diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
2032 if (buf != stackbuf)
2033 free (buf);
2036 return diff;
2039 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2040 using filevercmp. See lib/filevercmp.h for function description. */
2042 static int
2043 compare_version (char *restrict texta, size_t lena,
2044 char *restrict textb, size_t lenb)
2046 int diff;
2048 /* It is necessary to save the character after the end of the field.
2049 "filevercmp" works with NUL terminated strings. Our blocks of
2050 text are not necessarily terminated with a NUL byte. */
2051 char sv_a = texta[lena];
2052 char sv_b = textb[lenb];
2054 texta[lena] = '\0';
2055 textb[lenb] = '\0';
2057 diff = filevercmp (texta, textb);
2059 texta[lena] = sv_a;
2060 textb[lenb] = sv_b;
2062 return diff;
2065 /* Compare two lines A and B trying every key in sequence until there
2066 are no more keys or a difference is found. */
2068 static int
2069 keycompare (const struct line *a, const struct line *b)
2071 struct keyfield *key = keylist;
2073 /* For the first iteration only, the key positions have been
2074 precomputed for us. */
2075 char *texta = a->keybeg;
2076 char *textb = b->keybeg;
2077 char *lima = a->keylim;
2078 char *limb = b->keylim;
2080 int diff;
2082 for (;;)
2084 char const *translate = key->translate;
2085 bool const *ignore = key->ignore;
2087 /* Treat field ends before field starts as empty fields. */
2088 lima = MAX (texta, lima);
2089 limb = MAX (textb, limb);
2091 /* Find the lengths. */
2092 size_t lena = lima - texta;
2093 size_t lenb = limb - textb;
2095 /* Actually compare the fields. */
2097 if (key->random)
2098 diff = compare_random (texta, lena, textb, lenb);
2099 else if (key->numeric || key->general_numeric || key->human_numeric)
2101 char savea = *lima, saveb = *limb;
2103 *lima = *limb = '\0';
2104 diff = (key->numeric ? numcompare (texta, textb)
2105 : key->general_numeric ? general_numcompare (texta, textb)
2106 : human_numcompare (texta, textb, key));
2107 *lima = savea, *limb = saveb;
2109 else if (key->version)
2110 diff = compare_version (texta, lena, textb, lenb);
2111 else if (key->month)
2112 diff = getmonth (texta, lena) - getmonth (textb, lenb);
2113 /* Sorting like this may become slow, so in a simple locale the user
2114 can select a faster sort that is similar to ascii sort. */
2115 else if (hard_LC_COLLATE)
2117 if (ignore || translate)
2119 char buf[4000];
2120 size_t size = lena + 1 + lenb + 1;
2121 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
2122 char *copy_b = copy_a + lena + 1;
2123 size_t new_len_a, new_len_b, i;
2125 /* Ignore and/or translate chars before comparing. */
2126 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
2128 if (i < lena)
2130 copy_a[new_len_a] = (translate
2131 ? translate[to_uchar (texta[i])]
2132 : texta[i]);
2133 if (!ignore || !ignore[to_uchar (texta[i])])
2134 ++new_len_a;
2136 if (i < lenb)
2138 copy_b[new_len_b] = (translate
2139 ? translate[to_uchar (textb[i])]
2140 : textb [i]);
2141 if (!ignore || !ignore[to_uchar (textb[i])])
2142 ++new_len_b;
2146 diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
2148 if (sizeof buf < size)
2149 free (copy_a);
2151 else if (lena == 0)
2152 diff = - NONZERO (lenb);
2153 else if (lenb == 0)
2154 goto greater;
2155 else
2156 diff = xmemcoll (texta, lena, textb, lenb);
2158 else if (ignore)
2160 #define CMP_WITH_IGNORE(A, B) \
2161 do \
2163 for (;;) \
2165 while (texta < lima && ignore[to_uchar (*texta)]) \
2166 ++texta; \
2167 while (textb < limb && ignore[to_uchar (*textb)]) \
2168 ++textb; \
2169 if (! (texta < lima && textb < limb)) \
2170 break; \
2171 diff = to_uchar (A) - to_uchar (B); \
2172 if (diff) \
2173 goto not_equal; \
2174 ++texta; \
2175 ++textb; \
2178 diff = (texta < lima) - (textb < limb); \
2180 while (0)
2182 if (translate)
2183 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2184 translate[to_uchar (*textb)]);
2185 else
2186 CMP_WITH_IGNORE (*texta, *textb);
2188 else if (lena == 0)
2189 diff = - NONZERO (lenb);
2190 else if (lenb == 0)
2191 goto greater;
2192 else
2194 if (translate)
2196 while (texta < lima && textb < limb)
2198 diff = (to_uchar (translate[to_uchar (*texta++)])
2199 - to_uchar (translate[to_uchar (*textb++)]));
2200 if (diff)
2201 goto not_equal;
2204 else
2206 diff = memcmp (texta, textb, MIN (lena, lenb));
2207 if (diff)
2208 goto not_equal;
2210 diff = lena < lenb ? -1 : lena != lenb;
2213 if (diff)
2214 goto not_equal;
2216 key = key->next;
2217 if (! key)
2218 break;
2220 /* Find the beginning and limit of the next field. */
2221 if (key->eword != SIZE_MAX)
2222 lima = limfield (a, key), limb = limfield (b, key);
2223 else
2224 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2226 if (key->sword != SIZE_MAX)
2227 texta = begfield (a, key), textb = begfield (b, key);
2228 else
2230 texta = a->text, textb = b->text;
2231 if (key->skipsblanks)
2233 while (texta < lima && blanks[to_uchar (*texta)])
2234 ++texta;
2235 while (textb < limb && blanks[to_uchar (*textb)])
2236 ++textb;
2241 return 0;
2243 greater:
2244 diff = 1;
2245 not_equal:
2246 return key->reverse ? -diff : diff;
2249 /* Compare two lines A and B, returning negative, zero, or positive
2250 depending on whether A compares less than, equal to, or greater than B. */
2252 static int
2253 compare (const struct line *a, const struct line *b)
2255 int diff;
2256 size_t alen, blen;
2258 /* First try to compare on the specified keys (if any).
2259 The only two cases with no key at all are unadorned sort,
2260 and unadorned sort -r. */
2261 if (keylist)
2263 diff = keycompare (a, b);
2264 if (diff || unique || stable)
2265 return diff;
2268 /* If the keys all compare equal (or no keys were specified)
2269 fall through to the default comparison. */
2270 alen = a->length - 1, blen = b->length - 1;
2272 if (alen == 0)
2273 diff = - NONZERO (blen);
2274 else if (blen == 0)
2275 diff = 1;
2276 else if (hard_LC_COLLATE)
2277 diff = xmemcoll (a->text, alen, b->text, blen);
2278 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2279 diff = alen < blen ? -1 : alen != blen;
2281 return reverse ? -diff : diff;
2284 /* Check that the lines read from FILE_NAME come in order. Return
2285 true if they are in order. If CHECKONLY == 'c', also print a
2286 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2287 they are not in order. */
2289 static bool
2290 check (char const *file_name, char checkonly)
2292 FILE *fp = xfopen (file_name, "r");
2293 struct buffer buf; /* Input buffer. */
2294 struct line temp; /* Copy of previous line. */
2295 size_t alloc = 0;
2296 uintmax_t line_number = 0;
2297 struct keyfield const *key = keylist;
2298 bool nonunique = ! unique;
2299 bool ordered = true;
2301 initbuf (&buf, sizeof (struct line),
2302 MAX (merge_buffer_size, sort_size));
2303 temp.text = NULL;
2305 while (fillbuf (&buf, fp, file_name))
2307 struct line const *line = buffer_linelim (&buf);
2308 struct line const *linebase = line - buf.nlines;
2310 /* Make sure the line saved from the old buffer contents is
2311 less than or equal to the first line of the new buffer. */
2312 if (alloc && nonunique <= compare (&temp, line - 1))
2314 found_disorder:
2316 if (checkonly == 'c')
2318 struct line const *disorder_line = line - 1;
2319 uintmax_t disorder_line_number =
2320 buffer_linelim (&buf) - disorder_line + line_number;
2321 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
2322 fprintf (stderr, _("%s: %s:%s: disorder: "),
2323 program_name, file_name,
2324 umaxtostr (disorder_line_number, hr_buf));
2325 write_bytes (disorder_line->text, disorder_line->length,
2326 stderr, _("standard error"));
2329 ordered = false;
2330 break;
2334 /* Compare each line in the buffer with its successor. */
2335 while (linebase < --line)
2336 if (nonunique <= compare (line, line - 1))
2337 goto found_disorder;
2339 line_number += buf.nlines;
2341 /* Save the last line of the buffer. */
2342 if (alloc < line->length)
2346 alloc *= 2;
2347 if (! alloc)
2349 alloc = line->length;
2350 break;
2353 while (alloc < line->length);
2355 temp.text = xrealloc (temp.text, alloc);
2357 memcpy (temp.text, line->text, line->length);
2358 temp.length = line->length;
2359 if (key)
2361 temp.keybeg = temp.text + (line->keybeg - line->text);
2362 temp.keylim = temp.text + (line->keylim - line->text);
2366 xfclose (fp, file_name);
2367 free (buf.buf);
2368 free (temp.text);
2369 return ordered;
2372 /* Open FILES (there are NFILES of them) and store the resulting array
2373 of stream pointers into (*PFPS). Allocate the array. Return the
2374 number of successfully opened files, setting errno if this value is
2375 less than NFILES. */
2377 static size_t
2378 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2380 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2381 int i;
2383 /* Open as many input files as we can. */
2384 for (i = 0; i < nfiles; i++)
2386 fps[i] = (files[i].pid
2387 ? open_temp (files[i].name, files[i].pid)
2388 : stream_open (files[i].name, "r"));
2389 if (!fps[i])
2390 break;
2393 return i;
2396 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2397 files (all of which are at the start of the FILES array), and
2398 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2399 FPS is the vector of open stream corresponding to the files.
2400 Close input and output streams before returning.
2401 OUTPUT_FILE gives the name of the output file. If it is NULL,
2402 the output file is standard output. */
2404 static void
2405 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2406 FILE *ofp, char const *output_file, FILE **fps)
2408 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2409 /* Input buffers for each file. */
2410 struct line saved; /* Saved line storage for unique check. */
2411 struct line const *savedline = NULL;
2412 /* &saved if there is a saved line. */
2413 size_t savealloc = 0; /* Size allocated for the saved line. */
2414 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2415 /* Current line in each line table. */
2416 struct line const **base = xnmalloc (nfiles, sizeof *base);
2417 /* Base of each line table. */
2418 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2419 /* Table representing a permutation of fps,
2420 such that cur[ord[0]] is the smallest line
2421 and will be next output. */
2422 size_t i;
2423 size_t j;
2424 size_t t;
2425 struct keyfield const *key = keylist;
2426 saved.text = NULL;
2428 /* Read initial lines from each input file. */
2429 for (i = 0; i < nfiles; )
2431 initbuf (&buffer[i], sizeof (struct line),
2432 MAX (merge_buffer_size, sort_size / nfiles));
2433 if (fillbuf (&buffer[i], fps[i], files[i].name))
2435 struct line const *linelim = buffer_linelim (&buffer[i]);
2436 cur[i] = linelim - 1;
2437 base[i] = linelim - buffer[i].nlines;
2438 i++;
2440 else
2442 /* fps[i] is empty; eliminate it from future consideration. */
2443 xfclose (fps[i], files[i].name);
2444 if (i < ntemps)
2446 ntemps--;
2447 zaptemp (files[i].name);
2449 free (buffer[i].buf);
2450 --nfiles;
2451 for (j = i; j < nfiles; ++j)
2453 files[j] = files[j + 1];
2454 fps[j] = fps[j + 1];
2459 /* Set up the ord table according to comparisons among input lines.
2460 Since this only reorders two items if one is strictly greater than
2461 the other, it is stable. */
2462 for (i = 0; i < nfiles; ++i)
2463 ord[i] = i;
2464 for (i = 1; i < nfiles; ++i)
2465 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2466 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2468 /* Repeatedly output the smallest line until no input remains. */
2469 while (nfiles)
2471 struct line const *smallest = cur[ord[0]];
2473 /* If uniquified output is turned on, output only the first of
2474 an identical series of lines. */
2475 if (unique)
2477 if (savedline && compare (savedline, smallest))
2479 savedline = NULL;
2480 write_bytes (saved.text, saved.length, ofp, output_file);
2482 if (!savedline)
2484 savedline = &saved;
2485 if (savealloc < smallest->length)
2488 if (! savealloc)
2490 savealloc = smallest->length;
2491 break;
2493 while ((savealloc *= 2) < smallest->length);
2495 saved.text = xrealloc (saved.text, savealloc);
2497 saved.length = smallest->length;
2498 memcpy (saved.text, smallest->text, saved.length);
2499 if (key)
2501 saved.keybeg =
2502 saved.text + (smallest->keybeg - smallest->text);
2503 saved.keylim =
2504 saved.text + (smallest->keylim - smallest->text);
2508 else
2509 write_bytes (smallest->text, smallest->length, ofp, output_file);
2511 /* Check if we need to read more lines into core. */
2512 if (base[ord[0]] < smallest)
2513 cur[ord[0]] = smallest - 1;
2514 else
2516 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2518 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2519 cur[ord[0]] = linelim - 1;
2520 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2522 else
2524 /* We reached EOF on fps[ord[0]]. */
2525 for (i = 1; i < nfiles; ++i)
2526 if (ord[i] > ord[0])
2527 --ord[i];
2528 --nfiles;
2529 xfclose (fps[ord[0]], files[ord[0]].name);
2530 if (ord[0] < ntemps)
2532 ntemps--;
2533 zaptemp (files[ord[0]].name);
2535 free (buffer[ord[0]].buf);
2536 for (i = ord[0]; i < nfiles; ++i)
2538 fps[i] = fps[i + 1];
2539 files[i] = files[i + 1];
2540 buffer[i] = buffer[i + 1];
2541 cur[i] = cur[i + 1];
2542 base[i] = base[i + 1];
2544 for (i = 0; i < nfiles; ++i)
2545 ord[i] = ord[i + 1];
2546 continue;
2550 /* The new line just read in may be larger than other lines
2551 already in main memory; push it back in the queue until we
2552 encounter a line larger than it. Optimize for the common
2553 case where the new line is smallest. */
2555 size_t lo = 1;
2556 size_t hi = nfiles;
2557 size_t probe = lo;
2558 size_t ord0 = ord[0];
2559 size_t count_of_smaller_lines;
2561 while (lo < hi)
2563 int cmp = compare (cur[ord0], cur[ord[probe]]);
2564 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2565 hi = probe;
2566 else
2567 lo = probe + 1;
2568 probe = (lo + hi) / 2;
2571 count_of_smaller_lines = lo - 1;
2572 for (j = 0; j < count_of_smaller_lines; j++)
2573 ord[j] = ord[j + 1];
2574 ord[count_of_smaller_lines] = ord0;
2577 /* Free up some resources every once in a while. */
2578 if (MAX_PROCS_BEFORE_REAP < nprocs)
2579 reap_some ();
2582 if (unique && savedline)
2584 write_bytes (saved.text, saved.length, ofp, output_file);
2585 free (saved.text);
2588 xfclose (ofp, output_file);
2589 free(fps);
2590 free(buffer);
2591 free(ord);
2592 free(base);
2593 free(cur);
2596 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2597 files (all of which are at the start of the FILES array), and
2598 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2599 Close input and output files before returning.
2600 OUTPUT_FILE gives the name of the output file.
2602 Return the number of files successfully merged. This number can be
2603 less than NFILES if we ran low on file descriptors, but in this
2604 case it is never less than 2. */
2606 static size_t
2607 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
2608 FILE *ofp, char const *output_file)
2610 FILE **fps;
2611 size_t nopened = open_input_files (files, nfiles, &fps);
2612 if (nopened < nfiles && nopened < 2)
2613 die (_("open failed"), files[nopened].name);
2614 mergefps (files, ntemps, nopened, ofp, output_file, fps);
2615 return nopened;
2618 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2619 and HI (with NHI members). T, LO, and HI point just past their
2620 respective arrays, and the arrays are in reverse order. NLO and
2621 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
2623 static inline void
2624 mergelines (struct line *t,
2625 struct line const *lo, size_t nlo,
2626 struct line const *hi, size_t nhi)
2628 for (;;)
2629 if (compare (lo - 1, hi - 1) <= 0)
2631 *--t = *--lo;
2632 if (! --nlo)
2634 /* HI - NHI equalled T - (NLO + NHI) when this function
2635 began. Therefore HI must equal T now, and there is no
2636 need to copy from HI to T. */
2637 return;
2640 else
2642 *--t = *--hi;
2643 if (! --nhi)
2646 *--t = *--lo;
2647 while (--nlo);
2649 return;
2654 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2655 NLINES must be at least 2.
2656 The input and output arrays are in reverse order, and LINES and
2657 TEMP point just past the end of their respective arrays.
2659 Use a recursive divide-and-conquer algorithm, in the style
2660 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
2661 the optimization suggested by exercise 5.2.4-10; this requires room
2662 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
2663 writes that this memory optimization was originally published by
2664 D. A. Bell, Comp J. 1 (1958), 75. */
2666 static void
2667 sortlines (struct line *lines, size_t nlines, struct line *temp)
2669 if (nlines == 2)
2671 if (0 < compare (&lines[-1], &lines[-2]))
2673 struct line tmp = lines[-1];
2674 lines[-1] = lines[-2];
2675 lines[-2] = tmp;
2678 else
2680 size_t nlo = nlines / 2;
2681 size_t nhi = nlines - nlo;
2682 struct line *lo = lines;
2683 struct line *hi = lines - nlo;
2684 struct line *sorted_lo = temp;
2686 sortlines (hi, nhi, temp);
2687 if (1 < nlo)
2688 sortlines_temp (lo, nlo, sorted_lo);
2689 else
2690 sorted_lo[-1] = lo[-1];
2692 mergelines (lines, sorted_lo, nlo, hi, nhi);
2696 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2697 rather than sorting in place. */
2699 static void
2700 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
2702 if (nlines == 2)
2704 /* Declare `swap' as int, not bool, to work around a bug
2705 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2706 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
2707 int swap = (0 < compare (&lines[-1], &lines[-2]));
2708 temp[-1] = lines[-1 - swap];
2709 temp[-2] = lines[-2 + swap];
2711 else
2713 size_t nlo = nlines / 2;
2714 size_t nhi = nlines - nlo;
2715 struct line *lo = lines;
2716 struct line *hi = lines - nlo;
2717 struct line *sorted_hi = temp - nlo;
2719 sortlines_temp (hi, nhi, sorted_hi);
2720 if (1 < nlo)
2721 sortlines (lo, nlo, temp);
2723 mergelines (temp, lo, nlo, sorted_hi, nhi);
2727 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2728 the same as OUTFILE. If found, merge the found instances (and perhaps
2729 some other files) into a temporary file so that it can in turn be
2730 merged into OUTFILE without destroying OUTFILE before it is completely
2731 read. Return the new value of NFILES, which differs from the old if
2732 some merging occurred.
2734 This test ensures that an otherwise-erroneous use like
2735 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2736 It's not clear that POSIX requires this nicety.
2737 Detect common error cases, but don't try to catch obscure cases like
2738 "cat ... FILE ... | sort -m -o FILE"
2739 where traditional "sort" doesn't copy the input and where
2740 people should know that they're getting into trouble anyway.
2741 Catching these obscure cases would slow down performance in
2742 common cases. */
2744 static size_t
2745 avoid_trashing_input (struct sortfile *files, size_t ntemps,
2746 size_t nfiles, char const *outfile)
2748 size_t i;
2749 bool got_outstat = false;
2750 struct stat outstat;
2752 for (i = ntemps; i < nfiles; i++)
2754 bool is_stdin = STREQ (files[i].name, "-");
2755 bool same;
2756 struct stat instat;
2758 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
2759 same = true;
2760 else
2762 if (! got_outstat)
2764 if ((outfile
2765 ? stat (outfile, &outstat)
2766 : fstat (STDOUT_FILENO, &outstat))
2767 != 0)
2768 break;
2769 got_outstat = true;
2772 same = (((is_stdin
2773 ? fstat (STDIN_FILENO, &instat)
2774 : stat (files[i].name, &instat))
2775 == 0)
2776 && SAME_INODE (instat, outstat));
2779 if (same)
2781 FILE *tftp;
2782 pid_t pid;
2783 char *temp = create_temp (&tftp, &pid);
2784 size_t num_merged = 0;
2787 num_merged += mergefiles (&files[i], 0, nfiles - i, tftp, temp);
2788 files[i].name = temp;
2789 files[i].pid = pid;
2791 if (i + num_merged < nfiles)
2792 memmove(&files[i + 1], &files[i + num_merged],
2793 num_merged * sizeof *files);
2794 ntemps += 1;
2795 nfiles -= num_merged - 1;;
2796 i += num_merged;
2798 while (i < nfiles);
2802 return nfiles;
2805 /* Merge the input FILES. NTEMPS is the number of files at the
2806 start of FILES that are temporary; it is zero at the top level.
2807 NFILES is the total number of files. Put the output in
2808 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
2810 static void
2811 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
2812 char const *output_file)
2814 while (nmerge < nfiles)
2816 /* Number of input files processed so far. */
2817 size_t in;
2819 /* Number of output files generated so far. */
2820 size_t out;
2822 /* nfiles % NMERGE; this counts input files that are left over
2823 after all full-sized merges have been done. */
2824 size_t remainder;
2826 /* Number of easily-available slots at the next loop iteration. */
2827 size_t cheap_slots;
2829 /* Do as many NMERGE-size merges as possible. In the case that
2830 nmerge is bogus, increment by the maximum number of file
2831 descriptors allowed. */
2832 for (out = in = 0; nmerge <= nfiles - in; out++)
2834 FILE *tfp;
2835 pid_t pid;
2836 char *temp = create_temp (&tfp, &pid);
2837 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
2838 nmerge, tfp, temp);
2839 ntemps -= MIN (ntemps, num_merged);
2840 files[out].name = temp;
2841 files[out].pid = pid;
2842 in += num_merged;
2845 remainder = nfiles - in;
2846 cheap_slots = nmerge - out % nmerge;
2848 if (cheap_slots < remainder)
2850 /* So many files remain that they can't all be put into the last
2851 NMERGE-sized output window. Do one more merge. Merge as few
2852 files as possible, to avoid needless I/O. */
2853 size_t nshortmerge = remainder - cheap_slots + 1;
2854 FILE *tfp;
2855 pid_t pid;
2856 char *temp = create_temp (&tfp, &pid);
2857 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
2858 nshortmerge, tfp, temp);
2859 ntemps -= MIN (ntemps, num_merged);
2860 files[out].name = temp;
2861 files[out++].pid = pid;
2862 in += num_merged;
2865 /* Put the remaining input files into the last NMERGE-sized output
2866 window, so they will be merged in the next pass. */
2867 memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2868 ntemps += out;
2869 nfiles -= in - out;
2872 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2874 /* We aren't guaranteed that this final mergefiles will work, therefore we
2875 try to merge into the output, and then merge as much as we can into a
2876 temp file if we can't. Repeat. */
2878 for (;;)
2880 /* Merge directly into the output file if possible. */
2881 FILE **fps;
2882 size_t nopened = open_input_files (files, nfiles, &fps);
2884 if (nopened == nfiles)
2886 FILE *ofp = stream_open (output_file, "w");
2887 if (ofp)
2889 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
2890 break;
2892 if (errno != EMFILE || nopened <= 2)
2893 die (_("open failed"), output_file);
2895 else if (nopened <= 2)
2896 die (_("open failed"), files[nopened].name);
2898 /* We ran out of file descriptors. Close one of the input
2899 files, to gain a file descriptor. Then create a temporary
2900 file with our spare file descriptor. Retry if that failed
2901 (e.g., some other process could open a file between the time
2902 we closed and tried to create). */
2903 FILE *tfp;
2904 pid_t pid;
2905 char *temp;
2908 nopened--;
2909 xfclose (fps[nopened], files[nopened].name);
2910 temp = maybe_create_temp (&tfp, &pid, ! (nopened <= 2));
2912 while (!temp);
2914 /* Merge into the newly allocated temporary. */
2915 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp, fps);
2916 ntemps -= MIN (ntemps, nopened);
2917 files[0].name = temp;
2918 files[0].pid = pid;
2920 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
2921 ntemps++;
2922 nfiles -= nopened - 1;
2926 /* Sort NFILES FILES onto OUTPUT_FILE. */
2928 static void
2929 sort (char * const *files, size_t nfiles, char const *output_file)
2931 struct buffer buf;
2932 size_t ntemps = 0;
2933 bool output_file_created = false;
2935 buf.alloc = 0;
2937 while (nfiles)
2939 char const *temp_output;
2940 char const *file = *files;
2941 FILE *fp = xfopen (file, "r");
2942 FILE *tfp;
2943 size_t bytes_per_line = (2 * sizeof (struct line)
2944 - sizeof (struct line) / 2);
2946 if (! buf.alloc)
2947 initbuf (&buf, bytes_per_line,
2948 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2949 buf.eof = false;
2950 files++;
2951 nfiles--;
2953 while (fillbuf (&buf, fp, file))
2955 struct line *line;
2956 struct line *linebase;
2958 if (buf.eof && nfiles
2959 && (bytes_per_line + 1
2960 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2962 /* End of file, but there is more input and buffer room.
2963 Concatenate the next input file; this is faster in
2964 the usual case. */
2965 buf.left = buf.used;
2966 break;
2969 line = buffer_linelim (&buf);
2970 linebase = line - buf.nlines;
2971 if (1 < buf.nlines)
2972 sortlines (line, buf.nlines, linebase);
2973 if (buf.eof && !nfiles && !ntemps && !buf.left)
2975 xfclose (fp, file);
2976 tfp = xfopen (output_file, "w");
2977 temp_output = output_file;
2978 output_file_created = true;
2980 else
2982 ++ntemps;
2983 temp_output = create_temp (&tfp, NULL);
2988 line--;
2989 write_bytes (line->text, line->length, tfp, temp_output);
2990 if (unique)
2991 while (linebase < line && compare (line, line - 1) == 0)
2992 line--;
2994 while (linebase < line);
2996 xfclose (tfp, temp_output);
2998 /* Free up some resources every once in a while. */
2999 if (MAX_PROCS_BEFORE_REAP < nprocs)
3000 reap_some ();
3002 if (output_file_created)
3003 goto finish;
3005 xfclose (fp, file);
3008 finish:
3009 free (buf.buf);
3011 if (! output_file_created)
3013 size_t i;
3014 struct tempnode *node = temphead;
3015 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3016 for (i = 0; node; i++)
3018 tempfiles[i].name = node->name;
3019 tempfiles[i].pid = node->pid;
3020 node = node->next;
3022 merge (tempfiles, ntemps, ntemps, output_file);
3023 free (tempfiles);
3027 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3029 static void
3030 insertkey (struct keyfield *key_arg)
3032 struct keyfield **p;
3033 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3035 for (p = &keylist; *p; p = &(*p)->next)
3036 continue;
3037 *p = key;
3038 key->next = NULL;
3041 /* Report a bad field specification SPEC, with extra info MSGID. */
3043 static void badfieldspec (char const *, char const *)
3044 ATTRIBUTE_NORETURN;
3045 static void
3046 badfieldspec (char const *spec, char const *msgid)
3048 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3049 _(msgid), quote (spec));
3050 abort ();
3053 /* Report incompatible options. */
3055 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3056 static void
3057 incompatible_options (char const *opts)
3059 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
3060 abort ();
3063 /* Check compatibility of ordering options. */
3065 static void
3066 check_ordering_compatibility (void)
3068 struct keyfield const *key;
3070 for (key = keylist; key; key = key->next)
3071 if ((1 < (key->random + key->numeric + key->general_numeric + key->month
3072 + key->version + !!key->ignore + key->human_numeric))
3073 || (key->random && key->translate))
3075 /* The following is too big, but guaranteed to be "big enough". */
3076 char opts[sizeof short_options];
3077 char *p = opts;
3078 if (key->ignore == nondictionary)
3079 *p++ = 'd';
3080 if (key->translate)
3081 *p++ = 'f';
3082 if (key->general_numeric)
3083 *p++ = 'g';
3084 if (key->human_numeric)
3085 *p++ = 'h';
3086 if (key->ignore == nonprinting)
3087 *p++ = 'i';
3088 if (key->month)
3089 *p++ = 'M';
3090 if (key->numeric)
3091 *p++ = 'n';
3092 if (key->version)
3093 *p++ = 'V';
3094 if (key->random)
3095 *p++ = 'R';
3096 *p = '\0';
3097 incompatible_options (opts);
3101 /* Parse the leading integer in STRING and store the resulting value
3102 (which must fit into size_t) into *VAL. Return the address of the
3103 suffix after the integer. If the value is too large, silently
3104 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3105 failure; otherwise, report MSGID and exit on failure. */
3107 static char const *
3108 parse_field_count (char const *string, size_t *val, char const *msgid)
3110 char *suffix;
3111 uintmax_t n;
3113 switch (xstrtoumax (string, &suffix, 10, &n, ""))
3115 case LONGINT_OK:
3116 case LONGINT_INVALID_SUFFIX_CHAR:
3117 *val = n;
3118 if (*val == n)
3119 break;
3120 /* Fall through. */
3121 case LONGINT_OVERFLOW:
3122 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3123 *val = SIZE_MAX;
3124 break;
3126 case LONGINT_INVALID:
3127 if (msgid)
3128 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3129 _(msgid), quote (string));
3130 return NULL;
3133 return suffix;
3136 /* Handle interrupts and hangups. */
3138 static void
3139 sighandler (int sig)
3141 if (! SA_NOCLDSTOP)
3142 signal (sig, SIG_IGN);
3144 cleanup ();
3146 signal (sig, SIG_DFL);
3147 raise (sig);
3150 /* Set the ordering options for KEY specified in S.
3151 Return the address of the first character in S that
3152 is not a valid ordering option.
3153 BLANKTYPE is the kind of blanks that 'b' should skip. */
3155 static char *
3156 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
3158 while (*s)
3160 switch (*s)
3162 case 'b':
3163 if (blanktype == bl_start || blanktype == bl_both)
3164 key->skipsblanks = true;
3165 if (blanktype == bl_end || blanktype == bl_both)
3166 key->skipeblanks = true;
3167 break;
3168 case 'd':
3169 key->ignore = nondictionary;
3170 break;
3171 case 'f':
3172 key->translate = fold_toupper;
3173 break;
3174 case 'g':
3175 key->general_numeric = true;
3176 break;
3177 case 'h':
3178 key->human_numeric = true;
3179 break;
3180 case 'i':
3181 /* Option order should not matter, so don't let -i override
3182 -d. -d implies -i, but -i does not imply -d. */
3183 if (! key->ignore)
3184 key->ignore = nonprinting;
3185 break;
3186 case 'M':
3187 key->month = true;
3188 break;
3189 case 'n':
3190 key->numeric = true;
3191 break;
3192 case 'R':
3193 key->random = true;
3194 break;
3195 case 'r':
3196 key->reverse = true;
3197 break;
3198 case 'V':
3199 key->version = true;
3200 break;
3201 default:
3202 return (char *) s;
3204 ++s;
3206 return (char *) s;
3209 static struct keyfield *
3210 key_init (struct keyfield *key)
3212 memset (key, 0, sizeof *key);
3213 key->eword = SIZE_MAX;
3214 key->iec_present = -1;
3215 return key;
3219 main (int argc, char **argv)
3221 struct keyfield *key;
3222 struct keyfield key_buf;
3223 struct keyfield gkey;
3224 char const *s;
3225 int c = 0;
3226 char checkonly = 0;
3227 bool mergeonly = false;
3228 char *random_source = NULL;
3229 bool need_random = false;
3230 size_t nfiles = 0;
3231 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
3232 bool obsolete_usage = (posix2_version () < 200112);
3233 char **files;
3234 char *files_from = NULL;
3235 struct Tokens tok;
3236 char const *outfile = NULL;
3238 initialize_main (&argc, &argv);
3239 set_program_name (argv[0]);
3240 setlocale (LC_ALL, "");
3241 bindtextdomain (PACKAGE, LOCALEDIR);
3242 textdomain (PACKAGE);
3244 initialize_exit_failure (SORT_FAILURE);
3246 hard_LC_COLLATE = hard_locale (LC_COLLATE);
3247 #if HAVE_NL_LANGINFO
3248 hard_LC_TIME = hard_locale (LC_TIME);
3249 #endif
3251 /* Get locale's representation of the decimal point. */
3253 struct lconv const *locale = localeconv ();
3255 /* If the locale doesn't define a decimal point, or if the decimal
3256 point is multibyte, use the C locale's decimal point. FIXME:
3257 add support for multibyte decimal points. */
3258 decimal_point = to_uchar (locale->decimal_point[0]);
3259 if (! decimal_point || locale->decimal_point[1])
3260 decimal_point = '.';
3262 /* FIXME: add support for multibyte thousands separators. */
3263 thousands_sep = to_uchar (*locale->thousands_sep);
3264 if (! thousands_sep || locale->thousands_sep[1])
3265 thousands_sep = -1;
3268 have_read_stdin = false;
3269 inittables ();
3272 size_t i;
3273 static int const sig[] =
3275 /* The usual suspects. */
3276 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
3277 #ifdef SIGPOLL
3278 SIGPOLL,
3279 #endif
3280 #ifdef SIGPROF
3281 SIGPROF,
3282 #endif
3283 #ifdef SIGVTALRM
3284 SIGVTALRM,
3285 #endif
3286 #ifdef SIGXCPU
3287 SIGXCPU,
3288 #endif
3289 #ifdef SIGXFSZ
3290 SIGXFSZ,
3291 #endif
3293 enum { nsigs = ARRAY_CARDINALITY (sig) };
3295 #if SA_NOCLDSTOP
3296 struct sigaction act;
3298 sigemptyset (&caught_signals);
3299 for (i = 0; i < nsigs; i++)
3301 sigaction (sig[i], NULL, &act);
3302 if (act.sa_handler != SIG_IGN)
3303 sigaddset (&caught_signals, sig[i]);
3306 act.sa_handler = sighandler;
3307 act.sa_mask = caught_signals;
3308 act.sa_flags = 0;
3310 for (i = 0; i < nsigs; i++)
3311 if (sigismember (&caught_signals, sig[i]))
3312 sigaction (sig[i], &act, NULL);
3313 #else
3314 for (i = 0; i < nsigs; i++)
3315 if (signal (sig[i], SIG_IGN) != SIG_IGN)
3317 signal (sig[i], sighandler);
3318 siginterrupt (sig[i], 1);
3320 #endif
3322 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
3324 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
3325 atexit (exit_cleanup);
3327 gkey.sword = gkey.eword = SIZE_MAX;
3328 gkey.ignore = NULL;
3329 gkey.translate = NULL;
3330 gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
3331 gkey.iec_present = -1;
3332 gkey.random = gkey.version = false;
3333 gkey.month = gkey.reverse = false;
3334 gkey.skipsblanks = gkey.skipeblanks = false;
3336 files = xnmalloc (argc, sizeof *files);
3338 for (;;)
3340 /* Parse an operand as a file after "--" was seen; or if
3341 pedantic and a file was seen, unless the POSIX version
3342 predates 1003.1-2001 and -c was not seen and the operand is
3343 "-o FILE" or "-oFILE". */
3344 int oi = -1;
3346 if (c == -1
3347 || (posixly_correct && nfiles != 0
3348 && ! (obsolete_usage
3349 && ! checkonly
3350 && optind != argc
3351 && argv[optind][0] == '-' && argv[optind][1] == 'o'
3352 && (argv[optind][2] || optind + 1 != argc)))
3353 || ((c = getopt_long (argc, argv, short_options,
3354 long_options, &oi))
3355 == -1))
3357 if (argc <= optind)
3358 break;
3359 files[nfiles++] = argv[optind++];
3361 else switch (c)
3363 case 1:
3364 key = NULL;
3365 if (optarg[0] == '+')
3367 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
3368 && ISDIGIT (argv[optind][1]));
3369 obsolete_usage |= minus_pos_usage && !posixly_correct;
3370 if (obsolete_usage)
3372 /* Treat +POS1 [-POS2] as a key if possible; but silently
3373 treat an operand as a file if it is not a valid +POS1. */
3374 key = key_init (&key_buf);
3375 s = parse_field_count (optarg + 1, &key->sword, NULL);
3376 if (s && *s == '.')
3377 s = parse_field_count (s + 1, &key->schar, NULL);
3378 if (! (key->sword || key->schar))
3379 key->sword = SIZE_MAX;
3380 if (! s || *set_ordering (s, key, bl_start))
3381 key = NULL;
3382 else
3384 if (minus_pos_usage)
3386 char const *optarg1 = argv[optind++];
3387 s = parse_field_count (optarg1 + 1, &key->eword,
3388 N_("invalid number after `-'"));
3389 if (*s == '.')
3390 s = parse_field_count (s + 1, &key->echar,
3391 N_("invalid number after `.'"));
3392 if (*set_ordering (s, key, bl_end))
3393 badfieldspec (optarg1,
3394 N_("stray character in field spec"));
3396 insertkey (key);
3400 if (! key)
3401 files[nfiles++] = optarg;
3402 break;
3404 case SORT_OPTION:
3405 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
3406 /* Fall through. */
3407 case 'b':
3408 case 'd':
3409 case 'f':
3410 case 'g':
3411 case 'h':
3412 case 'i':
3413 case 'M':
3414 case 'n':
3415 case 'r':
3416 case 'R':
3417 case 'V':
3419 char str[2];
3420 str[0] = c;
3421 str[1] = '\0';
3422 set_ordering (str, &gkey, bl_both);
3424 break;
3426 case CHECK_OPTION:
3427 c = (optarg
3428 ? XARGMATCH ("--check", optarg, check_args, check_types)
3429 : 'c');
3430 /* Fall through. */
3431 case 'c':
3432 case 'C':
3433 if (checkonly && checkonly != c)
3434 incompatible_options ("cC");
3435 checkonly = c;
3436 break;
3438 case COMPRESS_PROGRAM_OPTION:
3439 if (compress_program && !STREQ (compress_program, optarg))
3440 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
3441 compress_program = optarg;
3442 break;
3444 case FILES0_FROM_OPTION:
3445 files_from = optarg;
3446 break;
3448 case 'k':
3449 key = key_init (&key_buf);
3451 /* Get POS1. */
3452 s = parse_field_count (optarg, &key->sword,
3453 N_("invalid number at field start"));
3454 if (! key->sword--)
3456 /* Provoke with `sort -k0' */
3457 badfieldspec (optarg, N_("field number is zero"));
3459 if (*s == '.')
3461 s = parse_field_count (s + 1, &key->schar,
3462 N_("invalid number after `.'"));
3463 if (! key->schar--)
3465 /* Provoke with `sort -k1.0' */
3466 badfieldspec (optarg, N_("character offset is zero"));
3469 if (! (key->sword || key->schar))
3470 key->sword = SIZE_MAX;
3471 s = set_ordering (s, key, bl_start);
3472 if (*s != ',')
3474 key->eword = SIZE_MAX;
3475 key->echar = 0;
3477 else
3479 /* Get POS2. */
3480 s = parse_field_count (s + 1, &key->eword,
3481 N_("invalid number after `,'"));
3482 if (! key->eword--)
3484 /* Provoke with `sort -k1,0' */
3485 badfieldspec (optarg, N_("field number is zero"));
3487 if (*s == '.')
3489 s = parse_field_count (s + 1, &key->echar,
3490 N_("invalid number after `.'"));
3492 s = set_ordering (s, key, bl_end);
3494 if (*s)
3495 badfieldspec (optarg, N_("stray character in field spec"));
3496 insertkey (key);
3497 break;
3499 case 'm':
3500 mergeonly = true;
3501 break;
3503 case NMERGE_OPTION:
3504 specify_nmerge (oi, c, optarg);
3505 break;
3507 case 'o':
3508 if (outfile && !STREQ (outfile, optarg))
3509 error (SORT_FAILURE, 0, _("multiple output files specified"));
3510 outfile = optarg;
3511 break;
3513 case RANDOM_SOURCE_OPTION:
3514 if (random_source && !STREQ (random_source, optarg))
3515 error (SORT_FAILURE, 0, _("multiple random sources specified"));
3516 random_source = optarg;
3517 break;
3519 case 's':
3520 stable = true;
3521 break;
3523 case 'S':
3524 specify_sort_size (oi, c, optarg);
3525 break;
3527 case 't':
3529 char newtab = optarg[0];
3530 if (! newtab)
3531 error (SORT_FAILURE, 0, _("empty tab"));
3532 if (optarg[1])
3534 if (STREQ (optarg, "\\0"))
3535 newtab = '\0';
3536 else
3538 /* Provoke with `sort -txx'. Complain about
3539 "multi-character tab" instead of "multibyte tab", so
3540 that the diagnostic's wording does not need to be
3541 changed once multibyte characters are supported. */
3542 error (SORT_FAILURE, 0, _("multi-character tab %s"),
3543 quote (optarg));
3546 if (tab != TAB_DEFAULT && tab != newtab)
3547 error (SORT_FAILURE, 0, _("incompatible tabs"));
3548 tab = newtab;
3550 break;
3552 case 'T':
3553 add_temp_dir (optarg);
3554 break;
3556 case 'u':
3557 unique = true;
3558 break;
3560 case 'y':
3561 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3562 through Solaris 7. It is also accepted by many non-Solaris
3563 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3564 -y is marked as obsolete starting with Solaris 8 (1999), but is
3565 still accepted as of Solaris 10 prerelease (2004).
3567 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3568 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3569 and which in general ignores the argument after "-y" if it
3570 consists entirely of digits (it can even be empty). */
3571 if (optarg == argv[optind - 1])
3573 char const *p;
3574 for (p = optarg; ISDIGIT (*p); p++)
3575 continue;
3576 optind -= (*p != '\0');
3578 break;
3580 case 'z':
3581 eolchar = 0;
3582 break;
3584 case_GETOPT_HELP_CHAR;
3586 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3588 default:
3589 usage (SORT_FAILURE);
3593 if (files_from)
3595 FILE *stream;
3597 /* When using --files0-from=F, you may not specify any files
3598 on the command-line. */
3599 if (nfiles)
3601 error (0, 0, _("extra operand %s"), quote (files[0]));
3602 fprintf (stderr, "%s\n",
3603 _("file operands cannot be combined with --files0-from"));
3604 usage (SORT_FAILURE);
3607 if (STREQ (files_from, "-"))
3608 stream = stdin;
3609 else
3611 stream = fopen (files_from, "r");
3612 if (stream == NULL)
3613 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
3614 quote (files_from));
3617 readtokens0_init (&tok);
3619 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
3620 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
3621 quote (files_from));
3623 if (tok.n_tok)
3625 size_t i;
3626 free (files);
3627 files = tok.tok;
3628 nfiles = tok.n_tok;
3629 for (i = 0; i < nfiles; i++)
3631 if (STREQ (files[i], "-"))
3632 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
3633 "no file name of %s allowed"),
3634 quote (files[i]));
3635 else if (files[i][0] == '\0')
3637 /* Using the standard `filename:line-number:' prefix here is
3638 not totally appropriate, since NUL is the separator, not NL,
3639 but it might be better than nothing. */
3640 unsigned long int file_number = i + 1;
3641 error (SORT_FAILURE, 0,
3642 _("%s:%lu: invalid zero-length file name"),
3643 quotearg_colon (files_from), file_number);
3647 else
3648 error (SORT_FAILURE, 0, _("no input from %s"),
3649 quote (files_from));
3652 /* Inheritance of global options to individual keys. */
3653 for (key = keylist; key; key = key->next)
3655 if (! (key->ignore
3656 || key->translate
3657 || (key->skipsblanks
3658 || key->reverse
3659 || key->skipeblanks
3660 || key->month
3661 || key->numeric
3662 || key->version
3663 || key->general_numeric
3664 || key->human_numeric
3665 || key->random)))
3667 key->ignore = gkey.ignore;
3668 key->translate = gkey.translate;
3669 key->skipsblanks = gkey.skipsblanks;
3670 key->skipeblanks = gkey.skipeblanks;
3671 key->month = gkey.month;
3672 key->numeric = gkey.numeric;
3673 key->general_numeric = gkey.general_numeric;
3674 key->human_numeric = gkey.human_numeric;
3675 key->random = gkey.random;
3676 key->reverse = gkey.reverse;
3677 key->version = gkey.version;
3680 need_random |= key->random;
3683 if (!keylist && (gkey.ignore
3684 || gkey.translate
3685 || (gkey.skipsblanks
3686 || gkey.skipeblanks
3687 || gkey.month
3688 || gkey.numeric
3689 || gkey.general_numeric
3690 || gkey.human_numeric
3691 || gkey.random
3692 || gkey.version)))
3694 insertkey (&gkey);
3695 need_random |= gkey.random;
3698 check_ordering_compatibility ();
3700 reverse = gkey.reverse;
3702 if (need_random)
3704 randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
3705 if (! randread_source)
3706 die (_("open failed"), random_source);
3709 if (temp_dir_count == 0)
3711 char const *tmp_dir = getenv ("TMPDIR");
3712 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
3715 if (nfiles == 0)
3717 static char *minus = (char *) "-";
3718 nfiles = 1;
3719 free (files);
3720 files = &minus;
3723 /* Need to re-check that we meet the minimum requirement for memory
3724 usage with the final value for NMERGE. */
3725 if (0 < sort_size)
3726 sort_size = MAX (sort_size, MIN_SORT_SIZE);
3728 if (checkonly)
3730 if (nfiles > 1)
3731 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
3732 quote (files[1]), checkonly);
3734 if (outfile)
3736 static char opts[] = {0, 'o', 0};
3737 opts[0] = checkonly;
3738 incompatible_options (opts);
3741 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3742 input is not properly sorted. */
3743 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
3746 if (mergeonly)
3748 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
3749 size_t i;
3751 for (i = 0; i < nfiles; ++i)
3752 sortfiles[i].name = files[i];
3754 merge (sortfiles, 0, nfiles, outfile);
3755 IF_LINT (free (sortfiles));
3757 else
3758 sort (files, nfiles, outfile);
3760 if (have_read_stdin && fclose (stdin) == EOF)
3761 die (_("close failed"), "-");
3763 exit (EXIT_SUCCESS);