global: convert indentation-TABs to spaces
[coreutils.git] / src / sort.c
blobb9ae19883cb8e71fbd7470870bcc2c48b4015ae6
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <sys/types.h>
27 #include <sys/wait.h>
28 #include <signal.h>
29 #include "system.h"
30 #include "argmatch.h"
31 #include "error.h"
32 #include "filevercmp.h"
33 #include "hash.h"
34 #include "md5.h"
35 #include "physmem.h"
36 #include "posixver.h"
37 #include "quote.h"
38 #include "quotearg.h"
39 #include "randread.h"
40 #include "readtokens0.h"
41 #include "stdio--.h"
42 #include "stdlib--.h"
43 #include "strnumcmp.h"
44 #include "xmemcoll.h"
45 #include "xmemxfrm.h"
46 #include "xstrtol.h"
48 #if HAVE_SYS_RESOURCE_H
49 # include <sys/resource.h>
50 #endif
51 #ifndef RLIMIT_DATA
52 struct rlimit { size_t rlim_cur; };
53 # define getrlimit(Resource, Rlp) (-1)
54 #endif
56 /* The official name of this program (e.g., no `g' prefix). */
57 #define PROGRAM_NAME "sort"
59 #define AUTHORS \
60 proper_name ("Mike Haertel"), \
61 proper_name ("Paul Eggert")
63 #if HAVE_LANGINFO_CODESET
64 # include <langinfo.h>
65 #endif
67 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
68 present. */
69 #ifndef SA_NOCLDSTOP
70 # define SA_NOCLDSTOP 0
71 /* No sigprocmask. Always 'return' zero. */
72 # define sigprocmask(How, Set, Oset) (0)
73 # define sigset_t int
74 # if ! HAVE_SIGINTERRUPT
75 # define siginterrupt(sig, flag) /* empty */
76 # endif
77 #endif
79 #if !defined OPEN_MAX && defined NR_OPEN
80 # define OPEN_MAX NR_OPEN
81 #endif
82 #if !defined OPEN_MAX
83 # define OPEN_MAX 20
84 #endif
86 #define UCHAR_LIM (UCHAR_MAX + 1)
88 #ifndef DEFAULT_TMPDIR
89 # define DEFAULT_TMPDIR "/tmp"
90 #endif
92 /* Exit statuses. */
93 enum
95 /* POSIX says to exit with status 1 if invoked with -c and the
96 input is not properly sorted. */
97 SORT_OUT_OF_ORDER = 1,
99 /* POSIX says any other irregular exit must exit with a status
100 code greater than 1. */
101 SORT_FAILURE = 2
104 enum
106 /* The number of times we should try to fork a compression process
107 (we retry if the fork call fails). We don't _need_ to compress
108 temp files, this is just to reduce disk access, so this number
109 can be small. */
110 MAX_FORK_TRIES_COMPRESS = 2,
112 /* The number of times we should try to fork a decompression process.
113 If we can't fork a decompression process, we can't sort, so this
114 number should be big. */
115 MAX_FORK_TRIES_DECOMPRESS = 8
118 /* The representation of the decimal point in the current locale. */
119 static int decimal_point;
121 /* Thousands separator; if -1, then there isn't one. */
122 static int thousands_sep;
124 /* Nonzero if the corresponding locales are hard. */
125 static bool hard_LC_COLLATE;
126 #if HAVE_NL_LANGINFO
127 static bool hard_LC_TIME;
128 #endif
130 #define NONZERO(x) ((x) != 0)
132 /* The kind of blanks for '-b' to skip in various options. */
133 enum blanktype { bl_start, bl_end, bl_both };
135 /* The character marking end of line. Default to \n. */
136 static char eolchar = '\n';
138 /* Lines are held in core as counted strings. */
139 struct line
141 char *text; /* Text of the line. */
142 size_t length; /* Length including final newline. */
143 char *keybeg; /* Start of first key. */
144 char *keylim; /* Limit of first key. */
147 /* Input buffers. */
148 struct buffer
150 char *buf; /* Dynamically allocated buffer,
151 partitioned into 3 regions:
152 - input data;
153 - unused area;
154 - an array of lines, in reverse order. */
155 size_t used; /* Number of bytes used for input data. */
156 size_t nlines; /* Number of lines in the line array. */
157 size_t alloc; /* Number of bytes allocated. */
158 size_t left; /* Number of bytes left from previous reads. */
159 size_t line_bytes; /* Number of bytes to reserve for each line. */
160 bool eof; /* An EOF has been read. */
163 struct keyfield
165 size_t sword; /* Zero-origin 'word' to start at. */
166 size_t schar; /* Additional characters to skip. */
167 size_t eword; /* Zero-origin first word after field. */
168 size_t echar; /* Additional characters in field. */
169 bool const *ignore; /* Boolean array of characters to ignore. */
170 char const *translate; /* Translation applied to characters. */
171 bool skipsblanks; /* Skip leading blanks when finding start. */
172 bool skipeblanks; /* Skip leading blanks when finding end. */
173 bool numeric; /* Flag for numeric comparison. Handle
174 strings of digits with optional decimal
175 point, but no exponential notation. */
176 bool random; /* Sort by random hash of key. */
177 bool general_numeric; /* Flag for general, numeric comparison.
178 Handle numbers in exponential notation. */
179 bool human_numeric; /* Flag for sorting by human readable
180 units with either SI xor IEC prefixes. */
181 int si_present; /* Flag for checking for mixed SI and IEC. */
182 bool month; /* Flag for comparison by month name. */
183 bool reverse; /* Reverse the sense of comparison. */
184 bool version; /* sort by version number */
185 struct keyfield *next; /* Next keyfield to try. */
188 struct month
190 char const *name;
191 int val;
194 /* FIXME: None of these tables work with multibyte character sets.
195 Also, there are many other bugs when handling multibyte characters.
196 One way to fix this is to rewrite `sort' to use wide characters
197 internally, but doing this with good performance is a bit
198 tricky. */
200 /* Table of blanks. */
201 static bool blanks[UCHAR_LIM];
203 /* Table of non-printing characters. */
204 static bool nonprinting[UCHAR_LIM];
206 /* Table of non-dictionary characters (not letters, digits, or blanks). */
207 static bool nondictionary[UCHAR_LIM];
209 /* Translation table folding lower case to upper. */
210 static char fold_toupper[UCHAR_LIM];
212 #define MONTHS_PER_YEAR 12
214 /* Table mapping month names to integers.
215 Alphabetic order allows binary search. */
216 static struct month monthtab[] =
218 {"APR", 4},
219 {"AUG", 8},
220 {"DEC", 12},
221 {"FEB", 2},
222 {"JAN", 1},
223 {"JUL", 7},
224 {"JUN", 6},
225 {"MAR", 3},
226 {"MAY", 5},
227 {"NOV", 11},
228 {"OCT", 10},
229 {"SEP", 9}
232 /* During the merge phase, the number of files to merge at once. */
233 #define NMERGE_DEFAULT 16
235 /* Minimum size for a merge or check buffer. */
236 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
238 /* Minimum sort size; the code might not work with smaller sizes. */
239 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
241 /* The number of bytes needed for a merge or check buffer, which can
242 function relatively efficiently even if it holds only one line. If
243 a longer line is seen, this value is increased. */
244 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
246 /* The approximate maximum number of bytes of main memory to use, as
247 specified by the user. Zero if the user has not specified a size. */
248 static size_t sort_size;
250 /* The guessed size for non-regular files. */
251 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
253 /* Array of directory names in which any temporary files are to be created. */
254 static char const **temp_dirs;
256 /* Number of temporary directory names used. */
257 static size_t temp_dir_count;
259 /* Number of allocated slots in temp_dirs. */
260 static size_t temp_dir_alloc;
262 /* Flag to reverse the order of all comparisons. */
263 static bool reverse;
265 /* Flag for stable sort. This turns off the last ditch bytewise
266 comparison of lines, and instead leaves lines in the same order
267 they were read if all keys compare equal. */
268 static bool stable;
270 /* If TAB has this value, blanks separate fields. */
271 enum { TAB_DEFAULT = CHAR_MAX + 1 };
273 /* Tab character separating fields. If TAB_DEFAULT, then fields are
274 separated by the empty string between a non-blank character and a blank
275 character. */
276 static int tab = TAB_DEFAULT;
278 /* Flag to remove consecutive duplicate lines from the output.
279 Only the last of a sequence of equal lines will be output. */
280 static bool unique;
282 /* Nonzero if any of the input files are the standard input. */
283 static bool have_read_stdin;
285 /* List of key field comparisons to be tried. */
286 static struct keyfield *keylist;
288 /* Program used to (de)compress temp files. Must accept -d. */
289 static char const *compress_program;
291 /* Maximum number of files to merge in one go. If more than this
292 number are present, temp files will be used. */
293 static unsigned int nmerge = NMERGE_DEFAULT;
295 static void sortlines_temp (struct line *, size_t, struct line *);
297 /* Report MESSAGE for FILE, then clean up and exit.
298 If FILE is null, it represents standard output. */
300 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
301 static void
302 die (char const *message, char const *file)
304 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
305 exit (SORT_FAILURE);
308 void
309 usage (int status)
311 if (status != EXIT_SUCCESS)
312 fprintf (stderr, _("Try `%s --help' for more information.\n"),
313 program_name);
314 else
316 printf (_("\
317 Usage: %s [OPTION]... [FILE]...\n\
318 or: %s [OPTION]... --files0-from=F\n\
320 program_name, program_name);
321 fputs (_("\
322 Write sorted concatenation of all FILE(s) to standard output.\n\
324 "), stdout);
325 fputs (_("\
326 Mandatory arguments to long options are mandatory for short options too.\n\
327 "), stdout);
328 fputs (_("\
329 Ordering options:\n\
331 "), stdout);
332 fputs (_("\
333 -b, --ignore-leading-blanks ignore leading blanks\n\
334 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
335 -f, --ignore-case fold lower case to upper case characters\n\
336 "), stdout);
337 fputs (_("\
338 -g, --general-numeric-sort compare according to general numerical value\n\
339 -i, --ignore-nonprinting consider only printable characters\n\
340 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
341 "), stdout);
342 fputs (_("\
343 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
344 "), stdout);
345 fputs (_("\
346 -n, --numeric-sort compare according to string numerical value\n\
347 -R, --random-sort sort by random hash of keys\n\
348 --random-source=FILE get random bytes from FILE\n\
349 -r, --reverse reverse the result of comparisons\n\
350 "), stdout);
351 fputs (_("\
352 --sort=WORD sort according to WORD:\n\
353 general-numeric -g, human-numeric -h, month -M,\n\
354 numeric -n, random -R, version -V\n\
355 -V, --version-sort natural sort of (version) numbers within text\n\
357 "), stdout);
358 fputs (_("\
359 Other options:\n\
361 "), stdout);
362 fputs (_("\
363 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
364 for more use temp files\n\
365 "), stdout);
366 fputs (_("\
367 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
368 -C, --check=quiet, --check=silent like -c, but do not report first bad line\n\
369 --compress-program=PROG compress temporaries with PROG;\n\
370 decompress them with PROG -d\n\
371 --files0-from=F read input from the files specified by\n\
372 NUL-terminated names in file F;\n\
373 If F is - then read names from standard input\n\
374 "), stdout);
375 fputs (_("\
376 -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n\
377 (default end of line)\n\
378 -m, --merge merge already sorted files; do not sort\n\
379 "), stdout);
380 fputs (_("\
381 -o, --output=FILE write result to FILE instead of standard output\n\
382 -s, --stable stabilize sort by disabling last-resort comparison\n\
383 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
384 "), stdout);
385 printf (_("\
386 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
387 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
388 multiple options specify multiple directories\n\
389 -u, --unique with -c, check for strict ordering;\n\
390 without -c, output only the first of an equal run\n\
391 "), DEFAULT_TMPDIR);
392 fputs (_("\
393 -z, --zero-terminated end lines with 0 byte, not newline\n\
394 "), stdout);
395 fputs (HELP_OPTION_DESCRIPTION, stdout);
396 fputs (VERSION_OPTION_DESCRIPTION, stdout);
397 fputs (_("\
399 POS is F[.C][OPTS], where F is the field number and C the character position\n\
400 in the field; both are origin 1. If neither -t nor -b is in effect, characters\n\
401 in a field are counted from the beginning of the preceding whitespace. OPTS is\n\
402 one or more single-letter ordering options, which override global ordering\n\
403 options for that key. If no key is given, use the entire line as the key.\n\
405 SIZE may be followed by the following multiplicative suffixes:\n\
406 "), stdout);
407 fputs (_("\
408 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
410 With no FILE, or when FILE is -, read standard input.\n\
412 *** WARNING ***\n\
413 The locale specified by the environment affects sort order.\n\
414 Set LC_ALL=C to get the traditional sort order that uses\n\
415 native byte values.\n\
416 "), stdout );
417 emit_bug_reporting_address ();
420 exit (status);
423 /* For long options that have no equivalent short option, use a
424 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
425 enum
427 CHECK_OPTION = CHAR_MAX + 1,
428 COMPRESS_PROGRAM_OPTION,
429 FILES0_FROM_OPTION,
430 NMERGE_OPTION,
431 RANDOM_SOURCE_OPTION,
432 SORT_OPTION
435 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
437 static struct option const long_options[] =
439 {"ignore-leading-blanks", no_argument, NULL, 'b'},
440 {"check", optional_argument, NULL, CHECK_OPTION},
441 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
442 {"dictionary-order", no_argument, NULL, 'd'},
443 {"ignore-case", no_argument, NULL, 'f'},
444 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
445 {"general-numeric-sort", no_argument, NULL, 'g'},
446 {"ignore-nonprinting", no_argument, NULL, 'i'},
447 {"key", required_argument, NULL, 'k'},
448 {"merge", no_argument, NULL, 'm'},
449 {"month-sort", no_argument, NULL, 'M'},
450 {"numeric-sort", no_argument, NULL, 'n'},
451 {"human-numeric-sort", no_argument, NULL, 'h'},
452 {"version-sort", no_argument, NULL, 'V'},
453 {"random-sort", no_argument, NULL, 'R'},
454 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
455 {"sort", required_argument, NULL, SORT_OPTION},
456 {"output", required_argument, NULL, 'o'},
457 {"reverse", no_argument, NULL, 'r'},
458 {"stable", no_argument, NULL, 's'},
459 {"batch-size", required_argument, NULL, NMERGE_OPTION},
460 {"buffer-size", required_argument, NULL, 'S'},
461 {"field-separator", required_argument, NULL, 't'},
462 {"temporary-directory", required_argument, NULL, 'T'},
463 {"unique", no_argument, NULL, 'u'},
464 {"zero-terminated", no_argument, NULL, 'z'},
465 {GETOPT_HELP_OPTION_DECL},
466 {GETOPT_VERSION_OPTION_DECL},
467 {NULL, 0, NULL, 0},
470 #define CHECK_TABLE \
471 _ct_("quiet", 'C') \
472 _ct_("silent", 'C') \
473 _ct_("diagnose-first", 'c')
475 static char const *const check_args[] =
477 #define _ct_(_s, _c) _s,
478 CHECK_TABLE NULL
479 #undef _ct_
481 static char const check_types[] =
483 #define _ct_(_s, _c) _c,
484 CHECK_TABLE
485 #undef _ct_
488 #define SORT_TABLE \
489 _st_("general-numeric", 'g') \
490 _st_("human-numeric", 'h') \
491 _st_("month", 'M') \
492 _st_("numeric", 'n') \
493 _st_("random", 'R') \
494 _st_("version", 'V')
496 static char const *const sort_args[] =
498 #define _st_(_s, _c) _s,
499 SORT_TABLE NULL
500 #undef _st_
502 static char const sort_types[] =
504 #define _st_(_s, _c) _c,
505 SORT_TABLE
506 #undef _st_
509 /* The set of signals that are caught. */
510 static sigset_t caught_signals;
512 /* Critical section status. */
513 struct cs_status
515 bool valid;
516 sigset_t sigs;
519 /* Enter a critical section. */
520 static struct cs_status
521 cs_enter (void)
523 struct cs_status status;
524 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
525 return status;
528 /* Leave a critical section. */
529 static void
530 cs_leave (struct cs_status status)
532 if (status.valid)
534 /* Ignore failure when restoring the signal mask. */
535 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
539 /* The list of temporary files. */
540 struct tempnode
542 struct tempnode *volatile next;
543 pid_t pid; /* If compressed, the pid of compressor, else zero */
544 char name[1]; /* Actual size is 1 + file name length. */
546 static struct tempnode *volatile temphead;
547 static struct tempnode *volatile *temptail = &temphead;
549 struct sortfile
551 char const *name;
552 pid_t pid; /* If compressed, the pid of compressor, else zero */
555 /* A table where we store compression process states. We clean up all
556 processes in a timely manner so as not to exhaust system resources,
557 so we store the info on whether the process is still running, or has
558 been reaped here. */
559 static Hash_table *proctab;
561 enum { INIT_PROCTAB_SIZE = 47 };
563 enum procstate { ALIVE, ZOMBIE };
565 /* A proctab entry. The COUNT field is there in case we fork a new
566 compression process that has the same PID as an old zombie process
567 that is still in the table (because the process to decompress the
568 temp file it was associated with hasn't started yet). */
569 struct procnode
571 pid_t pid;
572 enum procstate state;
573 size_t count;
576 static size_t
577 proctab_hasher (const void *entry, size_t tabsize)
579 const struct procnode *node = entry;
580 return node->pid % tabsize;
583 static bool
584 proctab_comparator (const void *e1, const void *e2)
586 const struct procnode *n1 = e1, *n2 = e2;
587 return n1->pid == n2->pid;
590 /* The total number of forked processes (compressors and decompressors)
591 that have not been reaped yet. */
592 static size_t nprocs;
594 /* The number of child processes we'll allow before we try to reap some. */
595 enum { MAX_PROCS_BEFORE_REAP = 2 };
597 /* If 0 < PID, wait for the child process with that PID to exit.
598 If PID is -1, clean up a random child process which has finished and
599 return the process ID of that child. If PID is -1 and no processes
600 have quit yet, return 0 without waiting. */
602 static pid_t
603 reap (pid_t pid)
605 int status;
606 pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
608 if (cpid < 0)
609 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
610 compress_program);
611 else if (0 < cpid)
613 if (! WIFEXITED (status) || WEXITSTATUS (status))
614 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
615 compress_program);
616 --nprocs;
619 return cpid;
622 /* Add the PID of a running compression process to proctab, or update
623 the entry COUNT and STATE fields if it's already there. This also
624 creates the table for us the first time it's called. */
626 static void
627 register_proc (pid_t pid)
629 struct procnode test, *node;
631 if (! proctab)
633 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
634 proctab_hasher,
635 proctab_comparator,
636 free);
637 if (! proctab)
638 xalloc_die ();
641 test.pid = pid;
642 node = hash_lookup (proctab, &test);
643 if (node)
645 node->state = ALIVE;
646 ++node->count;
648 else
650 node = xmalloc (sizeof *node);
651 node->pid = pid;
652 node->state = ALIVE;
653 node->count = 1;
654 if (hash_insert (proctab, node) == NULL)
655 xalloc_die ();
659 /* This is called when we reap a random process. We don't know
660 whether we have reaped a compression process or a decompression
661 process until we look in the table. If there's an ALIVE entry for
662 it, then we have reaped a compression process, so change the state
663 to ZOMBIE. Otherwise, it's a decompression processes, so ignore it. */
665 static void
666 update_proc (pid_t pid)
668 struct procnode test, *node;
670 test.pid = pid;
671 node = hash_lookup (proctab, &test);
672 if (node)
673 node->state = ZOMBIE;
676 /* This is for when we need to wait for a compression process to exit.
677 If it has a ZOMBIE entry in the table then it's already dead and has
678 been reaped. Note that if there's an ALIVE entry for it, it still may
679 already have died and been reaped if a second process was created with
680 the same PID. This is probably exceedingly rare, but to be on the safe
681 side we will have to wait for any compression process with this PID. */
683 static void
684 wait_proc (pid_t pid)
686 struct procnode test, *node;
688 test.pid = pid;
689 node = hash_lookup (proctab, &test);
690 if (node->state == ALIVE)
691 reap (pid);
693 node->state = ZOMBIE;
694 if (! --node->count)
696 hash_delete (proctab, node);
697 free (node);
701 /* Keep reaping finished children as long as there are more to reap.
702 This doesn't block waiting for any of them, it only reaps those
703 that are already dead. */
705 static void
706 reap_some (void)
708 pid_t pid;
710 while (0 < nprocs && (pid = reap (-1)))
711 update_proc (pid);
714 /* Clean up any remaining temporary files. */
716 static void
717 cleanup (void)
719 struct tempnode const *node;
721 for (node = temphead; node; node = node->next)
722 unlink (node->name);
723 temphead = NULL;
726 /* Cleanup actions to take when exiting. */
728 static void
729 exit_cleanup (void)
731 if (temphead)
733 /* Clean up any remaining temporary files in a critical section so
734 that a signal handler does not try to clean them too. */
735 struct cs_status cs = cs_enter ();
736 cleanup ();
737 cs_leave (cs);
740 close_stdout ();
743 /* Create a new temporary file, returning its newly allocated tempnode.
744 Store into *PFD the file descriptor open for writing.
745 If the creation fails, return NULL and store -1 into *PFD if the
746 failure is due to file descriptor exhaustion and
747 SURVIVE_FD_EXHAUSTION; otherwise, die. */
749 static struct tempnode *
750 create_temp_file (int *pfd, bool survive_fd_exhaustion)
752 static char const slashbase[] = "/sortXXXXXX";
753 static size_t temp_dir_index;
754 int fd;
755 int saved_errno;
756 char const *temp_dir = temp_dirs[temp_dir_index];
757 size_t len = strlen (temp_dir);
758 struct tempnode *node =
759 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
760 char *file = node->name;
761 struct cs_status cs;
763 memcpy (file, temp_dir, len);
764 memcpy (file + len, slashbase, sizeof slashbase);
765 node->next = NULL;
766 node->pid = 0;
767 if (++temp_dir_index == temp_dir_count)
768 temp_dir_index = 0;
770 /* Create the temporary file in a critical section, to avoid races. */
771 cs = cs_enter ();
772 fd = mkstemp (file);
773 if (0 <= fd)
775 *temptail = node;
776 temptail = &node->next;
778 saved_errno = errno;
779 cs_leave (cs);
780 errno = saved_errno;
782 if (fd < 0)
784 if (! (survive_fd_exhaustion && errno == EMFILE))
785 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
786 quote (temp_dir));
787 free (node);
788 node = NULL;
791 *pfd = fd;
792 return node;
795 /* Return a stream for FILE, opened with mode HOW. A null FILE means
796 standard output; HOW should be "w". When opening for input, "-"
797 means standard input. To avoid confusion, do not return file
798 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
799 opening an ordinary FILE. Return NULL if unsuccessful. */
801 static FILE *
802 stream_open (const char *file, const char *how)
804 if (!file)
805 return stdout;
806 if (STREQ (file, "-") && *how == 'r')
808 have_read_stdin = true;
809 return stdin;
811 return fopen (file, how);
814 /* Same as stream_open, except always return a non-null value; die on
815 failure. */
817 static FILE *
818 xfopen (const char *file, const char *how)
820 FILE *fp = stream_open (file, how);
821 if (!fp)
822 die (_("open failed"), file);
823 return fp;
826 /* Close FP, whose name is FILE, and report any errors. */
828 static void
829 xfclose (FILE *fp, char const *file)
831 switch (fileno (fp))
833 case STDIN_FILENO:
834 /* Allow reading stdin from tty more than once. */
835 if (feof (fp))
836 clearerr (fp);
837 break;
839 case STDOUT_FILENO:
840 /* Don't close stdout just yet. close_stdout does that. */
841 if (fflush (fp) != 0)
842 die (_("fflush failed"), file);
843 break;
845 default:
846 if (fclose (fp) != 0)
847 die (_("close failed"), file);
848 break;
852 static void
853 dup2_or_die (int oldfd, int newfd)
855 if (dup2 (oldfd, newfd) < 0)
856 error (SORT_FAILURE, errno, _("dup2 failed"));
859 /* Fork a child process for piping to and do common cleanup. The
860 TRIES parameter tells us how many times to try to fork before
861 giving up. Return the PID of the child, or -1 (setting errno)
862 on failure. */
864 static pid_t
865 pipe_fork (int pipefds[2], size_t tries)
867 #if HAVE_WORKING_FORK
868 struct tempnode *saved_temphead;
869 int saved_errno;
870 unsigned int wait_retry = 1;
871 pid_t pid IF_LINT (= -1);
872 struct cs_status cs;
874 if (pipe (pipefds) < 0)
875 return -1;
877 while (tries--)
879 /* This is so the child process won't delete our temp files
880 if it receives a signal before exec-ing. */
881 cs = cs_enter ();
882 saved_temphead = temphead;
883 temphead = NULL;
885 pid = fork ();
886 saved_errno = errno;
887 if (pid)
888 temphead = saved_temphead;
890 cs_leave (cs);
891 errno = saved_errno;
893 if (0 <= pid || errno != EAGAIN)
894 break;
895 else
897 sleep (wait_retry);
898 wait_retry *= 2;
899 reap_some ();
903 if (pid < 0)
905 saved_errno = errno;
906 close (pipefds[0]);
907 close (pipefds[1]);
908 errno = saved_errno;
910 else if (pid == 0)
912 close (STDIN_FILENO);
913 close (STDOUT_FILENO);
915 else
916 ++nprocs;
918 return pid;
920 #else /* ! HAVE_WORKING_FORK */
921 return -1;
922 #endif
925 /* Create a temporary file and start a compression program to filter output
926 to that file. Set *PFP to the file handle and if PPID is non-NULL,
927 set *PPID to the PID of the newly-created process. If the creation
928 fails, return NULL if the failure is due to file descriptor
929 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
931 static char *
932 maybe_create_temp (FILE **pfp, pid_t *ppid, bool survive_fd_exhaustion)
934 int tempfd;
935 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
936 char *name;
938 if (! node)
939 return NULL;
941 name = node->name;
943 if (compress_program)
945 int pipefds[2];
947 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
948 if (0 < node->pid)
950 close (tempfd);
951 close (pipefds[0]);
952 tempfd = pipefds[1];
954 register_proc (node->pid);
956 else if (node->pid == 0)
958 close (pipefds[1]);
959 dup2_or_die (tempfd, STDOUT_FILENO);
960 close (tempfd);
961 dup2_or_die (pipefds[0], STDIN_FILENO);
962 close (pipefds[0]);
964 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
965 error (SORT_FAILURE, errno, _("couldn't execute %s"),
966 compress_program);
968 else
969 node->pid = 0;
972 *pfp = fdopen (tempfd, "w");
973 if (! *pfp)
974 die (_("couldn't create temporary file"), name);
976 if (ppid)
977 *ppid = node->pid;
979 return name;
982 /* Create a temporary file and start a compression program to filter output
983 to that file. Set *PFP to the file handle and if *PPID is non-NULL,
984 set it to the PID of the newly-created process. Die on failure. */
986 static char *
987 create_temp (FILE **pfp, pid_t *ppid)
989 return maybe_create_temp (pfp, ppid, false);
992 /* Open a compressed temp file and start a decompression process through
993 which to filter the input. PID must be the valid processes ID of the
994 process used to compress the file. Return NULL (setting errno to
995 EMFILE) if we ran out of file descriptors, and die on any other
996 kind of failure. */
998 static FILE *
999 open_temp (const char *name, pid_t pid)
1001 int tempfd, pipefds[2];
1002 FILE *fp = NULL;
1004 wait_proc (pid);
1006 tempfd = open (name, O_RDONLY);
1007 if (tempfd < 0)
1008 return NULL;
1010 switch (pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS))
1012 case -1:
1013 if (errno != EMFILE)
1014 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1015 compress_program);
1016 close (tempfd);
1017 errno = EMFILE;
1018 break;
1020 case 0:
1021 close (pipefds[0]);
1022 dup2_or_die (tempfd, STDIN_FILENO);
1023 close (tempfd);
1024 dup2_or_die (pipefds[1], STDOUT_FILENO);
1025 close (pipefds[1]);
1027 execlp (compress_program, compress_program, "-d", (char *) NULL);
1028 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1029 compress_program);
1031 default:
1032 close (tempfd);
1033 close (pipefds[1]);
1035 fp = fdopen (pipefds[0], "r");
1036 if (! fp)
1038 int saved_errno = errno;
1039 close (pipefds[0]);
1040 errno = saved_errno;
1042 break;
1045 return fp;
1048 static void
1049 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
1051 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
1052 die (_("write failed"), output_file);
1055 /* Append DIR to the array of temporary directory names. */
1056 static void
1057 add_temp_dir (char const *dir)
1059 if (temp_dir_count == temp_dir_alloc)
1060 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1062 temp_dirs[temp_dir_count++] = dir;
1065 /* Remove NAME from the list of temporary files. */
1067 static void
1068 zaptemp (const char *name)
1070 struct tempnode *volatile *pnode;
1071 struct tempnode *node;
1072 struct tempnode *next;
1073 int unlink_status;
1074 int unlink_errno = 0;
1075 struct cs_status cs;
1077 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1078 continue;
1080 /* Unlink the temporary file in a critical section to avoid races. */
1081 next = node->next;
1082 cs = cs_enter ();
1083 unlink_status = unlink (name);
1084 unlink_errno = errno;
1085 *pnode = next;
1086 cs_leave (cs);
1088 if (unlink_status != 0)
1089 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1090 if (! next)
1091 temptail = pnode;
1092 free (node);
1095 #if HAVE_NL_LANGINFO
1097 static int
1098 struct_month_cmp (const void *m1, const void *m2)
1100 struct month const *month1 = m1;
1101 struct month const *month2 = m2;
1102 return strcmp (month1->name, month2->name);
1105 #endif
1107 /* Initialize the character class tables. */
1109 static void
1110 inittables (void)
1112 size_t i;
1114 for (i = 0; i < UCHAR_LIM; ++i)
1116 blanks[i] = !! isblank (i);
1117 nonprinting[i] = ! isprint (i);
1118 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1119 fold_toupper[i] = toupper (i);
1122 #if HAVE_NL_LANGINFO
1123 /* If we're not in the "C" locale, read different names for months. */
1124 if (hard_LC_TIME)
1126 for (i = 0; i < MONTHS_PER_YEAR; i++)
1128 char const *s;
1129 size_t s_len;
1130 size_t j;
1131 char *name;
1133 s = (char *) nl_langinfo (ABMON_1 + i);
1134 s_len = strlen (s);
1135 monthtab[i].name = name = xmalloc (s_len + 1);
1136 monthtab[i].val = i + 1;
1138 for (j = 0; j < s_len; j++)
1139 name[j] = fold_toupper[to_uchar (s[j])];
1140 name[j] = '\0';
1142 qsort ((void *) monthtab, MONTHS_PER_YEAR,
1143 sizeof *monthtab, struct_month_cmp);
1145 #endif
1148 /* Specify how many inputs may be merged at once.
1149 This may be set on the command-line with the
1150 --batch-size option. */
1151 static void
1152 specify_nmerge (int oi, char c, char const *s)
1154 uintmax_t n;
1155 struct rlimit rlimit;
1156 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1158 /* Try to find out how many file descriptors we'll be able
1159 to open. We need at least nmerge + 3 (STDIN_FILENO,
1160 STDOUT_FILENO and STDERR_FILENO). */
1161 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1162 ? rlimit.rlim_cur
1163 : OPEN_MAX)
1164 - 3);
1166 if (e == LONGINT_OK)
1168 nmerge = n;
1169 if (nmerge != n)
1170 e = LONGINT_OVERFLOW;
1171 else
1173 if (nmerge < 2)
1175 error (0, 0, _("invalid --%s argument %s"),
1176 long_options[oi].name, quote(s));
1177 error (SORT_FAILURE, 0,
1178 _("minimum --%s argument is %s"),
1179 long_options[oi].name, quote("2"));
1181 else if (max_nmerge < nmerge)
1183 e = LONGINT_OVERFLOW;
1185 else
1186 return;
1190 if (e == LONGINT_OVERFLOW)
1192 char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
1193 error (0, 0, _("--%s argument %s too large"),
1194 long_options[oi].name, quote(s));
1195 error (SORT_FAILURE, 0,
1196 _("maximum --%s argument with current rlimit is %s"),
1197 long_options[oi].name,
1198 uinttostr (max_nmerge, max_nmerge_buf));
1200 else
1201 xstrtol_fatal (e, oi, c, long_options, s);
1204 /* Specify the amount of main memory to use when sorting. */
1205 static void
1206 specify_sort_size (int oi, char c, char const *s)
1208 uintmax_t n;
1209 char *suffix;
1210 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1212 /* The default unit is KiB. */
1213 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1215 if (n <= UINTMAX_MAX / 1024)
1216 n *= 1024;
1217 else
1218 e = LONGINT_OVERFLOW;
1221 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1222 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1223 switch (suffix[0])
1225 case 'b':
1226 e = LONGINT_OK;
1227 break;
1229 case '%':
1231 double mem = physmem_total () * n / 100;
1233 /* Use "<", not "<=", to avoid problems with rounding. */
1234 if (mem < UINTMAX_MAX)
1236 n = mem;
1237 e = LONGINT_OK;
1239 else
1240 e = LONGINT_OVERFLOW;
1242 break;
1245 if (e == LONGINT_OK)
1247 /* If multiple sort sizes are specified, take the maximum, so
1248 that option order does not matter. */
1249 if (n < sort_size)
1250 return;
1252 sort_size = n;
1253 if (sort_size == n)
1255 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1256 return;
1259 e = LONGINT_OVERFLOW;
1262 xstrtol_fatal (e, oi, c, long_options, s);
1265 /* Return the default sort size. */
1266 static size_t
1267 default_sort_size (void)
1269 /* Let MEM be available memory or 1/8 of total memory, whichever
1270 is greater. */
1271 double avail = physmem_available ();
1272 double total = physmem_total ();
1273 double mem = MAX (avail, total / 8);
1274 struct rlimit rlimit;
1276 /* Let SIZE be MEM, but no more than the maximum object size or
1277 system resource limits. Avoid the MIN macro here, as it is not
1278 quite right when only one argument is floating point. Don't
1279 bother to check for values like RLIM_INFINITY since in practice
1280 they are not much less than SIZE_MAX. */
1281 size_t size = SIZE_MAX;
1282 if (mem < size)
1283 size = mem;
1284 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1285 size = rlimit.rlim_cur;
1286 #ifdef RLIMIT_AS
1287 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1288 size = rlimit.rlim_cur;
1289 #endif
1291 /* Leave a large safety margin for the above limits, as failure can
1292 occur when they are exceeded. */
1293 size /= 2;
1295 #ifdef RLIMIT_RSS
1296 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1297 Exceeding RSS is not fatal, but can be quite slow. */
1298 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1299 size = rlimit.rlim_cur / 16 * 15;
1300 #endif
1302 /* Use no less than the minimum. */
1303 return MAX (size, MIN_SORT_SIZE);
1306 /* Return the sort buffer size to use with the input files identified
1307 by FPS and FILES, which are alternate names of the same files.
1308 NFILES gives the number of input files; NFPS may be less. Assume
1309 that each input line requires LINE_BYTES extra bytes' worth of line
1310 information. Do not exceed the size bound specified by the user
1311 (or a default size bound, if the user does not specify one). */
1313 static size_t
1314 sort_buffer_size (FILE *const *fps, size_t nfps,
1315 char *const *files, size_t nfiles,
1316 size_t line_bytes)
1318 /* A bound on the input size. If zero, the bound hasn't been
1319 determined yet. */
1320 static size_t size_bound;
1322 /* In the worst case, each input byte is a newline. */
1323 size_t worst_case_per_input_byte = line_bytes + 1;
1325 /* Keep enough room for one extra input line and an extra byte.
1326 This extra room might be needed when preparing to read EOF. */
1327 size_t size = worst_case_per_input_byte + 1;
1329 size_t i;
1331 for (i = 0; i < nfiles; i++)
1333 struct stat st;
1334 off_t file_size;
1335 size_t worst_case;
1337 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1338 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1339 : stat (files[i], &st))
1340 != 0)
1341 die (_("stat failed"), files[i]);
1343 if (S_ISREG (st.st_mode))
1344 file_size = st.st_size;
1345 else
1347 /* The file has unknown size. If the user specified a sort
1348 buffer size, use that; otherwise, guess the size. */
1349 if (sort_size)
1350 return sort_size;
1351 file_size = INPUT_FILE_SIZE_GUESS;
1354 if (! size_bound)
1356 size_bound = sort_size;
1357 if (! size_bound)
1358 size_bound = default_sort_size ();
1361 /* Add the amount of memory needed to represent the worst case
1362 where the input consists entirely of newlines followed by a
1363 single non-newline. Check for overflow. */
1364 worst_case = file_size * worst_case_per_input_byte + 1;
1365 if (file_size != worst_case / worst_case_per_input_byte
1366 || size_bound - size <= worst_case)
1367 return size_bound;
1368 size += worst_case;
1371 return size;
1374 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1375 must be at least sizeof (struct line). Allocate ALLOC bytes
1376 initially. */
1378 static void
1379 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1381 /* Ensure that the line array is properly aligned. If the desired
1382 size cannot be allocated, repeatedly halve it until allocation
1383 succeeds. The smaller allocation may hurt overall performance,
1384 but that's better than failing. */
1385 for (;;)
1387 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1388 buf->buf = malloc (alloc);
1389 if (buf->buf)
1390 break;
1391 alloc /= 2;
1392 if (alloc <= line_bytes + 1)
1393 xalloc_die ();
1396 buf->line_bytes = line_bytes;
1397 buf->alloc = alloc;
1398 buf->used = buf->left = buf->nlines = 0;
1399 buf->eof = false;
1402 /* Return one past the limit of the line array. */
1404 static inline struct line *
1405 buffer_linelim (struct buffer const *buf)
1407 return (struct line *) (buf->buf + buf->alloc);
1410 /* Return a pointer to the first character of the field specified
1411 by KEY in LINE. */
1413 static char *
1414 begfield (const struct line *line, const struct keyfield *key)
1416 char *ptr = line->text, *lim = ptr + line->length - 1;
1417 size_t sword = key->sword;
1418 size_t schar = key->schar;
1420 /* The leading field separator itself is included in a field when -t
1421 is absent. */
1423 if (tab != TAB_DEFAULT)
1424 while (ptr < lim && sword--)
1426 while (ptr < lim && *ptr != tab)
1427 ++ptr;
1428 if (ptr < lim)
1429 ++ptr;
1431 else
1432 while (ptr < lim && sword--)
1434 while (ptr < lim && blanks[to_uchar (*ptr)])
1435 ++ptr;
1436 while (ptr < lim && !blanks[to_uchar (*ptr)])
1437 ++ptr;
1440 /* If we're ignoring leading blanks when computing the Start
1441 of the field, skip past them here. */
1442 if (key->skipsblanks)
1443 while (ptr < lim && blanks[to_uchar (*ptr)])
1444 ++ptr;
1446 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1447 ptr = MIN (lim, ptr + schar);
1449 return ptr;
1452 /* Return the limit of (a pointer to the first character after) the field
1453 in LINE specified by KEY. */
1455 static char *
1456 limfield (const struct line *line, const struct keyfield *key)
1458 char *ptr = line->text, *lim = ptr + line->length - 1;
1459 size_t eword = key->eword, echar = key->echar;
1461 if (echar == 0)
1462 eword++; /* Skip all of end field. */
1464 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1465 whichever comes first. If there are more than EWORD fields, leave
1466 PTR pointing at the beginning of the field having zero-based index,
1467 EWORD. If a delimiter character was specified (via -t), then that
1468 `beginning' is the first character following the delimiting TAB.
1469 Otherwise, leave PTR pointing at the first `blank' character after
1470 the preceding field. */
1471 if (tab != TAB_DEFAULT)
1472 while (ptr < lim && eword--)
1474 while (ptr < lim && *ptr != tab)
1475 ++ptr;
1476 if (ptr < lim && (eword | echar))
1477 ++ptr;
1479 else
1480 while (ptr < lim && eword--)
1482 while (ptr < lim && blanks[to_uchar (*ptr)])
1483 ++ptr;
1484 while (ptr < lim && !blanks[to_uchar (*ptr)])
1485 ++ptr;
1488 #ifdef POSIX_UNSPECIFIED
1489 /* The following block of code makes GNU sort incompatible with
1490 standard Unix sort, so it's ifdef'd out for now.
1491 The POSIX spec isn't clear on how to interpret this.
1492 FIXME: request clarification.
1494 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1495 Date: Thu, 30 May 96 12:20:41 -0400
1496 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1498 [...]I believe I've found another bug in `sort'.
1500 $ cat /tmp/sort.in
1501 a b c 2 d
1502 pq rs 1 t
1503 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1504 a b c 2 d
1505 pq rs 1 t
1506 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1507 pq rs 1 t
1508 a b c 2 d
1510 Unix sort produced the answer I expected: sort on the single character
1511 in column 7. GNU sort produced different results, because it disagrees
1512 on the interpretation of the key-end spec "M.N". Unix sort reads this
1513 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1514 "skip M-1 fields, then either N-1 characters or the rest of the current
1515 field, whichever comes first". This extra clause applies only to
1516 key-ends, not key-starts.
1519 /* Make LIM point to the end of (one byte past) the current field. */
1520 if (tab != TAB_DEFAULT)
1522 char *newlim;
1523 newlim = memchr (ptr, tab, lim - ptr);
1524 if (newlim)
1525 lim = newlim;
1527 else
1529 char *newlim;
1530 newlim = ptr;
1531 while (newlim < lim && blanks[to_uchar (*newlim)])
1532 ++newlim;
1533 while (newlim < lim && !blanks[to_uchar (*newlim)])
1534 ++newlim;
1535 lim = newlim;
1537 #endif
1539 if (echar != 0) /* We need to skip over a portion of the end field. */
1541 /* If we're ignoring leading blanks when computing the End
1542 of the field, skip past them here. */
1543 if (key->skipeblanks)
1544 while (ptr < lim && blanks[to_uchar (*ptr)])
1545 ++ptr;
1547 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1548 ptr = MIN (lim, ptr + echar);
1551 return ptr;
1554 /* Fill BUF reading from FP, moving buf->left bytes from the end
1555 of buf->buf to the beginning first. If EOF is reached and the
1556 file wasn't terminated by a newline, supply one. Set up BUF's line
1557 table too. FILE is the name of the file corresponding to FP.
1558 Return true if some input was read. */
1560 static bool
1561 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1563 struct keyfield const *key = keylist;
1564 char eol = eolchar;
1565 size_t line_bytes = buf->line_bytes;
1566 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1568 if (buf->eof)
1569 return false;
1571 if (buf->used != buf->left)
1573 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1574 buf->used = buf->left;
1575 buf->nlines = 0;
1578 for (;;)
1580 char *ptr = buf->buf + buf->used;
1581 struct line *linelim = buffer_linelim (buf);
1582 struct line *line = linelim - buf->nlines;
1583 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1584 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1586 while (line_bytes + 1 < avail)
1588 /* Read as many bytes as possible, but do not read so many
1589 bytes that there might not be enough room for the
1590 corresponding line array. The worst case is when the
1591 rest of the input file consists entirely of newlines,
1592 except that the last byte is not a newline. */
1593 size_t readsize = (avail - 1) / (line_bytes + 1);
1594 size_t bytes_read = fread (ptr, 1, readsize, fp);
1595 char *ptrlim = ptr + bytes_read;
1596 char *p;
1597 avail -= bytes_read;
1599 if (bytes_read != readsize)
1601 if (ferror (fp))
1602 die (_("read failed"), file);
1603 if (feof (fp))
1605 buf->eof = true;
1606 if (buf->buf == ptrlim)
1607 return false;
1608 if (ptrlim[-1] != eol)
1609 *ptrlim++ = eol;
1613 /* Find and record each line in the just-read input. */
1614 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1616 ptr = p + 1;
1617 line--;
1618 line->text = line_start;
1619 line->length = ptr - line_start;
1620 mergesize = MAX (mergesize, line->length);
1621 avail -= line_bytes;
1623 if (key)
1625 /* Precompute the position of the first key for
1626 efficiency. */
1627 line->keylim = (key->eword == SIZE_MAX
1629 : limfield (line, key));
1631 if (key->sword != SIZE_MAX)
1632 line->keybeg = begfield (line, key);
1633 else
1635 if (key->skipsblanks)
1636 while (blanks[to_uchar (*line_start)])
1637 line_start++;
1638 line->keybeg = line_start;
1642 line_start = ptr;
1645 ptr = ptrlim;
1646 if (buf->eof)
1647 break;
1650 buf->used = ptr - buf->buf;
1651 buf->nlines = buffer_linelim (buf) - line;
1652 if (buf->nlines != 0)
1654 buf->left = ptr - line_start;
1655 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1656 return true;
1660 /* The current input line is too long to fit in the buffer.
1661 Double the buffer size and try again, keeping it properly
1662 aligned. */
1663 size_t line_alloc = buf->alloc / sizeof (struct line);
1664 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1665 buf->alloc = line_alloc * sizeof (struct line);
1670 /* Compare strings A and B as numbers without explicitly converting them to
1671 machine numbers. Comparatively slow for short strings, but asymptotically
1672 hideously fast. */
1674 static int
1675 numcompare (const char *a, const char *b)
1677 while (blanks[to_uchar (*a)])
1678 a++;
1679 while (blanks[to_uchar (*b)])
1680 b++;
1682 return strnumcmp (a, b, decimal_point, thousands_sep);
1685 /* Exit with an error if a mixture of SI and IEC units detected. */
1687 static void
1688 check_mixed_SI_IEC (char prefix, struct keyfield *key)
1690 int si_present = prefix == 'i';
1691 if (key->si_present != -1 && si_present != key->si_present)
1692 error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
1693 key->si_present = si_present;
1696 /* Return an integer which represents the order of magnitude of
1697 the unit following the number. NUMBER can contain thousands separators
1698 or a decimal point, but not have preceeding blanks.
1699 Negative numbers return a negative unit order. */
1701 static int
1702 find_unit_order (const char *number, struct keyfield *key)
1704 static const char orders [UCHAR_LIM] =
1706 #if SOME_DAY_WE_WILL_REQUIRE_C99
1707 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1708 ['k']=1,
1709 #else
1710 /* Generate the following table with this command:
1711 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1712 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1713 |fmt */
1714 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1715 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1716 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1717 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1718 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1719 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1720 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1721 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1722 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1723 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1724 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1725 #endif
1728 const unsigned char *p = number;
1730 int sign = 1;
1732 if (*p == '-')
1734 sign = -1;
1735 p++;
1738 /* Scan to end of number.
1739 Decimals or separators not followed by digits stop the scan.
1740 Numbers ending in decimals or separators are thus considered
1741 to be lacking in units.
1742 FIXME: add support for multibyte thousands_sep and decimal_point. */
1744 while (ISDIGIT (*p))
1746 p++;
1748 if (*p == decimal_point && ISDIGIT (*(p + 1)))
1749 p += 2;
1750 else if (*p == thousands_sep && ISDIGIT (*(p + 1)))
1751 p += 2;
1754 int order = orders[*p];
1756 /* For valid units check for MiB vs MB etc. */
1757 if (order)
1758 check_mixed_SI_IEC (*(p + 1), key);
1760 return sign * order;
1763 /* Compare numbers ending in units with SI xor IEC prefixes
1764 <none/unknown> < K/k < M < G < T < P < E < Z < Y
1765 Assume that numbers are properly abbreviated.
1766 i.e. input will never have both 6000K and 5M. */
1768 static int
1769 human_numcompare (const char *a, const char *b, struct keyfield *key)
1771 while (blanks[to_uchar (*a)])
1772 a++;
1773 while (blanks[to_uchar (*b)])
1774 b++;
1776 int order_a = find_unit_order (a, key);
1777 int order_b = find_unit_order (b, key);
1779 return (order_a > order_b ? 1
1780 : order_a < order_b ? -1
1781 : strnumcmp (a, b, decimal_point, thousands_sep));
1784 static int
1785 general_numcompare (const char *sa, const char *sb)
1787 /* FIXME: add option to warn about failed conversions. */
1788 /* FIXME: maybe add option to try expensive FP conversion
1789 only if A and B can't be compared more cheaply/accurately. */
1791 char *ea;
1792 char *eb;
1793 double a = strtod (sa, &ea);
1794 double b = strtod (sb, &eb);
1796 /* Put conversion errors at the start of the collating sequence. */
1797 if (sa == ea)
1798 return sb == eb ? 0 : -1;
1799 if (sb == eb)
1800 return 1;
1802 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1803 conversion errors but before numbers; sort them by internal
1804 bit-pattern, for lack of a more portable alternative. */
1805 return (a < b ? -1
1806 : a > b ? 1
1807 : a == b ? 0
1808 : b == b ? -1
1809 : a == a ? 1
1810 : memcmp ((char *) &a, (char *) &b, sizeof a));
1813 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1814 Return 0 if the name in S is not recognized. */
1816 static int
1817 getmonth (char const *month, size_t len)
1819 size_t lo = 0;
1820 size_t hi = MONTHS_PER_YEAR;
1821 char const *monthlim = month + len;
1823 for (;;)
1825 if (month == monthlim)
1826 return 0;
1827 if (!blanks[to_uchar (*month)])
1828 break;
1829 ++month;
1834 size_t ix = (lo + hi) / 2;
1835 char const *m = month;
1836 char const *n = monthtab[ix].name;
1838 for (;; m++, n++)
1840 if (!*n)
1841 return monthtab[ix].val;
1842 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1844 hi = ix;
1845 break;
1847 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1849 lo = ix + 1;
1850 break;
1854 while (lo < hi);
1856 return 0;
1859 /* A source of random data. */
1860 static struct randread_source *randread_source;
1862 /* Return the Ith randomly-generated state. The caller must invoke
1863 random_state (H) for all H less than I before invoking random_state
1864 (I). */
1866 static struct md5_ctx
1867 random_state (size_t i)
1869 /* An array of states resulting from the random data, and counts of
1870 its used and allocated members. */
1871 static struct md5_ctx *state;
1872 static size_t used;
1873 static size_t allocated;
1875 struct md5_ctx *s = &state[i];
1877 if (used <= i)
1879 unsigned char buf[MD5_DIGEST_SIZE];
1881 used++;
1883 if (allocated <= i)
1885 state = X2NREALLOC (state, &allocated);
1886 s = &state[i];
1889 randread (randread_source, buf, sizeof buf);
1890 md5_init_ctx (s);
1891 md5_process_bytes (buf, sizeof buf, s);
1894 return *s;
1897 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1898 with length LENGTHB. Return negative if less, zero if equal,
1899 positive if greater. */
1901 static int
1902 cmp_hashes (char const *texta, size_t lena,
1903 char const *textb, size_t lenb)
1905 /* Try random hashes until a pair of hashes disagree. But if the
1906 first pair of random hashes agree, check whether the keys are
1907 identical and if so report no difference. */
1908 int diff;
1909 size_t i;
1910 for (i = 0; ; i++)
1912 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1913 struct md5_ctx s[2];
1914 s[0] = s[1] = random_state (i);
1915 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
1916 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
1917 diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1918 if (diff != 0)
1919 break;
1920 if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1921 break;
1924 return diff;
1927 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1928 using one or more random hash functions. */
1930 static int
1931 compare_random (char *restrict texta, size_t lena,
1932 char *restrict textb, size_t lenb)
1934 int diff;
1936 if (! hard_LC_COLLATE)
1937 diff = cmp_hashes (texta, lena, textb, lenb);
1938 else
1940 /* Transform the text into the basis of comparison, so that byte
1941 strings that would otherwise considered to be equal are
1942 considered equal here even if their bytes differ. */
1944 char *buf = NULL;
1945 char stackbuf[4000];
1946 size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1947 bool a_fits = tlena <= sizeof stackbuf;
1948 size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1949 (a_fits ? sizeof stackbuf - tlena : 0),
1950 textb, lenb);
1952 if (a_fits && tlena + tlenb <= sizeof stackbuf)
1953 buf = stackbuf;
1954 else
1956 /* Adding 1 to the buffer size lets xmemxfrm run a bit
1957 faster by avoiding the need for an extra buffer copy. */
1958 buf = xmalloc (tlena + tlenb + 1);
1959 xmemxfrm (buf, tlena + 1, texta, lena);
1960 xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1963 diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1965 if (buf != stackbuf)
1966 free (buf);
1969 return diff;
1972 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1973 using filevercmp. See lib/filevercmp.h for function description. */
1975 static int
1976 compare_version (char *restrict texta, size_t lena,
1977 char *restrict textb, size_t lenb)
1979 int diff;
1981 /* It is necessary to save the character after the end of the field.
1982 "filevercmp" works with NUL terminated strings. Our blocks of
1983 text are not necessarily terminated with a NUL byte. */
1984 char sv_a = texta[lena];
1985 char sv_b = textb[lenb];
1987 texta[lena] = '\0';
1988 textb[lenb] = '\0';
1990 diff = filevercmp (texta, textb);
1992 texta[lena] = sv_a;
1993 textb[lenb] = sv_b;
1995 return diff;
1998 /* Compare two lines A and B trying every key in sequence until there
1999 are no more keys or a difference is found. */
2001 static int
2002 keycompare (const struct line *a, const struct line *b)
2004 struct keyfield *key = keylist;
2006 /* For the first iteration only, the key positions have been
2007 precomputed for us. */
2008 char *texta = a->keybeg;
2009 char *textb = b->keybeg;
2010 char *lima = a->keylim;
2011 char *limb = b->keylim;
2013 int diff;
2015 for (;;)
2017 char const *translate = key->translate;
2018 bool const *ignore = key->ignore;
2020 /* Treat field ends before field starts as empty fields. */
2021 lima = MAX (texta, lima);
2022 limb = MAX (textb, limb);
2024 /* Find the lengths. */
2025 size_t lena = lima - texta;
2026 size_t lenb = limb - textb;
2028 /* Actually compare the fields. */
2030 if (key->random)
2031 diff = compare_random (texta, lena, textb, lenb);
2032 else if (key->numeric | key->general_numeric | key->human_numeric)
2034 char savea = *lima, saveb = *limb;
2036 *lima = *limb = '\0';
2037 diff = (key->numeric ? numcompare (texta, textb)
2038 : key->general_numeric ? general_numcompare (texta, textb)
2039 : human_numcompare (texta, textb, key));
2040 *lima = savea, *limb = saveb;
2042 else if (key->version)
2043 diff = compare_version (texta, lena, textb, lenb);
2044 else if (key->month)
2045 diff = getmonth (texta, lena) - getmonth (textb, lenb);
2046 /* Sorting like this may become slow, so in a simple locale the user
2047 can select a faster sort that is similar to ascii sort. */
2048 else if (hard_LC_COLLATE)
2050 if (ignore || translate)
2052 char buf[4000];
2053 size_t size = lena + 1 + lenb + 1;
2054 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
2055 char *copy_b = copy_a + lena + 1;
2056 size_t new_len_a, new_len_b, i;
2058 /* Ignore and/or translate chars before comparing. */
2059 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
2061 if (i < lena)
2063 copy_a[new_len_a] = (translate
2064 ? translate[to_uchar (texta[i])]
2065 : texta[i]);
2066 if (!ignore || !ignore[to_uchar (texta[i])])
2067 ++new_len_a;
2069 if (i < lenb)
2071 copy_b[new_len_b] = (translate
2072 ? translate[to_uchar (textb[i])]
2073 : textb [i]);
2074 if (!ignore || !ignore[to_uchar (textb[i])])
2075 ++new_len_b;
2079 diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
2081 if (sizeof buf < size)
2082 free (copy_a);
2084 else if (lena == 0)
2085 diff = - NONZERO (lenb);
2086 else if (lenb == 0)
2087 goto greater;
2088 else
2089 diff = xmemcoll (texta, lena, textb, lenb);
2091 else if (ignore)
2093 #define CMP_WITH_IGNORE(A, B) \
2094 do \
2096 for (;;) \
2098 while (texta < lima && ignore[to_uchar (*texta)]) \
2099 ++texta; \
2100 while (textb < limb && ignore[to_uchar (*textb)]) \
2101 ++textb; \
2102 if (! (texta < lima && textb < limb)) \
2103 break; \
2104 diff = to_uchar (A) - to_uchar (B); \
2105 if (diff) \
2106 goto not_equal; \
2107 ++texta; \
2108 ++textb; \
2111 diff = (texta < lima) - (textb < limb); \
2113 while (0)
2115 if (translate)
2116 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2117 translate[to_uchar (*textb)]);
2118 else
2119 CMP_WITH_IGNORE (*texta, *textb);
2121 else if (lena == 0)
2122 diff = - NONZERO (lenb);
2123 else if (lenb == 0)
2124 goto greater;
2125 else
2127 if (translate)
2129 while (texta < lima && textb < limb)
2131 diff = (to_uchar (translate[to_uchar (*texta++)])
2132 - to_uchar (translate[to_uchar (*textb++)]));
2133 if (diff)
2134 goto not_equal;
2137 else
2139 diff = memcmp (texta, textb, MIN (lena, lenb));
2140 if (diff)
2141 goto not_equal;
2143 diff = lena < lenb ? -1 : lena != lenb;
2146 if (diff)
2147 goto not_equal;
2149 key = key->next;
2150 if (! key)
2151 break;
2153 /* Find the beginning and limit of the next field. */
2154 if (key->eword != SIZE_MAX)
2155 lima = limfield (a, key), limb = limfield (b, key);
2156 else
2157 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2159 if (key->sword != SIZE_MAX)
2160 texta = begfield (a, key), textb = begfield (b, key);
2161 else
2163 texta = a->text, textb = b->text;
2164 if (key->skipsblanks)
2166 while (texta < lima && blanks[to_uchar (*texta)])
2167 ++texta;
2168 while (textb < limb && blanks[to_uchar (*textb)])
2169 ++textb;
2174 return 0;
2176 greater:
2177 diff = 1;
2178 not_equal:
2179 return key->reverse ? -diff : diff;
2182 /* Compare two lines A and B, returning negative, zero, or positive
2183 depending on whether A compares less than, equal to, or greater than B. */
2185 static int
2186 compare (const struct line *a, const struct line *b)
2188 int diff;
2189 size_t alen, blen;
2191 /* First try to compare on the specified keys (if any).
2192 The only two cases with no key at all are unadorned sort,
2193 and unadorned sort -r. */
2194 if (keylist)
2196 diff = keycompare (a, b);
2197 if (diff | unique | stable)
2198 return diff;
2201 /* If the keys all compare equal (or no keys were specified)
2202 fall through to the default comparison. */
2203 alen = a->length - 1, blen = b->length - 1;
2205 if (alen == 0)
2206 diff = - NONZERO (blen);
2207 else if (blen == 0)
2208 diff = 1;
2209 else if (hard_LC_COLLATE)
2210 diff = xmemcoll (a->text, alen, b->text, blen);
2211 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2212 diff = alen < blen ? -1 : alen != blen;
2214 return reverse ? -diff : diff;
2217 /* Check that the lines read from FILE_NAME come in order. Return
2218 true if they are in order. If CHECKONLY == 'c', also print a
2219 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2220 they are not in order. */
2222 static bool
2223 check (char const *file_name, char checkonly)
2225 FILE *fp = xfopen (file_name, "r");
2226 struct buffer buf; /* Input buffer. */
2227 struct line temp; /* Copy of previous line. */
2228 size_t alloc = 0;
2229 uintmax_t line_number = 0;
2230 struct keyfield const *key = keylist;
2231 bool nonunique = ! unique;
2232 bool ordered = true;
2234 initbuf (&buf, sizeof (struct line),
2235 MAX (merge_buffer_size, sort_size));
2236 temp.text = NULL;
2238 while (fillbuf (&buf, fp, file_name))
2240 struct line const *line = buffer_linelim (&buf);
2241 struct line const *linebase = line - buf.nlines;
2243 /* Make sure the line saved from the old buffer contents is
2244 less than or equal to the first line of the new buffer. */
2245 if (alloc && nonunique <= compare (&temp, line - 1))
2247 found_disorder:
2249 if (checkonly == 'c')
2251 struct line const *disorder_line = line - 1;
2252 uintmax_t disorder_line_number =
2253 buffer_linelim (&buf) - disorder_line + line_number;
2254 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
2255 fprintf (stderr, _("%s: %s:%s: disorder: "),
2256 program_name, file_name,
2257 umaxtostr (disorder_line_number, hr_buf));
2258 write_bytes (disorder_line->text, disorder_line->length,
2259 stderr, _("standard error"));
2262 ordered = false;
2263 break;
2267 /* Compare each line in the buffer with its successor. */
2268 while (linebase < --line)
2269 if (nonunique <= compare (line, line - 1))
2270 goto found_disorder;
2272 line_number += buf.nlines;
2274 /* Save the last line of the buffer. */
2275 if (alloc < line->length)
2279 alloc *= 2;
2280 if (! alloc)
2282 alloc = line->length;
2283 break;
2286 while (alloc < line->length);
2288 temp.text = xrealloc (temp.text, alloc);
2290 memcpy (temp.text, line->text, line->length);
2291 temp.length = line->length;
2292 if (key)
2294 temp.keybeg = temp.text + (line->keybeg - line->text);
2295 temp.keylim = temp.text + (line->keylim - line->text);
2299 xfclose (fp, file_name);
2300 free (buf.buf);
2301 free (temp.text);
2302 return ordered;
2305 /* Open FILES (there are NFILES of them) and store the resulting array
2306 of stream pointers into (*PFPS). Allocate the array. Return the
2307 number of successfully opened files, setting errno if this value is
2308 less than NFILES. */
2310 static size_t
2311 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2313 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2314 int i;
2316 /* Open as many input files as we can. */
2317 for (i = 0; i < nfiles; i++)
2319 fps[i] = (files[i].pid
2320 ? open_temp (files[i].name, files[i].pid)
2321 : stream_open (files[i].name, "r"));
2322 if (!fps[i])
2323 break;
2326 return i;
2329 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2330 files (all of which are at the start of the FILES array), and
2331 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2332 FPS is the vector of open stream corresponding to the files.
2333 Close input and output streams before returning.
2334 OUTPUT_FILE gives the name of the output file. If it is NULL,
2335 the output file is standard output. */
2337 static void
2338 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2339 FILE *ofp, char const *output_file, FILE **fps)
2341 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2342 /* Input buffers for each file. */
2343 struct line saved; /* Saved line storage for unique check. */
2344 struct line const *savedline = NULL;
2345 /* &saved if there is a saved line. */
2346 size_t savealloc = 0; /* Size allocated for the saved line. */
2347 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2348 /* Current line in each line table. */
2349 struct line const **base = xnmalloc (nfiles, sizeof *base);
2350 /* Base of each line table. */
2351 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2352 /* Table representing a permutation of fps,
2353 such that cur[ord[0]] is the smallest line
2354 and will be next output. */
2355 size_t i;
2356 size_t j;
2357 size_t t;
2358 struct keyfield const *key = keylist;
2359 saved.text = NULL;
2361 /* Read initial lines from each input file. */
2362 for (i = 0; i < nfiles; )
2364 initbuf (&buffer[i], sizeof (struct line),
2365 MAX (merge_buffer_size, sort_size / nfiles));
2366 if (fillbuf (&buffer[i], fps[i], files[i].name))
2368 struct line const *linelim = buffer_linelim (&buffer[i]);
2369 cur[i] = linelim - 1;
2370 base[i] = linelim - buffer[i].nlines;
2371 i++;
2373 else
2375 /* fps[i] is empty; eliminate it from future consideration. */
2376 xfclose (fps[i], files[i].name);
2377 if (i < ntemps)
2379 ntemps--;
2380 zaptemp (files[i].name);
2382 free (buffer[i].buf);
2383 --nfiles;
2384 for (j = i; j < nfiles; ++j)
2386 files[j] = files[j + 1];
2387 fps[j] = fps[j + 1];
2392 /* Set up the ord table according to comparisons among input lines.
2393 Since this only reorders two items if one is strictly greater than
2394 the other, it is stable. */
2395 for (i = 0; i < nfiles; ++i)
2396 ord[i] = i;
2397 for (i = 1; i < nfiles; ++i)
2398 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2399 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2401 /* Repeatedly output the smallest line until no input remains. */
2402 while (nfiles)
2404 struct line const *smallest = cur[ord[0]];
2406 /* If uniquified output is turned on, output only the first of
2407 an identical series of lines. */
2408 if (unique)
2410 if (savedline && compare (savedline, smallest))
2412 savedline = NULL;
2413 write_bytes (saved.text, saved.length, ofp, output_file);
2415 if (!savedline)
2417 savedline = &saved;
2418 if (savealloc < smallest->length)
2421 if (! savealloc)
2423 savealloc = smallest->length;
2424 break;
2426 while ((savealloc *= 2) < smallest->length);
2428 saved.text = xrealloc (saved.text, savealloc);
2430 saved.length = smallest->length;
2431 memcpy (saved.text, smallest->text, saved.length);
2432 if (key)
2434 saved.keybeg =
2435 saved.text + (smallest->keybeg - smallest->text);
2436 saved.keylim =
2437 saved.text + (smallest->keylim - smallest->text);
2441 else
2442 write_bytes (smallest->text, smallest->length, ofp, output_file);
2444 /* Check if we need to read more lines into core. */
2445 if (base[ord[0]] < smallest)
2446 cur[ord[0]] = smallest - 1;
2447 else
2449 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2451 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2452 cur[ord[0]] = linelim - 1;
2453 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2455 else
2457 /* We reached EOF on fps[ord[0]]. */
2458 for (i = 1; i < nfiles; ++i)
2459 if (ord[i] > ord[0])
2460 --ord[i];
2461 --nfiles;
2462 xfclose (fps[ord[0]], files[ord[0]].name);
2463 if (ord[0] < ntemps)
2465 ntemps--;
2466 zaptemp (files[ord[0]].name);
2468 free (buffer[ord[0]].buf);
2469 for (i = ord[0]; i < nfiles; ++i)
2471 fps[i] = fps[i + 1];
2472 files[i] = files[i + 1];
2473 buffer[i] = buffer[i + 1];
2474 cur[i] = cur[i + 1];
2475 base[i] = base[i + 1];
2477 for (i = 0; i < nfiles; ++i)
2478 ord[i] = ord[i + 1];
2479 continue;
2483 /* The new line just read in may be larger than other lines
2484 already in main memory; push it back in the queue until we
2485 encounter a line larger than it. Optimize for the common
2486 case where the new line is smallest. */
2488 size_t lo = 1;
2489 size_t hi = nfiles;
2490 size_t probe = lo;
2491 size_t ord0 = ord[0];
2492 size_t count_of_smaller_lines;
2494 while (lo < hi)
2496 int cmp = compare (cur[ord0], cur[ord[probe]]);
2497 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2498 hi = probe;
2499 else
2500 lo = probe + 1;
2501 probe = (lo + hi) / 2;
2504 count_of_smaller_lines = lo - 1;
2505 for (j = 0; j < count_of_smaller_lines; j++)
2506 ord[j] = ord[j + 1];
2507 ord[count_of_smaller_lines] = ord0;
2510 /* Free up some resources every once in a while. */
2511 if (MAX_PROCS_BEFORE_REAP < nprocs)
2512 reap_some ();
2515 if (unique && savedline)
2517 write_bytes (saved.text, saved.length, ofp, output_file);
2518 free (saved.text);
2521 xfclose (ofp, output_file);
2522 free(fps);
2523 free(buffer);
2524 free(ord);
2525 free(base);
2526 free(cur);
2529 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2530 files (all of which are at the start of the FILES array), and
2531 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2532 Close input and output files before returning.
2533 OUTPUT_FILE gives the name of the output file.
2535 Return the number of files successfully merged. This number can be
2536 less than NFILES if we ran low on file descriptors, but in this
2537 case it is never less than 2. */
2539 static size_t
2540 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
2541 FILE *ofp, char const *output_file)
2543 FILE **fps;
2544 size_t nopened = open_input_files (files, nfiles, &fps);
2545 if (nopened < nfiles && nopened < 2)
2546 die (_("open failed"), files[nopened].name);
2547 mergefps (files, ntemps, nopened, ofp, output_file, fps);
2548 return nopened;
2551 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2552 and HI (with NHI members). T, LO, and HI point just past their
2553 respective arrays, and the arrays are in reverse order. NLO and
2554 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
2556 static inline void
2557 mergelines (struct line *t,
2558 struct line const *lo, size_t nlo,
2559 struct line const *hi, size_t nhi)
2561 for (;;)
2562 if (compare (lo - 1, hi - 1) <= 0)
2564 *--t = *--lo;
2565 if (! --nlo)
2567 /* HI - NHI equalled T - (NLO + NHI) when this function
2568 began. Therefore HI must equal T now, and there is no
2569 need to copy from HI to T. */
2570 return;
2573 else
2575 *--t = *--hi;
2576 if (! --nhi)
2579 *--t = *--lo;
2580 while (--nlo);
2582 return;
2587 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2588 NLINES must be at least 2.
2589 The input and output arrays are in reverse order, and LINES and
2590 TEMP point just past the end of their respective arrays.
2592 Use a recursive divide-and-conquer algorithm, in the style
2593 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
2594 the optimization suggested by exercise 5.2.4-10; this requires room
2595 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
2596 writes that this memory optimization was originally published by
2597 D. A. Bell, Comp J. 1 (1958), 75. */
2599 static void
2600 sortlines (struct line *lines, size_t nlines, struct line *temp)
2602 if (nlines == 2)
2604 if (0 < compare (&lines[-1], &lines[-2]))
2606 struct line tmp = lines[-1];
2607 lines[-1] = lines[-2];
2608 lines[-2] = tmp;
2611 else
2613 size_t nlo = nlines / 2;
2614 size_t nhi = nlines - nlo;
2615 struct line *lo = lines;
2616 struct line *hi = lines - nlo;
2617 struct line *sorted_lo = temp;
2619 sortlines (hi, nhi, temp);
2620 if (1 < nlo)
2621 sortlines_temp (lo, nlo, sorted_lo);
2622 else
2623 sorted_lo[-1] = lo[-1];
2625 mergelines (lines, sorted_lo, nlo, hi, nhi);
2629 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2630 rather than sorting in place. */
2632 static void
2633 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
2635 if (nlines == 2)
2637 /* Declare `swap' as int, not bool, to work around a bug
2638 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2639 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
2640 int swap = (0 < compare (&lines[-1], &lines[-2]));
2641 temp[-1] = lines[-1 - swap];
2642 temp[-2] = lines[-2 + swap];
2644 else
2646 size_t nlo = nlines / 2;
2647 size_t nhi = nlines - nlo;
2648 struct line *lo = lines;
2649 struct line *hi = lines - nlo;
2650 struct line *sorted_hi = temp - nlo;
2652 sortlines_temp (hi, nhi, sorted_hi);
2653 if (1 < nlo)
2654 sortlines (lo, nlo, temp);
2656 mergelines (temp, lo, nlo, sorted_hi, nhi);
2660 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2661 the same as OUTFILE. If found, merge the found instances (and perhaps
2662 some other files) into a temporary file so that it can in turn be
2663 merged into OUTFILE without destroying OUTFILE before it is completely
2664 read. Return the new value of NFILES, which differs from the old if
2665 some merging occurred.
2667 This test ensures that an otherwise-erroneous use like
2668 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2669 It's not clear that POSIX requires this nicety.
2670 Detect common error cases, but don't try to catch obscure cases like
2671 "cat ... FILE ... | sort -m -o FILE"
2672 where traditional "sort" doesn't copy the input and where
2673 people should know that they're getting into trouble anyway.
2674 Catching these obscure cases would slow down performance in
2675 common cases. */
2677 static size_t
2678 avoid_trashing_input (struct sortfile *files, size_t ntemps,
2679 size_t nfiles, char const *outfile)
2681 size_t i;
2682 bool got_outstat = false;
2683 struct stat outstat;
2685 for (i = ntemps; i < nfiles; i++)
2687 bool is_stdin = STREQ (files[i].name, "-");
2688 bool same;
2689 struct stat instat;
2691 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
2692 same = true;
2693 else
2695 if (! got_outstat)
2697 if ((outfile
2698 ? stat (outfile, &outstat)
2699 : fstat (STDOUT_FILENO, &outstat))
2700 != 0)
2701 break;
2702 got_outstat = true;
2705 same = (((is_stdin
2706 ? fstat (STDIN_FILENO, &instat)
2707 : stat (files[i].name, &instat))
2708 == 0)
2709 && SAME_INODE (instat, outstat));
2712 if (same)
2714 FILE *tftp;
2715 pid_t pid;
2716 char *temp = create_temp (&tftp, &pid);
2717 size_t num_merged = 0;
2720 num_merged += mergefiles (&files[i], 0, nfiles - i, tftp, temp);
2721 files[i].name = temp;
2722 files[i].pid = pid;
2724 if (i + num_merged < nfiles)
2725 memmove(&files[i + 1], &files[i + num_merged],
2726 num_merged * sizeof *files);
2727 ntemps += 1;
2728 nfiles -= num_merged - 1;;
2729 i += num_merged;
2731 while (i < nfiles);
2735 return nfiles;
2738 /* Merge the input FILES. NTEMPS is the number of files at the
2739 start of FILES that are temporary; it is zero at the top level.
2740 NFILES is the total number of files. Put the output in
2741 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
2743 static void
2744 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
2745 char const *output_file)
2747 while (nmerge < nfiles)
2749 /* Number of input files processed so far. */
2750 size_t in;
2752 /* Number of output files generated so far. */
2753 size_t out;
2755 /* nfiles % NMERGE; this counts input files that are left over
2756 after all full-sized merges have been done. */
2757 size_t remainder;
2759 /* Number of easily-available slots at the next loop iteration. */
2760 size_t cheap_slots;
2762 /* Do as many NMERGE-size merges as possible. In the case that
2763 nmerge is bogus, increment by the maximum number of file
2764 descriptors allowed. */
2765 for (out = in = 0; nmerge <= nfiles - in; out++)
2767 FILE *tfp;
2768 pid_t pid;
2769 char *temp = create_temp (&tfp, &pid);
2770 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
2771 nmerge, tfp, temp);
2772 ntemps -= MIN (ntemps, num_merged);
2773 files[out].name = temp;
2774 files[out].pid = pid;
2775 in += num_merged;
2778 remainder = nfiles - in;
2779 cheap_slots = nmerge - out % nmerge;
2781 if (cheap_slots < remainder)
2783 /* So many files remain that they can't all be put into the last
2784 NMERGE-sized output window. Do one more merge. Merge as few
2785 files as possible, to avoid needless I/O. */
2786 size_t nshortmerge = remainder - cheap_slots + 1;
2787 FILE *tfp;
2788 pid_t pid;
2789 char *temp = create_temp (&tfp, &pid);
2790 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
2791 nshortmerge, tfp, temp);
2792 ntemps -= MIN (ntemps, num_merged);
2793 files[out].name = temp;
2794 files[out++].pid = pid;
2795 in += num_merged;
2798 /* Put the remaining input files into the last NMERGE-sized output
2799 window, so they will be merged in the next pass. */
2800 memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2801 ntemps += out;
2802 nfiles -= in - out;
2805 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2807 /* We aren't guaranteed that this final mergefiles will work, therefore we
2808 try to merge into the output, and then merge as much as we can into a
2809 temp file if we can't. Repeat. */
2811 for (;;)
2813 /* Merge directly into the output file if possible. */
2814 FILE **fps;
2815 size_t nopened = open_input_files (files, nfiles, &fps);
2817 if (nopened == nfiles)
2819 FILE *ofp = stream_open (output_file, "w");
2820 if (ofp)
2822 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
2823 break;
2825 if (errno != EMFILE || nopened <= 2)
2826 die (_("open failed"), output_file);
2828 else if (nopened <= 2)
2829 die (_("open failed"), files[nopened].name);
2831 /* We ran out of file descriptors. Close one of the input
2832 files, to gain a file descriptor. Then create a temporary
2833 file with our spare file descriptor. Retry if that failed
2834 (e.g., some other process could open a file between the time
2835 we closed and tried to create). */
2836 FILE *tfp;
2837 pid_t pid;
2838 char *temp;
2841 nopened--;
2842 xfclose (fps[nopened], files[nopened].name);
2843 temp = maybe_create_temp (&tfp, &pid, ! (nopened <= 2));
2845 while (!temp);
2847 /* Merge into the newly allocated temporary. */
2848 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp, fps);
2849 ntemps -= MIN (ntemps, nopened);
2850 files[0].name = temp;
2851 files[0].pid = pid;
2853 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
2854 ntemps++;
2855 nfiles -= nopened - 1;
2859 /* Sort NFILES FILES onto OUTPUT_FILE. */
2861 static void
2862 sort (char * const *files, size_t nfiles, char const *output_file)
2864 struct buffer buf;
2865 size_t ntemps = 0;
2866 bool output_file_created = false;
2868 buf.alloc = 0;
2870 while (nfiles)
2872 char const *temp_output;
2873 char const *file = *files;
2874 FILE *fp = xfopen (file, "r");
2875 FILE *tfp;
2876 size_t bytes_per_line = (2 * sizeof (struct line)
2877 - sizeof (struct line) / 2);
2879 if (! buf.alloc)
2880 initbuf (&buf, bytes_per_line,
2881 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2882 buf.eof = false;
2883 files++;
2884 nfiles--;
2886 while (fillbuf (&buf, fp, file))
2888 struct line *line;
2889 struct line *linebase;
2891 if (buf.eof && nfiles
2892 && (bytes_per_line + 1
2893 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2895 /* End of file, but there is more input and buffer room.
2896 Concatenate the next input file; this is faster in
2897 the usual case. */
2898 buf.left = buf.used;
2899 break;
2902 line = buffer_linelim (&buf);
2903 linebase = line - buf.nlines;
2904 if (1 < buf.nlines)
2905 sortlines (line, buf.nlines, linebase);
2906 if (buf.eof && !nfiles && !ntemps && !buf.left)
2908 xfclose (fp, file);
2909 tfp = xfopen (output_file, "w");
2910 temp_output = output_file;
2911 output_file_created = true;
2913 else
2915 ++ntemps;
2916 temp_output = create_temp (&tfp, NULL);
2921 line--;
2922 write_bytes (line->text, line->length, tfp, temp_output);
2923 if (unique)
2924 while (linebase < line && compare (line, line - 1) == 0)
2925 line--;
2927 while (linebase < line);
2929 xfclose (tfp, temp_output);
2931 /* Free up some resources every once in a while. */
2932 if (MAX_PROCS_BEFORE_REAP < nprocs)
2933 reap_some ();
2935 if (output_file_created)
2936 goto finish;
2938 xfclose (fp, file);
2941 finish:
2942 free (buf.buf);
2944 if (! output_file_created)
2946 size_t i;
2947 struct tempnode *node = temphead;
2948 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2949 for (i = 0; node; i++)
2951 tempfiles[i].name = node->name;
2952 tempfiles[i].pid = node->pid;
2953 node = node->next;
2955 merge (tempfiles, ntemps, ntemps, output_file);
2956 free (tempfiles);
2960 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
2962 static void
2963 insertkey (struct keyfield *key_arg)
2965 struct keyfield **p;
2966 struct keyfield *key = xmemdup (key_arg, sizeof *key);
2968 for (p = &keylist; *p; p = &(*p)->next)
2969 continue;
2970 *p = key;
2971 key->next = NULL;
2974 /* Report a bad field specification SPEC, with extra info MSGID. */
2976 static void badfieldspec (char const *, char const *)
2977 ATTRIBUTE_NORETURN;
2978 static void
2979 badfieldspec (char const *spec, char const *msgid)
2981 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2982 _(msgid), quote (spec));
2983 abort ();
2986 /* Report incompatible options. */
2988 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2989 static void
2990 incompatible_options (char const *opts)
2992 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2993 abort ();
2996 /* Check compatibility of ordering options. */
2998 static void
2999 check_ordering_compatibility (void)
3001 struct keyfield const *key;
3003 for (key = keylist; key; key = key->next)
3004 if ((1 < (key->random + key->numeric + key->general_numeric + key->month
3005 + key->version + !!key->ignore + key->human_numeric))
3006 || (key->random && key->translate))
3008 /* The following is too big, but guaranteed to be "big enough". */
3009 char opts[sizeof short_options];
3010 char *p = opts;
3011 if (key->ignore == nondictionary)
3012 *p++ = 'd';
3013 if (key->translate)
3014 *p++ = 'f';
3015 if (key->general_numeric)
3016 *p++ = 'g';
3017 if (key->human_numeric)
3018 *p++ = 'h';
3019 if (key->ignore == nonprinting)
3020 *p++ = 'i';
3021 if (key->month)
3022 *p++ = 'M';
3023 if (key->numeric)
3024 *p++ = 'n';
3025 if (key->version)
3026 *p++ = 'V';
3027 if (key->random)
3028 *p++ = 'R';
3029 *p = '\0';
3030 incompatible_options (opts);
3034 /* Parse the leading integer in STRING and store the resulting value
3035 (which must fit into size_t) into *VAL. Return the address of the
3036 suffix after the integer. If the value is too large, silently
3037 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3038 failure; otherwise, report MSGID and exit on failure. */
3040 static char const *
3041 parse_field_count (char const *string, size_t *val, char const *msgid)
3043 char *suffix;
3044 uintmax_t n;
3046 switch (xstrtoumax (string, &suffix, 10, &n, ""))
3048 case LONGINT_OK:
3049 case LONGINT_INVALID_SUFFIX_CHAR:
3050 *val = n;
3051 if (*val == n)
3052 break;
3053 /* Fall through. */
3054 case LONGINT_OVERFLOW:
3055 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3056 *val = SIZE_MAX;
3057 break;
3059 case LONGINT_INVALID:
3060 if (msgid)
3061 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3062 _(msgid), quote (string));
3063 return NULL;
3066 return suffix;
3069 /* Handle interrupts and hangups. */
3071 static void
3072 sighandler (int sig)
3074 if (! SA_NOCLDSTOP)
3075 signal (sig, SIG_IGN);
3077 cleanup ();
3079 signal (sig, SIG_DFL);
3080 raise (sig);
3083 /* Set the ordering options for KEY specified in S.
3084 Return the address of the first character in S that
3085 is not a valid ordering option.
3086 BLANKTYPE is the kind of blanks that 'b' should skip. */
3088 static char *
3089 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
3091 while (*s)
3093 switch (*s)
3095 case 'b':
3096 if (blanktype == bl_start || blanktype == bl_both)
3097 key->skipsblanks = true;
3098 if (blanktype == bl_end || blanktype == bl_both)
3099 key->skipeblanks = true;
3100 break;
3101 case 'd':
3102 key->ignore = nondictionary;
3103 break;
3104 case 'f':
3105 key->translate = fold_toupper;
3106 break;
3107 case 'g':
3108 key->general_numeric = true;
3109 break;
3110 case 'h':
3111 key->human_numeric = true;
3112 break;
3113 case 'i':
3114 /* Option order should not matter, so don't let -i override
3115 -d. -d implies -i, but -i does not imply -d. */
3116 if (! key->ignore)
3117 key->ignore = nonprinting;
3118 break;
3119 case 'M':
3120 key->month = true;
3121 break;
3122 case 'n':
3123 key->numeric = true;
3124 break;
3125 case 'R':
3126 key->random = true;
3127 break;
3128 case 'r':
3129 key->reverse = true;
3130 break;
3131 case 'V':
3132 key->version = true;
3133 break;
3134 default:
3135 return (char *) s;
3137 ++s;
3139 return (char *) s;
3142 static struct keyfield *
3143 key_init (struct keyfield *key)
3145 memset (key, 0, sizeof *key);
3146 key->eword = SIZE_MAX;
3147 key->si_present = -1;
3148 return key;
3152 main (int argc, char **argv)
3154 struct keyfield *key;
3155 struct keyfield key_buf;
3156 struct keyfield gkey;
3157 char const *s;
3158 int c = 0;
3159 char checkonly = 0;
3160 bool mergeonly = false;
3161 char *random_source = NULL;
3162 bool need_random = false;
3163 size_t nfiles = 0;
3164 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
3165 bool obsolete_usage = (posix2_version () < 200112);
3166 char **files;
3167 char *files_from = NULL;
3168 struct Tokens tok;
3169 char const *outfile = NULL;
3171 initialize_main (&argc, &argv);
3172 set_program_name (argv[0]);
3173 setlocale (LC_ALL, "");
3174 bindtextdomain (PACKAGE, LOCALEDIR);
3175 textdomain (PACKAGE);
3177 initialize_exit_failure (SORT_FAILURE);
3179 hard_LC_COLLATE = hard_locale (LC_COLLATE);
3180 #if HAVE_NL_LANGINFO
3181 hard_LC_TIME = hard_locale (LC_TIME);
3182 #endif
3184 /* Get locale's representation of the decimal point. */
3186 struct lconv const *locale = localeconv ();
3188 /* If the locale doesn't define a decimal point, or if the decimal
3189 point is multibyte, use the C locale's decimal point. FIXME:
3190 add support for multibyte decimal points. */
3191 decimal_point = to_uchar (locale->decimal_point[0]);
3192 if (! decimal_point || locale->decimal_point[1])
3193 decimal_point = '.';
3195 /* FIXME: add support for multibyte thousands separators. */
3196 thousands_sep = to_uchar (*locale->thousands_sep);
3197 if (! thousands_sep || locale->thousands_sep[1])
3198 thousands_sep = -1;
3201 have_read_stdin = false;
3202 inittables ();
3205 size_t i;
3206 static int const sig[] =
3208 /* The usual suspects. */
3209 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
3210 #ifdef SIGPOLL
3211 SIGPOLL,
3212 #endif
3213 #ifdef SIGPROF
3214 SIGPROF,
3215 #endif
3216 #ifdef SIGVTALRM
3217 SIGVTALRM,
3218 #endif
3219 #ifdef SIGXCPU
3220 SIGXCPU,
3221 #endif
3222 #ifdef SIGXFSZ
3223 SIGXFSZ,
3224 #endif
3226 enum { nsigs = ARRAY_CARDINALITY (sig) };
3228 #if SA_NOCLDSTOP
3229 struct sigaction act;
3231 sigemptyset (&caught_signals);
3232 for (i = 0; i < nsigs; i++)
3234 sigaction (sig[i], NULL, &act);
3235 if (act.sa_handler != SIG_IGN)
3236 sigaddset (&caught_signals, sig[i]);
3239 act.sa_handler = sighandler;
3240 act.sa_mask = caught_signals;
3241 act.sa_flags = 0;
3243 for (i = 0; i < nsigs; i++)
3244 if (sigismember (&caught_signals, sig[i]))
3245 sigaction (sig[i], &act, NULL);
3246 #else
3247 for (i = 0; i < nsigs; i++)
3248 if (signal (sig[i], SIG_IGN) != SIG_IGN)
3250 signal (sig[i], sighandler);
3251 siginterrupt (sig[i], 1);
3253 #endif
3256 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
3257 atexit (exit_cleanup);
3259 gkey.sword = gkey.eword = SIZE_MAX;
3260 gkey.ignore = NULL;
3261 gkey.translate = NULL;
3262 gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
3263 gkey.si_present = -1;
3264 gkey.random = gkey.version = false;
3265 gkey.month = gkey.reverse = false;
3266 gkey.skipsblanks = gkey.skipeblanks = false;
3268 files = xnmalloc (argc, sizeof *files);
3270 for (;;)
3272 /* Parse an operand as a file after "--" was seen; or if
3273 pedantic and a file was seen, unless the POSIX version
3274 predates 1003.1-2001 and -c was not seen and the operand is
3275 "-o FILE" or "-oFILE". */
3276 int oi = -1;
3278 if (c == -1
3279 || (posixly_correct && nfiles != 0
3280 && ! (obsolete_usage
3281 && ! checkonly
3282 && optind != argc
3283 && argv[optind][0] == '-' && argv[optind][1] == 'o'
3284 && (argv[optind][2] || optind + 1 != argc)))
3285 || ((c = getopt_long (argc, argv, short_options,
3286 long_options, &oi))
3287 == -1))
3289 if (argc <= optind)
3290 break;
3291 files[nfiles++] = argv[optind++];
3293 else switch (c)
3295 case 1:
3296 key = NULL;
3297 if (optarg[0] == '+')
3299 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
3300 && ISDIGIT (argv[optind][1]));
3301 obsolete_usage |= minus_pos_usage & ~posixly_correct;
3302 if (obsolete_usage)
3304 /* Treat +POS1 [-POS2] as a key if possible; but silently
3305 treat an operand as a file if it is not a valid +POS1. */
3306 key = key_init (&key_buf);
3307 s = parse_field_count (optarg + 1, &key->sword, NULL);
3308 if (s && *s == '.')
3309 s = parse_field_count (s + 1, &key->schar, NULL);
3310 if (! (key->sword | key->schar))
3311 key->sword = SIZE_MAX;
3312 if (! s || *set_ordering (s, key, bl_start))
3313 key = NULL;
3314 else
3316 if (minus_pos_usage)
3318 char const *optarg1 = argv[optind++];
3319 s = parse_field_count (optarg1 + 1, &key->eword,
3320 N_("invalid number after `-'"));
3321 if (*s == '.')
3322 s = parse_field_count (s + 1, &key->echar,
3323 N_("invalid number after `.'"));
3324 if (*set_ordering (s, key, bl_end))
3325 badfieldspec (optarg1,
3326 N_("stray character in field spec"));
3328 insertkey (key);
3332 if (! key)
3333 files[nfiles++] = optarg;
3334 break;
3336 case SORT_OPTION:
3337 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
3338 /* Fall through. */
3339 case 'b':
3340 case 'd':
3341 case 'f':
3342 case 'g':
3343 case 'h':
3344 case 'i':
3345 case 'M':
3346 case 'n':
3347 case 'r':
3348 case 'R':
3349 case 'V':
3351 char str[2];
3352 str[0] = c;
3353 str[1] = '\0';
3354 set_ordering (str, &gkey, bl_both);
3356 break;
3358 case CHECK_OPTION:
3359 c = (optarg
3360 ? XARGMATCH ("--check", optarg, check_args, check_types)
3361 : 'c');
3362 /* Fall through. */
3363 case 'c':
3364 case 'C':
3365 if (checkonly && checkonly != c)
3366 incompatible_options ("cC");
3367 checkonly = c;
3368 break;
3370 case COMPRESS_PROGRAM_OPTION:
3371 if (compress_program && !STREQ (compress_program, optarg))
3372 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
3373 compress_program = optarg;
3374 break;
3376 case FILES0_FROM_OPTION:
3377 files_from = optarg;
3378 break;
3380 case 'k':
3381 key = key_init (&key_buf);
3383 /* Get POS1. */
3384 s = parse_field_count (optarg, &key->sword,
3385 N_("invalid number at field start"));
3386 if (! key->sword--)
3388 /* Provoke with `sort -k0' */
3389 badfieldspec (optarg, N_("field number is zero"));
3391 if (*s == '.')
3393 s = parse_field_count (s + 1, &key->schar,
3394 N_("invalid number after `.'"));
3395 if (! key->schar--)
3397 /* Provoke with `sort -k1.0' */
3398 badfieldspec (optarg, N_("character offset is zero"));
3401 if (! (key->sword | key->schar))
3402 key->sword = SIZE_MAX;
3403 s = set_ordering (s, key, bl_start);
3404 if (*s != ',')
3406 key->eword = SIZE_MAX;
3407 key->echar = 0;
3409 else
3411 /* Get POS2. */
3412 s = parse_field_count (s + 1, &key->eword,
3413 N_("invalid number after `,'"));
3414 if (! key->eword--)
3416 /* Provoke with `sort -k1,0' */
3417 badfieldspec (optarg, N_("field number is zero"));
3419 if (*s == '.')
3421 s = parse_field_count (s + 1, &key->echar,
3422 N_("invalid number after `.'"));
3424 s = set_ordering (s, key, bl_end);
3426 if (*s)
3427 badfieldspec (optarg, N_("stray character in field spec"));
3428 insertkey (key);
3429 break;
3431 case 'm':
3432 mergeonly = true;
3433 break;
3435 case NMERGE_OPTION:
3436 specify_nmerge (oi, c, optarg);
3437 break;
3439 case 'o':
3440 if (outfile && !STREQ (outfile, optarg))
3441 error (SORT_FAILURE, 0, _("multiple output files specified"));
3442 outfile = optarg;
3443 break;
3445 case RANDOM_SOURCE_OPTION:
3446 if (random_source && !STREQ (random_source, optarg))
3447 error (SORT_FAILURE, 0, _("multiple random sources specified"));
3448 random_source = optarg;
3449 break;
3451 case 's':
3452 stable = true;
3453 break;
3455 case 'S':
3456 specify_sort_size (oi, c, optarg);
3457 break;
3459 case 't':
3461 char newtab = optarg[0];
3462 if (! newtab)
3463 error (SORT_FAILURE, 0, _("empty tab"));
3464 if (optarg[1])
3466 if (STREQ (optarg, "\\0"))
3467 newtab = '\0';
3468 else
3470 /* Provoke with `sort -txx'. Complain about
3471 "multi-character tab" instead of "multibyte tab", so
3472 that the diagnostic's wording does not need to be
3473 changed once multibyte characters are supported. */
3474 error (SORT_FAILURE, 0, _("multi-character tab %s"),
3475 quote (optarg));
3478 if (tab != TAB_DEFAULT && tab != newtab)
3479 error (SORT_FAILURE, 0, _("incompatible tabs"));
3480 tab = newtab;
3482 break;
3484 case 'T':
3485 add_temp_dir (optarg);
3486 break;
3488 case 'u':
3489 unique = true;
3490 break;
3492 case 'y':
3493 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3494 through Solaris 7. It is also accepted by many non-Solaris
3495 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3496 -y is marked as obsolete starting with Solaris 8 (1999), but is
3497 still accepted as of Solaris 10 prerelease (2004).
3499 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3500 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3501 and which in general ignores the argument after "-y" if it
3502 consists entirely of digits (it can even be empty). */
3503 if (optarg == argv[optind - 1])
3505 char const *p;
3506 for (p = optarg; ISDIGIT (*p); p++)
3507 continue;
3508 optind -= (*p != '\0');
3510 break;
3512 case 'z':
3513 eolchar = 0;
3514 break;
3516 case_GETOPT_HELP_CHAR;
3518 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3520 default:
3521 usage (SORT_FAILURE);
3525 if (files_from)
3527 FILE *stream;
3529 /* When using --files0-from=F, you may not specify any files
3530 on the command-line. */
3531 if (nfiles)
3533 error (0, 0, _("extra operand %s"), quote (files[0]));
3534 fprintf (stderr, "%s\n",
3535 _("file operands cannot be combined with --files0-from"));
3536 usage (SORT_FAILURE);
3539 if (STREQ (files_from, "-"))
3540 stream = stdin;
3541 else
3543 stream = fopen (files_from, "r");
3544 if (stream == NULL)
3545 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
3546 quote (files_from));
3549 readtokens0_init (&tok);
3551 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
3552 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
3553 quote (files_from));
3555 if (tok.n_tok)
3557 size_t i;
3558 free (files);
3559 files = tok.tok;
3560 nfiles = tok.n_tok;
3561 for (i = 0; i < nfiles; i++)
3563 if (STREQ (files[i], "-"))
3564 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
3565 "no file name of %s allowed"),
3566 quote (files[i]));
3567 else if (files[i][0] == '\0')
3569 /* Using the standard `filename:line-number:' prefix here is
3570 not totally appropriate, since NUL is the separator, not NL,
3571 but it might be better than nothing. */
3572 unsigned long int file_number = i + 1;
3573 error (SORT_FAILURE, 0,
3574 _("%s:%lu: invalid zero-length file name"),
3575 quotearg_colon (files_from), file_number);
3579 else
3580 error (SORT_FAILURE, 0, _("no input from %s"),
3581 quote (files_from));
3584 /* Inheritance of global options to individual keys. */
3585 for (key = keylist; key; key = key->next)
3587 if (! (key->ignore
3588 || key->translate
3589 || (key->skipsblanks
3590 | key->reverse
3591 | key->skipeblanks
3592 | key->month
3593 | key->numeric
3594 | key->version
3595 | key->general_numeric
3596 | key->human_numeric
3597 | key->random)))
3599 key->ignore = gkey.ignore;
3600 key->translate = gkey.translate;
3601 key->skipsblanks = gkey.skipsblanks;
3602 key->skipeblanks = gkey.skipeblanks;
3603 key->month = gkey.month;
3604 key->numeric = gkey.numeric;
3605 key->general_numeric = gkey.general_numeric;
3606 key->human_numeric = gkey.human_numeric;
3607 key->random = gkey.random;
3608 key->reverse = gkey.reverse;
3609 key->version = gkey.version;
3612 need_random |= key->random;
3615 if (!keylist && (gkey.ignore
3616 || gkey.translate
3617 || (gkey.skipsblanks
3618 | gkey.skipeblanks
3619 | gkey.month
3620 | gkey.numeric
3621 | gkey.general_numeric
3622 | gkey.human_numeric
3623 | gkey.random
3624 | gkey.version)))
3626 insertkey (&gkey);
3627 need_random |= gkey.random;
3630 check_ordering_compatibility ();
3632 reverse = gkey.reverse;
3634 if (need_random)
3636 randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
3637 if (! randread_source)
3638 die (_("open failed"), random_source);
3641 if (temp_dir_count == 0)
3643 char const *tmp_dir = getenv ("TMPDIR");
3644 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
3647 if (nfiles == 0)
3649 static char *minus = (char *) "-";
3650 nfiles = 1;
3651 free (files);
3652 files = &minus;
3655 /* Need to re-check that we meet the minimum requirement for memory
3656 usage with the final value for NMERGE. */
3657 if (0 < sort_size)
3658 sort_size = MAX (sort_size, MIN_SORT_SIZE);
3660 if (checkonly)
3662 if (nfiles > 1)
3663 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
3664 quote (files[1]), checkonly);
3666 if (outfile)
3668 static char opts[] = {0, 'o', 0};
3669 opts[0] = checkonly;
3670 incompatible_options (opts);
3673 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3674 input is not properly sorted. */
3675 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
3678 if (mergeonly)
3680 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
3681 size_t i;
3683 for (i = 0; i < nfiles; ++i)
3684 sortfiles[i].name = files[i];
3686 merge (sortfiles, 0, nfiles, outfile);
3687 IF_LINT (free (sortfiles));
3689 else
3690 sort (files, nfiles, outfile);
3692 if (have_read_stdin && fclose (stdin) == EOF)
3693 die (_("close failed"), "-");
3695 exit (EXIT_SUCCESS);