src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17    Written December 1988 by Mike Haertel.
  18    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  19    or (US mail) as Mike Haertel c/o Free Software Foundation.
  20
  21    Ørn E. Hansen added NLS support in 1997.  */
  22
  23 #include <config.h>
  24
  25 #include <getopt.h>
  26 #include <sys/types.h>
  27 #include <sys/wait.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "argmatch.h"
  31 #include "error.h"
  32 #include "filevercmp.h"
  33 #include "hard-locale.h"
  34 #include "hash.h"
  35 #include "ignore-value.h"
  36 #include "md5.h"
  37 #include "physmem.h"
  38 #include "posixver.h"
  39 #include "quote.h"
  40 #include "quotearg.h"
  41 #include "randread.h"
  42 #include "readtokens0.h"
  43 #include "stdio--.h"
  44 #include "stdlib--.h"
  45 #include "strnumcmp.h"
  46 #include "xmemcoll.h"
  47 #include "xmemxfrm.h"
  48 #include "xnanosleep.h"
  49 #include "xstrtol.h"
  50
  51 #if HAVE_SYS_RESOURCE_H
  52 # include <sys/resource.h>
  53 #endif
  54 #ifndef RLIMIT_DATA
  55 struct rlimit { size_t rlim_cur; };
  56 # define getrlimit(Resource, Rlp) (-1)
  57 #endif
  58
  59 /* The official name of this program (e.g., no `g' prefix).  */
  60 #define PROGRAM_NAME "sort"
  61
  62 #define AUTHORS \
  63   proper_name ("Mike Haertel"), \
  64   proper_name ("Paul Eggert")
  65
  66 #if HAVE_LANGINFO_CODESET
  67 # include <langinfo.h>
  68 #endif
  69
  70 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  71    present.  */
  72 #ifndef SA_NOCLDSTOP
  73 # define SA_NOCLDSTOP 0
  74 /* No sigprocmask.  Always 'return' zero. */
  75 # define sigprocmask(How, Set, Oset) (0)
  76 # define sigset_t int
  77 # if ! HAVE_SIGINTERRUPT
  78 #  define siginterrupt(sig, flag) /* empty */
  79 # endif
  80 #endif
  81
  82 #if !defined OPEN_MAX && defined NR_OPEN
  83 # define OPEN_MAX NR_OPEN
  84 #endif
  85 #if !defined OPEN_MAX
  86 # define OPEN_MAX 20
  87 #endif
  88
  89 #define UCHAR_LIM (UCHAR_MAX + 1)
  90
  91 #ifndef DEFAULT_TMPDIR
  92 # define DEFAULT_TMPDIR "/tmp"
  93 #endif
  94
  95 /* Exit statuses.  */
  96 enum
  97   {
  98     /* POSIX says to exit with status 1 if invoked with -c and the
  99        input is not properly sorted.  */
 100     SORT_OUT_OF_ORDER = 1,
 101
 102     /* POSIX says any other irregular exit must exit with a status
 103        code greater than 1.  */
 104     SORT_FAILURE = 2
 105   };
 106
 107 enum
 108   {
 109     /* The number of times we should try to fork a compression process
 110        (we retry if the fork call fails).  We don't _need_ to compress
 111        temp files, this is just to reduce disk access, so this number
 112        can be small.  Each retry doubles in duration.  */
 113     MAX_FORK_TRIES_COMPRESS = 4,
 114
 115     /* The number of times we should try to fork a decompression process.
 116        If we can't fork a decompression process, we can't sort, so this
 117        number should be big.  Each retry doubles in duration.  */
 118     MAX_FORK_TRIES_DECOMPRESS = 9
 119   };
 120
 121 /* The representation of the decimal point in the current locale.  */
 122 static int decimal_point;
 123
 124 /* Thousands separator; if -1, then there isn't one.  */
 125 static int thousands_sep;
 126
 127 /* Nonzero if the corresponding locales are hard.  */
 128 static bool hard_LC_COLLATE;
 129 #if HAVE_NL_LANGINFO
 130 static bool hard_LC_TIME;
 131 #endif
 132
 133 #define NONZERO(x) ((x) != 0)
 134
 135 /* The kind of blanks for '-b' to skip in various options. */
 136 enum blanktype { bl_start, bl_end, bl_both };
 137
 138 /* The character marking end of line. Default to \n. */
 139 static char eolchar = '\n';
 140
 141 /* Lines are held in core as counted strings. */
 142 struct line
 143 {
 144   char *text;                   /* Text of the line. */
 145   size_t length;                /* Length including final newline. */
 146   char *keybeg;                 /* Start of first key. */
 147   char *keylim;                 /* Limit of first key. */
 148 };
 149
 150 /* Input buffers. */
 151 struct buffer
 152 {
 153   char *buf;                    /* Dynamically allocated buffer,
 154                                    partitioned into 3 regions:
 155                                    - input data;
 156                                    - unused area;
 157                                    - an array of lines, in reverse order.  */
 158   size_t used;                  /* Number of bytes used for input data.  */
 159   size_t nlines;                /* Number of lines in the line array.  */
 160   size_t alloc;                 /* Number of bytes allocated. */
 161   size_t left;                  /* Number of bytes left from previous reads. */
 162   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 163   bool eof;                     /* An EOF has been read.  */
 164 };
 165
 166 struct keyfield
 167 {
 168   size_t sword;                 /* Zero-origin 'word' to start at. */
 169   size_t schar;                 /* Additional characters to skip. */
 170   size_t eword;                 /* Zero-origin first word after field. */
 171   size_t echar;                 /* Additional characters in field. */
 172   bool const *ignore;           /* Boolean array of characters to ignore. */
 173   char const *translate;        /* Translation applied to characters. */
 174   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 175   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 176   bool numeric;                 /* Flag for numeric comparison.  Handle
 177                                    strings of digits with optional decimal
 178                                    point, but no exponential notation. */
 179   bool random;                  /* Sort by random hash of key.  */
 180   bool general_numeric;         /* Flag for general, numeric comparison.
 181                                    Handle numbers in exponential notation. */
 182   bool human_numeric;           /* Flag for sorting by human readable
 183                                    units with either SI xor IEC prefixes. */
 184   int iec_present;              /* Flag for checking for mixed SI and IEC. */
 185   bool month;                   /* Flag for comparison by month name. */
 186   bool reverse;                 /* Reverse the sense of comparison. */
 187   bool version;                 /* sort by version number */
 188   struct keyfield *next;        /* Next keyfield to try. */
 189 };
 190
 191 struct month
 192 {
 193   char const *name;
 194   int val;
 195 };
 196
 197 /* FIXME: None of these tables work with multibyte character sets.
 198    Also, there are many other bugs when handling multibyte characters.
 199    One way to fix this is to rewrite `sort' to use wide characters
 200    internally, but doing this with good performance is a bit
 201    tricky.  */
 202
 203 /* Table of blanks.  */
 204 static bool blanks[UCHAR_LIM];
 205
 206 /* Table of non-printing characters. */
 207 static bool nonprinting[UCHAR_LIM];
 208
 209 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 210 static bool nondictionary[UCHAR_LIM];
 211
 212 /* Translation table folding lower case to upper.  */
 213 static unsigned char fold_toupper[UCHAR_LIM];
 214
 215 #define MONTHS_PER_YEAR 12
 216
 217 /* Table mapping month names to integers.
 218    Alphabetic order allows binary search. */
 219 static struct month monthtab[] =
 220 {
 221   {"APR", 4},
 222   {"AUG", 8},
 223   {"DEC", 12},
 224   {"FEB", 2},
 225   {"JAN", 1},
 226   {"JUL", 7},
 227   {"JUN", 6},
 228   {"MAR", 3},
 229   {"MAY", 5},
 230   {"NOV", 11},
 231   {"OCT", 10},
 232   {"SEP", 9}
 233 };
 234
 235 /* During the merge phase, the number of files to merge at once. */
 236 #define NMERGE_DEFAULT 16
 237
 238 /* Minimum size for a merge or check buffer.  */
 239 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 240
 241 /* Minimum sort size; the code might not work with smaller sizes.  */
 242 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
 243
 244 /* The number of bytes needed for a merge or check buffer, which can
 245    function relatively efficiently even if it holds only one line.  If
 246    a longer line is seen, this value is increased.  */
 247 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 248
 249 /* The approximate maximum number of bytes of main memory to use, as
 250    specified by the user.  Zero if the user has not specified a size.  */
 251 static size_t sort_size;
 252
 253 /* The guessed size for non-regular files.  */
 254 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 255
 256 /* Array of directory names in which any temporary files are to be created. */
 257 static char const **temp_dirs;
 258
 259 /* Number of temporary directory names used.  */
 260 static size_t temp_dir_count;
 261
 262 /* Number of allocated slots in temp_dirs.  */
 263 static size_t temp_dir_alloc;
 264
 265 /* Flag to reverse the order of all comparisons. */
 266 static bool reverse;
 267
 268 /* Flag for stable sort.  This turns off the last ditch bytewise
 269    comparison of lines, and instead leaves lines in the same order
 270    they were read if all keys compare equal.  */
 271 static bool stable;
 272
 273 /* If TAB has this value, blanks separate fields.  */
 274 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 275
 276 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 277    separated by the empty string between a non-blank character and a blank
 278    character. */
 279 static int tab = TAB_DEFAULT;
 280
 281 /* Flag to remove consecutive duplicate lines from the output.
 282    Only the last of a sequence of equal lines will be output. */
 283 static bool unique;
 284
 285 /* Nonzero if any of the input files are the standard input. */
 286 static bool have_read_stdin;
 287
 288 /* List of key field comparisons to be tried.  */
 289 static struct keyfield *keylist;
 290
 291 /* Program used to (de)compress temp files.  Must accept -d.  */
 292 static char const *compress_program;
 293
 294 /* Maximum number of files to merge in one go.  If more than this
 295    number are present, temp files will be used. */
 296 static unsigned int nmerge = NMERGE_DEFAULT;
 297
 298 static void sortlines_temp (struct line *, size_t, struct line *);
 299
 300 /* Report MESSAGE for FILE, then clean up and exit.
 301    If FILE is null, it represents standard output.  */
 302
 303 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 304 static void
 305 die (char const *message, char const *file)
 306 {
 307   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 308   exit (SORT_FAILURE);
 309 }
 310
 311 void
 312 usage (int status)
 313 {
 314   if (status != EXIT_SUCCESS)
 315     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 316              program_name);
 317   else
 318     {
 319       printf (_("\
 320 Usage: %s [OPTION]... [FILE]...\n\
 321   or:  %s [OPTION]... --files0-from=F\n\
 322 "),
 323               program_name, program_name);
 324       fputs (_("\
 325 Write sorted concatenation of all FILE(s) to standard output.\n\
 326 \n\
 327 "), stdout);
 328       fputs (_("\
 329 Mandatory arguments to long options are mandatory for short options too.\n\
 330 "), stdout);
 331       fputs (_("\
 332 Ordering options:\n\
 333 \n\
 334 "), stdout);
 335       fputs (_("\
 336   -b, --ignore-leading-blanks  ignore leading blanks\n\
 337   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 338   -f, --ignore-case           fold lower case to upper case characters\n\
 339 "), stdout);
 340       fputs (_("\
 341   -g, --general-numeric-sort  compare according to general numerical value\n\
 342   -i, --ignore-nonprinting    consider only printable characters\n\
 343   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 344 "), stdout);
 345       fputs (_("\
 346   -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)\n\
 347 "), stdout);
 348       fputs (_("\
 349   -n, --numeric-sort          compare according to string numerical value\n\
 350   -R, --random-sort           sort by random hash of keys\n\
 351       --random-source=FILE    get random bytes from FILE\n\
 352   -r, --reverse               reverse the result of comparisons\n\
 353 "), stdout);
 354       fputs (_("\
 355       --sort=WORD             sort according to WORD:\n\
 356                                 general-numeric -g, human-numeric -h, month -M,\n\
 357                                 numeric -n, random -R, version -V\n\
 358   -V, --version-sort          natural sort of (version) numbers within text\n\
 359 \n\
 360 "), stdout);
 361       fputs (_("\
 362 Other options:\n\
 363 \n\
 364 "), stdout);
 365       fputs (_("\
 366       --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
 367                             for more use temp files\n\
 368 "), stdout);
 369       fputs (_("\
 370   -c, --check, --check=diagnose-first  check for sorted input; do not sort\n\
 371   -C, --check=quiet, --check=silent  like -c, but do not report first bad line\n\
 372       --compress-program=PROG  compress temporaries with PROG;\n\
 373                               decompress them with PROG -d\n\
 374       --files0-from=F       read input from the files specified by\n\
 375                             NUL-terminated names in file F;\n\
 376                             If F is - then read names from standard input\n\
 377 "), stdout);
 378       fputs (_("\
 379   -k, --key=POS1[,POS2]     start a key at POS1 (origin 1), end it at POS2\n\
 380                             (default end of line)\n\
 381   -m, --merge               merge already sorted files; do not sort\n\
 382 "), stdout);
 383       fputs (_("\
 384   -o, --output=FILE         write result to FILE instead of standard output\n\
 385   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 386   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 387 "), stdout);
 388       printf (_("\
 389   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 390   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 391                               multiple options specify multiple directories\n\
 392   -u, --unique              with -c, check for strict ordering;\n\
 393                               without -c, output only the first of an equal run\n\
 394 "), DEFAULT_TMPDIR);
 395       fputs (_("\
 396   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 397 "), stdout);
 398       fputs (HELP_OPTION_DESCRIPTION, stdout);
 399       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 400       fputs (_("\
 401 \n\
 402 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 403 in the field; both are origin 1.  If neither -t nor -b is in effect, characters\n\
 404 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 405 one or more single-letter ordering options, which override global ordering\n\
 406 options for that key.  If no key is given, use the entire line as the key.\n\
 407 \n\
 408 SIZE may be followed by the following multiplicative suffixes:\n\
 409 "), stdout);
 410       fputs (_("\
 411 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 412 \n\
 413 With no FILE, or when FILE is -, read standard input.\n\
 414 \n\
 415 *** WARNING ***\n\
 416 The locale specified by the environment affects sort order.\n\
 417 Set LC_ALL=C to get the traditional sort order that uses\n\
 418 native byte values.\n\
 419 "), stdout );
 420       emit_ancillary_info ();
 421     }
 422
 423   exit (status);
 424 }
 425
 426 /* For long options that have no equivalent short option, use a
 427    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 428 enum
 429 {
 430   CHECK_OPTION = CHAR_MAX + 1,
 431   COMPRESS_PROGRAM_OPTION,
 432   FILES0_FROM_OPTION,
 433   NMERGE_OPTION,
 434   RANDOM_SOURCE_OPTION,
 435   SORT_OPTION
 436 };
 437
 438 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
 439
 440 static struct option const long_options[] =
 441 {
 442   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 443   {"check", optional_argument, NULL, CHECK_OPTION},
 444   {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
 445   {"dictionary-order", no_argument, NULL, 'd'},
 446   {"ignore-case", no_argument, NULL, 'f'},
 447   {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
 448   {"general-numeric-sort", no_argument, NULL, 'g'},
 449   {"ignore-nonprinting", no_argument, NULL, 'i'},
 450   {"key", required_argument, NULL, 'k'},
 451   {"merge", no_argument, NULL, 'm'},
 452   {"month-sort", no_argument, NULL, 'M'},
 453   {"numeric-sort", no_argument, NULL, 'n'},
 454   {"human-numeric-sort", no_argument, NULL, 'h'},
 455   {"version-sort", no_argument, NULL, 'V'},
 456   {"random-sort", no_argument, NULL, 'R'},
 457   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
 458   {"sort", required_argument, NULL, SORT_OPTION},
 459   {"output", required_argument, NULL, 'o'},
 460   {"reverse", no_argument, NULL, 'r'},
 461   {"stable", no_argument, NULL, 's'},
 462   {"batch-size", required_argument, NULL, NMERGE_OPTION},
 463   {"buffer-size", required_argument, NULL, 'S'},
 464   {"field-separator", required_argument, NULL, 't'},
 465   {"temporary-directory", required_argument, NULL, 'T'},
 466   {"unique", no_argument, NULL, 'u'},
 467   {"zero-terminated", no_argument, NULL, 'z'},
 468   {GETOPT_HELP_OPTION_DECL},
 469   {GETOPT_VERSION_OPTION_DECL},
 470   {NULL, 0, NULL, 0},
 471 };
 472
 473 #define CHECK_TABLE \
 474   _ct_("quiet",          'C') \
 475   _ct_("silent",         'C') \
 476   _ct_("diagnose-first", 'c')
 477
 478 static char const *const check_args[] =
 479 {
 480 #define _ct_(_s, _c) _s,
 481   CHECK_TABLE NULL
 482 #undef  _ct_
 483 };
 484 static char const check_types[] =
 485 {
 486 #define _ct_(_s, _c) _c,
 487   CHECK_TABLE
 488 #undef  _ct_
 489 };
 490
 491 #define SORT_TABLE \
 492   _st_("general-numeric", 'g') \
 493   _st_("human-numeric",   'h') \
 494   _st_("month",           'M') \
 495   _st_("numeric",         'n') \
 496   _st_("random",          'R') \
 497   _st_("version",         'V')
 498
 499 static char const *const sort_args[] =
 500 {
 501 #define _st_(_s, _c) _s,
 502   SORT_TABLE NULL
 503 #undef  _st_
 504 };
 505 static char const sort_types[] =
 506 {
 507 #define _st_(_s, _c) _c,
 508   SORT_TABLE
 509 #undef  _st_
 510 };
 511
 512 /* The set of signals that are caught.  */
 513 static sigset_t caught_signals;
 514
 515 /* Critical section status.  */
 516 struct cs_status
 517 {
 518   bool valid;
 519   sigset_t sigs;
 520 };
 521
 522 /* Enter a critical section.  */
 523 static struct cs_status
 524 cs_enter (void)
 525 {
 526   struct cs_status status;
 527   status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
 528   return status;
 529 }
 530
 531 /* Leave a critical section.  */
 532 static void
 533 cs_leave (struct cs_status status)
 534 {
 535   if (status.valid)
 536     {
 537       /* Ignore failure when restoring the signal mask. */
 538       sigprocmask (SIG_SETMASK, &status.sigs, NULL);
 539     }
 540 }
 541
 542 /* The list of temporary files. */
 543 struct tempnode
 544 {
 545   struct tempnode *volatile next;
 546   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 547   char name[1];  /* Actual size is 1 + file name length.  */
 548 };
 549 static struct tempnode *volatile temphead;
 550 static struct tempnode *volatile *temptail = &temphead;
 551
 552 struct sortfile
 553 {
 554   char const *name;
 555   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 556 };
 557
 558 /* A table where we store compression process states.  We clean up all
 559    processes in a timely manner so as not to exhaust system resources,
 560    so we store the info on whether the process is still running, or has
 561    been reaped here.  */
 562 static Hash_table *proctab;
 563
 564 enum { INIT_PROCTAB_SIZE = 47 };
 565
 566 enum procstate { ALIVE, ZOMBIE };
 567
 568 /* A proctab entry.  The COUNT field is there in case we fork a new
 569    compression process that has the same PID as an old zombie process
 570    that is still in the table (because the process to decompress the
 571    temp file it was associated with hasn't started yet).  */
 572 struct procnode
 573 {
 574   pid_t pid;
 575   enum procstate state;
 576   size_t count;
 577 };
 578
 579 static size_t
 580 proctab_hasher (const void *entry, size_t tabsize)
 581 {
 582   const struct procnode *node = entry;
 583   return node->pid % tabsize;
 584 }
 585
 586 static bool
 587 proctab_comparator (const void *e1, const void *e2)
 588 {
 589   const struct procnode *n1 = e1, *n2 = e2;
 590   return n1->pid == n2->pid;
 591 }
 592
 593 /* The total number of forked processes (compressors and decompressors)
 594    that have not been reaped yet. */
 595 static size_t nprocs;
 596
 597 /* The number of child processes we'll allow before we try to reap some. */
 598 enum { MAX_PROCS_BEFORE_REAP = 2 };
 599
 600 /* If 0 < PID, wait for the child process with that PID to exit.
 601    If PID is -1, clean up a random child process which has finished and
 602    return the process ID of that child.  If PID is -1 and no processes
 603    have quit yet, return 0 without waiting.  */
 604
 605 static pid_t
 606 reap (pid_t pid)
 607 {
 608   int status;
 609   pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
 610
 611   if (cpid < 0)
 612     error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
 613            compress_program);
 614   else if (0 < cpid)
 615     {
 616       if (! WIFEXITED (status) || WEXITSTATUS (status))
 617         error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
 618                compress_program);
 619       --nprocs;
 620     }
 621
 622   return cpid;
 623 }
 624
 625 /* Add the PID of a running compression process to proctab, or update
 626    the entry COUNT and STATE fields if it's already there.  This also
 627    creates the table for us the first time it's called.  */
 628
 629 static void
 630 register_proc (pid_t pid)
 631 {
 632   struct procnode test, *node;
 633
 634   if (! proctab)
 635     {
 636       proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
 637                                  proctab_hasher,
 638                                  proctab_comparator,
 639                                  free);
 640       if (! proctab)
 641         xalloc_die ();
 642     }
 643
 644   test.pid = pid;
 645   node = hash_lookup (proctab, &test);
 646   if (node)
 647     {
 648       node->state = ALIVE;
 649       ++node->count;
 650     }
 651   else
 652     {
 653       node = xmalloc (sizeof *node);
 654       node->pid = pid;
 655       node->state = ALIVE;
 656       node->count = 1;
 657       if (hash_insert (proctab, node) == NULL)
 658         xalloc_die ();
 659     }
 660 }
 661
 662 /* This is called when we reap a random process.  We don't know
 663    whether we have reaped a compression process or a decompression
 664    process until we look in the table.  If there's an ALIVE entry for
 665    it, then we have reaped a compression process, so change the state
 666    to ZOMBIE.  Otherwise, it's a decompression processes, so ignore it.  */
 667
 668 static void
 669 update_proc (pid_t pid)
 670 {
 671   struct procnode test, *node;
 672
 673   test.pid = pid;
 674   node = hash_lookup (proctab, &test);
 675   if (node)
 676     node->state = ZOMBIE;
 677 }
 678
 679 /* This is for when we need to wait for a compression process to exit.
 680    If it has a ZOMBIE entry in the table then it's already dead and has
 681    been reaped.  Note that if there's an ALIVE entry for it, it still may
 682    already have died and been reaped if a second process was created with
 683    the same PID.  This is probably exceedingly rare, but to be on the safe
 684    side we will have to wait for any compression process with this PID.  */
 685
 686 static void
 687 wait_proc (pid_t pid)
 688 {
 689   struct procnode test, *node;
 690
 691   test.pid = pid;
 692   node = hash_lookup (proctab, &test);
 693   if (node->state == ALIVE)
 694     reap (pid);
 695
 696   node->state = ZOMBIE;
 697   if (! --node->count)
 698     {
 699       hash_delete (proctab, node);
 700       free (node);
 701     }
 702 }
 703
 704 /* Keep reaping finished children as long as there are more to reap.
 705    This doesn't block waiting for any of them, it only reaps those
 706    that are already dead.  */
 707
 708 static void
 709 reap_some (void)
 710 {
 711   pid_t pid;
 712
 713   while (0 < nprocs && (pid = reap (-1)))
 714     update_proc (pid);
 715 }
 716
 717 /* Clean up any remaining temporary files.  */
 718
 719 static void
 720 cleanup (void)
 721 {
 722   struct tempnode const *node;
 723
 724   for (node = temphead; node; node = node->next)
 725     unlink (node->name);
 726   temphead = NULL;
 727 }
 728
 729 /* Cleanup actions to take when exiting.  */
 730
 731 static void
 732 exit_cleanup (void)
 733 {
 734   if (temphead)
 735     {
 736       /* Clean up any remaining temporary files in a critical section so
 737          that a signal handler does not try to clean them too.  */
 738       struct cs_status cs = cs_enter ();
 739       cleanup ();
 740       cs_leave (cs);
 741     }
 742
 743   close_stdout ();
 744 }
 745
 746 /* Create a new temporary file, returning its newly allocated tempnode.
 747    Store into *PFD the file descriptor open for writing.
 748    If the creation fails, return NULL and store -1 into *PFD if the
 749    failure is due to file descriptor exhaustion and
 750    SURVIVE_FD_EXHAUSTION; otherwise, die.  */
 751
 752 static struct tempnode *
 753 create_temp_file (int *pfd, bool survive_fd_exhaustion)
 754 {
 755   static char const slashbase[] = "/sortXXXXXX";
 756   static size_t temp_dir_index;
 757   int fd;
 758   int saved_errno;
 759   char const *temp_dir = temp_dirs[temp_dir_index];
 760   size_t len = strlen (temp_dir);
 761   struct tempnode *node =
 762     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 763   char *file = node->name;
 764   struct cs_status cs;
 765
 766   memcpy (file, temp_dir, len);
 767   memcpy (file + len, slashbase, sizeof slashbase);
 768   node->next = NULL;
 769   node->pid = 0;
 770   if (++temp_dir_index == temp_dir_count)
 771     temp_dir_index = 0;
 772
 773   /* Create the temporary file in a critical section, to avoid races.  */
 774   cs = cs_enter ();
 775   fd = mkstemp (file);
 776   if (0 <= fd)
 777     {
 778       *temptail = node;
 779       temptail = &node->next;
 780     }
 781   saved_errno = errno;
 782   cs_leave (cs);
 783   errno = saved_errno;
 784
 785   if (fd < 0)
 786     {
 787       if (! (survive_fd_exhaustion && errno == EMFILE))
 788         error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
 789                quote (temp_dir));
 790       free (node);
 791       node = NULL;
 792     }
 793
 794   *pfd = fd;
 795   return node;
 796 }
 797
 798 /* Predeclare an access pattern for input files.
 799    Ignore any errors -- this is only advisory.
 800
 801    There are a few hints we could possibly provide,
 802    and after careful testing it was decided that
 803    specifying POSIX_FADV_SEQUENTIAL was not detrimental
 804    to any cases.  On Linux 2.6.31, this option doubles
 805    the size of read ahead performed and thus was seen to
 806    benefit these cases:
 807      Merging
 808      Sorting with a smaller internal buffer
 809      Reading from faster flash devices
 810
 811    In _addition_ one could also specify other hints...
 812
 813    POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
 814    at least uses that to _synchronously_ prepopulate the cache
 815    with the specified range.  While sort does need to
 816    read all of its input before outputting, a synchronous
 817    read of the whole file up front precludes any processing
 818    that sort could do in parallel with the system doing
 819    read ahead of the data. This was seen to have negative effects
 820    in a couple of cases:
 821      Merging
 822      Sorting with a smaller internal buffer
 823    Note this option was seen to shorten the runtime for sort
 824    on a multicore system with lots of RAM and other processes
 825    competing for CPU.  It could be argued that more explicit
 826    scheduling hints with `nice` et. al. are more appropriate
 827    for this situation.
 828
 829    POSIX_FADV_NOREUSE is a possibility as it could lower
 830    the priority of input data in the cache as sort will
 831    only need to process it once.  However its functionality
 832    has changed over Linux kernel versions and as of 2.6.31
 833    it does nothing and thus we can't depend on what it might
 834    do in future.
 835
 836    POSIX_FADV_DONTNEED is not appropriate for user specified
 837    input files, but for temp files we do want to drop the
 838    cache immediately after processing.  This is done implicitly
 839    however when the files are unlinked.  */
 840
 841 static void
 842 fadvise_input (FILE *fp)
 843 {
 844 #if HAVE_POSIX_FADVISE
 845   if (fp)
 846     {
 847       int fd = fileno (fp);
 848       ignore_value (posix_fadvise (fd, 0, 0, POSIX_FADV_SEQUENTIAL));
 849     }
 850 #endif
 851 }
 852
 853 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 854    standard output; HOW should be "w".  When opening for input, "-"
 855    means standard input.  To avoid confusion, do not return file
 856    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 857    opening an ordinary FILE.  Return NULL if unsuccessful.  */
 858
 859 static FILE *
 860 stream_open (const char *file, const char *how)
 861 {
 862   if (!file)
 863     return stdout;
 864   if (*how == 'r')
 865     {
 866       FILE *fp;
 867       if (STREQ (file, "-"))
 868         {
 869           have_read_stdin = true;
 870           fp = stdin;
 871         }
 872       else
 873         fp = fopen (file, how);
 874       fadvise_input (fp);
 875       return fp;
 876     }
 877   return fopen (file, how);
 878 }
 879
 880 /* Same as stream_open, except always return a non-null value; die on
 881    failure.  */
 882
 883 static FILE *
 884 xfopen (const char *file, const char *how)
 885  {
 886   FILE *fp = stream_open (file, how);
 887   if (!fp)
 888     die (_("open failed"), file);
 889   return fp;
 890 }
 891
 892 /* Close FP, whose name is FILE, and report any errors.  */
 893
 894 static void
 895 xfclose (FILE *fp, char const *file)
 896 {
 897   switch (fileno (fp))
 898     {
 899     case STDIN_FILENO:
 900       /* Allow reading stdin from tty more than once.  */
 901       if (feof (fp))
 902         clearerr (fp);
 903       break;
 904
 905     case STDOUT_FILENO:
 906       /* Don't close stdout just yet.  close_stdout does that.  */
 907       if (fflush (fp) != 0)
 908         die (_("fflush failed"), file);
 909       break;
 910
 911     default:
 912       if (fclose (fp) != 0)
 913         die (_("close failed"), file);
 914       break;
 915     }
 916 }
 917
 918 static void
 919 dup2_or_die (int oldfd, int newfd)
 920 {
 921   if (dup2 (oldfd, newfd) < 0)
 922     error (SORT_FAILURE, errno, _("dup2 failed"));
 923 }
 924
 925 /* Fork a child process for piping to and do common cleanup.  The
 926    TRIES parameter tells us how many times to try to fork before
 927    giving up.  Return the PID of the child, or -1 (setting errno)
 928    on failure. */
 929
 930 static pid_t
 931 pipe_fork (int pipefds[2], size_t tries)
 932 {
 933 #if HAVE_WORKING_FORK
 934   struct tempnode *saved_temphead;
 935   int saved_errno;
 936   double wait_retry = 0.25;
 937   pid_t pid IF_LINT (= -1);
 938   struct cs_status cs;
 939
 940   if (pipe (pipefds) < 0)
 941     return -1;
 942
 943   while (tries--)
 944     {
 945       /* This is so the child process won't delete our temp files
 946          if it receives a signal before exec-ing.  */
 947       cs = cs_enter ();
 948       saved_temphead = temphead;
 949       temphead = NULL;
 950
 951       pid = fork ();
 952       saved_errno = errno;
 953       if (pid)
 954         temphead = saved_temphead;
 955
 956       cs_leave (cs);
 957       errno = saved_errno;
 958
 959       if (0 <= pid || errno != EAGAIN)
 960         break;
 961       else
 962         {
 963           xnanosleep (wait_retry);
 964           wait_retry *= 2;
 965           reap_some ();
 966         }
 967     }
 968
 969   if (pid < 0)
 970     {
 971       saved_errno = errno;
 972       close (pipefds[0]);
 973       close (pipefds[1]);
 974       errno = saved_errno;
 975     }
 976   else if (pid == 0)
 977     {
 978       close (STDIN_FILENO);
 979       close (STDOUT_FILENO);
 980     }
 981   else
 982     ++nprocs;
 983
 984   return pid;
 985
 986 #else  /* ! HAVE_WORKING_FORK */
 987   return -1;
 988 #endif
 989 }
 990
 991 /* Create a temporary file and start a compression program to filter output
 992    to that file.  Set *PFP to the file handle and if PPID is non-NULL,
 993    set *PPID to the PID of the newly-created process.  If the creation
 994    fails, return NULL if the failure is due to file descriptor
 995    exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die.  */
 996
 997 static char *
 998 maybe_create_temp (FILE **pfp, pid_t *ppid, bool survive_fd_exhaustion)
 999 {
1000   int tempfd;
1001   struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1002   char *name;
1003
1004   if (! node)
1005     return NULL;
1006
1007   name = node->name;
1008
1009   if (compress_program)
1010     {
1011       int pipefds[2];
1012
1013       node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1014       if (0 < node->pid)
1015         {
1016           close (tempfd);
1017           close (pipefds[0]);
1018           tempfd = pipefds[1];
1019
1020           register_proc (node->pid);
1021         }
1022       else if (node->pid == 0)
1023         {
1024           close (pipefds[1]);
1025           dup2_or_die (tempfd, STDOUT_FILENO);
1026           close (tempfd);
1027           dup2_or_die (pipefds[0], STDIN_FILENO);
1028           close (pipefds[0]);
1029
1030           if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1031             error (SORT_FAILURE, errno, _("couldn't execute %s"),
1032                    compress_program);
1033         }
1034       else
1035         node->pid = 0;
1036     }
1037
1038   *pfp = fdopen (tempfd, "w");
1039   if (! *pfp)
1040     die (_("couldn't create temporary file"), name);
1041
1042   if (ppid)
1043     *ppid = node->pid;
1044
1045   return name;
1046 }
1047
1048 /* Create a temporary file and start a compression program to filter output
1049    to that file.  Set *PFP to the file handle and if *PPID is non-NULL,
1050    set it to the PID of the newly-created process.  Die on failure.  */
1051
1052 static char *
1053 create_temp (FILE **pfp, pid_t *ppid)
1054 {
1055   return maybe_create_temp (pfp, ppid, false);
1056 }
1057
1058 /* Open a compressed temp file and start a decompression process through
1059    which to filter the input.  PID must be the valid processes ID of the
1060    process used to compress the file.  Return NULL (setting errno to
1061    EMFILE) if we ran out of file descriptors, and die on any other
1062    kind of failure.  */
1063
1064 static FILE *
1065 open_temp (const char *name, pid_t pid)
1066 {
1067   int tempfd, pipefds[2];
1068   FILE *fp = NULL;
1069
1070   wait_proc (pid);
1071
1072   tempfd = open (name, O_RDONLY);
1073   if (tempfd < 0)
1074     return NULL;
1075
1076   switch (pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS))
1077     {
1078     case -1:
1079       if (errno != EMFILE)
1080         error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1081                compress_program);
1082       close (tempfd);
1083       errno = EMFILE;
1084       break;
1085
1086     case 0:
1087       close (pipefds[0]);
1088       dup2_or_die (tempfd, STDIN_FILENO);
1089       close (tempfd);
1090       dup2_or_die (pipefds[1], STDOUT_FILENO);
1091       close (pipefds[1]);
1092
1093       execlp (compress_program, compress_program, "-d", (char *) NULL);
1094       error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1095              compress_program);
1096
1097     default:
1098       close (tempfd);
1099       close (pipefds[1]);
1100
1101       fp = fdopen (pipefds[0], "r");
1102       if (! fp)
1103         {
1104           int saved_errno = errno;
1105           close (pipefds[0]);
1106           errno = saved_errno;
1107         }
1108       break;
1109     }
1110
1111   return fp;
1112 }
1113
1114 static void
1115 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
1116 {
1117   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
1118     die (_("write failed"), output_file);
1119 }
1120
1121 /* Append DIR to the array of temporary directory names.  */
1122 static void
1123 add_temp_dir (char const *dir)
1124 {
1125   if (temp_dir_count == temp_dir_alloc)
1126     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1127
1128   temp_dirs[temp_dir_count++] = dir;
1129 }
1130
1131 /* Remove NAME from the list of temporary files.  */
1132
1133 static void
1134 zaptemp (const char *name)
1135 {
1136   struct tempnode *volatile *pnode;
1137   struct tempnode *node;
1138   struct tempnode *next;
1139   int unlink_status;
1140   int unlink_errno = 0;
1141   struct cs_status cs;
1142
1143   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1144     continue;
1145
1146   /* Unlink the temporary file in a critical section to avoid races.  */
1147   next = node->next;
1148   cs = cs_enter ();
1149   unlink_status = unlink (name);
1150   unlink_errno = errno;
1151   *pnode = next;
1152   cs_leave (cs);
1153
1154   if (unlink_status != 0)
1155     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1156   if (! next)
1157     temptail = pnode;
1158   free (node);
1159 }
1160
1161 #if HAVE_NL_LANGINFO
1162
1163 static int
1164 struct_month_cmp (const void *m1, const void *m2)
1165 {
1166   struct month const *month1 = m1;
1167   struct month const *month2 = m2;
1168   return strcmp (month1->name, month2->name);
1169 }
1170
1171 #endif
1172
1173 /* Initialize the character class tables. */
1174
1175 static void
1176 inittables (void)
1177 {
1178   size_t i;
1179
1180   for (i = 0; i < UCHAR_LIM; ++i)
1181     {
1182       blanks[i] = !! isblank (i);
1183       nonprinting[i] = ! isprint (i);
1184       nondictionary[i] = ! isalnum (i) && ! isblank (i);
1185       fold_toupper[i] = toupper (i);
1186     }
1187
1188 #if HAVE_NL_LANGINFO
1189   /* If we're not in the "C" locale, read different names for months.  */
1190   if (hard_LC_TIME)
1191     {
1192       for (i = 0; i < MONTHS_PER_YEAR; i++)
1193         {
1194           char const *s;
1195           size_t s_len;
1196           size_t j, k;
1197           char *name;
1198
1199           s = (char *) nl_langinfo (ABMON_1 + i);
1200           s_len = strlen (s);
1201           monthtab[i].name = name = xmalloc (s_len + 1);
1202           monthtab[i].val = i + 1;
1203
1204           for (j = k = 0; j < s_len; j++)
1205             if (! isblank (to_uchar (s[j])))
1206               name[k++] = fold_toupper[to_uchar (s[j])];
1207           name[k] = '\0';
1208         }
1209       qsort ((void *) monthtab, MONTHS_PER_YEAR,
1210              sizeof *monthtab, struct_month_cmp);
1211     }
1212 #endif
1213 }
1214
1215 /* Specify how many inputs may be merged at once.
1216    This may be set on the command-line with the
1217    --batch-size option. */
1218 static void
1219 specify_nmerge (int oi, char c, char const *s)
1220 {
1221   uintmax_t n;
1222   struct rlimit rlimit;
1223   enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1224
1225   /* Try to find out how many file descriptors we'll be able
1226      to open.  We need at least nmerge + 3 (STDIN_FILENO,
1227      STDOUT_FILENO and STDERR_FILENO). */
1228   unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1229                               ? rlimit.rlim_cur
1230                               : OPEN_MAX)
1231                              - 3);
1232
1233   if (e == LONGINT_OK)
1234     {
1235       nmerge = n;
1236       if (nmerge != n)
1237         e = LONGINT_OVERFLOW;
1238       else
1239         {
1240           if (nmerge < 2)
1241             {
1242               error (0, 0, _("invalid --%s argument %s"),
1243                      long_options[oi].name, quote(s));
1244               error (SORT_FAILURE, 0,
1245                      _("minimum --%s argument is %s"),
1246                      long_options[oi].name, quote("2"));
1247             }
1248           else if (max_nmerge < nmerge)
1249             {
1250               e = LONGINT_OVERFLOW;
1251             }
1252           else
1253             return;
1254         }
1255     }
1256
1257   if (e == LONGINT_OVERFLOW)
1258     {
1259       char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
1260       error (0, 0, _("--%s argument %s too large"),
1261              long_options[oi].name, quote(s));
1262       error (SORT_FAILURE, 0,
1263              _("maximum --%s argument with current rlimit is %s"),
1264              long_options[oi].name,
1265              uinttostr (max_nmerge, max_nmerge_buf));
1266     }
1267   else
1268     xstrtol_fatal (e, oi, c, long_options, s);
1269 }
1270
1271 /* Specify the amount of main memory to use when sorting.  */
1272 static void
1273 specify_sort_size (int oi, char c, char const *s)
1274 {
1275   uintmax_t n;
1276   char *suffix;
1277   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1278
1279   /* The default unit is KiB.  */
1280   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1281     {
1282       if (n <= UINTMAX_MAX / 1024)
1283         n *= 1024;
1284       else
1285         e = LONGINT_OVERFLOW;
1286     }
1287
1288   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
1289   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1290     switch (suffix[0])
1291       {
1292       case 'b':
1293         e = LONGINT_OK;
1294         break;
1295
1296       case '%':
1297         {
1298           double mem = physmem_total () * n / 100;
1299
1300           /* Use "<", not "<=", to avoid problems with rounding.  */
1301           if (mem < UINTMAX_MAX)
1302             {
1303               n = mem;
1304               e = LONGINT_OK;
1305             }
1306           else
1307             e = LONGINT_OVERFLOW;
1308         }
1309         break;
1310       }
1311
1312   if (e == LONGINT_OK)
1313     {
1314       /* If multiple sort sizes are specified, take the maximum, so
1315          that option order does not matter.  */
1316       if (n < sort_size)
1317         return;
1318
1319       sort_size = n;
1320       if (sort_size == n)
1321         {
1322           sort_size = MAX (sort_size, MIN_SORT_SIZE);
1323           return;
1324         }
1325
1326       e = LONGINT_OVERFLOW;
1327     }
1328
1329   xstrtol_fatal (e, oi, c, long_options, s);
1330 }
1331
1332 /* Return the default sort size.  */
1333 static size_t
1334 default_sort_size (void)
1335 {
1336   /* Let MEM be available memory or 1/8 of total memory, whichever
1337      is greater.  */
1338   double avail = physmem_available ();
1339   double total = physmem_total ();
1340   double mem = MAX (avail, total / 8);
1341   struct rlimit rlimit;
1342
1343   /* Let SIZE be MEM, but no more than the maximum object size or
1344      system resource limits.  Avoid the MIN macro here, as it is not
1345      quite right when only one argument is floating point.  Don't
1346      bother to check for values like RLIM_INFINITY since in practice
1347      they are not much less than SIZE_MAX.  */
1348   size_t size = SIZE_MAX;
1349   if (mem < size)
1350     size = mem;
1351   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1352     size = rlimit.rlim_cur;
1353 #ifdef RLIMIT_AS
1354   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1355     size = rlimit.rlim_cur;
1356 #endif
1357
1358   /* Leave a large safety margin for the above limits, as failure can
1359      occur when they are exceeded.  */
1360   size /= 2;
1361
1362 #ifdef RLIMIT_RSS
1363   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1364      Exceeding RSS is not fatal, but can be quite slow.  */
1365   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1366     size = rlimit.rlim_cur / 16 * 15;
1367 #endif
1368
1369   /* Use no less than the minimum.  */
1370   return MAX (size, MIN_SORT_SIZE);
1371 }
1372
1373 /* Return the sort buffer size to use with the input files identified
1374    by FPS and FILES, which are alternate names of the same files.
1375    NFILES gives the number of input files; NFPS may be less.  Assume
1376    that each input line requires LINE_BYTES extra bytes' worth of line
1377    information.  Do not exceed the size bound specified by the user
1378    (or a default size bound, if the user does not specify one).  */
1379
1380 static size_t
1381 sort_buffer_size (FILE *const *fps, size_t nfps,
1382                   char *const *files, size_t nfiles,
1383                   size_t line_bytes)
1384 {
1385   /* A bound on the input size.  If zero, the bound hasn't been
1386      determined yet.  */
1387   static size_t size_bound;
1388
1389   /* In the worst case, each input byte is a newline.  */
1390   size_t worst_case_per_input_byte = line_bytes + 1;
1391
1392   /* Keep enough room for one extra input line and an extra byte.
1393      This extra room might be needed when preparing to read EOF.  */
1394   size_t size = worst_case_per_input_byte + 1;
1395
1396   size_t i;
1397
1398   for (i = 0; i < nfiles; i++)
1399     {
1400       struct stat st;
1401       off_t file_size;
1402       size_t worst_case;
1403
1404       if ((i < nfps ? fstat (fileno (fps[i]), &st)
1405            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1406            : stat (files[i], &st))
1407           != 0)
1408         die (_("stat failed"), files[i]);
1409
1410       if (S_ISREG (st.st_mode))
1411         file_size = st.st_size;
1412       else
1413         {
1414           /* The file has unknown size.  If the user specified a sort
1415              buffer size, use that; otherwise, guess the size.  */
1416           if (sort_size)
1417             return sort_size;
1418           file_size = INPUT_FILE_SIZE_GUESS;
1419         }
1420
1421       if (! size_bound)
1422         {
1423           size_bound = sort_size;
1424           if (! size_bound)
1425             size_bound = default_sort_size ();
1426         }
1427
1428       /* Add the amount of memory needed to represent the worst case
1429          where the input consists entirely of newlines followed by a
1430          single non-newline.  Check for overflow.  */
1431       worst_case = file_size * worst_case_per_input_byte + 1;
1432       if (file_size != worst_case / worst_case_per_input_byte
1433           || size_bound - size <= worst_case)
1434         return size_bound;
1435       size += worst_case;
1436     }
1437
1438   return size;
1439 }
1440
1441 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
1442    must be at least sizeof (struct line).  Allocate ALLOC bytes
1443    initially.  */
1444
1445 static void
1446 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1447 {
1448   /* Ensure that the line array is properly aligned.  If the desired
1449      size cannot be allocated, repeatedly halve it until allocation
1450      succeeds.  The smaller allocation may hurt overall performance,
1451      but that's better than failing.  */
1452   for (;;)
1453     {
1454       alloc += sizeof (struct line) - alloc % sizeof (struct line);
1455       buf->buf = malloc (alloc);
1456       if (buf->buf)
1457         break;
1458       alloc /= 2;
1459       if (alloc <= line_bytes + 1)
1460         xalloc_die ();
1461     }
1462
1463   buf->line_bytes = line_bytes;
1464   buf->alloc = alloc;
1465   buf->used = buf->left = buf->nlines = 0;
1466   buf->eof = false;
1467 }
1468
1469 /* Return one past the limit of the line array.  */
1470
1471 static inline struct line *
1472 buffer_linelim (struct buffer const *buf)
1473 {
1474   return (struct line *) (buf->buf + buf->alloc);
1475 }
1476
1477 /* Return a pointer to the first character of the field specified
1478    by KEY in LINE. */
1479
1480 static char *
1481 begfield (const struct line *line, const struct keyfield *key)
1482 {
1483   char *ptr = line->text, *lim = ptr + line->length - 1;
1484   size_t sword = key->sword;
1485   size_t schar = key->schar;
1486
1487   /* The leading field separator itself is included in a field when -t
1488      is absent.  */
1489
1490   if (tab != TAB_DEFAULT)
1491     while (ptr < lim && sword--)
1492       {
1493         while (ptr < lim && *ptr != tab)
1494           ++ptr;
1495         if (ptr < lim)
1496           ++ptr;
1497       }
1498   else
1499     while (ptr < lim && sword--)
1500       {
1501         while (ptr < lim && blanks[to_uchar (*ptr)])
1502           ++ptr;
1503         while (ptr < lim && !blanks[to_uchar (*ptr)])
1504           ++ptr;
1505       }
1506
1507   /* If we're ignoring leading blanks when computing the Start
1508      of the field, skip past them here.  */
1509   if (key->skipsblanks)
1510     while (ptr < lim && blanks[to_uchar (*ptr)])
1511       ++ptr;
1512
1513   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
1514   ptr = MIN (lim, ptr + schar);
1515
1516   return ptr;
1517 }
1518
1519 /* Return the limit of (a pointer to the first character after) the field
1520    in LINE specified by KEY. */
1521
1522 static char *
1523 limfield (const struct line *line, const struct keyfield *key)
1524 {
1525   char *ptr = line->text, *lim = ptr + line->length - 1;
1526   size_t eword = key->eword, echar = key->echar;
1527
1528   if (echar == 0)
1529     eword++; /* Skip all of end field.  */
1530
1531   /* Move PTR past EWORD fields or to one past the last byte on LINE,
1532      whichever comes first.  If there are more than EWORD fields, leave
1533      PTR pointing at the beginning of the field having zero-based index,
1534      EWORD.  If a delimiter character was specified (via -t), then that
1535      `beginning' is the first character following the delimiting TAB.
1536      Otherwise, leave PTR pointing at the first `blank' character after
1537      the preceding field.  */
1538   if (tab != TAB_DEFAULT)
1539     while (ptr < lim && eword--)
1540       {
1541         while (ptr < lim && *ptr != tab)
1542           ++ptr;
1543         if (ptr < lim && (eword || echar))
1544           ++ptr;
1545       }
1546   else
1547     while (ptr < lim && eword--)
1548       {
1549         while (ptr < lim && blanks[to_uchar (*ptr)])
1550           ++ptr;
1551         while (ptr < lim && !blanks[to_uchar (*ptr)])
1552           ++ptr;
1553       }
1554
1555 #ifdef POSIX_UNSPECIFIED
1556   /* The following block of code makes GNU sort incompatible with
1557      standard Unix sort, so it's ifdef'd out for now.
1558      The POSIX spec isn't clear on how to interpret this.
1559      FIXME: request clarification.
1560
1561      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1562      Date: Thu, 30 May 96 12:20:41 -0400
1563      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1564
1565      [...]I believe I've found another bug in `sort'.
1566
1567      $ cat /tmp/sort.in
1568      a b c 2 d
1569      pq rs 1 t
1570      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1571      a b c 2 d
1572      pq rs 1 t
1573      $ /bin/sort -k1.7,1.7 </tmp/sort.in
1574      pq rs 1 t
1575      a b c 2 d
1576
1577      Unix sort produced the answer I expected: sort on the single character
1578      in column 7.  GNU sort produced different results, because it disagrees
1579      on the interpretation of the key-end spec "M.N".  Unix sort reads this
1580      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1581      "skip M-1 fields, then either N-1 characters or the rest of the current
1582      field, whichever comes first".  This extra clause applies only to
1583      key-ends, not key-starts.
1584      */
1585
1586   /* Make LIM point to the end of (one byte past) the current field.  */
1587   if (tab != TAB_DEFAULT)
1588     {
1589       char *newlim;
1590       newlim = memchr (ptr, tab, lim - ptr);
1591       if (newlim)
1592         lim = newlim;
1593     }
1594   else
1595     {
1596       char *newlim;
1597       newlim = ptr;
1598       while (newlim < lim && blanks[to_uchar (*newlim)])
1599         ++newlim;
1600       while (newlim < lim && !blanks[to_uchar (*newlim)])
1601         ++newlim;
1602       lim = newlim;
1603     }
1604 #endif
1605
1606   if (echar != 0) /* We need to skip over a portion of the end field.  */
1607     {
1608       /* If we're ignoring leading blanks when computing the End
1609          of the field, skip past them here.  */
1610       if (key->skipeblanks)
1611         while (ptr < lim && blanks[to_uchar (*ptr)])
1612           ++ptr;
1613
1614       /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
1615       ptr = MIN (lim, ptr + echar);
1616     }
1617
1618   return ptr;
1619 }
1620
1621 /* Fill BUF reading from FP, moving buf->left bytes from the end
1622    of buf->buf to the beginning first.  If EOF is reached and the
1623    file wasn't terminated by a newline, supply one.  Set up BUF's line
1624    table too.  FILE is the name of the file corresponding to FP.
1625    Return true if some input was read.  */
1626
1627 static bool
1628 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1629 {
1630   struct keyfield const *key = keylist;
1631   char eol = eolchar;
1632   size_t line_bytes = buf->line_bytes;
1633   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1634
1635   if (buf->eof)
1636     return false;
1637
1638   if (buf->used != buf->left)
1639     {
1640       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1641       buf->used = buf->left;
1642       buf->nlines = 0;
1643     }
1644
1645   for (;;)
1646     {
1647       char *ptr = buf->buf + buf->used;
1648       struct line *linelim = buffer_linelim (buf);
1649       struct line *line = linelim - buf->nlines;
1650       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1651       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1652
1653       while (line_bytes + 1 < avail)
1654         {
1655           /* Read as many bytes as possible, but do not read so many
1656              bytes that there might not be enough room for the
1657              corresponding line array.  The worst case is when the
1658              rest of the input file consists entirely of newlines,
1659              except that the last byte is not a newline.  */
1660           size_t readsize = (avail - 1) / (line_bytes + 1);
1661           size_t bytes_read = fread (ptr, 1, readsize, fp);
1662           char *ptrlim = ptr + bytes_read;
1663           char *p;
1664           avail -= bytes_read;
1665
1666           if (bytes_read != readsize)
1667             {
1668               if (ferror (fp))
1669                 die (_("read failed"), file);
1670               if (feof (fp))
1671                 {
1672                   buf->eof = true;
1673                   if (buf->buf == ptrlim)
1674                     return false;
1675                   if (ptrlim[-1] != eol)
1676                     *ptrlim++ = eol;
1677                 }
1678             }
1679
1680           /* Find and record each line in the just-read input.  */
1681           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1682             {
1683               ptr = p + 1;
1684               line--;
1685               line->text = line_start;
1686               line->length = ptr - line_start;
1687               mergesize = MAX (mergesize, line->length);
1688               avail -= line_bytes;
1689
1690               if (key)
1691                 {
1692                   /* Precompute the position of the first key for
1693                      efficiency.  */
1694                   line->keylim = (key->eword == SIZE_MAX
1695                                   ? p
1696                                   : limfield (line, key));
1697
1698                   if (key->sword != SIZE_MAX)
1699                     line->keybeg = begfield (line, key);
1700                   else
1701                     {
1702                       if (key->skipsblanks)
1703                         while (blanks[to_uchar (*line_start)])
1704                           line_start++;
1705                       line->keybeg = line_start;
1706                     }
1707                 }
1708
1709               line_start = ptr;
1710             }
1711
1712           ptr = ptrlim;
1713           if (buf->eof)
1714             break;
1715         }
1716
1717       buf->used = ptr - buf->buf;
1718       buf->nlines = buffer_linelim (buf) - line;
1719       if (buf->nlines != 0)
1720         {
1721           buf->left = ptr - line_start;
1722           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1723           return true;
1724         }
1725
1726       {
1727         /* The current input line is too long to fit in the buffer.
1728            Double the buffer size and try again, keeping it properly
1729            aligned.  */
1730         size_t line_alloc = buf->alloc / sizeof (struct line);
1731         buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1732         buf->alloc = line_alloc * sizeof (struct line);
1733       }
1734     }
1735 }
1736
1737 /* Compare strings A and B as numbers without explicitly converting them to
1738    machine numbers.  Comparatively slow for short strings, but asymptotically
1739    hideously fast. */
1740
1741 static int
1742 numcompare (const char *a, const char *b)
1743 {
1744   while (blanks[to_uchar (*a)])
1745     a++;
1746   while (blanks[to_uchar (*b)])
1747     b++;
1748
1749   return strnumcmp (a, b, decimal_point, thousands_sep);
1750 }
1751
1752 /* Exit with an error if a mixture of SI and IEC units detected.  */
1753
1754 static void
1755 check_mixed_SI_IEC (char prefix, struct keyfield *key)
1756 {
1757   int iec_present = prefix == 'i';
1758   if (key->iec_present != -1 && iec_present != key->iec_present)
1759     error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
1760   key->iec_present = iec_present;
1761 }
1762
1763 /* Return an integer which represents the order of magnitude of
1764    the unit following the number.  NUMBER can contain thousands separators
1765    or a decimal point, but not have preceeding blanks.
1766    Negative numbers return a negative unit order.  */
1767
1768 static int
1769 find_unit_order (const char *number, struct keyfield *key)
1770 {
1771   static const char orders [UCHAR_LIM] =
1772     {
1773 #if SOME_DAY_WE_WILL_REQUIRE_C99
1774       ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1775       ['k']=1,
1776 #else
1777       /* Generate the following table with this command:
1778          perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1779          foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1780          |fmt  */
1781       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1782       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1783       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1784       0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1785       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1786       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1787       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1788       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1789       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1790       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1791       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1792 #endif
1793     };
1794
1795   const unsigned char *p = number;
1796
1797   int sign = 1;
1798
1799   if (*p == '-')
1800     {
1801       sign = -1;
1802       p++;
1803     }
1804
1805   /* Scan to end of number.
1806      Decimals or separators not followed by digits stop the scan.
1807      Numbers ending in decimals or separators are thus considered
1808      to be lacking in units.
1809      FIXME: add support for multibyte thousands_sep and decimal_point.  */
1810
1811   while (ISDIGIT (*p))
1812     {
1813       p++;
1814
1815       if (*p == decimal_point && ISDIGIT (*(p + 1)))
1816         p += 2;
1817       else if (*p == thousands_sep && ISDIGIT (*(p + 1)))
1818         p += 2;
1819     }
1820
1821   int order = orders[*p];
1822
1823   /* For valid units check for MiB vs MB etc.  */
1824   if (order)
1825     check_mixed_SI_IEC (*(p + 1), key);
1826
1827   return sign * order;
1828 }
1829
1830 /* Compare numbers ending in units with SI xor IEC prefixes
1831        <none/unknown> < K/k < M < G < T < P < E < Z < Y
1832    Assume that numbers are properly abbreviated.
1833    i.e. input will never have both 6000K and 5M.  */
1834
1835 static int
1836 human_numcompare (const char *a, const char *b, struct keyfield *key)
1837 {
1838   while (blanks[to_uchar (*a)])
1839     a++;
1840   while (blanks[to_uchar (*b)])
1841     b++;
1842
1843   int order_a = find_unit_order (a, key);
1844   int order_b = find_unit_order (b, key);
1845
1846   return (order_a > order_b ? 1
1847           : order_a < order_b ? -1
1848           : strnumcmp (a, b, decimal_point, thousands_sep));
1849 }
1850
1851 static int
1852 general_numcompare (const char *sa, const char *sb)
1853 {
1854   /* FIXME: add option to warn about failed conversions.  */
1855   /* FIXME: maybe add option to try expensive FP conversion
1856      only if A and B can't be compared more cheaply/accurately.  */
1857
1858   char *ea;
1859   char *eb;
1860   double a = strtod (sa, &ea);
1861   double b = strtod (sb, &eb);
1862
1863   /* Put conversion errors at the start of the collating sequence.  */
1864   if (sa == ea)
1865     return sb == eb ? 0 : -1;
1866   if (sb == eb)
1867     return 1;
1868
1869   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1870      conversion errors but before numbers; sort them by internal
1871      bit-pattern, for lack of a more portable alternative.  */
1872   return (a < b ? -1
1873           : a > b ? 1
1874           : a == b ? 0
1875           : b == b ? -1
1876           : a == a ? 1
1877           : memcmp ((char *) &a, (char *) &b, sizeof a));
1878 }
1879
1880 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1881    Return 0 if the name in S is not recognized.  */
1882
1883 static int
1884 getmonth (char const *month, size_t len)
1885 {
1886   size_t lo = 0;
1887   size_t hi = MONTHS_PER_YEAR;
1888   char const *monthlim = month + len;
1889
1890   for (;;)
1891     {
1892       if (month == monthlim)
1893         return 0;
1894       if (!blanks[to_uchar (*month)])
1895         break;
1896       ++month;
1897     }
1898
1899   do
1900     {
1901       size_t ix = (lo + hi) / 2;
1902       char const *m = month;
1903       char const *n = monthtab[ix].name;
1904
1905       for (;; m++, n++)
1906         {
1907           if (!*n)
1908             return monthtab[ix].val;
1909           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1910             {
1911               hi = ix;
1912               break;
1913             }
1914           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1915             {
1916               lo = ix + 1;
1917               break;
1918             }
1919         }
1920     }
1921   while (lo < hi);
1922
1923   return 0;
1924 }
1925
1926 /* A source of random data.  */
1927 static struct randread_source *randread_source;
1928
1929 /* Return the Ith randomly-generated state.  The caller must invoke
1930    random_state (H) for all H less than I before invoking random_state
1931    (I).  */
1932
1933 static struct md5_ctx
1934 random_state (size_t i)
1935 {
1936   /* An array of states resulting from the random data, and counts of
1937      its used and allocated members.  */
1938   static struct md5_ctx *state;
1939   static size_t used;
1940   static size_t allocated;
1941
1942   struct md5_ctx *s = &state[i];
1943
1944   if (used <= i)
1945     {
1946       unsigned char buf[MD5_DIGEST_SIZE];
1947
1948       used++;
1949
1950       if (allocated <= i)
1951         {
1952           state = X2NREALLOC (state, &allocated);
1953           s = &state[i];
1954         }
1955
1956       randread (randread_source, buf, sizeof buf);
1957       md5_init_ctx (s);
1958       md5_process_bytes (buf, sizeof buf, s);
1959     }
1960
1961   return *s;
1962 }
1963
1964 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1965    with length LENGTHB.  Return negative if less, zero if equal,
1966    positive if greater.  */
1967
1968 static int
1969 cmp_hashes (char const *texta, size_t lena,
1970             char const *textb, size_t lenb)
1971 {
1972   /* Try random hashes until a pair of hashes disagree.  But if the
1973      first pair of random hashes agree, check whether the keys are
1974      identical and if so report no difference.  */
1975   int diff;
1976   size_t i;
1977   for (i = 0; ; i++)
1978     {
1979       uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1980       struct md5_ctx s[2];
1981       s[0] = s[1] = random_state (i);
1982       md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
1983       md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
1984       diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1985       if (diff != 0)
1986         break;
1987       if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1988         break;
1989     }
1990
1991   return diff;
1992 }
1993
1994 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1995    using one or more random hash functions.  */
1996
1997 static int
1998 compare_random (char *restrict texta, size_t lena,
1999                 char *restrict textb, size_t lenb)
2000 {
2001   int diff;
2002
2003   if (! hard_LC_COLLATE)
2004     diff = cmp_hashes (texta, lena, textb, lenb);
2005   else
2006     {
2007       /* Transform the text into the basis of comparison, so that byte
2008          strings that would otherwise considered to be equal are
2009          considered equal here even if their bytes differ.  */
2010
2011       char *buf = NULL;
2012       char stackbuf[4000];
2013       size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
2014       bool a_fits = tlena <= sizeof stackbuf;
2015       size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
2016                                (a_fits ? sizeof stackbuf - tlena : 0),
2017                                textb, lenb);
2018
2019       if (a_fits && tlena + tlenb <= sizeof stackbuf)
2020         buf = stackbuf;
2021       else
2022         {
2023           /* Adding 1 to the buffer size lets xmemxfrm run a bit
2024              faster by avoiding the need for an extra buffer copy.  */
2025           buf = xmalloc (tlena + tlenb + 1);
2026           xmemxfrm (buf, tlena + 1, texta, lena);
2027           xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
2028         }
2029
2030       diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
2031
2032       if (buf != stackbuf)
2033         free (buf);
2034     }
2035
2036   return diff;
2037 }
2038
2039 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2040    using filevercmp. See lib/filevercmp.h for function description. */
2041
2042 static int
2043 compare_version (char *restrict texta, size_t lena,
2044                  char *restrict textb, size_t lenb)
2045 {
2046   int diff;
2047
2048   /* It is necessary to save the character after the end of the field.
2049      "filevercmp" works with NUL terminated strings.  Our blocks of
2050      text are not necessarily terminated with a NUL byte. */
2051   char sv_a = texta[lena];
2052   char sv_b = textb[lenb];
2053
2054   texta[lena] = '\0';
2055   textb[lenb] = '\0';
2056
2057   diff = filevercmp (texta, textb);
2058
2059   texta[lena] = sv_a;
2060   textb[lenb] = sv_b;
2061
2062   return diff;
2063 }
2064
2065 /* Compare two lines A and B trying every key in sequence until there
2066    are no more keys or a difference is found. */
2067
2068 static int
2069 keycompare (const struct line *a, const struct line *b)
2070 {
2071   struct keyfield *key = keylist;
2072
2073   /* For the first iteration only, the key positions have been
2074      precomputed for us. */
2075   char *texta = a->keybeg;
2076   char *textb = b->keybeg;
2077   char *lima = a->keylim;
2078   char *limb = b->keylim;
2079
2080   int diff;
2081
2082   for (;;)
2083     {
2084       char const *translate = key->translate;
2085       bool const *ignore = key->ignore;
2086
2087       /* Treat field ends before field starts as empty fields.  */
2088       lima = MAX (texta, lima);
2089       limb = MAX (textb, limb);
2090
2091       /* Find the lengths. */
2092       size_t lena = lima - texta;
2093       size_t lenb = limb - textb;
2094
2095       /* Actually compare the fields. */
2096
2097       if (key->random)
2098         diff = compare_random (texta, lena, textb, lenb);
2099       else if (key->numeric || key->general_numeric || key->human_numeric)
2100         {
2101           char savea = *lima, saveb = *limb;
2102
2103           *lima = *limb = '\0';
2104           diff = (key->numeric ? numcompare (texta, textb)
2105                   : key->general_numeric ? general_numcompare (texta, textb)
2106                   : human_numcompare (texta, textb, key));
2107           *lima = savea, *limb = saveb;
2108         }
2109       else if (key->version)
2110         diff = compare_version (texta, lena, textb, lenb);
2111       else if (key->month)
2112         diff = getmonth (texta, lena) - getmonth (textb, lenb);
2113       /* Sorting like this may become slow, so in a simple locale the user
2114          can select a faster sort that is similar to ascii sort.  */
2115       else if (hard_LC_COLLATE)
2116         {
2117           if (ignore || translate)
2118             {
2119               char buf[4000];
2120               size_t size = lena + 1 + lenb + 1;
2121               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
2122               char *copy_b = copy_a + lena + 1;
2123               size_t new_len_a, new_len_b, i;
2124
2125               /* Ignore and/or translate chars before comparing.  */
2126               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
2127                 {
2128                   if (i < lena)
2129                     {
2130                       copy_a[new_len_a] = (translate
2131                                            ? translate[to_uchar (texta[i])]
2132                                            : texta[i]);
2133                       if (!ignore || !ignore[to_uchar (texta[i])])
2134                         ++new_len_a;
2135                     }
2136                   if (i < lenb)
2137                     {
2138                       copy_b[new_len_b] = (translate
2139                                            ? translate[to_uchar (textb[i])]
2140                                            : textb [i]);
2141                       if (!ignore || !ignore[to_uchar (textb[i])])
2142                         ++new_len_b;
2143                     }
2144                 }
2145
2146               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
2147
2148               if (sizeof buf < size)
2149                 free (copy_a);
2150             }
2151           else if (lena == 0)
2152             diff = - NONZERO (lenb);
2153           else if (lenb == 0)
2154             goto greater;
2155           else
2156             diff = xmemcoll (texta, lena, textb, lenb);
2157         }
2158       else if (ignore)
2159         {
2160 #define CMP_WITH_IGNORE(A, B)                                           \
2161   do                                                                    \
2162     {                                                                   \
2163           for (;;)                                                      \
2164             {                                                           \
2165               while (texta < lima && ignore[to_uchar (*texta)])         \
2166                 ++texta;                                                \
2167               while (textb < limb && ignore[to_uchar (*textb)])         \
2168                 ++textb;                                                \
2169               if (! (texta < lima && textb < limb))                     \
2170                 break;                                                  \
2171               diff = to_uchar (A) - to_uchar (B);                       \
2172               if (diff)                                                 \
2173                 goto not_equal;                                         \
2174               ++texta;                                                  \
2175               ++textb;                                                  \
2176             }                                                           \
2177                                                                         \
2178           diff = (texta < lima) - (textb < limb);                       \
2179     }                                                                   \
2180   while (0)
2181
2182           if (translate)
2183             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2184                              translate[to_uchar (*textb)]);
2185           else
2186             CMP_WITH_IGNORE (*texta, *textb);
2187         }
2188       else if (lena == 0)
2189         diff = - NONZERO (lenb);
2190       else if (lenb == 0)
2191         goto greater;
2192       else
2193         {
2194           if (translate)
2195             {
2196               while (texta < lima && textb < limb)
2197                 {
2198                   diff = (to_uchar (translate[to_uchar (*texta++)])
2199                           - to_uchar (translate[to_uchar (*textb++)]));
2200                   if (diff)
2201                     goto not_equal;
2202                 }
2203             }
2204           else
2205             {
2206               diff = memcmp (texta, textb, MIN (lena, lenb));
2207               if (diff)
2208                 goto not_equal;
2209             }
2210           diff = lena < lenb ? -1 : lena != lenb;
2211         }
2212
2213       if (diff)
2214         goto not_equal;
2215
2216       key = key->next;
2217       if (! key)
2218         break;
2219
2220       /* Find the beginning and limit of the next field.  */
2221       if (key->eword != SIZE_MAX)
2222         lima = limfield (a, key), limb = limfield (b, key);
2223       else
2224         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2225
2226       if (key->sword != SIZE_MAX)
2227         texta = begfield (a, key), textb = begfield (b, key);
2228       else
2229         {
2230           texta = a->text, textb = b->text;
2231           if (key->skipsblanks)
2232             {
2233               while (texta < lima && blanks[to_uchar (*texta)])
2234                 ++texta;
2235               while (textb < limb && blanks[to_uchar (*textb)])
2236                 ++textb;
2237             }
2238         }
2239     }
2240
2241   return 0;
2242
2243  greater:
2244   diff = 1;
2245  not_equal:
2246   return key->reverse ? -diff : diff;
2247 }
2248
2249 /* Compare two lines A and B, returning negative, zero, or positive
2250    depending on whether A compares less than, equal to, or greater than B. */
2251
2252 static int
2253 compare (const struct line *a, const struct line *b)
2254 {
2255   int diff;
2256   size_t alen, blen;
2257
2258   /* First try to compare on the specified keys (if any).
2259      The only two cases with no key at all are unadorned sort,
2260      and unadorned sort -r. */
2261   if (keylist)
2262     {
2263       diff = keycompare (a, b);
2264       if (diff || unique || stable)
2265         return diff;
2266     }
2267
2268   /* If the keys all compare equal (or no keys were specified)
2269      fall through to the default comparison.  */
2270   alen = a->length - 1, blen = b->length - 1;
2271
2272   if (alen == 0)
2273     diff = - NONZERO (blen);
2274   else if (blen == 0)
2275     diff = 1;
2276   else if (hard_LC_COLLATE)
2277     diff = xmemcoll (a->text, alen, b->text, blen);
2278   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2279     diff = alen < blen ? -1 : alen != blen;
2280
2281   return reverse ? -diff : diff;
2282 }
2283
2284 /* Check that the lines read from FILE_NAME come in order.  Return
2285    true if they are in order.  If CHECKONLY == 'c', also print a
2286    diagnostic (FILE_NAME, line number, contents of line) to stderr if
2287    they are not in order.  */
2288
2289 static bool
2290 check (char const *file_name, char checkonly)
2291 {
2292   FILE *fp = xfopen (file_name, "r");
2293   struct buffer buf;            /* Input buffer. */
2294   struct line temp;             /* Copy of previous line. */
2295   size_t alloc = 0;
2296   uintmax_t line_number = 0;
2297   struct keyfield const *key = keylist;
2298   bool nonunique = ! unique;
2299   bool ordered = true;
2300
2301   initbuf (&buf, sizeof (struct line),
2302            MAX (merge_buffer_size, sort_size));
2303   temp.text = NULL;
2304
2305   while (fillbuf (&buf, fp, file_name))
2306     {
2307       struct line const *line = buffer_linelim (&buf);
2308       struct line const *linebase = line - buf.nlines;
2309
2310       /* Make sure the line saved from the old buffer contents is
2311          less than or equal to the first line of the new buffer. */
2312       if (alloc && nonunique <= compare (&temp, line - 1))
2313         {
2314         found_disorder:
2315           {
2316             if (checkonly == 'c')
2317               {
2318                 struct line const *disorder_line = line - 1;
2319                 uintmax_t disorder_line_number =
2320                   buffer_linelim (&buf) - disorder_line + line_number;
2321                 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
2322                 fprintf (stderr, _("%s: %s:%s: disorder: "),
2323                          program_name, file_name,
2324                          umaxtostr (disorder_line_number, hr_buf));
2325                 write_bytes (disorder_line->text, disorder_line->length,
2326                              stderr, _("standard error"));
2327               }
2328
2329             ordered = false;
2330             break;
2331           }
2332         }
2333
2334       /* Compare each line in the buffer with its successor.  */
2335       while (linebase < --line)
2336         if (nonunique <= compare (line, line - 1))
2337           goto found_disorder;
2338
2339       line_number += buf.nlines;
2340
2341       /* Save the last line of the buffer.  */
2342       if (alloc < line->length)
2343         {
2344           do
2345             {
2346               alloc *= 2;
2347               if (! alloc)
2348                 {
2349                   alloc = line->length;
2350                   break;
2351                 }
2352             }
2353           while (alloc < line->length);
2354
2355           temp.text = xrealloc (temp.text, alloc);
2356         }
2357       memcpy (temp.text, line->text, line->length);
2358       temp.length = line->length;
2359       if (key)
2360         {
2361           temp.keybeg = temp.text + (line->keybeg - line->text);
2362           temp.keylim = temp.text + (line->keylim - line->text);
2363         }
2364     }
2365
2366   xfclose (fp, file_name);
2367   free (buf.buf);
2368   free (temp.text);
2369   return ordered;
2370 }
2371
2372 /* Open FILES (there are NFILES of them) and store the resulting array
2373    of stream pointers into (*PFPS).  Allocate the array.  Return the
2374    number of successfully opened files, setting errno if this value is
2375    less than NFILES.  */
2376
2377 static size_t
2378 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2379 {
2380   FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2381   int i;
2382
2383   /* Open as many input files as we can.  */
2384   for (i = 0; i < nfiles; i++)
2385     {
2386       fps[i] = (files[i].pid
2387                 ? open_temp (files[i].name, files[i].pid)
2388                 : stream_open (files[i].name, "r"));
2389       if (!fps[i])
2390         break;
2391     }
2392
2393   return i;
2394 }
2395
2396 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2397    files (all of which are at the start of the FILES array), and
2398    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2399    FPS is the vector of open stream corresponding to the files.
2400    Close input and output streams before returning.
2401    OUTPUT_FILE gives the name of the output file.  If it is NULL,
2402    the output file is standard output.  */
2403
2404 static void
2405 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2406           FILE *ofp, char const *output_file, FILE **fps)
2407 {
2408   struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2409                                 /* Input buffers for each file. */
2410   struct line saved;            /* Saved line storage for unique check. */
2411   struct line const *savedline = NULL;
2412                                 /* &saved if there is a saved line. */
2413   size_t savealloc = 0;         /* Size allocated for the saved line. */
2414   struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2415                                 /* Current line in each line table. */
2416   struct line const **base = xnmalloc (nfiles, sizeof *base);
2417                                 /* Base of each line table.  */
2418   size_t *ord = xnmalloc (nfiles, sizeof *ord);
2419                                 /* Table representing a permutation of fps,
2420                                    such that cur[ord[0]] is the smallest line
2421                                    and will be next output. */
2422   size_t i;
2423   size_t j;
2424   size_t t;
2425   struct keyfield const *key = keylist;
2426   saved.text = NULL;
2427
2428   /* Read initial lines from each input file. */
2429   for (i = 0; i < nfiles; )
2430     {
2431       initbuf (&buffer[i], sizeof (struct line),
2432                MAX (merge_buffer_size, sort_size / nfiles));
2433       if (fillbuf (&buffer[i], fps[i], files[i].name))
2434         {
2435           struct line const *linelim = buffer_linelim (&buffer[i]);
2436           cur[i] = linelim - 1;
2437           base[i] = linelim - buffer[i].nlines;
2438           i++;
2439         }
2440       else
2441         {
2442           /* fps[i] is empty; eliminate it from future consideration.  */
2443           xfclose (fps[i], files[i].name);
2444           if (i < ntemps)
2445             {
2446               ntemps--;
2447               zaptemp (files[i].name);
2448             }
2449           free (buffer[i].buf);
2450           --nfiles;
2451           for (j = i; j < nfiles; ++j)
2452             {
2453               files[j] = files[j + 1];
2454               fps[j] = fps[j + 1];
2455             }
2456         }
2457     }
2458
2459   /* Set up the ord table according to comparisons among input lines.
2460      Since this only reorders two items if one is strictly greater than
2461      the other, it is stable. */
2462   for (i = 0; i < nfiles; ++i)
2463     ord[i] = i;
2464   for (i = 1; i < nfiles; ++i)
2465     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2466       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2467
2468   /* Repeatedly output the smallest line until no input remains. */
2469   while (nfiles)
2470     {
2471       struct line const *smallest = cur[ord[0]];
2472
2473       /* If uniquified output is turned on, output only the first of
2474          an identical series of lines. */
2475       if (unique)
2476         {
2477           if (savedline && compare (savedline, smallest))
2478             {
2479               savedline = NULL;
2480               write_bytes (saved.text, saved.length, ofp, output_file);
2481             }
2482           if (!savedline)
2483             {
2484               savedline = &saved;
2485               if (savealloc < smallest->length)
2486                 {
2487                   do
2488                     if (! savealloc)
2489                       {
2490                         savealloc = smallest->length;
2491                         break;
2492                       }
2493                   while ((savealloc *= 2) < smallest->length);
2494
2495                   saved.text = xrealloc (saved.text, savealloc);
2496                 }
2497               saved.length = smallest->length;
2498               memcpy (saved.text, smallest->text, saved.length);
2499               if (key)
2500                 {
2501                   saved.keybeg =
2502                     saved.text + (smallest->keybeg - smallest->text);
2503                   saved.keylim =
2504                     saved.text + (smallest->keylim - smallest->text);
2505                 }
2506             }
2507         }
2508       else
2509         write_bytes (smallest->text, smallest->length, ofp, output_file);
2510
2511       /* Check if we need to read more lines into core. */
2512       if (base[ord[0]] < smallest)
2513         cur[ord[0]] = smallest - 1;
2514       else
2515         {
2516           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2517             {
2518               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2519               cur[ord[0]] = linelim - 1;
2520               base[ord[0]] = linelim - buffer[ord[0]].nlines;
2521             }
2522           else
2523             {
2524               /* We reached EOF on fps[ord[0]].  */
2525               for (i = 1; i < nfiles; ++i)
2526                 if (ord[i] > ord[0])
2527                   --ord[i];
2528               --nfiles;
2529               xfclose (fps[ord[0]], files[ord[0]].name);
2530               if (ord[0] < ntemps)
2531                 {
2532                   ntemps--;
2533                   zaptemp (files[ord[0]].name);
2534                 }
2535               free (buffer[ord[0]].buf);
2536               for (i = ord[0]; i < nfiles; ++i)
2537                 {
2538                   fps[i] = fps[i + 1];
2539                   files[i] = files[i + 1];
2540                   buffer[i] = buffer[i + 1];
2541                   cur[i] = cur[i + 1];
2542                   base[i] = base[i + 1];
2543                 }
2544               for (i = 0; i < nfiles; ++i)
2545                 ord[i] = ord[i + 1];
2546               continue;
2547             }
2548         }
2549
2550       /* The new line just read in may be larger than other lines
2551          already in main memory; push it back in the queue until we
2552          encounter a line larger than it.  Optimize for the common
2553          case where the new line is smallest.  */
2554       {
2555         size_t lo = 1;
2556         size_t hi = nfiles;
2557         size_t probe = lo;
2558         size_t ord0 = ord[0];
2559         size_t count_of_smaller_lines;
2560
2561         while (lo < hi)
2562           {
2563             int cmp = compare (cur[ord0], cur[ord[probe]]);
2564             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2565               hi = probe;
2566             else
2567               lo = probe + 1;
2568             probe = (lo + hi) / 2;
2569           }
2570
2571         count_of_smaller_lines = lo - 1;
2572         for (j = 0; j < count_of_smaller_lines; j++)
2573           ord[j] = ord[j + 1];
2574         ord[count_of_smaller_lines] = ord0;
2575       }
2576
2577       /* Free up some resources every once in a while.  */
2578       if (MAX_PROCS_BEFORE_REAP < nprocs)
2579         reap_some ();
2580     }
2581
2582   if (unique && savedline)
2583     {
2584       write_bytes (saved.text, saved.length, ofp, output_file);
2585       free (saved.text);
2586     }
2587
2588   xfclose (ofp, output_file);
2589   free(fps);
2590   free(buffer);
2591   free(ord);
2592   free(base);
2593   free(cur);
2594 }
2595
2596 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2597    files (all of which are at the start of the FILES array), and
2598    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2599    Close input and output files before returning.
2600    OUTPUT_FILE gives the name of the output file.
2601
2602    Return the number of files successfully merged.  This number can be
2603    less than NFILES if we ran low on file descriptors, but in this
2604    case it is never less than 2.  */
2605
2606 static size_t
2607 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
2608             FILE *ofp, char const *output_file)
2609 {
2610   FILE **fps;
2611   size_t nopened = open_input_files (files, nfiles, &fps);
2612   if (nopened < nfiles && nopened < 2)
2613     die (_("open failed"), files[nopened].name);
2614   mergefps (files, ntemps, nopened, ofp, output_file, fps);
2615   return nopened;
2616 }
2617
2618 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2619    and HI (with NHI members).  T, LO, and HI point just past their
2620    respective arrays, and the arrays are in reverse order.  NLO and
2621    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
2622
2623 static inline void
2624 mergelines (struct line *t,
2625             struct line const *lo, size_t nlo,
2626             struct line const *hi, size_t nhi)
2627 {
2628   for (;;)
2629     if (compare (lo - 1, hi - 1) <= 0)
2630       {
2631         *--t = *--lo;
2632         if (! --nlo)
2633           {
2634             /* HI - NHI equalled T - (NLO + NHI) when this function
2635                began.  Therefore HI must equal T now, and there is no
2636                need to copy from HI to T.  */
2637             return;
2638           }
2639       }
2640     else
2641       {
2642         *--t = *--hi;
2643         if (! --nhi)
2644           {
2645             do
2646               *--t = *--lo;
2647             while (--nlo);
2648
2649             return;
2650           }
2651       }
2652 }
2653
2654 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2655    NLINES must be at least 2.
2656    The input and output arrays are in reverse order, and LINES and
2657    TEMP point just past the end of their respective arrays.
2658
2659    Use a recursive divide-and-conquer algorithm, in the style
2660    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
2661    the optimization suggested by exercise 5.2.4-10; this requires room
2662    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
2663    writes that this memory optimization was originally published by
2664    D. A. Bell, Comp J. 1 (1958), 75.  */
2665
2666 static void
2667 sortlines (struct line *lines, size_t nlines, struct line *temp)
2668 {
2669   if (nlines == 2)
2670     {
2671       if (0 < compare (&lines[-1], &lines[-2]))
2672         {
2673           struct line tmp = lines[-1];
2674           lines[-1] = lines[-2];
2675           lines[-2] = tmp;
2676         }
2677     }
2678   else
2679     {
2680       size_t nlo = nlines / 2;
2681       size_t nhi = nlines - nlo;
2682       struct line *lo = lines;
2683       struct line *hi = lines - nlo;
2684       struct line *sorted_lo = temp;
2685
2686       sortlines (hi, nhi, temp);
2687       if (1 < nlo)
2688         sortlines_temp (lo, nlo, sorted_lo);
2689       else
2690         sorted_lo[-1] = lo[-1];
2691
2692       mergelines (lines, sorted_lo, nlo, hi, nhi);
2693     }
2694 }
2695
2696 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2697    rather than sorting in place.  */
2698
2699 static void
2700 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
2701 {
2702   if (nlines == 2)
2703     {
2704       /* Declare `swap' as int, not bool, to work around a bug
2705          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2706          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
2707       int swap = (0 < compare (&lines[-1], &lines[-2]));
2708       temp[-1] = lines[-1 - swap];
2709       temp[-2] = lines[-2 + swap];
2710     }
2711   else
2712     {
2713       size_t nlo = nlines / 2;
2714       size_t nhi = nlines - nlo;
2715       struct line *lo = lines;
2716       struct line *hi = lines - nlo;
2717       struct line *sorted_hi = temp - nlo;
2718
2719       sortlines_temp (hi, nhi, sorted_hi);
2720       if (1 < nlo)
2721         sortlines (lo, nlo, temp);
2722
2723       mergelines (temp, lo, nlo, sorted_hi, nhi);
2724     }
2725 }
2726
2727 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2728    the same as OUTFILE.  If found, merge the found instances (and perhaps
2729    some other files) into a temporary file so that it can in turn be
2730    merged into OUTFILE without destroying OUTFILE before it is completely
2731    read.  Return the new value of NFILES, which differs from the old if
2732    some merging occurred.
2733
2734    This test ensures that an otherwise-erroneous use like
2735    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2736    It's not clear that POSIX requires this nicety.
2737    Detect common error cases, but don't try to catch obscure cases like
2738    "cat ... FILE ... | sort -m -o FILE"
2739    where traditional "sort" doesn't copy the input and where
2740    people should know that they're getting into trouble anyway.
2741    Catching these obscure cases would slow down performance in
2742    common cases.  */
2743
2744 static size_t
2745 avoid_trashing_input (struct sortfile *files, size_t ntemps,
2746                       size_t nfiles, char const *outfile)
2747 {
2748   size_t i;
2749   bool got_outstat = false;
2750   struct stat outstat;
2751
2752   for (i = ntemps; i < nfiles; i++)
2753     {
2754       bool is_stdin = STREQ (files[i].name, "-");
2755       bool same;
2756       struct stat instat;
2757
2758       if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
2759         same = true;
2760       else
2761         {
2762           if (! got_outstat)
2763             {
2764               if ((outfile
2765                    ? stat (outfile, &outstat)
2766                    : fstat (STDOUT_FILENO, &outstat))
2767                   != 0)
2768                 break;
2769               got_outstat = true;
2770             }
2771
2772           same = (((is_stdin
2773                     ? fstat (STDIN_FILENO, &instat)
2774                     : stat (files[i].name, &instat))
2775                    == 0)
2776                   && SAME_INODE (instat, outstat));
2777         }
2778
2779       if (same)
2780         {
2781           FILE *tftp;
2782           pid_t pid;
2783           char *temp = create_temp (&tftp, &pid);
2784           size_t num_merged = 0;
2785           do
2786             {
2787               num_merged += mergefiles (&files[i], 0, nfiles - i, tftp, temp);
2788               files[i].name = temp;
2789               files[i].pid = pid;
2790
2791               if (i + num_merged < nfiles)
2792                 memmove(&files[i + 1], &files[i + num_merged],
2793                         num_merged * sizeof *files);
2794               ntemps += 1;
2795               nfiles -= num_merged - 1;;
2796               i += num_merged;
2797             }
2798           while (i < nfiles);
2799         }
2800     }
2801
2802   return nfiles;
2803 }
2804
2805 /* Merge the input FILES.  NTEMPS is the number of files at the
2806    start of FILES that are temporary; it is zero at the top level.
2807    NFILES is the total number of files.  Put the output in
2808    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
2809
2810 static void
2811 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
2812        char const *output_file)
2813 {
2814   while (nmerge < nfiles)
2815     {
2816       /* Number of input files processed so far.  */
2817       size_t in;
2818
2819       /* Number of output files generated so far.  */
2820       size_t out;
2821
2822       /* nfiles % NMERGE; this counts input files that are left over
2823          after all full-sized merges have been done.  */
2824       size_t remainder;
2825
2826       /* Number of easily-available slots at the next loop iteration.  */
2827       size_t cheap_slots;
2828
2829       /* Do as many NMERGE-size merges as possible. In the case that
2830          nmerge is bogus, increment by the maximum number of file
2831          descriptors allowed.  */
2832       for (out = in = 0; nmerge <= nfiles - in; out++)
2833         {
2834           FILE *tfp;
2835           pid_t pid;
2836           char *temp = create_temp (&tfp, &pid);
2837           size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
2838                                           nmerge, tfp, temp);
2839           ntemps -= MIN (ntemps, num_merged);
2840           files[out].name = temp;
2841           files[out].pid = pid;
2842           in += num_merged;
2843         }
2844
2845       remainder = nfiles - in;
2846       cheap_slots = nmerge - out % nmerge;
2847
2848       if (cheap_slots < remainder)
2849         {
2850           /* So many files remain that they can't all be put into the last
2851              NMERGE-sized output window.  Do one more merge.  Merge as few
2852              files as possible, to avoid needless I/O.  */
2853           size_t nshortmerge = remainder - cheap_slots + 1;
2854           FILE *tfp;
2855           pid_t pid;
2856           char *temp = create_temp (&tfp, &pid);
2857           size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
2858                                           nshortmerge, tfp, temp);
2859           ntemps -= MIN (ntemps, num_merged);
2860           files[out].name = temp;
2861           files[out++].pid = pid;
2862           in += num_merged;
2863         }
2864
2865       /* Put the remaining input files into the last NMERGE-sized output
2866          window, so they will be merged in the next pass.  */
2867       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2868       ntemps += out;
2869       nfiles -= in - out;
2870     }
2871
2872   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2873
2874   /* We aren't guaranteed that this final mergefiles will work, therefore we
2875      try to merge into the output, and then merge as much as we can into a
2876      temp file if we can't. Repeat.  */
2877
2878   for (;;)
2879     {
2880       /* Merge directly into the output file if possible.  */
2881       FILE **fps;
2882       size_t nopened = open_input_files (files, nfiles, &fps);
2883
2884       if (nopened == nfiles)
2885         {
2886           FILE *ofp = stream_open (output_file, "w");
2887           if (ofp)
2888             {
2889               mergefps (files, ntemps, nfiles, ofp, output_file, fps);
2890               break;
2891             }
2892           if (errno != EMFILE || nopened <= 2)
2893             die (_("open failed"), output_file);
2894         }
2895       else if (nopened <= 2)
2896         die (_("open failed"), files[nopened].name);
2897
2898       /* We ran out of file descriptors.  Close one of the input
2899          files, to gain a file descriptor.  Then create a temporary
2900          file with our spare file descriptor.  Retry if that failed
2901          (e.g., some other process could open a file between the time
2902          we closed and tried to create).  */
2903       FILE *tfp;
2904       pid_t pid;
2905       char *temp;
2906       do
2907         {
2908           nopened--;
2909           xfclose (fps[nopened], files[nopened].name);
2910           temp = maybe_create_temp (&tfp, &pid, ! (nopened <= 2));
2911         }
2912       while (!temp);
2913
2914       /* Merge into the newly allocated temporary.  */
2915       mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp, fps);
2916       ntemps -= MIN (ntemps, nopened);
2917       files[0].name = temp;
2918       files[0].pid = pid;
2919
2920       memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
2921       ntemps++;
2922       nfiles -= nopened - 1;
2923     }
2924 }
2925
2926 /* Sort NFILES FILES onto OUTPUT_FILE. */
2927
2928 static void
2929 sort (char * const *files, size_t nfiles, char const *output_file)
2930 {
2931   struct buffer buf;
2932   size_t ntemps = 0;
2933   bool output_file_created = false;
2934
2935   buf.alloc = 0;
2936
2937   while (nfiles)
2938     {
2939       char const *temp_output;
2940       char const *file = *files;
2941       FILE *fp = xfopen (file, "r");
2942       FILE *tfp;
2943       size_t bytes_per_line = (2 * sizeof (struct line)
2944                                - sizeof (struct line) / 2);
2945
2946       if (! buf.alloc)
2947         initbuf (&buf, bytes_per_line,
2948                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2949       buf.eof = false;
2950       files++;
2951       nfiles--;
2952
2953       while (fillbuf (&buf, fp, file))
2954         {
2955           struct line *line;
2956           struct line *linebase;
2957
2958           if (buf.eof && nfiles
2959               && (bytes_per_line + 1
2960                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2961             {
2962               /* End of file, but there is more input and buffer room.
2963                  Concatenate the next input file; this is faster in
2964                  the usual case.  */
2965               buf.left = buf.used;
2966               break;
2967             }
2968
2969           line = buffer_linelim (&buf);
2970           linebase = line - buf.nlines;
2971           if (1 < buf.nlines)
2972             sortlines (line, buf.nlines, linebase);
2973           if (buf.eof && !nfiles && !ntemps && !buf.left)
2974             {
2975               xfclose (fp, file);
2976               tfp = xfopen (output_file, "w");
2977               temp_output = output_file;
2978               output_file_created = true;
2979             }
2980           else
2981             {
2982               ++ntemps;
2983               temp_output = create_temp (&tfp, NULL);
2984             }
2985
2986           do
2987             {
2988               line--;
2989               write_bytes (line->text, line->length, tfp, temp_output);
2990               if (unique)
2991                 while (linebase < line && compare (line, line - 1) == 0)
2992                   line--;
2993             }
2994           while (linebase < line);
2995
2996           xfclose (tfp, temp_output);
2997
2998           /* Free up some resources every once in a while.  */
2999           if (MAX_PROCS_BEFORE_REAP < nprocs)
3000             reap_some ();
3001
3002           if (output_file_created)
3003             goto finish;
3004         }
3005       xfclose (fp, file);
3006     }
3007
3008  finish:
3009   free (buf.buf);
3010
3011   if (! output_file_created)
3012     {
3013       size_t i;
3014       struct tempnode *node = temphead;
3015       struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3016       for (i = 0; node; i++)
3017         {
3018           tempfiles[i].name = node->name;
3019           tempfiles[i].pid = node->pid;
3020           node = node->next;
3021         }
3022       merge (tempfiles, ntemps, ntemps, output_file);
3023       free (tempfiles);
3024     }
3025 }
3026
3027 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
3028
3029 static void
3030 insertkey (struct keyfield *key_arg)
3031 {
3032   struct keyfield **p;
3033   struct keyfield *key = xmemdup (key_arg, sizeof *key);
3034
3035   for (p = &keylist; *p; p = &(*p)->next)
3036     continue;
3037   *p = key;
3038   key->next = NULL;
3039 }
3040
3041 /* Report a bad field specification SPEC, with extra info MSGID.  */
3042
3043 static void badfieldspec (char const *, char const *)
3044      ATTRIBUTE_NORETURN;
3045 static void
3046 badfieldspec (char const *spec, char const *msgid)
3047 {
3048   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3049          _(msgid), quote (spec));
3050   abort ();
3051 }
3052
3053 /* Report incompatible options.  */
3054
3055 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3056 static void
3057 incompatible_options (char const *opts)
3058 {
3059   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
3060   abort ();
3061 }
3062
3063 /* Check compatibility of ordering options.  */
3064
3065 static void
3066 check_ordering_compatibility (void)
3067 {
3068   struct keyfield const *key;
3069
3070   for (key = keylist; key; key = key->next)
3071     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
3072               + key->version + !!key->ignore + key->human_numeric))
3073         || (key->random && key->translate))
3074       {
3075         /* The following is too big, but guaranteed to be "big enough". */
3076         char opts[sizeof short_options];
3077         char *p = opts;
3078         if (key->ignore == nondictionary)
3079           *p++ = 'd';
3080         if (key->translate)
3081           *p++ = 'f';
3082         if (key->general_numeric)
3083           *p++ = 'g';
3084         if (key->human_numeric)
3085           *p++ = 'h';
3086         if (key->ignore == nonprinting)
3087           *p++ = 'i';
3088         if (key->month)
3089           *p++ = 'M';
3090         if (key->numeric)
3091           *p++ = 'n';
3092         if (key->version)
3093           *p++ = 'V';
3094         if (key->random)
3095           *p++ = 'R';
3096         *p = '\0';
3097         incompatible_options (opts);
3098       }
3099 }
3100
3101 /* Parse the leading integer in STRING and store the resulting value
3102    (which must fit into size_t) into *VAL.  Return the address of the
3103    suffix after the integer.  If the value is too large, silently
3104    substitute SIZE_MAX.  If MSGID is NULL, return NULL after
3105    failure; otherwise, report MSGID and exit on failure.  */
3106
3107 static char const *
3108 parse_field_count (char const *string, size_t *val, char const *msgid)
3109 {
3110   char *suffix;
3111   uintmax_t n;
3112
3113   switch (xstrtoumax (string, &suffix, 10, &n, ""))
3114     {
3115     case LONGINT_OK:
3116     case LONGINT_INVALID_SUFFIX_CHAR:
3117       *val = n;
3118       if (*val == n)
3119         break;
3120       /* Fall through.  */
3121     case LONGINT_OVERFLOW:
3122     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3123       *val = SIZE_MAX;
3124       break;
3125
3126     case LONGINT_INVALID:
3127       if (msgid)
3128         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3129                _(msgid), quote (string));
3130       return NULL;
3131     }
3132
3133   return suffix;
3134 }
3135
3136 /* Handle interrupts and hangups. */
3137
3138 static void
3139 sighandler (int sig)
3140 {
3141   if (! SA_NOCLDSTOP)
3142     signal (sig, SIG_IGN);
3143
3144   cleanup ();
3145
3146   signal (sig, SIG_DFL);
3147   raise (sig);
3148 }
3149
3150 /* Set the ordering options for KEY specified in S.
3151    Return the address of the first character in S that
3152    is not a valid ordering option.
3153    BLANKTYPE is the kind of blanks that 'b' should skip. */
3154
3155 static char *
3156 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
3157 {
3158   while (*s)
3159     {
3160       switch (*s)
3161         {
3162         case 'b':
3163           if (blanktype == bl_start || blanktype == bl_both)
3164             key->skipsblanks = true;
3165           if (blanktype == bl_end || blanktype == bl_both)
3166             key->skipeblanks = true;
3167           break;
3168         case 'd':
3169           key->ignore = nondictionary;
3170           break;
3171         case 'f':
3172           key->translate = fold_toupper;
3173           break;
3174         case 'g':
3175           key->general_numeric = true;
3176           break;
3177         case 'h':
3178           key->human_numeric = true;
3179           break;
3180         case 'i':
3181           /* Option order should not matter, so don't let -i override
3182              -d.  -d implies -i, but -i does not imply -d.  */
3183           if (! key->ignore)
3184             key->ignore = nonprinting;
3185           break;
3186         case 'M':
3187           key->month = true;
3188           break;
3189         case 'n':
3190           key->numeric = true;
3191           break;
3192         case 'R':
3193           key->random = true;
3194           break;
3195         case 'r':
3196           key->reverse = true;
3197           break;
3198         case 'V':
3199           key->version = true;
3200           break;
3201         default:
3202           return (char *) s;
3203         }
3204       ++s;
3205     }
3206   return (char *) s;
3207 }
3208
3209 static struct keyfield *
3210 key_init (struct keyfield *key)
3211 {
3212   memset (key, 0, sizeof *key);
3213   key->eword = SIZE_MAX;
3214   key->iec_present = -1;
3215   return key;
3216 }
3217
3218 int
3219 main (int argc, char **argv)
3220 {
3221   struct keyfield *key;
3222   struct keyfield key_buf;
3223   struct keyfield gkey;
3224   char const *s;
3225   int c = 0;
3226   char checkonly = 0;
3227   bool mergeonly = false;
3228   char *random_source = NULL;
3229   bool need_random = false;
3230   size_t nfiles = 0;
3231   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
3232   bool obsolete_usage = (posix2_version () < 200112);
3233   char **files;
3234   char *files_from = NULL;
3235   struct Tokens tok;
3236   char const *outfile = NULL;
3237
3238   initialize_main (&argc, &argv);
3239   set_program_name (argv[0]);
3240   setlocale (LC_ALL, "");
3241   bindtextdomain (PACKAGE, LOCALEDIR);
3242   textdomain (PACKAGE);
3243
3244   initialize_exit_failure (SORT_FAILURE);
3245
3246   hard_LC_COLLATE = hard_locale (LC_COLLATE);
3247 #if HAVE_NL_LANGINFO
3248   hard_LC_TIME = hard_locale (LC_TIME);
3249 #endif
3250
3251   /* Get locale's representation of the decimal point.  */
3252   {
3253     struct lconv const *locale = localeconv ();
3254
3255     /* If the locale doesn't define a decimal point, or if the decimal
3256        point is multibyte, use the C locale's decimal point.  FIXME:
3257        add support for multibyte decimal points.  */
3258     decimal_point = to_uchar (locale->decimal_point[0]);
3259     if (! decimal_point || locale->decimal_point[1])
3260       decimal_point = '.';
3261
3262     /* FIXME: add support for multibyte thousands separators.  */
3263     thousands_sep = to_uchar (*locale->thousands_sep);
3264     if (! thousands_sep || locale->thousands_sep[1])
3265       thousands_sep = -1;
3266   }
3267
3268   have_read_stdin = false;
3269   inittables ();
3270
3271   {
3272     size_t i;
3273     static int const sig[] =
3274       {
3275         /* The usual suspects.  */
3276         SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
3277 #ifdef SIGPOLL
3278         SIGPOLL,
3279 #endif
3280 #ifdef SIGPROF
3281         SIGPROF,
3282 #endif
3283 #ifdef SIGVTALRM
3284         SIGVTALRM,
3285 #endif
3286 #ifdef SIGXCPU
3287         SIGXCPU,
3288 #endif
3289 #ifdef SIGXFSZ
3290         SIGXFSZ,
3291 #endif
3292       };
3293     enum { nsigs = ARRAY_CARDINALITY (sig) };
3294
3295 #if SA_NOCLDSTOP
3296     struct sigaction act;
3297
3298     sigemptyset (&caught_signals);
3299     for (i = 0; i < nsigs; i++)
3300       {
3301         sigaction (sig[i], NULL, &act);
3302         if (act.sa_handler != SIG_IGN)
3303           sigaddset (&caught_signals, sig[i]);
3304       }
3305
3306     act.sa_handler = sighandler;
3307     act.sa_mask = caught_signals;
3308     act.sa_flags = 0;
3309
3310     for (i = 0; i < nsigs; i++)
3311       if (sigismember (&caught_signals, sig[i]))
3312         sigaction (sig[i], &act, NULL);
3313 #else
3314     for (i = 0; i < nsigs; i++)
3315       if (signal (sig[i], SIG_IGN) != SIG_IGN)
3316         {
3317           signal (sig[i], sighandler);
3318           siginterrupt (sig[i], 1);
3319         }
3320 #endif
3321   }
3322   signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent.  */
3323
3324   /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
3325   atexit (exit_cleanup);
3326
3327   gkey.sword = gkey.eword = SIZE_MAX;
3328   gkey.ignore = NULL;
3329   gkey.translate = NULL;
3330   gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
3331   gkey.iec_present = -1;
3332   gkey.random = gkey.version = false;
3333   gkey.month = gkey.reverse = false;
3334   gkey.skipsblanks = gkey.skipeblanks = false;
3335
3336   files = xnmalloc (argc, sizeof *files);
3337
3338   for (;;)
3339     {
3340       /* Parse an operand as a file after "--" was seen; or if
3341          pedantic and a file was seen, unless the POSIX version
3342          predates 1003.1-2001 and -c was not seen and the operand is
3343          "-o FILE" or "-oFILE".  */
3344       int oi = -1;
3345
3346       if (c == -1
3347           || (posixly_correct && nfiles != 0
3348               && ! (obsolete_usage
3349                     && ! checkonly
3350                     && optind != argc
3351                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
3352                     && (argv[optind][2] || optind + 1 != argc)))
3353           || ((c = getopt_long (argc, argv, short_options,
3354                                 long_options, &oi))
3355               == -1))
3356         {
3357           if (argc <= optind)
3358             break;
3359           files[nfiles++] = argv[optind++];
3360         }
3361       else switch (c)
3362         {
3363         case 1:
3364           key = NULL;
3365           if (optarg[0] == '+')
3366             {
3367               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
3368                                       && ISDIGIT (argv[optind][1]));
3369               obsolete_usage |= minus_pos_usage && !posixly_correct;
3370               if (obsolete_usage)
3371                 {
3372                   /* Treat +POS1 [-POS2] as a key if possible; but silently
3373                      treat an operand as a file if it is not a valid +POS1.  */
3374                   key = key_init (&key_buf);
3375                   s = parse_field_count (optarg + 1, &key->sword, NULL);
3376                   if (s && *s == '.')
3377                     s = parse_field_count (s + 1, &key->schar, NULL);
3378                   if (! (key->sword || key->schar))
3379                     key->sword = SIZE_MAX;
3380                   if (! s || *set_ordering (s, key, bl_start))
3381                     key = NULL;
3382                   else
3383                     {
3384                       if (minus_pos_usage)
3385                         {
3386                           char const *optarg1 = argv[optind++];
3387                           s = parse_field_count (optarg1 + 1, &key->eword,
3388                                              N_("invalid number after `-'"));
3389                           if (*s == '.')
3390                             s = parse_field_count (s + 1, &key->echar,
3391                                                N_("invalid number after `.'"));
3392                           if (*set_ordering (s, key, bl_end))
3393                             badfieldspec (optarg1,
3394                                       N_("stray character in field spec"));
3395                         }
3396                       insertkey (key);
3397                     }
3398                 }
3399             }
3400           if (! key)
3401             files[nfiles++] = optarg;
3402           break;
3403
3404         case SORT_OPTION:
3405           c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
3406           /* Fall through. */
3407         case 'b':
3408         case 'd':
3409         case 'f':
3410         case 'g':
3411         case 'h':
3412         case 'i':
3413         case 'M':
3414         case 'n':
3415         case 'r':
3416         case 'R':
3417         case 'V':
3418           {
3419             char str[2];
3420             str[0] = c;
3421             str[1] = '\0';
3422             set_ordering (str, &gkey, bl_both);
3423           }
3424           break;
3425
3426         case CHECK_OPTION:
3427           c = (optarg
3428                ? XARGMATCH ("--check", optarg, check_args, check_types)
3429                : 'c');
3430           /* Fall through.  */
3431         case 'c':
3432         case 'C':
3433           if (checkonly && checkonly != c)
3434             incompatible_options ("cC");
3435           checkonly = c;
3436           break;
3437
3438         case COMPRESS_PROGRAM_OPTION:
3439           if (compress_program && !STREQ (compress_program, optarg))
3440             error (SORT_FAILURE, 0, _("multiple compress programs specified"));
3441           compress_program = optarg;
3442           break;
3443
3444         case FILES0_FROM_OPTION:
3445           files_from = optarg;
3446           break;
3447
3448         case 'k':
3449           key = key_init (&key_buf);
3450
3451           /* Get POS1. */
3452           s = parse_field_count (optarg, &key->sword,
3453                                  N_("invalid number at field start"));
3454           if (! key->sword--)
3455             {
3456               /* Provoke with `sort -k0' */
3457               badfieldspec (optarg, N_("field number is zero"));
3458             }
3459           if (*s == '.')
3460             {
3461               s = parse_field_count (s + 1, &key->schar,
3462                                      N_("invalid number after `.'"));
3463               if (! key->schar--)
3464                 {
3465                   /* Provoke with `sort -k1.0' */
3466                   badfieldspec (optarg, N_("character offset is zero"));
3467                 }
3468             }
3469           if (! (key->sword || key->schar))
3470             key->sword = SIZE_MAX;
3471           s = set_ordering (s, key, bl_start);
3472           if (*s != ',')
3473             {
3474               key->eword = SIZE_MAX;
3475               key->echar = 0;
3476             }
3477           else
3478             {
3479               /* Get POS2. */
3480               s = parse_field_count (s + 1, &key->eword,
3481                                      N_("invalid number after `,'"));
3482               if (! key->eword--)
3483                 {
3484                   /* Provoke with `sort -k1,0' */
3485                   badfieldspec (optarg, N_("field number is zero"));
3486                 }
3487               if (*s == '.')
3488                 {
3489                   s = parse_field_count (s + 1, &key->echar,
3490                                          N_("invalid number after `.'"));
3491                 }
3492               s = set_ordering (s, key, bl_end);
3493             }
3494           if (*s)
3495             badfieldspec (optarg, N_("stray character in field spec"));
3496           insertkey (key);
3497           break;
3498
3499         case 'm':
3500           mergeonly = true;
3501           break;
3502
3503         case NMERGE_OPTION:
3504           specify_nmerge (oi, c, optarg);
3505           break;
3506
3507         case 'o':
3508           if (outfile && !STREQ (outfile, optarg))
3509             error (SORT_FAILURE, 0, _("multiple output files specified"));
3510           outfile = optarg;
3511           break;
3512
3513         case RANDOM_SOURCE_OPTION:
3514           if (random_source && !STREQ (random_source, optarg))
3515             error (SORT_FAILURE, 0, _("multiple random sources specified"));
3516           random_source = optarg;
3517           break;
3518
3519         case 's':
3520           stable = true;
3521           break;
3522
3523         case 'S':
3524           specify_sort_size (oi, c, optarg);
3525           break;
3526
3527         case 't':
3528           {
3529             char newtab = optarg[0];
3530             if (! newtab)
3531               error (SORT_FAILURE, 0, _("empty tab"));
3532             if (optarg[1])
3533               {
3534                 if (STREQ (optarg, "\\0"))
3535                   newtab = '\0';
3536                 else
3537                   {
3538                     /* Provoke with `sort -txx'.  Complain about
3539                        "multi-character tab" instead of "multibyte tab", so
3540                        that the diagnostic's wording does not need to be
3541                        changed once multibyte characters are supported.  */
3542                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
3543                            quote (optarg));
3544                   }
3545               }
3546             if (tab != TAB_DEFAULT && tab != newtab)
3547               error (SORT_FAILURE, 0, _("incompatible tabs"));
3548             tab = newtab;
3549           }
3550           break;
3551
3552         case 'T':
3553           add_temp_dir (optarg);
3554           break;
3555
3556         case 'u':
3557           unique = true;
3558           break;
3559
3560         case 'y':
3561           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3562              through Solaris 7.  It is also accepted by many non-Solaris
3563              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3564              -y is marked as obsolete starting with Solaris 8 (1999), but is
3565              still accepted as of Solaris 10 prerelease (2004).
3566
3567              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3568              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3569              and which in general ignores the argument after "-y" if it
3570              consists entirely of digits (it can even be empty).  */
3571           if (optarg == argv[optind - 1])
3572             {
3573               char const *p;
3574               for (p = optarg; ISDIGIT (*p); p++)
3575                 continue;
3576               optind -= (*p != '\0');
3577             }
3578           break;
3579
3580         case 'z':
3581           eolchar = 0;
3582           break;
3583
3584         case_GETOPT_HELP_CHAR;
3585
3586         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3587
3588         default:
3589           usage (SORT_FAILURE);
3590         }
3591     }
3592
3593   if (files_from)
3594     {
3595       FILE *stream;
3596
3597       /* When using --files0-from=F, you may not specify any files
3598          on the command-line.  */
3599       if (nfiles)
3600         {
3601           error (0, 0, _("extra operand %s"), quote (files[0]));
3602           fprintf (stderr, "%s\n",
3603                    _("file operands cannot be combined with --files0-from"));
3604           usage (SORT_FAILURE);
3605         }
3606
3607       if (STREQ (files_from, "-"))
3608         stream = stdin;
3609       else
3610         {
3611           stream = fopen (files_from, "r");
3612           if (stream == NULL)
3613             error (SORT_FAILURE, errno, _("cannot open %s for reading"),
3614                    quote (files_from));
3615         }
3616
3617       readtokens0_init (&tok);
3618
3619       if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
3620         error (SORT_FAILURE, 0, _("cannot read file names from %s"),
3621                quote (files_from));
3622
3623       if (tok.n_tok)
3624         {
3625           size_t i;
3626           free (files);
3627           files = tok.tok;
3628           nfiles = tok.n_tok;
3629           for (i = 0; i < nfiles; i++)
3630           {
3631               if (STREQ (files[i], "-"))
3632                 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
3633                                           "no file name of %s allowed"),
3634                        quote (files[i]));
3635               else if (files[i][0] == '\0')
3636                 {
3637                   /* Using the standard `filename:line-number:' prefix here is
3638                      not totally appropriate, since NUL is the separator, not NL,
3639                      but it might be better than nothing.  */
3640                   unsigned long int file_number = i + 1;
3641                   error (SORT_FAILURE, 0,
3642                          _("%s:%lu: invalid zero-length file name"),
3643                          quotearg_colon (files_from), file_number);
3644                 }
3645           }
3646         }
3647       else
3648         error (SORT_FAILURE, 0, _("no input from %s"),
3649                quote (files_from));
3650     }
3651
3652   /* Inheritance of global options to individual keys. */
3653   for (key = keylist; key; key = key->next)
3654     {
3655       if (! (key->ignore
3656              || key->translate
3657              || (key->skipsblanks
3658                  || key->reverse
3659                  || key->skipeblanks
3660                  || key->month
3661                  || key->numeric
3662                  || key->version
3663                  || key->general_numeric
3664                  || key->human_numeric
3665                  || key->random)))
3666         {
3667           key->ignore = gkey.ignore;
3668           key->translate = gkey.translate;
3669           key->skipsblanks = gkey.skipsblanks;
3670           key->skipeblanks = gkey.skipeblanks;
3671           key->month = gkey.month;
3672           key->numeric = gkey.numeric;
3673           key->general_numeric = gkey.general_numeric;
3674           key->human_numeric = gkey.human_numeric;
3675           key->random = gkey.random;
3676           key->reverse = gkey.reverse;
3677           key->version = gkey.version;
3678         }
3679
3680       need_random |= key->random;
3681     }
3682
3683   if (!keylist && (gkey.ignore
3684                    || gkey.translate
3685                    || (gkey.skipsblanks
3686                        || gkey.skipeblanks
3687                        || gkey.month
3688                        || gkey.numeric
3689                        || gkey.general_numeric
3690                        || gkey.human_numeric
3691                        || gkey.random
3692                        || gkey.version)))
3693     {
3694       insertkey (&gkey);
3695       need_random |= gkey.random;
3696     }
3697
3698   check_ordering_compatibility ();
3699
3700   reverse = gkey.reverse;
3701
3702   if (need_random)
3703     {
3704       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
3705       if (! randread_source)
3706         die (_("open failed"), random_source);
3707     }
3708
3709   if (temp_dir_count == 0)
3710     {
3711       char const *tmp_dir = getenv ("TMPDIR");
3712       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
3713     }
3714
3715   if (nfiles == 0)
3716     {
3717       static char *minus = (char *) "-";
3718       nfiles = 1;
3719       free (files);
3720       files = &minus;
3721     }
3722
3723   /* Need to re-check that we meet the minimum requirement for memory
3724      usage with the final value for NMERGE. */
3725   if (0 < sort_size)
3726     sort_size = MAX (sort_size, MIN_SORT_SIZE);
3727
3728   if (checkonly)
3729     {
3730       if (nfiles > 1)
3731         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
3732                quote (files[1]), checkonly);
3733
3734       if (outfile)
3735         {
3736           static char opts[] = {0, 'o', 0};
3737           opts[0] = checkonly;
3738           incompatible_options (opts);
3739         }
3740
3741       /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3742          input is not properly sorted.  */
3743       exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
3744     }
3745
3746   if (mergeonly)
3747     {
3748       struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
3749       size_t i;
3750
3751       for (i = 0; i < nfiles; ++i)
3752         sortfiles[i].name = files[i];
3753
3754       merge (sortfiles, 0, nfiles, outfile);
3755       IF_LINT (free (sortfiles));
3756     }
3757   else
3758     sort (files, nfiles, outfile);
3759
3760   if (have_read_stdin && fclose (stdin) == EOF)
3761     die (_("close failed"), "-");
3762
3763   exit (EXIT_SUCCESS);
3764 }