dd: add a flag to discard cached data
[coreutils/ericb.git] / src / sort.c
blob13954cbbc805d79580f383efd126118abdccf79e
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/types.h>
28 #include <sys/wait.h>
29 #include <signal.h>
30 #include "system.h"
31 #include "argmatch.h"
32 #include "error.h"
33 #include "fadvise.h"
34 #include "filevercmp.h"
35 #include "hard-locale.h"
36 #include "hash.h"
37 #include "heap.h"
38 #include "ignore-value.h"
39 #include "md5.h"
40 #include "mbswidth.h"
41 #include "nproc.h"
42 #include "physmem.h"
43 #include "posixver.h"
44 #include "quote.h"
45 #include "quotearg.h"
46 #include "randread.h"
47 #include "readtokens0.h"
48 #include "stdio--.h"
49 #include "stdlib--.h"
50 #include "strnumcmp.h"
51 #include "xmemcoll.h"
52 #include "xnanosleep.h"
53 #include "xstrtol.h"
55 #if HAVE_SYS_RESOURCE_H
56 # include <sys/resource.h>
57 #endif
58 #ifndef RLIMIT_DATA
59 struct rlimit { size_t rlim_cur; };
60 # define getrlimit(Resource, Rlp) (-1)
61 #endif
63 /* The official name of this program (e.g., no `g' prefix). */
64 #define PROGRAM_NAME "sort"
66 #define AUTHORS \
67 proper_name ("Mike Haertel"), \
68 proper_name ("Paul Eggert")
70 #if HAVE_LANGINFO_CODESET
71 # include <langinfo.h>
72 #endif
74 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
75 present. */
76 #ifndef SA_NOCLDSTOP
77 # define SA_NOCLDSTOP 0
78 /* No sigprocmask. Always 'return' zero. */
79 # define sigprocmask(How, Set, Oset) (0)
80 # define sigset_t int
81 # if ! HAVE_SIGINTERRUPT
82 # define siginterrupt(sig, flag) /* empty */
83 # endif
84 #endif
86 #if !defined OPEN_MAX && defined NR_OPEN
87 # define OPEN_MAX NR_OPEN
88 #endif
89 #if !defined OPEN_MAX
90 # define OPEN_MAX 20
91 #endif
93 #define UCHAR_LIM (UCHAR_MAX + 1)
95 #if HAVE_C99_STRTOLD
96 # define long_double long double
97 #else
98 # define long_double double
99 # undef strtold
100 # define strtold strtod
101 #endif
103 #ifndef DEFAULT_TMPDIR
104 # define DEFAULT_TMPDIR "/tmp"
105 #endif
107 /* Maximum number of lines to merge every time a NODE is taken from
108 the merge queue. Node is at LEVEL in the binary merge tree,
109 and is responsible for merging TOTAL lines. */
110 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
112 /* Heuristic value for the number of lines for which it is worth
113 creating a subthread, during an internal merge sort, on a machine
114 that has processors galore. Currently this number is just a guess.
115 This value must be at least 4. We don't know of any machine where
116 this number has any practical effect. */
117 enum { SUBTHREAD_LINES_HEURISTIC = 4 };
119 /* The number of threads after which there are
120 diminishing performance gains. */
121 enum { DEFAULT_MAX_THREADS = 8 };
123 /* Exit statuses. */
124 enum
126 /* POSIX says to exit with status 1 if invoked with -c and the
127 input is not properly sorted. */
128 SORT_OUT_OF_ORDER = 1,
130 /* POSIX says any other irregular exit must exit with a status
131 code greater than 1. */
132 SORT_FAILURE = 2
135 enum
137 /* The number of times we should try to fork a compression process
138 (we retry if the fork call fails). We don't _need_ to compress
139 temp files, this is just to reduce disk access, so this number
140 can be small. Each retry doubles in duration. */
141 MAX_FORK_TRIES_COMPRESS = 4,
143 /* The number of times we should try to fork a decompression process.
144 If we can't fork a decompression process, we can't sort, so this
145 number should be big. Each retry doubles in duration. */
146 MAX_FORK_TRIES_DECOMPRESS = 9
149 enum
151 /* Level of the end-of-merge node, one level above the root. */
152 MERGE_END = 0,
154 /* Level of the root node in merge tree. */
155 MERGE_ROOT = 1
158 /* The representation of the decimal point in the current locale. */
159 static int decimal_point;
161 /* Thousands separator; if -1, then there isn't one. */
162 static int thousands_sep;
164 /* Nonzero if the corresponding locales are hard. */
165 static bool hard_LC_COLLATE;
166 #if HAVE_NL_LANGINFO
167 static bool hard_LC_TIME;
168 #endif
170 #define NONZERO(x) ((x) != 0)
172 /* The kind of blanks for '-b' to skip in various options. */
173 enum blanktype { bl_start, bl_end, bl_both };
175 /* The character marking end of line. Default to \n. */
176 static char eolchar = '\n';
178 /* Lines are held in core as counted strings. */
179 struct line
181 char *text; /* Text of the line. */
182 size_t length; /* Length including final newline. */
183 char *keybeg; /* Start of first key. */
184 char *keylim; /* Limit of first key. */
187 /* Input buffers. */
188 struct buffer
190 char *buf; /* Dynamically allocated buffer,
191 partitioned into 3 regions:
192 - input data;
193 - unused area;
194 - an array of lines, in reverse order. */
195 size_t used; /* Number of bytes used for input data. */
196 size_t nlines; /* Number of lines in the line array. */
197 size_t alloc; /* Number of bytes allocated. */
198 size_t left; /* Number of bytes left from previous reads. */
199 size_t line_bytes; /* Number of bytes to reserve for each line. */
200 bool eof; /* An EOF has been read. */
203 /* Sort key. */
204 struct keyfield
206 size_t sword; /* Zero-origin 'word' to start at. */
207 size_t schar; /* Additional characters to skip. */
208 size_t eword; /* Zero-origin last 'word' of key. */
209 size_t echar; /* Additional characters in field. */
210 bool const *ignore; /* Boolean array of characters to ignore. */
211 char const *translate; /* Translation applied to characters. */
212 bool skipsblanks; /* Skip leading blanks when finding start. */
213 bool skipeblanks; /* Skip leading blanks when finding end. */
214 bool numeric; /* Flag for numeric comparison. Handle
215 strings of digits with optional decimal
216 point, but no exponential notation. */
217 bool random; /* Sort by random hash of key. */
218 bool general_numeric; /* Flag for general, numeric comparison.
219 Handle numbers in exponential notation. */
220 bool human_numeric; /* Flag for sorting by human readable
221 units with either SI xor IEC prefixes. */
222 bool month; /* Flag for comparison by month name. */
223 bool reverse; /* Reverse the sense of comparison. */
224 bool version; /* sort by version number */
225 bool obsolete_used; /* obsolescent key option format is used. */
226 struct keyfield *next; /* Next keyfield to try. */
229 struct month
231 char const *name;
232 int val;
235 /* Binary merge tree node. */
236 struct merge_node
238 struct line *lo; /* Lines to merge from LO child node. */
239 struct line *hi; /* Lines to merge from HI child ndoe. */
240 struct line *end_lo; /* End of available lines from LO. */
241 struct line *end_hi; /* End of available lines from HI. */
242 struct line **dest; /* Pointer to destination of merge. */
243 size_t nlo; /* Total Lines remaining from LO. */
244 size_t nhi; /* Total lines remaining from HI. */
245 struct merge_node *parent; /* Parent node. */
246 struct merge_node *lo_child; /* LO child node. */
247 struct merge_node *hi_child; /* HI child node. */
248 unsigned int level; /* Level in merge tree. */
249 bool queued; /* Node is already in heap. */
250 pthread_mutex_t lock; /* Lock for node operations. */
253 /* Priority queue of merge nodes. */
254 struct merge_node_queue
256 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
257 pthread_mutex_t mutex; /* Lock for queue operations. */
258 pthread_cond_t cond; /* Conditional wait for empty queue to populate
259 when popping. */
262 /* FIXME: None of these tables work with multibyte character sets.
263 Also, there are many other bugs when handling multibyte characters.
264 One way to fix this is to rewrite `sort' to use wide characters
265 internally, but doing this with good performance is a bit
266 tricky. */
268 /* Table of blanks. */
269 static bool blanks[UCHAR_LIM];
271 /* Table of non-printing characters. */
272 static bool nonprinting[UCHAR_LIM];
274 /* Table of non-dictionary characters (not letters, digits, or blanks). */
275 static bool nondictionary[UCHAR_LIM];
277 /* Translation table folding lower case to upper. */
278 static char fold_toupper[UCHAR_LIM];
280 #define MONTHS_PER_YEAR 12
282 /* Table mapping month names to integers.
283 Alphabetic order allows binary search. */
284 static struct month monthtab[] =
286 {"APR", 4},
287 {"AUG", 8},
288 {"DEC", 12},
289 {"FEB", 2},
290 {"JAN", 1},
291 {"JUL", 7},
292 {"JUN", 6},
293 {"MAR", 3},
294 {"MAY", 5},
295 {"NOV", 11},
296 {"OCT", 10},
297 {"SEP", 9}
300 /* During the merge phase, the number of files to merge at once. */
301 #define NMERGE_DEFAULT 16
303 /* Minimum size for a merge or check buffer. */
304 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
306 /* Minimum sort size; the code might not work with smaller sizes. */
307 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
309 /* The number of bytes needed for a merge or check buffer, which can
310 function relatively efficiently even if it holds only one line. If
311 a longer line is seen, this value is increased. */
312 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
314 /* The approximate maximum number of bytes of main memory to use, as
315 specified by the user. Zero if the user has not specified a size. */
316 static size_t sort_size;
318 /* The guessed size for non-regular files. */
319 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
321 /* Array of directory names in which any temporary files are to be created. */
322 static char const **temp_dirs;
324 /* Number of temporary directory names used. */
325 static size_t temp_dir_count;
327 /* Number of allocated slots in temp_dirs. */
328 static size_t temp_dir_alloc;
330 /* Flag to reverse the order of all comparisons. */
331 static bool reverse;
333 /* Flag for stable sort. This turns off the last ditch bytewise
334 comparison of lines, and instead leaves lines in the same order
335 they were read if all keys compare equal. */
336 static bool stable;
338 /* If TAB has this value, blanks separate fields. */
339 enum { TAB_DEFAULT = CHAR_MAX + 1 };
341 /* Tab character separating fields. If TAB_DEFAULT, then fields are
342 separated by the empty string between a non-blank character and a blank
343 character. */
344 static int tab = TAB_DEFAULT;
346 /* Flag to remove consecutive duplicate lines from the output.
347 Only the last of a sequence of equal lines will be output. */
348 static bool unique;
350 /* Nonzero if any of the input files are the standard input. */
351 static bool have_read_stdin;
353 /* List of key field comparisons to be tried. */
354 static struct keyfield *keylist;
356 /* Program used to (de)compress temp files. Must accept -d. */
357 static char const *compress_program;
359 /* Annotate the output with extra info to aid the user. */
360 static bool debug;
362 /* Maximum number of files to merge in one go. If more than this
363 number are present, temp files will be used. */
364 static unsigned int nmerge = NMERGE_DEFAULT;
366 /* Report MESSAGE for FILE, then clean up and exit.
367 If FILE is null, it represents standard output. */
369 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
370 static void
371 die (char const *message, char const *file)
373 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
374 exit (SORT_FAILURE);
377 void
378 usage (int status)
380 if (status != EXIT_SUCCESS)
381 fprintf (stderr, _("Try `%s --help' for more information.\n"),
382 program_name);
383 else
385 printf (_("\
386 Usage: %s [OPTION]... [FILE]...\n\
387 or: %s [OPTION]... --files0-from=F\n\
389 program_name, program_name);
390 fputs (_("\
391 Write sorted concatenation of all FILE(s) to standard output.\n\
393 "), stdout);
394 fputs (_("\
395 Mandatory arguments to long options are mandatory for short options too.\n\
396 "), stdout);
397 fputs (_("\
398 Ordering options:\n\
400 "), stdout);
401 fputs (_("\
402 -b, --ignore-leading-blanks ignore leading blanks\n\
403 -d, --dictionary-order consider only blanks and alphanumeric characters\
405 -f, --ignore-case fold lower case to upper case characters\n\
406 "), stdout);
407 fputs (_("\
408 -g, --general-numeric-sort compare according to general numerical value\n\
409 -i, --ignore-nonprinting consider only printable characters\n\
410 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
411 "), stdout);
412 fputs (_("\
413 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
414 "), stdout);
415 fputs (_("\
416 -n, --numeric-sort compare according to string numerical value\n\
417 -R, --random-sort sort by random hash of keys\n\
418 --random-source=FILE get random bytes from FILE\n\
419 -r, --reverse reverse the result of comparisons\n\
420 "), stdout);
421 fputs (_("\
422 --sort=WORD sort according to WORD:\n\
423 general-numeric -g, human-numeric -h, month -M,\
425 numeric -n, random -R, version -V\n\
426 -V, --version-sort natural sort of (version) numbers within text\n\
428 "), stdout);
429 fputs (_("\
430 Other options:\n\
432 "), stdout);
433 fputs (_("\
434 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
435 for more use temp files\n\
436 "), stdout);
437 fputs (_("\
438 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
439 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
441 --compress-program=PROG compress temporaries with PROG;\n\
442 decompress them with PROG -d\n\
443 "), stdout);
444 fputs (_("\
445 --debug annotate the part of the line used to sort,\n\
446 and warn about questionable usage to stderr\n\
447 --files0-from=F read input from the files specified by\n\
448 NUL-terminated names in file F;\n\
449 If F is - then read names from standard input\n\
450 "), stdout);
451 fputs (_("\
452 -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n\
453 (default end of line). See POS syntax below\n\
454 -m, --merge merge already sorted files; do not sort\n\
455 "), stdout);
456 fputs (_("\
457 -o, --output=FILE write result to FILE instead of standard output\n\
458 -s, --stable stabilize sort by disabling last-resort comparison\
460 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
461 "), stdout);
462 printf (_("\
463 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
464 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
465 multiple options specify multiple directories\n\
466 --parallel=N change the number of sorts run concurrently to N\n\
467 -u, --unique with -c, check for strict ordering;\n\
468 without -c, output only the first of an equal run\
470 "), DEFAULT_TMPDIR);
471 fputs (_("\
472 -z, --zero-terminated end lines with 0 byte, not newline\n\
473 "), stdout);
474 fputs (HELP_OPTION_DESCRIPTION, stdout);
475 fputs (VERSION_OPTION_DESCRIPTION, stdout);
476 fputs (_("\
478 POS is F[.C][OPTS], where F is the field number and C the character position\n\
479 in the field; both are origin 1. If neither -t nor -b is in effect, characters\
481 in a field are counted from the beginning of the preceding whitespace. OPTS is\
483 one or more single-letter ordering options, which override global ordering\n\
484 options for that key. If no key is given, use the entire line as the key.\n\
486 SIZE may be followed by the following multiplicative suffixes:\n\
487 "), stdout);
488 fputs (_("\
489 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
491 With no FILE, or when FILE is -, read standard input.\n\
493 *** WARNING ***\n\
494 The locale specified by the environment affects sort order.\n\
495 Set LC_ALL=C to get the traditional sort order that uses\n\
496 native byte values.\n\
497 "), stdout );
498 emit_ancillary_info ();
501 exit (status);
504 /* For long options that have no equivalent short option, use a
505 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
506 enum
508 CHECK_OPTION = CHAR_MAX + 1,
509 COMPRESS_PROGRAM_OPTION,
510 DEBUG_PROGRAM_OPTION,
511 FILES0_FROM_OPTION,
512 NMERGE_OPTION,
513 RANDOM_SOURCE_OPTION,
514 SORT_OPTION,
515 PARALLEL_OPTION
518 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
520 static struct option const long_options[] =
522 {"ignore-leading-blanks", no_argument, NULL, 'b'},
523 {"check", optional_argument, NULL, CHECK_OPTION},
524 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
525 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
526 {"dictionary-order", no_argument, NULL, 'd'},
527 {"ignore-case", no_argument, NULL, 'f'},
528 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
529 {"general-numeric-sort", no_argument, NULL, 'g'},
530 {"ignore-nonprinting", no_argument, NULL, 'i'},
531 {"key", required_argument, NULL, 'k'},
532 {"merge", no_argument, NULL, 'm'},
533 {"month-sort", no_argument, NULL, 'M'},
534 {"numeric-sort", no_argument, NULL, 'n'},
535 {"human-numeric-sort", no_argument, NULL, 'h'},
536 {"version-sort", no_argument, NULL, 'V'},
537 {"random-sort", no_argument, NULL, 'R'},
538 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
539 {"sort", required_argument, NULL, SORT_OPTION},
540 {"output", required_argument, NULL, 'o'},
541 {"reverse", no_argument, NULL, 'r'},
542 {"stable", no_argument, NULL, 's'},
543 {"batch-size", required_argument, NULL, NMERGE_OPTION},
544 {"buffer-size", required_argument, NULL, 'S'},
545 {"field-separator", required_argument, NULL, 't'},
546 {"temporary-directory", required_argument, NULL, 'T'},
547 {"unique", no_argument, NULL, 'u'},
548 {"zero-terminated", no_argument, NULL, 'z'},
549 {"parallel", required_argument, NULL, PARALLEL_OPTION},
550 {GETOPT_HELP_OPTION_DECL},
551 {GETOPT_VERSION_OPTION_DECL},
552 {NULL, 0, NULL, 0},
555 #define CHECK_TABLE \
556 _ct_("quiet", 'C') \
557 _ct_("silent", 'C') \
558 _ct_("diagnose-first", 'c')
560 static char const *const check_args[] =
562 #define _ct_(_s, _c) _s,
563 CHECK_TABLE NULL
564 #undef _ct_
566 static char const check_types[] =
568 #define _ct_(_s, _c) _c,
569 CHECK_TABLE
570 #undef _ct_
573 #define SORT_TABLE \
574 _st_("general-numeric", 'g') \
575 _st_("human-numeric", 'h') \
576 _st_("month", 'M') \
577 _st_("numeric", 'n') \
578 _st_("random", 'R') \
579 _st_("version", 'V')
581 static char const *const sort_args[] =
583 #define _st_(_s, _c) _s,
584 SORT_TABLE NULL
585 #undef _st_
587 static char const sort_types[] =
589 #define _st_(_s, _c) _c,
590 SORT_TABLE
591 #undef _st_
594 /* The set of signals that are caught. */
595 static sigset_t caught_signals;
597 /* Critical section status. */
598 struct cs_status
600 bool valid;
601 sigset_t sigs;
604 /* Enter a critical section. */
605 static struct cs_status
606 cs_enter (void)
608 struct cs_status status;
609 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
610 return status;
613 /* Leave a critical section. */
614 static void
615 cs_leave (struct cs_status status)
617 if (status.valid)
619 /* Ignore failure when restoring the signal mask. */
620 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
624 /* Possible states for a temp file. If compressed, the file's status
625 is unreaped or reaped, depending on whether 'sort' has waited for
626 the subprocess to finish. */
627 enum { UNCOMPRESSED, UNREAPED, REAPED };
629 /* The list of temporary files. */
630 struct tempnode
632 struct tempnode *volatile next;
633 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
634 char state;
635 char name[1]; /* Actual size is 1 + file name length. */
637 static struct tempnode *volatile temphead;
638 static struct tempnode *volatile *temptail = &temphead;
640 /* A file to be sorted. */
641 struct sortfile
643 /* The file's name. */
644 char const *name;
646 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
647 struct tempnode *temp;
650 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
651 static Hash_table *proctab;
653 enum { INIT_PROCTAB_SIZE = 47 };
655 static size_t
656 proctab_hasher (void const *entry, size_t tabsize)
658 struct tempnode const *node = entry;
659 return node->pid % tabsize;
662 static bool
663 proctab_comparator (void const *e1, void const *e2)
665 struct tempnode const *n1 = e1;
666 struct tempnode const *n2 = e2;
667 return n1->pid == n2->pid;
670 /* The number of unreaped child processes. */
671 static pid_t nprocs;
673 static bool delete_proc (pid_t);
675 /* If PID is positive, wait for the child process with that PID to
676 exit, and assume that PID has already been removed from the process
677 table. If PID is 0 or -1, clean up some child that has exited (by
678 waiting for it, and removing it from the proc table) and return the
679 child's process ID. However, if PID is 0 and no children have
680 exited, return 0 without waiting. */
682 static pid_t
683 reap (pid_t pid)
685 int status;
686 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
688 if (cpid < 0)
689 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
690 compress_program);
691 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
693 if (! WIFEXITED (status) || WEXITSTATUS (status))
694 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
695 compress_program);
696 --nprocs;
699 return cpid;
702 /* TEMP represents a new process; add it to the process table. Create
703 the process table the first time it's called. */
705 static void
706 register_proc (struct tempnode *temp)
708 if (! proctab)
710 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
711 proctab_hasher,
712 proctab_comparator,
713 NULL);
714 if (! proctab)
715 xalloc_die ();
718 temp->state = UNREAPED;
720 if (! hash_insert (proctab, temp))
721 xalloc_die ();
724 /* If PID is in the process table, remove it and return true.
725 Otherwise, return false. */
727 static bool
728 delete_proc (pid_t pid)
730 struct tempnode test;
732 test.pid = pid;
733 struct tempnode *node = hash_delete (proctab, &test);
734 if (! node)
735 return false;
736 node->state = REAPED;
737 return true;
740 /* Remove PID from the process table, and wait for it to exit if it
741 hasn't already. */
743 static void
744 wait_proc (pid_t pid)
746 if (delete_proc (pid))
747 reap (pid);
750 /* Reap any exited children. Do not block; reap only those that have
751 already exited. */
753 static void
754 reap_exited (void)
756 while (0 < nprocs && reap (0))
757 continue;
760 /* Reap at least one exited child, waiting if necessary. */
762 static void
763 reap_some (void)
765 reap (-1);
766 reap_exited ();
769 /* Reap all children, waiting if necessary. */
771 static void
772 reap_all (void)
774 while (0 < nprocs)
775 reap (-1);
778 /* Clean up any remaining temporary files. */
780 static void
781 cleanup (void)
783 struct tempnode const *node;
785 for (node = temphead; node; node = node->next)
786 unlink (node->name);
787 temphead = NULL;
790 /* Cleanup actions to take when exiting. */
792 static void
793 exit_cleanup (void)
795 if (temphead)
797 /* Clean up any remaining temporary files in a critical section so
798 that a signal handler does not try to clean them too. */
799 struct cs_status cs = cs_enter ();
800 cleanup ();
801 cs_leave (cs);
804 close_stdout ();
807 /* Create a new temporary file, returning its newly allocated tempnode.
808 Store into *PFD the file descriptor open for writing.
809 If the creation fails, return NULL and store -1 into *PFD if the
810 failure is due to file descriptor exhaustion and
811 SURVIVE_FD_EXHAUSTION; otherwise, die. */
813 static struct tempnode *
814 create_temp_file (int *pfd, bool survive_fd_exhaustion)
816 static char const slashbase[] = "/sortXXXXXX";
817 static size_t temp_dir_index;
818 int fd;
819 int saved_errno;
820 char const *temp_dir = temp_dirs[temp_dir_index];
821 size_t len = strlen (temp_dir);
822 struct tempnode *node =
823 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
824 char *file = node->name;
825 struct cs_status cs;
827 memcpy (file, temp_dir, len);
828 memcpy (file + len, slashbase, sizeof slashbase);
829 node->next = NULL;
830 if (++temp_dir_index == temp_dir_count)
831 temp_dir_index = 0;
833 /* Create the temporary file in a critical section, to avoid races. */
834 cs = cs_enter ();
835 fd = mkstemp (file);
836 if (0 <= fd)
838 *temptail = node;
839 temptail = &node->next;
841 saved_errno = errno;
842 cs_leave (cs);
843 errno = saved_errno;
845 if (fd < 0)
847 if (! (survive_fd_exhaustion && errno == EMFILE))
848 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
849 quote (temp_dir));
850 free (node);
851 node = NULL;
854 *pfd = fd;
855 return node;
858 /* Return a stream for FILE, opened with mode HOW. A null FILE means
859 standard output; HOW should be "w". When opening for input, "-"
860 means standard input. To avoid confusion, do not return file
861 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
862 opening an ordinary FILE. Return NULL if unsuccessful.
864 fadvise() is used to specify an access pattern for input files.
865 There are a few hints we could possibly provide,
866 and after careful testing it was decided that
867 specifying POSIX_FADV_SEQUENTIAL was not detrimental
868 to any cases. On Linux 2.6.31, this option doubles
869 the size of read ahead performed and thus was seen to
870 benefit these cases:
871 Merging
872 Sorting with a smaller internal buffer
873 Reading from faster flash devices
875 In _addition_ one could also specify other hints...
877 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
878 at least uses that to _synchronously_ prepopulate the cache
879 with the specified range. While sort does need to
880 read all of its input before outputting, a synchronous
881 read of the whole file up front precludes any processing
882 that sort could do in parallel with the system doing
883 read ahead of the data. This was seen to have negative effects
884 in a couple of cases:
885 Merging
886 Sorting with a smaller internal buffer
887 Note this option was seen to shorten the runtime for sort
888 on a multicore system with lots of RAM and other processes
889 competing for CPU. It could be argued that more explicit
890 scheduling hints with `nice` et. al. are more appropriate
891 for this situation.
893 POSIX_FADV_NOREUSE is a possibility as it could lower
894 the priority of input data in the cache as sort will
895 only need to process it once. However its functionality
896 has changed over Linux kernel versions and as of 2.6.31
897 it does nothing and thus we can't depend on what it might
898 do in future.
900 POSIX_FADV_DONTNEED is not appropriate for user specified
901 input files, but for temp files we do want to drop the
902 cache immediately after processing. This is done implicitly
903 however when the files are unlinked. */
905 static FILE *
906 stream_open (char const *file, char const *how)
908 if (!file)
909 return stdout;
910 if (*how == 'r')
912 FILE *fp;
913 if (STREQ (file, "-"))
915 have_read_stdin = true;
916 fp = stdin;
918 else
919 fp = fopen (file, how);
920 fadvise (fp, FADVISE_SEQUENTIAL);
921 return fp;
923 return fopen (file, how);
926 /* Same as stream_open, except always return a non-null value; die on
927 failure. */
929 static FILE *
930 xfopen (char const *file, char const *how)
932 FILE *fp = stream_open (file, how);
933 if (!fp)
934 die (_("open failed"), file);
935 return fp;
938 /* Close FP, whose name is FILE, and report any errors. */
940 static void
941 xfclose (FILE *fp, char const *file)
943 switch (fileno (fp))
945 case STDIN_FILENO:
946 /* Allow reading stdin from tty more than once. */
947 if (feof (fp))
948 clearerr (fp);
949 break;
951 case STDOUT_FILENO:
952 /* Don't close stdout just yet. close_stdout does that. */
953 if (fflush (fp) != 0)
954 die (_("fflush failed"), file);
955 break;
957 default:
958 if (fclose (fp) != 0)
959 die (_("close failed"), file);
960 break;
964 static void
965 dup2_or_die (int oldfd, int newfd)
967 if (dup2 (oldfd, newfd) < 0)
968 error (SORT_FAILURE, errno, _("dup2 failed"));
971 /* Fork a child process for piping to and do common cleanup. The
972 TRIES parameter tells us how many times to try to fork before
973 giving up. Return the PID of the child, or -1 (setting errno)
974 on failure. */
976 static pid_t
977 pipe_fork (int pipefds[2], size_t tries)
979 #if HAVE_WORKING_FORK
980 struct tempnode *saved_temphead;
981 int saved_errno;
982 double wait_retry = 0.25;
983 pid_t pid IF_LINT ( = -1);
984 struct cs_status cs;
986 if (pipe (pipefds) < 0)
987 return -1;
989 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
990 uncontrolled subprocess generation can hurt performance significantly.
991 Allow at most NMERGE + 2 subprocesses, on the theory that there
992 may be some useful parallelism by letting compression for the
993 previous merge finish (1 subprocess) in parallel with the current
994 merge (NMERGE + 1 subprocesses). */
996 if (nmerge + 1 < nprocs)
997 reap_some ();
999 while (tries--)
1001 /* This is so the child process won't delete our temp files
1002 if it receives a signal before exec-ing. */
1003 cs = cs_enter ();
1004 saved_temphead = temphead;
1005 temphead = NULL;
1007 pid = fork ();
1008 saved_errno = errno;
1009 if (pid)
1010 temphead = saved_temphead;
1012 cs_leave (cs);
1013 errno = saved_errno;
1015 if (0 <= pid || errno != EAGAIN)
1016 break;
1017 else
1019 xnanosleep (wait_retry);
1020 wait_retry *= 2;
1021 reap_exited ();
1025 if (pid < 0)
1027 saved_errno = errno;
1028 close (pipefds[0]);
1029 close (pipefds[1]);
1030 errno = saved_errno;
1032 else if (pid == 0)
1034 close (STDIN_FILENO);
1035 close (STDOUT_FILENO);
1037 else
1038 ++nprocs;
1040 return pid;
1042 #else /* ! HAVE_WORKING_FORK */
1043 return -1;
1044 #endif
1047 /* Create a temporary file and, if asked for, start a compressor
1048 to that file. Set *PFP to the file handle and return
1049 the address of the new temp node. If the creation
1050 fails, return NULL if the failure is due to file descriptor
1051 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1053 static struct tempnode *
1054 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1056 int tempfd;
1057 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1058 if (! node)
1059 return NULL;
1061 node->state = UNCOMPRESSED;
1063 if (compress_program)
1065 int pipefds[2];
1067 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1068 if (0 < node->pid)
1070 close (tempfd);
1071 close (pipefds[0]);
1072 tempfd = pipefds[1];
1074 register_proc (node);
1076 else if (node->pid == 0)
1078 close (pipefds[1]);
1079 dup2_or_die (tempfd, STDOUT_FILENO);
1080 close (tempfd);
1081 dup2_or_die (pipefds[0], STDIN_FILENO);
1082 close (pipefds[0]);
1084 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1085 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1086 compress_program);
1090 *pfp = fdopen (tempfd, "w");
1091 if (! *pfp)
1092 die (_("couldn't create temporary file"), node->name);
1094 return node;
1097 /* Create a temporary file and, if asked for, start a compressor
1098 to that file. Set *PFP to the file handle and return the address
1099 of the new temp node. Die on failure. */
1101 static struct tempnode *
1102 create_temp (FILE **pfp)
1104 return maybe_create_temp (pfp, false);
1107 /* Open a compressed temp file and start a decompression process through
1108 which to filter the input. Return NULL (setting errno to
1109 EMFILE) if we ran out of file descriptors, and die on any other
1110 kind of failure. */
1112 static FILE *
1113 open_temp (struct tempnode *temp)
1115 int tempfd, pipefds[2];
1116 FILE *fp = NULL;
1118 if (temp->state == UNREAPED)
1119 wait_proc (temp->pid);
1121 tempfd = open (temp->name, O_RDONLY);
1122 if (tempfd < 0)
1123 return NULL;
1125 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1127 switch (child)
1129 case -1:
1130 if (errno != EMFILE)
1131 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1132 compress_program);
1133 close (tempfd);
1134 errno = EMFILE;
1135 break;
1137 case 0:
1138 close (pipefds[0]);
1139 dup2_or_die (tempfd, STDIN_FILENO);
1140 close (tempfd);
1141 dup2_or_die (pipefds[1], STDOUT_FILENO);
1142 close (pipefds[1]);
1144 execlp (compress_program, compress_program, "-d", (char *) NULL);
1145 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1146 compress_program);
1148 default:
1149 temp->pid = child;
1150 register_proc (temp);
1151 close (tempfd);
1152 close (pipefds[1]);
1154 fp = fdopen (pipefds[0], "r");
1155 if (! fp)
1157 int saved_errno = errno;
1158 close (pipefds[0]);
1159 errno = saved_errno;
1161 break;
1164 return fp;
1167 /* Append DIR to the array of temporary directory names. */
1168 static void
1169 add_temp_dir (char const *dir)
1171 if (temp_dir_count == temp_dir_alloc)
1172 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1174 temp_dirs[temp_dir_count++] = dir;
1177 /* Remove NAME from the list of temporary files. */
1179 static void
1180 zaptemp (char const *name)
1182 struct tempnode *volatile *pnode;
1183 struct tempnode *node;
1184 struct tempnode *next;
1185 int unlink_status;
1186 int unlink_errno = 0;
1187 struct cs_status cs;
1189 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1190 continue;
1192 if (node->state == UNREAPED)
1193 wait_proc (node->pid);
1195 /* Unlink the temporary file in a critical section to avoid races. */
1196 next = node->next;
1197 cs = cs_enter ();
1198 unlink_status = unlink (name);
1199 unlink_errno = errno;
1200 *pnode = next;
1201 cs_leave (cs);
1203 if (unlink_status != 0)
1204 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1205 if (! next)
1206 temptail = pnode;
1207 free (node);
1210 #if HAVE_NL_LANGINFO
1212 static int
1213 struct_month_cmp (void const *m1, void const *m2)
1215 struct month const *month1 = m1;
1216 struct month const *month2 = m2;
1217 return strcmp (month1->name, month2->name);
1220 #endif
1222 /* Initialize the character class tables. */
1224 static void
1225 inittables (void)
1227 size_t i;
1229 for (i = 0; i < UCHAR_LIM; ++i)
1231 blanks[i] = !! isblank (i);
1232 nonprinting[i] = ! isprint (i);
1233 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1234 fold_toupper[i] = toupper (i);
1237 #if HAVE_NL_LANGINFO
1238 /* If we're not in the "C" locale, read different names for months. */
1239 if (hard_LC_TIME)
1241 for (i = 0; i < MONTHS_PER_YEAR; i++)
1243 char const *s;
1244 size_t s_len;
1245 size_t j, k;
1246 char *name;
1248 s = nl_langinfo (ABMON_1 + i);
1249 s_len = strlen (s);
1250 monthtab[i].name = name = xmalloc (s_len + 1);
1251 monthtab[i].val = i + 1;
1253 for (j = k = 0; j < s_len; j++)
1254 if (! isblank (to_uchar (s[j])))
1255 name[k++] = fold_toupper[to_uchar (s[j])];
1256 name[k] = '\0';
1258 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1260 #endif
1263 /* Specify how many inputs may be merged at once.
1264 This may be set on the command-line with the
1265 --batch-size option. */
1266 static void
1267 specify_nmerge (int oi, char c, char const *s)
1269 uintmax_t n;
1270 struct rlimit rlimit;
1271 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1273 /* Try to find out how many file descriptors we'll be able
1274 to open. We need at least nmerge + 3 (STDIN_FILENO,
1275 STDOUT_FILENO and STDERR_FILENO). */
1276 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1277 ? rlimit.rlim_cur
1278 : OPEN_MAX)
1279 - 3);
1281 if (e == LONGINT_OK)
1283 nmerge = n;
1284 if (nmerge != n)
1285 e = LONGINT_OVERFLOW;
1286 else
1288 if (nmerge < 2)
1290 error (0, 0, _("invalid --%s argument %s"),
1291 long_options[oi].name, quote (s));
1292 error (SORT_FAILURE, 0,
1293 _("minimum --%s argument is %s"),
1294 long_options[oi].name, quote ("2"));
1296 else if (max_nmerge < nmerge)
1298 e = LONGINT_OVERFLOW;
1300 else
1301 return;
1305 if (e == LONGINT_OVERFLOW)
1307 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1308 error (0, 0, _("--%s argument %s too large"),
1309 long_options[oi].name, quote (s));
1310 error (SORT_FAILURE, 0,
1311 _("maximum --%s argument with current rlimit is %s"),
1312 long_options[oi].name,
1313 uinttostr (max_nmerge, max_nmerge_buf));
1315 else
1316 xstrtol_fatal (e, oi, c, long_options, s);
1319 /* Specify the amount of main memory to use when sorting. */
1320 static void
1321 specify_sort_size (int oi, char c, char const *s)
1323 uintmax_t n;
1324 char *suffix;
1325 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1327 /* The default unit is KiB. */
1328 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1330 if (n <= UINTMAX_MAX / 1024)
1331 n *= 1024;
1332 else
1333 e = LONGINT_OVERFLOW;
1336 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1337 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1338 switch (suffix[0])
1340 case 'b':
1341 e = LONGINT_OK;
1342 break;
1344 case '%':
1346 double mem = physmem_total () * n / 100;
1348 /* Use "<", not "<=", to avoid problems with rounding. */
1349 if (mem < UINTMAX_MAX)
1351 n = mem;
1352 e = LONGINT_OK;
1354 else
1355 e = LONGINT_OVERFLOW;
1357 break;
1360 if (e == LONGINT_OK)
1362 /* If multiple sort sizes are specified, take the maximum, so
1363 that option order does not matter. */
1364 if (n < sort_size)
1365 return;
1367 sort_size = n;
1368 if (sort_size == n)
1370 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1371 return;
1374 e = LONGINT_OVERFLOW;
1377 xstrtol_fatal (e, oi, c, long_options, s);
1380 /* Specify the number of threads to spawn during internal sort. */
1381 static size_t
1382 specify_nthreads (int oi, char c, char const *s)
1384 unsigned long int nthreads;
1385 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1386 if (e == LONGINT_OVERFLOW)
1387 return SIZE_MAX;
1388 if (e != LONGINT_OK)
1389 xstrtol_fatal (e, oi, c, long_options, s);
1390 if (SIZE_MAX < nthreads)
1391 nthreads = SIZE_MAX;
1392 if (nthreads == 0)
1393 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1394 return nthreads;
1398 /* Return the default sort size. */
1399 static size_t
1400 default_sort_size (void)
1402 /* Let MEM be available memory or 1/8 of total memory, whichever
1403 is greater. */
1404 double avail = physmem_available ();
1405 double total = physmem_total ();
1406 double mem = MAX (avail, total / 8);
1407 struct rlimit rlimit;
1409 /* Let SIZE be MEM, but no more than the maximum object size or
1410 system resource limits. Avoid the MIN macro here, as it is not
1411 quite right when only one argument is floating point. Don't
1412 bother to check for values like RLIM_INFINITY since in practice
1413 they are not much less than SIZE_MAX. */
1414 size_t size = SIZE_MAX;
1415 if (mem < size)
1416 size = mem;
1417 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1418 size = rlimit.rlim_cur;
1419 #ifdef RLIMIT_AS
1420 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1421 size = rlimit.rlim_cur;
1422 #endif
1424 /* Leave a large safety margin for the above limits, as failure can
1425 occur when they are exceeded. */
1426 size /= 2;
1428 #ifdef RLIMIT_RSS
1429 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1430 Exceeding RSS is not fatal, but can be quite slow. */
1431 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1432 size = rlimit.rlim_cur / 16 * 15;
1433 #endif
1435 /* Use no less than the minimum. */
1436 return MAX (size, MIN_SORT_SIZE);
1439 /* Return the sort buffer size to use with the input files identified
1440 by FPS and FILES, which are alternate names of the same files.
1441 NFILES gives the number of input files; NFPS may be less. Assume
1442 that each input line requires LINE_BYTES extra bytes' worth of line
1443 information. Do not exceed the size bound specified by the user
1444 (or a default size bound, if the user does not specify one). */
1446 static size_t
1447 sort_buffer_size (FILE *const *fps, size_t nfps,
1448 char *const *files, size_t nfiles,
1449 size_t line_bytes)
1451 /* A bound on the input size. If zero, the bound hasn't been
1452 determined yet. */
1453 static size_t size_bound;
1455 /* In the worst case, each input byte is a newline. */
1456 size_t worst_case_per_input_byte = line_bytes + 1;
1458 /* Keep enough room for one extra input line and an extra byte.
1459 This extra room might be needed when preparing to read EOF. */
1460 size_t size = worst_case_per_input_byte + 1;
1462 size_t i;
1464 for (i = 0; i < nfiles; i++)
1466 struct stat st;
1467 off_t file_size;
1468 size_t worst_case;
1470 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1471 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1472 : stat (files[i], &st))
1473 != 0)
1474 die (_("stat failed"), files[i]);
1476 if (S_ISREG (st.st_mode))
1477 file_size = st.st_size;
1478 else
1480 /* The file has unknown size. If the user specified a sort
1481 buffer size, use that; otherwise, guess the size. */
1482 if (sort_size)
1483 return sort_size;
1484 file_size = INPUT_FILE_SIZE_GUESS;
1487 if (! size_bound)
1489 size_bound = sort_size;
1490 if (! size_bound)
1491 size_bound = default_sort_size ();
1494 /* Add the amount of memory needed to represent the worst case
1495 where the input consists entirely of newlines followed by a
1496 single non-newline. Check for overflow. */
1497 worst_case = file_size * worst_case_per_input_byte + 1;
1498 if (file_size != worst_case / worst_case_per_input_byte
1499 || size_bound - size <= worst_case)
1500 return size_bound;
1501 size += worst_case;
1504 return size;
1507 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1508 must be at least sizeof (struct line). Allocate ALLOC bytes
1509 initially. */
1511 static void
1512 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1514 /* Ensure that the line array is properly aligned. If the desired
1515 size cannot be allocated, repeatedly halve it until allocation
1516 succeeds. The smaller allocation may hurt overall performance,
1517 but that's better than failing. */
1518 while (true)
1520 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1521 buf->buf = malloc (alloc);
1522 if (buf->buf)
1523 break;
1524 alloc /= 2;
1525 if (alloc <= line_bytes + 1)
1526 xalloc_die ();
1529 buf->line_bytes = line_bytes;
1530 buf->alloc = alloc;
1531 buf->used = buf->left = buf->nlines = 0;
1532 buf->eof = false;
1535 /* Return one past the limit of the line array. */
1537 static inline struct line *
1538 buffer_linelim (struct buffer const *buf)
1540 return (struct line *) (buf->buf + buf->alloc);
1543 /* Return a pointer to the first character of the field specified
1544 by KEY in LINE. */
1546 static char *
1547 begfield (struct line const *line, struct keyfield const *key)
1549 char *ptr = line->text, *lim = ptr + line->length - 1;
1550 size_t sword = key->sword;
1551 size_t schar = key->schar;
1553 /* The leading field separator itself is included in a field when -t
1554 is absent. */
1556 if (tab != TAB_DEFAULT)
1557 while (ptr < lim && sword--)
1559 while (ptr < lim && *ptr != tab)
1560 ++ptr;
1561 if (ptr < lim)
1562 ++ptr;
1564 else
1565 while (ptr < lim && sword--)
1567 while (ptr < lim && blanks[to_uchar (*ptr)])
1568 ++ptr;
1569 while (ptr < lim && !blanks[to_uchar (*ptr)])
1570 ++ptr;
1573 /* If we're ignoring leading blanks when computing the Start
1574 of the field, skip past them here. */
1575 if (key->skipsblanks)
1576 while (ptr < lim && blanks[to_uchar (*ptr)])
1577 ++ptr;
1579 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1580 ptr = MIN (lim, ptr + schar);
1582 return ptr;
1585 /* Return the limit of (a pointer to the first character after) the field
1586 in LINE specified by KEY. */
1588 static char *
1589 limfield (struct line const *line, struct keyfield const *key)
1591 char *ptr = line->text, *lim = ptr + line->length - 1;
1592 size_t eword = key->eword, echar = key->echar;
1594 if (echar == 0)
1595 eword++; /* Skip all of end field. */
1597 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1598 whichever comes first. If there are more than EWORD fields, leave
1599 PTR pointing at the beginning of the field having zero-based index,
1600 EWORD. If a delimiter character was specified (via -t), then that
1601 `beginning' is the first character following the delimiting TAB.
1602 Otherwise, leave PTR pointing at the first `blank' character after
1603 the preceding field. */
1604 if (tab != TAB_DEFAULT)
1605 while (ptr < lim && eword--)
1607 while (ptr < lim && *ptr != tab)
1608 ++ptr;
1609 if (ptr < lim && (eword || echar))
1610 ++ptr;
1612 else
1613 while (ptr < lim && eword--)
1615 while (ptr < lim && blanks[to_uchar (*ptr)])
1616 ++ptr;
1617 while (ptr < lim && !blanks[to_uchar (*ptr)])
1618 ++ptr;
1621 #ifdef POSIX_UNSPECIFIED
1622 /* The following block of code makes GNU sort incompatible with
1623 standard Unix sort, so it's ifdef'd out for now.
1624 The POSIX spec isn't clear on how to interpret this.
1625 FIXME: request clarification.
1627 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1628 Date: Thu, 30 May 96 12:20:41 -0400
1629 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1631 [...]I believe I've found another bug in `sort'.
1633 $ cat /tmp/sort.in
1634 a b c 2 d
1635 pq rs 1 t
1636 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1637 a b c 2 d
1638 pq rs 1 t
1639 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1640 pq rs 1 t
1641 a b c 2 d
1643 Unix sort produced the answer I expected: sort on the single character
1644 in column 7. GNU sort produced different results, because it disagrees
1645 on the interpretation of the key-end spec "M.N". Unix sort reads this
1646 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1647 "skip M-1 fields, then either N-1 characters or the rest of the current
1648 field, whichever comes first". This extra clause applies only to
1649 key-ends, not key-starts.
1652 /* Make LIM point to the end of (one byte past) the current field. */
1653 if (tab != TAB_DEFAULT)
1655 char *newlim;
1656 newlim = memchr (ptr, tab, lim - ptr);
1657 if (newlim)
1658 lim = newlim;
1660 else
1662 char *newlim;
1663 newlim = ptr;
1664 while (newlim < lim && blanks[to_uchar (*newlim)])
1665 ++newlim;
1666 while (newlim < lim && !blanks[to_uchar (*newlim)])
1667 ++newlim;
1668 lim = newlim;
1670 #endif
1672 if (echar != 0) /* We need to skip over a portion of the end field. */
1674 /* If we're ignoring leading blanks when computing the End
1675 of the field, skip past them here. */
1676 if (key->skipeblanks)
1677 while (ptr < lim && blanks[to_uchar (*ptr)])
1678 ++ptr;
1680 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1681 ptr = MIN (lim, ptr + echar);
1684 return ptr;
1687 /* Fill BUF reading from FP, moving buf->left bytes from the end
1688 of buf->buf to the beginning first. If EOF is reached and the
1689 file wasn't terminated by a newline, supply one. Set up BUF's line
1690 table too. FILE is the name of the file corresponding to FP.
1691 Return true if some input was read. */
1693 static bool
1694 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1696 struct keyfield const *key = keylist;
1697 char eol = eolchar;
1698 size_t line_bytes = buf->line_bytes;
1699 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1701 if (buf->eof)
1702 return false;
1704 if (buf->used != buf->left)
1706 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1707 buf->used = buf->left;
1708 buf->nlines = 0;
1711 while (true)
1713 char *ptr = buf->buf + buf->used;
1714 struct line *linelim = buffer_linelim (buf);
1715 struct line *line = linelim - buf->nlines;
1716 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1717 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1719 while (line_bytes + 1 < avail)
1721 /* Read as many bytes as possible, but do not read so many
1722 bytes that there might not be enough room for the
1723 corresponding line array. The worst case is when the
1724 rest of the input file consists entirely of newlines,
1725 except that the last byte is not a newline. */
1726 size_t readsize = (avail - 1) / (line_bytes + 1);
1727 size_t bytes_read = fread (ptr, 1, readsize, fp);
1728 char *ptrlim = ptr + bytes_read;
1729 char *p;
1730 avail -= bytes_read;
1732 if (bytes_read != readsize)
1734 if (ferror (fp))
1735 die (_("read failed"), file);
1736 if (feof (fp))
1738 buf->eof = true;
1739 if (buf->buf == ptrlim)
1740 return false;
1741 if (line_start != ptrlim && ptrlim[-1] != eol)
1742 *ptrlim++ = eol;
1746 /* Find and record each line in the just-read input. */
1747 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1749 /* Delimit the line with NUL. This eliminates the need to
1750 temporarily replace the last byte with NUL when calling
1751 xmemcoll(), which increases performance. */
1752 *p = '\0';
1753 ptr = p + 1;
1754 line--;
1755 line->text = line_start;
1756 line->length = ptr - line_start;
1757 mergesize = MAX (mergesize, line->length);
1758 avail -= line_bytes;
1760 if (key)
1762 /* Precompute the position of the first key for
1763 efficiency. */
1764 line->keylim = (key->eword == SIZE_MAX
1766 : limfield (line, key));
1768 if (key->sword != SIZE_MAX)
1769 line->keybeg = begfield (line, key);
1770 else
1772 if (key->skipsblanks)
1773 while (blanks[to_uchar (*line_start)])
1774 line_start++;
1775 line->keybeg = line_start;
1779 line_start = ptr;
1782 ptr = ptrlim;
1783 if (buf->eof)
1784 break;
1787 buf->used = ptr - buf->buf;
1788 buf->nlines = buffer_linelim (buf) - line;
1789 if (buf->nlines != 0)
1791 buf->left = ptr - line_start;
1792 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1793 return true;
1797 /* The current input line is too long to fit in the buffer.
1798 Double the buffer size and try again, keeping it properly
1799 aligned. */
1800 size_t line_alloc = buf->alloc / sizeof (struct line);
1801 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1802 buf->alloc = line_alloc * sizeof (struct line);
1807 /* Table that maps characters to order-of-magnitude values. */
1808 static char const unit_order[UCHAR_LIM] =
1810 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1811 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1812 /* This initializer syntax works on all C99 hosts. For now, use
1813 it only on non-ASCII hosts, to ease the pain of porting to
1814 pre-C99 ASCII hosts. */
1815 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1816 ['k']=1,
1817 #else
1818 /* Generate the following table with this command:
1819 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1820 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1821 |fmt */
1822 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1823 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1824 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1825 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1826 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1827 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1828 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1829 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1830 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1832 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1833 #endif
1836 /* Return an integer that represents the order of magnitude of the
1837 unit following the number. The number may contain thousands
1838 separators and a decimal point, but it may not contain leading blanks.
1839 Negative numbers get negative orders; zero numbers have a zero order. */
1841 static int
1842 find_unit_order (char const *number)
1844 bool minus_sign = (*number == '-');
1845 char const *p = number + minus_sign;
1846 int nonzero = 0;
1847 unsigned char ch;
1849 /* Scan to end of number.
1850 Decimals or separators not followed by digits stop the scan.
1851 Numbers ending in decimals or separators are thus considered
1852 to be lacking in units.
1853 FIXME: add support for multibyte thousands_sep and decimal_point. */
1857 while (ISDIGIT (ch = *p++))
1858 nonzero |= ch - '0';
1860 while (ch == thousands_sep);
1862 if (ch == decimal_point)
1863 while (ISDIGIT (ch = *p++))
1864 nonzero |= ch - '0';
1866 if (nonzero)
1868 int order = unit_order[ch];
1869 return (minus_sign ? -order : order);
1871 else
1872 return 0;
1875 /* Compare numbers A and B ending in units with SI or IEC prefixes
1876 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1878 static int
1879 human_numcompare (char const *a, char const *b)
1881 while (blanks[to_uchar (*a)])
1882 a++;
1883 while (blanks[to_uchar (*b)])
1884 b++;
1886 int diff = find_unit_order (a) - find_unit_order (b);
1887 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1890 /* Compare strings A and B as numbers without explicitly converting them to
1891 machine numbers. Comparatively slow for short strings, but asymptotically
1892 hideously fast. */
1894 static int
1895 numcompare (char const *a, char const *b)
1897 while (blanks[to_uchar (*a)])
1898 a++;
1899 while (blanks[to_uchar (*b)])
1900 b++;
1902 return strnumcmp (a, b, decimal_point, thousands_sep);
1905 static int
1906 general_numcompare (char const *sa, char const *sb)
1908 /* FIXME: maybe add option to try expensive FP conversion
1909 only if A and B can't be compared more cheaply/accurately. */
1911 char *ea;
1912 char *eb;
1913 long_double a = strtold (sa, &ea);
1914 long_double b = strtold (sb, &eb);
1916 /* Put conversion errors at the start of the collating sequence. */
1917 if (sa == ea)
1918 return sb == eb ? 0 : -1;
1919 if (sb == eb)
1920 return 1;
1922 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1923 conversion errors but before numbers; sort them by internal
1924 bit-pattern, for lack of a more portable alternative. */
1925 return (a < b ? -1
1926 : a > b ? 1
1927 : a == b ? 0
1928 : b == b ? -1
1929 : a == a ? 1
1930 : memcmp (&a, &b, sizeof a));
1933 /* Return an integer in 1..12 of the month name MONTH.
1934 Return 0 if the name in S is not recognized. */
1936 static int
1937 getmonth (char const *month, char **ea)
1939 size_t lo = 0;
1940 size_t hi = MONTHS_PER_YEAR;
1942 while (blanks[to_uchar (*month)])
1943 month++;
1947 size_t ix = (lo + hi) / 2;
1948 char const *m = month;
1949 char const *n = monthtab[ix].name;
1951 for (;; m++, n++)
1953 if (!*n)
1955 if (ea)
1956 *ea = (char *) m;
1957 return monthtab[ix].val;
1959 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
1961 hi = ix;
1962 break;
1964 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
1966 lo = ix + 1;
1967 break;
1971 while (lo < hi);
1973 return 0;
1976 /* A randomly chosen MD5 state, used for random comparison. */
1977 static struct md5_ctx random_md5_state;
1979 /* Initialize the randomly chosen MD5 state. */
1981 static void
1982 random_md5_state_init (char const *random_source)
1984 unsigned char buf[MD5_DIGEST_SIZE];
1985 struct randread_source *r = randread_new (random_source, sizeof buf);
1986 if (! r)
1987 die (_("open failed"), random_source);
1988 randread (r, buf, sizeof buf);
1989 if (randread_free (r) != 0)
1990 die (_("close failed"), random_source);
1991 md5_init_ctx (&random_md5_state);
1992 md5_process_bytes (buf, sizeof buf, &random_md5_state);
1995 /* This is like strxfrm, except it reports any error and exits. */
1997 static size_t
1998 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2000 errno = 0;
2001 size_t translated_size = strxfrm (dest, src, destsize);
2003 if (errno)
2005 error (0, errno, _("string transformation failed"));
2006 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2007 error (SORT_FAILURE, 0,
2008 _("the untransformed string was %s"),
2009 quotearg_n_style (0, locale_quoting_style, src));
2012 return translated_size;
2015 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2016 using one or more random hash functions. TEXTA[LENA] and
2017 TEXTB[LENB] must be zero. */
2019 static int
2020 compare_random (char *restrict texta, size_t lena,
2021 char *restrict textb, size_t lenb)
2023 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2024 data. This is used to break ties if there is an checksum
2025 collision, and this is good enough given the astronomically low
2026 probability of a collision. */
2027 int xfrm_diff = 0;
2029 char stackbuf[4000];
2030 char *buf = stackbuf;
2031 size_t bufsize = sizeof stackbuf;
2032 void *allocated = NULL;
2033 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2034 struct md5_ctx s[2];
2035 s[0] = s[1] = random_md5_state;
2037 if (hard_LC_COLLATE)
2039 char const *lima = texta + lena;
2040 char const *limb = textb + lenb;
2042 while (true)
2044 /* Transform the text into the basis of comparison, so that byte
2045 strings that would otherwise considered to be equal are
2046 considered equal here even if their bytes differ.
2048 Each time through this loop, transform one
2049 null-terminated string's worth from TEXTA or from TEXTB
2050 or both. That way, there's no need to store the
2051 transformation of the whole line, if it contains many
2052 null-terminated strings. */
2054 /* Store the transformed data into a big-enough buffer. */
2056 /* A 3X size guess avoids the overhead of calling strxfrm
2057 twice on typical implementations. Don't worry about
2058 size_t overflow, as the guess need not be correct. */
2059 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2060 if (bufsize < guess_bufsize)
2062 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2063 free (allocated);
2064 buf = allocated = malloc (bufsize);
2065 if (! buf)
2067 buf = stackbuf;
2068 bufsize = sizeof stackbuf;
2072 size_t sizea =
2073 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2074 bool a_fits = sizea <= bufsize;
2075 size_t sizeb =
2076 (textb < limb
2077 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2078 (a_fits ? bufsize - sizea : 0))
2079 + 1)
2080 : 0);
2082 if (! (a_fits && sizea + sizeb <= bufsize))
2084 bufsize = sizea + sizeb;
2085 if (bufsize < SIZE_MAX / 3)
2086 bufsize = bufsize * 3 / 2;
2087 free (allocated);
2088 buf = allocated = xmalloc (bufsize);
2089 if (texta < lima)
2090 strxfrm (buf, texta, sizea);
2091 if (textb < limb)
2092 strxfrm (buf + sizea, textb, sizeb);
2095 /* Advance past NULs to the next part of each input string,
2096 exiting the loop if both strings are exhausted. When
2097 exiting the loop, prepare to finish off the tiebreaker
2098 comparison properly. */
2099 if (texta < lima)
2100 texta += strlen (texta) + 1;
2101 if (textb < limb)
2102 textb += strlen (textb) + 1;
2103 if (! (texta < lima || textb < limb))
2105 lena = sizea; texta = buf;
2106 lenb = sizeb; textb = buf + sizea;
2107 break;
2110 /* Accumulate the transformed data in the corresponding
2111 checksums. */
2112 md5_process_bytes (buf, sizea, &s[0]);
2113 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2115 /* Update the tiebreaker comparison of the transformed data. */
2116 if (! xfrm_diff)
2118 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2119 if (! xfrm_diff)
2120 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2125 /* Compute and compare the checksums. */
2126 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2127 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2128 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2130 /* Fall back on the tiebreaker if the checksums collide. */
2131 if (! diff)
2133 if (! xfrm_diff)
2135 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2136 if (! xfrm_diff)
2137 xfrm_diff = (lena > lenb) - (lena < lenb);
2140 diff = xfrm_diff;
2143 free (allocated);
2145 return diff;
2148 /* Return the printable width of the block of memory starting at
2149 TEXT and ending just before LIM, counting each tab as one byte.
2150 FIXME: Should we generally be counting non printable chars? */
2152 static size_t
2153 debug_width (char const *text, char const *lim)
2155 size_t width = mbsnwidth (text, lim - text, 0);
2156 while (text < lim)
2157 width += (*text++ == '\t');
2158 return width;
2161 /* For debug mode, "underline" a key at the
2162 specified offset and screen width. */
2164 static void
2165 mark_key (size_t offset, size_t width)
2167 while (offset--)
2168 putchar (' ');
2170 if (!width)
2171 printf (_("^ no match for key\n"));
2172 else
2175 putchar ('_');
2176 while (--width);
2178 putchar ('\n');
2182 /* Return true if KEY is a numeric key. */
2184 static inline bool
2185 key_numeric (struct keyfield const *key)
2187 return key->numeric || key->general_numeric || key->human_numeric;
2190 /* For LINE, output a debugging line that underlines KEY in LINE.
2191 If KEY is null, underline the whole line. */
2193 static void
2194 debug_key (struct line const *line, struct keyfield const *key)
2196 char *text = line->text;
2197 char *beg = text;
2198 char *lim = text + line->length - 1;
2200 if (key)
2202 if (key->sword != SIZE_MAX)
2203 beg = begfield (line, key);
2204 if (key->eword != SIZE_MAX)
2205 lim = limfield (line, key);
2207 if (key->skipsblanks || key->month || key_numeric (key))
2209 char saved = *lim;
2210 *lim = '\0';
2212 while (blanks[to_uchar (*beg)])
2213 beg++;
2215 char *tighter_lim = beg;
2217 if (lim < beg)
2218 tighter_lim = lim;
2219 else if (key->month)
2220 getmonth (beg, &tighter_lim);
2221 else if (key->general_numeric)
2222 ignore_value (strtold (beg, &tighter_lim));
2223 else if (key->numeric || key->human_numeric)
2225 char *p = beg + (beg < lim && *beg == '-');
2226 bool found_digit = false;
2227 unsigned char ch;
2231 while (ISDIGIT (ch = *p++))
2232 found_digit = true;
2234 while (ch == thousands_sep);
2236 if (ch == decimal_point)
2237 while (ISDIGIT (ch = *p++))
2238 found_digit = true;
2240 if (found_digit)
2241 tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
2243 else
2244 tighter_lim = lim;
2246 *lim = saved;
2247 lim = tighter_lim;
2251 size_t offset = debug_width (text, beg);
2252 size_t width = debug_width (beg, lim);
2253 mark_key (offset, width);
2256 /* Debug LINE by underlining its keys. */
2258 static void
2259 debug_line (struct line const *line)
2261 struct keyfield const *key = keylist;
2264 debug_key (line, key);
2265 while (key && ((key = key->next) || ! (unique || stable)));
2268 /* Return whether sorting options specified for key. */
2270 static bool
2271 default_key_compare (struct keyfield const *key)
2273 return ! (key->ignore
2274 || key->translate
2275 || key->skipsblanks
2276 || key->skipeblanks
2277 || key_numeric (key)
2278 || key->month
2279 || key->version
2280 || key->random
2281 /* || key->reverse */
2285 /* Convert a key to the short options used to specify it. */
2287 static void
2288 key_to_opts (struct keyfield const *key, char *opts)
2290 if (key->skipsblanks || key->skipeblanks)
2291 *opts++ = 'b';/* either disables global -b */
2292 if (key->ignore == nondictionary)
2293 *opts++ = 'd';
2294 if (key->translate)
2295 *opts++ = 'f';
2296 if (key->general_numeric)
2297 *opts++ = 'g';
2298 if (key->human_numeric)
2299 *opts++ = 'h';
2300 if (key->ignore == nonprinting)
2301 *opts++ = 'i';
2302 if (key->month)
2303 *opts++ = 'M';
2304 if (key->numeric)
2305 *opts++ = 'n';
2306 if (key->random)
2307 *opts++ = 'R';
2308 if (key->reverse)
2309 *opts++ = 'r';
2310 if (key->version)
2311 *opts++ = 'V';
2312 *opts = '\0';
2315 /* Output data independent key warnings to stderr. */
2317 static void
2318 key_warnings (struct keyfield const *gkey, bool gkey_only)
2320 struct keyfield const *key;
2321 struct keyfield ugkey = *gkey;
2322 unsigned long keynum = 1;
2324 for (key = keylist; key; key = key->next, keynum++)
2326 if (key->obsolete_used)
2328 size_t sword = key->sword;
2329 size_t eword = key->eword;
2330 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2331 /* obsolescent syntax +A.x -B.y is equivalent to:
2332 -k A+1.x+1,B.y (when y = 0)
2333 -k A+1.x+1,B+1.y (when y > 0) */
2334 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2335 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2336 char *po = obuf;
2337 char *pn = nbuf;
2339 if (sword == SIZE_MAX)
2340 sword++;
2342 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2343 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2344 if (key->eword != SIZE_MAX)
2346 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2347 stpcpy (stpcpy (pn, ","),
2348 umaxtostr (eword + 1
2349 + (key->echar == SIZE_MAX), tmp));
2351 error (0, 0, _("obsolescent key `%s' used; consider `%s' instead"),
2352 obuf, nbuf);
2355 /* Warn about field specs that will never match. */
2356 if (key->sword != SIZE_MAX && key->eword < key->sword)
2357 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2359 /* Warn about significant leading blanks. */
2360 bool implicit_skip = key_numeric (key) || key->month;
2361 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2362 && !(key->schar || key->echar);
2363 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2364 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2365 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2366 || (!key->skipsblanks && key->schar)
2367 || (!key->skipeblanks && key->echar)))
2368 error (0, 0, _("leading blanks are significant in key %lu; "
2369 "consider also specifying `b'"), keynum);
2371 /* Warn about numeric comparisons spanning fields,
2372 as field delimiters could be interpreted as part
2373 of the number (maybe only in other locales). */
2374 if (!gkey_only && key_numeric (key))
2376 size_t sword = key->sword + 1;
2377 size_t eword = key->eword + 1;
2378 if (!sword)
2379 sword++;
2380 if (sword != eword)
2381 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2382 keynum);
2385 /* Flag global options not copied or specified in any key. */
2386 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2387 ugkey.ignore = NULL;
2388 if (ugkey.translate && (ugkey.translate == key->translate))
2389 ugkey.translate = NULL;
2390 ugkey.skipsblanks &= !key->skipsblanks;
2391 ugkey.skipeblanks &= !key->skipeblanks;
2392 ugkey.month &= !key->month;
2393 ugkey.numeric &= !key->numeric;
2394 ugkey.general_numeric &= !key->general_numeric;
2395 ugkey.human_numeric &= !key->human_numeric;
2396 ugkey.random &= !key->random;
2397 ugkey.version &= !key->version;
2398 ugkey.reverse &= !key->reverse;
2401 /* Warn about ignored global options flagged above.
2402 Note if gkey is the only one in the list, all flags are cleared. */
2403 if (!default_key_compare (&ugkey)
2404 || (ugkey.reverse && (stable || unique) && keylist))
2406 bool ugkey_reverse = ugkey.reverse;
2407 if (!(stable || unique))
2408 ugkey.reverse = false;
2409 /* The following is too big, but guaranteed to be "big enough". */
2410 char opts[sizeof short_options];
2411 key_to_opts (&ugkey, opts);
2412 error (0, 0,
2413 ngettext ("option `-%s' is ignored",
2414 "options `-%s' are ignored",
2415 select_plural (strlen (opts))), opts);
2416 ugkey.reverse = ugkey_reverse;
2418 if (ugkey.reverse && !(stable || unique) && keylist)
2419 error (0, 0, _("option `-r' only applies to last-resort comparison"));
2422 /* Compare two lines A and B trying every key in sequence until there
2423 are no more keys or a difference is found. */
2425 static int
2426 keycompare (struct line const *a, struct line const *b)
2428 struct keyfield *key = keylist;
2430 /* For the first iteration only, the key positions have been
2431 precomputed for us. */
2432 char *texta = a->keybeg;
2433 char *textb = b->keybeg;
2434 char *lima = a->keylim;
2435 char *limb = b->keylim;
2437 int diff;
2439 while (true)
2441 char const *translate = key->translate;
2442 bool const *ignore = key->ignore;
2444 /* Treat field ends before field starts as empty fields. */
2445 lima = MAX (texta, lima);
2446 limb = MAX (textb, limb);
2448 /* Find the lengths. */
2449 size_t lena = lima - texta;
2450 size_t lenb = limb - textb;
2452 if (hard_LC_COLLATE || key_numeric (key)
2453 || key->month || key->random || key->version)
2455 char *ta;
2456 char *tb;
2457 size_t tlena;
2458 size_t tlenb;
2460 char enda IF_LINT (= 0);
2461 char endb IF_LINT (= 0);
2462 void *allocated IF_LINT (= NULL);
2463 char stackbuf[4000];
2465 if (ignore || translate)
2467 /* Compute with copies of the keys, which are the result of
2468 translating or ignoring characters, and which need their
2469 own storage. */
2471 size_t i;
2473 /* Allocate space for copies. */
2474 size_t size = lena + 1 + lenb + 1;
2475 if (size <= sizeof stackbuf)
2476 ta = stackbuf, allocated = NULL;
2477 else
2478 ta = allocated = xmalloc (size);
2479 tb = ta + lena + 1;
2481 /* Put into each copy a version of the key in which the
2482 requested characters are ignored or translated. */
2483 for (tlena = i = 0; i < lena; i++)
2484 if (! (ignore && ignore[to_uchar (texta[i])]))
2485 ta[tlena++] = (translate
2486 ? translate[to_uchar (texta[i])]
2487 : texta[i]);
2488 ta[tlena] = '\0';
2490 for (tlenb = i = 0; i < lenb; i++)
2491 if (! (ignore && ignore[to_uchar (textb[i])]))
2492 tb[tlenb++] = (translate
2493 ? translate[to_uchar (textb[i])]
2494 : textb[i]);
2495 tb[tlenb] = '\0';
2497 else
2499 /* Use the keys in-place, temporarily null-terminated. */
2500 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2501 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2504 if (key->numeric)
2505 diff = numcompare (ta, tb);
2506 else if (key->general_numeric)
2507 diff = general_numcompare (ta, tb);
2508 else if (key->human_numeric)
2509 diff = human_numcompare (ta, tb);
2510 else if (key->month)
2511 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2512 else if (key->random)
2513 diff = compare_random (ta, tlena, tb, tlenb);
2514 else if (key->version)
2515 diff = filevercmp (ta, tb);
2516 else
2518 /* Locale-dependent string sorting. This is slower than
2519 C-locale sorting, which is implemented below. */
2520 if (tlena == 0)
2521 diff = - NONZERO (tlenb);
2522 else if (tlenb == 0)
2523 diff = 1;
2524 else
2525 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2528 if (ignore || translate)
2529 free (allocated);
2530 else
2532 ta[tlena] = enda;
2533 tb[tlenb] = endb;
2536 else if (ignore)
2538 #define CMP_WITH_IGNORE(A, B) \
2539 do \
2541 while (true) \
2543 while (texta < lima && ignore[to_uchar (*texta)]) \
2544 ++texta; \
2545 while (textb < limb && ignore[to_uchar (*textb)]) \
2546 ++textb; \
2547 if (! (texta < lima && textb < limb)) \
2548 break; \
2549 diff = to_uchar (A) - to_uchar (B); \
2550 if (diff) \
2551 goto not_equal; \
2552 ++texta; \
2553 ++textb; \
2556 diff = (texta < lima) - (textb < limb); \
2558 while (0)
2560 if (translate)
2561 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2562 translate[to_uchar (*textb)]);
2563 else
2564 CMP_WITH_IGNORE (*texta, *textb);
2566 else if (lena == 0)
2567 diff = - NONZERO (lenb);
2568 else if (lenb == 0)
2569 goto greater;
2570 else
2572 if (translate)
2574 while (texta < lima && textb < limb)
2576 diff = (to_uchar (translate[to_uchar (*texta++)])
2577 - to_uchar (translate[to_uchar (*textb++)]));
2578 if (diff)
2579 goto not_equal;
2582 else
2584 diff = memcmp (texta, textb, MIN (lena, lenb));
2585 if (diff)
2586 goto not_equal;
2588 diff = lena < lenb ? -1 : lena != lenb;
2591 if (diff)
2592 goto not_equal;
2594 key = key->next;
2595 if (! key)
2596 break;
2598 /* Find the beginning and limit of the next field. */
2599 if (key->eword != SIZE_MAX)
2600 lima = limfield (a, key), limb = limfield (b, key);
2601 else
2602 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2604 if (key->sword != SIZE_MAX)
2605 texta = begfield (a, key), textb = begfield (b, key);
2606 else
2608 texta = a->text, textb = b->text;
2609 if (key->skipsblanks)
2611 while (texta < lima && blanks[to_uchar (*texta)])
2612 ++texta;
2613 while (textb < limb && blanks[to_uchar (*textb)])
2614 ++textb;
2619 return 0;
2621 greater:
2622 diff = 1;
2623 not_equal:
2624 return key->reverse ? -diff : diff;
2627 /* Compare two lines A and B, returning negative, zero, or positive
2628 depending on whether A compares less than, equal to, or greater than B. */
2630 static int
2631 compare (struct line const *a, struct line const *b)
2633 int diff;
2634 size_t alen, blen;
2636 /* First try to compare on the specified keys (if any).
2637 The only two cases with no key at all are unadorned sort,
2638 and unadorned sort -r. */
2639 if (keylist)
2641 diff = keycompare (a, b);
2642 if (diff || unique || stable)
2643 return diff;
2646 /* If the keys all compare equal (or no keys were specified)
2647 fall through to the default comparison. */
2648 alen = a->length - 1, blen = b->length - 1;
2650 if (alen == 0)
2651 diff = - NONZERO (blen);
2652 else if (blen == 0)
2653 diff = 1;
2654 else if (hard_LC_COLLATE)
2656 /* Note xmemcoll0 is a performance enhancement as
2657 it will not unconditionally write '\0' after the
2658 passed in buffers, which was seen to give around
2659 a 3% increase in performance for short lines. */
2660 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2662 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2663 diff = alen < blen ? -1 : alen != blen;
2665 return reverse ? -diff : diff;
2668 /* Write LINE to output stream FP; the output file's name is
2669 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2670 otherwise. If debugging is enabled and FP is standard output,
2671 append some debugging information. */
2673 static void
2674 write_line (struct line const *line, FILE *fp, char const *output_file)
2676 char *buf = line->text;
2677 size_t n_bytes = line->length;
2678 char *ebuf = buf + n_bytes;
2680 if (!output_file && debug)
2682 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2683 char const *c = buf;
2685 while (c < ebuf)
2687 char wc = *c++;
2688 if (wc == '\t')
2689 wc = '>';
2690 else if (c == ebuf)
2691 wc = '\n';
2692 if (fputc (wc, fp) == EOF)
2693 die (_("write failed"), output_file);
2696 debug_line (line);
2698 else
2700 ebuf[-1] = eolchar;
2701 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2702 die (_("write failed"), output_file);
2703 ebuf[-1] = '\0';
2707 /* Check that the lines read from FILE_NAME come in order. Return
2708 true if they are in order. If CHECKONLY == 'c', also print a
2709 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2710 they are not in order. */
2712 static bool
2713 check (char const *file_name, char checkonly)
2715 FILE *fp = xfopen (file_name, "r");
2716 struct buffer buf; /* Input buffer. */
2717 struct line temp; /* Copy of previous line. */
2718 size_t alloc = 0;
2719 uintmax_t line_number = 0;
2720 struct keyfield const *key = keylist;
2721 bool nonunique = ! unique;
2722 bool ordered = true;
2724 initbuf (&buf, sizeof (struct line),
2725 MAX (merge_buffer_size, sort_size));
2726 temp.text = NULL;
2728 while (fillbuf (&buf, fp, file_name))
2730 struct line const *line = buffer_linelim (&buf);
2731 struct line const *linebase = line - buf.nlines;
2733 /* Make sure the line saved from the old buffer contents is
2734 less than or equal to the first line of the new buffer. */
2735 if (alloc && nonunique <= compare (&temp, line - 1))
2737 found_disorder:
2739 if (checkonly == 'c')
2741 struct line const *disorder_line = line - 1;
2742 uintmax_t disorder_line_number =
2743 buffer_linelim (&buf) - disorder_line + line_number;
2744 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2745 fprintf (stderr, _("%s: %s:%s: disorder: "),
2746 program_name, file_name,
2747 umaxtostr (disorder_line_number, hr_buf));
2748 write_line (disorder_line, stderr, _("standard error"));
2751 ordered = false;
2752 break;
2756 /* Compare each line in the buffer with its successor. */
2757 while (linebase < --line)
2758 if (nonunique <= compare (line, line - 1))
2759 goto found_disorder;
2761 line_number += buf.nlines;
2763 /* Save the last line of the buffer. */
2764 if (alloc < line->length)
2768 alloc *= 2;
2769 if (! alloc)
2771 alloc = line->length;
2772 break;
2775 while (alloc < line->length);
2777 free (temp.text);
2778 temp.text = xmalloc (alloc);
2780 memcpy (temp.text, line->text, line->length);
2781 temp.length = line->length;
2782 if (key)
2784 temp.keybeg = temp.text + (line->keybeg - line->text);
2785 temp.keylim = temp.text + (line->keylim - line->text);
2789 xfclose (fp, file_name);
2790 free (buf.buf);
2791 free (temp.text);
2792 return ordered;
2795 /* Open FILES (there are NFILES of them) and store the resulting array
2796 of stream pointers into (*PFPS). Allocate the array. Return the
2797 number of successfully opened files, setting errno if this value is
2798 less than NFILES. */
2800 static size_t
2801 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2803 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2804 int i;
2806 /* Open as many input files as we can. */
2807 for (i = 0; i < nfiles; i++)
2809 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2810 ? open_temp (files[i].temp)
2811 : stream_open (files[i].name, "r"));
2812 if (!fps[i])
2813 break;
2816 return i;
2819 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2820 files (all of which are at the start of the FILES array), and
2821 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2822 FPS is the vector of open stream corresponding to the files.
2823 Close input and output streams before returning.
2824 OUTPUT_FILE gives the name of the output file. If it is NULL,
2825 the output file is standard output. */
2827 static void
2828 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2829 FILE *ofp, char const *output_file, FILE **fps)
2831 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2832 /* Input buffers for each file. */
2833 struct line saved; /* Saved line storage for unique check. */
2834 struct line const *savedline = NULL;
2835 /* &saved if there is a saved line. */
2836 size_t savealloc = 0; /* Size allocated for the saved line. */
2837 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2838 /* Current line in each line table. */
2839 struct line const **base = xnmalloc (nfiles, sizeof *base);
2840 /* Base of each line table. */
2841 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2842 /* Table representing a permutation of fps,
2843 such that cur[ord[0]] is the smallest line
2844 and will be next output. */
2845 size_t i;
2846 size_t j;
2847 size_t t;
2848 struct keyfield const *key = keylist;
2849 saved.text = NULL;
2851 /* Read initial lines from each input file. */
2852 for (i = 0; i < nfiles; )
2854 initbuf (&buffer[i], sizeof (struct line),
2855 MAX (merge_buffer_size, sort_size / nfiles));
2856 if (fillbuf (&buffer[i], fps[i], files[i].name))
2858 struct line const *linelim = buffer_linelim (&buffer[i]);
2859 cur[i] = linelim - 1;
2860 base[i] = linelim - buffer[i].nlines;
2861 i++;
2863 else
2865 /* fps[i] is empty; eliminate it from future consideration. */
2866 xfclose (fps[i], files[i].name);
2867 if (i < ntemps)
2869 ntemps--;
2870 zaptemp (files[i].name);
2872 free (buffer[i].buf);
2873 --nfiles;
2874 for (j = i; j < nfiles; ++j)
2876 files[j] = files[j + 1];
2877 fps[j] = fps[j + 1];
2882 /* Set up the ord table according to comparisons among input lines.
2883 Since this only reorders two items if one is strictly greater than
2884 the other, it is stable. */
2885 for (i = 0; i < nfiles; ++i)
2886 ord[i] = i;
2887 for (i = 1; i < nfiles; ++i)
2888 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2889 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2891 /* Repeatedly output the smallest line until no input remains. */
2892 while (nfiles)
2894 struct line const *smallest = cur[ord[0]];
2896 /* If uniquified output is turned on, output only the first of
2897 an identical series of lines. */
2898 if (unique)
2900 if (savedline && compare (savedline, smallest))
2902 savedline = NULL;
2903 write_line (&saved, ofp, output_file);
2905 if (!savedline)
2907 savedline = &saved;
2908 if (savealloc < smallest->length)
2911 if (! savealloc)
2913 savealloc = smallest->length;
2914 break;
2916 while ((savealloc *= 2) < smallest->length);
2918 free (saved.text);
2919 saved.text = xmalloc (savealloc);
2921 saved.length = smallest->length;
2922 memcpy (saved.text, smallest->text, saved.length);
2923 if (key)
2925 saved.keybeg =
2926 saved.text + (smallest->keybeg - smallest->text);
2927 saved.keylim =
2928 saved.text + (smallest->keylim - smallest->text);
2932 else
2933 write_line (smallest, ofp, output_file);
2935 /* Check if we need to read more lines into core. */
2936 if (base[ord[0]] < smallest)
2937 cur[ord[0]] = smallest - 1;
2938 else
2940 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2942 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2943 cur[ord[0]] = linelim - 1;
2944 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2946 else
2948 /* We reached EOF on fps[ord[0]]. */
2949 for (i = 1; i < nfiles; ++i)
2950 if (ord[i] > ord[0])
2951 --ord[i];
2952 --nfiles;
2953 xfclose (fps[ord[0]], files[ord[0]].name);
2954 if (ord[0] < ntemps)
2956 ntemps--;
2957 zaptemp (files[ord[0]].name);
2959 free (buffer[ord[0]].buf);
2960 for (i = ord[0]; i < nfiles; ++i)
2962 fps[i] = fps[i + 1];
2963 files[i] = files[i + 1];
2964 buffer[i] = buffer[i + 1];
2965 cur[i] = cur[i + 1];
2966 base[i] = base[i + 1];
2968 for (i = 0; i < nfiles; ++i)
2969 ord[i] = ord[i + 1];
2970 continue;
2974 /* The new line just read in may be larger than other lines
2975 already in main memory; push it back in the queue until we
2976 encounter a line larger than it. Optimize for the common
2977 case where the new line is smallest. */
2979 size_t lo = 1;
2980 size_t hi = nfiles;
2981 size_t probe = lo;
2982 size_t ord0 = ord[0];
2983 size_t count_of_smaller_lines;
2985 while (lo < hi)
2987 int cmp = compare (cur[ord0], cur[ord[probe]]);
2988 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2989 hi = probe;
2990 else
2991 lo = probe + 1;
2992 probe = (lo + hi) / 2;
2995 count_of_smaller_lines = lo - 1;
2996 for (j = 0; j < count_of_smaller_lines; j++)
2997 ord[j] = ord[j + 1];
2998 ord[count_of_smaller_lines] = ord0;
3002 if (unique && savedline)
3004 write_line (&saved, ofp, output_file);
3005 free (saved.text);
3008 xfclose (ofp, output_file);
3009 free (fps);
3010 free (buffer);
3011 free (ord);
3012 free (base);
3013 free (cur);
3016 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3017 files (all of which are at the start of the FILES array), and
3018 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3019 Close input and output files before returning.
3020 OUTPUT_FILE gives the name of the output file.
3022 Return the number of files successfully merged. This number can be
3023 less than NFILES if we ran low on file descriptors, but in this
3024 case it is never less than 2. */
3026 static size_t
3027 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3028 FILE *ofp, char const *output_file)
3030 FILE **fps;
3031 size_t nopened = open_input_files (files, nfiles, &fps);
3032 if (nopened < nfiles && nopened < 2)
3033 die (_("open failed"), files[nopened].name);
3034 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3035 return nopened;
3038 /* Merge into T (of size NLINES) the two sorted arrays of lines
3039 LO (with NLINES / 2 members), and
3040 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3041 T and LO point just past their respective arrays, and the arrays
3042 are in reverse order. NLINES must be at least 2. */
3044 static void
3045 mergelines (struct line *restrict t, size_t nlines,
3046 struct line const *restrict lo)
3048 size_t nlo = nlines / 2;
3049 size_t nhi = nlines - nlo;
3050 struct line *hi = t - nlo;
3052 while (true)
3053 if (compare (lo - 1, hi - 1) <= 0)
3055 *--t = *--lo;
3056 if (! --nlo)
3058 /* HI must equal T now, and there is no need to copy from
3059 HI to T. */
3060 return;
3063 else
3065 *--t = *--hi;
3066 if (! --nhi)
3069 *--t = *--lo;
3070 while (--nlo);
3072 return;
3077 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3078 Do this all within one thread. NLINES must be at least 2.
3079 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3080 Otherwise the sort is in-place and TEMP is half-sized.
3081 The input and output arrays are in reverse order, and LINES and
3082 TEMP point just past the end of their respective arrays.
3084 Use a recursive divide-and-conquer algorithm, in the style
3085 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3086 the optimization suggested by exercise 5.2.4-10; this requires room
3087 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3088 writes that this memory optimization was originally published by
3089 D. A. Bell, Comp J. 1 (1958), 75. */
3091 static void
3092 sequential_sort (struct line *restrict lines, size_t nlines,
3093 struct line *restrict temp, bool to_temp)
3095 if (nlines == 2)
3097 /* Declare `swap' as int, not bool, to work around a bug
3098 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3099 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3100 int swap = (0 < compare (&lines[-1], &lines[-2]));
3101 if (to_temp)
3103 temp[-1] = lines[-1 - swap];
3104 temp[-2] = lines[-2 + swap];
3106 else if (swap)
3108 temp[-1] = lines[-1];
3109 lines[-1] = lines[-2];
3110 lines[-2] = temp[-1];
3113 else
3115 size_t nlo = nlines / 2;
3116 size_t nhi = nlines - nlo;
3117 struct line *lo = lines;
3118 struct line *hi = lines - nlo;
3120 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3121 if (1 < nlo)
3122 sequential_sort (lo, nlo, temp, !to_temp);
3123 else if (!to_temp)
3124 temp[-1] = lo[-1];
3126 struct line *dest;
3127 struct line const *sorted_lo;
3128 if (to_temp)
3130 dest = temp;
3131 sorted_lo = lines;
3133 else
3135 dest = lines;
3136 sorted_lo = temp;
3138 mergelines (dest, nlines, sorted_lo);
3142 static struct merge_node *init_node (struct merge_node *restrict,
3143 struct merge_node *restrict,
3144 struct line *, size_t, size_t, bool);
3147 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3148 lines, with destination DEST. */
3149 static struct merge_node *
3150 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3152 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3154 struct merge_node *root = merge_tree;
3155 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3156 root->dest = NULL;
3157 root->nlo = root->nhi = nlines;
3158 root->parent = NULL;
3159 root->level = MERGE_END;
3160 root->queued = false;
3161 pthread_mutex_init (&root->lock, NULL);
3163 init_node (root, root + 1, dest, nthreads, nlines, false);
3164 return merge_tree;
3167 /* Destroy the merge tree. */
3168 static void
3169 merge_tree_destroy (struct merge_node *merge_tree)
3171 free (merge_tree);
3174 /* Initialize a merge tree node and its descendants. The node's
3175 parent is PARENT. The node and its descendants are taken from the
3176 array of nodes NODE_POOL. Their destination starts at DEST; they
3177 will consume NTHREADS threads. The total number of sort lines is
3178 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3179 its parent. */
3181 static struct merge_node *
3182 init_node (struct merge_node *restrict parent,
3183 struct merge_node *restrict node_pool,
3184 struct line *dest, size_t nthreads,
3185 size_t total_lines, bool is_lo_child)
3187 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3188 size_t nlo = nlines / 2;
3189 size_t nhi = nlines - nlo;
3190 struct line *lo = dest - total_lines;
3191 struct line *hi = lo - nlo;
3192 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3194 struct merge_node *node = node_pool++;
3195 node->lo = node->end_lo = lo;
3196 node->hi = node->end_hi = hi;
3197 node->dest = parent_end;
3198 node->nlo = nlo;
3199 node->nhi = nhi;
3200 node->parent = parent;
3201 node->level = parent->level + 1;
3202 node->queued = false;
3203 pthread_mutex_init (&node->lock, NULL);
3205 if (nthreads > 1)
3207 size_t lo_threads = nthreads / 2;
3208 size_t hi_threads = nthreads - lo_threads;
3209 node->lo_child = node_pool;
3210 node_pool = init_node (node, node_pool, lo, lo_threads,
3211 total_lines, true);
3212 node->hi_child = node_pool;
3213 node_pool = init_node (node, node_pool, hi, hi_threads,
3214 total_lines, false);
3216 else
3218 node->lo_child = NULL;
3219 node->hi_child = NULL;
3221 return node_pool;
3225 /* Compare two merge nodes A and B for priority. */
3227 static int
3228 compare_nodes (void const *a, void const *b)
3230 struct merge_node const *nodea = a;
3231 struct merge_node const *nodeb = b;
3232 if (nodea->level == nodeb->level)
3233 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3234 return nodea->level < nodeb->level;
3237 /* Lock a merge tree NODE. */
3239 static inline void
3240 lock_node (struct merge_node *node)
3242 pthread_mutex_lock (&node->lock);
3245 /* Unlock a merge tree NODE. */
3247 static inline void
3248 unlock_node (struct merge_node *node)
3250 pthread_mutex_unlock (&node->lock);
3253 /* Destroy merge QUEUE. */
3255 static void
3256 queue_destroy (struct merge_node_queue *queue)
3258 heap_free (queue->priority_queue);
3259 pthread_cond_destroy (&queue->cond);
3260 pthread_mutex_destroy (&queue->mutex);
3263 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3264 NTHREADS threads. */
3266 static void
3267 queue_init (struct merge_node_queue *queue, size_t nthreads)
3269 /* Though it's highly unlikely all nodes are in the heap at the same
3270 time, the heap should accommodate all of them. Counting a NULL
3271 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3272 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3273 pthread_mutex_init (&queue->mutex, NULL);
3274 pthread_cond_init (&queue->cond, NULL);
3277 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3278 does not need to lock NODE. */
3280 static void
3281 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3283 pthread_mutex_lock (&queue->mutex);
3284 heap_insert (queue->priority_queue, node);
3285 node->queued = true;
3286 pthread_mutex_unlock (&queue->mutex);
3287 pthread_cond_signal (&queue->cond);
3290 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3292 static struct merge_node *
3293 queue_pop (struct merge_node_queue *queue)
3295 struct merge_node *node;
3296 pthread_mutex_lock (&queue->mutex);
3297 while (! (node = heap_remove_top (queue->priority_queue)))
3298 pthread_cond_wait (&queue->cond, &queue->mutex);
3299 pthread_mutex_unlock (&queue->mutex);
3300 lock_node (node);
3301 node->queued = false;
3302 return node;
3305 /* Output LINE to TFP, unless -u is specified and the line compares
3306 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3307 is null if TFP is standard output.
3309 This function does not save the line for comparison later, so it is
3310 appropriate only for internal sort. */
3312 static void
3313 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3315 static struct line saved;
3317 if (unique)
3319 if (saved.text && ! compare (line, &saved))
3320 return;
3321 saved = *line;
3324 write_line (line, tfp, temp_output);
3327 /* Merge the lines currently available to a NODE in the binary
3328 merge tree. Merge a number of lines appropriate for this merge
3329 level, assuming TOTAL_LINES is the total number of lines.
3331 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3332 the name of TFP, or is null if TFP is standard output. */
3334 static void
3335 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3336 FILE *tfp, char const *temp_output)
3338 struct line *lo_orig = node->lo;
3339 struct line *hi_orig = node->hi;
3340 size_t to_merge = MAX_MERGE (total_lines, node->level);
3341 size_t merged_lo;
3342 size_t merged_hi;
3344 if (node->level > MERGE_ROOT)
3346 /* Merge to destination buffer. */
3347 struct line *dest = *node->dest;
3348 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3349 if (compare (node->lo - 1, node->hi - 1) <= 0)
3350 *--dest = *--node->lo;
3351 else
3352 *--dest = *--node->hi;
3354 merged_lo = lo_orig - node->lo;
3355 merged_hi = hi_orig - node->hi;
3357 if (node->nhi == merged_hi)
3358 while (node->lo != node->end_lo && to_merge--)
3359 *--dest = *--node->lo;
3360 else if (node->nlo == merged_lo)
3361 while (node->hi != node->end_hi && to_merge--)
3362 *--dest = *--node->hi;
3363 *node->dest = dest;
3365 else
3367 /* Merge directly to output. */
3368 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3370 if (compare (node->lo - 1, node->hi - 1) <= 0)
3371 write_unique (--node->lo, tfp, temp_output);
3372 else
3373 write_unique (--node->hi, tfp, temp_output);
3376 merged_lo = lo_orig - node->lo;
3377 merged_hi = hi_orig - node->hi;
3379 if (node->nhi == merged_hi)
3381 while (node->lo != node->end_lo && to_merge--)
3382 write_unique (--node->lo, tfp, temp_output);
3384 else if (node->nlo == merged_lo)
3386 while (node->hi != node->end_hi && to_merge--)
3387 write_unique (--node->hi, tfp, temp_output);
3391 /* Update NODE. */
3392 merged_lo = lo_orig - node->lo;
3393 merged_hi = hi_orig - node->hi;
3394 node->nlo -= merged_lo;
3395 node->nhi -= merged_hi;
3398 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3399 NODE's children has available lines and the other either has
3400 available lines or has exhausted its lines. */
3402 static void
3403 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3405 if (! node->queued)
3407 bool lo_avail = (node->lo - node->end_lo) != 0;
3408 bool hi_avail = (node->hi - node->end_hi) != 0;
3409 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3410 queue_insert (queue, node);
3414 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3416 static void
3417 queue_check_insert_parent (struct merge_node_queue *queue,
3418 struct merge_node *node)
3420 if (node->level > MERGE_ROOT)
3422 lock_node (node->parent);
3423 queue_check_insert (queue, node->parent);
3424 unlock_node (node->parent);
3426 else if (node->nlo + node->nhi == 0)
3428 /* If the MERGE_ROOT NODE has finished merging, insert the
3429 MERGE_END node. */
3430 queue_insert (queue, node->parent);
3434 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3435 some of those lines, until the MERGE_END node is popped.
3436 TOTAL_LINES is the total number of lines. If merging at the top
3437 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3438 null if TFP is standard output. */
3440 static void
3441 merge_loop (struct merge_node_queue *queue,
3442 size_t total_lines, FILE *tfp, char const *temp_output)
3444 while (1)
3446 struct merge_node *node = queue_pop (queue);
3448 if (node->level == MERGE_END)
3450 unlock_node (node);
3451 /* Reinsert so other threads can pop it. */
3452 queue_insert (queue, node);
3453 break;
3455 mergelines_node (node, total_lines, tfp, temp_output);
3456 queue_check_insert (queue, node);
3457 queue_check_insert_parent (queue, node);
3459 unlock_node (node);
3464 static void sortlines (struct line *restrict, size_t, size_t,
3465 struct merge_node *, bool, struct merge_node_queue *,
3466 FILE *, char const *);
3468 /* Thread arguments for sortlines_thread. */
3470 struct thread_args
3472 /* Source, i.e., the array of lines to sort. This points just past
3473 the end of the array. */
3474 struct line *lines;
3476 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3477 size_t nthreads;
3479 /* Number of lines in LINES and DEST. */
3480 size_t const total_lines;
3482 /* Merge node. Lines from this node and this node's sibling will merged
3483 to this node's parent. */
3484 struct merge_node *const node;
3486 /* True if this node is sorting the lower half of the parent's work. */
3487 bool is_lo_child;
3489 /* The priority queue controlling available work for the entire
3490 internal sort. */
3491 struct merge_node_queue *const queue;
3493 /* If at the top level, the file to output to, and the file's name.
3494 If the file is standard output, the file's name is null. */
3495 FILE *tfp;
3496 char const *output_temp;
3499 /* Like sortlines, except with a signature acceptable to pthread_create. */
3501 static void *
3502 sortlines_thread (void *data)
3504 struct thread_args const *args = data;
3505 sortlines (args->lines, args->nthreads, args->total_lines,
3506 args->node, args->is_lo_child, args->queue, args->tfp,
3507 args->output_temp);
3508 return NULL;
3511 /* Sort lines, possibly in parallel. The arguments are as in struct
3512 thread_args above.
3514 The algorithm has three phases: node creation, sequential sort,
3515 and binary merge.
3517 During node creation, sortlines recursively visits each node in the
3518 binary merge tree and creates a NODE structure corresponding to all the
3519 future line merging NODE is responsible for. For each call to
3520 sortlines, half the available threads are assigned to each recursive
3521 call, until a leaf node having only 1 available thread is reached.
3523 Each leaf node then performs two sequential sorts, one on each half of
3524 the lines it is responsible for. It records in its NODE structure that
3525 there are two sorted sublists available to merge from, and inserts its
3526 NODE into the priority queue.
3528 The binary merge phase then begins. Each thread drops into a loop
3529 where the thread retrieves a NODE from the priority queue, merges lines
3530 available to that NODE, and potentially insert NODE or its parent back
3531 into the queue if there are sufficient available lines for them to
3532 merge. This continues until all lines at all nodes of the merge tree
3533 have been merged. */
3535 static void
3536 sortlines (struct line *restrict lines, size_t nthreads,
3537 size_t total_lines, struct merge_node *node, bool is_lo_child,
3538 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3540 size_t nlines = node->nlo + node->nhi;
3542 /* Calculate thread arguments. */
3543 size_t lo_threads = nthreads / 2;
3544 size_t hi_threads = nthreads - lo_threads;
3545 pthread_t thread;
3546 struct thread_args args = {lines, lo_threads, total_lines,
3547 node->lo_child, true, queue, tfp, temp_output};
3549 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3550 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3552 sortlines (lines - node->nlo, hi_threads, total_lines,
3553 node->hi_child, false, queue, tfp, temp_output);
3554 pthread_join (thread, NULL);
3556 else
3558 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3559 Sort with 1 thread. */
3560 size_t nlo = node->nlo;
3561 size_t nhi = node->nhi;
3562 struct line *temp = lines - total_lines;
3563 if (1 < nhi)
3564 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3565 if (1 < nlo)
3566 sequential_sort (lines, nlo, temp, false);
3568 /* Update merge NODE. No need to lock yet. */
3569 node->lo = lines;
3570 node->hi = lines - nlo;
3571 node->end_lo = lines - nlo;
3572 node->end_hi = lines - nlo - nhi;
3574 queue_insert (queue, node);
3575 merge_loop (queue, total_lines, tfp, temp_output);
3578 pthread_mutex_destroy (&node->lock);
3581 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3582 the same as OUTFILE. If found, replace each with the same
3583 temporary copy that can be merged into OUTFILE without destroying
3584 OUTFILE before it is completely read. This temporary copy does not
3585 count as a merge temp, so don't worry about incrementing NTEMPS in
3586 the caller; final cleanup will remove it, not zaptemp.
3588 This test ensures that an otherwise-erroneous use like
3589 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3590 It's not clear that POSIX requires this nicety.
3591 Detect common error cases, but don't try to catch obscure cases like
3592 "cat ... FILE ... | sort -m -o FILE"
3593 where traditional "sort" doesn't copy the input and where
3594 people should know that they're getting into trouble anyway.
3595 Catching these obscure cases would slow down performance in
3596 common cases. */
3598 static void
3599 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3600 size_t nfiles, char const *outfile)
3602 size_t i;
3603 bool got_outstat = false;
3604 struct stat outstat;
3605 struct tempnode *tempcopy = NULL;
3607 for (i = ntemps; i < nfiles; i++)
3609 bool is_stdin = STREQ (files[i].name, "-");
3610 bool same;
3611 struct stat instat;
3613 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3614 same = true;
3615 else
3617 if (! got_outstat)
3619 if ((outfile
3620 ? stat (outfile, &outstat)
3621 : fstat (STDOUT_FILENO, &outstat))
3622 != 0)
3623 break;
3624 got_outstat = true;
3627 same = (((is_stdin
3628 ? fstat (STDIN_FILENO, &instat)
3629 : stat (files[i].name, &instat))
3630 == 0)
3631 && SAME_INODE (instat, outstat));
3634 if (same)
3636 if (! tempcopy)
3638 FILE *tftp;
3639 tempcopy = create_temp (&tftp);
3640 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3643 files[i].name = tempcopy->name;
3644 files[i].temp = tempcopy;
3649 /* Merge the input FILES. NTEMPS is the number of files at the
3650 start of FILES that are temporary; it is zero at the top level.
3651 NFILES is the total number of files. Put the output in
3652 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3654 static void
3655 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3656 char const *output_file)
3658 while (nmerge < nfiles)
3660 /* Number of input files processed so far. */
3661 size_t in;
3663 /* Number of output files generated so far. */
3664 size_t out;
3666 /* nfiles % NMERGE; this counts input files that are left over
3667 after all full-sized merges have been done. */
3668 size_t remainder;
3670 /* Number of easily-available slots at the next loop iteration. */
3671 size_t cheap_slots;
3673 /* Do as many NMERGE-size merges as possible. In the case that
3674 nmerge is bogus, increment by the maximum number of file
3675 descriptors allowed. */
3676 for (out = in = 0; nmerge <= nfiles - in; out++)
3678 FILE *tfp;
3679 struct tempnode *temp = create_temp (&tfp);
3680 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3681 nmerge, tfp, temp->name);
3682 ntemps -= MIN (ntemps, num_merged);
3683 files[out].name = temp->name;
3684 files[out].temp = temp;
3685 in += num_merged;
3688 remainder = nfiles - in;
3689 cheap_slots = nmerge - out % nmerge;
3691 if (cheap_slots < remainder)
3693 /* So many files remain that they can't all be put into the last
3694 NMERGE-sized output window. Do one more merge. Merge as few
3695 files as possible, to avoid needless I/O. */
3696 size_t nshortmerge = remainder - cheap_slots + 1;
3697 FILE *tfp;
3698 struct tempnode *temp = create_temp (&tfp);
3699 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3700 nshortmerge, tfp, temp->name);
3701 ntemps -= MIN (ntemps, num_merged);
3702 files[out].name = temp->name;
3703 files[out++].temp = temp;
3704 in += num_merged;
3707 /* Put the remaining input files into the last NMERGE-sized output
3708 window, so they will be merged in the next pass. */
3709 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3710 ntemps += out;
3711 nfiles -= in - out;
3714 avoid_trashing_input (files, ntemps, nfiles, output_file);
3716 /* We aren't guaranteed that this final mergefiles will work, therefore we
3717 try to merge into the output, and then merge as much as we can into a
3718 temp file if we can't. Repeat. */
3720 while (true)
3722 /* Merge directly into the output file if possible. */
3723 FILE **fps;
3724 size_t nopened = open_input_files (files, nfiles, &fps);
3726 if (nopened == nfiles)
3728 FILE *ofp = stream_open (output_file, "w");
3729 if (ofp)
3731 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3732 break;
3734 if (errno != EMFILE || nopened <= 2)
3735 die (_("open failed"), output_file);
3737 else if (nopened <= 2)
3738 die (_("open failed"), files[nopened].name);
3740 /* We ran out of file descriptors. Close one of the input
3741 files, to gain a file descriptor. Then create a temporary
3742 file with our spare file descriptor. Retry if that failed
3743 (e.g., some other process could open a file between the time
3744 we closed and tried to create). */
3745 FILE *tfp;
3746 struct tempnode *temp;
3749 nopened--;
3750 xfclose (fps[nopened], files[nopened].name);
3751 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3753 while (!temp);
3755 /* Merge into the newly allocated temporary. */
3756 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3757 fps);
3758 ntemps -= MIN (ntemps, nopened);
3759 files[0].name = temp->name;
3760 files[0].temp = temp;
3762 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3763 ntemps++;
3764 nfiles -= nopened - 1;
3768 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3770 static void
3771 sort (char *const *files, size_t nfiles, char const *output_file,
3772 size_t nthreads)
3774 struct buffer buf;
3775 IF_LINT (buf.buf = NULL);
3776 size_t ntemps = 0;
3777 bool output_file_created = false;
3779 buf.alloc = 0;
3781 while (nfiles)
3783 char const *temp_output;
3784 char const *file = *files;
3785 FILE *fp = xfopen (file, "r");
3786 FILE *tfp;
3788 size_t bytes_per_line;
3789 if (nthreads > 1)
3791 /* Get log P. */
3792 size_t tmp = 1;
3793 size_t mult = 1;
3794 while (tmp < nthreads)
3796 tmp *= 2;
3797 mult++;
3799 bytes_per_line = (mult * sizeof (struct line));
3801 else
3802 bytes_per_line = sizeof (struct line) * 3 / 2;
3804 if (! buf.alloc)
3805 initbuf (&buf, bytes_per_line,
3806 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3807 buf.eof = false;
3808 files++;
3809 nfiles--;
3811 while (fillbuf (&buf, fp, file))
3813 struct line *line;
3815 if (buf.eof && nfiles
3816 && (bytes_per_line + 1
3817 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3819 /* End of file, but there is more input and buffer room.
3820 Concatenate the next input file; this is faster in
3821 the usual case. */
3822 buf.left = buf.used;
3823 break;
3826 line = buffer_linelim (&buf);
3827 if (buf.eof && !nfiles && !ntemps && !buf.left)
3829 xfclose (fp, file);
3830 tfp = xfopen (output_file, "w");
3831 temp_output = output_file;
3832 output_file_created = true;
3834 else
3836 ++ntemps;
3837 temp_output = create_temp (&tfp)->name;
3839 if (1 < buf.nlines)
3841 struct merge_node_queue queue;
3842 queue_init (&queue, nthreads);
3843 struct merge_node *merge_tree =
3844 merge_tree_init (nthreads, buf.nlines, line);
3845 struct merge_node *root = merge_tree + 1;
3847 sortlines (line, nthreads, buf.nlines, root,
3848 true, &queue, tfp, temp_output);
3849 queue_destroy (&queue);
3850 pthread_mutex_destroy (&root->lock);
3851 merge_tree_destroy (merge_tree);
3853 else
3854 write_unique (line - 1, tfp, temp_output);
3856 xfclose (tfp, temp_output);
3858 if (output_file_created)
3859 goto finish;
3861 xfclose (fp, file);
3864 finish:
3865 free (buf.buf);
3867 if (! output_file_created)
3869 size_t i;
3870 struct tempnode *node = temphead;
3871 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3872 for (i = 0; node; i++)
3874 tempfiles[i].name = node->name;
3875 tempfiles[i].temp = node;
3876 node = node->next;
3878 merge (tempfiles, ntemps, ntemps, output_file);
3879 free (tempfiles);
3882 reap_all ();
3885 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3887 static void
3888 insertkey (struct keyfield *key_arg)
3890 struct keyfield **p;
3891 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3893 for (p = &keylist; *p; p = &(*p)->next)
3894 continue;
3895 *p = key;
3896 key->next = NULL;
3899 /* Report a bad field specification SPEC, with extra info MSGID. */
3901 static void badfieldspec (char const *, char const *)
3902 ATTRIBUTE_NORETURN;
3903 static void
3904 badfieldspec (char const *spec, char const *msgid)
3906 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3907 _(msgid), quote (spec));
3908 abort ();
3911 /* Report incompatible options. */
3913 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3914 static void
3915 incompatible_options (char const *opts)
3917 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
3918 abort ();
3921 /* Check compatibility of ordering options. */
3923 static void
3924 check_ordering_compatibility (void)
3926 struct keyfield *key;
3928 for (key = keylist; key; key = key->next)
3929 if (1 < (key->numeric + key->general_numeric + key->human_numeric
3930 + key->month + (key->version | key->random | !!key->ignore)))
3932 /* The following is too big, but guaranteed to be "big enough". */
3933 char opts[sizeof short_options];
3934 /* Clear flags we're not interested in. */
3935 key->skipsblanks = key->skipeblanks = key->reverse = false;
3936 key_to_opts (key, opts);
3937 incompatible_options (opts);
3941 /* Parse the leading integer in STRING and store the resulting value
3942 (which must fit into size_t) into *VAL. Return the address of the
3943 suffix after the integer. If the value is too large, silently
3944 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3945 failure; otherwise, report MSGID and exit on failure. */
3947 static char const *
3948 parse_field_count (char const *string, size_t *val, char const *msgid)
3950 char *suffix;
3951 uintmax_t n;
3953 switch (xstrtoumax (string, &suffix, 10, &n, ""))
3955 case LONGINT_OK:
3956 case LONGINT_INVALID_SUFFIX_CHAR:
3957 *val = n;
3958 if (*val == n)
3959 break;
3960 /* Fall through. */
3961 case LONGINT_OVERFLOW:
3962 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3963 *val = SIZE_MAX;
3964 break;
3966 case LONGINT_INVALID:
3967 if (msgid)
3968 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3969 _(msgid), quote (string));
3970 return NULL;
3973 return suffix;
3976 /* Handle interrupts and hangups. */
3978 static void
3979 sighandler (int sig)
3981 if (! SA_NOCLDSTOP)
3982 signal (sig, SIG_IGN);
3984 cleanup ();
3986 signal (sig, SIG_DFL);
3987 raise (sig);
3990 /* Set the ordering options for KEY specified in S.
3991 Return the address of the first character in S that
3992 is not a valid ordering option.
3993 BLANKTYPE is the kind of blanks that 'b' should skip. */
3995 static char *
3996 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
3998 while (*s)
4000 switch (*s)
4002 case 'b':
4003 if (blanktype == bl_start || blanktype == bl_both)
4004 key->skipsblanks = true;
4005 if (blanktype == bl_end || blanktype == bl_both)
4006 key->skipeblanks = true;
4007 break;
4008 case 'd':
4009 key->ignore = nondictionary;
4010 break;
4011 case 'f':
4012 key->translate = fold_toupper;
4013 break;
4014 case 'g':
4015 key->general_numeric = true;
4016 break;
4017 case 'h':
4018 key->human_numeric = true;
4019 break;
4020 case 'i':
4021 /* Option order should not matter, so don't let -i override
4022 -d. -d implies -i, but -i does not imply -d. */
4023 if (! key->ignore)
4024 key->ignore = nonprinting;
4025 break;
4026 case 'M':
4027 key->month = true;
4028 break;
4029 case 'n':
4030 key->numeric = true;
4031 break;
4032 case 'R':
4033 key->random = true;
4034 break;
4035 case 'r':
4036 key->reverse = true;
4037 break;
4038 case 'V':
4039 key->version = true;
4040 break;
4041 default:
4042 return (char *) s;
4044 ++s;
4046 return (char *) s;
4049 /* Initialize KEY. */
4051 static struct keyfield *
4052 key_init (struct keyfield *key)
4054 memset (key, 0, sizeof *key);
4055 key->eword = SIZE_MAX;
4056 return key;
4060 main (int argc, char **argv)
4062 struct keyfield *key;
4063 struct keyfield key_buf;
4064 struct keyfield gkey;
4065 bool gkey_only = false;
4066 char const *s;
4067 int c = 0;
4068 char checkonly = 0;
4069 bool mergeonly = false;
4070 char *random_source = NULL;
4071 bool need_random = false;
4072 size_t nthreads = 0;
4073 size_t nfiles = 0;
4074 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4075 bool obsolete_usage = (posix2_version () < 200112);
4076 char **files;
4077 char *files_from = NULL;
4078 struct Tokens tok;
4079 char const *outfile = NULL;
4081 initialize_main (&argc, &argv);
4082 set_program_name (argv[0]);
4083 setlocale (LC_ALL, "");
4084 bindtextdomain (PACKAGE, LOCALEDIR);
4085 textdomain (PACKAGE);
4087 initialize_exit_failure (SORT_FAILURE);
4089 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4090 #if HAVE_NL_LANGINFO
4091 hard_LC_TIME = hard_locale (LC_TIME);
4092 #endif
4094 /* Get locale's representation of the decimal point. */
4096 struct lconv const *locale = localeconv ();
4098 /* If the locale doesn't define a decimal point, or if the decimal
4099 point is multibyte, use the C locale's decimal point. FIXME:
4100 add support for multibyte decimal points. */
4101 decimal_point = to_uchar (locale->decimal_point[0]);
4102 if (! decimal_point || locale->decimal_point[1])
4103 decimal_point = '.';
4105 /* FIXME: add support for multibyte thousands separators. */
4106 thousands_sep = to_uchar (*locale->thousands_sep);
4107 if (! thousands_sep || locale->thousands_sep[1])
4108 thousands_sep = -1;
4111 have_read_stdin = false;
4112 inittables ();
4115 size_t i;
4116 static int const sig[] =
4118 /* The usual suspects. */
4119 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4120 #ifdef SIGPOLL
4121 SIGPOLL,
4122 #endif
4123 #ifdef SIGPROF
4124 SIGPROF,
4125 #endif
4126 #ifdef SIGVTALRM
4127 SIGVTALRM,
4128 #endif
4129 #ifdef SIGXCPU
4130 SIGXCPU,
4131 #endif
4132 #ifdef SIGXFSZ
4133 SIGXFSZ,
4134 #endif
4136 enum { nsigs = ARRAY_CARDINALITY (sig) };
4138 #if SA_NOCLDSTOP
4139 struct sigaction act;
4141 sigemptyset (&caught_signals);
4142 for (i = 0; i < nsigs; i++)
4144 sigaction (sig[i], NULL, &act);
4145 if (act.sa_handler != SIG_IGN)
4146 sigaddset (&caught_signals, sig[i]);
4149 act.sa_handler = sighandler;
4150 act.sa_mask = caught_signals;
4151 act.sa_flags = 0;
4153 for (i = 0; i < nsigs; i++)
4154 if (sigismember (&caught_signals, sig[i]))
4155 sigaction (sig[i], &act, NULL);
4156 #else
4157 for (i = 0; i < nsigs; i++)
4158 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4160 signal (sig[i], sighandler);
4161 siginterrupt (sig[i], 1);
4163 #endif
4165 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4167 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4168 atexit (exit_cleanup);
4170 key_init (&gkey);
4171 gkey.sword = SIZE_MAX;
4173 files = xnmalloc (argc, sizeof *files);
4175 while (true)
4177 /* Parse an operand as a file after "--" was seen; or if
4178 pedantic and a file was seen, unless the POSIX version
4179 predates 1003.1-2001 and -c was not seen and the operand is
4180 "-o FILE" or "-oFILE". */
4181 int oi = -1;
4183 if (c == -1
4184 || (posixly_correct && nfiles != 0
4185 && ! (obsolete_usage
4186 && ! checkonly
4187 && optind != argc
4188 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4189 && (argv[optind][2] || optind + 1 != argc)))
4190 || ((c = getopt_long (argc, argv, short_options,
4191 long_options, &oi))
4192 == -1))
4194 if (argc <= optind)
4195 break;
4196 files[nfiles++] = argv[optind++];
4198 else switch (c)
4200 case 1:
4201 key = NULL;
4202 if (optarg[0] == '+')
4204 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4205 && ISDIGIT (argv[optind][1]));
4206 obsolete_usage |= minus_pos_usage && !posixly_correct;
4207 if (obsolete_usage)
4209 /* Treat +POS1 [-POS2] as a key if possible; but silently
4210 treat an operand as a file if it is not a valid +POS1. */
4211 key = key_init (&key_buf);
4212 s = parse_field_count (optarg + 1, &key->sword, NULL);
4213 if (s && *s == '.')
4214 s = parse_field_count (s + 1, &key->schar, NULL);
4215 if (! (key->sword || key->schar))
4216 key->sword = SIZE_MAX;
4217 if (! s || *set_ordering (s, key, bl_start))
4218 key = NULL;
4219 else
4221 if (minus_pos_usage)
4223 char const *optarg1 = argv[optind++];
4224 s = parse_field_count (optarg1 + 1, &key->eword,
4225 N_("invalid number after `-'"));
4226 if (*s == '.')
4227 s = parse_field_count (s + 1, &key->echar,
4228 N_("invalid number after `.'"));
4229 if (!key->echar && key->eword)
4231 /* obsolescent syntax +A.x -B.y is equivalent to:
4232 -k A+1.x+1,B.y (when y = 0)
4233 -k A+1.x+1,B+1.y (when y > 0)
4234 So eword is decremented as in the -k case
4235 only when the end field (B) is specified and
4236 echar (y) is 0. */
4237 key->eword--;
4239 if (*set_ordering (s, key, bl_end))
4240 badfieldspec (optarg1,
4241 N_("stray character in field spec"));
4243 key->obsolete_used = true;
4244 insertkey (key);
4248 if (! key)
4249 files[nfiles++] = optarg;
4250 break;
4252 case SORT_OPTION:
4253 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4254 /* Fall through. */
4255 case 'b':
4256 case 'd':
4257 case 'f':
4258 case 'g':
4259 case 'h':
4260 case 'i':
4261 case 'M':
4262 case 'n':
4263 case 'r':
4264 case 'R':
4265 case 'V':
4267 char str[2];
4268 str[0] = c;
4269 str[1] = '\0';
4270 set_ordering (str, &gkey, bl_both);
4272 break;
4274 case CHECK_OPTION:
4275 c = (optarg
4276 ? XARGMATCH ("--check", optarg, check_args, check_types)
4277 : 'c');
4278 /* Fall through. */
4279 case 'c':
4280 case 'C':
4281 if (checkonly && checkonly != c)
4282 incompatible_options ("cC");
4283 checkonly = c;
4284 break;
4286 case COMPRESS_PROGRAM_OPTION:
4287 if (compress_program && !STREQ (compress_program, optarg))
4288 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4289 compress_program = optarg;
4290 break;
4292 case DEBUG_PROGRAM_OPTION:
4293 debug = true;
4294 break;
4296 case FILES0_FROM_OPTION:
4297 files_from = optarg;
4298 break;
4300 case 'k':
4301 key = key_init (&key_buf);
4303 /* Get POS1. */
4304 s = parse_field_count (optarg, &key->sword,
4305 N_("invalid number at field start"));
4306 if (! key->sword--)
4308 /* Provoke with `sort -k0' */
4309 badfieldspec (optarg, N_("field number is zero"));
4311 if (*s == '.')
4313 s = parse_field_count (s + 1, &key->schar,
4314 N_("invalid number after `.'"));
4315 if (! key->schar--)
4317 /* Provoke with `sort -k1.0' */
4318 badfieldspec (optarg, N_("character offset is zero"));
4321 if (! (key->sword || key->schar))
4322 key->sword = SIZE_MAX;
4323 s = set_ordering (s, key, bl_start);
4324 if (*s != ',')
4326 key->eword = SIZE_MAX;
4327 key->echar = 0;
4329 else
4331 /* Get POS2. */
4332 s = parse_field_count (s + 1, &key->eword,
4333 N_("invalid number after `,'"));
4334 if (! key->eword--)
4336 /* Provoke with `sort -k1,0' */
4337 badfieldspec (optarg, N_("field number is zero"));
4339 if (*s == '.')
4341 s = parse_field_count (s + 1, &key->echar,
4342 N_("invalid number after `.'"));
4344 s = set_ordering (s, key, bl_end);
4346 if (*s)
4347 badfieldspec (optarg, N_("stray character in field spec"));
4348 insertkey (key);
4349 break;
4351 case 'm':
4352 mergeonly = true;
4353 break;
4355 case NMERGE_OPTION:
4356 specify_nmerge (oi, c, optarg);
4357 break;
4359 case 'o':
4360 if (outfile && !STREQ (outfile, optarg))
4361 error (SORT_FAILURE, 0, _("multiple output files specified"));
4362 outfile = optarg;
4363 break;
4365 case RANDOM_SOURCE_OPTION:
4366 if (random_source && !STREQ (random_source, optarg))
4367 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4368 random_source = optarg;
4369 break;
4371 case 's':
4372 stable = true;
4373 break;
4375 case 'S':
4376 specify_sort_size (oi, c, optarg);
4377 break;
4379 case 't':
4381 char newtab = optarg[0];
4382 if (! newtab)
4383 error (SORT_FAILURE, 0, _("empty tab"));
4384 if (optarg[1])
4386 if (STREQ (optarg, "\\0"))
4387 newtab = '\0';
4388 else
4390 /* Provoke with `sort -txx'. Complain about
4391 "multi-character tab" instead of "multibyte tab", so
4392 that the diagnostic's wording does not need to be
4393 changed once multibyte characters are supported. */
4394 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4395 quote (optarg));
4398 if (tab != TAB_DEFAULT && tab != newtab)
4399 error (SORT_FAILURE, 0, _("incompatible tabs"));
4400 tab = newtab;
4402 break;
4404 case 'T':
4405 add_temp_dir (optarg);
4406 break;
4408 case PARALLEL_OPTION:
4409 nthreads = specify_nthreads (oi, c, optarg);
4410 break;
4412 case 'u':
4413 unique = true;
4414 break;
4416 case 'y':
4417 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4418 through Solaris 7. It is also accepted by many non-Solaris
4419 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4420 -y is marked as obsolete starting with Solaris 8 (1999), but is
4421 still accepted as of Solaris 10 prerelease (2004).
4423 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4424 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4425 and which in general ignores the argument after "-y" if it
4426 consists entirely of digits (it can even be empty). */
4427 if (optarg == argv[optind - 1])
4429 char const *p;
4430 for (p = optarg; ISDIGIT (*p); p++)
4431 continue;
4432 optind -= (*p != '\0');
4434 break;
4436 case 'z':
4437 eolchar = 0;
4438 break;
4440 case_GETOPT_HELP_CHAR;
4442 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4444 default:
4445 usage (SORT_FAILURE);
4449 if (files_from)
4451 FILE *stream;
4453 /* When using --files0-from=F, you may not specify any files
4454 on the command-line. */
4455 if (nfiles)
4457 error (0, 0, _("extra operand %s"), quote (files[0]));
4458 fprintf (stderr, "%s\n",
4459 _("file operands cannot be combined with --files0-from"));
4460 usage (SORT_FAILURE);
4463 if (STREQ (files_from, "-"))
4464 stream = stdin;
4465 else
4467 stream = fopen (files_from, "r");
4468 if (stream == NULL)
4469 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4470 quote (files_from));
4473 readtokens0_init (&tok);
4475 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4476 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4477 quote (files_from));
4479 if (tok.n_tok)
4481 size_t i;
4482 free (files);
4483 files = tok.tok;
4484 nfiles = tok.n_tok;
4485 for (i = 0; i < nfiles; i++)
4487 if (STREQ (files[i], "-"))
4488 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4489 "no file name of %s allowed"),
4490 quote (files[i]));
4491 else if (files[i][0] == '\0')
4493 /* Using the standard `filename:line-number:' prefix here is
4494 not totally appropriate, since NUL is the separator,
4495 not NL, but it might be better than nothing. */
4496 unsigned long int file_number = i + 1;
4497 error (SORT_FAILURE, 0,
4498 _("%s:%lu: invalid zero-length file name"),
4499 quotearg_colon (files_from), file_number);
4503 else
4504 error (SORT_FAILURE, 0, _("no input from %s"),
4505 quote (files_from));
4508 /* Inheritance of global options to individual keys. */
4509 for (key = keylist; key; key = key->next)
4511 if (default_key_compare (key) && !key->reverse)
4513 key->ignore = gkey.ignore;
4514 key->translate = gkey.translate;
4515 key->skipsblanks = gkey.skipsblanks;
4516 key->skipeblanks = gkey.skipeblanks;
4517 key->month = gkey.month;
4518 key->numeric = gkey.numeric;
4519 key->general_numeric = gkey.general_numeric;
4520 key->human_numeric = gkey.human_numeric;
4521 key->version = gkey.version;
4522 key->random = gkey.random;
4523 key->reverse = gkey.reverse;
4526 need_random |= key->random;
4529 if (!keylist && !default_key_compare (&gkey))
4531 gkey_only = true;
4532 insertkey (&gkey);
4533 need_random |= gkey.random;
4536 check_ordering_compatibility ();
4538 if (debug)
4540 if (checkonly || outfile)
4542 static char opts[] = "X --debug";
4543 opts[0] = (checkonly ? checkonly : 'o');
4544 incompatible_options (opts);
4547 /* Always output the locale in debug mode, since this
4548 is such a common source of confusion. */
4549 if (hard_LC_COLLATE)
4550 error (0, 0, _("using %s sorting rules"),
4551 quote (setlocale (LC_COLLATE, NULL)));
4552 else
4553 error (0, 0, _("using simple byte comparison"));
4554 key_warnings (&gkey, gkey_only);
4557 reverse = gkey.reverse;
4559 if (need_random)
4560 random_md5_state_init (random_source);
4562 if (temp_dir_count == 0)
4564 char const *tmp_dir = getenv ("TMPDIR");
4565 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4568 if (nfiles == 0)
4570 static char *minus = (char *) "-";
4571 nfiles = 1;
4572 free (files);
4573 files = &minus;
4576 /* Need to re-check that we meet the minimum requirement for memory
4577 usage with the final value for NMERGE. */
4578 if (0 < sort_size)
4579 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4581 if (checkonly)
4583 if (nfiles > 1)
4584 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4585 quote (files[1]), checkonly);
4587 if (outfile)
4589 static char opts[] = {0, 'o', 0};
4590 opts[0] = checkonly;
4591 incompatible_options (opts);
4594 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4595 input is not properly sorted. */
4596 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4599 if (mergeonly)
4601 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4602 size_t i;
4604 for (i = 0; i < nfiles; ++i)
4605 sortfiles[i].name = files[i];
4607 merge (sortfiles, 0, nfiles, outfile);
4608 IF_LINT (free (sortfiles));
4610 else
4612 if (!nthreads)
4614 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4615 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4618 /* Avoid integer overflow later. */
4619 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4620 nthreads = MIN (nthreads, nthreads_max);
4622 sort (files, nfiles, outfile, nthreads);
4625 if (have_read_stdin && fclose (stdin) == EOF)
4626 die (_("close failed"), "-");
4628 exit (EXIT_SUCCESS);