1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/resource.h>
28 #include <sys/types.h>
36 #include "filevercmp.h"
37 #include "hard-locale.h"
40 #include "ignore-value.h"
49 #include "readtokens0.h"
52 #include "strnumcmp.h"
54 #include "xnanosleep.h"
58 struct rlimit
{ size_t rlim_cur
; };
59 # define getrlimit(Resource, Rlp) (-1)
62 /* The official name of this program (e.g., no 'g' prefix). */
63 #define PROGRAM_NAME "sort"
66 proper_name ("Mike Haertel"), \
67 proper_name ("Paul Eggert")
69 #if HAVE_LANGINFO_CODESET
70 # include <langinfo.h>
73 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
76 # define SA_NOCLDSTOP 0
77 /* No sigprocmask. Always 'return' zero. */
78 # define sigprocmask(How, Set, Oset) (0)
80 # if ! HAVE_SIGINTERRUPT
81 # define siginterrupt(sig, flag) /* empty */
85 #if !defined OPEN_MAX && defined NR_OPEN
86 # define OPEN_MAX NR_OPEN
92 #define UCHAR_LIM (UCHAR_MAX + 1)
95 # define long_double long double
97 # define long_double double
99 # define strtold strtod
102 #ifndef DEFAULT_TMPDIR
103 # define DEFAULT_TMPDIR "/tmp"
106 /* Maximum number of lines to merge every time a NODE is taken from
107 the merge queue. Node is at LEVEL in the binary merge tree,
108 and is responsible for merging TOTAL lines. */
109 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
111 /* Heuristic value for the number of lines for which it is worth creating
112 a subthread, during an internal merge sort. I.e., it is a small number
113 of "average" lines for which sorting via two threads is faster than
114 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
115 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
116 lines of gensort -a output is sorted slightly faster with --parallel=2
117 than with --parallel=1. By contrast, using --parallel=1 is about 10%
118 faster than using --parallel=2 with a 64K-line input. */
119 enum { SUBTHREAD_LINES_HEURISTIC
= 128 * 1024 };
120 verify (4 <= SUBTHREAD_LINES_HEURISTIC
);
122 /* The number of threads after which there are
123 diminishing performance gains. */
124 enum { DEFAULT_MAX_THREADS
= 8 };
129 /* POSIX says to exit with status 1 if invoked with -c and the
130 input is not properly sorted. */
131 SORT_OUT_OF_ORDER
= 1,
133 /* POSIX says any other irregular exit must exit with a status
134 code greater than 1. */
140 /* The number of times we should try to fork a compression process
141 (we retry if the fork call fails). We don't _need_ to compress
142 temp files, this is just to reduce disk access, so this number
143 can be small. Each retry doubles in duration. */
144 MAX_FORK_TRIES_COMPRESS
= 4,
146 /* The number of times we should try to fork a decompression process.
147 If we can't fork a decompression process, we can't sort, so this
148 number should be big. Each retry doubles in duration. */
149 MAX_FORK_TRIES_DECOMPRESS
= 9
154 /* Level of the end-of-merge node, one level above the root. */
157 /* Level of the root node in merge tree. */
161 /* The representation of the decimal point in the current locale. */
162 static int decimal_point
;
164 /* Thousands separator; if -1, then there isn't one. */
165 static int thousands_sep
;
167 /* Nonzero if the corresponding locales are hard. */
168 static bool hard_LC_COLLATE
;
170 static bool hard_LC_TIME
;
173 #define NONZERO(x) ((x) != 0)
175 /* The kind of blanks for '-b' to skip in various options. */
176 enum blanktype
{ bl_start
, bl_end
, bl_both
};
178 /* The character marking end of line. Default to \n. */
179 static char eolchar
= '\n';
181 /* Lines are held in core as counted strings. */
184 char *text
; /* Text of the line. */
185 size_t length
; /* Length including final newline. */
186 char *keybeg
; /* Start of first key. */
187 char *keylim
; /* Limit of first key. */
193 char *buf
; /* Dynamically allocated buffer,
194 partitioned into 3 regions:
197 - an array of lines, in reverse order. */
198 size_t used
; /* Number of bytes used for input data. */
199 size_t nlines
; /* Number of lines in the line array. */
200 size_t alloc
; /* Number of bytes allocated. */
201 size_t left
; /* Number of bytes left from previous reads. */
202 size_t line_bytes
; /* Number of bytes to reserve for each line. */
203 bool eof
; /* An EOF has been read. */
209 size_t sword
; /* Zero-origin 'word' to start at. */
210 size_t schar
; /* Additional characters to skip. */
211 size_t eword
; /* Zero-origin last 'word' of key. */
212 size_t echar
; /* Additional characters in field. */
213 bool const *ignore
; /* Boolean array of characters to ignore. */
214 char const *translate
; /* Translation applied to characters. */
215 bool skipsblanks
; /* Skip leading blanks when finding start. */
216 bool skipeblanks
; /* Skip leading blanks when finding end. */
217 bool numeric
; /* Flag for numeric comparison. Handle
218 strings of digits with optional decimal
219 point, but no exponential notation. */
220 bool random
; /* Sort by random hash of key. */
221 bool general_numeric
; /* Flag for general, numeric comparison.
222 Handle numbers in exponential notation. */
223 bool human_numeric
; /* Flag for sorting by human readable
224 units with either SI or IEC prefixes. */
225 bool month
; /* Flag for comparison by month name. */
226 bool reverse
; /* Reverse the sense of comparison. */
227 bool version
; /* sort by version number */
228 bool obsolete_used
; /* obsolescent key option format is used. */
229 struct keyfield
*next
; /* Next keyfield to try. */
238 /* Binary merge tree node. */
241 struct line
*lo
; /* Lines to merge from LO child node. */
242 struct line
*hi
; /* Lines to merge from HI child ndoe. */
243 struct line
*end_lo
; /* End of available lines from LO. */
244 struct line
*end_hi
; /* End of available lines from HI. */
245 struct line
**dest
; /* Pointer to destination of merge. */
246 size_t nlo
; /* Total Lines remaining from LO. */
247 size_t nhi
; /* Total lines remaining from HI. */
248 struct merge_node
*parent
; /* Parent node. */
249 struct merge_node
*lo_child
; /* LO child node. */
250 struct merge_node
*hi_child
; /* HI child node. */
251 unsigned int level
; /* Level in merge tree. */
252 bool queued
; /* Node is already in heap. */
253 pthread_mutex_t lock
; /* Lock for node operations. */
256 /* Priority queue of merge nodes. */
257 struct merge_node_queue
259 struct heap
*priority_queue
; /* Priority queue of merge tree nodes. */
260 pthread_mutex_t mutex
; /* Lock for queue operations. */
261 pthread_cond_t cond
; /* Conditional wait for empty queue to populate
265 /* Used to implement --unique (-u). */
266 static struct line saved_line
;
268 /* FIXME: None of these tables work with multibyte character sets.
269 Also, there are many other bugs when handling multibyte characters.
270 One way to fix this is to rewrite 'sort' to use wide characters
271 internally, but doing this with good performance is a bit
274 /* Table of blanks. */
275 static bool blanks
[UCHAR_LIM
];
277 /* Table of non-printing characters. */
278 static bool nonprinting
[UCHAR_LIM
];
280 /* Table of non-dictionary characters (not letters, digits, or blanks). */
281 static bool nondictionary
[UCHAR_LIM
];
283 /* Translation table folding lower case to upper. */
284 static char fold_toupper
[UCHAR_LIM
];
286 #define MONTHS_PER_YEAR 12
288 /* Table mapping month names to integers.
289 Alphabetic order allows binary search. */
290 static struct month monthtab
[] =
306 /* During the merge phase, the number of files to merge at once. */
307 #define NMERGE_DEFAULT 16
309 /* Minimum size for a merge or check buffer. */
310 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
312 /* Minimum sort size; the code might not work with smaller sizes. */
313 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
315 /* The number of bytes needed for a merge or check buffer, which can
316 function relatively efficiently even if it holds only one line. If
317 a longer line is seen, this value is increased. */
318 static size_t merge_buffer_size
= MAX (MIN_MERGE_BUFFER_SIZE
, 256 * 1024);
320 /* The approximate maximum number of bytes of main memory to use, as
321 specified by the user. Zero if the user has not specified a size. */
322 static size_t sort_size
;
324 /* The initial allocation factor for non-regular files.
325 This is used, e.g., when reading from a pipe.
326 Don't make it too big, since it is multiplied by ~130 to
327 obtain the size of the actual buffer sort will allocate.
328 Also, there may be 8 threads all doing this at the same time. */
329 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
331 /* Array of directory names in which any temporary files are to be created. */
332 static char const **temp_dirs
;
334 /* Number of temporary directory names used. */
335 static size_t temp_dir_count
;
337 /* Number of allocated slots in temp_dirs. */
338 static size_t temp_dir_alloc
;
340 /* Flag to reverse the order of all comparisons. */
343 /* Flag for stable sort. This turns off the last ditch bytewise
344 comparison of lines, and instead leaves lines in the same order
345 they were read if all keys compare equal. */
348 /* If TAB has this value, blanks separate fields. */
349 enum { TAB_DEFAULT
= CHAR_MAX
+ 1 };
351 /* Tab character separating fields. If TAB_DEFAULT, then fields are
352 separated by the empty string between a non-blank character and a blank
354 static int tab
= TAB_DEFAULT
;
356 /* Flag to remove consecutive duplicate lines from the output.
357 Only the last of a sequence of equal lines will be output. */
360 /* Nonzero if any of the input files are the standard input. */
361 static bool have_read_stdin
;
363 /* List of key field comparisons to be tried. */
364 static struct keyfield
*keylist
;
366 /* Program used to (de)compress temp files. Must accept -d. */
367 static char const *compress_program
;
369 /* Annotate the output with extra info to aid the user. */
372 /* Maximum number of files to merge in one go. If more than this
373 number are present, temp files will be used. */
374 static unsigned int nmerge
= NMERGE_DEFAULT
;
376 /* Report MESSAGE for FILE, then clean up and exit.
377 If FILE is null, it represents standard output. */
379 static void die (char const *, char const *) ATTRIBUTE_NORETURN
;
381 die (char const *message
, char const *file
)
383 error (0, errno
, "%s: %s", message
, file
? file
: _("standard output"));
390 if (status
!= EXIT_SUCCESS
)
395 Usage: %s [OPTION]... [FILE]...\n\
396 or: %s [OPTION]... --files0-from=F\n\
398 program_name
, program_name
);
400 Write sorted concatenation of all FILE(s) to standard output.\n\
403 emit_mandatory_arg_note ();
410 -b, --ignore-leading-blanks ignore leading blanks\n\
411 -d, --dictionary-order consider only blanks and alphanumeric characters\
413 -f, --ignore-case fold lower case to upper case characters\n\
416 -g, --general-numeric-sort compare according to general numerical value\n\
417 -i, --ignore-nonprinting consider only printable characters\n\
418 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
421 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
424 -n, --numeric-sort compare according to string numerical value\n\
425 -R, --random-sort sort by random hash of keys\n\
426 --random-source=FILE get random bytes from FILE\n\
427 -r, --reverse reverse the result of comparisons\n\
430 --sort=WORD sort according to WORD:\n\
431 general-numeric -g, human-numeric -h, month -M,\
433 numeric -n, random -R, version -V\n\
434 -V, --version-sort natural sort of (version) numbers within text\n\
442 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
443 for more use temp files\n\
446 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
447 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
449 --compress-program=PROG compress temporaries with PROG;\n\
450 decompress them with PROG -d\n\
453 --debug annotate the part of the line used to sort,\n\
454 and warn about questionable usage to stderr\n\
455 --files0-from=F read input from the files specified by\n\
456 NUL-terminated names in file F;\n\
457 If F is - then read names from standard input\n\
460 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
461 -m, --merge merge already sorted files; do not sort\n\
464 -o, --output=FILE write result to FILE instead of standard output\n\
465 -s, --stable stabilize sort by disabling last-resort comparison\
467 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
470 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
471 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
472 multiple options specify multiple directories\n\
473 --parallel=N change the number of sorts run concurrently to N\n\
474 -u, --unique with -c, check for strict ordering;\n\
475 without -c, output only the first of an equal run\
479 -z, --zero-terminated end lines with 0 byte, not newline\n\
481 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
482 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
485 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
486 field number and C a character position in the field; both are origin 1, and\n\
487 the stop position defaults to the line's end. If neither -t nor -b is in\n\
488 effect, characters in a field are counted from the beginning of the preceding\n\
489 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
491 which override global ordering options for that key. If no key is given, use\n\
492 the entire line as the key.\n\
494 SIZE may be followed by the following multiplicative suffixes:\n\
497 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
499 With no FILE, or when FILE is -, read standard input.\n\
502 The locale specified by the environment affects sort order.\n\
503 Set LC_ALL=C to get the traditional sort order that uses\n\
504 native byte values.\n\
506 emit_ancillary_info ();
512 /* For long options that have no equivalent short option, use a
513 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
516 CHECK_OPTION
= CHAR_MAX
+ 1,
517 COMPRESS_PROGRAM_OPTION
,
518 DEBUG_PROGRAM_OPTION
,
521 RANDOM_SOURCE_OPTION
,
526 static char const short_options
[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
528 static struct option
const long_options
[] =
530 {"ignore-leading-blanks", no_argument
, NULL
, 'b'},
531 {"check", optional_argument
, NULL
, CHECK_OPTION
},
532 {"compress-program", required_argument
, NULL
, COMPRESS_PROGRAM_OPTION
},
533 {"debug", no_argument
, NULL
, DEBUG_PROGRAM_OPTION
},
534 {"dictionary-order", no_argument
, NULL
, 'd'},
535 {"ignore-case", no_argument
, NULL
, 'f'},
536 {"files0-from", required_argument
, NULL
, FILES0_FROM_OPTION
},
537 {"general-numeric-sort", no_argument
, NULL
, 'g'},
538 {"ignore-nonprinting", no_argument
, NULL
, 'i'},
539 {"key", required_argument
, NULL
, 'k'},
540 {"merge", no_argument
, NULL
, 'm'},
541 {"month-sort", no_argument
, NULL
, 'M'},
542 {"numeric-sort", no_argument
, NULL
, 'n'},
543 {"human-numeric-sort", no_argument
, NULL
, 'h'},
544 {"version-sort", no_argument
, NULL
, 'V'},
545 {"random-sort", no_argument
, NULL
, 'R'},
546 {"random-source", required_argument
, NULL
, RANDOM_SOURCE_OPTION
},
547 {"sort", required_argument
, NULL
, SORT_OPTION
},
548 {"output", required_argument
, NULL
, 'o'},
549 {"reverse", no_argument
, NULL
, 'r'},
550 {"stable", no_argument
, NULL
, 's'},
551 {"batch-size", required_argument
, NULL
, NMERGE_OPTION
},
552 {"buffer-size", required_argument
, NULL
, 'S'},
553 {"field-separator", required_argument
, NULL
, 't'},
554 {"temporary-directory", required_argument
, NULL
, 'T'},
555 {"unique", no_argument
, NULL
, 'u'},
556 {"zero-terminated", no_argument
, NULL
, 'z'},
557 {"parallel", required_argument
, NULL
, PARALLEL_OPTION
},
558 {GETOPT_HELP_OPTION_DECL
},
559 {GETOPT_VERSION_OPTION_DECL
},
563 #define CHECK_TABLE \
565 _ct_("silent", 'C') \
566 _ct_("diagnose-first", 'c')
568 static char const *const check_args
[] =
570 #define _ct_(_s, _c) _s,
574 static char const check_types
[] =
576 #define _ct_(_s, _c) _c,
582 _st_("general-numeric", 'g') \
583 _st_("human-numeric", 'h') \
585 _st_("numeric", 'n') \
586 _st_("random", 'R') \
589 static char const *const sort_args
[] =
591 #define _st_(_s, _c) _s,
595 static char const sort_types
[] =
597 #define _st_(_s, _c) _c,
602 /* The set of signals that are caught. */
603 static sigset_t caught_signals
;
605 /* Critical section status. */
612 /* Enter a critical section. */
613 static struct cs_status
616 struct cs_status status
;
617 status
.valid
= (sigprocmask (SIG_BLOCK
, &caught_signals
, &status
.sigs
) == 0);
621 /* Leave a critical section. */
623 cs_leave (struct cs_status status
)
627 /* Ignore failure when restoring the signal mask. */
628 sigprocmask (SIG_SETMASK
, &status
.sigs
, NULL
);
632 /* Possible states for a temp file. If compressed, the file's status
633 is unreaped or reaped, depending on whether 'sort' has waited for
634 the subprocess to finish. */
635 enum { UNCOMPRESSED
, UNREAPED
, REAPED
};
637 /* The list of temporary files. */
640 struct tempnode
*volatile next
;
641 pid_t pid
; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
643 char name
[1]; /* Actual size is 1 + file name length. */
645 static struct tempnode
*volatile temphead
;
646 static struct tempnode
*volatile *temptail
= &temphead
;
648 /* A file to be sorted. */
651 /* The file's name. */
654 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
655 struct tempnode
*temp
;
658 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
659 static Hash_table
*proctab
;
661 enum { INIT_PROCTAB_SIZE
= 47 };
664 proctab_hasher (void const *entry
, size_t tabsize
)
666 struct tempnode
const *node
= entry
;
667 return node
->pid
% tabsize
;
671 proctab_comparator (void const *e1
, void const *e2
)
673 struct tempnode
const *n1
= e1
;
674 struct tempnode
const *n2
= e2
;
675 return n1
->pid
== n2
->pid
;
678 /* The number of unreaped child processes. */
681 static bool delete_proc (pid_t
);
683 /* If PID is positive, wait for the child process with that PID to
684 exit, and assume that PID has already been removed from the process
685 table. If PID is 0 or -1, clean up some child that has exited (by
686 waiting for it, and removing it from the proc table) and return the
687 child's process ID. However, if PID is 0 and no children have
688 exited, return 0 without waiting. */
694 pid_t cpid
= waitpid ((pid
? pid
: -1), &status
, (pid
? 0 : WNOHANG
));
697 error (SORT_FAILURE
, errno
, _("waiting for %s [-d]"),
699 else if (0 < cpid
&& (0 < pid
|| delete_proc (cpid
)))
701 if (! WIFEXITED (status
) || WEXITSTATUS (status
))
702 error (SORT_FAILURE
, 0, _("%s [-d] terminated abnormally"),
710 /* TEMP represents a new process; add it to the process table. Create
711 the process table the first time it's called. */
714 register_proc (struct tempnode
*temp
)
718 proctab
= hash_initialize (INIT_PROCTAB_SIZE
, NULL
,
726 temp
->state
= UNREAPED
;
728 if (! hash_insert (proctab
, temp
))
732 /* If PID is in the process table, remove it and return true.
733 Otherwise, return false. */
736 delete_proc (pid_t pid
)
738 struct tempnode test
;
741 struct tempnode
*node
= hash_delete (proctab
, &test
);
744 node
->state
= REAPED
;
748 /* Remove PID from the process table, and wait for it to exit if it
752 wait_proc (pid_t pid
)
754 if (delete_proc (pid
))
758 /* Reap any exited children. Do not block; reap only those that have
764 while (0 < nprocs
&& reap (0))
768 /* Reap at least one exited child, waiting if necessary. */
777 /* Reap all children, waiting if necessary. */
786 /* Clean up any remaining temporary files. */
791 struct tempnode
const *node
;
793 for (node
= temphead
; node
; node
= node
->next
)
798 /* Cleanup actions to take when exiting. */
805 /* Clean up any remaining temporary files in a critical section so
806 that a signal handler does not try to clean them too. */
807 struct cs_status cs
= cs_enter ();
815 /* Create a new temporary file, returning its newly allocated tempnode.
816 Store into *PFD the file descriptor open for writing.
817 If the creation fails, return NULL and store -1 into *PFD if the
818 failure is due to file descriptor exhaustion and
819 SURVIVE_FD_EXHAUSTION; otherwise, die. */
821 static struct tempnode
*
822 create_temp_file (int *pfd
, bool survive_fd_exhaustion
)
824 static char const slashbase
[] = "/sortXXXXXX";
825 static size_t temp_dir_index
;
828 char const *temp_dir
= temp_dirs
[temp_dir_index
];
829 size_t len
= strlen (temp_dir
);
830 struct tempnode
*node
=
831 xmalloc (offsetof (struct tempnode
, name
) + len
+ sizeof slashbase
);
832 char *file
= node
->name
;
835 memcpy (file
, temp_dir
, len
);
836 memcpy (file
+ len
, slashbase
, sizeof slashbase
);
838 if (++temp_dir_index
== temp_dir_count
)
841 /* Create the temporary file in a critical section, to avoid races. */
847 temptail
= &node
->next
;
855 if (! (survive_fd_exhaustion
&& errno
== EMFILE
))
856 error (SORT_FAILURE
, errno
, _("cannot create temporary file in %s"),
866 /* Return a stream for FILE, opened with mode HOW. A null FILE means
867 standard output; HOW should be "w". When opening for input, "-"
868 means standard input. To avoid confusion, do not return file
869 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
870 opening an ordinary FILE. Return NULL if unsuccessful.
872 fadvise() is used to specify an access pattern for input files.
873 There are a few hints we could possibly provide,
874 and after careful testing it was decided that
875 specifying POSIX_FADV_SEQUENTIAL was not detrimental
876 to any cases. On Linux 2.6.31, this option doubles
877 the size of read ahead performed and thus was seen to
880 Sorting with a smaller internal buffer
881 Reading from faster flash devices
883 In _addition_ one could also specify other hints...
885 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
886 at least uses that to _synchronously_ prepopulate the cache
887 with the specified range. While sort does need to
888 read all of its input before outputting, a synchronous
889 read of the whole file up front precludes any processing
890 that sort could do in parallel with the system doing
891 read ahead of the data. This was seen to have negative effects
892 in a couple of cases:
894 Sorting with a smaller internal buffer
895 Note this option was seen to shorten the runtime for sort
896 on a multicore system with lots of RAM and other processes
897 competing for CPU. It could be argued that more explicit
898 scheduling hints with 'nice' et. al. are more appropriate
901 POSIX_FADV_NOREUSE is a possibility as it could lower
902 the priority of input data in the cache as sort will
903 only need to process it once. However its functionality
904 has changed over Linux kernel versions and as of 2.6.31
905 it does nothing and thus we can't depend on what it might
908 POSIX_FADV_DONTNEED is not appropriate for user specified
909 input files, but for temp files we do want to drop the
910 cache immediately after processing. This is done implicitly
911 however when the files are unlinked. */
914 stream_open (char const *file
, char const *how
)
920 if (STREQ (file
, "-"))
922 have_read_stdin
= true;
926 fp
= fopen (file
, how
);
927 fadvise (fp
, FADVISE_SEQUENTIAL
);
929 else if (*how
== 'w')
931 if (file
&& ftruncate (STDOUT_FILENO
, 0) != 0)
932 error (SORT_FAILURE
, errno
, _("%s: error truncating"),
937 assert (!"unexpected mode passed to stream_open");
942 /* Same as stream_open, except always return a non-null value; die on
946 xfopen (char const *file
, char const *how
)
948 FILE *fp
= stream_open (file
, how
);
950 die (_("open failed"), file
);
954 /* Close FP, whose name is FILE, and report any errors. */
957 xfclose (FILE *fp
, char const *file
)
962 /* Allow reading stdin from tty more than once. */
968 /* Don't close stdout just yet. close_stdout does that. */
969 if (fflush (fp
) != 0)
970 die (_("fflush failed"), file
);
974 if (fclose (fp
) != 0)
975 die (_("close failed"), file
);
981 move_fd_or_die (int oldfd
, int newfd
)
985 if (dup2 (oldfd
, newfd
) < 0)
986 error (SORT_FAILURE
, errno
, _("dup2 failed"));
991 /* Fork a child process for piping to and do common cleanup. The
992 TRIES parameter tells us how many times to try to fork before
993 giving up. Return the PID of the child, or -1 (setting errno)
997 pipe_fork (int pipefds
[2], size_t tries
)
999 #if HAVE_WORKING_FORK
1000 struct tempnode
*saved_temphead
;
1002 double wait_retry
= 0.25;
1003 pid_t pid
IF_LINT ( = -1);
1004 struct cs_status cs
;
1006 if (pipe (pipefds
) < 0)
1009 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1010 uncontrolled subprocess generation can hurt performance significantly.
1011 Allow at most NMERGE + 2 subprocesses, on the theory that there
1012 may be some useful parallelism by letting compression for the
1013 previous merge finish (1 subprocess) in parallel with the current
1014 merge (NMERGE + 1 subprocesses). */
1016 if (nmerge
+ 1 < nprocs
)
1021 /* This is so the child process won't delete our temp files
1022 if it receives a signal before exec-ing. */
1024 saved_temphead
= temphead
;
1028 saved_errno
= errno
;
1030 temphead
= saved_temphead
;
1033 errno
= saved_errno
;
1035 if (0 <= pid
|| errno
!= EAGAIN
)
1039 xnanosleep (wait_retry
);
1047 saved_errno
= errno
;
1050 errno
= saved_errno
;
1054 close (STDIN_FILENO
);
1055 close (STDOUT_FILENO
);
1062 #else /* ! HAVE_WORKING_FORK */
1067 /* Create a temporary file and, if asked for, start a compressor
1068 to that file. Set *PFP to the file handle and return
1069 the address of the new temp node. If the creation
1070 fails, return NULL if the failure is due to file descriptor
1071 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1073 static struct tempnode
*
1074 maybe_create_temp (FILE **pfp
, bool survive_fd_exhaustion
)
1077 struct tempnode
*node
= create_temp_file (&tempfd
, survive_fd_exhaustion
);
1081 node
->state
= UNCOMPRESSED
;
1083 if (compress_program
)
1087 node
->pid
= pipe_fork (pipefds
, MAX_FORK_TRIES_COMPRESS
);
1092 tempfd
= pipefds
[1];
1094 register_proc (node
);
1096 else if (node
->pid
== 0)
1099 move_fd_or_die (tempfd
, STDOUT_FILENO
);
1100 move_fd_or_die (pipefds
[0], STDIN_FILENO
);
1102 if (execlp (compress_program
, compress_program
, (char *) NULL
) < 0)
1103 error (SORT_FAILURE
, errno
, _("couldn't execute %s"),
1108 *pfp
= fdopen (tempfd
, "w");
1110 die (_("couldn't create temporary file"), node
->name
);
1115 /* Create a temporary file and, if asked for, start a compressor
1116 to that file. Set *PFP to the file handle and return the address
1117 of the new temp node. Die on failure. */
1119 static struct tempnode
*
1120 create_temp (FILE **pfp
)
1122 return maybe_create_temp (pfp
, false);
1125 /* Open a compressed temp file and start a decompression process through
1126 which to filter the input. Return NULL (setting errno to
1127 EMFILE) if we ran out of file descriptors, and die on any other
1131 open_temp (struct tempnode
*temp
)
1133 int tempfd
, pipefds
[2];
1136 if (temp
->state
== UNREAPED
)
1137 wait_proc (temp
->pid
);
1139 tempfd
= open (temp
->name
, O_RDONLY
);
1143 pid_t child
= pipe_fork (pipefds
, MAX_FORK_TRIES_DECOMPRESS
);
1148 if (errno
!= EMFILE
)
1149 error (SORT_FAILURE
, errno
, _("couldn't create process for %s -d"),
1157 move_fd_or_die (tempfd
, STDIN_FILENO
);
1158 move_fd_or_die (pipefds
[1], STDOUT_FILENO
);
1160 execlp (compress_program
, compress_program
, "-d", (char *) NULL
);
1161 error (SORT_FAILURE
, errno
, _("couldn't execute %s -d"),
1166 register_proc (temp
);
1170 fp
= fdopen (pipefds
[0], "r");
1173 int saved_errno
= errno
;
1175 errno
= saved_errno
;
1183 /* Append DIR to the array of temporary directory names. */
1185 add_temp_dir (char const *dir
)
1187 if (temp_dir_count
== temp_dir_alloc
)
1188 temp_dirs
= X2NREALLOC (temp_dirs
, &temp_dir_alloc
);
1190 temp_dirs
[temp_dir_count
++] = dir
;
1193 /* Remove NAME from the list of temporary files. */
1196 zaptemp (char const *name
)
1198 struct tempnode
*volatile *pnode
;
1199 struct tempnode
*node
;
1200 struct tempnode
*next
;
1202 int unlink_errno
= 0;
1203 struct cs_status cs
;
1205 for (pnode
= &temphead
; (node
= *pnode
)->name
!= name
; pnode
= &node
->next
)
1208 if (node
->state
== UNREAPED
)
1209 wait_proc (node
->pid
);
1211 /* Unlink the temporary file in a critical section to avoid races. */
1214 unlink_status
= unlink (name
);
1215 unlink_errno
= errno
;
1219 if (unlink_status
!= 0)
1220 error (0, unlink_errno
, _("warning: cannot remove: %s"), name
);
1226 #if HAVE_NL_LANGINFO
1229 struct_month_cmp (void const *m1
, void const *m2
)
1231 struct month
const *month1
= m1
;
1232 struct month
const *month2
= m2
;
1233 return strcmp (month1
->name
, month2
->name
);
1238 /* Initialize the character class tables. */
1245 for (i
= 0; i
< UCHAR_LIM
; ++i
)
1247 blanks
[i
] = !! isblank (i
);
1248 nonprinting
[i
] = ! isprint (i
);
1249 nondictionary
[i
] = ! isalnum (i
) && ! isblank (i
);
1250 fold_toupper
[i
] = toupper (i
);
1253 #if HAVE_NL_LANGINFO
1254 /* If we're not in the "C" locale, read different names for months. */
1257 for (i
= 0; i
< MONTHS_PER_YEAR
; i
++)
1264 s
= nl_langinfo (ABMON_1
+ i
);
1266 monthtab
[i
].name
= name
= xmalloc (s_len
+ 1);
1267 monthtab
[i
].val
= i
+ 1;
1269 for (j
= k
= 0; j
< s_len
; j
++)
1270 if (! isblank (to_uchar (s
[j
])))
1271 name
[k
++] = fold_toupper
[to_uchar (s
[j
])];
1274 qsort (monthtab
, MONTHS_PER_YEAR
, sizeof *monthtab
, struct_month_cmp
);
1279 /* Specify how many inputs may be merged at once.
1280 This may be set on the command-line with the
1281 --batch-size option. */
1283 specify_nmerge (int oi
, char c
, char const *s
)
1286 struct rlimit rlimit
;
1287 enum strtol_error e
= xstrtoumax (s
, NULL
, 10, &n
, NULL
);
1289 /* Try to find out how many file descriptors we'll be able
1290 to open. We need at least nmerge + 3 (STDIN_FILENO,
1291 STDOUT_FILENO and STDERR_FILENO). */
1292 unsigned int max_nmerge
= ((getrlimit (RLIMIT_NOFILE
, &rlimit
) == 0
1297 if (e
== LONGINT_OK
)
1301 e
= LONGINT_OVERFLOW
;
1306 error (0, 0, _("invalid --%s argument %s"),
1307 long_options
[oi
].name
, quote (s
));
1308 error (SORT_FAILURE
, 0,
1309 _("minimum --%s argument is %s"),
1310 long_options
[oi
].name
, quote ("2"));
1312 else if (max_nmerge
< nmerge
)
1314 e
= LONGINT_OVERFLOW
;
1321 if (e
== LONGINT_OVERFLOW
)
1323 char max_nmerge_buf
[INT_BUFSIZE_BOUND (max_nmerge
)];
1324 error (0, 0, _("--%s argument %s too large"),
1325 long_options
[oi
].name
, quote (s
));
1326 error (SORT_FAILURE
, 0,
1327 _("maximum --%s argument with current rlimit is %s"),
1328 long_options
[oi
].name
,
1329 uinttostr (max_nmerge
, max_nmerge_buf
));
1332 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1335 /* Specify the amount of main memory to use when sorting. */
1337 specify_sort_size (int oi
, char c
, char const *s
)
1341 enum strtol_error e
= xstrtoumax (s
, &suffix
, 10, &n
, "EgGkKmMPtTYZ");
1343 /* The default unit is KiB. */
1344 if (e
== LONGINT_OK
&& ISDIGIT (suffix
[-1]))
1346 if (n
<= UINTMAX_MAX
/ 1024)
1349 e
= LONGINT_OVERFLOW
;
1352 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1353 if (e
== LONGINT_INVALID_SUFFIX_CHAR
&& ISDIGIT (suffix
[-1]) && ! suffix
[1])
1362 double mem
= physmem_total () * n
/ 100;
1364 /* Use "<", not "<=", to avoid problems with rounding. */
1365 if (mem
< UINTMAX_MAX
)
1371 e
= LONGINT_OVERFLOW
;
1376 if (e
== LONGINT_OK
)
1378 /* If multiple sort sizes are specified, take the maximum, so
1379 that option order does not matter. */
1386 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
1390 e
= LONGINT_OVERFLOW
;
1393 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1396 /* Specify the number of threads to spawn during internal sort. */
1398 specify_nthreads (int oi
, char c
, char const *s
)
1400 unsigned long int nthreads
;
1401 enum strtol_error e
= xstrtoul (s
, NULL
, 10, &nthreads
, "");
1402 if (e
== LONGINT_OVERFLOW
)
1404 if (e
!= LONGINT_OK
)
1405 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1406 if (SIZE_MAX
< nthreads
)
1407 nthreads
= SIZE_MAX
;
1409 error (SORT_FAILURE
, 0, _("number in parallel must be nonzero"));
1413 /* Return the default sort size. */
1415 default_sort_size (void)
1417 /* Let SIZE be MEM, but no more than the maximum object size,
1418 total memory, or system resource limits. Don't bother to check
1419 for values like RLIM_INFINITY since in practice they are not much
1420 less than SIZE_MAX. */
1421 size_t size
= SIZE_MAX
;
1422 struct rlimit rlimit
;
1423 if (getrlimit (RLIMIT_DATA
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1424 size
= rlimit
.rlim_cur
;
1426 if (getrlimit (RLIMIT_AS
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1427 size
= rlimit
.rlim_cur
;
1430 /* Leave a large safety margin for the above limits, as failure can
1431 occur when they are exceeded. */
1435 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1436 Exceeding RSS is not fatal, but can be quite slow. */
1437 if (getrlimit (RLIMIT_RSS
, &rlimit
) == 0 && rlimit
.rlim_cur
/ 16 * 15 < size
)
1438 size
= rlimit
.rlim_cur
/ 16 * 15;
1441 /* Let MEM be available memory or 1/8 of total memory, whichever
1443 double avail
= physmem_available ();
1444 double total
= physmem_total ();
1445 double mem
= MAX (avail
, total
/ 8);
1447 /* Leave a 1/4 margin for physical memory. */
1448 if (total
* 0.75 < size
)
1449 size
= total
* 0.75;
1451 /* Return the minimum of MEM and SIZE, but no less than
1452 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1453 right when only one argument is floating point. */
1456 return MAX (size
, MIN_SORT_SIZE
);
1459 /* Return the sort buffer size to use with the input files identified
1460 by FPS and FILES, which are alternate names of the same files.
1461 NFILES gives the number of input files; NFPS may be less. Assume
1462 that each input line requires LINE_BYTES extra bytes' worth of line
1463 information. Do not exceed the size bound specified by the user
1464 (or a default size bound, if the user does not specify one). */
1467 sort_buffer_size (FILE *const *fps
, size_t nfps
,
1468 char *const *files
, size_t nfiles
,
1471 /* A bound on the input size. If zero, the bound hasn't been
1473 static size_t size_bound
;
1475 /* In the worst case, each input byte is a newline. */
1476 size_t worst_case_per_input_byte
= line_bytes
+ 1;
1478 /* Keep enough room for one extra input line and an extra byte.
1479 This extra room might be needed when preparing to read EOF. */
1480 size_t size
= worst_case_per_input_byte
+ 1;
1484 for (i
= 0; i
< nfiles
; i
++)
1490 if ((i
< nfps
? fstat (fileno (fps
[i
]), &st
)
1491 : STREQ (files
[i
], "-") ? fstat (STDIN_FILENO
, &st
)
1492 : stat (files
[i
], &st
))
1494 die (_("stat failed"), files
[i
]);
1496 if (S_ISREG (st
.st_mode
))
1497 file_size
= st
.st_size
;
1500 /* The file has unknown size. If the user specified a sort
1501 buffer size, use that; otherwise, guess the size. */
1504 file_size
= INPUT_FILE_SIZE_GUESS
;
1509 size_bound
= sort_size
;
1511 size_bound
= default_sort_size ();
1514 /* Add the amount of memory needed to represent the worst case
1515 where the input consists entirely of newlines followed by a
1516 single non-newline. Check for overflow. */
1517 worst_case
= file_size
* worst_case_per_input_byte
+ 1;
1518 if (file_size
!= worst_case
/ worst_case_per_input_byte
1519 || size_bound
- size
<= worst_case
)
1527 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1528 must be at least sizeof (struct line). Allocate ALLOC bytes
1532 initbuf (struct buffer
*buf
, size_t line_bytes
, size_t alloc
)
1534 /* Ensure that the line array is properly aligned. If the desired
1535 size cannot be allocated, repeatedly halve it until allocation
1536 succeeds. The smaller allocation may hurt overall performance,
1537 but that's better than failing. */
1540 alloc
+= sizeof (struct line
) - alloc
% sizeof (struct line
);
1541 buf
->buf
= malloc (alloc
);
1545 if (alloc
<= line_bytes
+ 1)
1549 buf
->line_bytes
= line_bytes
;
1551 buf
->used
= buf
->left
= buf
->nlines
= 0;
1555 /* Return one past the limit of the line array. */
1557 static inline struct line
*
1558 buffer_linelim (struct buffer
const *buf
)
1560 void *linelim
= buf
->buf
+ buf
->alloc
;
1564 /* Return a pointer to the first character of the field specified
1568 begfield (struct line
const *line
, struct keyfield
const *key
)
1570 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1571 size_t sword
= key
->sword
;
1572 size_t schar
= key
->schar
;
1574 /* The leading field separator itself is included in a field when -t
1577 if (tab
!= TAB_DEFAULT
)
1578 while (ptr
< lim
&& sword
--)
1580 while (ptr
< lim
&& *ptr
!= tab
)
1586 while (ptr
< lim
&& sword
--)
1588 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1590 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1594 /* If we're ignoring leading blanks when computing the Start
1595 of the field, skip past them here. */
1596 if (key
->skipsblanks
)
1597 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1600 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1601 ptr
= MIN (lim
, ptr
+ schar
);
1606 /* Return the limit of (a pointer to the first character after) the field
1607 in LINE specified by KEY. */
1610 limfield (struct line
const *line
, struct keyfield
const *key
)
1612 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1613 size_t eword
= key
->eword
, echar
= key
->echar
;
1616 eword
++; /* Skip all of end field. */
1618 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1619 whichever comes first. If there are more than EWORD fields, leave
1620 PTR pointing at the beginning of the field having zero-based index,
1621 EWORD. If a delimiter character was specified (via -t), then that
1622 'beginning' is the first character following the delimiting TAB.
1623 Otherwise, leave PTR pointing at the first 'blank' character after
1624 the preceding field. */
1625 if (tab
!= TAB_DEFAULT
)
1626 while (ptr
< lim
&& eword
--)
1628 while (ptr
< lim
&& *ptr
!= tab
)
1630 if (ptr
< lim
&& (eword
|| echar
))
1634 while (ptr
< lim
&& eword
--)
1636 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1638 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1642 #ifdef POSIX_UNSPECIFIED
1643 /* The following block of code makes GNU sort incompatible with
1644 standard Unix sort, so it's ifdef'd out for now.
1645 The POSIX spec isn't clear on how to interpret this.
1646 FIXME: request clarification.
1648 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1649 Date: Thu, 30 May 96 12:20:41 -0400
1650 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1652 [...]I believe I've found another bug in 'sort'.
1657 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1660 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1664 Unix sort produced the answer I expected: sort on the single character
1665 in column 7. GNU sort produced different results, because it disagrees
1666 on the interpretation of the key-end spec "M.N". Unix sort reads this
1667 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1668 "skip M-1 fields, then either N-1 characters or the rest of the current
1669 field, whichever comes first". This extra clause applies only to
1670 key-ends, not key-starts.
1673 /* Make LIM point to the end of (one byte past) the current field. */
1674 if (tab
!= TAB_DEFAULT
)
1677 newlim
= memchr (ptr
, tab
, lim
- ptr
);
1685 while (newlim
< lim
&& blanks
[to_uchar (*newlim
)])
1687 while (newlim
< lim
&& !blanks
[to_uchar (*newlim
)])
1693 if (echar
!= 0) /* We need to skip over a portion of the end field. */
1695 /* If we're ignoring leading blanks when computing the End
1696 of the field, skip past them here. */
1697 if (key
->skipeblanks
)
1698 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1701 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1702 ptr
= MIN (lim
, ptr
+ echar
);
1708 /* Fill BUF reading from FP, moving buf->left bytes from the end
1709 of buf->buf to the beginning first. If EOF is reached and the
1710 file wasn't terminated by a newline, supply one. Set up BUF's line
1711 table too. FILE is the name of the file corresponding to FP.
1712 Return true if some input was read. */
1715 fillbuf (struct buffer
*buf
, FILE *fp
, char const *file
)
1717 struct keyfield
const *key
= keylist
;
1719 size_t line_bytes
= buf
->line_bytes
;
1720 size_t mergesize
= merge_buffer_size
- MIN_MERGE_BUFFER_SIZE
;
1725 if (buf
->used
!= buf
->left
)
1727 memmove (buf
->buf
, buf
->buf
+ buf
->used
- buf
->left
, buf
->left
);
1728 buf
->used
= buf
->left
;
1734 char *ptr
= buf
->buf
+ buf
->used
;
1735 struct line
*linelim
= buffer_linelim (buf
);
1736 struct line
*line
= linelim
- buf
->nlines
;
1737 size_t avail
= (char *) linelim
- buf
->nlines
* line_bytes
- ptr
;
1738 char *line_start
= buf
->nlines
? line
->text
+ line
->length
: buf
->buf
;
1740 while (line_bytes
+ 1 < avail
)
1742 /* Read as many bytes as possible, but do not read so many
1743 bytes that there might not be enough room for the
1744 corresponding line array. The worst case is when the
1745 rest of the input file consists entirely of newlines,
1746 except that the last byte is not a newline. */
1747 size_t readsize
= (avail
- 1) / (line_bytes
+ 1);
1748 size_t bytes_read
= fread (ptr
, 1, readsize
, fp
);
1749 char *ptrlim
= ptr
+ bytes_read
;
1751 avail
-= bytes_read
;
1753 if (bytes_read
!= readsize
)
1756 die (_("read failed"), file
);
1760 if (buf
->buf
== ptrlim
)
1762 if (line_start
!= ptrlim
&& ptrlim
[-1] != eol
)
1767 /* Find and record each line in the just-read input. */
1768 while ((p
= memchr (ptr
, eol
, ptrlim
- ptr
)))
1770 /* Delimit the line with NUL. This eliminates the need to
1771 temporarily replace the last byte with NUL when calling
1772 xmemcoll(), which increases performance. */
1776 line
->text
= line_start
;
1777 line
->length
= ptr
- line_start
;
1778 mergesize
= MAX (mergesize
, line
->length
);
1779 avail
-= line_bytes
;
1783 /* Precompute the position of the first key for
1785 line
->keylim
= (key
->eword
== SIZE_MAX
1787 : limfield (line
, key
));
1789 if (key
->sword
!= SIZE_MAX
)
1790 line
->keybeg
= begfield (line
, key
);
1793 if (key
->skipsblanks
)
1794 while (blanks
[to_uchar (*line_start
)])
1796 line
->keybeg
= line_start
;
1808 buf
->used
= ptr
- buf
->buf
;
1809 buf
->nlines
= buffer_linelim (buf
) - line
;
1810 if (buf
->nlines
!= 0)
1812 buf
->left
= ptr
- line_start
;
1813 merge_buffer_size
= mergesize
+ MIN_MERGE_BUFFER_SIZE
;
1818 /* The current input line is too long to fit in the buffer.
1819 Increase the buffer size and try again, keeping it properly
1821 size_t line_alloc
= buf
->alloc
/ sizeof (struct line
);
1822 buf
->buf
= x2nrealloc (buf
->buf
, &line_alloc
, sizeof (struct line
));
1823 buf
->alloc
= line_alloc
* sizeof (struct line
);
1828 /* Table that maps characters to order-of-magnitude values. */
1829 static char const unit_order
[UCHAR_LIM
] =
1831 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1832 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1833 /* This initializer syntax works on all C99 hosts. For now, use
1834 it only on non-ASCII hosts, to ease the pain of porting to
1835 pre-C99 ASCII hosts. */
1836 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1839 /* Generate the following table with this command:
1840 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1841 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1843 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1844 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1845 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1846 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1847 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1848 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1849 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1851 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1852 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1853 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1857 /* Return an integer that represents the order of magnitude of the
1858 unit following the number. The number may contain thousands
1859 separators and a decimal point, but it may not contain leading blanks.
1860 Negative numbers get negative orders; zero numbers have a zero order. */
1862 static int _GL_ATTRIBUTE_PURE
1863 find_unit_order (char const *number
)
1865 bool minus_sign
= (*number
== '-');
1866 char const *p
= number
+ minus_sign
;
1870 /* Scan to end of number.
1871 Decimals or separators not followed by digits stop the scan.
1872 Numbers ending in decimals or separators are thus considered
1873 to be lacking in units.
1874 FIXME: add support for multibyte thousands_sep and decimal_point. */
1878 while (ISDIGIT (ch
= *p
++))
1879 nonzero
|= ch
- '0';
1881 while (ch
== thousands_sep
);
1883 if (ch
== decimal_point
)
1884 while (ISDIGIT (ch
= *p
++))
1885 nonzero
|= ch
- '0';
1889 int order
= unit_order
[ch
];
1890 return (minus_sign
? -order
: order
);
1896 /* Compare numbers A and B ending in units with SI or IEC prefixes
1897 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1900 human_numcompare (char const *a
, char const *b
)
1902 while (blanks
[to_uchar (*a
)])
1904 while (blanks
[to_uchar (*b
)])
1907 int diff
= find_unit_order (a
) - find_unit_order (b
);
1908 return (diff
? diff
: strnumcmp (a
, b
, decimal_point
, thousands_sep
));
1911 /* Compare strings A and B as numbers without explicitly converting them to
1912 machine numbers. Comparatively slow for short strings, but asymptotically
1916 numcompare (char const *a
, char const *b
)
1918 while (blanks
[to_uchar (*a
)])
1920 while (blanks
[to_uchar (*b
)])
1923 return strnumcmp (a
, b
, decimal_point
, thousands_sep
);
1926 /* Work around a problem whereby the long double value returned by glibc's
1927 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1928 A and B before calling strtold. FIXME: remove this function once
1929 gnulib guarantees that strtold's result is always well defined. */
1931 nan_compare (char const *sa
, char const *sb
)
1934 memset (&a
, 0, sizeof a
);
1935 a
= strtold (sa
, NULL
);
1938 memset (&b
, 0, sizeof b
);
1939 b
= strtold (sb
, NULL
);
1941 return memcmp (&a
, &b
, sizeof a
);
1945 general_numcompare (char const *sa
, char const *sb
)
1947 /* FIXME: maybe add option to try expensive FP conversion
1948 only if A and B can't be compared more cheaply/accurately. */
1952 long_double a
= strtold (sa
, &ea
);
1953 long_double b
= strtold (sb
, &eb
);
1955 /* Put conversion errors at the start of the collating sequence. */
1957 return sb
== eb
? 0 : -1;
1961 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1962 conversion errors but before numbers; sort them by internal
1963 bit-pattern, for lack of a more portable alternative. */
1969 : nan_compare (sa
, sb
));
1972 /* Return an integer in 1..12 of the month name MONTH.
1973 Return 0 if the name in S is not recognized. */
1976 getmonth (char const *month
, char **ea
)
1979 size_t hi
= MONTHS_PER_YEAR
;
1981 while (blanks
[to_uchar (*month
)])
1986 size_t ix
= (lo
+ hi
) / 2;
1987 char const *m
= month
;
1988 char const *n
= monthtab
[ix
].name
;
1996 return monthtab
[ix
].val
;
1998 if (to_uchar (fold_toupper
[to_uchar (*m
)]) < to_uchar (*n
))
2003 else if (to_uchar (fold_toupper
[to_uchar (*m
)]) > to_uchar (*n
))
2015 /* A randomly chosen MD5 state, used for random comparison. */
2016 static struct md5_ctx random_md5_state
;
2018 /* Initialize the randomly chosen MD5 state. */
2021 random_md5_state_init (char const *random_source
)
2023 unsigned char buf
[MD5_DIGEST_SIZE
];
2024 struct randread_source
*r
= randread_new (random_source
, sizeof buf
);
2026 die (_("open failed"), random_source
);
2027 randread (r
, buf
, sizeof buf
);
2028 if (randread_free (r
) != 0)
2029 die (_("close failed"), random_source
);
2030 md5_init_ctx (&random_md5_state
);
2031 md5_process_bytes (buf
, sizeof buf
, &random_md5_state
);
2034 /* This is like strxfrm, except it reports any error and exits. */
2037 xstrxfrm (char *restrict dest
, char const *restrict src
, size_t destsize
)
2040 size_t translated_size
= strxfrm (dest
, src
, destsize
);
2044 error (0, errno
, _("string transformation failed"));
2045 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2046 error (SORT_FAILURE
, 0,
2047 _("the untransformed string was %s"),
2048 quotearg_n_style (0, locale_quoting_style
, src
));
2051 return translated_size
;
2054 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2055 using one or more random hash functions. TEXTA[LENA] and
2056 TEXTB[LENB] must be zero. */
2059 compare_random (char *restrict texta
, size_t lena
,
2060 char *restrict textb
, size_t lenb
)
2062 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2063 data. This is used to break ties if there is a checksum
2064 collision, and this is good enough given the astronomically low
2065 probability of a collision. */
2068 char stackbuf
[4000];
2069 char *buf
= stackbuf
;
2070 size_t bufsize
= sizeof stackbuf
;
2071 void *allocated
= NULL
;
2072 uint32_t dig
[2][MD5_DIGEST_SIZE
/ sizeof (uint32_t)];
2073 struct md5_ctx s
[2];
2074 s
[0] = s
[1] = random_md5_state
;
2076 if (hard_LC_COLLATE
)
2078 char const *lima
= texta
+ lena
;
2079 char const *limb
= textb
+ lenb
;
2083 /* Transform the text into the basis of comparison, so that byte
2084 strings that would otherwise considered to be equal are
2085 considered equal here even if their bytes differ.
2087 Each time through this loop, transform one
2088 null-terminated string's worth from TEXTA or from TEXTB
2089 or both. That way, there's no need to store the
2090 transformation of the whole line, if it contains many
2091 null-terminated strings. */
2093 /* Store the transformed data into a big-enough buffer. */
2095 /* A 3X size guess avoids the overhead of calling strxfrm
2096 twice on typical implementations. Don't worry about
2097 size_t overflow, as the guess need not be correct. */
2098 size_t guess_bufsize
= 3 * (lena
+ lenb
) + 2;
2099 if (bufsize
< guess_bufsize
)
2101 bufsize
= MAX (guess_bufsize
, bufsize
* 3 / 2);
2103 buf
= allocated
= malloc (bufsize
);
2107 bufsize
= sizeof stackbuf
;
2112 (texta
< lima
? xstrxfrm (buf
, texta
, bufsize
) + 1 : 0);
2113 bool a_fits
= sizea
<= bufsize
;
2116 ? (xstrxfrm ((a_fits
? buf
+ sizea
: NULL
), textb
,
2117 (a_fits
? bufsize
- sizea
: 0))
2121 if (! (a_fits
&& sizea
+ sizeb
<= bufsize
))
2123 bufsize
= sizea
+ sizeb
;
2124 if (bufsize
< SIZE_MAX
/ 3)
2125 bufsize
= bufsize
* 3 / 2;
2127 buf
= allocated
= xmalloc (bufsize
);
2129 strxfrm (buf
, texta
, sizea
);
2131 strxfrm (buf
+ sizea
, textb
, sizeb
);
2134 /* Advance past NULs to the next part of each input string,
2135 exiting the loop if both strings are exhausted. When
2136 exiting the loop, prepare to finish off the tiebreaker
2137 comparison properly. */
2139 texta
+= strlen (texta
) + 1;
2141 textb
+= strlen (textb
) + 1;
2142 if (! (texta
< lima
|| textb
< limb
))
2144 lena
= sizea
; texta
= buf
;
2145 lenb
= sizeb
; textb
= buf
+ sizea
;
2149 /* Accumulate the transformed data in the corresponding
2151 md5_process_bytes (buf
, sizea
, &s
[0]);
2152 md5_process_bytes (buf
+ sizea
, sizeb
, &s
[1]);
2154 /* Update the tiebreaker comparison of the transformed data. */
2157 xfrm_diff
= memcmp (buf
, buf
+ sizea
, MIN (sizea
, sizeb
));
2159 xfrm_diff
= (sizea
> sizeb
) - (sizea
< sizeb
);
2164 /* Compute and compare the checksums. */
2165 md5_process_bytes (texta
, lena
, &s
[0]); md5_finish_ctx (&s
[0], dig
[0]);
2166 md5_process_bytes (textb
, lenb
, &s
[1]); md5_finish_ctx (&s
[1], dig
[1]);
2167 int diff
= memcmp (dig
[0], dig
[1], sizeof dig
[0]);
2169 /* Fall back on the tiebreaker if the checksums collide. */
2174 xfrm_diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
2176 xfrm_diff
= (lena
> lenb
) - (lena
< lenb
);
2187 /* Return the printable width of the block of memory starting at
2188 TEXT and ending just before LIM, counting each tab as one byte.
2189 FIXME: Should we generally be counting non printable chars? */
2192 debug_width (char const *text
, char const *lim
)
2194 size_t width
= mbsnwidth (text
, lim
- text
, 0);
2196 width
+= (*text
++ == '\t');
2200 /* For debug mode, "underline" a key at the
2201 specified offset and screen width. */
2204 mark_key (size_t offset
, size_t width
)
2210 printf (_("^ no match for key\n"));
2221 /* Return true if KEY is a numeric key. */
2224 key_numeric (struct keyfield
const *key
)
2226 return key
->numeric
|| key
->general_numeric
|| key
->human_numeric
;
2229 /* For LINE, output a debugging line that underlines KEY in LINE.
2230 If KEY is null, underline the whole line. */
2233 debug_key (struct line
const *line
, struct keyfield
const *key
)
2235 char *text
= line
->text
;
2237 char *lim
= text
+ line
->length
- 1;
2241 if (key
->sword
!= SIZE_MAX
)
2242 beg
= begfield (line
, key
);
2243 if (key
->eword
!= SIZE_MAX
)
2244 lim
= limfield (line
, key
);
2246 if (key
->skipsblanks
|| key
->month
|| key_numeric (key
))
2251 while (blanks
[to_uchar (*beg
)])
2254 char *tighter_lim
= beg
;
2258 else if (key
->month
)
2259 getmonth (beg
, &tighter_lim
);
2260 else if (key
->general_numeric
)
2261 ignore_value (strtold (beg
, &tighter_lim
));
2262 else if (key
->numeric
|| key
->human_numeric
)
2264 char *p
= beg
+ (beg
< lim
&& *beg
== '-');
2265 bool found_digit
= false;
2270 while (ISDIGIT (ch
= *p
++))
2273 while (ch
== thousands_sep
);
2275 if (ch
== decimal_point
)
2276 while (ISDIGIT (ch
= *p
++))
2280 tighter_lim
= p
- ! (key
->human_numeric
&& unit_order
[ch
]);
2290 size_t offset
= debug_width (text
, beg
);
2291 size_t width
= debug_width (beg
, lim
);
2292 mark_key (offset
, width
);
2295 /* Debug LINE by underlining its keys. */
2298 debug_line (struct line
const *line
)
2300 struct keyfield
const *key
= keylist
;
2303 debug_key (line
, key
);
2304 while (key
&& ((key
= key
->next
) || ! (unique
|| stable
)));
2307 /* Return whether sorting options specified for key. */
2310 default_key_compare (struct keyfield
const *key
)
2312 return ! (key
->ignore
2316 || key_numeric (key
)
2320 /* || key->reverse */
2324 /* Convert a key to the short options used to specify it. */
2327 key_to_opts (struct keyfield
const *key
, char *opts
)
2329 if (key
->skipsblanks
|| key
->skipeblanks
)
2330 *opts
++ = 'b';/* either disables global -b */
2331 if (key
->ignore
== nondictionary
)
2335 if (key
->general_numeric
)
2337 if (key
->human_numeric
)
2339 if (key
->ignore
== nonprinting
)
2354 /* Output data independent key warnings to stderr. */
2357 key_warnings (struct keyfield
const *gkey
, bool gkey_only
)
2359 struct keyfield
const *key
;
2360 struct keyfield ugkey
= *gkey
;
2361 unsigned long keynum
= 1;
2363 for (key
= keylist
; key
; key
= key
->next
, keynum
++)
2365 if (key
->obsolete_used
)
2367 size_t sword
= key
->sword
;
2368 size_t eword
= key
->eword
;
2369 char tmp
[INT_BUFSIZE_BOUND (uintmax_t)];
2370 /* obsolescent syntax +A.x -B.y is equivalent to:
2371 -k A+1.x+1,B.y (when y = 0)
2372 -k A+1.x+1,B+1.y (when y > 0) */
2373 char obuf
[INT_BUFSIZE_BOUND (sword
) * 2 + 4]; /* +# -# */
2374 char nbuf
[INT_BUFSIZE_BOUND (sword
) * 2 + 5]; /* -k #,# */
2378 if (sword
== SIZE_MAX
)
2381 po
= stpcpy (stpcpy (po
, "+"), umaxtostr (sword
, tmp
));
2382 pn
= stpcpy (stpcpy (pn
, "-k "), umaxtostr (sword
+ 1, tmp
));
2383 if (key
->eword
!= SIZE_MAX
)
2385 stpcpy (stpcpy (po
, " -"), umaxtostr (eword
+ 1, tmp
));
2386 stpcpy (stpcpy (pn
, ","),
2387 umaxtostr (eword
+ 1
2388 + (key
->echar
== SIZE_MAX
), tmp
));
2390 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2391 quote_n (0, obuf
), quote_n (1, nbuf
));
2394 /* Warn about field specs that will never match. */
2395 if (key
->sword
!= SIZE_MAX
&& key
->eword
< key
->sword
)
2396 error (0, 0, _("key %lu has zero width and will be ignored"), keynum
);
2398 /* Warn about significant leading blanks. */
2399 bool implicit_skip
= key_numeric (key
) || key
->month
;
2400 bool maybe_space_aligned
= !hard_LC_COLLATE
&& default_key_compare (key
)
2401 && !(key
->schar
|| key
->echar
);
2402 bool line_offset
= key
->eword
== 0 && key
->echar
!= 0; /* -k1.x,1.y */
2403 if (!gkey_only
&& tab
== TAB_DEFAULT
&& !line_offset
2404 && ((!key
->skipsblanks
&& !(implicit_skip
|| maybe_space_aligned
))
2405 || (!key
->skipsblanks
&& key
->schar
)
2406 || (!key
->skipeblanks
&& key
->echar
)))
2407 error (0, 0, _("leading blanks are significant in key %lu; "
2408 "consider also specifying 'b'"), keynum
);
2410 /* Warn about numeric comparisons spanning fields,
2411 as field delimiters could be interpreted as part
2412 of the number (maybe only in other locales). */
2413 if (!gkey_only
&& key_numeric (key
))
2415 size_t sword
= key
->sword
+ 1;
2416 size_t eword
= key
->eword
+ 1;
2419 if (!eword
|| sword
< eword
)
2420 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2424 /* Flag global options not copied or specified in any key. */
2425 if (ugkey
.ignore
&& (ugkey
.ignore
== key
->ignore
))
2426 ugkey
.ignore
= NULL
;
2427 if (ugkey
.translate
&& (ugkey
.translate
== key
->translate
))
2428 ugkey
.translate
= NULL
;
2429 ugkey
.skipsblanks
&= !key
->skipsblanks
;
2430 ugkey
.skipeblanks
&= !key
->skipeblanks
;
2431 ugkey
.month
&= !key
->month
;
2432 ugkey
.numeric
&= !key
->numeric
;
2433 ugkey
.general_numeric
&= !key
->general_numeric
;
2434 ugkey
.human_numeric
&= !key
->human_numeric
;
2435 ugkey
.random
&= !key
->random
;
2436 ugkey
.version
&= !key
->version
;
2437 ugkey
.reverse
&= !key
->reverse
;
2440 /* Warn about ignored global options flagged above.
2441 Note if gkey is the only one in the list, all flags are cleared. */
2442 if (!default_key_compare (&ugkey
)
2443 || (ugkey
.reverse
&& (stable
|| unique
) && keylist
))
2445 bool ugkey_reverse
= ugkey
.reverse
;
2446 if (!(stable
|| unique
))
2447 ugkey
.reverse
= false;
2448 /* The following is too big, but guaranteed to be "big enough". */
2449 char opts
[sizeof short_options
];
2450 key_to_opts (&ugkey
, opts
);
2452 ngettext ("option '-%s' is ignored",
2453 "options '-%s' are ignored",
2454 select_plural (strlen (opts
))), opts
);
2455 ugkey
.reverse
= ugkey_reverse
;
2457 if (ugkey
.reverse
&& !(stable
|| unique
) && keylist
)
2458 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2461 /* Compare two lines A and B trying every key in sequence until there
2462 are no more keys or a difference is found. */
2465 keycompare (struct line
const *a
, struct line
const *b
)
2467 struct keyfield
*key
= keylist
;
2469 /* For the first iteration only, the key positions have been
2470 precomputed for us. */
2471 char *texta
= a
->keybeg
;
2472 char *textb
= b
->keybeg
;
2473 char *lima
= a
->keylim
;
2474 char *limb
= b
->keylim
;
2480 char const *translate
= key
->translate
;
2481 bool const *ignore
= key
->ignore
;
2483 /* Treat field ends before field starts as empty fields. */
2484 lima
= MAX (texta
, lima
);
2485 limb
= MAX (textb
, limb
);
2487 /* Find the lengths. */
2488 size_t lena
= lima
- texta
;
2489 size_t lenb
= limb
- textb
;
2491 if (hard_LC_COLLATE
|| key_numeric (key
)
2492 || key
->month
|| key
->random
|| key
->version
)
2499 char enda
IF_LINT (= 0);
2500 char endb
IF_LINT (= 0);
2501 void *allocated
IF_LINT (= NULL
);
2502 char stackbuf
[4000];
2504 if (ignore
|| translate
)
2506 /* Compute with copies of the keys, which are the result of
2507 translating or ignoring characters, and which need their
2512 /* Allocate space for copies. */
2513 size_t size
= lena
+ 1 + lenb
+ 1;
2514 if (size
<= sizeof stackbuf
)
2515 ta
= stackbuf
, allocated
= NULL
;
2517 ta
= allocated
= xmalloc (size
);
2520 /* Put into each copy a version of the key in which the
2521 requested characters are ignored or translated. */
2522 for (tlena
= i
= 0; i
< lena
; i
++)
2523 if (! (ignore
&& ignore
[to_uchar (texta
[i
])]))
2524 ta
[tlena
++] = (translate
2525 ? translate
[to_uchar (texta
[i
])]
2529 for (tlenb
= i
= 0; i
< lenb
; i
++)
2530 if (! (ignore
&& ignore
[to_uchar (textb
[i
])]))
2531 tb
[tlenb
++] = (translate
2532 ? translate
[to_uchar (textb
[i
])]
2538 /* Use the keys in-place, temporarily null-terminated. */
2539 ta
= texta
; tlena
= lena
; enda
= ta
[tlena
]; ta
[tlena
] = '\0';
2540 tb
= textb
; tlenb
= lenb
; endb
= tb
[tlenb
]; tb
[tlenb
] = '\0';
2544 diff
= numcompare (ta
, tb
);
2545 else if (key
->general_numeric
)
2546 diff
= general_numcompare (ta
, tb
);
2547 else if (key
->human_numeric
)
2548 diff
= human_numcompare (ta
, tb
);
2549 else if (key
->month
)
2550 diff
= getmonth (ta
, NULL
) - getmonth (tb
, NULL
);
2551 else if (key
->random
)
2552 diff
= compare_random (ta
, tlena
, tb
, tlenb
);
2553 else if (key
->version
)
2554 diff
= filevercmp (ta
, tb
);
2557 /* Locale-dependent string sorting. This is slower than
2558 C-locale sorting, which is implemented below. */
2560 diff
= - NONZERO (tlenb
);
2561 else if (tlenb
== 0)
2564 diff
= xmemcoll0 (ta
, tlena
+ 1, tb
, tlenb
+ 1);
2567 if (ignore
|| translate
)
2577 #define CMP_WITH_IGNORE(A, B) \
2582 while (texta < lima && ignore[to_uchar (*texta)]) \
2584 while (textb < limb && ignore[to_uchar (*textb)]) \
2586 if (! (texta < lima && textb < limb)) \
2588 diff = to_uchar (A) - to_uchar (B); \
2595 diff = (texta < lima) - (textb < limb); \
2600 CMP_WITH_IGNORE (translate
[to_uchar (*texta
)],
2601 translate
[to_uchar (*textb
)]);
2603 CMP_WITH_IGNORE (*texta
, *textb
);
2606 diff
= - NONZERO (lenb
);
2613 while (texta
< lima
&& textb
< limb
)
2615 diff
= (to_uchar (translate
[to_uchar (*texta
++)])
2616 - to_uchar (translate
[to_uchar (*textb
++)]));
2623 diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
2627 diff
= lena
< lenb
? -1 : lena
!= lenb
;
2637 /* Find the beginning and limit of the next field. */
2638 if (key
->eword
!= SIZE_MAX
)
2639 lima
= limfield (a
, key
), limb
= limfield (b
, key
);
2641 lima
= a
->text
+ a
->length
- 1, limb
= b
->text
+ b
->length
- 1;
2643 if (key
->sword
!= SIZE_MAX
)
2644 texta
= begfield (a
, key
), textb
= begfield (b
, key
);
2647 texta
= a
->text
, textb
= b
->text
;
2648 if (key
->skipsblanks
)
2650 while (texta
< lima
&& blanks
[to_uchar (*texta
)])
2652 while (textb
< limb
&& blanks
[to_uchar (*textb
)])
2663 return key
->reverse
? -diff
: diff
;
2666 /* Compare two lines A and B, returning negative, zero, or positive
2667 depending on whether A compares less than, equal to, or greater than B. */
2670 compare (struct line
const *a
, struct line
const *b
)
2675 /* First try to compare on the specified keys (if any).
2676 The only two cases with no key at all are unadorned sort,
2677 and unadorned sort -r. */
2680 diff
= keycompare (a
, b
);
2681 if (diff
|| unique
|| stable
)
2685 /* If the keys all compare equal (or no keys were specified)
2686 fall through to the default comparison. */
2687 alen
= a
->length
- 1, blen
= b
->length
- 1;
2690 diff
= - NONZERO (blen
);
2693 else if (hard_LC_COLLATE
)
2695 /* Note xmemcoll0 is a performance enhancement as
2696 it will not unconditionally write '\0' after the
2697 passed in buffers, which was seen to give around
2698 a 3% increase in performance for short lines. */
2699 diff
= xmemcoll0 (a
->text
, alen
+ 1, b
->text
, blen
+ 1);
2701 else if (! (diff
= memcmp (a
->text
, b
->text
, MIN (alen
, blen
))))
2702 diff
= alen
< blen
? -1 : alen
!= blen
;
2704 return reverse
? -diff
: diff
;
2707 /* Write LINE to output stream FP; the output file's name is
2708 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2709 otherwise. If debugging is enabled and FP is standard output,
2710 append some debugging information. */
2713 write_line (struct line
const *line
, FILE *fp
, char const *output_file
)
2715 char *buf
= line
->text
;
2716 size_t n_bytes
= line
->length
;
2717 char *ebuf
= buf
+ n_bytes
;
2719 if (!output_file
&& debug
)
2721 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2722 char const *c
= buf
;
2731 if (fputc (wc
, fp
) == EOF
)
2732 die (_("write failed"), output_file
);
2740 if (fwrite (buf
, 1, n_bytes
, fp
) != n_bytes
)
2741 die (_("write failed"), output_file
);
2746 /* Check that the lines read from FILE_NAME come in order. Return
2747 true if they are in order. If CHECKONLY == 'c', also print a
2748 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2749 they are not in order. */
2752 check (char const *file_name
, char checkonly
)
2754 FILE *fp
= xfopen (file_name
, "r");
2755 struct buffer buf
; /* Input buffer. */
2756 struct line temp
; /* Copy of previous line. */
2758 uintmax_t line_number
= 0;
2759 struct keyfield
const *key
= keylist
;
2760 bool nonunique
= ! unique
;
2761 bool ordered
= true;
2763 initbuf (&buf
, sizeof (struct line
),
2764 MAX (merge_buffer_size
, sort_size
));
2767 while (fillbuf (&buf
, fp
, file_name
))
2769 struct line
const *line
= buffer_linelim (&buf
);
2770 struct line
const *linebase
= line
- buf
.nlines
;
2772 /* Make sure the line saved from the old buffer contents is
2773 less than or equal to the first line of the new buffer. */
2774 if (alloc
&& nonunique
<= compare (&temp
, line
- 1))
2778 if (checkonly
== 'c')
2780 struct line
const *disorder_line
= line
- 1;
2781 uintmax_t disorder_line_number
=
2782 buffer_linelim (&buf
) - disorder_line
+ line_number
;
2783 char hr_buf
[INT_BUFSIZE_BOUND (disorder_line_number
)];
2784 fprintf (stderr
, _("%s: %s:%s: disorder: "),
2785 program_name
, file_name
,
2786 umaxtostr (disorder_line_number
, hr_buf
));
2787 write_line (disorder_line
, stderr
, _("standard error"));
2795 /* Compare each line in the buffer with its successor. */
2796 while (linebase
< --line
)
2797 if (nonunique
<= compare (line
, line
- 1))
2798 goto found_disorder
;
2800 line_number
+= buf
.nlines
;
2802 /* Save the last line of the buffer. */
2803 if (alloc
< line
->length
)
2810 alloc
= line
->length
;
2814 while (alloc
< line
->length
);
2817 temp
.text
= xmalloc (alloc
);
2819 memcpy (temp
.text
, line
->text
, line
->length
);
2820 temp
.length
= line
->length
;
2823 temp
.keybeg
= temp
.text
+ (line
->keybeg
- line
->text
);
2824 temp
.keylim
= temp
.text
+ (line
->keylim
- line
->text
);
2828 xfclose (fp
, file_name
);
2834 /* Open FILES (there are NFILES of them) and store the resulting array
2835 of stream pointers into (*PFPS). Allocate the array. Return the
2836 number of successfully opened files, setting errno if this value is
2837 less than NFILES. */
2840 open_input_files (struct sortfile
*files
, size_t nfiles
, FILE ***pfps
)
2842 FILE **fps
= *pfps
= xnmalloc (nfiles
, sizeof *fps
);
2845 /* Open as many input files as we can. */
2846 for (i
= 0; i
< nfiles
; i
++)
2848 fps
[i
] = (files
[i
].temp
&& files
[i
].temp
->state
!= UNCOMPRESSED
2849 ? open_temp (files
[i
].temp
)
2850 : stream_open (files
[i
].name
, "r"));
2858 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2859 files (all of which are at the start of the FILES array), and
2860 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2861 FPS is the vector of open stream corresponding to the files.
2862 Close input and output streams before returning.
2863 OUTPUT_FILE gives the name of the output file. If it is NULL,
2864 the output file is standard output. */
2867 mergefps (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
2868 FILE *ofp
, char const *output_file
, FILE **fps
)
2870 struct buffer
*buffer
= xnmalloc (nfiles
, sizeof *buffer
);
2871 /* Input buffers for each file. */
2872 struct line saved
; /* Saved line storage for unique check. */
2873 struct line
const *savedline
= NULL
;
2874 /* &saved if there is a saved line. */
2875 size_t savealloc
= 0; /* Size allocated for the saved line. */
2876 struct line
const **cur
= xnmalloc (nfiles
, sizeof *cur
);
2877 /* Current line in each line table. */
2878 struct line
const **base
= xnmalloc (nfiles
, sizeof *base
);
2879 /* Base of each line table. */
2880 size_t *ord
= xnmalloc (nfiles
, sizeof *ord
);
2881 /* Table representing a permutation of fps,
2882 such that cur[ord[0]] is the smallest line
2883 and will be next output. */
2887 struct keyfield
const *key
= keylist
;
2890 /* Read initial lines from each input file. */
2891 for (i
= 0; i
< nfiles
; )
2893 initbuf (&buffer
[i
], sizeof (struct line
),
2894 MAX (merge_buffer_size
, sort_size
/ nfiles
));
2895 if (fillbuf (&buffer
[i
], fps
[i
], files
[i
].name
))
2897 struct line
const *linelim
= buffer_linelim (&buffer
[i
]);
2898 cur
[i
] = linelim
- 1;
2899 base
[i
] = linelim
- buffer
[i
].nlines
;
2904 /* fps[i] is empty; eliminate it from future consideration. */
2905 xfclose (fps
[i
], files
[i
].name
);
2909 zaptemp (files
[i
].name
);
2911 free (buffer
[i
].buf
);
2913 for (j
= i
; j
< nfiles
; ++j
)
2915 files
[j
] = files
[j
+ 1];
2916 fps
[j
] = fps
[j
+ 1];
2921 /* Set up the ord table according to comparisons among input lines.
2922 Since this only reorders two items if one is strictly greater than
2923 the other, it is stable. */
2924 for (i
= 0; i
< nfiles
; ++i
)
2926 for (i
= 1; i
< nfiles
; ++i
)
2927 if (0 < compare (cur
[ord
[i
- 1]], cur
[ord
[i
]]))
2928 t
= ord
[i
- 1], ord
[i
- 1] = ord
[i
], ord
[i
] = t
, i
= 0;
2930 /* Repeatedly output the smallest line until no input remains. */
2933 struct line
const *smallest
= cur
[ord
[0]];
2935 /* If uniquified output is turned on, output only the first of
2936 an identical series of lines. */
2939 if (savedline
&& compare (savedline
, smallest
))
2942 write_line (&saved
, ofp
, output_file
);
2947 if (savealloc
< smallest
->length
)
2952 savealloc
= smallest
->length
;
2955 while ((savealloc
*= 2) < smallest
->length
);
2958 saved
.text
= xmalloc (savealloc
);
2960 saved
.length
= smallest
->length
;
2961 memcpy (saved
.text
, smallest
->text
, saved
.length
);
2965 saved
.text
+ (smallest
->keybeg
- smallest
->text
);
2967 saved
.text
+ (smallest
->keylim
- smallest
->text
);
2972 write_line (smallest
, ofp
, output_file
);
2974 /* Check if we need to read more lines into core. */
2975 if (base
[ord
[0]] < smallest
)
2976 cur
[ord
[0]] = smallest
- 1;
2979 if (fillbuf (&buffer
[ord
[0]], fps
[ord
[0]], files
[ord
[0]].name
))
2981 struct line
const *linelim
= buffer_linelim (&buffer
[ord
[0]]);
2982 cur
[ord
[0]] = linelim
- 1;
2983 base
[ord
[0]] = linelim
- buffer
[ord
[0]].nlines
;
2987 /* We reached EOF on fps[ord[0]]. */
2988 for (i
= 1; i
< nfiles
; ++i
)
2989 if (ord
[i
] > ord
[0])
2992 xfclose (fps
[ord
[0]], files
[ord
[0]].name
);
2993 if (ord
[0] < ntemps
)
2996 zaptemp (files
[ord
[0]].name
);
2998 free (buffer
[ord
[0]].buf
);
2999 for (i
= ord
[0]; i
< nfiles
; ++i
)
3001 fps
[i
] = fps
[i
+ 1];
3002 files
[i
] = files
[i
+ 1];
3003 buffer
[i
] = buffer
[i
+ 1];
3004 cur
[i
] = cur
[i
+ 1];
3005 base
[i
] = base
[i
+ 1];
3007 for (i
= 0; i
< nfiles
; ++i
)
3008 ord
[i
] = ord
[i
+ 1];
3013 /* The new line just read in may be larger than other lines
3014 already in main memory; push it back in the queue until we
3015 encounter a line larger than it. Optimize for the common
3016 case where the new line is smallest. */
3021 size_t ord0
= ord
[0];
3022 size_t count_of_smaller_lines
;
3026 int cmp
= compare (cur
[ord0
], cur
[ord
[probe
]]);
3027 if (cmp
< 0 || (cmp
== 0 && ord0
< ord
[probe
]))
3031 probe
= (lo
+ hi
) / 2;
3034 count_of_smaller_lines
= lo
- 1;
3035 for (j
= 0; j
< count_of_smaller_lines
; j
++)
3036 ord
[j
] = ord
[j
+ 1];
3037 ord
[count_of_smaller_lines
] = ord0
;
3041 if (unique
&& savedline
)
3043 write_line (&saved
, ofp
, output_file
);
3047 xfclose (ofp
, output_file
);
3055 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3056 files (all of which are at the start of the FILES array), and
3057 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3058 Close input and output files before returning.
3059 OUTPUT_FILE gives the name of the output file.
3061 Return the number of files successfully merged. This number can be
3062 less than NFILES if we ran low on file descriptors, but in this
3063 case it is never less than 2. */
3066 mergefiles (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3067 FILE *ofp
, char const *output_file
)
3070 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
3071 if (nopened
< nfiles
&& nopened
< 2)
3072 die (_("open failed"), files
[nopened
].name
);
3073 mergefps (files
, ntemps
, nopened
, ofp
, output_file
, fps
);
3077 /* Merge into T (of size NLINES) the two sorted arrays of lines
3078 LO (with NLINES / 2 members), and
3079 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3080 T and LO point just past their respective arrays, and the arrays
3081 are in reverse order. NLINES must be at least 2. */
3084 mergelines (struct line
*restrict t
, size_t nlines
,
3085 struct line
const *restrict lo
)
3087 size_t nlo
= nlines
/ 2;
3088 size_t nhi
= nlines
- nlo
;
3089 struct line
*hi
= t
- nlo
;
3092 if (compare (lo
- 1, hi
- 1) <= 0)
3097 /* HI must equal T now, and there is no need to copy from
3116 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3117 Do this all within one thread. NLINES must be at least 2.
3118 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3119 Otherwise the sort is in-place and TEMP is half-sized.
3120 The input and output arrays are in reverse order, and LINES and
3121 TEMP point just past the end of their respective arrays.
3123 Use a recursive divide-and-conquer algorithm, in the style
3124 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3125 the optimization suggested by exercise 5.2.4-10; this requires room
3126 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3127 writes that this memory optimization was originally published by
3128 D. A. Bell, Comp J. 1 (1958), 75. */
3131 sequential_sort (struct line
*restrict lines
, size_t nlines
,
3132 struct line
*restrict temp
, bool to_temp
)
3136 /* Declare 'swap' as int, not bool, to work around a bug
3137 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3138 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3139 int swap
= (0 < compare (&lines
[-1], &lines
[-2]));
3142 temp
[-1] = lines
[-1 - swap
];
3143 temp
[-2] = lines
[-2 + swap
];
3147 temp
[-1] = lines
[-1];
3148 lines
[-1] = lines
[-2];
3149 lines
[-2] = temp
[-1];
3154 size_t nlo
= nlines
/ 2;
3155 size_t nhi
= nlines
- nlo
;
3156 struct line
*lo
= lines
;
3157 struct line
*hi
= lines
- nlo
;
3159 sequential_sort (hi
, nhi
, temp
- (to_temp
? nlo
: 0), to_temp
);
3161 sequential_sort (lo
, nlo
, temp
, !to_temp
);
3166 struct line
const *sorted_lo
;
3177 mergelines (dest
, nlines
, sorted_lo
);
3181 static struct merge_node
*init_node (struct merge_node
*restrict
,
3182 struct merge_node
*restrict
,
3183 struct line
*, size_t, size_t, bool);
3186 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3187 lines, with destination DEST. */
3188 static struct merge_node
*
3189 merge_tree_init (size_t nthreads
, size_t nlines
, struct line
*dest
)
3191 struct merge_node
*merge_tree
= xmalloc (2 * sizeof *merge_tree
* nthreads
);
3193 struct merge_node
*root
= merge_tree
;
3194 root
->lo
= root
->hi
= root
->end_lo
= root
->end_hi
= NULL
;
3196 root
->nlo
= root
->nhi
= nlines
;
3197 root
->parent
= NULL
;
3198 root
->level
= MERGE_END
;
3199 root
->queued
= false;
3200 pthread_mutex_init (&root
->lock
, NULL
);
3202 init_node (root
, root
+ 1, dest
, nthreads
, nlines
, false);
3206 /* Destroy the merge tree. */
3208 merge_tree_destroy (struct merge_node
*merge_tree
)
3213 /* Initialize a merge tree node and its descendants. The node's
3214 parent is PARENT. The node and its descendants are taken from the
3215 array of nodes NODE_POOL. Their destination starts at DEST; they
3216 will consume NTHREADS threads. The total number of sort lines is
3217 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3220 static struct merge_node
*
3221 init_node (struct merge_node
*restrict parent
,
3222 struct merge_node
*restrict node_pool
,
3223 struct line
*dest
, size_t nthreads
,
3224 size_t total_lines
, bool is_lo_child
)
3226 size_t nlines
= (is_lo_child
? parent
->nlo
: parent
->nhi
);
3227 size_t nlo
= nlines
/ 2;
3228 size_t nhi
= nlines
- nlo
;
3229 struct line
*lo
= dest
- total_lines
;
3230 struct line
*hi
= lo
- nlo
;
3231 struct line
**parent_end
= (is_lo_child
? &parent
->end_lo
: &parent
->end_hi
);
3233 struct merge_node
*node
= node_pool
++;
3234 node
->lo
= node
->end_lo
= lo
;
3235 node
->hi
= node
->end_hi
= hi
;
3236 node
->dest
= parent_end
;
3239 node
->parent
= parent
;
3240 node
->level
= parent
->level
+ 1;
3241 node
->queued
= false;
3242 pthread_mutex_init (&node
->lock
, NULL
);
3246 size_t lo_threads
= nthreads
/ 2;
3247 size_t hi_threads
= nthreads
- lo_threads
;
3248 node
->lo_child
= node_pool
;
3249 node_pool
= init_node (node
, node_pool
, lo
, lo_threads
,
3251 node
->hi_child
= node_pool
;
3252 node_pool
= init_node (node
, node_pool
, hi
, hi_threads
,
3253 total_lines
, false);
3257 node
->lo_child
= NULL
;
3258 node
->hi_child
= NULL
;
3264 /* Compare two merge nodes A and B for priority. */
3267 compare_nodes (void const *a
, void const *b
)
3269 struct merge_node
const *nodea
= a
;
3270 struct merge_node
const *nodeb
= b
;
3271 if (nodea
->level
== nodeb
->level
)
3272 return (nodea
->nlo
+ nodea
->nhi
) < (nodeb
->nlo
+ nodeb
->nhi
);
3273 return nodea
->level
< nodeb
->level
;
3276 /* Lock a merge tree NODE. */
3279 lock_node (struct merge_node
*node
)
3281 pthread_mutex_lock (&node
->lock
);
3284 /* Unlock a merge tree NODE. */
3287 unlock_node (struct merge_node
*node
)
3289 pthread_mutex_unlock (&node
->lock
);
3292 /* Destroy merge QUEUE. */
3295 queue_destroy (struct merge_node_queue
*queue
)
3297 heap_free (queue
->priority_queue
);
3298 pthread_cond_destroy (&queue
->cond
);
3299 pthread_mutex_destroy (&queue
->mutex
);
3302 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3303 NTHREADS threads. */
3306 queue_init (struct merge_node_queue
*queue
, size_t nthreads
)
3308 /* Though it's highly unlikely all nodes are in the heap at the same
3309 time, the heap should accommodate all of them. Counting a NULL
3310 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3311 queue
->priority_queue
= heap_alloc (compare_nodes
, 2 * nthreads
);
3312 pthread_mutex_init (&queue
->mutex
, NULL
);
3313 pthread_cond_init (&queue
->cond
, NULL
);
3316 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3317 does not need to lock NODE. */
3320 queue_insert (struct merge_node_queue
*queue
, struct merge_node
*node
)
3322 pthread_mutex_lock (&queue
->mutex
);
3323 heap_insert (queue
->priority_queue
, node
);
3324 node
->queued
= true;
3325 pthread_mutex_unlock (&queue
->mutex
);
3326 pthread_cond_signal (&queue
->cond
);
3329 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3331 static struct merge_node
*
3332 queue_pop (struct merge_node_queue
*queue
)
3334 struct merge_node
*node
;
3335 pthread_mutex_lock (&queue
->mutex
);
3336 while (! (node
= heap_remove_top (queue
->priority_queue
)))
3337 pthread_cond_wait (&queue
->cond
, &queue
->mutex
);
3338 pthread_mutex_unlock (&queue
->mutex
);
3340 node
->queued
= false;
3344 /* Output LINE to TFP, unless -u is specified and the line compares
3345 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3346 is null if TFP is standard output.
3348 This function does not save the line for comparison later, so it is
3349 appropriate only for internal sort. */
3352 write_unique (struct line
const *line
, FILE *tfp
, char const *temp_output
)
3356 if (saved_line
.text
&& ! compare (line
, &saved_line
))
3361 write_line (line
, tfp
, temp_output
);
3364 /* Merge the lines currently available to a NODE in the binary
3365 merge tree. Merge a number of lines appropriate for this merge
3366 level, assuming TOTAL_LINES is the total number of lines.
3368 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3369 the name of TFP, or is null if TFP is standard output. */
3372 mergelines_node (struct merge_node
*restrict node
, size_t total_lines
,
3373 FILE *tfp
, char const *temp_output
)
3375 struct line
*lo_orig
= node
->lo
;
3376 struct line
*hi_orig
= node
->hi
;
3377 size_t to_merge
= MAX_MERGE (total_lines
, node
->level
);
3381 if (node
->level
> MERGE_ROOT
)
3383 /* Merge to destination buffer. */
3384 struct line
*dest
= *node
->dest
;
3385 while (node
->lo
!= node
->end_lo
&& node
->hi
!= node
->end_hi
&& to_merge
--)
3386 if (compare (node
->lo
- 1, node
->hi
- 1) <= 0)
3387 *--dest
= *--node
->lo
;
3389 *--dest
= *--node
->hi
;
3391 merged_lo
= lo_orig
- node
->lo
;
3392 merged_hi
= hi_orig
- node
->hi
;
3394 if (node
->nhi
== merged_hi
)
3395 while (node
->lo
!= node
->end_lo
&& to_merge
--)
3396 *--dest
= *--node
->lo
;
3397 else if (node
->nlo
== merged_lo
)
3398 while (node
->hi
!= node
->end_hi
&& to_merge
--)
3399 *--dest
= *--node
->hi
;
3404 /* Merge directly to output. */
3405 while (node
->lo
!= node
->end_lo
&& node
->hi
!= node
->end_hi
&& to_merge
--)
3407 if (compare (node
->lo
- 1, node
->hi
- 1) <= 0)
3408 write_unique (--node
->lo
, tfp
, temp_output
);
3410 write_unique (--node
->hi
, tfp
, temp_output
);
3413 merged_lo
= lo_orig
- node
->lo
;
3414 merged_hi
= hi_orig
- node
->hi
;
3416 if (node
->nhi
== merged_hi
)
3418 while (node
->lo
!= node
->end_lo
&& to_merge
--)
3419 write_unique (--node
->lo
, tfp
, temp_output
);
3421 else if (node
->nlo
== merged_lo
)
3423 while (node
->hi
!= node
->end_hi
&& to_merge
--)
3424 write_unique (--node
->hi
, tfp
, temp_output
);
3429 merged_lo
= lo_orig
- node
->lo
;
3430 merged_hi
= hi_orig
- node
->hi
;
3431 node
->nlo
-= merged_lo
;
3432 node
->nhi
-= merged_hi
;
3435 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3436 NODE's children has available lines and the other either has
3437 available lines or has exhausted its lines. */
3440 queue_check_insert (struct merge_node_queue
*queue
, struct merge_node
*node
)
3444 bool lo_avail
= (node
->lo
- node
->end_lo
) != 0;
3445 bool hi_avail
= (node
->hi
- node
->end_hi
) != 0;
3446 if (lo_avail
? hi_avail
|| ! node
->nhi
: hi_avail
&& ! node
->nlo
)
3447 queue_insert (queue
, node
);
3451 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3454 queue_check_insert_parent (struct merge_node_queue
*queue
,
3455 struct merge_node
*node
)
3457 if (node
->level
> MERGE_ROOT
)
3459 lock_node (node
->parent
);
3460 queue_check_insert (queue
, node
->parent
);
3461 unlock_node (node
->parent
);
3463 else if (node
->nlo
+ node
->nhi
== 0)
3465 /* If the MERGE_ROOT NODE has finished merging, insert the
3467 queue_insert (queue
, node
->parent
);
3471 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3472 some of those lines, until the MERGE_END node is popped.
3473 TOTAL_LINES is the total number of lines. If merging at the top
3474 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3475 null if TFP is standard output. */
3478 merge_loop (struct merge_node_queue
*queue
,
3479 size_t total_lines
, FILE *tfp
, char const *temp_output
)
3483 struct merge_node
*node
= queue_pop (queue
);
3485 if (node
->level
== MERGE_END
)
3488 /* Reinsert so other threads can pop it. */
3489 queue_insert (queue
, node
);
3492 mergelines_node (node
, total_lines
, tfp
, temp_output
);
3493 queue_check_insert (queue
, node
);
3494 queue_check_insert_parent (queue
, node
);
3501 static void sortlines (struct line
*restrict
, size_t, size_t,
3502 struct merge_node
*, struct merge_node_queue
*,
3503 FILE *, char const *);
3505 /* Thread arguments for sortlines_thread. */
3509 /* Source, i.e., the array of lines to sort. This points just past
3510 the end of the array. */
3513 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3516 /* Number of lines in LINES and DEST. */
3517 size_t const total_lines
;
3519 /* Merge node. Lines from this node and this node's sibling will merged
3520 to this node's parent. */
3521 struct merge_node
*const node
;
3523 /* The priority queue controlling available work for the entire
3525 struct merge_node_queue
*const queue
;
3527 /* If at the top level, the file to output to, and the file's name.
3528 If the file is standard output, the file's name is null. */
3530 char const *output_temp
;
3533 /* Like sortlines, except with a signature acceptable to pthread_create. */
3536 sortlines_thread (void *data
)
3538 struct thread_args
const *args
= data
;
3539 sortlines (args
->lines
, args
->nthreads
, args
->total_lines
,
3540 args
->node
, args
->queue
, args
->tfp
,
3545 /* Sort lines, possibly in parallel. The arguments are as in struct
3548 The algorithm has three phases: node creation, sequential sort,
3551 During node creation, sortlines recursively visits each node in the
3552 binary merge tree and creates a NODE structure corresponding to all the
3553 future line merging NODE is responsible for. For each call to
3554 sortlines, half the available threads are assigned to each recursive
3555 call, until a leaf node having only 1 available thread is reached.
3557 Each leaf node then performs two sequential sorts, one on each half of
3558 the lines it is responsible for. It records in its NODE structure that
3559 there are two sorted sublists available to merge from, and inserts its
3560 NODE into the priority queue.
3562 The binary merge phase then begins. Each thread drops into a loop
3563 where the thread retrieves a NODE from the priority queue, merges lines
3564 available to that NODE, and potentially insert NODE or its parent back
3565 into the queue if there are sufficient available lines for them to
3566 merge. This continues until all lines at all nodes of the merge tree
3567 have been merged. */
3570 sortlines (struct line
*restrict lines
, size_t nthreads
,
3571 size_t total_lines
, struct merge_node
*node
,
3572 struct merge_node_queue
*queue
, FILE *tfp
, char const *temp_output
)
3574 size_t nlines
= node
->nlo
+ node
->nhi
;
3576 /* Calculate thread arguments. */
3577 size_t lo_threads
= nthreads
/ 2;
3578 size_t hi_threads
= nthreads
- lo_threads
;
3580 struct thread_args args
= {lines
, lo_threads
, total_lines
,
3581 node
->lo_child
, queue
, tfp
, temp_output
};
3583 if (nthreads
> 1 && SUBTHREAD_LINES_HEURISTIC
<= nlines
3584 && pthread_create (&thread
, NULL
, sortlines_thread
, &args
) == 0)
3586 sortlines (lines
- node
->nlo
, hi_threads
, total_lines
,
3587 node
->hi_child
, queue
, tfp
, temp_output
);
3588 pthread_join (thread
, NULL
);
3592 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3593 Sort with 1 thread. */
3594 size_t nlo
= node
->nlo
;
3595 size_t nhi
= node
->nhi
;
3596 struct line
*temp
= lines
- total_lines
;
3598 sequential_sort (lines
- nlo
, nhi
, temp
- nlo
/ 2, false);
3600 sequential_sort (lines
, nlo
, temp
, false);
3602 /* Update merge NODE. No need to lock yet. */
3604 node
->hi
= lines
- nlo
;
3605 node
->end_lo
= lines
- nlo
;
3606 node
->end_hi
= lines
- nlo
- nhi
;
3608 queue_insert (queue
, node
);
3609 merge_loop (queue
, total_lines
, tfp
, temp_output
);
3612 pthread_mutex_destroy (&node
->lock
);
3615 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3616 the same as OUTFILE. If found, replace each with the same
3617 temporary copy that can be merged into OUTFILE without destroying
3618 OUTFILE before it is completely read. This temporary copy does not
3619 count as a merge temp, so don't worry about incrementing NTEMPS in
3620 the caller; final cleanup will remove it, not zaptemp.
3622 This test ensures that an otherwise-erroneous use like
3623 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3624 It's not clear that POSIX requires this nicety.
3625 Detect common error cases, but don't try to catch obscure cases like
3626 "cat ... FILE ... | sort -m -o FILE"
3627 where traditional "sort" doesn't copy the input and where
3628 people should know that they're getting into trouble anyway.
3629 Catching these obscure cases would slow down performance in
3633 avoid_trashing_input (struct sortfile
*files
, size_t ntemps
,
3634 size_t nfiles
, char const *outfile
)
3637 bool got_outstat
= false;
3638 struct stat outstat
;
3639 struct tempnode
*tempcopy
= NULL
;
3641 for (i
= ntemps
; i
< nfiles
; i
++)
3643 bool is_stdin
= STREQ (files
[i
].name
, "-");
3647 if (outfile
&& STREQ (outfile
, files
[i
].name
) && !is_stdin
)
3653 if (fstat (STDOUT_FILENO
, &outstat
) != 0)
3659 ? fstat (STDIN_FILENO
, &instat
)
3660 : stat (files
[i
].name
, &instat
))
3662 && SAME_INODE (instat
, outstat
));
3670 tempcopy
= create_temp (&tftp
);
3671 mergefiles (&files
[i
], 0, 1, tftp
, tempcopy
->name
);
3674 files
[i
].name
= tempcopy
->name
;
3675 files
[i
].temp
= tempcopy
;
3680 /* Scan the input files to ensure all are accessible.
3681 Otherwise exit with a diagnostic.
3683 Note this will catch common issues with permissions etc.
3684 but will fail to notice issues where you can open() but not read(),
3685 like when a directory is specified on some systems.
3686 Catching these obscure cases could slow down performance in
3690 check_inputs (char *const *files
, size_t nfiles
)
3693 for (i
= 0; i
< nfiles
; i
++)
3695 if (STREQ (files
[i
], "-"))
3698 if (euidaccess (files
[i
], R_OK
) != 0)
3699 die (_("cannot read"), files
[i
]);
3703 /* Ensure a specified output file can be created or written to,
3704 and point stdout to it. Do not truncate the file.
3705 Exit with a diagnostic on failure. */
3708 check_output (char const *outfile
)
3712 int outfd
= open (outfile
, O_WRONLY
| O_CREAT
| O_BINARY
, MODE_RW_UGO
);
3714 die (_("open failed"), outfile
);
3715 move_fd_or_die (outfd
, STDOUT_FILENO
);
3719 /* Merge the input FILES. NTEMPS is the number of files at the
3720 start of FILES that are temporary; it is zero at the top level.
3721 NFILES is the total number of files. Put the output in
3722 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3725 merge (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3726 char const *output_file
)
3728 while (nmerge
< nfiles
)
3730 /* Number of input files processed so far. */
3733 /* Number of output files generated so far. */
3736 /* nfiles % NMERGE; this counts input files that are left over
3737 after all full-sized merges have been done. */
3740 /* Number of easily-available slots at the next loop iteration. */
3743 /* Do as many NMERGE-size merges as possible. In the case that
3744 nmerge is bogus, increment by the maximum number of file
3745 descriptors allowed. */
3746 for (out
= in
= 0; nmerge
<= nfiles
- in
; out
++)
3749 struct tempnode
*temp
= create_temp (&tfp
);
3750 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nmerge
),
3751 nmerge
, tfp
, temp
->name
);
3752 ntemps
-= MIN (ntemps
, num_merged
);
3753 files
[out
].name
= temp
->name
;
3754 files
[out
].temp
= temp
;
3758 remainder
= nfiles
- in
;
3759 cheap_slots
= nmerge
- out
% nmerge
;
3761 if (cheap_slots
< remainder
)
3763 /* So many files remain that they can't all be put into the last
3764 NMERGE-sized output window. Do one more merge. Merge as few
3765 files as possible, to avoid needless I/O. */
3766 size_t nshortmerge
= remainder
- cheap_slots
+ 1;
3768 struct tempnode
*temp
= create_temp (&tfp
);
3769 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nshortmerge
),
3770 nshortmerge
, tfp
, temp
->name
);
3771 ntemps
-= MIN (ntemps
, num_merged
);
3772 files
[out
].name
= temp
->name
;
3773 files
[out
++].temp
= temp
;
3777 /* Put the remaining input files into the last NMERGE-sized output
3778 window, so they will be merged in the next pass. */
3779 memmove (&files
[out
], &files
[in
], (nfiles
- in
) * sizeof *files
);
3784 avoid_trashing_input (files
, ntemps
, nfiles
, output_file
);
3786 /* We aren't guaranteed that this final mergefiles will work, therefore we
3787 try to merge into the output, and then merge as much as we can into a
3788 temp file if we can't. Repeat. */
3792 /* Merge directly into the output file if possible. */
3794 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
3796 if (nopened
== nfiles
)
3798 FILE *ofp
= stream_open (output_file
, "w");
3801 mergefps (files
, ntemps
, nfiles
, ofp
, output_file
, fps
);
3804 if (errno
!= EMFILE
|| nopened
<= 2)
3805 die (_("open failed"), output_file
);
3807 else if (nopened
<= 2)
3808 die (_("open failed"), files
[nopened
].name
);
3810 /* We ran out of file descriptors. Close one of the input
3811 files, to gain a file descriptor. Then create a temporary
3812 file with our spare file descriptor. Retry if that failed
3813 (e.g., some other process could open a file between the time
3814 we closed and tried to create). */
3816 struct tempnode
*temp
;
3820 xfclose (fps
[nopened
], files
[nopened
].name
);
3821 temp
= maybe_create_temp (&tfp
, ! (nopened
<= 2));
3825 /* Merge into the newly allocated temporary. */
3826 mergefps (&files
[0], MIN (ntemps
, nopened
), nopened
, tfp
, temp
->name
,
3828 ntemps
-= MIN (ntemps
, nopened
);
3829 files
[0].name
= temp
->name
;
3830 files
[0].temp
= temp
;
3832 memmove (&files
[1], &files
[nopened
], (nfiles
- nopened
) * sizeof *files
);
3834 nfiles
-= nopened
- 1;
3838 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3841 sort (char *const *files
, size_t nfiles
, char const *output_file
,
3845 IF_LINT (buf
.buf
= NULL
);
3847 bool output_file_created
= false;
3853 char const *temp_output
;
3854 char const *file
= *files
;
3855 FILE *fp
= xfopen (file
, "r");
3858 size_t bytes_per_line
;
3864 while (tmp
< nthreads
)
3869 bytes_per_line
= (mult
* sizeof (struct line
));
3872 bytes_per_line
= sizeof (struct line
) * 3 / 2;
3875 initbuf (&buf
, bytes_per_line
,
3876 sort_buffer_size (&fp
, 1, files
, nfiles
, bytes_per_line
));
3881 while (fillbuf (&buf
, fp
, file
))
3885 if (buf
.eof
&& nfiles
3886 && (bytes_per_line
+ 1
3887 < (buf
.alloc
- buf
.used
- bytes_per_line
* buf
.nlines
)))
3889 /* End of file, but there is more input and buffer room.
3890 Concatenate the next input file; this is faster in
3892 buf
.left
= buf
.used
;
3896 saved_line
.text
= NULL
;
3897 line
= buffer_linelim (&buf
);
3898 if (buf
.eof
&& !nfiles
&& !ntemps
&& !buf
.left
)
3901 tfp
= xfopen (output_file
, "w");
3902 temp_output
= output_file
;
3903 output_file_created
= true;
3908 temp_output
= create_temp (&tfp
)->name
;
3912 struct merge_node_queue queue
;
3913 queue_init (&queue
, nthreads
);
3914 struct merge_node
*merge_tree
=
3915 merge_tree_init (nthreads
, buf
.nlines
, line
);
3916 struct merge_node
*root
= merge_tree
+ 1;
3918 sortlines (line
, nthreads
, buf
.nlines
, root
,
3919 &queue
, tfp
, temp_output
);
3920 queue_destroy (&queue
);
3921 pthread_mutex_destroy (&root
->lock
);
3922 merge_tree_destroy (merge_tree
);
3925 write_unique (line
- 1, tfp
, temp_output
);
3927 xfclose (tfp
, temp_output
);
3929 if (output_file_created
)
3938 if (! output_file_created
)
3941 struct tempnode
*node
= temphead
;
3942 struct sortfile
*tempfiles
= xnmalloc (ntemps
, sizeof *tempfiles
);
3943 for (i
= 0; node
; i
++)
3945 tempfiles
[i
].name
= node
->name
;
3946 tempfiles
[i
].temp
= node
;
3949 merge (tempfiles
, ntemps
, ntemps
, output_file
);
3956 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3959 insertkey (struct keyfield
*key_arg
)
3961 struct keyfield
**p
;
3962 struct keyfield
*key
= xmemdup (key_arg
, sizeof *key
);
3964 for (p
= &keylist
; *p
; p
= &(*p
)->next
)
3970 /* Report a bad field specification SPEC, with extra info MSGID. */
3972 static void badfieldspec (char const *, char const *)
3975 badfieldspec (char const *spec
, char const *msgid
)
3977 error (SORT_FAILURE
, 0, _("%s: invalid field specification %s"),
3978 _(msgid
), quote (spec
));
3982 /* Report incompatible options. */
3984 static void incompatible_options (char const *) ATTRIBUTE_NORETURN
;
3986 incompatible_options (char const *opts
)
3988 error (SORT_FAILURE
, 0, _("options '-%s' are incompatible"), opts
);
3992 /* Check compatibility of ordering options. */
3995 check_ordering_compatibility (void)
3997 struct keyfield
*key
;
3999 for (key
= keylist
; key
; key
= key
->next
)
4000 if (1 < (key
->numeric
+ key
->general_numeric
+ key
->human_numeric
4001 + key
->month
+ (key
->version
| key
->random
| !!key
->ignore
)))
4003 /* The following is too big, but guaranteed to be "big enough". */
4004 char opts
[sizeof short_options
];
4005 /* Clear flags we're not interested in. */
4006 key
->skipsblanks
= key
->skipeblanks
= key
->reverse
= false;
4007 key_to_opts (key
, opts
);
4008 incompatible_options (opts
);
4012 /* Parse the leading integer in STRING and store the resulting value
4013 (which must fit into size_t) into *VAL. Return the address of the
4014 suffix after the integer. If the value is too large, silently
4015 substitute SIZE_MAX. If MSGID is NULL, return NULL after
4016 failure; otherwise, report MSGID and exit on failure. */
4019 parse_field_count (char const *string
, size_t *val
, char const *msgid
)
4024 switch (xstrtoumax (string
, &suffix
, 10, &n
, ""))
4027 case LONGINT_INVALID_SUFFIX_CHAR
:
4032 case LONGINT_OVERFLOW
:
4033 case LONGINT_OVERFLOW
| LONGINT_INVALID_SUFFIX_CHAR
:
4037 case LONGINT_INVALID
:
4039 error (SORT_FAILURE
, 0, _("%s: invalid count at start of %s"),
4040 _(msgid
), quote (string
));
4047 /* Handle interrupts and hangups. */
4050 sighandler (int sig
)
4053 signal (sig
, SIG_IGN
);
4057 signal (sig
, SIG_DFL
);
4061 /* Set the ordering options for KEY specified in S.
4062 Return the address of the first character in S that
4063 is not a valid ordering option.
4064 BLANKTYPE is the kind of blanks that 'b' should skip. */
4067 set_ordering (char const *s
, struct keyfield
*key
, enum blanktype blanktype
)
4074 if (blanktype
== bl_start
|| blanktype
== bl_both
)
4075 key
->skipsblanks
= true;
4076 if (blanktype
== bl_end
|| blanktype
== bl_both
)
4077 key
->skipeblanks
= true;
4080 key
->ignore
= nondictionary
;
4083 key
->translate
= fold_toupper
;
4086 key
->general_numeric
= true;
4089 key
->human_numeric
= true;
4092 /* Option order should not matter, so don't let -i override
4093 -d. -d implies -i, but -i does not imply -d. */
4095 key
->ignore
= nonprinting
;
4101 key
->numeric
= true;
4107 key
->reverse
= true;
4110 key
->version
= true;
4120 /* Initialize KEY. */
4122 static struct keyfield
*
4123 key_init (struct keyfield
*key
)
4125 memset (key
, 0, sizeof *key
);
4126 key
->eword
= SIZE_MAX
;
4131 main (int argc
, char **argv
)
4133 struct keyfield
*key
;
4134 struct keyfield key_buf
;
4135 struct keyfield gkey
;
4136 bool gkey_only
= false;
4140 bool mergeonly
= false;
4141 char *random_source
= NULL
;
4142 bool need_random
= false;
4143 size_t nthreads
= 0;
4145 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != NULL
);
4146 bool obsolete_usage
= (posix2_version () < 200112);
4148 char *files_from
= NULL
;
4150 char const *outfile
= NULL
;
4152 initialize_main (&argc
, &argv
);
4153 set_program_name (argv
[0]);
4154 setlocale (LC_ALL
, "");
4155 bindtextdomain (PACKAGE
, LOCALEDIR
);
4156 textdomain (PACKAGE
);
4158 initialize_exit_failure (SORT_FAILURE
);
4160 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
4161 #if HAVE_NL_LANGINFO
4162 hard_LC_TIME
= hard_locale (LC_TIME
);
4165 /* Get locale's representation of the decimal point. */
4167 struct lconv
const *locale
= localeconv ();
4169 /* If the locale doesn't define a decimal point, or if the decimal
4170 point is multibyte, use the C locale's decimal point. FIXME:
4171 add support for multibyte decimal points. */
4172 decimal_point
= to_uchar (locale
->decimal_point
[0]);
4173 if (! decimal_point
|| locale
->decimal_point
[1])
4174 decimal_point
= '.';
4176 /* FIXME: add support for multibyte thousands separators. */
4177 thousands_sep
= to_uchar (*locale
->thousands_sep
);
4178 if (! thousands_sep
|| locale
->thousands_sep
[1])
4182 have_read_stdin
= false;
4187 static int const sig
[] =
4189 /* The usual suspects. */
4190 SIGALRM
, SIGHUP
, SIGINT
, SIGPIPE
, SIGQUIT
, SIGTERM
,
4207 enum { nsigs
= ARRAY_CARDINALITY (sig
) };
4210 struct sigaction act
;
4212 sigemptyset (&caught_signals
);
4213 for (i
= 0; i
< nsigs
; i
++)
4215 sigaction (sig
[i
], NULL
, &act
);
4216 if (act
.sa_handler
!= SIG_IGN
)
4217 sigaddset (&caught_signals
, sig
[i
]);
4220 act
.sa_handler
= sighandler
;
4221 act
.sa_mask
= caught_signals
;
4224 for (i
= 0; i
< nsigs
; i
++)
4225 if (sigismember (&caught_signals
, sig
[i
]))
4226 sigaction (sig
[i
], &act
, NULL
);
4228 for (i
= 0; i
< nsigs
; i
++)
4229 if (signal (sig
[i
], SIG_IGN
) != SIG_IGN
)
4231 signal (sig
[i
], sighandler
);
4232 siginterrupt (sig
[i
], 1);
4236 signal (SIGCHLD
, SIG_DFL
); /* Don't inherit CHLD handling from parent. */
4238 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4239 atexit (exit_cleanup
);
4242 gkey
.sword
= SIZE_MAX
;
4244 files
= xnmalloc (argc
, sizeof *files
);
4248 /* Parse an operand as a file after "--" was seen; or if
4249 pedantic and a file was seen, unless the POSIX version
4250 predates 1003.1-2001 and -c was not seen and the operand is
4251 "-o FILE" or "-oFILE". */
4255 || (posixly_correct
&& nfiles
!= 0
4256 && ! (obsolete_usage
4259 && argv
[optind
][0] == '-' && argv
[optind
][1] == 'o'
4260 && (argv
[optind
][2] || optind
+ 1 != argc
)))
4261 || ((c
= getopt_long (argc
, argv
, short_options
,
4267 files
[nfiles
++] = argv
[optind
++];
4273 if (optarg
[0] == '+')
4275 bool minus_pos_usage
= (optind
!= argc
&& argv
[optind
][0] == '-'
4276 && ISDIGIT (argv
[optind
][1]));
4277 obsolete_usage
|= minus_pos_usage
&& !posixly_correct
;
4280 /* Treat +POS1 [-POS2] as a key if possible; but silently
4281 treat an operand as a file if it is not a valid +POS1. */
4282 key
= key_init (&key_buf
);
4283 s
= parse_field_count (optarg
+ 1, &key
->sword
, NULL
);
4285 s
= parse_field_count (s
+ 1, &key
->schar
, NULL
);
4286 if (! (key
->sword
|| key
->schar
))
4287 key
->sword
= SIZE_MAX
;
4288 if (! s
|| *set_ordering (s
, key
, bl_start
))
4292 if (minus_pos_usage
)
4294 char const *optarg1
= argv
[optind
++];
4295 s
= parse_field_count (optarg1
+ 1, &key
->eword
,
4296 N_("invalid number after '-'"));
4297 /* When called with a non-NULL message ID,
4298 parse_field_count cannot return NULL. Tell static
4299 analysis tools that dereferencing S is safe. */
4302 s
= parse_field_count (s
+ 1, &key
->echar
,
4303 N_("invalid number after '.'"));
4304 if (!key
->echar
&& key
->eword
)
4306 /* obsolescent syntax +A.x -B.y is equivalent to:
4307 -k A+1.x+1,B.y (when y = 0)
4308 -k A+1.x+1,B+1.y (when y > 0)
4309 So eword is decremented as in the -k case
4310 only when the end field (B) is specified and
4314 if (*set_ordering (s
, key
, bl_end
))
4315 badfieldspec (optarg1
,
4316 N_("stray character in field spec"));
4318 key
->obsolete_used
= true;
4324 files
[nfiles
++] = optarg
;
4328 c
= XARGMATCH ("--sort", optarg
, sort_args
, sort_types
);
4345 set_ordering (str
, &gkey
, bl_both
);
4351 ? XARGMATCH ("--check", optarg
, check_args
, check_types
)
4356 if (checkonly
&& checkonly
!= c
)
4357 incompatible_options ("cC");
4361 case COMPRESS_PROGRAM_OPTION
:
4362 if (compress_program
&& !STREQ (compress_program
, optarg
))
4363 error (SORT_FAILURE
, 0, _("multiple compress programs specified"));
4364 compress_program
= optarg
;
4367 case DEBUG_PROGRAM_OPTION
:
4371 case FILES0_FROM_OPTION
:
4372 files_from
= optarg
;
4376 key
= key_init (&key_buf
);
4379 s
= parse_field_count (optarg
, &key
->sword
,
4380 N_("invalid number at field start"));
4383 /* Provoke with 'sort -k0' */
4384 badfieldspec (optarg
, N_("field number is zero"));
4388 s
= parse_field_count (s
+ 1, &key
->schar
,
4389 N_("invalid number after '.'"));
4392 /* Provoke with 'sort -k1.0' */
4393 badfieldspec (optarg
, N_("character offset is zero"));
4396 if (! (key
->sword
|| key
->schar
))
4397 key
->sword
= SIZE_MAX
;
4398 s
= set_ordering (s
, key
, bl_start
);
4401 key
->eword
= SIZE_MAX
;
4407 s
= parse_field_count (s
+ 1, &key
->eword
,
4408 N_("invalid number after ','"));
4411 /* Provoke with 'sort -k1,0' */
4412 badfieldspec (optarg
, N_("field number is zero"));
4416 s
= parse_field_count (s
+ 1, &key
->echar
,
4417 N_("invalid number after '.'"));
4419 s
= set_ordering (s
, key
, bl_end
);
4422 badfieldspec (optarg
, N_("stray character in field spec"));
4431 specify_nmerge (oi
, c
, optarg
);
4435 if (outfile
&& !STREQ (outfile
, optarg
))
4436 error (SORT_FAILURE
, 0, _("multiple output files specified"));
4440 case RANDOM_SOURCE_OPTION
:
4441 if (random_source
&& !STREQ (random_source
, optarg
))
4442 error (SORT_FAILURE
, 0, _("multiple random sources specified"));
4443 random_source
= optarg
;
4451 specify_sort_size (oi
, c
, optarg
);
4456 char newtab
= optarg
[0];
4458 error (SORT_FAILURE
, 0, _("empty tab"));
4461 if (STREQ (optarg
, "\\0"))
4465 /* Provoke with 'sort -txx'. Complain about
4466 "multi-character tab" instead of "multibyte tab", so
4467 that the diagnostic's wording does not need to be
4468 changed once multibyte characters are supported. */
4469 error (SORT_FAILURE
, 0, _("multi-character tab %s"),
4473 if (tab
!= TAB_DEFAULT
&& tab
!= newtab
)
4474 error (SORT_FAILURE
, 0, _("incompatible tabs"));
4480 add_temp_dir (optarg
);
4483 case PARALLEL_OPTION
:
4484 nthreads
= specify_nthreads (oi
, c
, optarg
);
4492 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4493 through Solaris 7. It is also accepted by many non-Solaris
4494 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4495 -y is marked as obsolete starting with Solaris 8 (1999), but is
4496 still accepted as of Solaris 10 prerelease (2004).
4498 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4499 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4500 and which in general ignores the argument after "-y" if it
4501 consists entirely of digits (it can even be empty). */
4502 if (optarg
== argv
[optind
- 1])
4505 for (p
= optarg
; ISDIGIT (*p
); p
++)
4507 optind
-= (*p
!= '\0');
4515 case_GETOPT_HELP_CHAR
;
4517 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
4520 usage (SORT_FAILURE
);
4528 /* When using --files0-from=F, you may not specify any files
4529 on the command-line. */
4532 error (0, 0, _("extra operand %s"), quote (files
[0]));
4533 fprintf (stderr
, "%s\n",
4534 _("file operands cannot be combined with --files0-from"));
4535 usage (SORT_FAILURE
);
4538 if (STREQ (files_from
, "-"))
4542 stream
= fopen (files_from
, "r");
4544 error (SORT_FAILURE
, errno
, _("cannot open %s for reading"),
4545 quote (files_from
));
4548 readtokens0_init (&tok
);
4550 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
4551 error (SORT_FAILURE
, 0, _("cannot read file names from %s"),
4552 quote (files_from
));
4560 for (i
= 0; i
< nfiles
; i
++)
4562 if (STREQ (files
[i
], "-"))
4563 error (SORT_FAILURE
, 0, _("when reading file names from stdin, "
4564 "no file name of %s allowed"),
4566 else if (files
[i
][0] == '\0')
4568 /* Using the standard 'filename:line-number:' prefix here is
4569 not totally appropriate, since NUL is the separator,
4570 not NL, but it might be better than nothing. */
4571 unsigned long int file_number
= i
+ 1;
4572 error (SORT_FAILURE
, 0,
4573 _("%s:%lu: invalid zero-length file name"),
4574 quotearg_colon (files_from
), file_number
);
4579 error (SORT_FAILURE
, 0, _("no input from %s"),
4580 quote (files_from
));
4583 /* Inheritance of global options to individual keys. */
4584 for (key
= keylist
; key
; key
= key
->next
)
4586 if (default_key_compare (key
) && !key
->reverse
)
4588 key
->ignore
= gkey
.ignore
;
4589 key
->translate
= gkey
.translate
;
4590 key
->skipsblanks
= gkey
.skipsblanks
;
4591 key
->skipeblanks
= gkey
.skipeblanks
;
4592 key
->month
= gkey
.month
;
4593 key
->numeric
= gkey
.numeric
;
4594 key
->general_numeric
= gkey
.general_numeric
;
4595 key
->human_numeric
= gkey
.human_numeric
;
4596 key
->version
= gkey
.version
;
4597 key
->random
= gkey
.random
;
4598 key
->reverse
= gkey
.reverse
;
4601 need_random
|= key
->random
;
4604 if (!keylist
&& !default_key_compare (&gkey
))
4608 need_random
|= gkey
.random
;
4611 check_ordering_compatibility ();
4615 if (checkonly
|| outfile
)
4617 static char opts
[] = "X --debug";
4618 opts
[0] = (checkonly
? checkonly
: 'o');
4619 incompatible_options (opts
);
4622 /* Always output the locale in debug mode, since this
4623 is such a common source of confusion. */
4624 if (hard_LC_COLLATE
)
4625 error (0, 0, _("using %s sorting rules"),
4626 quote (setlocale (LC_COLLATE
, NULL
)));
4628 error (0, 0, _("using simple byte comparison"));
4629 key_warnings (&gkey
, gkey_only
);
4632 reverse
= gkey
.reverse
;
4635 random_md5_state_init (random_source
);
4637 if (temp_dir_count
== 0)
4639 char const *tmp_dir
= getenv ("TMPDIR");
4640 add_temp_dir (tmp_dir
? tmp_dir
: DEFAULT_TMPDIR
);
4645 static char *minus
= (char *) "-";
4651 /* Need to re-check that we meet the minimum requirement for memory
4652 usage with the final value for NMERGE. */
4654 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
4659 error (SORT_FAILURE
, 0, _("extra operand %s not allowed with -%c"),
4660 quote (files
[1]), checkonly
);
4664 static char opts
[] = {0, 'o', 0};
4665 opts
[0] = checkonly
;
4666 incompatible_options (opts
);
4669 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4670 input is not properly sorted. */
4671 exit (check (files
[0], checkonly
) ? EXIT_SUCCESS
: SORT_OUT_OF_ORDER
);
4674 /* Check all inputs are accessible, or exit immediately. */
4675 check_inputs (files
, nfiles
);
4677 /* Check output is writable, or exit immediately. */
4678 check_output (outfile
);
4682 struct sortfile
*sortfiles
= xcalloc (nfiles
, sizeof *sortfiles
);
4685 for (i
= 0; i
< nfiles
; ++i
)
4686 sortfiles
[i
].name
= files
[i
];
4688 merge (sortfiles
, 0, nfiles
, outfile
);
4689 IF_LINT (free (sortfiles
));
4695 unsigned long int np
= num_processors (NPROC_CURRENT_OVERRIDABLE
);
4696 nthreads
= MIN (np
, DEFAULT_MAX_THREADS
);
4699 /* Avoid integer overflow later. */
4700 size_t nthreads_max
= SIZE_MAX
/ (2 * sizeof (struct merge_node
));
4701 nthreads
= MIN (nthreads
, nthreads_max
);
4703 sort (files
, nfiles
, outfile
, nthreads
);
4706 if (have_read_stdin
&& fclose (stdin
) == EOF
)
4707 die (_("close failed"), "-");
4709 exit (EXIT_SUCCESS
);