1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
26 #include <sys/types.h>
32 #include "filevercmp.h"
40 #include "readtokens0.h"
43 #include "strnumcmp.h"
48 #if HAVE_SYS_RESOURCE_H
49 # include <sys/resource.h>
52 struct rlimit
{ size_t rlim_cur
; };
53 # define getrlimit(Resource, Rlp) (-1)
56 /* The official name of this program (e.g., no `g' prefix). */
57 #define PROGRAM_NAME "sort"
60 proper_name ("Mike Haertel"), \
61 proper_name ("Paul Eggert")
63 #if HAVE_LANGINFO_CODESET
64 # include <langinfo.h>
67 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
70 # define SA_NOCLDSTOP 0
71 /* No sigprocmask. Always 'return' zero. */
72 # define sigprocmask(How, Set, Oset) (0)
74 # if ! HAVE_SIGINTERRUPT
75 # define siginterrupt(sig, flag) /* empty */
79 #if !defined OPEN_MAX && defined NR_OPEN
80 # define OPEN_MAX NR_OPEN
86 #define UCHAR_LIM (UCHAR_MAX + 1)
88 #ifndef DEFAULT_TMPDIR
89 # define DEFAULT_TMPDIR "/tmp"
95 /* POSIX says to exit with status 1 if invoked with -c and the
96 input is not properly sorted. */
97 SORT_OUT_OF_ORDER
= 1,
99 /* POSIX says any other irregular exit must exit with a status
100 code greater than 1. */
106 /* The number of times we should try to fork a compression process
107 (we retry if the fork call fails). We don't _need_ to compress
108 temp files, this is just to reduce disk access, so this number
110 MAX_FORK_TRIES_COMPRESS
= 2,
112 /* The number of times we should try to fork a decompression process.
113 If we can't fork a decompression process, we can't sort, so this
114 number should be big. */
115 MAX_FORK_TRIES_DECOMPRESS
= 8
118 /* The representation of the decimal point in the current locale. */
119 static int decimal_point
;
121 /* Thousands separator; if -1, then there isn't one. */
122 static int thousands_sep
;
124 /* Nonzero if the corresponding locales are hard. */
125 static bool hard_LC_COLLATE
;
127 static bool hard_LC_TIME
;
130 #define NONZERO(x) ((x) != 0)
132 /* The kind of blanks for '-b' to skip in various options. */
133 enum blanktype
{ bl_start
, bl_end
, bl_both
};
135 /* The character marking end of line. Default to \n. */
136 static char eolchar
= '\n';
138 /* Lines are held in core as counted strings. */
141 char *text
; /* Text of the line. */
142 size_t length
; /* Length including final newline. */
143 char *keybeg
; /* Start of first key. */
144 char *keylim
; /* Limit of first key. */
150 char *buf
; /* Dynamically allocated buffer,
151 partitioned into 3 regions:
154 - an array of lines, in reverse order. */
155 size_t used
; /* Number of bytes used for input data. */
156 size_t nlines
; /* Number of lines in the line array. */
157 size_t alloc
; /* Number of bytes allocated. */
158 size_t left
; /* Number of bytes left from previous reads. */
159 size_t line_bytes
; /* Number of bytes to reserve for each line. */
160 bool eof
; /* An EOF has been read. */
165 size_t sword
; /* Zero-origin 'word' to start at. */
166 size_t schar
; /* Additional characters to skip. */
167 size_t eword
; /* Zero-origin first word after field. */
168 size_t echar
; /* Additional characters in field. */
169 bool const *ignore
; /* Boolean array of characters to ignore. */
170 char const *translate
; /* Translation applied to characters. */
171 bool skipsblanks
; /* Skip leading blanks when finding start. */
172 bool skipeblanks
; /* Skip leading blanks when finding end. */
173 bool numeric
; /* Flag for numeric comparison. Handle
174 strings of digits with optional decimal
175 point, but no exponential notation. */
176 bool random
; /* Sort by random hash of key. */
177 bool general_numeric
; /* Flag for general, numeric comparison.
178 Handle numbers in exponential notation. */
179 bool human_numeric
; /* Flag for sorting by human readable
180 units with either SI xor IEC prefixes. */
181 int si_present
; /* Flag for checking for mixed SI and IEC. */
182 bool month
; /* Flag for comparison by month name. */
183 bool reverse
; /* Reverse the sense of comparison. */
184 bool version
; /* sort by version number */
185 struct keyfield
*next
; /* Next keyfield to try. */
194 /* FIXME: None of these tables work with multibyte character sets.
195 Also, there are many other bugs when handling multibyte characters.
196 One way to fix this is to rewrite `sort' to use wide characters
197 internally, but doing this with good performance is a bit
200 /* Table of blanks. */
201 static bool blanks
[UCHAR_LIM
];
203 /* Table of non-printing characters. */
204 static bool nonprinting
[UCHAR_LIM
];
206 /* Table of non-dictionary characters (not letters, digits, or blanks). */
207 static bool nondictionary
[UCHAR_LIM
];
209 /* Translation table folding lower case to upper. */
210 static char fold_toupper
[UCHAR_LIM
];
212 #define MONTHS_PER_YEAR 12
214 /* Table mapping month names to integers.
215 Alphabetic order allows binary search. */
216 static struct month monthtab
[] =
232 /* During the merge phase, the number of files to merge at once. */
233 #define NMERGE_DEFAULT 16
235 /* Minimum size for a merge or check buffer. */
236 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
238 /* Minimum sort size; the code might not work with smaller sizes. */
239 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
241 /* The number of bytes needed for a merge or check buffer, which can
242 function relatively efficiently even if it holds only one line. If
243 a longer line is seen, this value is increased. */
244 static size_t merge_buffer_size
= MAX (MIN_MERGE_BUFFER_SIZE
, 256 * 1024);
246 /* The approximate maximum number of bytes of main memory to use, as
247 specified by the user. Zero if the user has not specified a size. */
248 static size_t sort_size
;
250 /* The guessed size for non-regular files. */
251 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
253 /* Array of directory names in which any temporary files are to be created. */
254 static char const **temp_dirs
;
256 /* Number of temporary directory names used. */
257 static size_t temp_dir_count
;
259 /* Number of allocated slots in temp_dirs. */
260 static size_t temp_dir_alloc
;
262 /* Flag to reverse the order of all comparisons. */
265 /* Flag for stable sort. This turns off the last ditch bytewise
266 comparison of lines, and instead leaves lines in the same order
267 they were read if all keys compare equal. */
270 /* If TAB has this value, blanks separate fields. */
271 enum { TAB_DEFAULT
= CHAR_MAX
+ 1 };
273 /* Tab character separating fields. If TAB_DEFAULT, then fields are
274 separated by the empty string between a non-blank character and a blank
276 static int tab
= TAB_DEFAULT
;
278 /* Flag to remove consecutive duplicate lines from the output.
279 Only the last of a sequence of equal lines will be output. */
282 /* Nonzero if any of the input files are the standard input. */
283 static bool have_read_stdin
;
285 /* List of key field comparisons to be tried. */
286 static struct keyfield
*keylist
;
288 /* Program used to (de)compress temp files. Must accept -d. */
289 static char const *compress_program
;
291 /* Maximum number of files to merge in one go. If more than this
292 number are present, temp files will be used. */
293 static unsigned int nmerge
= NMERGE_DEFAULT
;
295 static void sortlines_temp (struct line
*, size_t, struct line
*);
297 /* Report MESSAGE for FILE, then clean up and exit.
298 If FILE is null, it represents standard output. */
300 static void die (char const *, char const *) ATTRIBUTE_NORETURN
;
302 die (char const *message
, char const *file
)
304 error (0, errno
, "%s: %s", message
, file
? file
: _("standard output"));
311 if (status
!= EXIT_SUCCESS
)
312 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
317 Usage: %s [OPTION]... [FILE]...\n\
318 or: %s [OPTION]... --files0-from=F\n\
320 program_name
, program_name
);
322 Write sorted concatenation of all FILE(s) to standard output.\n\
326 Mandatory arguments to long options are mandatory for short options too.\n\
333 -b, --ignore-leading-blanks ignore leading blanks\n\
334 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
335 -f, --ignore-case fold lower case to upper case characters\n\
338 -g, --general-numeric-sort compare according to general numerical value\n\
339 -i, --ignore-nonprinting consider only printable characters\n\
340 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
343 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
346 -n, --numeric-sort compare according to string numerical value\n\
347 -R, --random-sort sort by random hash of keys\n\
348 --random-source=FILE get random bytes from FILE\n\
349 -r, --reverse reverse the result of comparisons\n\
352 --sort=WORD sort according to WORD:\n\
353 general-numeric -g, human-numeric -h, month -M,\n\
354 numeric -n, random -R, version -V\n\
355 -V, --version-sort natural sort of (version) numbers within text\n\
363 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
364 for more use temp files\n\
367 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
368 -C, --check=quiet, --check=silent like -c, but do not report first bad line\n\
369 --compress-program=PROG compress temporaries with PROG;\n\
370 decompress them with PROG -d\n\
371 --files0-from=F read input from the files specified by\n\
372 NUL-terminated names in file F;\n\
373 If F is - then read names from standard input\n\
376 -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n\
377 (default end of line)\n\
378 -m, --merge merge already sorted files; do not sort\n\
381 -o, --output=FILE write result to FILE instead of standard output\n\
382 -s, --stable stabilize sort by disabling last-resort comparison\n\
383 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
386 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
387 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
388 multiple options specify multiple directories\n\
389 -u, --unique with -c, check for strict ordering;\n\
390 without -c, output only the first of an equal run\n\
393 -z, --zero-terminated end lines with 0 byte, not newline\n\
395 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
396 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
399 POS is F[.C][OPTS], where F is the field number and C the character position\n\
400 in the field; both are origin 1. If neither -t nor -b is in effect, characters\n\
401 in a field are counted from the beginning of the preceding whitespace. OPTS is\n\
402 one or more single-letter ordering options, which override global ordering\n\
403 options for that key. If no key is given, use the entire line as the key.\n\
405 SIZE may be followed by the following multiplicative suffixes:\n\
408 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
410 With no FILE, or when FILE is -, read standard input.\n\
413 The locale specified by the environment affects sort order.\n\
414 Set LC_ALL=C to get the traditional sort order that uses\n\
415 native byte values.\n\
417 emit_bug_reporting_address ();
423 /* For long options that have no equivalent short option, use a
424 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
427 CHECK_OPTION
= CHAR_MAX
+ 1,
428 COMPRESS_PROGRAM_OPTION
,
431 RANDOM_SOURCE_OPTION
,
435 static char const short_options
[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
437 static struct option
const long_options
[] =
439 {"ignore-leading-blanks", no_argument
, NULL
, 'b'},
440 {"check", optional_argument
, NULL
, CHECK_OPTION
},
441 {"compress-program", required_argument
, NULL
, COMPRESS_PROGRAM_OPTION
},
442 {"dictionary-order", no_argument
, NULL
, 'd'},
443 {"ignore-case", no_argument
, NULL
, 'f'},
444 {"files0-from", required_argument
, NULL
, FILES0_FROM_OPTION
},
445 {"general-numeric-sort", no_argument
, NULL
, 'g'},
446 {"ignore-nonprinting", no_argument
, NULL
, 'i'},
447 {"key", required_argument
, NULL
, 'k'},
448 {"merge", no_argument
, NULL
, 'm'},
449 {"month-sort", no_argument
, NULL
, 'M'},
450 {"numeric-sort", no_argument
, NULL
, 'n'},
451 {"human-numeric-sort", no_argument
, NULL
, 'h'},
452 {"version-sort", no_argument
, NULL
, 'V'},
453 {"random-sort", no_argument
, NULL
, 'R'},
454 {"random-source", required_argument
, NULL
, RANDOM_SOURCE_OPTION
},
455 {"sort", required_argument
, NULL
, SORT_OPTION
},
456 {"output", required_argument
, NULL
, 'o'},
457 {"reverse", no_argument
, NULL
, 'r'},
458 {"stable", no_argument
, NULL
, 's'},
459 {"batch-size", required_argument
, NULL
, NMERGE_OPTION
},
460 {"buffer-size", required_argument
, NULL
, 'S'},
461 {"field-separator", required_argument
, NULL
, 't'},
462 {"temporary-directory", required_argument
, NULL
, 'T'},
463 {"unique", no_argument
, NULL
, 'u'},
464 {"zero-terminated", no_argument
, NULL
, 'z'},
465 {GETOPT_HELP_OPTION_DECL
},
466 {GETOPT_VERSION_OPTION_DECL
},
470 #define CHECK_TABLE \
472 _ct_("silent", 'C') \
473 _ct_("diagnose-first", 'c')
475 static char const *const check_args
[] =
477 #define _ct_(_s, _c) _s,
481 static char const check_types
[] =
483 #define _ct_(_s, _c) _c,
489 _st_("general-numeric", 'g') \
490 _st_("human-numeric", 'h') \
492 _st_("numeric", 'n') \
493 _st_("random", 'R') \
496 static char const *const sort_args
[] =
498 #define _st_(_s, _c) _s,
502 static char const sort_types
[] =
504 #define _st_(_s, _c) _c,
509 /* The set of signals that are caught. */
510 static sigset_t caught_signals
;
512 /* Critical section status. */
519 /* Enter a critical section. */
520 static struct cs_status
523 struct cs_status status
;
524 status
.valid
= (sigprocmask (SIG_BLOCK
, &caught_signals
, &status
.sigs
) == 0);
528 /* Leave a critical section. */
530 cs_leave (struct cs_status status
)
534 /* Ignore failure when restoring the signal mask. */
535 sigprocmask (SIG_SETMASK
, &status
.sigs
, NULL
);
539 /* The list of temporary files. */
542 struct tempnode
*volatile next
;
543 pid_t pid
; /* If compressed, the pid of compressor, else zero */
544 char name
[1]; /* Actual size is 1 + file name length. */
546 static struct tempnode
*volatile temphead
;
547 static struct tempnode
*volatile *temptail
= &temphead
;
552 pid_t pid
; /* If compressed, the pid of compressor, else zero */
555 /* A table where we store compression process states. We clean up all
556 processes in a timely manner so as not to exhaust system resources,
557 so we store the info on whether the process is still running, or has
559 static Hash_table
*proctab
;
561 enum { INIT_PROCTAB_SIZE
= 47 };
563 enum procstate
{ ALIVE
, ZOMBIE
};
565 /* A proctab entry. The COUNT field is there in case we fork a new
566 compression process that has the same PID as an old zombie process
567 that is still in the table (because the process to decompress the
568 temp file it was associated with hasn't started yet). */
572 enum procstate state
;
577 proctab_hasher (const void *entry
, size_t tabsize
)
579 const struct procnode
*node
= entry
;
580 return node
->pid
% tabsize
;
584 proctab_comparator (const void *e1
, const void *e2
)
586 const struct procnode
*n1
= e1
, *n2
= e2
;
587 return n1
->pid
== n2
->pid
;
590 /* The total number of forked processes (compressors and decompressors)
591 that have not been reaped yet. */
592 static size_t nprocs
;
594 /* The number of child processes we'll allow before we try to reap some. */
595 enum { MAX_PROCS_BEFORE_REAP
= 2 };
597 /* If 0 < PID, wait for the child process with that PID to exit.
598 If PID is -1, clean up a random child process which has finished and
599 return the process ID of that child. If PID is -1 and no processes
600 have quit yet, return 0 without waiting. */
606 pid_t cpid
= waitpid (pid
, &status
, pid
< 0 ? WNOHANG
: 0);
609 error (SORT_FAILURE
, errno
, _("waiting for %s [-d]"),
613 if (! WIFEXITED (status
) || WEXITSTATUS (status
))
614 error (SORT_FAILURE
, 0, _("%s [-d] terminated abnormally"),
622 /* Add the PID of a running compression process to proctab, or update
623 the entry COUNT and STATE fields if it's already there. This also
624 creates the table for us the first time it's called. */
627 register_proc (pid_t pid
)
629 struct procnode test
, *node
;
633 proctab
= hash_initialize (INIT_PROCTAB_SIZE
, NULL
,
642 node
= hash_lookup (proctab
, &test
);
650 node
= xmalloc (sizeof *node
);
654 if (hash_insert (proctab
, node
) == NULL
)
659 /* This is called when we reap a random process. We don't know
660 whether we have reaped a compression process or a decompression
661 process until we look in the table. If there's an ALIVE entry for
662 it, then we have reaped a compression process, so change the state
663 to ZOMBIE. Otherwise, it's a decompression processes, so ignore it. */
666 update_proc (pid_t pid
)
668 struct procnode test
, *node
;
671 node
= hash_lookup (proctab
, &test
);
673 node
->state
= ZOMBIE
;
676 /* This is for when we need to wait for a compression process to exit.
677 If it has a ZOMBIE entry in the table then it's already dead and has
678 been reaped. Note that if there's an ALIVE entry for it, it still may
679 already have died and been reaped if a second process was created with
680 the same PID. This is probably exceedingly rare, but to be on the safe
681 side we will have to wait for any compression process with this PID. */
684 wait_proc (pid_t pid
)
686 struct procnode test
, *node
;
689 node
= hash_lookup (proctab
, &test
);
690 if (node
->state
== ALIVE
)
693 node
->state
= ZOMBIE
;
696 hash_delete (proctab
, node
);
701 /* Keep reaping finished children as long as there are more to reap.
702 This doesn't block waiting for any of them, it only reaps those
703 that are already dead. */
710 while (0 < nprocs
&& (pid
= reap (-1)))
714 /* Clean up any remaining temporary files. */
719 struct tempnode
const *node
;
721 for (node
= temphead
; node
; node
= node
->next
)
726 /* Cleanup actions to take when exiting. */
733 /* Clean up any remaining temporary files in a critical section so
734 that a signal handler does not try to clean them too. */
735 struct cs_status cs
= cs_enter ();
743 /* Create a new temporary file, returning its newly allocated tempnode.
744 Store into *PFD the file descriptor open for writing.
745 If the creation fails, return NULL and store -1 into *PFD if the
746 failure is due to file descriptor exhaustion and
747 SURVIVE_FD_EXHAUSTION; otherwise, die. */
749 static struct tempnode
*
750 create_temp_file (int *pfd
, bool survive_fd_exhaustion
)
752 static char const slashbase
[] = "/sortXXXXXX";
753 static size_t temp_dir_index
;
756 char const *temp_dir
= temp_dirs
[temp_dir_index
];
757 size_t len
= strlen (temp_dir
);
758 struct tempnode
*node
=
759 xmalloc (offsetof (struct tempnode
, name
) + len
+ sizeof slashbase
);
760 char *file
= node
->name
;
763 memcpy (file
, temp_dir
, len
);
764 memcpy (file
+ len
, slashbase
, sizeof slashbase
);
767 if (++temp_dir_index
== temp_dir_count
)
770 /* Create the temporary file in a critical section, to avoid races. */
776 temptail
= &node
->next
;
784 if (! (survive_fd_exhaustion
&& errno
== EMFILE
))
785 error (SORT_FAILURE
, errno
, _("cannot create temporary file in %s"),
795 /* Return a stream for FILE, opened with mode HOW. A null FILE means
796 standard output; HOW should be "w". When opening for input, "-"
797 means standard input. To avoid confusion, do not return file
798 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
799 opening an ordinary FILE. Return NULL if unsuccessful. */
802 stream_open (const char *file
, const char *how
)
806 if (STREQ (file
, "-") && *how
== 'r')
808 have_read_stdin
= true;
811 return fopen (file
, how
);
814 /* Same as stream_open, except always return a non-null value; die on
818 xfopen (const char *file
, const char *how
)
820 FILE *fp
= stream_open (file
, how
);
822 die (_("open failed"), file
);
826 /* Close FP, whose name is FILE, and report any errors. */
829 xfclose (FILE *fp
, char const *file
)
834 /* Allow reading stdin from tty more than once. */
840 /* Don't close stdout just yet. close_stdout does that. */
841 if (fflush (fp
) != 0)
842 die (_("fflush failed"), file
);
846 if (fclose (fp
) != 0)
847 die (_("close failed"), file
);
853 dup2_or_die (int oldfd
, int newfd
)
855 if (dup2 (oldfd
, newfd
) < 0)
856 error (SORT_FAILURE
, errno
, _("dup2 failed"));
859 /* Fork a child process for piping to and do common cleanup. The
860 TRIES parameter tells us how many times to try to fork before
861 giving up. Return the PID of the child, or -1 (setting errno)
865 pipe_fork (int pipefds
[2], size_t tries
)
867 #if HAVE_WORKING_FORK
868 struct tempnode
*saved_temphead
;
870 unsigned int wait_retry
= 1;
871 pid_t pid
IF_LINT (= -1);
874 if (pipe (pipefds
) < 0)
879 /* This is so the child process won't delete our temp files
880 if it receives a signal before exec-ing. */
882 saved_temphead
= temphead
;
888 temphead
= saved_temphead
;
893 if (0 <= pid
|| errno
!= EAGAIN
)
912 close (STDIN_FILENO
);
913 close (STDOUT_FILENO
);
920 #else /* ! HAVE_WORKING_FORK */
925 /* Create a temporary file and start a compression program to filter output
926 to that file. Set *PFP to the file handle and if PPID is non-NULL,
927 set *PPID to the PID of the newly-created process. If the creation
928 fails, return NULL if the failure is due to file descriptor
929 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
932 maybe_create_temp (FILE **pfp
, pid_t
*ppid
, bool survive_fd_exhaustion
)
935 struct tempnode
*node
= create_temp_file (&tempfd
, survive_fd_exhaustion
);
943 if (compress_program
)
947 node
->pid
= pipe_fork (pipefds
, MAX_FORK_TRIES_COMPRESS
);
954 register_proc (node
->pid
);
956 else if (node
->pid
== 0)
959 dup2_or_die (tempfd
, STDOUT_FILENO
);
961 dup2_or_die (pipefds
[0], STDIN_FILENO
);
964 if (execlp (compress_program
, compress_program
, (char *) NULL
) < 0)
965 error (SORT_FAILURE
, errno
, _("couldn't execute %s"),
972 *pfp
= fdopen (tempfd
, "w");
974 die (_("couldn't create temporary file"), name
);
982 /* Create a temporary file and start a compression program to filter output
983 to that file. Set *PFP to the file handle and if *PPID is non-NULL,
984 set it to the PID of the newly-created process. Die on failure. */
987 create_temp (FILE **pfp
, pid_t
*ppid
)
989 return maybe_create_temp (pfp
, ppid
, false);
992 /* Open a compressed temp file and start a decompression process through
993 which to filter the input. PID must be the valid processes ID of the
994 process used to compress the file. Return NULL (setting errno to
995 EMFILE) if we ran out of file descriptors, and die on any other
999 open_temp (const char *name
, pid_t pid
)
1001 int tempfd
, pipefds
[2];
1006 tempfd
= open (name
, O_RDONLY
);
1010 switch (pipe_fork (pipefds
, MAX_FORK_TRIES_DECOMPRESS
))
1013 if (errno
!= EMFILE
)
1014 error (SORT_FAILURE
, errno
, _("couldn't create process for %s -d"),
1022 dup2_or_die (tempfd
, STDIN_FILENO
);
1024 dup2_or_die (pipefds
[1], STDOUT_FILENO
);
1027 execlp (compress_program
, compress_program
, "-d", (char *) NULL
);
1028 error (SORT_FAILURE
, errno
, _("couldn't execute %s -d"),
1035 fp
= fdopen (pipefds
[0], "r");
1038 int saved_errno
= errno
;
1040 errno
= saved_errno
;
1049 write_bytes (const char *buf
, size_t n_bytes
, FILE *fp
, const char *output_file
)
1051 if (fwrite (buf
, 1, n_bytes
, fp
) != n_bytes
)
1052 die (_("write failed"), output_file
);
1055 /* Append DIR to the array of temporary directory names. */
1057 add_temp_dir (char const *dir
)
1059 if (temp_dir_count
== temp_dir_alloc
)
1060 temp_dirs
= X2NREALLOC (temp_dirs
, &temp_dir_alloc
);
1062 temp_dirs
[temp_dir_count
++] = dir
;
1065 /* Remove NAME from the list of temporary files. */
1068 zaptemp (const char *name
)
1070 struct tempnode
*volatile *pnode
;
1071 struct tempnode
*node
;
1072 struct tempnode
*next
;
1074 int unlink_errno
= 0;
1075 struct cs_status cs
;
1077 for (pnode
= &temphead
; (node
= *pnode
)->name
!= name
; pnode
= &node
->next
)
1080 /* Unlink the temporary file in a critical section to avoid races. */
1083 unlink_status
= unlink (name
);
1084 unlink_errno
= errno
;
1088 if (unlink_status
!= 0)
1089 error (0, unlink_errno
, _("warning: cannot remove: %s"), name
);
1095 #if HAVE_NL_LANGINFO
1098 struct_month_cmp (const void *m1
, const void *m2
)
1100 struct month
const *month1
= m1
;
1101 struct month
const *month2
= m2
;
1102 return strcmp (month1
->name
, month2
->name
);
1107 /* Initialize the character class tables. */
1114 for (i
= 0; i
< UCHAR_LIM
; ++i
)
1116 blanks
[i
] = !! isblank (i
);
1117 nonprinting
[i
] = ! isprint (i
);
1118 nondictionary
[i
] = ! isalnum (i
) && ! isblank (i
);
1119 fold_toupper
[i
] = toupper (i
);
1122 #if HAVE_NL_LANGINFO
1123 /* If we're not in the "C" locale, read different names for months. */
1126 for (i
= 0; i
< MONTHS_PER_YEAR
; i
++)
1133 s
= (char *) nl_langinfo (ABMON_1
+ i
);
1135 monthtab
[i
].name
= name
= xmalloc (s_len
+ 1);
1136 monthtab
[i
].val
= i
+ 1;
1138 for (j
= 0; j
< s_len
; j
++)
1139 name
[j
] = fold_toupper
[to_uchar (s
[j
])];
1142 qsort ((void *) monthtab
, MONTHS_PER_YEAR
,
1143 sizeof *monthtab
, struct_month_cmp
);
1148 /* Specify how many inputs may be merged at once.
1149 This may be set on the command-line with the
1150 --batch-size option. */
1152 specify_nmerge (int oi
, char c
, char const *s
)
1155 struct rlimit rlimit
;
1156 enum strtol_error e
= xstrtoumax (s
, NULL
, 10, &n
, NULL
);
1158 /* Try to find out how many file descriptors we'll be able
1159 to open. We need at least nmerge + 3 (STDIN_FILENO,
1160 STDOUT_FILENO and STDERR_FILENO). */
1161 unsigned int max_nmerge
= ((getrlimit (RLIMIT_NOFILE
, &rlimit
) == 0
1166 if (e
== LONGINT_OK
)
1170 e
= LONGINT_OVERFLOW
;
1175 error (0, 0, _("invalid --%s argument %s"),
1176 long_options
[oi
].name
, quote(s
));
1177 error (SORT_FAILURE
, 0,
1178 _("minimum --%s argument is %s"),
1179 long_options
[oi
].name
, quote("2"));
1181 else if (max_nmerge
< nmerge
)
1183 e
= LONGINT_OVERFLOW
;
1190 if (e
== LONGINT_OVERFLOW
)
1192 char max_nmerge_buf
[INT_BUFSIZE_BOUND (unsigned int)];
1193 error (0, 0, _("--%s argument %s too large"),
1194 long_options
[oi
].name
, quote(s
));
1195 error (SORT_FAILURE
, 0,
1196 _("maximum --%s argument with current rlimit is %s"),
1197 long_options
[oi
].name
,
1198 uinttostr (max_nmerge
, max_nmerge_buf
));
1201 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1204 /* Specify the amount of main memory to use when sorting. */
1206 specify_sort_size (int oi
, char c
, char const *s
)
1210 enum strtol_error e
= xstrtoumax (s
, &suffix
, 10, &n
, "EgGkKmMPtTYZ");
1212 /* The default unit is KiB. */
1213 if (e
== LONGINT_OK
&& ISDIGIT (suffix
[-1]))
1215 if (n
<= UINTMAX_MAX
/ 1024)
1218 e
= LONGINT_OVERFLOW
;
1221 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1222 if (e
== LONGINT_INVALID_SUFFIX_CHAR
&& ISDIGIT (suffix
[-1]) && ! suffix
[1])
1231 double mem
= physmem_total () * n
/ 100;
1233 /* Use "<", not "<=", to avoid problems with rounding. */
1234 if (mem
< UINTMAX_MAX
)
1240 e
= LONGINT_OVERFLOW
;
1245 if (e
== LONGINT_OK
)
1247 /* If multiple sort sizes are specified, take the maximum, so
1248 that option order does not matter. */
1255 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
1259 e
= LONGINT_OVERFLOW
;
1262 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1265 /* Return the default sort size. */
1267 default_sort_size (void)
1269 /* Let MEM be available memory or 1/8 of total memory, whichever
1271 double avail
= physmem_available ();
1272 double total
= physmem_total ();
1273 double mem
= MAX (avail
, total
/ 8);
1274 struct rlimit rlimit
;
1276 /* Let SIZE be MEM, but no more than the maximum object size or
1277 system resource limits. Avoid the MIN macro here, as it is not
1278 quite right when only one argument is floating point. Don't
1279 bother to check for values like RLIM_INFINITY since in practice
1280 they are not much less than SIZE_MAX. */
1281 size_t size
= SIZE_MAX
;
1284 if (getrlimit (RLIMIT_DATA
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1285 size
= rlimit
.rlim_cur
;
1287 if (getrlimit (RLIMIT_AS
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1288 size
= rlimit
.rlim_cur
;
1291 /* Leave a large safety margin for the above limits, as failure can
1292 occur when they are exceeded. */
1296 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1297 Exceeding RSS is not fatal, but can be quite slow. */
1298 if (getrlimit (RLIMIT_RSS
, &rlimit
) == 0 && rlimit
.rlim_cur
/ 16 * 15 < size
)
1299 size
= rlimit
.rlim_cur
/ 16 * 15;
1302 /* Use no less than the minimum. */
1303 return MAX (size
, MIN_SORT_SIZE
);
1306 /* Return the sort buffer size to use with the input files identified
1307 by FPS and FILES, which are alternate names of the same files.
1308 NFILES gives the number of input files; NFPS may be less. Assume
1309 that each input line requires LINE_BYTES extra bytes' worth of line
1310 information. Do not exceed the size bound specified by the user
1311 (or a default size bound, if the user does not specify one). */
1314 sort_buffer_size (FILE *const *fps
, size_t nfps
,
1315 char *const *files
, size_t nfiles
,
1318 /* A bound on the input size. If zero, the bound hasn't been
1320 static size_t size_bound
;
1322 /* In the worst case, each input byte is a newline. */
1323 size_t worst_case_per_input_byte
= line_bytes
+ 1;
1325 /* Keep enough room for one extra input line and an extra byte.
1326 This extra room might be needed when preparing to read EOF. */
1327 size_t size
= worst_case_per_input_byte
+ 1;
1331 for (i
= 0; i
< nfiles
; i
++)
1337 if ((i
< nfps
? fstat (fileno (fps
[i
]), &st
)
1338 : STREQ (files
[i
], "-") ? fstat (STDIN_FILENO
, &st
)
1339 : stat (files
[i
], &st
))
1341 die (_("stat failed"), files
[i
]);
1343 if (S_ISREG (st
.st_mode
))
1344 file_size
= st
.st_size
;
1347 /* The file has unknown size. If the user specified a sort
1348 buffer size, use that; otherwise, guess the size. */
1351 file_size
= INPUT_FILE_SIZE_GUESS
;
1356 size_bound
= sort_size
;
1358 size_bound
= default_sort_size ();
1361 /* Add the amount of memory needed to represent the worst case
1362 where the input consists entirely of newlines followed by a
1363 single non-newline. Check for overflow. */
1364 worst_case
= file_size
* worst_case_per_input_byte
+ 1;
1365 if (file_size
!= worst_case
/ worst_case_per_input_byte
1366 || size_bound
- size
<= worst_case
)
1374 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1375 must be at least sizeof (struct line). Allocate ALLOC bytes
1379 initbuf (struct buffer
*buf
, size_t line_bytes
, size_t alloc
)
1381 /* Ensure that the line array is properly aligned. If the desired
1382 size cannot be allocated, repeatedly halve it until allocation
1383 succeeds. The smaller allocation may hurt overall performance,
1384 but that's better than failing. */
1387 alloc
+= sizeof (struct line
) - alloc
% sizeof (struct line
);
1388 buf
->buf
= malloc (alloc
);
1392 if (alloc
<= line_bytes
+ 1)
1396 buf
->line_bytes
= line_bytes
;
1398 buf
->used
= buf
->left
= buf
->nlines
= 0;
1402 /* Return one past the limit of the line array. */
1404 static inline struct line
*
1405 buffer_linelim (struct buffer
const *buf
)
1407 return (struct line
*) (buf
->buf
+ buf
->alloc
);
1410 /* Return a pointer to the first character of the field specified
1414 begfield (const struct line
*line
, const struct keyfield
*key
)
1416 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1417 size_t sword
= key
->sword
;
1418 size_t schar
= key
->schar
;
1420 /* The leading field separator itself is included in a field when -t
1423 if (tab
!= TAB_DEFAULT
)
1424 while (ptr
< lim
&& sword
--)
1426 while (ptr
< lim
&& *ptr
!= tab
)
1432 while (ptr
< lim
&& sword
--)
1434 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1436 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1440 /* If we're ignoring leading blanks when computing the Start
1441 of the field, skip past them here. */
1442 if (key
->skipsblanks
)
1443 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1446 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1447 ptr
= MIN (lim
, ptr
+ schar
);
1452 /* Return the limit of (a pointer to the first character after) the field
1453 in LINE specified by KEY. */
1456 limfield (const struct line
*line
, const struct keyfield
*key
)
1458 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1459 size_t eword
= key
->eword
, echar
= key
->echar
;
1462 eword
++; /* Skip all of end field. */
1464 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1465 whichever comes first. If there are more than EWORD fields, leave
1466 PTR pointing at the beginning of the field having zero-based index,
1467 EWORD. If a delimiter character was specified (via -t), then that
1468 `beginning' is the first character following the delimiting TAB.
1469 Otherwise, leave PTR pointing at the first `blank' character after
1470 the preceding field. */
1471 if (tab
!= TAB_DEFAULT
)
1472 while (ptr
< lim
&& eword
--)
1474 while (ptr
< lim
&& *ptr
!= tab
)
1476 if (ptr
< lim
&& (eword
| echar
))
1480 while (ptr
< lim
&& eword
--)
1482 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1484 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1488 #ifdef POSIX_UNSPECIFIED
1489 /* The following block of code makes GNU sort incompatible with
1490 standard Unix sort, so it's ifdef'd out for now.
1491 The POSIX spec isn't clear on how to interpret this.
1492 FIXME: request clarification.
1494 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1495 Date: Thu, 30 May 96 12:20:41 -0400
1496 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1498 [...]I believe I've found another bug in `sort'.
1503 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1506 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1510 Unix sort produced the answer I expected: sort on the single character
1511 in column 7. GNU sort produced different results, because it disagrees
1512 on the interpretation of the key-end spec "M.N". Unix sort reads this
1513 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1514 "skip M-1 fields, then either N-1 characters or the rest of the current
1515 field, whichever comes first". This extra clause applies only to
1516 key-ends, not key-starts.
1519 /* Make LIM point to the end of (one byte past) the current field. */
1520 if (tab
!= TAB_DEFAULT
)
1523 newlim
= memchr (ptr
, tab
, lim
- ptr
);
1531 while (newlim
< lim
&& blanks
[to_uchar (*newlim
)])
1533 while (newlim
< lim
&& !blanks
[to_uchar (*newlim
)])
1539 if (echar
!= 0) /* We need to skip over a portion of the end field. */
1541 /* If we're ignoring leading blanks when computing the End
1542 of the field, skip past them here. */
1543 if (key
->skipeblanks
)
1544 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1547 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1548 ptr
= MIN (lim
, ptr
+ echar
);
1554 /* Fill BUF reading from FP, moving buf->left bytes from the end
1555 of buf->buf to the beginning first. If EOF is reached and the
1556 file wasn't terminated by a newline, supply one. Set up BUF's line
1557 table too. FILE is the name of the file corresponding to FP.
1558 Return true if some input was read. */
1561 fillbuf (struct buffer
*buf
, FILE *fp
, char const *file
)
1563 struct keyfield
const *key
= keylist
;
1565 size_t line_bytes
= buf
->line_bytes
;
1566 size_t mergesize
= merge_buffer_size
- MIN_MERGE_BUFFER_SIZE
;
1571 if (buf
->used
!= buf
->left
)
1573 memmove (buf
->buf
, buf
->buf
+ buf
->used
- buf
->left
, buf
->left
);
1574 buf
->used
= buf
->left
;
1580 char *ptr
= buf
->buf
+ buf
->used
;
1581 struct line
*linelim
= buffer_linelim (buf
);
1582 struct line
*line
= linelim
- buf
->nlines
;
1583 size_t avail
= (char *) linelim
- buf
->nlines
* line_bytes
- ptr
;
1584 char *line_start
= buf
->nlines
? line
->text
+ line
->length
: buf
->buf
;
1586 while (line_bytes
+ 1 < avail
)
1588 /* Read as many bytes as possible, but do not read so many
1589 bytes that there might not be enough room for the
1590 corresponding line array. The worst case is when the
1591 rest of the input file consists entirely of newlines,
1592 except that the last byte is not a newline. */
1593 size_t readsize
= (avail
- 1) / (line_bytes
+ 1);
1594 size_t bytes_read
= fread (ptr
, 1, readsize
, fp
);
1595 char *ptrlim
= ptr
+ bytes_read
;
1597 avail
-= bytes_read
;
1599 if (bytes_read
!= readsize
)
1602 die (_("read failed"), file
);
1606 if (buf
->buf
== ptrlim
)
1608 if (ptrlim
[-1] != eol
)
1613 /* Find and record each line in the just-read input. */
1614 while ((p
= memchr (ptr
, eol
, ptrlim
- ptr
)))
1618 line
->text
= line_start
;
1619 line
->length
= ptr
- line_start
;
1620 mergesize
= MAX (mergesize
, line
->length
);
1621 avail
-= line_bytes
;
1625 /* Precompute the position of the first key for
1627 line
->keylim
= (key
->eword
== SIZE_MAX
1629 : limfield (line
, key
));
1631 if (key
->sword
!= SIZE_MAX
)
1632 line
->keybeg
= begfield (line
, key
);
1635 if (key
->skipsblanks
)
1636 while (blanks
[to_uchar (*line_start
)])
1638 line
->keybeg
= line_start
;
1650 buf
->used
= ptr
- buf
->buf
;
1651 buf
->nlines
= buffer_linelim (buf
) - line
;
1652 if (buf
->nlines
!= 0)
1654 buf
->left
= ptr
- line_start
;
1655 merge_buffer_size
= mergesize
+ MIN_MERGE_BUFFER_SIZE
;
1660 /* The current input line is too long to fit in the buffer.
1661 Double the buffer size and try again, keeping it properly
1663 size_t line_alloc
= buf
->alloc
/ sizeof (struct line
);
1664 buf
->buf
= x2nrealloc (buf
->buf
, &line_alloc
, sizeof (struct line
));
1665 buf
->alloc
= line_alloc
* sizeof (struct line
);
1670 /* Compare strings A and B as numbers without explicitly converting them to
1671 machine numbers. Comparatively slow for short strings, but asymptotically
1675 numcompare (const char *a
, const char *b
)
1677 while (blanks
[to_uchar (*a
)])
1679 while (blanks
[to_uchar (*b
)])
1682 return strnumcmp (a
, b
, decimal_point
, thousands_sep
);
1685 /* Exit with an error if a mixture of SI and IEC units detected. */
1688 check_mixed_SI_IEC (char prefix
, struct keyfield
*key
)
1690 int si_present
= prefix
== 'i';
1691 if (key
->si_present
!= -1 && si_present
!= key
->si_present
)
1692 error (SORT_FAILURE
, 0, _("both SI and IEC prefixes present on units"));
1693 key
->si_present
= si_present
;
1696 /* Return an integer which represents the order of magnitude of
1697 the unit following the number. NUMBER can contain thousands separators
1698 or a decimal point, but not have preceeding blanks.
1699 Negative numbers return a negative unit order. */
1702 find_unit_order (const char *number
, struct keyfield
*key
)
1704 static const char orders
[UCHAR_LIM
] = {
1705 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1709 const unsigned char *p
= number
;
1719 /* Scan to end of number.
1720 Decimals or separators not followed by digits stop the scan.
1721 Numbers ending in decimals or separators are thus considered
1722 to be lacking in units.
1723 FIXME: add support for multibyte thousands_sep and decimal_point. */
1725 while (ISDIGIT (*p
))
1729 if (*p
== decimal_point
&& ISDIGIT (*(p
+ 1)))
1731 else if (*p
== thousands_sep
&& ISDIGIT (*(p
+ 1)))
1735 int order
= orders
[*p
];
1737 /* For valid units check for MiB vs MB etc. */
1739 check_mixed_SI_IEC (*(p
+ 1), key
);
1741 return sign
* order
;
1744 /* Compare numbers ending in units with SI xor IEC prefixes
1745 <none/unknown> < K/k < M < G < T < P < E < Z < Y
1746 Assume that numbers are properly abbreviated.
1747 i.e. input will never have both 6000K and 5M. */
1750 human_numcompare (const char *a
, const char *b
, struct keyfield
*key
)
1752 while (blanks
[to_uchar (*a
)])
1754 while (blanks
[to_uchar (*b
)])
1757 int order_a
= find_unit_order (a
, key
);
1758 int order_b
= find_unit_order (b
, key
);
1760 return (order_a
> order_b
? 1
1761 : order_a
< order_b
? -1
1762 : strnumcmp (a
, b
, decimal_point
, thousands_sep
));
1766 general_numcompare (const char *sa
, const char *sb
)
1768 /* FIXME: add option to warn about failed conversions. */
1769 /* FIXME: maybe add option to try expensive FP conversion
1770 only if A and B can't be compared more cheaply/accurately. */
1774 double a
= strtod (sa
, &ea
);
1775 double b
= strtod (sb
, &eb
);
1777 /* Put conversion errors at the start of the collating sequence. */
1779 return sb
== eb
? 0 : -1;
1783 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1784 conversion errors but before numbers; sort them by internal
1785 bit-pattern, for lack of a more portable alternative. */
1791 : memcmp ((char *) &a
, (char *) &b
, sizeof a
));
1794 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1795 Return 0 if the name in S is not recognized. */
1798 getmonth (char const *month
, size_t len
)
1801 size_t hi
= MONTHS_PER_YEAR
;
1802 char const *monthlim
= month
+ len
;
1806 if (month
== monthlim
)
1808 if (!blanks
[to_uchar (*month
)])
1815 size_t ix
= (lo
+ hi
) / 2;
1816 char const *m
= month
;
1817 char const *n
= monthtab
[ix
].name
;
1822 return monthtab
[ix
].val
;
1823 if (m
== monthlim
|| fold_toupper
[to_uchar (*m
)] < to_uchar (*n
))
1828 else if (fold_toupper
[to_uchar (*m
)] > to_uchar (*n
))
1840 /* A source of random data. */
1841 static struct randread_source
*randread_source
;
1843 /* Return the Ith randomly-generated state. The caller must invoke
1844 random_state (H) for all H less than I before invoking random_state
1847 static struct md5_ctx
1848 random_state (size_t i
)
1850 /* An array of states resulting from the random data, and counts of
1851 its used and allocated members. */
1852 static struct md5_ctx
*state
;
1854 static size_t allocated
;
1856 struct md5_ctx
*s
= &state
[i
];
1860 unsigned char buf
[MD5_DIGEST_SIZE
];
1866 state
= X2NREALLOC (state
, &allocated
);
1870 randread (randread_source
, buf
, sizeof buf
);
1872 md5_process_bytes (buf
, sizeof buf
, s
);
1878 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1879 with length LENGTHB. Return negative if less, zero if equal,
1880 positive if greater. */
1883 cmp_hashes (char const *texta
, size_t lena
,
1884 char const *textb
, size_t lenb
)
1886 /* Try random hashes until a pair of hashes disagree. But if the
1887 first pair of random hashes agree, check whether the keys are
1888 identical and if so report no difference. */
1893 uint32_t dig
[2][MD5_DIGEST_SIZE
/ sizeof (uint32_t)];
1894 struct md5_ctx s
[2];
1895 s
[0] = s
[1] = random_state (i
);
1896 md5_process_bytes (texta
, lena
, &s
[0]); md5_finish_ctx (&s
[0], dig
[0]);
1897 md5_process_bytes (textb
, lenb
, &s
[1]); md5_finish_ctx (&s
[1], dig
[1]);
1898 diff
= memcmp (dig
[0], dig
[1], sizeof dig
[0]);
1901 if (i
== 0 && lena
== lenb
&& memcmp (texta
, textb
, lena
) == 0)
1908 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1909 using one or more random hash functions. */
1912 compare_random (char *restrict texta
, size_t lena
,
1913 char *restrict textb
, size_t lenb
)
1917 if (! hard_LC_COLLATE
)
1918 diff
= cmp_hashes (texta
, lena
, textb
, lenb
);
1921 /* Transform the text into the basis of comparison, so that byte
1922 strings that would otherwise considered to be equal are
1923 considered equal here even if their bytes differ. */
1926 char stackbuf
[4000];
1927 size_t tlena
= xmemxfrm (stackbuf
, sizeof stackbuf
, texta
, lena
);
1928 bool a_fits
= tlena
<= sizeof stackbuf
;
1929 size_t tlenb
= xmemxfrm ((a_fits
? stackbuf
+ tlena
: NULL
),
1930 (a_fits
? sizeof stackbuf
- tlena
: 0),
1933 if (a_fits
&& tlena
+ tlenb
<= sizeof stackbuf
)
1937 /* Adding 1 to the buffer size lets xmemxfrm run a bit
1938 faster by avoiding the need for an extra buffer copy. */
1939 buf
= xmalloc (tlena
+ tlenb
+ 1);
1940 xmemxfrm (buf
, tlena
+ 1, texta
, lena
);
1941 xmemxfrm (buf
+ tlena
, tlenb
+ 1, textb
, lenb
);
1944 diff
= cmp_hashes (buf
, tlena
, buf
+ tlena
, tlenb
);
1946 if (buf
!= stackbuf
)
1953 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1954 using filevercmp. See lib/filevercmp.h for function description. */
1957 compare_version (char *restrict texta
, size_t lena
,
1958 char *restrict textb
, size_t lenb
)
1962 /* It is necessary to save the character after the end of the field.
1963 "filevercmp" works with NUL terminated strings. Our blocks of
1964 text are not necessarily terminated with a NUL byte. */
1965 char sv_a
= texta
[lena
];
1966 char sv_b
= textb
[lenb
];
1971 diff
= filevercmp (texta
, textb
);
1979 /* Compare two lines A and B trying every key in sequence until there
1980 are no more keys or a difference is found. */
1983 keycompare (const struct line
*a
, const struct line
*b
)
1985 struct keyfield
*key
= keylist
;
1987 /* For the first iteration only, the key positions have been
1988 precomputed for us. */
1989 char *texta
= a
->keybeg
;
1990 char *textb
= b
->keybeg
;
1991 char *lima
= a
->keylim
;
1992 char *limb
= b
->keylim
;
1998 char const *translate
= key
->translate
;
1999 bool const *ignore
= key
->ignore
;
2001 /* Treat field ends before field starts as empty fields. */
2002 lima
= MAX (texta
, lima
);
2003 limb
= MAX (textb
, limb
);
2005 /* Find the lengths. */
2006 size_t lena
= lima
- texta
;
2007 size_t lenb
= limb
- textb
;
2009 /* Actually compare the fields. */
2012 diff
= compare_random (texta
, lena
, textb
, lenb
);
2013 else if (key
->numeric
| key
->general_numeric
| key
->human_numeric
)
2015 char savea
= *lima
, saveb
= *limb
;
2017 *lima
= *limb
= '\0';
2018 diff
= (key
->numeric
? numcompare (texta
, textb
)
2019 : key
->general_numeric
? general_numcompare (texta
, textb
)
2020 : human_numcompare (texta
, textb
, key
));
2021 *lima
= savea
, *limb
= saveb
;
2023 else if (key
->version
)
2024 diff
= compare_version (texta
, lena
, textb
, lenb
);
2025 else if (key
->month
)
2026 diff
= getmonth (texta
, lena
) - getmonth (textb
, lenb
);
2027 /* Sorting like this may become slow, so in a simple locale the user
2028 can select a faster sort that is similar to ascii sort. */
2029 else if (hard_LC_COLLATE
)
2031 if (ignore
|| translate
)
2034 size_t size
= lena
+ 1 + lenb
+ 1;
2035 char *copy_a
= (size
<= sizeof buf
? buf
: xmalloc (size
));
2036 char *copy_b
= copy_a
+ lena
+ 1;
2037 size_t new_len_a
, new_len_b
, i
;
2039 /* Ignore and/or translate chars before comparing. */
2040 for (new_len_a
= new_len_b
= i
= 0; i
< MAX (lena
, lenb
); i
++)
2044 copy_a
[new_len_a
] = (translate
2045 ? translate
[to_uchar (texta
[i
])]
2047 if (!ignore
|| !ignore
[to_uchar (texta
[i
])])
2052 copy_b
[new_len_b
] = (translate
2053 ? translate
[to_uchar (textb
[i
])]
2055 if (!ignore
|| !ignore
[to_uchar (textb
[i
])])
2060 diff
= xmemcoll (copy_a
, new_len_a
, copy_b
, new_len_b
);
2062 if (sizeof buf
< size
)
2066 diff
= - NONZERO (lenb
);
2070 diff
= xmemcoll (texta
, lena
, textb
, lenb
);
2074 #define CMP_WITH_IGNORE(A, B) \
2079 while (texta < lima && ignore[to_uchar (*texta)]) \
2081 while (textb < limb && ignore[to_uchar (*textb)]) \
2083 if (! (texta < lima && textb < limb)) \
2085 diff = to_uchar (A) - to_uchar (B); \
2092 diff = (texta < lima) - (textb < limb); \
2097 CMP_WITH_IGNORE (translate
[to_uchar (*texta
)],
2098 translate
[to_uchar (*textb
)]);
2100 CMP_WITH_IGNORE (*texta
, *textb
);
2103 diff
= - NONZERO (lenb
);
2110 while (texta
< lima
&& textb
< limb
)
2112 diff
= (to_uchar (translate
[to_uchar (*texta
++)])
2113 - to_uchar (translate
[to_uchar (*textb
++)]));
2120 diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
2124 diff
= lena
< lenb
? -1 : lena
!= lenb
;
2134 /* Find the beginning and limit of the next field. */
2135 if (key
->eword
!= SIZE_MAX
)
2136 lima
= limfield (a
, key
), limb
= limfield (b
, key
);
2138 lima
= a
->text
+ a
->length
- 1, limb
= b
->text
+ b
->length
- 1;
2140 if (key
->sword
!= SIZE_MAX
)
2141 texta
= begfield (a
, key
), textb
= begfield (b
, key
);
2144 texta
= a
->text
, textb
= b
->text
;
2145 if (key
->skipsblanks
)
2147 while (texta
< lima
&& blanks
[to_uchar (*texta
)])
2149 while (textb
< limb
&& blanks
[to_uchar (*textb
)])
2160 return key
->reverse
? -diff
: diff
;
2163 /* Compare two lines A and B, returning negative, zero, or positive
2164 depending on whether A compares less than, equal to, or greater than B. */
2167 compare (const struct line
*a
, const struct line
*b
)
2172 /* First try to compare on the specified keys (if any).
2173 The only two cases with no key at all are unadorned sort,
2174 and unadorned sort -r. */
2177 diff
= keycompare (a
, b
);
2178 if (diff
| unique
| stable
)
2182 /* If the keys all compare equal (or no keys were specified)
2183 fall through to the default comparison. */
2184 alen
= a
->length
- 1, blen
= b
->length
- 1;
2187 diff
= - NONZERO (blen
);
2190 else if (hard_LC_COLLATE
)
2191 diff
= xmemcoll (a
->text
, alen
, b
->text
, blen
);
2192 else if (! (diff
= memcmp (a
->text
, b
->text
, MIN (alen
, blen
))))
2193 diff
= alen
< blen
? -1 : alen
!= blen
;
2195 return reverse
? -diff
: diff
;
2198 /* Check that the lines read from FILE_NAME come in order. Return
2199 true if they are in order. If CHECKONLY == 'c', also print a
2200 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2201 they are not in order. */
2204 check (char const *file_name
, char checkonly
)
2206 FILE *fp
= xfopen (file_name
, "r");
2207 struct buffer buf
; /* Input buffer. */
2208 struct line temp
; /* Copy of previous line. */
2210 uintmax_t line_number
= 0;
2211 struct keyfield
const *key
= keylist
;
2212 bool nonunique
= ! unique
;
2213 bool ordered
= true;
2215 initbuf (&buf
, sizeof (struct line
),
2216 MAX (merge_buffer_size
, sort_size
));
2219 while (fillbuf (&buf
, fp
, file_name
))
2221 struct line
const *line
= buffer_linelim (&buf
);
2222 struct line
const *linebase
= line
- buf
.nlines
;
2224 /* Make sure the line saved from the old buffer contents is
2225 less than or equal to the first line of the new buffer. */
2226 if (alloc
&& nonunique
<= compare (&temp
, line
- 1))
2230 if (checkonly
== 'c')
2232 struct line
const *disorder_line
= line
- 1;
2233 uintmax_t disorder_line_number
=
2234 buffer_linelim (&buf
) - disorder_line
+ line_number
;
2235 char hr_buf
[INT_BUFSIZE_BOUND (uintmax_t)];
2236 fprintf (stderr
, _("%s: %s:%s: disorder: "),
2237 program_name
, file_name
,
2238 umaxtostr (disorder_line_number
, hr_buf
));
2239 write_bytes (disorder_line
->text
, disorder_line
->length
,
2240 stderr
, _("standard error"));
2248 /* Compare each line in the buffer with its successor. */
2249 while (linebase
< --line
)
2250 if (nonunique
<= compare (line
, line
- 1))
2251 goto found_disorder
;
2253 line_number
+= buf
.nlines
;
2255 /* Save the last line of the buffer. */
2256 if (alloc
< line
->length
)
2263 alloc
= line
->length
;
2267 while (alloc
< line
->length
);
2269 temp
.text
= xrealloc (temp
.text
, alloc
);
2271 memcpy (temp
.text
, line
->text
, line
->length
);
2272 temp
.length
= line
->length
;
2275 temp
.keybeg
= temp
.text
+ (line
->keybeg
- line
->text
);
2276 temp
.keylim
= temp
.text
+ (line
->keylim
- line
->text
);
2280 xfclose (fp
, file_name
);
2286 /* Open FILES (there are NFILES of them) and store the resulting array
2287 of stream pointers into (*PFPS). Allocate the array. Return the
2288 number of successfully opened files, setting errno if this value is
2289 less than NFILES. */
2292 open_input_files (struct sortfile
*files
, size_t nfiles
, FILE ***pfps
)
2294 FILE **fps
= *pfps
= xnmalloc (nfiles
, sizeof *fps
);
2297 /* Open as many input files as we can. */
2298 for (i
= 0; i
< nfiles
; i
++)
2300 fps
[i
] = (files
[i
].pid
2301 ? open_temp (files
[i
].name
, files
[i
].pid
)
2302 : stream_open (files
[i
].name
, "r"));
2310 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2311 files (all of which are at the start of the FILES array), and
2312 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2313 FPS is the vector of open stream corresponding to the files.
2314 Close input and output streams before returning.
2315 OUTPUT_FILE gives the name of the output file. If it is NULL,
2316 the output file is standard output. */
2319 mergefps (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
2320 FILE *ofp
, char const *output_file
, FILE **fps
)
2322 struct buffer
*buffer
= xnmalloc (nfiles
, sizeof *buffer
);
2323 /* Input buffers for each file. */
2324 struct line saved
; /* Saved line storage for unique check. */
2325 struct line
const *savedline
= NULL
;
2326 /* &saved if there is a saved line. */
2327 size_t savealloc
= 0; /* Size allocated for the saved line. */
2328 struct line
const **cur
= xnmalloc (nfiles
, sizeof *cur
);
2329 /* Current line in each line table. */
2330 struct line
const **base
= xnmalloc (nfiles
, sizeof *base
);
2331 /* Base of each line table. */
2332 size_t *ord
= xnmalloc (nfiles
, sizeof *ord
);
2333 /* Table representing a permutation of fps,
2334 such that cur[ord[0]] is the smallest line
2335 and will be next output. */
2339 struct keyfield
const *key
= keylist
;
2342 /* Read initial lines from each input file. */
2343 for (i
= 0; i
< nfiles
; )
2345 initbuf (&buffer
[i
], sizeof (struct line
),
2346 MAX (merge_buffer_size
, sort_size
/ nfiles
));
2347 if (fillbuf (&buffer
[i
], fps
[i
], files
[i
].name
))
2349 struct line
const *linelim
= buffer_linelim (&buffer
[i
]);
2350 cur
[i
] = linelim
- 1;
2351 base
[i
] = linelim
- buffer
[i
].nlines
;
2356 /* fps[i] is empty; eliminate it from future consideration. */
2357 xfclose (fps
[i
], files
[i
].name
);
2361 zaptemp (files
[i
].name
);
2363 free (buffer
[i
].buf
);
2365 for (j
= i
; j
< nfiles
; ++j
)
2367 files
[j
] = files
[j
+ 1];
2368 fps
[j
] = fps
[j
+ 1];
2373 /* Set up the ord table according to comparisons among input lines.
2374 Since this only reorders two items if one is strictly greater than
2375 the other, it is stable. */
2376 for (i
= 0; i
< nfiles
; ++i
)
2378 for (i
= 1; i
< nfiles
; ++i
)
2379 if (0 < compare (cur
[ord
[i
- 1]], cur
[ord
[i
]]))
2380 t
= ord
[i
- 1], ord
[i
- 1] = ord
[i
], ord
[i
] = t
, i
= 0;
2382 /* Repeatedly output the smallest line until no input remains. */
2385 struct line
const *smallest
= cur
[ord
[0]];
2387 /* If uniquified output is turned on, output only the first of
2388 an identical series of lines. */
2391 if (savedline
&& compare (savedline
, smallest
))
2394 write_bytes (saved
.text
, saved
.length
, ofp
, output_file
);
2399 if (savealloc
< smallest
->length
)
2404 savealloc
= smallest
->length
;
2407 while ((savealloc
*= 2) < smallest
->length
);
2409 saved
.text
= xrealloc (saved
.text
, savealloc
);
2411 saved
.length
= smallest
->length
;
2412 memcpy (saved
.text
, smallest
->text
, saved
.length
);
2416 saved
.text
+ (smallest
->keybeg
- smallest
->text
);
2418 saved
.text
+ (smallest
->keylim
- smallest
->text
);
2423 write_bytes (smallest
->text
, smallest
->length
, ofp
, output_file
);
2425 /* Check if we need to read more lines into core. */
2426 if (base
[ord
[0]] < smallest
)
2427 cur
[ord
[0]] = smallest
- 1;
2430 if (fillbuf (&buffer
[ord
[0]], fps
[ord
[0]], files
[ord
[0]].name
))
2432 struct line
const *linelim
= buffer_linelim (&buffer
[ord
[0]]);
2433 cur
[ord
[0]] = linelim
- 1;
2434 base
[ord
[0]] = linelim
- buffer
[ord
[0]].nlines
;
2438 /* We reached EOF on fps[ord[0]]. */
2439 for (i
= 1; i
< nfiles
; ++i
)
2440 if (ord
[i
] > ord
[0])
2443 xfclose (fps
[ord
[0]], files
[ord
[0]].name
);
2444 if (ord
[0] < ntemps
)
2447 zaptemp (files
[ord
[0]].name
);
2449 free (buffer
[ord
[0]].buf
);
2450 for (i
= ord
[0]; i
< nfiles
; ++i
)
2452 fps
[i
] = fps
[i
+ 1];
2453 files
[i
] = files
[i
+ 1];
2454 buffer
[i
] = buffer
[i
+ 1];
2455 cur
[i
] = cur
[i
+ 1];
2456 base
[i
] = base
[i
+ 1];
2458 for (i
= 0; i
< nfiles
; ++i
)
2459 ord
[i
] = ord
[i
+ 1];
2464 /* The new line just read in may be larger than other lines
2465 already in main memory; push it back in the queue until we
2466 encounter a line larger than it. Optimize for the common
2467 case where the new line is smallest. */
2472 size_t ord0
= ord
[0];
2473 size_t count_of_smaller_lines
;
2477 int cmp
= compare (cur
[ord0
], cur
[ord
[probe
]]);
2478 if (cmp
< 0 || (cmp
== 0 && ord0
< ord
[probe
]))
2482 probe
= (lo
+ hi
) / 2;
2485 count_of_smaller_lines
= lo
- 1;
2486 for (j
= 0; j
< count_of_smaller_lines
; j
++)
2487 ord
[j
] = ord
[j
+ 1];
2488 ord
[count_of_smaller_lines
] = ord0
;
2491 /* Free up some resources every once in a while. */
2492 if (MAX_PROCS_BEFORE_REAP
< nprocs
)
2496 if (unique
&& savedline
)
2498 write_bytes (saved
.text
, saved
.length
, ofp
, output_file
);
2502 xfclose (ofp
, output_file
);
2510 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2511 files (all of which are at the start of the FILES array), and
2512 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2513 Close input and output files before returning.
2514 OUTPUT_FILE gives the name of the output file.
2516 Return the number of files successfully merged. This number can be
2517 less than NFILES if we ran low on file descriptors, but in this
2518 case it is never less than 2. */
2521 mergefiles (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
2522 FILE *ofp
, char const *output_file
)
2525 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
2526 if (nopened
< nfiles
&& nopened
< 2)
2527 die (_("open failed"), files
[nopened
].name
);
2528 mergefps (files
, ntemps
, nopened
, ofp
, output_file
, fps
);
2532 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2533 and HI (with NHI members). T, LO, and HI point just past their
2534 respective arrays, and the arrays are in reverse order. NLO and
2535 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
2538 mergelines (struct line
*t
,
2539 struct line
const *lo
, size_t nlo
,
2540 struct line
const *hi
, size_t nhi
)
2543 if (compare (lo
- 1, hi
- 1) <= 0)
2548 /* HI - NHI equalled T - (NLO + NHI) when this function
2549 began. Therefore HI must equal T now, and there is no
2550 need to copy from HI to T. */
2568 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2569 NLINES must be at least 2.
2570 The input and output arrays are in reverse order, and LINES and
2571 TEMP point just past the end of their respective arrays.
2573 Use a recursive divide-and-conquer algorithm, in the style
2574 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
2575 the optimization suggested by exercise 5.2.4-10; this requires room
2576 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
2577 writes that this memory optimization was originally published by
2578 D. A. Bell, Comp J. 1 (1958), 75. */
2581 sortlines (struct line
*lines
, size_t nlines
, struct line
*temp
)
2585 if (0 < compare (&lines
[-1], &lines
[-2]))
2587 struct line tmp
= lines
[-1];
2588 lines
[-1] = lines
[-2];
2594 size_t nlo
= nlines
/ 2;
2595 size_t nhi
= nlines
- nlo
;
2596 struct line
*lo
= lines
;
2597 struct line
*hi
= lines
- nlo
;
2598 struct line
*sorted_lo
= temp
;
2600 sortlines (hi
, nhi
, temp
);
2602 sortlines_temp (lo
, nlo
, sorted_lo
);
2604 sorted_lo
[-1] = lo
[-1];
2606 mergelines (lines
, sorted_lo
, nlo
, hi
, nhi
);
2610 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2611 rather than sorting in place. */
2614 sortlines_temp (struct line
*lines
, size_t nlines
, struct line
*temp
)
2618 /* Declare `swap' as int, not bool, to work around a bug
2619 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2620 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
2621 int swap
= (0 < compare (&lines
[-1], &lines
[-2]));
2622 temp
[-1] = lines
[-1 - swap
];
2623 temp
[-2] = lines
[-2 + swap
];
2627 size_t nlo
= nlines
/ 2;
2628 size_t nhi
= nlines
- nlo
;
2629 struct line
*lo
= lines
;
2630 struct line
*hi
= lines
- nlo
;
2631 struct line
*sorted_hi
= temp
- nlo
;
2633 sortlines_temp (hi
, nhi
, sorted_hi
);
2635 sortlines (lo
, nlo
, temp
);
2637 mergelines (temp
, lo
, nlo
, sorted_hi
, nhi
);
2641 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2642 the same as OUTFILE. If found, merge the found instances (and perhaps
2643 some other files) into a temporary file so that it can in turn be
2644 merged into OUTFILE without destroying OUTFILE before it is completely
2645 read. Return the new value of NFILES, which differs from the old if
2646 some merging occurred.
2648 This test ensures that an otherwise-erroneous use like
2649 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2650 It's not clear that POSIX requires this nicety.
2651 Detect common error cases, but don't try to catch obscure cases like
2652 "cat ... FILE ... | sort -m -o FILE"
2653 where traditional "sort" doesn't copy the input and where
2654 people should know that they're getting into trouble anyway.
2655 Catching these obscure cases would slow down performance in
2659 avoid_trashing_input (struct sortfile
*files
, size_t ntemps
,
2660 size_t nfiles
, char const *outfile
)
2663 bool got_outstat
= false;
2664 struct stat outstat
;
2666 for (i
= ntemps
; i
< nfiles
; i
++)
2668 bool is_stdin
= STREQ (files
[i
].name
, "-");
2672 if (outfile
&& STREQ (outfile
, files
[i
].name
) && !is_stdin
)
2679 ? stat (outfile
, &outstat
)
2680 : fstat (STDOUT_FILENO
, &outstat
))
2687 ? fstat (STDIN_FILENO
, &instat
)
2688 : stat (files
[i
].name
, &instat
))
2690 && SAME_INODE (instat
, outstat
));
2697 char *temp
= create_temp (&tftp
, &pid
);
2698 size_t num_merged
= 0;
2701 num_merged
+= mergefiles (&files
[i
], 0, nfiles
- i
, tftp
, temp
);
2702 files
[i
].name
= temp
;
2705 if (i
+ num_merged
< nfiles
)
2706 memmove(&files
[i
+ 1], &files
[i
+ num_merged
],
2707 num_merged
* sizeof *files
);
2709 nfiles
-= num_merged
- 1;;
2719 /* Merge the input FILES. NTEMPS is the number of files at the
2720 start of FILES that are temporary; it is zero at the top level.
2721 NFILES is the total number of files. Put the output in
2722 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
2725 merge (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
2726 char const *output_file
)
2728 while (nmerge
< nfiles
)
2730 /* Number of input files processed so far. */
2733 /* Number of output files generated so far. */
2736 /* nfiles % NMERGE; this counts input files that are left over
2737 after all full-sized merges have been done. */
2740 /* Number of easily-available slots at the next loop iteration. */
2743 /* Do as many NMERGE-size merges as possible. In the case that
2744 nmerge is bogus, increment by the maximum number of file
2745 descriptors allowed. */
2746 for (out
= in
= 0; nmerge
<= nfiles
- in
; out
++)
2750 char *temp
= create_temp (&tfp
, &pid
);
2751 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nmerge
),
2753 ntemps
-= MIN (ntemps
, num_merged
);
2754 files
[out
].name
= temp
;
2755 files
[out
].pid
= pid
;
2759 remainder
= nfiles
- in
;
2760 cheap_slots
= nmerge
- out
% nmerge
;
2762 if (cheap_slots
< remainder
)
2764 /* So many files remain that they can't all be put into the last
2765 NMERGE-sized output window. Do one more merge. Merge as few
2766 files as possible, to avoid needless I/O. */
2767 size_t nshortmerge
= remainder
- cheap_slots
+ 1;
2770 char *temp
= create_temp (&tfp
, &pid
);
2771 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nshortmerge
),
2772 nshortmerge
, tfp
, temp
);
2773 ntemps
-= MIN (ntemps
, num_merged
);
2774 files
[out
].name
= temp
;
2775 files
[out
++].pid
= pid
;
2779 /* Put the remaining input files into the last NMERGE-sized output
2780 window, so they will be merged in the next pass. */
2781 memmove(&files
[out
], &files
[in
], (nfiles
- in
) * sizeof *files
);
2786 nfiles
= avoid_trashing_input (files
, ntemps
, nfiles
, output_file
);
2788 /* We aren't guaranteed that this final mergefiles will work, therefore we
2789 try to merge into the output, and then merge as much as we can into a
2790 temp file if we can't. Repeat. */
2794 /* Merge directly into the output file if possible. */
2796 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
2798 if (nopened
== nfiles
)
2800 FILE *ofp
= stream_open (output_file
, "w");
2803 mergefps (files
, ntemps
, nfiles
, ofp
, output_file
, fps
);
2806 if (errno
!= EMFILE
|| nopened
<= 2)
2807 die (_("open failed"), output_file
);
2809 else if (nopened
<= 2)
2810 die (_("open failed"), files
[nopened
].name
);
2812 /* We ran out of file descriptors. Close one of the input
2813 files, to gain a file descriptor. Then create a temporary
2814 file with our spare file descriptor. Retry if that failed
2815 (e.g., some other process could open a file between the time
2816 we closed and tried to create). */
2823 xfclose (fps
[nopened
], files
[nopened
].name
);
2824 temp
= maybe_create_temp (&tfp
, &pid
, ! (nopened
<= 2));
2828 /* Merge into the newly allocated temporary. */
2829 mergefps (&files
[0], MIN (ntemps
, nopened
), nopened
, tfp
, temp
, fps
);
2830 ntemps
-= MIN (ntemps
, nopened
);
2831 files
[0].name
= temp
;
2834 memmove (&files
[1], &files
[nopened
], (nfiles
- nopened
) * sizeof *files
);
2836 nfiles
-= nopened
- 1;
2840 /* Sort NFILES FILES onto OUTPUT_FILE. */
2843 sort (char * const *files
, size_t nfiles
, char const *output_file
)
2847 bool output_file_created
= false;
2853 char const *temp_output
;
2854 char const *file
= *files
;
2855 FILE *fp
= xfopen (file
, "r");
2857 size_t bytes_per_line
= (2 * sizeof (struct line
)
2858 - sizeof (struct line
) / 2);
2861 initbuf (&buf
, bytes_per_line
,
2862 sort_buffer_size (&fp
, 1, files
, nfiles
, bytes_per_line
));
2867 while (fillbuf (&buf
, fp
, file
))
2870 struct line
*linebase
;
2872 if (buf
.eof
&& nfiles
2873 && (bytes_per_line
+ 1
2874 < (buf
.alloc
- buf
.used
- bytes_per_line
* buf
.nlines
)))
2876 /* End of file, but there is more input and buffer room.
2877 Concatenate the next input file; this is faster in
2879 buf
.left
= buf
.used
;
2883 line
= buffer_linelim (&buf
);
2884 linebase
= line
- buf
.nlines
;
2886 sortlines (line
, buf
.nlines
, linebase
);
2887 if (buf
.eof
&& !nfiles
&& !ntemps
&& !buf
.left
)
2890 tfp
= xfopen (output_file
, "w");
2891 temp_output
= output_file
;
2892 output_file_created
= true;
2897 temp_output
= create_temp (&tfp
, NULL
);
2903 write_bytes (line
->text
, line
->length
, tfp
, temp_output
);
2905 while (linebase
< line
&& compare (line
, line
- 1) == 0)
2908 while (linebase
< line
);
2910 xfclose (tfp
, temp_output
);
2912 /* Free up some resources every once in a while. */
2913 if (MAX_PROCS_BEFORE_REAP
< nprocs
)
2916 if (output_file_created
)
2925 if (! output_file_created
)
2928 struct tempnode
*node
= temphead
;
2929 struct sortfile
*tempfiles
= xnmalloc (ntemps
, sizeof *tempfiles
);
2930 for (i
= 0; node
; i
++)
2932 tempfiles
[i
].name
= node
->name
;
2933 tempfiles
[i
].pid
= node
->pid
;
2936 merge (tempfiles
, ntemps
, ntemps
, output_file
);
2941 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
2944 insertkey (struct keyfield
*key_arg
)
2946 struct keyfield
**p
;
2947 struct keyfield
*key
= xmemdup (key_arg
, sizeof *key
);
2949 for (p
= &keylist
; *p
; p
= &(*p
)->next
)
2955 /* Report a bad field specification SPEC, with extra info MSGID. */
2957 static void badfieldspec (char const *, char const *)
2960 badfieldspec (char const *spec
, char const *msgid
)
2962 error (SORT_FAILURE
, 0, _("%s: invalid field specification %s"),
2963 _(msgid
), quote (spec
));
2967 /* Report incompatible options. */
2969 static void incompatible_options (char const *) ATTRIBUTE_NORETURN
;
2971 incompatible_options (char const *opts
)
2973 error (SORT_FAILURE
, 0, _("options `-%s' are incompatible"), opts
);
2977 /* Check compatibility of ordering options. */
2980 check_ordering_compatibility (void)
2982 struct keyfield
const *key
;
2984 for (key
= keylist
; key
; key
= key
->next
)
2985 if ((1 < (key
->random
+ key
->numeric
+ key
->general_numeric
+ key
->month
2986 + key
->version
+ !!key
->ignore
+ key
->human_numeric
))
2987 || (key
->random
&& key
->translate
))
2989 /* The following is too big, but guaranteed to be "big enough". */
2990 char opts
[sizeof short_options
];
2992 if (key
->ignore
== nondictionary
)
2996 if (key
->general_numeric
)
2998 if (key
->human_numeric
)
3000 if (key
->ignore
== nonprinting
)
3011 incompatible_options (opts
);
3015 /* Parse the leading integer in STRING and store the resulting value
3016 (which must fit into size_t) into *VAL. Return the address of the
3017 suffix after the integer. If the value is too large, silently
3018 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3019 failure; otherwise, report MSGID and exit on failure. */
3022 parse_field_count (char const *string
, size_t *val
, char const *msgid
)
3027 switch (xstrtoumax (string
, &suffix
, 10, &n
, ""))
3030 case LONGINT_INVALID_SUFFIX_CHAR
:
3035 case LONGINT_OVERFLOW
:
3036 case LONGINT_OVERFLOW
| LONGINT_INVALID_SUFFIX_CHAR
:
3040 case LONGINT_INVALID
:
3042 error (SORT_FAILURE
, 0, _("%s: invalid count at start of %s"),
3043 _(msgid
), quote (string
));
3050 /* Handle interrupts and hangups. */
3053 sighandler (int sig
)
3056 signal (sig
, SIG_IGN
);
3060 signal (sig
, SIG_DFL
);
3064 /* Set the ordering options for KEY specified in S.
3065 Return the address of the first character in S that
3066 is not a valid ordering option.
3067 BLANKTYPE is the kind of blanks that 'b' should skip. */
3070 set_ordering (const char *s
, struct keyfield
*key
, enum blanktype blanktype
)
3077 if (blanktype
== bl_start
|| blanktype
== bl_both
)
3078 key
->skipsblanks
= true;
3079 if (blanktype
== bl_end
|| blanktype
== bl_both
)
3080 key
->skipeblanks
= true;
3083 key
->ignore
= nondictionary
;
3086 key
->translate
= fold_toupper
;
3089 key
->general_numeric
= true;
3092 key
->human_numeric
= true;
3095 /* Option order should not matter, so don't let -i override
3096 -d. -d implies -i, but -i does not imply -d. */
3098 key
->ignore
= nonprinting
;
3104 key
->numeric
= true;
3110 key
->reverse
= true;
3113 key
->version
= true;
3123 static struct keyfield
*
3124 key_init (struct keyfield
*key
)
3126 memset (key
, 0, sizeof *key
);
3127 key
->eword
= SIZE_MAX
;
3128 key
->si_present
= -1;
3133 main (int argc
, char **argv
)
3135 struct keyfield
*key
;
3136 struct keyfield key_buf
;
3137 struct keyfield gkey
;
3141 bool mergeonly
= false;
3142 char *random_source
= NULL
;
3143 bool need_random
= false;
3145 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != NULL
);
3146 bool obsolete_usage
= (posix2_version () < 200112);
3148 char *files_from
= NULL
;
3150 char const *outfile
= NULL
;
3152 initialize_main (&argc
, &argv
);
3153 set_program_name (argv
[0]);
3154 setlocale (LC_ALL
, "");
3155 bindtextdomain (PACKAGE
, LOCALEDIR
);
3156 textdomain (PACKAGE
);
3158 initialize_exit_failure (SORT_FAILURE
);
3160 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
3161 #if HAVE_NL_LANGINFO
3162 hard_LC_TIME
= hard_locale (LC_TIME
);
3165 /* Get locale's representation of the decimal point. */
3167 struct lconv
const *locale
= localeconv ();
3169 /* If the locale doesn't define a decimal point, or if the decimal
3170 point is multibyte, use the C locale's decimal point. FIXME:
3171 add support for multibyte decimal points. */
3172 decimal_point
= to_uchar (locale
->decimal_point
[0]);
3173 if (! decimal_point
|| locale
->decimal_point
[1])
3174 decimal_point
= '.';
3176 /* FIXME: add support for multibyte thousands separators. */
3177 thousands_sep
= to_uchar (*locale
->thousands_sep
);
3178 if (! thousands_sep
|| locale
->thousands_sep
[1])
3182 have_read_stdin
= false;
3187 static int const sig
[] =
3189 /* The usual suspects. */
3190 SIGALRM
, SIGHUP
, SIGINT
, SIGPIPE
, SIGQUIT
, SIGTERM
,
3207 enum { nsigs
= ARRAY_CARDINALITY (sig
) };
3210 struct sigaction act
;
3212 sigemptyset (&caught_signals
);
3213 for (i
= 0; i
< nsigs
; i
++)
3215 sigaction (sig
[i
], NULL
, &act
);
3216 if (act
.sa_handler
!= SIG_IGN
)
3217 sigaddset (&caught_signals
, sig
[i
]);
3220 act
.sa_handler
= sighandler
;
3221 act
.sa_mask
= caught_signals
;
3224 for (i
= 0; i
< nsigs
; i
++)
3225 if (sigismember (&caught_signals
, sig
[i
]))
3226 sigaction (sig
[i
], &act
, NULL
);
3228 for (i
= 0; i
< nsigs
; i
++)
3229 if (signal (sig
[i
], SIG_IGN
) != SIG_IGN
)
3231 signal (sig
[i
], sighandler
);
3232 siginterrupt (sig
[i
], 1);
3237 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
3238 atexit (exit_cleanup
);
3240 gkey
.sword
= gkey
.eword
= SIZE_MAX
;
3242 gkey
.translate
= NULL
;
3243 gkey
.numeric
= gkey
.general_numeric
= gkey
.human_numeric
= false;
3244 gkey
.si_present
= -1;
3245 gkey
.random
= gkey
.version
= false;
3246 gkey
.month
= gkey
.reverse
= false;
3247 gkey
.skipsblanks
= gkey
.skipeblanks
= false;
3249 files
= xnmalloc (argc
, sizeof *files
);
3253 /* Parse an operand as a file after "--" was seen; or if
3254 pedantic and a file was seen, unless the POSIX version
3255 predates 1003.1-2001 and -c was not seen and the operand is
3256 "-o FILE" or "-oFILE". */
3260 || (posixly_correct
&& nfiles
!= 0
3261 && ! (obsolete_usage
3264 && argv
[optind
][0] == '-' && argv
[optind
][1] == 'o'
3265 && (argv
[optind
][2] || optind
+ 1 != argc
)))
3266 || ((c
= getopt_long (argc
, argv
, short_options
,
3272 files
[nfiles
++] = argv
[optind
++];
3278 if (optarg
[0] == '+')
3280 bool minus_pos_usage
= (optind
!= argc
&& argv
[optind
][0] == '-'
3281 && ISDIGIT (argv
[optind
][1]));
3282 obsolete_usage
|= minus_pos_usage
& ~posixly_correct
;
3285 /* Treat +POS1 [-POS2] as a key if possible; but silently
3286 treat an operand as a file if it is not a valid +POS1. */
3287 key
= key_init (&key_buf
);
3288 s
= parse_field_count (optarg
+ 1, &key
->sword
, NULL
);
3290 s
= parse_field_count (s
+ 1, &key
->schar
, NULL
);
3291 if (! (key
->sword
| key
->schar
))
3292 key
->sword
= SIZE_MAX
;
3293 if (! s
|| *set_ordering (s
, key
, bl_start
))
3297 if (minus_pos_usage
)
3299 char const *optarg1
= argv
[optind
++];
3300 s
= parse_field_count (optarg1
+ 1, &key
->eword
,
3301 N_("invalid number after `-'"));
3303 s
= parse_field_count (s
+ 1, &key
->echar
,
3304 N_("invalid number after `.'"));
3305 if (*set_ordering (s
, key
, bl_end
))
3306 badfieldspec (optarg1
,
3307 N_("stray character in field spec"));
3314 files
[nfiles
++] = optarg
;
3318 c
= XARGMATCH ("--sort", optarg
, sort_args
, sort_types
);
3335 set_ordering (str
, &gkey
, bl_both
);
3341 ? XARGMATCH ("--check", optarg
, check_args
, check_types
)
3346 if (checkonly
&& checkonly
!= c
)
3347 incompatible_options ("cC");
3351 case COMPRESS_PROGRAM_OPTION
:
3352 if (compress_program
&& !STREQ (compress_program
, optarg
))
3353 error (SORT_FAILURE
, 0, _("multiple compress programs specified"));
3354 compress_program
= optarg
;
3357 case FILES0_FROM_OPTION
:
3358 files_from
= optarg
;
3362 key
= key_init (&key_buf
);
3365 s
= parse_field_count (optarg
, &key
->sword
,
3366 N_("invalid number at field start"));
3369 /* Provoke with `sort -k0' */
3370 badfieldspec (optarg
, N_("field number is zero"));
3374 s
= parse_field_count (s
+ 1, &key
->schar
,
3375 N_("invalid number after `.'"));
3378 /* Provoke with `sort -k1.0' */
3379 badfieldspec (optarg
, N_("character offset is zero"));
3382 if (! (key
->sword
| key
->schar
))
3383 key
->sword
= SIZE_MAX
;
3384 s
= set_ordering (s
, key
, bl_start
);
3387 key
->eword
= SIZE_MAX
;
3393 s
= parse_field_count (s
+ 1, &key
->eword
,
3394 N_("invalid number after `,'"));
3397 /* Provoke with `sort -k1,0' */
3398 badfieldspec (optarg
, N_("field number is zero"));
3402 s
= parse_field_count (s
+ 1, &key
->echar
,
3403 N_("invalid number after `.'"));
3405 s
= set_ordering (s
, key
, bl_end
);
3408 badfieldspec (optarg
, N_("stray character in field spec"));
3417 specify_nmerge (oi
, c
, optarg
);
3421 if (outfile
&& !STREQ (outfile
, optarg
))
3422 error (SORT_FAILURE
, 0, _("multiple output files specified"));
3426 case RANDOM_SOURCE_OPTION
:
3427 if (random_source
&& !STREQ (random_source
, optarg
))
3428 error (SORT_FAILURE
, 0, _("multiple random sources specified"));
3429 random_source
= optarg
;
3437 specify_sort_size (oi
, c
, optarg
);
3442 char newtab
= optarg
[0];
3444 error (SORT_FAILURE
, 0, _("empty tab"));
3447 if (STREQ (optarg
, "\\0"))
3451 /* Provoke with `sort -txx'. Complain about
3452 "multi-character tab" instead of "multibyte tab", so
3453 that the diagnostic's wording does not need to be
3454 changed once multibyte characters are supported. */
3455 error (SORT_FAILURE
, 0, _("multi-character tab %s"),
3459 if (tab
!= TAB_DEFAULT
&& tab
!= newtab
)
3460 error (SORT_FAILURE
, 0, _("incompatible tabs"));
3466 add_temp_dir (optarg
);
3474 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3475 through Solaris 7. It is also accepted by many non-Solaris
3476 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3477 -y is marked as obsolete starting with Solaris 8 (1999), but is
3478 still accepted as of Solaris 10 prerelease (2004).
3480 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3481 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3482 and which in general ignores the argument after "-y" if it
3483 consists entirely of digits (it can even be empty). */
3484 if (optarg
== argv
[optind
- 1])
3487 for (p
= optarg
; ISDIGIT (*p
); p
++)
3489 optind
-= (*p
!= '\0');
3497 case_GETOPT_HELP_CHAR
;
3499 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
3502 usage (SORT_FAILURE
);
3510 /* When using --files0-from=F, you may not specify any files
3511 on the command-line. */
3514 error (0, 0, _("extra operand %s"), quote (files
[0]));
3515 fprintf (stderr
, "%s\n",
3516 _("file operands cannot be combined with --files0-from"));
3517 usage (SORT_FAILURE
);
3520 if (STREQ (files_from
, "-"))
3524 stream
= fopen (files_from
, "r");
3526 error (SORT_FAILURE
, errno
, _("cannot open %s for reading"),
3527 quote (files_from
));
3530 readtokens0_init (&tok
);
3532 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
3533 error (SORT_FAILURE
, 0, _("cannot read file names from %s"),
3534 quote (files_from
));
3542 for (i
= 0; i
< nfiles
; i
++)
3544 if (STREQ (files
[i
], "-"))
3545 error (SORT_FAILURE
, 0, _("when reading file names from stdin, "
3546 "no file name of %s allowed"),
3548 else if (files
[i
][0] == '\0')
3550 /* Using the standard `filename:line-number:' prefix here is
3551 not totally appropriate, since NUL is the separator, not NL,
3552 but it might be better than nothing. */
3553 unsigned long int file_number
= i
+ 1;
3554 error (SORT_FAILURE
, 0,
3555 _("%s:%lu: invalid zero-length file name"),
3556 quotearg_colon (files_from
), file_number
);
3561 error (SORT_FAILURE
, 0, _("no input from %s"),
3562 quote (files_from
));
3565 /* Inheritance of global options to individual keys. */
3566 for (key
= keylist
; key
; key
= key
->next
)
3570 || (key
->skipsblanks
3576 | key
->general_numeric
3577 | key
->human_numeric
3580 key
->ignore
= gkey
.ignore
;
3581 key
->translate
= gkey
.translate
;
3582 key
->skipsblanks
= gkey
.skipsblanks
;
3583 key
->skipeblanks
= gkey
.skipeblanks
;
3584 key
->month
= gkey
.month
;
3585 key
->numeric
= gkey
.numeric
;
3586 key
->general_numeric
= gkey
.general_numeric
;
3587 key
->human_numeric
= gkey
.human_numeric
;
3588 key
->random
= gkey
.random
;
3589 key
->reverse
= gkey
.reverse
;
3590 key
->version
= gkey
.version
;
3593 need_random
|= key
->random
;
3596 if (!keylist
&& (gkey
.ignore
3598 || (gkey
.skipsblanks
3602 | gkey
.general_numeric
3603 | gkey
.human_numeric
3608 need_random
|= gkey
.random
;
3611 check_ordering_compatibility ();
3613 reverse
= gkey
.reverse
;
3617 randread_source
= randread_new (random_source
, MD5_DIGEST_SIZE
);
3618 if (! randread_source
)
3619 die (_("open failed"), random_source
);
3622 if (temp_dir_count
== 0)
3624 char const *tmp_dir
= getenv ("TMPDIR");
3625 add_temp_dir (tmp_dir
? tmp_dir
: DEFAULT_TMPDIR
);
3630 static char *minus
= (char *) "-";
3636 /* Need to re-check that we meet the minimum requirement for memory
3637 usage with the final value for NMERGE. */
3639 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
3644 error (SORT_FAILURE
, 0, _("extra operand %s not allowed with -%c"),
3645 quote (files
[1]), checkonly
);
3649 static char opts
[] = {0, 'o', 0};
3650 opts
[0] = checkonly
;
3651 incompatible_options (opts
);
3654 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3655 input is not properly sorted. */
3656 exit (check (files
[0], checkonly
) ? EXIT_SUCCESS
: SORT_OUT_OF_ORDER
);
3661 struct sortfile
*sortfiles
= xcalloc (nfiles
, sizeof *sortfiles
);
3664 for (i
= 0; i
< nfiles
; ++i
)
3665 sortfiles
[i
].name
= files
[i
];
3667 merge (sortfiles
, 0, nfiles
, outfile
);
3668 IF_LINT (free (sortfiles
));
3671 sort (files
, nfiles
, outfile
);
3673 if (have_read_stdin
&& fclose (stdin
) == EOF
)
3674 die (_("close failed"), "-");
3676 exit (EXIT_SUCCESS
);