parsort: Fail if TMPDIR does not exist.
[parallel.git] / src / parsort
blob7452a6badb0d3d4b014a2506dfd4038cdeacbcca
1 #!/usr/bin/perl
3 # SPDX-FileCopyrightText: 2021 Ole Tange, http://ole.tange.dk and Free Software and Foundation, Inc.
4 # SPDX-License-Identifier: GPL-3.0-or-later
6 =pod
8 =head1 NAME
10 parsort - Sort (big files) in parallel
13 =head1 SYNOPSIS
15 B<parsort> I<options for sort>
18 =head1 DESCRIPTION
20 B<parsort> uses GNU B<sort> to sort in parallel. It works just like
21 B<sort> but faster on inputs with more than 1 M lines, if you have a
22 multicore machine.
24 Hopefully these ideas will make it into GNU B<sort> in the future.
27 =head1 EXAMPLE
29 Sort files:
31 parsort *.txt > sorted.txt
33 Sort stdin (standard input) numerically:
35 cat numbers | parsort -n > sorted.txt
38 =head1 PERFORMANCE
40 B<parsort> is faster on files, because these can be read in parallel.
42 On a 48 core machine you should see a speedup of 3x over B<sort>.
45 =head1 AUTHOR
47 Copyright (C) 2020-2021 Ole Tange,
48 http://ole.tange.dk and Free Software Foundation, Inc.
51 =head1 LICENSE
53 Copyright (C) 2012 Free Software Foundation, Inc.
55 This program is free software; you can redistribute it and/or modify
56 it under the terms of the GNU General Public License as published by
57 the Free Software Foundation; either version 3 of the License, or
58 at your option any later version.
60 This program is distributed in the hope that it will be useful,
61 but WITHOUT ANY WARRANTY; without even the implied warranty of
62 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
63 GNU General Public License for more details.
65 You should have received a copy of the GNU General Public License
66 along with this program. If not, see <http://www.gnu.org/licenses/>.
69 =head1 DEPENDENCIES
71 B<parsort> uses B<sort>, B<bash>, and B<parallel>.
74 =head1 SEE ALSO
76 B<sort>
79 =cut
81 use strict;
82 use Getopt::Long;
83 use POSIX qw(mkfifo);
85 Getopt::Long::Configure("bundling","require_order");
87 my @ARGV_before = @ARGV;
89 GetOptions(
90 "debug|D" => \$opt::D,
91 "version" => \$opt::version,
92 "verbose|v" => \$opt::verbose,
93 "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
94 "d|dictionary-order" => \$opt::dictionary_order,
95 "f|ignore-case" => \$opt::ignore_case,
96 "g|general-numeric-sort" => \$opt::general_numeric_sort,
97 "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
98 "M|month-sort" => \$opt::month_sort,
99 "h|human-numeric-sort" => \$opt::human_numeric_sort,
100 "n|numeric-sort" => \$opt::numeric_sort,
101 "N|numascii" => \$opt::numascii,
102 "r|reverse" => \$opt::reverse,
103 "R|random-sort" => \$opt::random_sort,
104 "sort=s" => \$opt::sort,
105 "V|version-sort" => \$opt::version_sort,
106 "k|key=s" => \@opt::key,
107 "t|field-separator=s" => \$opt::field_separator,
108 "z|zero-terminated" => \$opt::zero_terminated,
109 "files0-from=s" => \$opt::files0_from,
110 "random-source=s" => \$opt::dummy,
111 "batch-size=s" => \$opt::dummy,
112 "check=s" => \$opt::dummy,
113 "c" => \$opt::dummy,
114 "C" => \$opt::dummy,
115 "compress-program=s" => \$opt::dummy,
116 "T|temporary-directory=s" => \$opt::dummy,
117 "parallel=s" => \$opt::dummy,
118 "u|unique" => \$opt::dummy,
119 "S|buffer-size=s" => \$opt::dummy,
120 "s|stable" => \$opt::dummy,
121 "help" => \$opt::dummy,
122 ) || exit(255);
123 $Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
124 $Global::version = 20210623;
125 if($opt::version) { version(); exit 0; }
126 @Global::sortoptions =
127 shell_quote(@ARGV_before[0..($#ARGV_before-$#ARGV-1)]);
128 #if($opt::zero_terminated) { $/ = "\0"; }
130 $ENV{'TMPDIR'} ||= "/tmp";
132 sub merge {
133 # Input:
134 # @cmd = commands to 'cat' (part of) a file
135 my @cmd = @_;
136 chomp(@cmd);
137 while($#cmd > 0) {
138 my @tmp;
139 while($#cmd >= 0) {
140 my $a = shift @cmd;
141 my $b = shift @cmd;
142 $a &&= "<($a)";
143 $b &&= "<($b)";
144 # This looks like useless use of 'cat', but contrary to
145 # naive belief it increases performance dramatically.
146 push @tmp, "sort -m @Global::sortoptions $a $b | cat"
148 @cmd = @tmp;
150 return @cmd;
153 sub sort_files {
154 # Input is files
155 my @files = @_;
156 # Let GNU Parallel generate the commands to read parts of files
157 # The commands split at \n and there will be at least one for each CPU thread
158 open(my $par,"-|",qw(parallel --pipepart --block -1 --dryrun -vv sort),
159 @Global::sortoptions, '::::', @files) || die;
160 my @cmd = merge(<$par>);
161 close $par;
162 # The command uses <(...) so it is incompatible with /bin/sh
163 open(my $bash,"|-","bash") || die;
164 print $bash @cmd;
165 close $bash;
168 sub sort_stdin {
169 # Input is stdin
170 # Spread the input between n processes that each sort
171 # n = number of CPU threads
172 my $numthreads = `parallel --number-of-threads`;
173 my @fifos = map { tmpfifo() } 1..$numthreads;
174 map { mkfifo($_,0600) } @fifos;
175 # This trick removes the fifo as soon as it is connected in the other end
176 # (rm fifo; ...) < fifo
177 my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" }
178 map { Q($_) } @fifos);
179 @cmd = merge(@cmd);
180 if(fork) {
181 } else {
182 exec(qw(parallel -j),$numthreads,
183 # 286k is the best mean value after testing 250..350
184 qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos);
186 # The command uses <(...) so it is incompatible with /bin/sh
187 open(my $bash,"|-","bash") || die;
188 print $bash @cmd;
189 close $bash;
192 sub tmpname {
193 # Select a name that does not exist
194 # Do not create the file as it may be used for creating a socket (by tmux)
195 # Remember the name in $Global::unlink to avoid hitting the same name twice
196 my $name = shift;
197 my($tmpname);
198 if(not -w $ENV{'TMPDIR'}) {
199 if(not -e $ENV{'TMPDIR'}) {
200 ::error("Tmpdir '$ENV{'TMPDIR'}' does not exist.","Try 'mkdir ".
201 Q($ENV{'TMPDIR'})."'");
202 } else {
203 ::error("Tmpdir '$ENV{'TMPDIR'}' is not writable.","Try 'chmod +w ".
204 Q($ENV{'TMPDIR'})."'");
206 exit(255);
208 do {
209 $tmpname = $ENV{'TMPDIR'}."/".$name.
210 join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);
211 } while(-e $tmpname);
212 return $tmpname;
215 sub tmpfifo {
216 # Find an unused name and mkfifo on it
217 my $tmpfifo = tmpname("psort");
218 mkfifo($tmpfifo,0600);
219 return $tmpfifo;
222 sub version() {
223 # Returns: N/A
224 print join
225 ("\n",
226 "GNU $Global::progname $Global::version",
227 "Copyright (C) 2020-2021 Ole Tange, http://ole.tange.dk and Free Software",
228 "Foundation, Inc.",
229 "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",
230 "This is free software: you are free to change and redistribute it.",
231 "GNU $Global::progname comes with no warranty.",
233 "Web site: https://www.gnu.org/software/parallel\n",
237 sub shell_quote(@) {
238 # Input:
239 # @strings = strings to be quoted
240 # Returns:
241 # @shell_quoted_strings = string quoted as needed by the shell
242 return wantarray ? (map { Q($_) } @_) : (join" ",map { Q($_) } @_);
245 sub shell_quote_scalar_rc($) {
246 # Quote for the rc-shell
247 my $a = $_[0];
248 if(defined $a) {
249 if(($a =~ s/'/''/g)
251 ($a =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]+/'$&'/go)) {
252 # A string was replaced
253 # No need to test for "" or \0
254 } elsif($a eq "") {
255 $a = "''";
256 } elsif($a eq "\0") {
257 $a = "";
260 return $a;
263 sub shell_quote_scalar_csh($) {
264 # Quote for (t)csh
265 my $a = $_[0];
266 if(defined $a) {
267 # $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g;
268 # This is 1% faster than the above
269 if(($a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]/\\$&/go)
271 # quote newline in csh as \\\n
272 ($a =~ s/[\n]/"\\\n"/go)) {
273 # A string was replaced
274 # No need to test for "" or \0
275 } elsif($a eq "") {
276 $a = "''";
277 } elsif($a eq "\0") {
278 $a = "";
281 return $a;
284 sub shell_quote_scalar_default($) {
285 # Quote for other shells (Bourne compatibles)
286 # Inputs:
287 # $string = string to be quoted
288 # Returns:
289 # $shell_quoted = string quoted as needed by the shell
290 my $s = $_[0];
291 if($s =~ /[^-_.+a-z0-9\/]/i) {
292 $s =~ s/'/'"'"'/g; # "-quote single quotes
293 $s = "'$s'"; # '-quote entire string
294 $s =~ s/^''//; # Remove unneeded '' at ends
295 $s =~ s/''$//; # (faster than s/^''|''$//g)
296 return $s;
297 } elsif ($s eq "") {
298 return "''";
299 } else {
300 # No quoting needed
301 return $s;
305 sub shell_quote_scalar($) {
306 # Quote the string so the shell will not expand any special chars
307 # Inputs:
308 # $string = string to be quoted
309 # Returns:
310 # $shell_quoted = string quoted as needed by the shell
312 # Speed optimization: Choose the correct shell_quote_scalar_*
313 # and call that directly from now on
314 no warnings 'redefine';
315 if($Global::cshell) {
316 # (t)csh
317 *shell_quote_scalar = \&shell_quote_scalar_csh;
318 } elsif($Global::shell =~ m:(^|/)rc$:) {
319 # rc-shell
320 *shell_quote_scalar = \&shell_quote_scalar_rc;
321 } else {
322 # other shells
323 *shell_quote_scalar = \&shell_quote_scalar_default;
325 # The sub is now redefined. Call it
326 return shell_quote_scalar($_[0]);
329 sub Q($) {
330 # Q alias for ::shell_quote_scalar
331 my $ret = shell_quote_scalar($_[0]);
332 no warnings 'redefine';
333 *Q = \&::shell_quote_scalar;
334 return $ret;
338 sub status(@) {
339 my @w = @_;
340 my $fh = $Global::status_fd || *STDERR;
341 print $fh map { ($_, "\n") } @w;
342 flush $fh;
345 sub status_no_nl(@) {
346 my @w = @_;
347 my $fh = $Global::status_fd || *STDERR;
348 print $fh @w;
349 flush $fh;
352 sub warning(@) {
353 my @w = @_;
354 my $prog = $Global::progname || "parsort";
355 status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
359 my %warnings;
360 sub warning_once(@) {
361 my @w = @_;
362 my $prog = $Global::progname || "parsort";
363 $warnings{@w}++ or
364 status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
368 sub error(@) {
369 my @w = @_;
370 my $prog = $Global::progname || "parsort";
371 status(map { ($prog.": Error: ". $_); } @w);
374 sub die_bug($) {
375 my $bugid = shift;
376 print STDERR
377 ("$Global::progname: This should not happen. You have found a bug. ",
378 "Please follow\n",
379 "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",
380 "\n",
381 "Include this in the report:\n",
382 "* The version number: $Global::version\n",
383 "* The bugid: $bugid\n",
384 "* The command line being run\n",
385 "* The files being read (put the files on a webserver if they are big)\n",
386 "\n",
387 "If you get the error on smaller/fewer files, please include those instead.\n");
388 exit(255);
391 sub version() {
392 # Returns: N/A
393 print join
394 ("\n",
395 "GNU $Global::progname $Global::version",
396 "Copyright (C) 2007-2021 Ole Tange, http://ole.tange.dk and Free Software",
397 "Foundation, Inc.",
398 "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",
399 "This is free software: you are free to change and redistribute it.",
400 "GNU $Global::progname comes with no warranty.",
402 "Web site: https://www.gnu.org/software/${Global::progname}\n",
403 "When using programs that use GNU Parallel to process data for publication",
404 "please cite as described in 'parallel --citation'.\n",
408 if(@ARGV) {
409 sort_files(@ARGV);
410 } elsif(length $opt::files0_from) {
411 $/="\0";
412 open(my $fh,"<",$opt::files0_from) || die;
413 my @files = <$fh>;
414 chomp(@files);
415 sort_files(@files);
416 } else {
417 sort_stdin();
420 # Test
421 # -z
422 # OK: cat bigfile | parsort
423 # OK: parsort -k4n files*.txt
424 # OK: parsort files*.txt
425 # OK: parsort "file with space"