parallel: --results should only use chars that the file system supports.
[parallel.git] / src / parsort
blob64c64c00b41fa9be74a7ff28d34302810972ad39
1 #!/usr/bin/perl
3 # SPDX-FileCopyrightText: 2021-2022 Ole Tange, http://ole.tange.dk and Free Software and Foundation, Inc.
4 # SPDX-License-Identifier: GPL-3.0-or-later
6 =pod
8 =head1 NAME
10 parsort - Sort (big files) in parallel
13 =head1 SYNOPSIS
15 B<parsort> I<options for sort>
18 =head1 DESCRIPTION
20 B<parsort> uses GNU B<sort> to sort in parallel. It works just like
21 B<sort> but faster on inputs with more than 1 M lines, if you have a
22 multicore machine.
24 Hopefully these ideas will make it into GNU B<sort> in the future.
27 =head1 EXAMPLE
29 Sort files:
31 parsort *.txt > sorted.txt
33 Sort stdin (standard input) numerically:
35 cat numbers | parsort -n > sorted.txt
38 =head1 PERFORMANCE
40 B<parsort> is faster on a file than on stdin (standard input), because
41 different parts of a file can be read in parallel.
43 On a 48 core machine you should see a speedup of 3x over B<sort>.
46 =head1 AUTHOR
48 Copyright (C) 2020-2022 Ole Tange,
49 http://ole.tange.dk and Free Software Foundation, Inc.
52 =head1 LICENSE
54 Copyright (C) 2012 Free Software Foundation, Inc.
56 This program is free software; you can redistribute it and/or modify
57 it under the terms of the GNU General Public License as published by
58 the Free Software Foundation; either version 3 of the License, or
59 at your option any later version.
61 This program is distributed in the hope that it will be useful,
62 but WITHOUT ANY WARRANTY; without even the implied warranty of
63 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
64 GNU General Public License for more details.
66 You should have received a copy of the GNU General Public License
67 along with this program. If not, see <http://www.gnu.org/licenses/>.
70 =head1 DEPENDENCIES
72 B<parsort> uses B<sort>, B<bash>, and B<parallel>.
75 =head1 SEE ALSO
77 B<sort>
80 =cut
82 use strict;
83 use Getopt::Long;
84 use POSIX qw(mkfifo);
86 Getopt::Long::Configure("bundling","require_order");
88 my @ARGV_before = @ARGV;
90 GetOptions(
91 "debug|D" => \$opt::D,
92 "version" => \$opt::version,
93 "verbose|v" => \$opt::verbose,
94 "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
95 "d|dictionary-order" => \$opt::dictionary_order,
96 "f|ignore-case" => \$opt::ignore_case,
97 "g|general-numeric-sort" => \$opt::general_numeric_sort,
98 "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
99 "M|month-sort" => \$opt::month_sort,
100 "h|human-numeric-sort" => \$opt::human_numeric_sort,
101 "n|numeric-sort" => \$opt::numeric_sort,
102 "N|numascii" => \$opt::numascii,
103 "r|reverse" => \$opt::reverse,
104 "R|random-sort" => \$opt::random_sort,
105 "sort=s" => \$opt::sort,
106 "V|version-sort" => \$opt::version_sort,
107 "k|key=s" => \@opt::key,
108 "t|field-separator=s" => \$opt::field_separator,
109 "z|zero-terminated" => \$opt::zero_terminated,
110 "files0-from=s" => \$opt::files0_from,
111 "random-source=s" => \$opt::dummy,
112 "batch-size=s" => \$opt::dummy,
113 "check=s" => \$opt::dummy,
114 "c" => \$opt::dummy,
115 "C" => \$opt::dummy,
116 "compress-program=s" => \$opt::dummy,
117 "T|temporary-directory=s" => \$opt::dummy,
118 "parallel=s" => \$opt::dummy,
119 "u|unique" => \$opt::dummy,
120 "S|buffer-size=s" => \$opt::dummy,
121 "s|stable" => \$opt::dummy,
122 "help" => \$opt::dummy,
123 ) || exit(255);
124 $Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
125 $Global::version = 20221122;
126 if($opt::version) { version(); exit 0; }
127 @Global::sortoptions = grep { ! /^-D$/ }
128 shell_quote(@ARGV_before[0..($#ARGV_before-$#ARGV-1)]);
130 $ENV{'TMPDIR'} ||= "/tmp";
132 sub merge {
133 # Input:
134 # @cmd = commands to 'cat' (part of) a file
135 # 'cat a' 'cat b' 'cat c' =>
136 # sort -m <(sort -m <(cat a) <(cat b)) <(sort -m <(cat c))
137 my @cmd = @_;
138 chomp(@cmd);
139 while($#cmd > 0) {
140 my @tmp;
141 while($#cmd >= 0) {
142 my $a = shift @cmd;
143 my $b = shift @cmd;
144 $a &&= "<($a)";
145 $b &&= "<($b)";
146 # This looks like useless use of 'cat', but contrary to
147 # naive belief it increases performance dramatically.
148 push @tmp, "sort -m @Global::sortoptions $a $b | cat"
150 @cmd = @tmp;
152 return @cmd;
155 sub sort_files {
156 # Input is files
157 my @files = @_;
158 # Let GNU Parallel generate the commands to read parts of files
159 # The commands split at \n (or \0)
160 # and there will be at least one for each CPU thread
161 my @subopt = $opt::zero_terminated ? qw(--recend "\0") : ();
162 open(my $par,"-|",qw(parallel), @subopt,
163 qw(--pipepart --block -1 --dryrun -vv sort),
164 @Global::sortoptions, '::::', @files) || die;
165 my @cmd = merge(<$par>);
166 close $par;
167 debug(@cmd);
168 # The command uses <(...) so it is incompatible with /bin/sh
169 open(my $bash,"|-","bash") || die;
170 print $bash @cmd;
171 close $bash;
174 sub sort_stdin {
175 # Input is stdin
176 # Spread the input between n processes that each sort
177 # n = number of CPU threads
178 my $numthreads = `parallel --number-of-threads`;
179 my @fifos = map { tmpfifo() } 1..$numthreads;
180 map { mkfifo($_,0600) } @fifos;
181 # This trick removes the fifo as soon as it is connected in the other end
182 # (rm fifo; ...) < fifo
183 my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" }
184 map { Q($_) } @fifos);
185 @cmd = merge(@cmd);
186 if(fork) {
187 } else {
188 my @subopt = $opt::zero_terminated ? qw(--recend "\0") : ();
189 exec(qw(parallel -j), $numthreads, @subopt,
190 # 286k is the best mean value after testing 250..350
191 qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos);
193 # The command uses <(...) so it is incompatible with /bin/sh
194 open(my $bash,"|-","bash") || die;
195 print $bash @cmd;
196 close $bash;
199 sub tmpname {
200 # Select a name that does not exist
201 # Do not create the file as it may be used for creating a socket (by tmux)
202 # Remember the name in $Global::unlink to avoid hitting the same name twice
203 my $name = shift;
204 my($tmpname);
205 if(not -w $ENV{'TMPDIR'}) {
206 if(not -e $ENV{'TMPDIR'}) {
207 ::error("Tmpdir '$ENV{'TMPDIR'}' does not exist.","Try 'mkdir ".
208 Q($ENV{'TMPDIR'})."'");
209 } else {
210 ::error("Tmpdir '$ENV{'TMPDIR'}' is not writable.","Try 'chmod +w ".
211 Q($ENV{'TMPDIR'})."'");
213 exit(255);
215 do {
216 $tmpname = $ENV{'TMPDIR'}."/".$name.
217 join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);
218 } while(-e $tmpname);
219 return $tmpname;
222 sub tmpfifo {
223 # Find an unused name and mkfifo on it
224 my $tmpfifo = tmpname("psort");
225 mkfifo($tmpfifo,0600);
226 return $tmpfifo;
229 sub debug {
230 # Returns: N/A
231 $opt::D or return;
232 @_ = grep { defined $_ ? $_ : "" } @_;
233 print STDERR @_[1..$#_];
236 sub version() {
237 # Returns: N/A
238 print join
239 ("\n",
240 "GNU $Global::progname $Global::version",
241 "Copyright (C) 2020-2022 Ole Tange, http://ole.tange.dk and Free Software",
242 "Foundation, Inc.",
243 "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",
244 "This is free software: you are free to change and redistribute it.",
245 "GNU $Global::progname comes with no warranty.",
247 "Web site: https://www.gnu.org/software/parallel\n",
251 sub shell_quote(@) {
252 # Input:
253 # @strings = strings to be quoted
254 # Returns:
255 # @shell_quoted_strings = string quoted as needed by the shell
256 return wantarray ? (map { Q($_) } @_) : (join" ",map { Q($_) } @_);
259 sub shell_quote_scalar_rc($) {
260 # Quote for the rc-shell
261 my $a = $_[0];
262 if(defined $a) {
263 if(($a =~ s/'/''/g)
265 ($a =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]+/'$&'/go)) {
266 # A string was replaced
267 # No need to test for "" or \0
268 } elsif($a eq "") {
269 $a = "''";
270 } elsif($a eq "\0") {
271 $a = "";
274 return $a;
277 sub shell_quote_scalar_csh($) {
278 # Quote for (t)csh
279 my $a = $_[0];
280 if(defined $a) {
281 # $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g;
282 # This is 1% faster than the above
283 if(($a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]/\\$&/go)
285 # quote newline in csh as \\\n
286 ($a =~ s/[\n]/"\\\n"/go)) {
287 # A string was replaced
288 # No need to test for "" or \0
289 } elsif($a eq "") {
290 $a = "''";
291 } elsif($a eq "\0") {
292 $a = "";
295 return $a;
298 sub shell_quote_scalar_default($) {
299 # Quote for other shells (Bourne compatibles)
300 # Inputs:
301 # $string = string to be quoted
302 # Returns:
303 # $shell_quoted = string quoted as needed by the shell
304 my $s = $_[0];
305 if($s =~ /[^-_.+a-z0-9\/]/i) {
306 $s =~ s/'/'"'"'/g; # "-quote single quotes
307 $s = "'$s'"; # '-quote entire string
308 $s =~ s/^''//; # Remove unneeded '' at ends
309 $s =~ s/''$//; # (faster than s/^''|''$//g)
310 return $s;
311 } elsif ($s eq "") {
312 return "''";
313 } else {
314 # No quoting needed
315 return $s;
319 sub shell_quote_scalar($) {
320 # Quote the string so the shell will not expand any special chars
321 # Inputs:
322 # $string = string to be quoted
323 # Returns:
324 # $shell_quoted = string quoted as needed by the shell
326 # Speed optimization: Choose the correct shell_quote_scalar_*
327 # and call that directly from now on
328 no warnings 'redefine';
329 if($Global::cshell) {
330 # (t)csh
331 *shell_quote_scalar = \&shell_quote_scalar_csh;
332 } elsif($Global::shell =~ m:(^|/)rc$:) {
333 # rc-shell
334 *shell_quote_scalar = \&shell_quote_scalar_rc;
335 } else {
336 # other shells
337 *shell_quote_scalar = \&shell_quote_scalar_default;
339 # The sub is now redefined. Call it
340 return shell_quote_scalar($_[0]);
343 sub Q($) {
344 # Q alias for ::shell_quote_scalar
345 my $ret = shell_quote_scalar($_[0]);
346 no warnings 'redefine';
347 *Q = \&::shell_quote_scalar;
348 return $ret;
352 sub status(@) {
353 my @w = @_;
354 my $fh = $Global::status_fd || *STDERR;
355 print $fh map { ($_, "\n") } @w;
356 flush $fh;
359 sub status_no_nl(@) {
360 my @w = @_;
361 my $fh = $Global::status_fd || *STDERR;
362 print $fh @w;
363 flush $fh;
366 sub warning(@) {
367 my @w = @_;
368 my $prog = $Global::progname || "parsort";
369 status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
373 my %warnings;
374 sub warning_once(@) {
375 my @w = @_;
376 my $prog = $Global::progname || "parsort";
377 $warnings{@w}++ or
378 status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
382 sub error(@) {
383 my @w = @_;
384 my $prog = $Global::progname || "parsort";
385 status(map { ($prog.": Error: ". $_); } @w);
388 sub die_bug($) {
389 my $bugid = shift;
390 print STDERR
391 ("$Global::progname: This should not happen. You have found a bug. ",
392 "Please follow\n",
393 "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",
394 "\n",
395 "Include this in the report:\n",
396 "* The version number: $Global::version\n",
397 "* The bugid: $bugid\n",
398 "* The command line being run\n",
399 "* The files being read (put the files on a webserver if they are big)\n",
400 "\n",
401 "If you get the error on smaller/fewer files, please include those instead.\n");
402 exit(255);
405 if(@ARGV) {
406 sort_files(@ARGV);
407 } elsif(length $opt::files0_from) {
408 $/="\0";
409 open(my $fh,"<",$opt::files0_from) || die;
410 my @files = <$fh>;
411 chomp(@files);
412 sort_files(@files);
413 } else {
414 sort_stdin();
417 # Test
418 # -z
419 # OK: cat bigfile | parsort
420 # OK: parsort -k4n files*.txt
421 # OK: parsort files*.txt
422 # OK: parsort "file with space"