Released as 20240522 ('Tbilisi')
[parallel.git] / src / parsort
blob720d0a57563ea9c3a21d8524c01fe5a0ad72f1d2
1 #!/usr/bin/perl
3 # SPDX-FileCopyrightText: 2021-2024 Ole Tange, http://ole.tange.dk and Free Software and Foundation, Inc.
4 # SPDX-License-Identifier: GPL-3.0-or-later
6 =pod
8 =head1 NAME
10 parsort - Sort (big files) in parallel
13 =head1 SYNOPSIS
15 B<parsort> I<options for sort>
18 =head1 DESCRIPTION
20 B<parsort> uses GNU B<sort> to sort in parallel. It works just like
21 B<sort> but faster on inputs with more than 1 M lines, if you have a
22 multicore machine.
24 Hopefully these ideas will make it into GNU B<sort> in the future.
27 =head1 OPTIONS
29 Same as B<sort>. Except:
31 =over 4
33 =item B<--parallel=>I<N>
35 Change the number of sorts run concurrently to I<N>. I<N> will be
36 increased to number of files if B<parsort> is given more than I<N>
37 files.
39 =back
42 =head1 EXAMPLE
44 Sort files:
46 parsort *.txt > sorted.txt
48 Sort stdin (standard input) numerically:
50 cat numbers | parsort -n > sorted.txt
53 =head1 PERFORMANCE
55 B<parsort> is faster on files than on stdin (standard input), because
56 different parts of a file can be read in parallel.
58 On a 48 core machine you should see a speedup of 3x over B<sort>.
61 =head1 AUTHOR
63 Copyright (C) 2020-2024 Ole Tange,
64 http://ole.tange.dk and Free Software Foundation, Inc.
67 =head1 LICENSE
69 Copyright (C) 2012 Free Software Foundation, Inc.
71 This program is free software; you can redistribute it and/or modify
72 it under the terms of the GNU General Public License as published by
73 the Free Software Foundation; either version 3 of the License, or
74 at your option any later version.
76 This program is distributed in the hope that it will be useful,
77 but WITHOUT ANY WARRANTY; without even the implied warranty of
78 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
79 GNU General Public License for more details.
81 You should have received a copy of the GNU General Public License
82 along with this program. If not, see <http://www.gnu.org/licenses/>.
85 =head1 DEPENDENCIES
87 B<parsort> uses B<sort>, B<bash>, and B<parallel>.
90 =head1 SEE ALSO
92 B<sort>
95 =cut
97 use strict;
98 use Getopt::Long;
99 use POSIX qw(mkfifo);
101 Getopt::Long::Configure("bundling","require_order");
103 my @ARGV_before = @ARGV;
105 GetOptions(
106 "debug|D" => \$opt::D,
107 "version" => \$opt::version,
108 "verbose|v" => \$opt::verbose,
109 "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
110 "d|dictionary-order" => \$opt::dictionary_order,
111 "f|ignore-case" => \$opt::ignore_case,
112 "g|general-numeric-sort" => \$opt::general_numeric_sort,
113 "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
114 "M|month-sort" => \$opt::month_sort,
115 "h|human-numeric-sort" => \$opt::human_numeric_sort,
116 "n|numeric-sort" => \$opt::numeric_sort,
117 "N|numascii" => \$opt::numascii,
118 "r|reverse" => \$opt::reverse,
119 "R|random-sort" => \$opt::random_sort,
120 "sort=s" => \$opt::sort,
121 "V|version-sort" => \$opt::version_sort,
122 "k|key=s" => \@opt::key,
123 "t|field-separator=s" => \$opt::field_separator,
124 "z|zero-terminated" => \$opt::zero_terminated,
125 "files0-from=s" => \$opt::files0_from,
126 "random-source=s" => \$opt::dummy,
127 "batch-size=s" => \$opt::dummy,
128 "check=s" => \$opt::dummy,
129 "c" => \$opt::dummy,
130 "C" => \$opt::dummy,
131 "compress-program=s" => \$opt::dummy,
132 "T|temporary-directory=s" => \$opt::dummy,
133 "parallel=s" => \$opt::parallel,
134 "u|unique" => \$opt::dummy,
135 "S|buffer-size=s" => \$opt::dummy,
136 "s|stable" => \$opt::dummy,
137 "help" => \$opt::dummy,
138 ) || exit(255);
139 $Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
140 $Global::version = 20240522;
141 if($opt::version) { version(); exit 0; }
142 # Remove -D and --parallel=N
143 my @s = (grep { ! /^-D$|^--parallel=\S+$/ }
144 @ARGV_before[0..($#ARGV_before-$#ARGV-1)]);
145 my @sortoptions;
146 while(@s) {
147 my $o = shift @s;
148 # Remove '--parallel N'
149 if($o eq "--parallel") {
150 $o = shift @s;
151 } else {
152 push @sortoptions, $o;
155 @Global::sortoptions = shell_quote(@sortoptions);
156 $ENV{'TMPDIR'} ||= "/tmp";
158 sub merge {
159 # Input:
160 # @cmd = commands to 'cat' (part of) a file
161 # 'cat a' 'cat b' 'cat c' =>
162 # sort -m <(sort -m <(cat a) <(cat b)) <(sort -m <(cat c))
163 my @cmd = @_;
164 chomp(@cmd);
165 while($#cmd > 0) {
166 my @tmp;
167 while($#cmd >= 0) {
168 my $a = shift @cmd;
169 my $b = shift @cmd;
170 $a &&= "<($a)";
171 $b &&= "<($b)";
172 # This looks like useless use of 'cat', but contrary to
173 # naive belief it increases performance dramatically.
174 push @tmp, "sort -m @Global::sortoptions $a $b | cat"
176 @cmd = @tmp;
178 return @cmd;
181 sub sort_files {
182 # Input is files
183 my @files = @_;
184 # Let GNU Parallel generate the commands to read parts of files
185 # The commands split at \n (or \0)
186 # and there will be at least one for each CPU thread
187 my @subopt;
188 if($opt::zero_terminated) { push @subopt, qw(--recend "\0"); }
189 if($opt::parallel) { push @subopt, qw(--jobs), $opt::parallel; }
190 # $uniq is needed because @files could contain \n
191 my $uniq = join "", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..20);
192 open(my $par,"-|",qw(parallel), @subopt,
193 qw(--pipepart --block -1 --dryrun -vv sort),
194 @Global::sortoptions, $uniq, '::::', @files) || die;
195 # Generated commands:
196 # <file perl-catter | (sort ... $uniq )
197 # Use $uniq to split into commands
198 # (We cannot use \n because 'file' may contain newline)
199 my @cmd = map { "$_)\n" } split(/$uniq[)]\n/, join("",<$par>));
200 debug(1,@cmd);
201 close $par;
202 @cmd = merge(@cmd);
203 # The command uses <(...) so it is incompatible with /bin/sh
204 open(my $bash,"|-","bash") || die;
205 print $bash @cmd;
206 close $bash;
209 sub sort_stdin {
210 # Input is stdin
211 # Spread the input between n processes that each sort
212 # n = number of CPU threads
213 my $numthreads;
214 chomp($numthreads = $opt::parallel || `parallel --number-of-threads`);
215 my @fifos = map { tmpfifo() } 1..$numthreads;
216 map { mkfifo($_,0600) } @fifos;
217 # This trick removes the fifo as soon as it is connected in the other end
218 # (rm fifo; ...) < fifo
219 my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" }
220 map { Q($_) } @fifos);
221 @cmd = merge(@cmd);
222 if(fork) {
223 } else {
224 my @subopt = $opt::zero_terminated ? qw(--recend "\0") : ();
225 exec(qw(parallel -0 -j), $numthreads, @subopt,
226 # 286k is the best mean value after testing 250..350
227 qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos);
229 # The command uses <(...) so it is incompatible with /bin/sh
230 open(my $bash,"|-","bash") || die;
231 print $bash @cmd;
232 close $bash;
235 sub tmpname {
236 # Select a name that does not exist
237 # Do not create the file as it may be used for creating a socket (by tmux)
238 # Remember the name in $Global::unlink to avoid hitting the same name twice
239 my $name = shift;
240 my($tmpname);
241 if(not -w $ENV{'TMPDIR'}) {
242 if(not -e $ENV{'TMPDIR'}) {
243 ::error("Tmpdir '$ENV{'TMPDIR'}' does not exist.","Try 'mkdir ".
244 Q($ENV{'TMPDIR'})."'");
245 } else {
246 ::error("Tmpdir '$ENV{'TMPDIR'}' is not writable.","Try 'chmod +w ".
247 Q($ENV{'TMPDIR'})."'");
249 exit(255);
251 do {
252 $tmpname = $ENV{'TMPDIR'}."/".$name.
253 join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);
254 } while(-e $tmpname);
255 return $tmpname;
258 sub tmpfifo {
259 # Find an unused name and mkfifo on it
260 my $tmpfifo = tmpname("psort");
261 mkfifo($tmpfifo,0600);
262 return $tmpfifo;
265 sub debug {
266 # Returns: N/A
267 $opt::D or return;
268 @_ = grep { defined $_ ? $_ : "" } @_;
269 print STDERR @_[1..$#_];
272 sub version() {
273 # Returns: N/A
274 print join
275 ("\n",
276 "GNU $Global::progname $Global::version",
277 "Copyright (C) 2020-2024 Ole Tange, http://ole.tange.dk and Free Software",
278 "Foundation, Inc.",
279 "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",
280 "This is free software: you are free to change and redistribute it.",
281 "GNU $Global::progname comes with no warranty.",
283 "Web site: https://www.gnu.org/software/parallel\n",
287 sub shell_quote(@) {
288 # Input:
289 # @strings = strings to be quoted
290 # Returns:
291 # @shell_quoted_strings = string quoted as needed by the shell
292 return wantarray ? (map { Q($_) } @_) : (join" ",map { Q($_) } @_);
295 sub shell_quote_scalar_rc($) {
296 # Quote for the rc-shell
297 my $a = $_[0];
298 if(defined $a) {
299 if(($a =~ s/'/''/g)
301 ($a =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]+/'$&'/go)) {
302 # A string was replaced
303 # No need to test for "" or \0
304 } elsif($a eq "") {
305 $a = "''";
306 } elsif($a eq "\0") {
307 $a = "";
310 return $a;
313 sub shell_quote_scalar_csh($) {
314 # Quote for (t)csh
315 my $a = $_[0];
316 if(defined $a) {
317 # $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g;
318 # This is 1% faster than the above
319 if(($a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]/\\$&/go)
321 # quote newline in csh as \\\n
322 ($a =~ s/[\n]/"\\\n"/go)) {
323 # A string was replaced
324 # No need to test for "" or \0
325 } elsif($a eq "") {
326 $a = "''";
327 } elsif($a eq "\0") {
328 $a = "";
331 return $a;
334 sub shell_quote_scalar_default($) {
335 # Quote for other shells (Bourne compatibles)
336 # Inputs:
337 # $string = string to be quoted
338 # Returns:
339 # $shell_quoted = string quoted as needed by the shell
340 my $s = $_[0];
341 if($s =~ /[^-_.+a-z0-9\/]/i) {
342 $s =~ s/'/'"'"'/g; # "-quote single quotes
343 $s = "'$s'"; # '-quote entire string
344 $s =~ s/^''//; # Remove unneeded '' at ends
345 $s =~ s/''$//; # (faster than s/^''|''$//g)
346 return $s;
347 } elsif ($s eq "") {
348 return "''";
349 } else {
350 # No quoting needed
351 return $s;
355 sub shell_quote_scalar($) {
356 # Quote the string so the shell will not expand any special chars
357 # Inputs:
358 # $string = string to be quoted
359 # Returns:
360 # $shell_quoted = string quoted as needed by the shell
362 # Speed optimization: Choose the correct shell_quote_scalar_*
363 # and call that directly from now on
364 no warnings 'redefine';
365 if($Global::cshell) {
366 # (t)csh
367 *shell_quote_scalar = \&shell_quote_scalar_csh;
368 } elsif($Global::shell =~ m:(^|/)rc$:) {
369 # rc-shell
370 *shell_quote_scalar = \&shell_quote_scalar_rc;
371 } else {
372 # other shells
373 *shell_quote_scalar = \&shell_quote_scalar_default;
375 # The sub is now redefined. Call it
376 return shell_quote_scalar($_[0]);
379 sub Q($) {
380 # Q alias for ::shell_quote_scalar
381 my $ret = shell_quote_scalar($_[0]);
382 no warnings 'redefine';
383 *Q = \&::shell_quote_scalar;
384 return $ret;
388 sub status(@) {
389 my @w = @_;
390 my $fh = $Global::status_fd || *STDERR;
391 print $fh map { ($_, "\n") } @w;
392 flush $fh;
395 sub status_no_nl(@) {
396 my @w = @_;
397 my $fh = $Global::status_fd || *STDERR;
398 print $fh @w;
399 flush $fh;
402 sub warning(@) {
403 my @w = @_;
404 my $prog = $Global::progname || "parsort";
405 status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
409 my %warnings;
410 sub warning_once(@) {
411 my @w = @_;
412 my $prog = $Global::progname || "parsort";
413 $warnings{@w}++ or
414 status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
418 sub error(@) {
419 my @w = @_;
420 my $prog = $Global::progname || "parsort";
421 status(map { ($prog.": Error: ". $_); } @w);
424 sub die_bug($) {
425 my $bugid = shift;
426 print STDERR
427 ("$Global::progname: This should not happen. You have found a bug. ",
428 "Please follow\n",
429 "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",
430 "\n",
431 "Include this in the report:\n",
432 "* The version number: $Global::version\n",
433 "* The bugid: $bugid\n",
434 "* The command line being run\n",
435 "* The files being read (put the files on a webserver if they are big)\n",
436 "\n",
437 "If you get the error on smaller/fewer files, please include those instead.\n");
438 exit(255);
441 if(@ARGV) {
442 sort_files(@ARGV);
443 } elsif(length $opt::files0_from) {
444 $/="\0";
445 open(my $fh,"<",$opt::files0_from) || die;
446 my @files = <$fh>;
447 chomp(@files);
448 sort_files(@files);
449 } else {
450 sort_stdin();
453 # Test
454 # -z
455 # OK: cat bigfile | parsort
456 # OK: parsort -k4n files*.txt
457 # OK: parsort files*.txt
458 # OK: parsort "file with space"