src/parsort

   1 #!/usr/bin/perl
   2
   3 # SPDX-FileCopyrightText: 2021-2024 Ole Tange, http://ole.tange.dk and Free Software and Foundation, Inc.
   4 # SPDX-License-Identifier: GPL-3.0-or-later
   5
   6 =pod
   7
   8 =head1 NAME
   9
  10 parsort - Sort (big files) in parallel
  11
  12
  13 =head1 SYNOPSIS
  14
  15 B<parsort> I<options for sort>
  16
  17
  18 =head1 DESCRIPTION
  19
  20 B<parsort> uses GNU B<sort> to sort in parallel. It works just like
  21 B<sort> but faster on inputs with more than 1 M lines, if you have a
  22 multicore machine.
  23
  24 Hopefully these ideas will make it into GNU B<sort> in the future.
  25
  26
  27 =head1 OPTIONS
  28
  29 Same as B<sort>. Except:
  30
  31 =over 4
  32
  33 =item B<--parallel=>I<N>
  34
  35 Change the number of sorts run concurrently to I<N>. I<N> will be
  36 increased to number of files if B<parsort> is given more than I<N>
  37 files.
  38
  39 =back
  40
  41
  42 =head1 EXAMPLE
  43
  44 Sort files:
  45
  46   parsort *.txt > sorted.txt
  47
  48 Sort stdin (standard input) numerically:
  49
  50   cat numbers | parsort -n > sorted.txt
  51
  52
  53 =head1 PERFORMANCE
  54
  55 B<parsort> is faster on files than on stdin (standard input), because
  56 different parts of a file can be read in parallel.
  57
  58 On a 48 core machine you should see a speedup of 3x over B<sort>.
  59
  60
  61 =head1 AUTHOR
  62
  63 Copyright (C) 2020-2024 Ole Tange,
  64 http://ole.tange.dk and Free Software Foundation, Inc.
  65
  66
  67 =head1 LICENSE
  68
  69 Copyright (C) 2012 Free Software Foundation, Inc.
  70
  71 This program is free software; you can redistribute it and/or modify
  72 it under the terms of the GNU General Public License as published by
  73 the Free Software Foundation; either version 3 of the License, or
  74 at your option any later version.
  75
  76 This program is distributed in the hope that it will be useful,
  77 but WITHOUT ANY WARRANTY; without even the implied warranty of
  78 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  79 GNU General Public License for more details.
  80
  81 You should have received a copy of the GNU General Public License
  82 along with this program.  If not, see <http://www.gnu.org/licenses/>.
  83
  84
  85 =head1 DEPENDENCIES
  86
  87 B<parsort> uses B<sort>, B<bash>, and B<parallel>.
  88
  89
  90 =head1 SEE ALSO
  91
  92 B<sort>
  93
  94
  95 =cut
  96
  97 use strict;
  98 use Getopt::Long;
  99 use POSIX qw(mkfifo);
 100
 101 Getopt::Long::Configure("bundling","require_order");
 102
 103 my @ARGV_before = @ARGV;
 104
 105 GetOptions(
 106     "debug|D" => \$opt::D,
 107     "version" => \$opt::version,
 108     "verbose|v" => \$opt::verbose,
 109     "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
 110     "d|dictionary-order" => \$opt::dictionary_order,
 111     "f|ignore-case" => \$opt::ignore_case,
 112     "g|general-numeric-sort" => \$opt::general_numeric_sort,
 113     "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
 114     "M|month-sort" => \$opt::month_sort,
 115     "h|human-numeric-sort" => \$opt::human_numeric_sort,
 116     "n|numeric-sort" => \$opt::numeric_sort,
 117     "N|numascii" => \$opt::numascii,
 118     "r|reverse" => \$opt::reverse,
 119     "R|random-sort" => \$opt::random_sort,
 120     "sort=s" => \$opt::sort,
 121     "V|version-sort" => \$opt::version_sort,
 122     "k|key=s" => \@opt::key,
 123     "t|field-separator=s" => \$opt::field_separator,
 124     "z|zero-terminated" => \$opt::zero_terminated,
 125     "files0-from=s" => \$opt::files0_from,
 126     "random-source=s" => \$opt::dummy,
 127     "batch-size=s" => \$opt::dummy,
 128     "check=s" => \$opt::dummy,
 129     "c" => \$opt::dummy,
 130     "C" => \$opt::dummy,
 131     "compress-program=s" => \$opt::dummy,
 132     "T|temporary-directory=s" => \$opt::dummy,
 133     "parallel=s" => \$opt::parallel,
 134     "u|unique" => \$opt::dummy,
 135     "S|buffer-size=s" => \$opt::dummy,
 136     "s|stable" => \$opt::dummy,
 137     "help" => \$opt::dummy,
 138     ) || exit(255);
 139 $Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
 140 $Global::version = 20240522;
 141 if($opt::version) { version(); exit 0; }
 142 # Remove -D and --parallel=N
 143 my @s = (grep { ! /^-D$|^--parallel=\S+$/ }
 144                    @ARGV_before[0..($#ARGV_before-$#ARGV-1)]);
 145 my @sortoptions;
 146 while(@s) {
 147     my $o = shift @s;
 148     # Remove '--parallel N'
 149     if($o eq "--parallel") {
 150         $o = shift @s;
 151     } else {
 152         push @sortoptions, $o;
 153     }
 154 }
 155 @Global::sortoptions = shell_quote(@sortoptions);
 156 $ENV{'TMPDIR'} ||= "/tmp";
 157
 158 sub merge {
 159     # Input:
 160     #   @cmd = commands to 'cat' (part of) a file
 161     # 'cat a' 'cat b' 'cat c' =>
 162     # sort -m <(sort -m <(cat a) <(cat b)) <(sort -m <(cat c))
 163     my @cmd = @_;
 164     chomp(@cmd);
 165     while($#cmd > 0) {
 166         my @tmp;
 167         while($#cmd >= 0) {
 168             my $a = shift @cmd;
 169             my $b = shift @cmd;
 170             $a &&= "<($a)";
 171             $b &&= "<($b)";
 172             # This looks like useless use of 'cat', but contrary to
 173             # naive belief it increases performance dramatically.
 174             push @tmp, "sort -m @Global::sortoptions $a $b | cat"
 175         }
 176         @cmd = @tmp;
 177     }
 178     return @cmd;
 179 }
 180
 181 sub sort_files {
 182     # Input is files
 183     my @files = @_;
 184     # Let GNU Parallel generate the commands to read parts of files
 185     # The commands split at \n (or \0)
 186     # and there will be at least one for each CPU thread
 187     my @subopt;
 188     if($opt::zero_terminated) { push @subopt, qw(--recend "\0"); }
 189     if($opt::parallel) { push @subopt, qw(--jobs), $opt::parallel; }
 190     # $uniq is needed because @files could contain \n
 191     my $uniq = join "", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..20);
 192     open(my $par,"-|",qw(parallel), @subopt,
 193          qw(--pipepart --block -1 --dryrun -vv sort),
 194          @Global::sortoptions, $uniq, '::::', @files) || die;
 195     # Generated commands:
 196     # <file perl-catter | (sort ... $uniq )
 197     # Use $uniq to split into commands
 198     # (We cannot use \n because 'file' may contain newline)
 199     my @cmd = map { "$_)\n" } split(/$uniq[)]\n/, join("",<$par>));
 200     debug(1,@cmd);
 201     close $par;
 202     @cmd = merge(@cmd);
 203     # The command uses <(...) so it is incompatible with /bin/sh
 204     open(my $bash,"|-","bash") || die;
 205     print $bash @cmd;
 206     close $bash;
 207 }
 208
 209 sub sort_stdin {
 210     # Input is stdin
 211     # Spread the input between n processes that each sort
 212     # n = number of CPU threads
 213     my $numthreads;
 214     chomp($numthreads = $opt::parallel || `parallel --number-of-threads`);
 215     my @fifos = map { tmpfifo() } 1..$numthreads;
 216     map { mkfifo($_,0600) } @fifos;
 217     # This trick removes the fifo as soon as it is connected in the other end
 218     # (rm fifo; ...) < fifo
 219     my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" }
 220                map { Q($_) } @fifos);
 221     @cmd = merge(@cmd);
 222     if(fork) {
 223     } else {
 224         my @subopt = $opt::zero_terminated ? qw(--recend "\0") : ();
 225         exec(qw(parallel -0 -j), $numthreads, @subopt,
 226              # 286k is the best mean value after testing 250..350
 227              qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos);
 228     }
 229     # The command uses <(...) so it is incompatible with /bin/sh
 230     open(my $bash,"|-","bash") || die;
 231     print $bash @cmd;
 232     close $bash;
 233 }
 234
 235 sub tmpname {
 236     # Select a name that does not exist
 237     # Do not create the file as it may be used for creating a socket (by tmux)
 238     # Remember the name in $Global::unlink to avoid hitting the same name twice
 239     my $name = shift;
 240     my($tmpname);
 241     if(not -w $ENV{'TMPDIR'}) {
 242         if(not -e $ENV{'TMPDIR'}) {
 243             ::error("Tmpdir '$ENV{'TMPDIR'}' does not exist.","Try 'mkdir ".
 244                     Q($ENV{'TMPDIR'})."'");
 245         } else {
 246             ::error("Tmpdir '$ENV{'TMPDIR'}' is not writable.","Try 'chmod +w ".
 247                     Q($ENV{'TMPDIR'})."'");
 248         }
 249         exit(255);
 250     }
 251     do {
 252         $tmpname = $ENV{'TMPDIR'}."/".$name.
 253             join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);
 254     } while(-e $tmpname);
 255     return $tmpname;
 256 }
 257
 258 sub tmpfifo {
 259     # Find an unused name and mkfifo on it
 260     my $tmpfifo = tmpname("psort");
 261     mkfifo($tmpfifo,0600);
 262     return $tmpfifo;
 263 }
 264
 265 sub debug {
 266     # Returns: N/A
 267     $opt::D or return;
 268     @_ = grep { defined $_ ? $_ : "" } @_;
 269     print STDERR @_[1..$#_];
 270 }
 271
 272 sub version() {
 273     # Returns: N/A
 274     print join
 275         ("\n",
 276          "GNU $Global::progname $Global::version",
 277          "Copyright (C) 2020-2024 Ole Tange, http://ole.tange.dk and Free Software",
 278          "Foundation, Inc.",
 279          "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",
 280          "This is free software: you are free to change and redistribute it.",
 281          "GNU $Global::progname comes with no warranty.",
 282          "",
 283          "Web site: https://www.gnu.org/software/parallel\n",
 284         );
 285 }
 286
 287 sub shell_quote(@) {
 288     # Input:
 289     #   @strings = strings to be quoted
 290     # Returns:
 291     #   @shell_quoted_strings = string quoted as needed by the shell
 292     return wantarray ? (map { Q($_) } @_) : (join" ",map { Q($_) } @_);
 293 }
 294
 295 sub shell_quote_scalar_rc($) {
 296     # Quote for the rc-shell
 297     my $a = $_[0];
 298     if(defined $a) {
 299         if(($a =~ s/'/''/g)
 300            +
 301            ($a =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]+/'$&'/go)) {
 302             # A string was replaced
 303             # No need to test for "" or \0
 304         } elsif($a eq "") {
 305             $a = "''";
 306         } elsif($a eq "\0") {
 307             $a = "";
 308         }
 309     }
 310     return $a;
 311 }
 312
 313 sub shell_quote_scalar_csh($) {
 314     # Quote for (t)csh
 315     my $a = $_[0];
 316     if(defined $a) {
 317         # $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g;
 318         # This is 1% faster than the above
 319         if(($a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]/\\$&/go)
 320            +
 321            # quote newline in csh as \\\n
 322            ($a =~ s/[\n]/"\\\n"/go)) {
 323             # A string was replaced
 324             # No need to test for "" or \0
 325         } elsif($a eq "") {
 326             $a = "''";
 327         } elsif($a eq "\0") {
 328             $a = "";
 329         }
 330     }
 331     return $a;
 332 }
 333
 334 sub shell_quote_scalar_default($) {
 335     # Quote for other shells (Bourne compatibles)
 336     # Inputs:
 337     #   $string = string to be quoted
 338     # Returns:
 339     #   $shell_quoted = string quoted as needed by the shell
 340     my $s = $_[0];
 341     if($s =~ /[^-_.+a-z0-9\/]/i) {
 342         $s =~ s/'/'"'"'/g; # "-quote single quotes
 343         $s = "'$s'";       # '-quote entire string
 344         $s =~ s/^''//;     # Remove unneeded '' at ends
 345         $s =~ s/''$//;     # (faster than s/^''|''$//g)
 346         return $s;
 347     } elsif ($s eq "") {
 348         return "''";
 349     } else {
 350         # No quoting needed
 351         return $s;
 352     }
 353 }
 354
 355 sub shell_quote_scalar($) {
 356     # Quote the string so the shell will not expand any special chars
 357     # Inputs:
 358     #   $string = string to be quoted
 359     # Returns:
 360     #   $shell_quoted = string quoted as needed by the shell
 361
 362     # Speed optimization: Choose the correct shell_quote_scalar_*
 363     # and call that directly from now on
 364     no warnings 'redefine';
 365     if($Global::cshell) {
 366         # (t)csh
 367         *shell_quote_scalar = \&shell_quote_scalar_csh;
 368     } elsif($Global::shell =~ m:(^|/)rc$:) {
 369         # rc-shell
 370         *shell_quote_scalar = \&shell_quote_scalar_rc;
 371     } else {
 372         # other shells
 373         *shell_quote_scalar = \&shell_quote_scalar_default;
 374     }
 375     # The sub is now redefined. Call it
 376     return shell_quote_scalar($_[0]);
 377 }
 378
 379 sub Q($) {
 380     # Q alias for ::shell_quote_scalar
 381     my $ret = shell_quote_scalar($_[0]);
 382     no warnings 'redefine';
 383     *Q = \&::shell_quote_scalar;
 384     return $ret;
 385 }
 386
 387
 388 sub status(@) {
 389     my @w = @_;
 390     my $fh = $Global::status_fd || *STDERR;
 391     print $fh map { ($_, "\n") } @w;
 392     flush $fh;
 393 }
 394
 395 sub status_no_nl(@) {
 396     my @w = @_;
 397     my $fh = $Global::status_fd || *STDERR;
 398     print $fh @w;
 399     flush $fh;
 400 }
 401
 402 sub warning(@) {
 403     my @w = @_;
 404     my $prog = $Global::progname || "parsort";
 405     status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
 406 }
 407
 408 {
 409     my %warnings;
 410     sub warning_once(@) {
 411         my @w = @_;
 412         my $prog = $Global::progname || "parsort";
 413         $warnings{@w}++ or
 414             status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
 415     }
 416 }
 417
 418 sub error(@) {
 419     my @w = @_;
 420     my $prog = $Global::progname || "parsort";
 421     status(map { ($prog.": Error: ". $_); } @w);
 422 }
 423
 424 sub die_bug($) {
 425     my $bugid = shift;
 426     print STDERR
 427         ("$Global::progname: This should not happen. You have found a bug. ",
 428          "Please follow\n",
 429          "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",
 430          "\n",
 431          "Include this in the report:\n",
 432          "* The version number: $Global::version\n",
 433          "* The bugid: $bugid\n",
 434          "* The command line being run\n",
 435          "* The files being read (put the files on a webserver if they are big)\n",
 436          "\n",
 437          "If you get the error on smaller/fewer files, please include those instead.\n");
 438     exit(255);
 439 }
 440
 441 if(@ARGV) {
 442     sort_files(@ARGV);
 443 } elsif(length $opt::files0_from) {
 444     $/="\0";
 445     open(my $fh,"<",$opt::files0_from) || die;
 446     my @files = <$fh>;
 447     chomp(@files);
 448     sort_files(@files);
 449 } else {
 450     sort_stdin();
 451 }
 452
 453 # Test
 454 # -z
 455 # OK: cat bigfile | parsort
 456 # OK: parsort -k4n files*.txt
 457 # OK: parsort files*.txt
 458 # OK: parsort "file with space"
 459