src/parsort

   1 #!/usr/bin/perl
   2
   3 # SPDX-FileCopyrightText: 2021-2022 Ole Tange, http://ole.tange.dk and Free Software and Foundation, Inc.
   4 # SPDX-License-Identifier: GPL-3.0-or-later
   5
   6 =pod
   7
   8 =head1 NAME
   9
  10 parsort - Sort (big files) in parallel
  11
  12
  13 =head1 SYNOPSIS
  14
  15 B<parsort> I<options for sort>
  16
  17
  18 =head1 DESCRIPTION
  19
  20 B<parsort> uses GNU B<sort> to sort in parallel. It works just like
  21 B<sort> but faster on inputs with more than 1 M lines, if you have a
  22 multicore machine.
  23
  24 Hopefully these ideas will make it into GNU B<sort> in the future.
  25
  26
  27 =head1 EXAMPLE
  28
  29 Sort files:
  30
  31   parsort *.txt > sorted.txt
  32
  33 Sort stdin (standard input) numerically:
  34
  35   cat numbers | parsort -n > sorted.txt
  36
  37
  38 =head1 PERFORMANCE
  39
  40 B<parsort> is faster on a file than on stdin (standard input), because
  41 different parts of a file can be read in parallel.
  42
  43 On a 48 core machine you should see a speedup of 3x over B<sort>.
  44
  45
  46 =head1 AUTHOR
  47
  48 Copyright (C) 2020-2022 Ole Tange,
  49 http://ole.tange.dk and Free Software Foundation, Inc.
  50
  51
  52 =head1 LICENSE
  53
  54 Copyright (C) 2012 Free Software Foundation, Inc.
  55
  56 This program is free software; you can redistribute it and/or modify
  57 it under the terms of the GNU General Public License as published by
  58 the Free Software Foundation; either version 3 of the License, or
  59 at your option any later version.
  60
  61 This program is distributed in the hope that it will be useful,
  62 but WITHOUT ANY WARRANTY; without even the implied warranty of
  63 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  64 GNU General Public License for more details.
  65
  66 You should have received a copy of the GNU General Public License
  67 along with this program.  If not, see <http://www.gnu.org/licenses/>.
  68
  69
  70 =head1 DEPENDENCIES
  71
  72 B<parsort> uses B<sort>, B<bash>, and B<parallel>.
  73
  74
  75 =head1 SEE ALSO
  76
  77 B<sort>
  78
  79
  80 =cut
  81
  82 use strict;
  83 use Getopt::Long;
  84 use POSIX qw(mkfifo);
  85
  86 Getopt::Long::Configure("bundling","require_order");
  87
  88 my @ARGV_before = @ARGV;
  89
  90 GetOptions(
  91     "debug|D" => \$opt::D,
  92     "version" => \$opt::version,
  93     "verbose|v" => \$opt::verbose,
  94     "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
  95     "d|dictionary-order" => \$opt::dictionary_order,
  96     "f|ignore-case" => \$opt::ignore_case,
  97     "g|general-numeric-sort" => \$opt::general_numeric_sort,
  98     "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
  99     "M|month-sort" => \$opt::month_sort,
 100     "h|human-numeric-sort" => \$opt::human_numeric_sort,
 101     "n|numeric-sort" => \$opt::numeric_sort,
 102     "N|numascii" => \$opt::numascii,
 103     "r|reverse" => \$opt::reverse,
 104     "R|random-sort" => \$opt::random_sort,
 105     "sort=s" => \$opt::sort,
 106     "V|version-sort" => \$opt::version_sort,
 107     "k|key=s" => \@opt::key,
 108     "t|field-separator=s" => \$opt::field_separator,
 109     "z|zero-terminated" => \$opt::zero_terminated,
 110     "files0-from=s" => \$opt::files0_from,
 111     "random-source=s" => \$opt::dummy,
 112     "batch-size=s" => \$opt::dummy,
 113     "check=s" => \$opt::dummy,
 114     "c" => \$opt::dummy,
 115     "C" => \$opt::dummy,
 116     "compress-program=s" => \$opt::dummy,
 117     "T|temporary-directory=s" => \$opt::dummy,
 118     "parallel=s" => \$opt::dummy,
 119     "u|unique" => \$opt::dummy,
 120     "S|buffer-size=s" => \$opt::dummy,
 121     "s|stable" => \$opt::dummy,
 122     "help" => \$opt::dummy,
 123     ) || exit(255);
 124 $Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
 125 $Global::version = 20221122;
 126 if($opt::version) { version(); exit 0; }
 127 @Global::sortoptions = grep { ! /^-D$/ }
 128     shell_quote(@ARGV_before[0..($#ARGV_before-$#ARGV-1)]);
 129
 130 $ENV{'TMPDIR'} ||= "/tmp";
 131
 132 sub merge {
 133     # Input:
 134     #   @cmd = commands to 'cat' (part of) a file
 135     # 'cat a' 'cat b' 'cat c' =>
 136     # sort -m <(sort -m <(cat a) <(cat b)) <(sort -m <(cat c))
 137     my @cmd = @_;
 138     chomp(@cmd);
 139     while($#cmd > 0) {
 140         my @tmp;
 141         while($#cmd >= 0) {
 142             my $a = shift @cmd;
 143             my $b = shift @cmd;
 144             $a &&= "<($a)";
 145             $b &&= "<($b)";
 146             # This looks like useless use of 'cat', but contrary to
 147             # naive belief it increases performance dramatically.
 148             push @tmp, "sort -m @Global::sortoptions $a $b | cat"
 149         }
 150         @cmd = @tmp;
 151     }
 152     return @cmd;
 153 }
 154
 155 sub sort_files {
 156     # Input is files
 157     my @files = @_;
 158     # Let GNU Parallel generate the commands to read parts of files
 159     # The commands split at \n (or \0)
 160     # and there will be at least one for each CPU thread
 161     my @subopt = $opt::zero_terminated ? qw(--recend "\0") : ();
 162     open(my $par,"-|",qw(parallel), @subopt,
 163          qw(--pipepart --block -1 --dryrun -vv sort),
 164          @Global::sortoptions, '::::', @files) || die;
 165     my @cmd = merge(<$par>);
 166     close $par;
 167     debug(@cmd);
 168     # The command uses <(...) so it is incompatible with /bin/sh
 169     open(my $bash,"|-","bash") || die;
 170     print $bash @cmd;
 171     close $bash;
 172 }
 173
 174 sub sort_stdin {
 175     # Input is stdin
 176     # Spread the input between n processes that each sort
 177     # n = number of CPU threads
 178     my $numthreads = `parallel --number-of-threads`;
 179     my @fifos = map { tmpfifo() } 1..$numthreads;
 180     map { mkfifo($_,0600) } @fifos;
 181     # This trick removes the fifo as soon as it is connected in the other end
 182     # (rm fifo; ...) < fifo
 183     my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" }
 184                map { Q($_) } @fifos);
 185     @cmd = merge(@cmd);
 186     if(fork) {
 187     } else {
 188         my @subopt = $opt::zero_terminated ? qw(--recend "\0") : ();
 189         exec(qw(parallel -j), $numthreads, @subopt,
 190              # 286k is the best mean value after testing 250..350
 191              qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos);
 192     }
 193     # The command uses <(...) so it is incompatible with /bin/sh
 194     open(my $bash,"|-","bash") || die;
 195     print $bash @cmd;
 196     close $bash;
 197 }
 198
 199 sub tmpname {
 200     # Select a name that does not exist
 201     # Do not create the file as it may be used for creating a socket (by tmux)
 202     # Remember the name in $Global::unlink to avoid hitting the same name twice
 203     my $name = shift;
 204     my($tmpname);
 205     if(not -w $ENV{'TMPDIR'}) {
 206         if(not -e $ENV{'TMPDIR'}) {
 207             ::error("Tmpdir '$ENV{'TMPDIR'}' does not exist.","Try 'mkdir ".
 208                     Q($ENV{'TMPDIR'})."'");
 209         } else {
 210             ::error("Tmpdir '$ENV{'TMPDIR'}' is not writable.","Try 'chmod +w ".
 211                     Q($ENV{'TMPDIR'})."'");
 212         }
 213         exit(255);
 214     }
 215     do {
 216         $tmpname = $ENV{'TMPDIR'}."/".$name.
 217             join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);
 218     } while(-e $tmpname);
 219     return $tmpname;
 220 }
 221
 222 sub tmpfifo {
 223     # Find an unused name and mkfifo on it
 224     my $tmpfifo = tmpname("psort");
 225     mkfifo($tmpfifo,0600);
 226     return $tmpfifo;
 227 }
 228
 229 sub debug {
 230     # Returns: N/A
 231     $opt::D or return;
 232     @_ = grep { defined $_ ? $_ : "" } @_;
 233     print STDERR @_[1..$#_];
 234 }
 235
 236 sub version() {
 237     # Returns: N/A
 238     print join
 239         ("\n",
 240          "GNU $Global::progname $Global::version",
 241          "Copyright (C) 2020-2022 Ole Tange, http://ole.tange.dk and Free Software",
 242          "Foundation, Inc.",
 243          "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",
 244          "This is free software: you are free to change and redistribute it.",
 245          "GNU $Global::progname comes with no warranty.",
 246          "",
 247          "Web site: https://www.gnu.org/software/parallel\n",
 248         );
 249 }
 250
 251 sub shell_quote(@) {
 252     # Input:
 253     #   @strings = strings to be quoted
 254     # Returns:
 255     #   @shell_quoted_strings = string quoted as needed by the shell
 256     return wantarray ? (map { Q($_) } @_) : (join" ",map { Q($_) } @_);
 257 }
 258
 259 sub shell_quote_scalar_rc($) {
 260     # Quote for the rc-shell
 261     my $a = $_[0];
 262     if(defined $a) {
 263         if(($a =~ s/'/''/g)
 264            +
 265            ($a =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]+/'$&'/go)) {
 266             # A string was replaced
 267             # No need to test for "" or \0
 268         } elsif($a eq "") {
 269             $a = "''";
 270         } elsif($a eq "\0") {
 271             $a = "";
 272         }
 273     }
 274     return $a;
 275 }
 276
 277 sub shell_quote_scalar_csh($) {
 278     # Quote for (t)csh
 279     my $a = $_[0];
 280     if(defined $a) {
 281         # $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g;
 282         # This is 1% faster than the above
 283         if(($a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]/\\$&/go)
 284            +
 285            # quote newline in csh as \\\n
 286            ($a =~ s/[\n]/"\\\n"/go)) {
 287             # A string was replaced
 288             # No need to test for "" or \0
 289         } elsif($a eq "") {
 290             $a = "''";
 291         } elsif($a eq "\0") {
 292             $a = "";
 293         }
 294     }
 295     return $a;
 296 }
 297
 298 sub shell_quote_scalar_default($) {
 299     # Quote for other shells (Bourne compatibles)
 300     # Inputs:
 301     #   $string = string to be quoted
 302     # Returns:
 303     #   $shell_quoted = string quoted as needed by the shell
 304     my $s = $_[0];
 305     if($s =~ /[^-_.+a-z0-9\/]/i) {
 306         $s =~ s/'/'"'"'/g; # "-quote single quotes
 307         $s = "'$s'";       # '-quote entire string
 308         $s =~ s/^''//;     # Remove unneeded '' at ends
 309         $s =~ s/''$//;     # (faster than s/^''|''$//g)
 310         return $s;
 311     } elsif ($s eq "") {
 312         return "''";
 313     } else {
 314         # No quoting needed
 315         return $s;
 316     }
 317 }
 318
 319 sub shell_quote_scalar($) {
 320     # Quote the string so the shell will not expand any special chars
 321     # Inputs:
 322     #   $string = string to be quoted
 323     # Returns:
 324     #   $shell_quoted = string quoted as needed by the shell
 325
 326     # Speed optimization: Choose the correct shell_quote_scalar_*
 327     # and call that directly from now on
 328     no warnings 'redefine';
 329     if($Global::cshell) {
 330         # (t)csh
 331         *shell_quote_scalar = \&shell_quote_scalar_csh;
 332     } elsif($Global::shell =~ m:(^|/)rc$:) {
 333         # rc-shell
 334         *shell_quote_scalar = \&shell_quote_scalar_rc;
 335     } else {
 336         # other shells
 337         *shell_quote_scalar = \&shell_quote_scalar_default;
 338     }
 339     # The sub is now redefined. Call it
 340     return shell_quote_scalar($_[0]);
 341 }
 342
 343 sub Q($) {
 344     # Q alias for ::shell_quote_scalar
 345     my $ret = shell_quote_scalar($_[0]);
 346     no warnings 'redefine';
 347     *Q = \&::shell_quote_scalar;
 348     return $ret;
 349 }
 350
 351
 352 sub status(@) {
 353     my @w = @_;
 354     my $fh = $Global::status_fd || *STDERR;
 355     print $fh map { ($_, "\n") } @w;
 356     flush $fh;
 357 }
 358
 359 sub status_no_nl(@) {
 360     my @w = @_;
 361     my $fh = $Global::status_fd || *STDERR;
 362     print $fh @w;
 363     flush $fh;
 364 }
 365
 366 sub warning(@) {
 367     my @w = @_;
 368     my $prog = $Global::progname || "parsort";
 369     status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
 370 }
 371
 372 {
 373     my %warnings;
 374     sub warning_once(@) {
 375         my @w = @_;
 376         my $prog = $Global::progname || "parsort";
 377         $warnings{@w}++ or
 378             status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
 379     }
 380 }
 381
 382 sub error(@) {
 383     my @w = @_;
 384     my $prog = $Global::progname || "parsort";
 385     status(map { ($prog.": Error: ". $_); } @w);
 386 }
 387
 388 sub die_bug($) {
 389     my $bugid = shift;
 390     print STDERR
 391         ("$Global::progname: This should not happen. You have found a bug. ",
 392          "Please follow\n",
 393          "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",
 394          "\n",
 395          "Include this in the report:\n",
 396          "* The version number: $Global::version\n",
 397          "* The bugid: $bugid\n",
 398          "* The command line being run\n",
 399          "* The files being read (put the files on a webserver if they are big)\n",
 400          "\n",
 401          "If you get the error on smaller/fewer files, please include those instead.\n");
 402     exit(255);
 403 }
 404
 405 if(@ARGV) {
 406     sort_files(@ARGV);
 407 } elsif(length $opt::files0_from) {
 408     $/="\0";
 409     open(my $fh,"<",$opt::files0_from) || die;
 410     my @files = <$fh>;
 411     chomp(@files);
 412     sort_files(@files);
 413 } else {
 414     sort_stdin();
 415 }
 416
 417 # Test
 418 # -z
 419 # OK: cat bigfile | parsort
 420 # OK: parsort -k4n files*.txt
 421 # OK: parsort files*.txt
 422 # OK: parsort "file with space"
 423