scripts/seq/bp_seqpart.pl

   1 #!/usr/bin/env perl
   2 use strict;
   3 use warnings;
   4
   5 =head1 NAME
   6
   7 I<bp_seqpart.pl> - Takes one or more sequence files and splits them into a number of load balanced files.
   8
   9 =head1 USAGE
  10
  11  bp_seqpart.pl -n <NUM_PARTS> [-h, -p <PREFIX>, -f <FORMAT>, -o <OUT_DIR>] <FILES...>
  12
  13    -n number of files to create through partitioning
  14    -h this help message
  15    -p prefix for all FASTA file names output, files are of the form <outdir>/<prefix>#.<format>
  16    -f format of the files, defaults to FASTA but you can specify anything supported by SeqIO from BioPerl
  17    -o output directory where to dump the split sequence files
  18
  19 =head1 DESCRIPTION
  20
  21 Script wrapping SeqIO that allows partitioning of multiple sequence files into near equal sized parts for later parallel processing. Even if you have 10 input files outputting to 10 files will balance the files to contain similar total length of sequence. ID's are ignored when deciding on how to balance each sequence.
  22
  23 =head1 AUTHOR
  24
  25 B<Matt Oates> - I<Matt.Oates@bristol.ac.uk>
  26
  27 =head1 FEEDBACK
  28
  29 =head2 Mailing Lists
  30
  31 User feedback is an integral part of the evolution of this and other
  32 Bioperl modules. Send your comments and suggestions preferably to
  33 the Bioperl mailing list.  Your participation is much appreciated.
  34
  35   bioperl-l@bioperl.org                  - General discussion
  36   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  37
  38 =head2 Reporting Bugs
  39
  40 Report bugs to the Bioperl bug tracking system to help us keep track
  41 of the bugs and their resolution. Bug reports can be submitted via
  42 email or the web:
  43
  44   https://github.com/bioperl/bioperl-live/issues
  45
  46 =head1 EDIT HISTORY
  47
  48 2012-04-03 - Matt Oates
  49         First features added.
  50 =cut
  51
  52 =head1 DEPENDANCY
  53 B<Getopt::Long> Used to parse command line options.
  54 B<Pod::Usage> Used for usage and help output.
  55 B<Bio::SeqIO> Used to cut up sequences and parse FASTA.
  56 =cut
  57 use Getopt::Long;                     #Deal with command line options
  58 use Pod::Usage;                       #Print a usage man page from the POD comments after __END__
  59 use Bio::SeqIO;                       #Deal with sequence parsing, format and file IO
  60
  61 # Command Line Options
  62 my $help;               #Same again but this time should we output the POD man page defined after __END__
  63 my $prefix = 'part';    #Name each part
  64 my $format = 'fasta';   #Sequence format we are using, default to fasta
  65 my $outdir = '.';       #Use the current directory as default
  66 my $num_splits;         #Number of files to split into
  67 my @partitions;         #Details of each partition for the split
  68
  69 #Set command line flags and parameters.
  70 GetOptions("help|h!" => \$help,
  71            "prefix|p=s" => \$prefix,
  72            "format|f=s" => \$format,
  73            "num-splits|n=i" => \$num_splits,
  74            "outdir|o=s" => \$outdir,
  75         ) or die "Fatal Error: Problem parsing command-line ".$!;
  76
  77 #Print out some help if it was asked for or if no arguments were given.
  78 pod2usage(-exitstatus => 0, -verbose => 2) if $help;
  79
  80 pod2usage(-exitstatus => 0, -verbose => 1, -msg => 'Please specify the number of split parts with -n <N>')
  81     unless defined $num_splits;
  82
  83 #Setup a bunch of empty partitions including some SeqIO file handles to write to
  84 @partitions = map {
  85     $_ = { length => 0,
  86            size => 0,
  87            file => Bio::SeqIO->new(
  88                     -file   => ">$outdir/$prefix$_.$format",
  89                     -format => $format,
  90                )
  91          }
  92     } 1..$num_splits;
  93
  94 #Get sequences from all the files specified.
  95 foreach my $file (@ARGV) {
  96     #Open each input file in turn for reading
  97     my $in  = Bio::SeqIO->new(
  98         -file => "<$file",
  99         -format => $format
 100     );
 101     #While there are still sequences to consume
 102     while ( my $seq = $in->next_seq() ) {
 103         #Sort the partitions on how full they are
 104         @partitions = sort {$a->{size} <=> $b->{size}} @partitions;
 105         #Add the length of the current seq to the smallest partition size
 106         my $length = $seq->length;
 107         $partitions[0]{size} += $length;
 108         #Increase the length of the partition
 109         $partitions[0]{length}++;
 110         #Write this sequence to the partitions file
 111         $partitions[0]{file}->write_seq($seq);
 112     }
 113 }
 114
 115 #Report some basic statistics after the job
 116 my $part = 1;
 117 foreach my $partition (@partitions) {
 118     print STDERR "$outdir/$prefix$part.$format\n";
 119     print STDERR "\tSequence count = $partition->{length}\n";
 120     print STDERR "\tSequence characters = $partition->{size}\n";
 121     $part++;
 122 }
 123
 124 1;
 125 __END__
 126