scripts/utilities/bp_dbsplit.pl

   1 #!perl
   2 #-*-Perl-*-
   3
   4 =head1 NAME
   5
   6 bp_dbsplit - script to split an input set of database(s) into smaller pieces
   7
   8 =head1 SYNOPSIS
   9
   10   bp_dbsplit --size 50 [-i inputfile] [-if inputformat] [-of outputformat]
   11              [--prefix outputprefix] [ < file1  OR  file1 file2 ... ]
  12
  13 =head1 DESCRIPTION
  14
  15 This script will take as input a list of filenames or a single file or
  16 from STDIN a sequence database and split the database into separate
  17 files of X numbers of sequences.  You specify X with the C<--size/-s>
  18 parameter.  The input and output sequence format is any that is
  19 supported by bioperl (fasta,embl,genbank,gcg, swissprot, etc).
  20
   21 You can specify the input data either as a single file with -i
   22 filename, or as one or more files given as arguments like
  23
  24   % bp_dbsplit file1 file2
  25
   26 or as a stream of sequence data on standard input with
  27
  28   % cat file1 file2 file3 | bp_dbsplit
  29
  30 You'll want to use the C<--prefix> to specify what the output prefix will
  31 be.
  32
  33 =head1 FEEDBACK
  34
  35 =head2 Mailing Lists
  36
  37 User feedback is an integral part of the evolution of this and other
  38 Bioperl modules. Send your comments and suggestions preferably to
  39 the Bioperl mailing list.  Your participation is much appreciated.
  40
  41   bioperl-l@bioperl.org                  - General discussion
  42   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  43
  44 =head2 Reporting Bugs
  45
  46 Report bugs to the Bioperl bug tracking system to help us keep track
  47 of the bugs and their resolution. Bug reports can be submitted via
  48 the web:
  49
  50   https://github.com/bioperl/bioperl-live/issues
  51
  52 =head1 AUTHOR
  53
  54 Jason Stajich, jason-at-bioperl-dot-org
  55
  56 =cut
  57
  58 use strict;
  59 use warnings;
  60 use Bio::SeqIO;
  61 use Bio::SeqIO::MultiFile;
  62
  63 use Getopt::Long;
  64 my $dbsize = 100;
  65 my $prefix;
  66 my ($informat,$outformat,$infile) = ( 'fasta', 'fasta');
  67
  68 GetOptions (
  69             's|size:s'     => \$dbsize,
  70             'if:s'         => \$informat,
  71             'of:s'         => \$outformat,
  72             'i:s'          => \$infile,
  73             'p|prefix:s'   => \$prefix,
  74
  75 );
  76 if( @ARGV == 1 ) {
  77     $infile = shift @ARGV;
  78 }
  79 $prefix ||= $infile || $ARGV[0] || 'db';
  80
  81 my $in;
  82 if( @ARGV ) {
  83     $in = new Bio::SeqIO::MultiFile(-files => [@ARGV],
  84                                     -format => $informat || 'fasta');
  85 } elsif( $infile ) {
  86     $in = new Bio::SeqIO(-file  => $infile,
  87                          -format=> $informat);
  88 } else {
  89     $in = new Bio::SeqIO(-format=> $informat);
  90 }
  91 my $count = 1;
  92 my $out = new Bio::SeqIO(-format => $outformat,
  93                          -file   => ">$prefix.$count");
  94 my $scount = 0;
  95 while( my $seq = $in->next_seq ) {
  96     if( ++$scount > $dbsize && $count ) {
  97         $out->close();
  98         undef($out);
  99         $count++;
 100         $out = new Bio::SeqIO(-format => $outformat,
 101                               -file   => ">$prefix.$count");
 102         $scount = 1;
 103     }
 104     $out->write_seq($seq);
 105 }
 106
 107
 108 __END__