bin/bp_split_seq

   1 #!/usr/bin/perl
   2 use strict;
   3 use warnings;
   4
   5 =head1 NAME
   6
   7 bp_split_seq - splits a sequence into equal sized chunks with an optional
   8             overlapping range
   9
  10 =head1 SYNOPSIS
  11
  12 bp_split_seq -c 10000 [-o 1000] [-i] -f seq.in
  13
  14 =head1 DESCRIPTION
  15
  16 The script will split sequences into chunks
  17
  18 Mandatory Options:
  19
  20   -c  Desired length of the resulting sequences.
  21   -f  Input file (must be FASTA format).
  22
  23 Special Options:
  24
  25   -o  Overlapping range between the resulting sequences.
  26   -i  Create an index file with the resulting sequence files. This is
  27       useful if you want to pass this list as input arguments into
  28       another programs (i.e. CLUSTAL, HMMER, etc.).
  29
  30 =head1 FEEDBACK
  31
  32 =head2 Mailing Lists
  33
  34 User feedback is an integral part of the evolution of this and other
  35 Bioperl modules. Send your comments and suggestions preferably to
  36 the Bioperl mailing list.  Your participation is much appreciated.
  37
  38   bioperl-l@bioperl.org                  - General discussion
  39   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  40
  41 =head2 Reporting Bugs
  42
  43 Report bugs to the Bioperl bug tracking system to help us keep track
  44 of the bugs and their resolution. Bug reports can be submitted via the
  45 web:
  46
  47   https://github.com/bioperl/bioperl-live/issues
  48
  49 =head1 AUTHORS
  50
  51   Ewan Birney E<lt>birney-at-ebi.ac.ukE<gt>
  52   Mauricio Herrera Cuadra E<lt>mauricio at open-bio.orgE<gt>
  53   (some enhancements)
  54
  55 =cut
  56
  57 # Modules, pragmas and variables to use
  58 use Bio::Seq;
  59 use Bio::SeqIO;
  60 use Getopt::Long;
  61 use vars qw($opt_c $opt_o $opt_i $opt_f $index_file);
  62
  63 # Gets options from the command line
  64 GetOptions qw(-c=i -o:i -i -f=s);
  65
  66 # If no mandatory options are given prints an error and exits
  67 if (!$opt_c) {
  68     print "ERROR: No chunk size has been specified.\n" and exit();
  69 } elsif (!$opt_f) {
  70     print "ERROR: No FASTA file has been specified.\n" and exit();
  71 }
  72
  73 # Declares offset size
  74 my $offset = $opt_o ? $opt_o : "0";
  75
  76 # Opens the FASTA file
  77 my $in = Bio::SeqIO->new(
  78     -file   => "$opt_f",
  79     -format => "Fasta",
  80 );
  81 print "==> Opening FASTA file:\t\t\t\t$opt_f\n";
  82
  83 # Reads the next sequence object
  84 while (my $seq = $in->next_seq()) {
  85
  86     # Reads the ID for the sequence and prints it
  87     my $id = $seq->id();
  88     print "--> The ID for this sequence is:\t\t$id\n";
  89
  90     # Reads the description for the sequence and prints it
  91     my $desc = $seq->desc();
  92     print "--> The description for this sequence is:\t$desc\n";
  93
  94     # Gets sequence length and prints it
  95     my $seq_length = $seq->length();
  96     print "--> The length of this sequence is:\t\t$seq_length\n";
  97
  98     # If the chunk size is bigger than the sequence length prints the error and exits
  99     (print "ERROR: Specified chunk size is bigger than sequence length.\n" and exit()) if ($opt_c > $seq_length);
 100
 101     # Creates a directory for writing the resulting files
 102     mkdir("split", 0755) unless -e "split" and -d "split";
 103
 104     # Creates the INDEX file if the option was given
 105     my $FH;
 106     if ($opt_i) {
 107         $index_file = "$id.c$opt_c.o$offset.INDEX";
 108         open $FH, '>', $index_file or die "Could not write file '$index_file': $!\n";
 109     }
 110
 111     # Loops through the sequence
 112     for (my $i = 1; $i < $seq_length; $i += $opt_c) {
 113         my $end = (($i + $opt_c) > $seq_length) ? ($seq_length + 1) : ($i + $opt_c);
 114         my $seq_range = (($i + $opt_c) > $seq_length) ? "$i-".($end - 1) : "$i-$end";
 115         my $id = $seq->id();
 116         $id .= "_$seq_range";
 117
 118         # Stores chunk into its corresponding FASTA file
 119         my $out = Bio::SeqIO->new(
 120             -file   => ">split/$id.faa",
 121             -format => "Fasta",
 122         );
 123         my $trunc_seq = $seq->trunc($i, $end - 1);
 124         $trunc_seq->id($id);
 125         $out->write_seq($trunc_seq);
 126         print "==> Sequence chunk:\t$seq_range\tstored in file:\tsplit/$id.faa\n";
 127
 128         # Prints the current file name into the INDEX file if the option was given
 129         print $FH "split/$id.faa\n" if $opt_i;
 130
 131         # Decreases the $i value with the offset value
 132         $i -= $offset;
 133     }
 134
 135     # Closes the INDEX file if the option was given
 136     if ($opt_i) {
 137         print "==> INDEX stored in file:\t\t\t$index_file\n";
 138         close $FH;
 139     }
 140 }
 141
 142 __END__