bin/bp_biofetch_genbank_proxy: move to Bio-DB-NCBIHelper
[bioperl-live.git] / bin / bp_split_seq
blob81273adbf626a52657b733fb405f832fc827ab0c
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
5 =head1 NAME
7 bp_split_seq - splits a sequence into equal sized chunks with an optional
8 overlapping range
10 =head1 SYNOPSIS
12 bp_split_seq -c 10000 [-o 1000] [-i] -f seq.in
14 =head1 DESCRIPTION
16 The script will split sequences into chunks
18 Mandatory Options:
20 -c Desired length of the resulting sequences.
21 -f Input file (must be FASTA format).
23 Special Options:
25 -o Overlapping range between the resulting sequences.
26 -i Create an index file with the resulting sequence files. This is
27 useful if you want to pass this list as input arguments into
28 another programs (i.e. CLUSTAL, HMMER, etc.).
30 =head1 FEEDBACK
32 =head2 Mailing Lists
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 =head2 Reporting Bugs
43 Report bugs to the Bioperl bug tracking system to help us keep track
44 of the bugs and their resolution. Bug reports can be submitted via the
45 web:
47 https://github.com/bioperl/bioperl-live/issues
49 =head1 AUTHORS
51 Ewan Birney E<lt>birney-at-ebi.ac.ukE<gt>
52 Mauricio Herrera Cuadra E<lt>mauricio at open-bio.orgE<gt>
53 (some enhancements)
55 =cut
57 # Modules, pragmas and variables to use
58 use Bio::Seq;
59 use Bio::SeqIO;
60 use Getopt::Long;
61 use vars qw($opt_c $opt_o $opt_i $opt_f $index_file);
63 # Gets options from the command line
64 GetOptions qw(-c=i -o:i -i -f=s);
66 # If no mandatory options are given prints an error and exits
67 if (!$opt_c) {
68 print "ERROR: No chunk size has been specified.\n" and exit();
69 } elsif (!$opt_f) {
70 print "ERROR: No FASTA file has been specified.\n" and exit();
73 # Declares offset size
74 my $offset = $opt_o ? $opt_o : "0";
76 # Opens the FASTA file
77 my $in = Bio::SeqIO->new(
78 -file => "$opt_f",
79 -format => "Fasta",
81 print "==> Opening FASTA file:\t\t\t\t$opt_f\n";
83 # Reads the next sequence object
84 while (my $seq = $in->next_seq()) {
86 # Reads the ID for the sequence and prints it
87 my $id = $seq->id();
88 print "--> The ID for this sequence is:\t\t$id\n";
90 # Reads the description for the sequence and prints it
91 my $desc = $seq->desc();
92 print "--> The description for this sequence is:\t$desc\n";
94 # Gets sequence length and prints it
95 my $seq_length = $seq->length();
96 print "--> The length of this sequence is:\t\t$seq_length\n";
98 # If the chunk size is bigger than the sequence length prints the error and exits
99 (print "ERROR: Specified chunk size is bigger than sequence length.\n" and exit()) if ($opt_c > $seq_length);
101 # Creates a directory for writing the resulting files
102 mkdir("split", 0755) unless -e "split" and -d "split";
104 # Creates the INDEX file if the option was given
105 my $FH;
106 if ($opt_i) {
107 $index_file = "$id.c$opt_c.o$offset.INDEX";
108 open $FH, '>', $index_file or die "Could not write file '$index_file': $!\n";
111 # Loops through the sequence
112 for (my $i = 1; $i < $seq_length; $i += $opt_c) {
113 my $end = (($i + $opt_c) > $seq_length) ? ($seq_length + 1) : ($i + $opt_c);
114 my $seq_range = (($i + $opt_c) > $seq_length) ? "$i-".($end - 1) : "$i-$end";
115 my $id = $seq->id();
116 $id .= "_$seq_range";
118 # Stores chunk into its corresponding FASTA file
119 my $out = Bio::SeqIO->new(
120 -file => ">split/$id.faa",
121 -format => "Fasta",
123 my $trunc_seq = $seq->trunc($i, $end - 1);
124 $trunc_seq->id($id);
125 $out->write_seq($trunc_seq);
126 print "==> Sequence chunk:\t$seq_range\tstored in file:\tsplit/$id.faa\n";
128 # Prints the current file name into the INDEX file if the option was given
129 print $FH "split/$id.faa\n" if $opt_i;
131 # Decreases the $i value with the offset value
132 $i -= $offset;
135 # Closes the INDEX file if the option was given
136 if ($opt_i) {
137 print "==> INDEX stored in file:\t\t\t$index_file\n";
138 close $FH;
142 __END__