Travis CI: Add Test::Most dependency
[bioperl-live.git] / scripts / utilities / bp_find-blast-matches.pl
blob6430075b44f6e115ec67fb1d5aa13579161fc8b2
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
5 =head1 NAME
7 bp_find-blast-matches.pl - extract DNA sequences based on BLAST hits
9 =head1 SYNOPSIS
11 bp_find-blast-matches.pl [-h -e -p -5 -n -o -3 -header] -blast <BLAST_FILE> -fasta <FASTA_FILE>
13 =head1 OPTIONS
15 =head2 Mandatory:
17 =over
19 =item B<-blast>
21 BLAST output file to read from. The alignment should use the file specified by
22 '-fasta' option ideally
24 =item B<-fasta>
26 FASTA file to read from. This is where the sequence will be extracted from
28 =back
30 =head2 Optional:
32 =over
34 =item B<-h>
36 Displays this help message
38 =item B<-e>
40 Maximum e-value for matches (0.01 by default)
42 =item B<-p>
44 Number of base pairs of 5' region to be included with the sequence
46 =item B<-5>
48 Number of base pairs of 5' region only, excluding the regular sequence
50 =item B<-3>
52 Number of base pairs of 3' region only, excluding the regular sequence
54 =item B<-n>
56 Number of top hits to display, starting with the best hit
58 =item B<-o>
60 Exact match to display (this option can't be used in conjunction with '-n'
62 =item B<-header>
64 The FASTA header to display instead of the default
66 =back
68 =head1 DESCRIPTION
70 This script takes a BLAST output file and a FASTA file as arguments,
71 given after the '-blast' and '-fasta' options respectively. The BLAST output
72 file should have been generated with your sequence of interest and the
73 FASTA file supplied as an argument.
74 Example: find-blast-matches.pl -blast BLAST_FILE -fasta FASTA_FILE
76 It parses through the BLAST file to check for high quality matches,
77 which are then searched for in the FASTA file. The sequence may vary
78 from you candidate sequence, hence the BLAST search prior.
80 The sequence from the FASTA file is then displayed to STDOUT.
81 Optional arguments can be used, such as to extract the 5' or 3' region.
83 =head1 AUTHOR
85 Gabriel Abud - E<lt>gabriel.jabud-at-gmail.comE<gt>
87 =head1 FEEDBACK
89 =head2 Mailing Lists
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to
93 the Bioperl mailing list. Your participation is much appreciated
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
98 =head2 Reporting Bugs
100 Report bugs to the Bioperl bug tracking system to help us keep track
101 of the bugs and their resolution. Bug reports can be submitted via
102 email or the web:
104 https://github.com/bioperl/bioperl-live/issues
106 =head1 EDIT HISTORY
108 2014-08-04 - Gabriel Abud
109 First features added
111 =head1 DEPENDANCIES
113 Getopt::long, Pod::Usage, Bio::SearchIO, Bio::Seq, Bio::SeqIO, Bio::Perl,
114 File::Basename
116 =cut
119 # Modules
120 use Bio::SearchIO qw(new);
121 use Bio::Seq qw(new);
122 use Bio::SeqIO qw(new);
123 use Bio::Perl qw(revcom_as_string);
124 use File::Basename qw(basename);
125 use Getopt::Long qw(GetOptions);
126 use Pod::Usage;
128 # Variables
129 my $baseProg = basename($0); # Program name
130 my $line;
131 my @scaffolds;
132 my $inputFile;
133 my $blastFile;
134 my $scaffoldFind;
135 my $baseList;
136 my @start;
137 my @end;
138 my @strand;
139 my @arrayBases;
140 my $query_name;
141 my @query_names;
142 my @accessions;
143 my $accession;
144 my $baseFile;
145 my $total_size;
146 my $title;
147 my $default_header;
148 my $in;
149 my $out;
151 # Command line options
152 my $exact_match; # Undef by default
153 my $e_value = 0.01; # Default e-value
154 my $matches = 1; # Default number of matches
155 my $promoter; # Undef by default
156 my $three_prime; # Undef by default
157 my $promoter_only; # Undef by default
158 my $opt_help; # Undef by default
159 my $header; # Undef by default
161 # Functions for proper command line usage
162 sub syntax {
163 print STDERR
164 "Usage: $baseProg -blast 'BLAST_file' -fasta 'FASTA_file' [OPTIONS]\n";
165 print STDERR "Try '$baseProg --help' for more information.\n";
166 exit;
169 sub help {
170 print STDERR
171 "\nNAME:\n",
172 "\t$baseProg - extract a DNA sequence based on BLAST hits\n\n",
173 "SYNTAX:\n",
174 "\t$baseProg -blast 'BLAST_file' -fasta 'FASTA__file' [OPTIONS]\n\n",
175 "OPTIONS:\n",
176 "\t(All options require an additional number argument [ie: -e 0.01])\n\n",
177 "\t-e, maximum e-value for matches (0.01 by default)\n\n",
178 "\t-p, number of base pairs of 5' region to be included with the\n",
179 "\t sequence of interest\n\n",
180 "\t-5, number of base pairs of 5' region, excluding the sequence\n",
181 "\t of interest (unlike '-p')\n\n",
182 "\t-n, number of top hits to display, starting with the highest hit\n",
183 "\t(1 by default)\n\n",
184 "\t-o, exact match to display (this option can't be used in conjuction\n",
185 "\t with '-n')\n\n",
186 "\t-3, number of base pairs of 3' region to display\n\n",
187 "\t-header, the fasta header to display instead of the default\n\n";
188 exit;
191 # Get command line options
192 GetOptions(
193 'e=f' => \$e_value,
194 'p=i' => \$promoter,
195 'n=i' => \$matches,
196 'o=i' => \$exact_match,
197 '5=i' => \$promoter_only,
198 '3=i' => \$three_prime,
199 'help|h' => \$opt_help,
200 'header|head=s' => \$header,
201 'blast|b=s' => \$blastFile,
202 'fasta|f=s' => \$inputFile
203 ) or pod2usage(-exitstatus => 0, -verbose => 1);
205 # Help screen
206 pod2usage(-exitstatus => 0, -verbose => 2) if $opt_help;
207 #help() if defined $opt_help;
209 # Checks for required arguments
210 pod2usage(-exitstatus => 0, -verbose => 1,
211 -msg => "You must specify the FASTA and BLAST files to read from!\n")
212 if (!defined $blastFile || !defined $inputFile);
213 #syntax() if ( !defined $blastFile || !defined $inputFile );
215 # Checks for any negative numbers
216 #syntax()
217 pod2usage(-exitstatus => 0, -verbose => 1,
218 -msg => "You must use positive numbers as values to options!")
219 if ( (defined $e_value && $e_value < 0)
220 || (defined $promoter && $promoter < 0)
221 || (defined $matches && $matches < 0)
222 || (defined $exact_match && $exact_match < 0)
223 || (defined $promoter_only && $promoter_only < 0)
224 || (defined $three_prime && $three_prime < 0 ) );
226 if ( $matches > 1 && defined $exact_match ) {
227 print STDERR "Cannot use both options '-n' and '-o' at the same time\n";
228 print STDERR "(Type '$baseProg --help' for more information\n)";
229 exit;
232 if ( defined $promoter && defined $promoter_only ) {
233 print STDERR "Cannot use both options '-p' and '-5' at the same time\n";
234 print STDERR "(Type '$baseProg --help' for more information)\n";
235 exit;
238 if ( defined $three_prime && ( defined $promoter || defined $promoter_only ) ) {
239 print STDERR "Cannot use both '-3' with '-p' or '-5' at the same time\n";
240 print STDERR "(Type $baseProg --help' for more information)\n";
241 exit;
244 # Class used to search through the blast file
245 eval { $in = Bio::SearchIO->new( -file => $blastFile, -format => "blast" ); };
246 if ($@) {
247 die "'$blastFile' does not appear to be a BLAST output file! Exiting...\n";
249 $out = Bio::SeqIO->new( -fh => \*STDOUT, -format => "fasta" );
251 # Creates arrays of all the scaffold names and the coordinates of where those scaffolds are found
252 my $n = 0;
253 OUTERLOOP: while ( my $result = $in->next_result ) {
254 LOOP: while ( my $hit = $result->next_hit ) {
255 while ( my $hsp = $hit->next_hsp ) {
257 # Finds all matches with an evalue <= $e_value (or 0.01 by default)
258 if ( $hsp->evalue <= $e_value ) {
259 ( $start[$n], $end[$n] ) = $hsp->range('hit');
260 $scaffolds[$n] = $hit->name, $strand[$n] = $hsp->strand('hit');
261 $n += 1;
264 # If an exact_match option is given
265 if ( defined($exact_match) && $exact_match == $n ) {
266 ( $start[0], $end[0] ) = $hsp->range('hit');
267 $scaffolds[0] = $hit->name, $strand[0] = $hsp->strand('hit');
268 last OUTERLOOP;
270 elsif ( !defined($exact_match) && $n >= $matches )
271 { # Exits after the correct amount of matches have been found (1 by default)
272 last LOOP;
278 $baseFile = basename $inputFile;
280 open INFILE, "<$inputFile"
281 or die "Couldn't open the input file '$inputFile'! Exiting...\n";
283 $scaffoldFind = 0;
284 my %scaffoldList;
285 my %baseList;
287 # Extracts only the scaffolds of interest, avoiding duplicates
288 if ( defined $exact_match ) {
289 $scaffoldList{ $scaffolds[0] } = 1;
291 else {
292 foreach my $num ( 0 .. $#scaffolds ) {
293 if ( !defined $scaffoldList{ $scaffolds[$num] } ) {
294 $scaffoldList{ $scaffolds[$num] } = 1;
298 my $remaining_scaffolds = my $unique_scaffolds = keys(%scaffoldList);
300 local $/ = "\n>";
302 # Reads the FASTA file here, storing FASTA headers where the sequence is found
303 while ( $line = <INFILE> ) {
304 chomp($line);
305 next if ( $line =~ m/^\s*?$/ );
307 foreach my $scaffold ( sort keys %scaffoldList ) {
308 if ( $line =~ /^[ \t]{0,5}$scaffold/ )
309 { # True if FASTA segment contains the scaffold
310 $line =~ s/^.*?\n//;
311 $line =~ s/\s//g;
312 $baseList{$scaffold} = $line;
313 $scaffoldFind++;
314 $remaining_scaffolds--;
317 last unless ($remaining_scaffolds);
320 if ( $scaffoldFind != $unique_scaffolds ) {
321 print STDERR "The scaffold specified in the BLAST file was not found.\n";
322 print STDERR "Make sure you are using the correct FASTA file.\n";
323 exit;
326 for my $m ( 0 .. $#scaffolds )
327 { # Runs a loop as many times as there are scaffolds
328 $baseList = $baseList{ $scaffolds[$m] };
330 # Print title line for each scaffold
331 $accession = $baseFile;
332 $default_header = "(BLAST hit:$scaffolds[$m]|";
334 my $real_start = $start[$m];
335 my $real_end = $end[$m];
337 # For "normal", positive strands (+/+)
338 if ( $strand[$m] == 1 ) {
340 # If the -p flag was used
341 if ( defined $promoter ) { # 5' region specified with -p flag
342 # If 5' region is too large for sequence, don't include the 5' region
343 if ( $promoter >= $start[$m] ) {
344 print STDERR "ERROR: 5' region is too big!! (max promoter = ",
345 $start[$m] - 1, " )\n",
346 "Showing sequence without 5' region...\n";
347 $baseList = substr $baseList, $start[$m], $end[$m] - $start[$m];
349 else {
350 my $real_start = $start[$m] - $promoter;
351 $baseList = substr $baseList, $start[$m] - $promoter - 1,
352 $promoter + $end[$m] - $start[$m] + 1;
356 # If the -3 flag was used
357 elsif ( defined $three_prime ) {
358 $total_size = length($baseList);
360 if ( ( $three_prime + $end[$m] ) > $total_size ) {
361 die "ERROR: 3' region is too big!! ",
362 "(max 3' region = ", $total_size - $end[$m], ")\n",
365 else {
366 $real_start = $end[$m] + 1;
367 $real_end = $end[$m] + $three_prime;
368 $baseList = substr $baseList, $end[$m], $three_prime;
372 # If the -5 flag was used
373 elsif ( defined $promoter_only )
374 { # 5' region was specified with -5 flag
375 # If 5' region is too large for sequence, don't include the 5' region
376 if ( $promoter_only >= $start[$m] ) {
377 die "ERROR: 5' region is too big!! (max promoter = ",
378 $start[$m] - 1, " )\n",
381 else {
382 $real_start = $start[$m] - $promoter_only;
383 $real_end = $start[$m] - 1;
384 $baseList = substr $baseList, $start[$m] - $promoter_only - 1,
385 $promoter_only;
388 else { # Default: just the BLAST hit (no 5' or 3')
389 $baseList = substr $baseList, $start[$m] - 1,
390 $end[$m] - $start[$m] + 1;
394 # Checks to see if the hit sequence is the reverse compliment
395 elsif ( $strand[$m] == -1 ) {
397 # If -p flag was used:
398 if ( defined $promoter ) {
399 $total_size = length($baseList);
401 if ( ( $promoter + $end[$m] ) > $total_size ) {
402 print STDERR "ERROR: 5' region is too big!! ",
403 "(max 5' region = ", $total_size - $end[$m], ")\n",
404 "Showing sequence without 5' region...\n";
405 $baseList = substr $baseList, $start[$m], $end[$m] - $start[$m];
407 else {
408 $real_start = $start[$m] + 1;
409 $real_end = $end[$m] + $promoter;
410 $baseList = substr $baseList, $start[$m] - 1,
411 $end[$m] - $start[$m] + $promoter + 1;
416 # If -3 flag was used
417 elsif ( defined $three_prime ) {
418 if ( $three_prime >= $start[$m] ) {
419 die "ERROR: 3' region is too big!! (max = ", $start[$m] - 1,
420 " )\n";
422 else {
423 $real_start = $start[$m] - $three_prime;
424 $real_end = $start[$m] - 1;
425 $baseList = substr $baseList, $start[$m] - $three_prime - 1,
426 $three_prime;
430 # If -5 flag was used
431 elsif ( defined $promoter_only ) {
432 $total_size = length($baseList);
434 if ( ( $promoter_only + $end[$m] ) > $total_size ) {
435 die "ERROR: 5' region is too big!! ",
436 "(max promoter = ", $total_size - $end[$m], ")\n",
439 else {
440 $real_start = $end[$m] + 1;
441 $real_end = $end[$m] + $promoter_only;
442 $baseList = substr $baseList, $end[$m], $promoter_only;
446 else { # If 5' region wasn't specified at all
447 $baseList = substr $baseList, $start[$m] - 1,
448 $end[$m] - $start[$m] + 1;
450 $baseList = revcom_as_string($baseList);
452 $default_header .= "$real_start-$real_end)";
453 $default_header .= " showing 5' region ($promoter_only bp) only"
454 if defined($promoter_only);
455 $default_header .= " showing 3' region ($three_prime bp)"
456 if defined($three_prime);
457 $default_header .= " with 5' region ($promoter bp)" if defined($promoter);
458 my $seq_obj = Bio::Seq->new( -seq => "$baseList", -alphabet => 'dna' );
460 if ( defined $header ) {
461 $header =~ s/^\s*//;
462 $header =~ s/\s*$//;
463 $header =~ s/^>//;
464 $header =~ s/^([^\s]+)//;
465 my $default_name = $1;
466 $seq_obj->display_id($default_name);
467 $seq_obj->desc($header);
469 else {
470 $seq_obj->desc($default_header);
471 $seq_obj->display_id($accession);
474 # Prints the sequence to STDOUT
475 $out->write_seq($seq_obj);
476 print "\n";
478 # If 'exact_match' option is specified, exit after first (and only) iteration
479 if ( defined($exact_match) ) {
480 close INFILE;
481 exit;
484 close INFILE;
485 } # End of major for loop
487 __END__