7 bp_find-blast-matches.pl - extract DNA sequences based on BLAST hits
11 bp_find-blast-matches.pl [-h -e -p -5 -n -o -3 -header] -blast <BLAST_FILE> -fasta <FASTA_FILE>
21 BLAST output file to read from. The alignment should use the file specified by
22 '-fasta' option ideally
26 FASTA file to read from. This is where the sequence will be extracted from
36 Displays this help message
40 Maximum e-value for matches (0.01 by default)
44 Number of base pairs of 5' region to be included with the sequence
48 Number of base pairs of 5' region only, excluding the regular sequence
52 Number of base pairs of 3' region only, excluding the regular sequence
56 Number of top hits to display, starting with the best hit
60 Exact match to display (this option can't be used in conjunction with '-n'
64 The FASTA header to display instead of the default
70 This script takes a BLAST output file and a FASTA file as arguments,
71 given after the '-blast' and '-fasta' options respectively. The BLAST output
72 file should have been generated with your sequence of interest and the
73 FASTA file supplied as an argument.
74 Example: find-blast-matches.pl -blast BLAST_FILE -fasta FASTA_FILE
76 It parses through the BLAST file to check for high quality matches,
77 which are then searched for in the FASTA file. The sequence may vary
78 from you candidate sequence, hence the BLAST search prior.
80 The sequence from the FASTA file is then displayed to STDOUT.
81 Optional arguments can be used, such as to extract the 5' or 3' region.
85 Gabriel Abud - E<lt>gabriel.jabud-at-gmail.comE<gt>
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to
93 the Bioperl mailing list. Your participation is much appreciated
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
100 Report bugs to the Bioperl bug tracking system to help us keep track
101 of the bugs and their resolution. Bug reports can be submitted via
104 https://github.com/bioperl/bioperl-live/issues
108 2014-08-04 - Gabriel Abud
113 Getopt::long, Pod::Usage, Bio::SearchIO, Bio::Seq, Bio::SeqIO, Bio::Perl,
120 use Bio
::SearchIO
qw(new);
121 use Bio
::Seq
qw(new);
122 use Bio
::SeqIO
qw(new);
123 use Bio
::Perl
qw(revcom_as_string);
124 use File
::Basename
qw(basename);
125 use Getopt
::Long
qw(GetOptions);
129 my $baseProg = basename
($0); # Program name
151 # Command line options
152 my $exact_match; # Undef by default
153 my $e_value = 0.01; # Default e-value
154 my $matches = 1; # Default number of matches
155 my $promoter; # Undef by default
156 my $three_prime; # Undef by default
157 my $promoter_only; # Undef by default
158 my $opt_help; # Undef by default
159 my $header; # Undef by default
161 # Functions for proper command line usage
164 "Usage: $baseProg -blast 'BLAST_file' -fasta 'FASTA_file' [OPTIONS]\n";
165 print STDERR
"Try '$baseProg --help' for more information.\n";
172 "\t$baseProg - extract a DNA sequence based on BLAST hits\n\n",
174 "\t$baseProg -blast 'BLAST_file' -fasta 'FASTA__file' [OPTIONS]\n\n",
176 "\t(All options require an additional number argument [ie: -e 0.01])\n\n",
177 "\t-e, maximum e-value for matches (0.01 by default)\n\n",
178 "\t-p, number of base pairs of 5' region to be included with the\n",
179 "\t sequence of interest\n\n",
180 "\t-5, number of base pairs of 5' region, excluding the sequence\n",
181 "\t of interest (unlike '-p')\n\n",
182 "\t-n, number of top hits to display, starting with the highest hit\n",
183 "\t(1 by default)\n\n",
184 "\t-o, exact match to display (this option can't be used in conjuction\n",
186 "\t-3, number of base pairs of 3' region to display\n\n",
187 "\t-header, the fasta header to display instead of the default\n\n";
191 # Get command line options
196 'o=i' => \
$exact_match,
197 '5=i' => \
$promoter_only,
198 '3=i' => \
$three_prime,
199 'help|h' => \
$opt_help,
200 'header|head=s' => \
$header,
201 'blast|b=s' => \
$blastFile,
202 'fasta|f=s' => \
$inputFile
203 ) or pod2usage
(-exitstatus
=> 0, -verbose
=> 1);
206 pod2usage
(-exitstatus
=> 0, -verbose
=> 2) if $opt_help;
207 #help() if defined $opt_help;
209 # Checks for required arguments
210 pod2usage
(-exitstatus
=> 0, -verbose
=> 1,
211 -msg
=> "You must specify the FASTA and BLAST files to read from!\n")
212 if (!defined $blastFile || !defined $inputFile);
213 #syntax() if ( !defined $blastFile || !defined $inputFile );
215 # Checks for any negative numbers
217 pod2usage
(-exitstatus
=> 0, -verbose
=> 1,
218 -msg
=> "You must use positive numbers as values to options!")
219 if ( (defined $e_value && $e_value < 0)
220 || (defined $promoter && $promoter < 0)
221 || (defined $matches && $matches < 0)
222 || (defined $exact_match && $exact_match < 0)
223 || (defined $promoter_only && $promoter_only < 0)
224 || (defined $three_prime && $three_prime < 0 ) );
226 if ( $matches > 1 && defined $exact_match ) {
227 print STDERR
"Cannot use both options '-n' and '-o' at the same time\n";
228 print STDERR
"(Type '$baseProg --help' for more information\n)";
232 if ( defined $promoter && defined $promoter_only ) {
233 print STDERR
"Cannot use both options '-p' and '-5' at the same time\n";
234 print STDERR
"(Type '$baseProg --help' for more information)\n";
238 if ( defined $three_prime && ( defined $promoter || defined $promoter_only ) ) {
239 print STDERR
"Cannot use both '-3' with '-p' or '-5' at the same time\n";
240 print STDERR
"(Type $baseProg --help' for more information)\n";
244 # Class used to search through the blast file
245 eval { $in = Bio
::SearchIO
->new( -file
=> $blastFile, -format
=> "blast" ); };
247 die "'$blastFile' does not appear to be a BLAST output file! Exiting...\n";
249 $out = Bio
::SeqIO
->new( -fh
=> \
*STDOUT
, -format
=> "fasta" );
251 # Creates arrays of all the scaffold names and the coordinates of where those scaffolds are found
253 OUTERLOOP
: while ( my $result = $in->next_result ) {
254 LOOP
: while ( my $hit = $result->next_hit ) {
255 while ( my $hsp = $hit->next_hsp ) {
257 # Finds all matches with an evalue <= $e_value (or 0.01 by default)
258 if ( $hsp->evalue <= $e_value ) {
259 ( $start[$n], $end[$n] ) = $hsp->range('hit');
260 $scaffolds[$n] = $hit->name, $strand[$n] = $hsp->strand('hit');
264 # If an exact_match option is given
265 if ( defined($exact_match) && $exact_match == $n ) {
266 ( $start[0], $end[0] ) = $hsp->range('hit');
267 $scaffolds[0] = $hit->name, $strand[0] = $hsp->strand('hit');
270 elsif ( !defined($exact_match) && $n >= $matches )
271 { # Exits after the correct amount of matches have been found (1 by default)
278 $baseFile = basename
$inputFile;
280 open INFILE
, "<$inputFile"
281 or die "Couldn't open the input file '$inputFile'! Exiting...\n";
287 # Extracts only the scaffolds of interest, avoiding duplicates
288 if ( defined $exact_match ) {
289 $scaffoldList{ $scaffolds[0] } = 1;
292 foreach my $num ( 0 .. $#scaffolds ) {
293 if ( !defined $scaffoldList{ $scaffolds[$num] } ) {
294 $scaffoldList{ $scaffolds[$num] } = 1;
298 my $remaining_scaffolds = my $unique_scaffolds = keys(%scaffoldList);
302 # Reads the FASTA file here, storing FASTA headers where the sequence is found
303 while ( $line = <INFILE
> ) {
305 next if ( $line =~ m/^\s*?$/ );
307 foreach my $scaffold ( sort keys %scaffoldList ) {
308 if ( $line =~ /^[ \t]{0,5}$scaffold/ )
309 { # True if FASTA segment contains the scaffold
312 $baseList{$scaffold} = $line;
314 $remaining_scaffolds--;
317 last unless ($remaining_scaffolds);
320 if ( $scaffoldFind != $unique_scaffolds ) {
321 print STDERR
"The scaffold specified in the BLAST file was not found.\n";
322 print STDERR
"Make sure you are using the correct FASTA file.\n";
326 for my $m ( 0 .. $#scaffolds )
327 { # Runs a loop as many times as there are scaffolds
328 $baseList = $baseList{ $scaffolds[$m] };
330 # Print title line for each scaffold
331 $accession = $baseFile;
332 $default_header = "(BLAST hit:$scaffolds[$m]|";
334 my $real_start = $start[$m];
335 my $real_end = $end[$m];
337 # For "normal", positive strands (+/+)
338 if ( $strand[$m] == 1 ) {
340 # If the -p flag was used
341 if ( defined $promoter ) { # 5' region specified with -p flag
342 # If 5' region is too large for sequence, don't include the 5' region
343 if ( $promoter >= $start[$m] ) {
344 print STDERR
"ERROR: 5' region is too big!! (max promoter = ",
345 $start[$m] - 1, " )\n",
346 "Showing sequence without 5' region...\n";
347 $baseList = substr $baseList, $start[$m], $end[$m] - $start[$m];
350 my $real_start = $start[$m] - $promoter;
351 $baseList = substr $baseList, $start[$m] - $promoter - 1,
352 $promoter + $end[$m] - $start[$m] + 1;
356 # If the -3 flag was used
357 elsif ( defined $three_prime ) {
358 $total_size = length($baseList);
360 if ( ( $three_prime + $end[$m] ) > $total_size ) {
361 die "ERROR: 3' region is too big!! ",
362 "(max 3' region = ", $total_size - $end[$m], ")\n",
366 $real_start = $end[$m] + 1;
367 $real_end = $end[$m] + $three_prime;
368 $baseList = substr $baseList, $end[$m], $three_prime;
372 # If the -5 flag was used
373 elsif ( defined $promoter_only )
374 { # 5' region was specified with -5 flag
375 # If 5' region is too large for sequence, don't include the 5' region
376 if ( $promoter_only >= $start[$m] ) {
377 die "ERROR: 5' region is too big!! (max promoter = ",
378 $start[$m] - 1, " )\n",
382 $real_start = $start[$m] - $promoter_only;
383 $real_end = $start[$m] - 1;
384 $baseList = substr $baseList, $start[$m] - $promoter_only - 1,
388 else { # Default: just the BLAST hit (no 5' or 3')
389 $baseList = substr $baseList, $start[$m] - 1,
390 $end[$m] - $start[$m] + 1;
394 # Checks to see if the hit sequence is the reverse compliment
395 elsif ( $strand[$m] == -1 ) {
397 # If -p flag was used:
398 if ( defined $promoter ) {
399 $total_size = length($baseList);
401 if ( ( $promoter + $end[$m] ) > $total_size ) {
402 print STDERR
"ERROR: 5' region is too big!! ",
403 "(max 5' region = ", $total_size - $end[$m], ")\n",
404 "Showing sequence without 5' region...\n";
405 $baseList = substr $baseList, $start[$m], $end[$m] - $start[$m];
408 $real_start = $start[$m] + 1;
409 $real_end = $end[$m] + $promoter;
410 $baseList = substr $baseList, $start[$m] - 1,
411 $end[$m] - $start[$m] + $promoter + 1;
416 # If -3 flag was used
417 elsif ( defined $three_prime ) {
418 if ( $three_prime >= $start[$m] ) {
419 die "ERROR: 3' region is too big!! (max = ", $start[$m] - 1,
423 $real_start = $start[$m] - $three_prime;
424 $real_end = $start[$m] - 1;
425 $baseList = substr $baseList, $start[$m] - $three_prime - 1,
430 # If -5 flag was used
431 elsif ( defined $promoter_only ) {
432 $total_size = length($baseList);
434 if ( ( $promoter_only + $end[$m] ) > $total_size ) {
435 die "ERROR: 5' region is too big!! ",
436 "(max promoter = ", $total_size - $end[$m], ")\n",
440 $real_start = $end[$m] + 1;
441 $real_end = $end[$m] + $promoter_only;
442 $baseList = substr $baseList, $end[$m], $promoter_only;
446 else { # If 5' region wasn't specified at all
447 $baseList = substr $baseList, $start[$m] - 1,
448 $end[$m] - $start[$m] + 1;
450 $baseList = revcom_as_string
($baseList);
452 $default_header .= "$real_start-$real_end)";
453 $default_header .= " showing 5' region ($promoter_only bp) only"
454 if defined($promoter_only);
455 $default_header .= " showing 3' region ($three_prime bp)"
456 if defined($three_prime);
457 $default_header .= " with 5' region ($promoter bp)" if defined($promoter);
458 my $seq_obj = Bio
::Seq
->new( -seq
=> "$baseList", -alphabet
=> 'dna' );
460 if ( defined $header ) {
464 $header =~ s/^([^\s]+)//;
465 my $default_name = $1;
466 $seq_obj->display_id($default_name);
467 $seq_obj->desc($header);
470 $seq_obj->desc($default_header);
471 $seq_obj->display_id($accession);
474 # Prints the sequence to STDOUT
475 $out->write_seq($seq_obj);
478 # If 'exact_match' option is specified, exit after first (and only) iteration
479 if ( defined($exact_match) ) {
485 } # End of major for loop