Clean up test: use the $db in scope and reuse variable
[bioperl-live.git] / examples / tools / standaloneblast.pl
blobba9746371a8c0887fe48d6b374cb8d37f5c591a7
1 #!/usr/bin/perl
3 # PROGRAM : standaloneblast.pl
4 # PURPOSE : Demonstrate possible uses of Bio::Tools::StandAloneBlast.pm
5 # AUTHOR : Peter Schattner schattner@alum.mit.edu
6 # CREATED : Nov 01 2000
8 # INSTALLATION
11 # You will need to enable Blast to find the Blast program. This can be done
12 # in (at least) two ways:
13 # 1. define an environmental variable blastDIR:
14 # export BLASTDIR=/home/peter/blast or
15 # 2. include a definition of an environmental variable BLASTDIR in every script that will
16 # use StandAloneBlast.pm.
17 # BEGIN {$ENV{BLASTDIR} = '/home/peter/blast/'; }
19 # We also need to select the database to be used
20 my $amino_database = 'swissprot';
24 # We are going to demonstrate 3 possible applications of StandAloneBlast.pm:
25 # 1. Test effect of varying choice of substitution matrices
26 # 2. Test effect of varying choice of gap penalty
27 # 3. Comparison of results of psiblast depending on whether psiblast itself is used
28 # to identify an alignment to use for blasting or whether an external alignment is given to
29 # psiblast
31 use strict;
32 use Getopt::Long;
33 use Bio::SimpleAlign;
34 use Bio::Tools::Run::StandAloneBlast;
35 use Bio::SearchIO;
36 use Bio::AlignIO;
37 use Bio::SeqIO;
38 use Bio::Root::IO;
40 # set some default values
41 my $queryseq = Bio::Root::IO->catfile(qw(t data cysprot1.fa) );
42 my $executable = 'blastpgp';
43 my $queryaln = Bio::Root::IO->catfile(qw(t data cysprot.msf) );
44 my @params = ('database' => $amino_database);
45 # string listing examples to be executed. Default is to execute
46 # all tests (ie 1,2 and 3)
47 my $do_only = '';
48 my $example1param = 'MATRIX'; # parameter to be varied in example 1
49 my $example2param = 'GAP'; # parameter to be varied in example 1
50 my $example1values = [ 'BLOSUM62', 'BLOSUM80', 'PAM70']; # MATRIX values to try
51 my $example2values = [ 7, 9, 25]; # GAP values to be tried
52 my $queryalnformat = 'msf';
53 my $jiter = 2;
54 # only use pos. specific scoring matrix if > 50% of residues have
55 # consensus letter (and compare with 25% or 75% cut off)
56 my $maskvalues = [50, 25, 75] ; my $helpflag = 0; # Flag to show usage info.
58 # get user options
59 my @argv = @ARGV; # copy ARGV before GetOptions() massacres it.
60 my $paramvalstring;
61 my $maskvalstring;
63 &GetOptions("h!" => \$helpflag,
64 "help!" => \$helpflag,
65 "in=s" => \$queryseq,
66 "inaln=s" => \$queryaln,
67 "alnfmt=s" => \$queryalnformat,
68 "param=s" => \$example1param,
69 "exec=s" => \$executable,
70 "paramvals=s" => \$paramvalstring,
71 "do=i" => \$do_only,
72 "maskvals=s" => \$maskvalstring,
73 "iter=i" => \$jiter,
74 ) ;
76 if ($paramvalstring) { @$example1values = split (":", $paramvalstring); }
77 if ($maskvalstring) { @$maskvalues = split (":", $maskvalstring); }
79 if ($helpflag) { &example_usage(); exit 0;}
81 # create factory & set user-specified global blast parameters
82 foreach my $argv (@argv) {
83 next unless ($argv =~ /^(.*)=>(.*)$/);
84 push (@params, $1 => $2);
86 my $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
88 # If "do" variable not set, do all four examples
89 if ( ! $do_only) {
90 &vary_params($queryseq, $example1param, $example1values); # ex. 1
92 # To compare gap penalties of 7, 9 and 25 we need to set the
93 # scoring matrix to BLOSUM62 and extension penalty to 2 (these are
94 # limitations of BLAST)
96 $factory->MATRIX('BLOSUM62');
98 $factory->EXTENSION(2);
99 &vary_params($queryseq, $example2param, $example2values); # ex. 2
101 # For the psiblast tests we want to restore gap opening and
102 # extension values to their defaults
104 $factory->GAP(11);
105 $factory->EXTENSION(1);
106 # now do the mask comparison example and ..
107 &vary_masks($queryseq, $maskvalues); # ex. 3
108 # do the jumpstart-align vs multiple iteration examples with the
109 # mask value set to 50%
110 &aligned_blast($queryseq, $queryaln, $queryalnformat,
111 $jiter, $maskvalues->[0]); # ex. 4
112 } elsif ($do_only == 1) {
113 &vary_params($queryseq,$example1param, $example1values);
114 } elsif ($do_only == 3) {
115 &vary_masks($queryseq, $maskvalues);
116 } elsif ($do_only == 4 ) {
117 &aligned_blast($queryseq, $queryaln, $queryalnformat, $jiter, $maskvalues->[0]);
118 } else {
119 &example_usage();
122 exit 0;
124 ##########
125 ## End of "main"
128 #################################################
129 # compare_runs(): Prints out display of which hits were found by different methods
130 # Various methods are labeled by "tags" found in array @runtags
132 # args:
133 # $typetag - label describing type of "tags"
134 # $runtags - reference to array @runtags
135 # $hashhits - reference to hash of all the hits found by all runs (%hashhits)
136 # value for each hit is string which is the concatenation of all the "tags" of
137 # runs that found that hit
138 # returns: nothing
140 sub compare_runs {
141 my $typetag = shift;
142 my $runtags = shift;
144 my $hashhits = shift;
146 my ($tag, @taghits);
148 print "Comparing BLAST results... \n";
150 # Get total number of hits found by any method
151 my $numhits = keys %$hashhits ; # scalar context to get total number of hits by all methods
152 print "Total number of hits found: $numhits \n";
154 # Get total number of hits found by every method
155 my $alltags = join ( "" , @$runtags );
156 my @alltaghits = grep $$hashhits{$_} =~ /$alltags/ , keys %$hashhits;
157 print " Number of hits found by every method / parameter-value: " ,
158 scalar(@alltaghits), "\n";
160 # If one desires to see the hits found by all methods, uncomment next 2 lines
161 #print " Hits were found all methods / parameters: \n";
162 #print join ( "\n", @alltaghits ) , "\n";
164 # For each method/parameter-value (labeled by type) display hits found
165 # exclusively by that method
166 foreach $tag (@$runtags) {
167 @taghits = grep $$hashhits{$_} =~ /^$tag$/ , keys %$hashhits;
168 print " Hits found only when $typetag was $tag: \n";
169 print join ( "\n", @taghits ) , "\n";
171 return 1;
175 #################################################
176 # vary_params(): Example demonstrating varying of parameter
178 # args:
179 # $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
180 # $param - name of parameter to be varied
181 # $values - reference to array of values to be used for the parameter
182 # returns: nothing
186 sub vary_params {
188 my $queryseq = shift;
189 my $param = shift;
190 my $values = shift;
193 print "Beginning $param parameter-varying example... \n";
195 # Now we'll perform several blasts, 1 for each value of the
196 # selected parameter. In the first default case, we vary the
197 # MATRIX substitution parameter, creating 3 BLAST reports, using
198 # MATRIX values of BLOSUM62, BLOSUM80 or PAM70.
200 # In the second default case, we vary the GAP penalty parameter,
201 # creating 3 BLAST reports, using GAP penalties of 7, 9 and 25. In
202 # either case we then automatically parse the resulting report to
203 # identify which hits are found with any of the parameter values
204 # and which with only one of them.
207 # To test the BLAST results to some other parameter it is only
208 # necessary to change the parameters passed to the script on the
209 # commandline. The only tricky part is that the BLAST program
210 # itself only supports a limited range of parameters. See the
211 # BLAST documentation.
213 my ($report, $sbjct, $paramvalue);
215 my $hashhits = { }; # key is hit id, value is string of param values for which hit was found
217 foreach $paramvalue (@$values) {
219 $factory->$param($paramvalue); # set parameter value
221 print "Performing BLAST with $param = $paramvalue \n";
223 $report = $factory->$executable($queryseq);
224 my $r = $report->next_result;
225 while( my $hit = $r->next_hit ) {
226 $hashhits->{$hit->name} .= "$paramvalue";
230 &compare_runs( $param , $values , $hashhits);
232 return 1;
236 #################################################
238 # vary_masks(): Example demonstrating varying of parameter
240 # args:
241 # $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
242 # $maskvalues - reference to array of values to be used for the mask threshold
243 # returns: nothing
245 # Now we'll perform several blasts, 1 for each value of the mask threshold.
246 # In the default case, we use thresholds of 25%, 50% and 75%. (Recall the threshold is
247 # % of resudues which must match the consensus residue before deciding to use the
248 # position specific scoring matrix rather than the default - BLOSUM or PAM - matrix)
249 # We then automatically parse the resulting reports to identify which hits
250 # are found with any of the mask threshold values and which with only one of them.
253 sub vary_masks {
255 my $queryseq = shift;
256 my $values = shift;
259 print "Beginning mask-varying example... \n";
261 my ($report, $sbjct, $maskvalue);
263 my $hashhits = { }; # key is hit id, value is string of param values for which hit was found
265 # Get the alignment file
266 my $str = Bio::AlignIO->new(-file=> "$queryaln", '-format' => "$queryalnformat", );
267 my $aln = $str->next_aln();
269 foreach $maskvalue (@$values) {
271 print "Performing BLAST with mask threshold = $maskvalue % \n";
273 # Create the proper mask for 'jumpstarting'
274 my $mask = &create_mask($aln, $maskvalue);
275 my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
276 my $r = $report2->next_result;
277 while($sbjct = $r->next_hit) {
278 $hashhits->{$sbjct->name} .= "$maskvalue";
282 &compare_runs( 'mask threshold' , $values , $hashhits);
284 return 1;
288 #################################################
289 # aligned_blast ():
292 # args:
293 # $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
294 # $queryaln - file containing alignment to be used to "jumpstart" psiblast in "-B mode"
295 # $queryaln *must contain $queryseq with the same name and length
296 # (psiblast is very picky)
297 # $queryalnformat - format of alignment (can = "fasta", "msf", etc)
298 # $jiter - number of iterations in psiblast run
299 # $maskvalue - threshold indicating how similar residues must be at a sequence location
300 # before position-specific-scoring matrix is used
301 # : "0" => use position specific matrix at all residues, or
302 # "100" => use default (eg BLOSUM) at all residues
303 # returns: nothing
306 # For this example, we'll compare the results of psiblast depending on whether psiblast itself is
308 # used to identify an alignment to use for blasting or whether an external alignment is given to
309 # psiblast
311 sub aligned_blast {
314 my $queryseq = shift;
315 my $queryaln = shift;
316 my $queryalnformat = shift;
317 my $jiter = shift;
318 my $maskvalue = shift;
320 my $hashhits = { };
321 my ($sbjct, $id);
323 print "\nBeginning aligned blast example... \n";
326 # First we do a single-iteration psiblast search but with a specified alignment to
327 # "jump start" psiblast
330 print "\nBeginning jump-start psiblast ... \n";
333 my $tag1 = 'jumpstart';
335 # $factory->j('1'); # perform single iteration
337 # Get the alignment file
338 my $str = Bio::AlignIO->new(-file=> "$queryaln", '-format' => "$queryalnformat", );
339 my $aln = $str->next_aln();
342 # Create the proper mask for 'jumpstarting'
343 my $mask = &create_mask($aln, $maskvalue);
346 my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
347 while($sbjct = $report2->next_result) {
348 $hashhits->{$sbjct->name} .= "$tag1";
351 # Then we do a "plain" psiblast multiple-iteration search
353 print "\nBeginning multiple-iteration psiblast ... \n";
355 my $undefineB ;
356 $factory->B($undefineB);
358 my $tag2 = 'iterated';
359 $factory->j($jiter); # 'j' is blast parameter for # of iterations
360 my $report1 = $factory->blastpgp($queryseq);
361 my $total_iterations = $report1->number_of_iterations;
362 my $last_iteration = $report1->round($total_iterations);
365 while($sbjct = $last_iteration->next_result) {
366 $hashhits->{$sbjct->name} .= "$tag2";
369 # Now we compare the results of the searches
371 my $tagtype = 'iterated_or_jumpstart';
372 my $values = [ $tag1, $tag2];
374 &compare_runs( $tagtype , $values , $hashhits);
376 return 1;
381 #################################################
384 # create_mask(): creates a mask for the psiblast jumpstart alignment
385 # that determines at what residues position-specific
386 # scoring matrices (PSSMs) are used and at what
387 # residues default scoring matrices (eg BLOSUM) are
388 # used. See psiblast documentation for more details,
390 # args:
391 # $aln - SimpleAlign object with alignment
392 # $maskvalue - label describing type of "tags"
393 # returns: actual mask, ie a string of 0's and 1's which is the
394 # same length as each sequence in the alignment and has
395 # a "1" at locations where (PSSMs) are to be used
396 # and a "0" at all other locations.
399 sub create_mask {
400 my $aln = shift;
401 my $maskvalue = shift;
402 my $mask = "";
404 die "psiblast jumpstart requires all sequences to be same length \n"
405 unless $aln->is_flush();
406 my $len = $aln->length();
408 if ($maskvalue =~ /^(\d){1,3}$/ ) {
409 $mask = $aln->consensus_string($maskvalue) ;
410 $mask =~ s/[^\?]/1/g ;
411 $mask =~ s/\?/0/g ;
413 else { die "maskvalue must be an integer between 0 and 100 \n"; }
414 return $mask ;
417 #----------------
418 sub example_usage {
419 #----------------
421 #-----------------------
422 # Prints usage information for general parameters.
424 print STDERR <<"QQ_PARAMS_QQ";
426 Command-line accessible script variables and commands:
427 -------------------------------
428 -h : Display this usage info and exit.
429 -in <str> : File containing input sequences in fasta format (default = $queryseq) .
430 -inaln <str> : File containing input alignment for example 3 (default = $queryaln) .
431 -alnfmt <str> : Format of input alignment for example 3, eg "msf", "fasta", "pfam".
432 (default = $queryalnformat) .
433 -do <int> : Number of test to be executed ("1" => vary parameters,
434 "3" => compare iterated & jumpstart psiblast.) If omitted,
435 three default tests performed.
436 -exec <str> : Blast executable to be used in example 1. Can be "blastall" or
437 "blastpgp" (default is "blastpgp").
438 -param <str> : Parameter to be varied in example 1. Any blast parameter
439 can be varied (default = 'MATRIX')
440 -paramvals <str>: String containing parameter values in example 1, separated
441 by ":"'s. (default = 'BLOSUM62:BLOSUM80:PAM70')
442 -iter <int> : Maximum number of iterations in psiblast in example 3 (default = 2)
443 -maskvals <str>: String containing mask threshold values (in per-cents) for example 3,
444 separated by ":"'s. (default = '50:75:25')
446 In addition, any valid Blast parameter can be set using the syntax "parameter=>value" as in "database=>swissprot"
448 So some typical command lines might be:
449 >standaloneblast.pl -do 1 -param expectation -paramvals '1e-10:1e-5'
451 >standaloneblast.pl -do 1 -exec blastall -param q -paramvals '-1:-7' -in='t/dna1.fa' "pr=>blastn" "d=>ecoli.nt"
453 >standaloneblast.pl -do 4 -maskvals 0 -iter 3
455 >standaloneblast.pl -do 3 -maskvals '10:50:90' -in 't/data/cysprot1.fa' -alnfmt msf -inaln 't/cysprot.msf'
459 QQ_PARAMS_QQ