3 # PROGRAM : standaloneblast.pl
4 # PURPOSE : Demonstrate possible uses of Bio::Tools::StandAloneBlast.pm
5 # AUTHOR : Peter Schattner schattner@alum.mit.edu
6 # CREATED : Nov 01 2000
11 # You will need to enable Blast to find the Blast program. This can be done
12 # in (at least) two ways:
13 # 1. define an environmental variable blastDIR:
14 # export BLASTDIR=/home/peter/blast or
15 # 2. include a definition of an environmental variable BLASTDIR in every script that will
16 # use StandAloneBlast.pm.
17 # BEGIN {$ENV{BLASTDIR} = '/home/peter/blast/'; }
19 # We also need to select the database to be used
20 my $amino_database = 'swissprot';
24 # We are going to demonstrate 3 possible applications of StandAloneBlast.pm:
25 # 1. Test effect of varying choice of substitution matrices
26 # 2. Test effect of varying choice of gap penalty
27 # 3. Comparison of results of psiblast depending on whether psiblast itself is used
28 # to identify an alignment to use for blasting or whether an external alignment is given to
34 use Bio
::Tools
::Run
::StandAloneBlast
;
40 # set some default values
41 my $queryseq = Bio
::Root
::IO
->catfile(qw(t data cysprot1.fa) );
42 my $executable = 'blastpgp';
43 my $queryaln = Bio
::Root
::IO
->catfile(qw(t data cysprot.msf) );
44 my @params = ('database' => $amino_database);
45 # string listing examples to be executed. Default is to execute
46 # all tests (ie 1,2 and 3)
48 my $example1param = 'MATRIX'; # parameter to be varied in example 1
49 my $example2param = 'GAP'; # parameter to be varied in example 1
50 my $example1values = [ 'BLOSUM62', 'BLOSUM80', 'PAM70']; # MATRIX values to try
51 my $example2values = [ 7, 9, 25]; # GAP values to be tried
52 my $queryalnformat = 'msf';
54 # only use pos. specific scoring matrix if > 50% of residues have
55 # consensus letter (and compare with 25% or 75% cut off)
56 my $maskvalues = [50, 25, 75] ; my $helpflag = 0; # Flag to show usage info.
59 my @argv = @ARGV; # copy ARGV before GetOptions() massacres it.
63 &GetOptions
("h!" => \
$helpflag,
64 "help!" => \
$helpflag,
66 "inaln=s" => \
$queryaln,
67 "alnfmt=s" => \
$queryalnformat,
68 "param=s" => \
$example1param,
69 "exec=s" => \
$executable,
70 "paramvals=s" => \
$paramvalstring,
72 "maskvals=s" => \
$maskvalstring,
76 if ($paramvalstring) { @
$example1values = split (":", $paramvalstring); }
77 if ($maskvalstring) { @
$maskvalues = split (":", $maskvalstring); }
79 if ($helpflag) { &example_usage
(); exit 0;}
81 # create factory & set user-specified global blast parameters
82 foreach my $argv (@argv) {
83 next unless ($argv =~ /^(.*)=>(.*)$/);
84 push (@params, $1 => $2);
86 my $factory = Bio
::Tools
::Run
::StandAloneBlast
->new(@params);
88 # If "do" variable not set, do all four examples
90 &vary_params
($queryseq, $example1param, $example1values); # ex. 1
92 # To compare gap penalties of 7, 9 and 25 we need to set the
93 # scoring matrix to BLOSUM62 and extension penalty to 2 (these are
94 # limitations of BLAST)
96 $factory->MATRIX('BLOSUM62');
98 $factory->EXTENSION(2);
99 &vary_params
($queryseq, $example2param, $example2values); # ex. 2
101 # For the psiblast tests we want to restore gap opening and
102 # extension values to their defaults
105 $factory->EXTENSION(1);
106 # now do the mask comparison example and ..
107 &vary_masks
($queryseq, $maskvalues); # ex. 3
108 # do the jumpstart-align vs multiple iteration examples with the
109 # mask value set to 50%
110 &aligned_blast
($queryseq, $queryaln, $queryalnformat,
111 $jiter, $maskvalues->[0]); # ex. 4
112 } elsif ($do_only == 1) {
113 &vary_params
($queryseq,$example1param, $example1values);
114 } elsif ($do_only == 3) {
115 &vary_masks
($queryseq, $maskvalues);
116 } elsif ($do_only == 4 ) {
117 &aligned_blast
($queryseq, $queryaln, $queryalnformat, $jiter, $maskvalues->[0]);
128 #################################################
129 # compare_runs(): Prints out display of which hits were found by different methods
130 # Various methods are labeled by "tags" found in array @runtags
133 # $typetag - label describing type of "tags"
134 # $runtags - reference to array @runtags
135 # $hashhits - reference to hash of all the hits found by all runs (%hashhits)
136 # value for each hit is string which is the concatenation of all the "tags" of
137 # runs that found that hit
144 my $hashhits = shift;
148 print "Comparing BLAST results... \n";
150 # Get total number of hits found by any method
151 my $numhits = keys %$hashhits ; # scalar context to get total number of hits by all methods
152 print "Total number of hits found: $numhits \n";
154 # Get total number of hits found by every method
155 my $alltags = join ( "" , @
$runtags );
156 my @alltaghits = grep $$hashhits{$_} =~ /$alltags/ , keys %$hashhits;
157 print " Number of hits found by every method / parameter-value: " ,
158 scalar(@alltaghits), "\n";
160 # If one desires to see the hits found by all methods, uncomment next 2 lines
161 #print " Hits were found all methods / parameters: \n";
162 #print join ( "\n", @alltaghits ) , "\n";
164 # For each method/parameter-value (labeled by type) display hits found
165 # exclusively by that method
166 foreach $tag (@
$runtags) {
167 @taghits = grep $$hashhits{$_} =~ /^$tag$/ , keys %$hashhits;
168 print " Hits found only when $typetag was $tag: \n";
169 print join ( "\n", @taghits ) , "\n";
175 #################################################
176 # vary_params(): Example demonstrating varying of parameter
179 # $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
180 # $param - name of parameter to be varied
181 # $values - reference to array of values to be used for the parameter
188 my $queryseq = shift;
193 print "Beginning $param parameter-varying example... \n";
195 # Now we'll perform several blasts, 1 for each value of the
196 # selected parameter. In the first default case, we vary the
197 # MATRIX substitution parameter, creating 3 BLAST reports, using
198 # MATRIX values of BLOSUM62, BLOSUM80 or PAM70.
200 # In the second default case, we vary the GAP penalty parameter,
201 # creating 3 BLAST reports, using GAP penalties of 7, 9 and 25. In
202 # either case we then automatically parse the resulting report to
203 # identify which hits are found with any of the parameter values
204 # and which with only one of them.
207 # To test the BLAST results to some other parameter it is only
208 # necessary to change the parameters passed to the script on the
209 # commandline. The only tricky part is that the BLAST program
210 # itself only supports a limited range of parameters. See the
211 # BLAST documentation.
213 my ($report, $sbjct, $paramvalue);
215 my $hashhits = { }; # key is hit id, value is string of param values for which hit was found
217 foreach $paramvalue (@
$values) {
219 $factory->$param($paramvalue); # set parameter value
221 print "Performing BLAST with $param = $paramvalue \n";
223 $report = $factory->$executable($queryseq);
224 my $r = $report->next_result;
225 while( my $hit = $r->next_hit ) {
226 $hashhits->{$hit->name} .= "$paramvalue";
230 &compare_runs
( $param , $values , $hashhits);
236 #################################################
238 # vary_masks(): Example demonstrating varying of parameter
241 # $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
242 # $maskvalues - reference to array of values to be used for the mask threshold
245 # Now we'll perform several blasts, 1 for each value of the mask threshold.
246 # In the default case, we use thresholds of 25%, 50% and 75%. (Recall the threshold is
247 # % of resudues which must match the consensus residue before deciding to use the
248 # position specific scoring matrix rather than the default - BLOSUM or PAM - matrix)
249 # We then automatically parse the resulting reports to identify which hits
250 # are found with any of the mask threshold values and which with only one of them.
255 my $queryseq = shift;
259 print "Beginning mask-varying example... \n";
261 my ($report, $sbjct, $maskvalue);
263 my $hashhits = { }; # key is hit id, value is string of param values for which hit was found
265 # Get the alignment file
266 my $str = Bio
::AlignIO
->new(-file
=> "$queryaln", '-format' => "$queryalnformat", );
267 my $aln = $str->next_aln();
269 foreach $maskvalue (@
$values) {
271 print "Performing BLAST with mask threshold = $maskvalue % \n";
273 # Create the proper mask for 'jumpstarting'
274 my $mask = &create_mask
($aln, $maskvalue);
275 my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
276 my $r = $report2->next_result;
277 while($sbjct = $r->next_hit) {
278 $hashhits->{$sbjct->name} .= "$maskvalue";
282 &compare_runs
( 'mask threshold' , $values , $hashhits);
288 #################################################
293 # $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
294 # $queryaln - file containing alignment to be used to "jumpstart" psiblast in "-B mode"
295 # $queryaln *must contain $queryseq with the same name and length
296 # (psiblast is very picky)
297 # $queryalnformat - format of alignment (can = "fasta", "msf", etc)
298 # $jiter - number of iterations in psiblast run
299 # $maskvalue - threshold indicating how similar residues must be at a sequence location
300 # before position-specific-scoring matrix is used
301 # : "0" => use position specific matrix at all residues, or
302 # "100" => use default (eg BLOSUM) at all residues
306 # For this example, we'll compare the results of psiblast depending on whether psiblast itself is
308 # used to identify an alignment to use for blasting or whether an external alignment is given to
314 my $queryseq = shift;
315 my $queryaln = shift;
316 my $queryalnformat = shift;
318 my $maskvalue = shift;
323 print "\nBeginning aligned blast example... \n";
326 # First we do a single-iteration psiblast search but with a specified alignment to
327 # "jump start" psiblast
330 print "\nBeginning jump-start psiblast ... \n";
333 my $tag1 = 'jumpstart';
335 # $factory->j('1'); # perform single iteration
337 # Get the alignment file
338 my $str = Bio
::AlignIO
->new(-file
=> "$queryaln", '-format' => "$queryalnformat", );
339 my $aln = $str->next_aln();
342 # Create the proper mask for 'jumpstarting'
343 my $mask = &create_mask
($aln, $maskvalue);
346 my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
347 while($sbjct = $report2->next_result) {
348 $hashhits->{$sbjct->name} .= "$tag1";
351 # Then we do a "plain" psiblast multiple-iteration search
353 print "\nBeginning multiple-iteration psiblast ... \n";
356 $factory->B($undefineB);
358 my $tag2 = 'iterated';
359 $factory->j($jiter); # 'j' is blast parameter for # of iterations
360 my $report1 = $factory->blastpgp($queryseq);
361 my $total_iterations = $report1->number_of_iterations;
362 my $last_iteration = $report1->round($total_iterations);
365 while($sbjct = $last_iteration->next_result) {
366 $hashhits->{$sbjct->name} .= "$tag2";
369 # Now we compare the results of the searches
371 my $tagtype = 'iterated_or_jumpstart';
372 my $values = [ $tag1, $tag2];
374 &compare_runs
( $tagtype , $values , $hashhits);
381 #################################################
384 # create_mask(): creates a mask for the psiblast jumpstart alignment
385 # that determines at what residues position-specific
386 # scoring matrices (PSSMs) are used and at what
387 # residues default scoring matrices (eg BLOSUM) are
388 # used. See psiblast documentation for more details,
391 # $aln - SimpleAlign object with alignment
392 # $maskvalue - label describing type of "tags"
393 # returns: actual mask, ie a string of 0's and 1's which is the
394 # same length as each sequence in the alignment and has
395 # a "1" at locations where (PSSMs) are to be used
396 # and a "0" at all other locations.
401 my $maskvalue = shift;
404 die "psiblast jumpstart requires all sequences to be same length \n"
405 unless $aln->is_flush();
406 my $len = $aln->length();
408 if ($maskvalue =~ /^(\d){1,3}$/ ) {
409 $mask = $aln->consensus_string($maskvalue) ;
410 $mask =~ s/[^\?]/1/g ;
413 else { die "maskvalue must be an integer between 0 and 100 \n"; }
421 #-----------------------
422 # Prints usage information for general parameters.
424 print STDERR
<<"QQ_PARAMS_QQ";
426 Command-line accessible script variables and commands:
427 -------------------------------
428 -h : Display this usage info and exit.
429 -in <str> : File containing input sequences in fasta format (default = $queryseq) .
430 -inaln <str> : File containing input alignment for example 3 (default = $queryaln) .
431 -alnfmt <str> : Format of input alignment for example 3, eg "msf", "fasta", "pfam".
432 (default = $queryalnformat) .
433 -do <int> : Number of test to be executed ("1" => vary parameters,
434 "3" => compare iterated & jumpstart psiblast.) If omitted,
435 three default tests performed.
436 -exec <str> : Blast executable to be used in example 1. Can be "blastall" or
437 "blastpgp" (default is "blastpgp").
438 -param <str> : Parameter to be varied in example 1. Any blast parameter
439 can be varied (default = 'MATRIX')
440 -paramvals <str>: String containing parameter values in example 1, separated
441 by ":"'s. (default = 'BLOSUM62:BLOSUM80:PAM70')
442 -iter <int> : Maximum number of iterations in psiblast in example 3 (default = 2)
443 -maskvals <str>: String containing mask threshold values (in per-cents) for example 3,
444 separated by ":"'s. (default = '50:75:25')
446 In addition, any valid Blast parameter can be set using the syntax "parameter=>value" as in "database=>swissprot"
448 So some typical command lines might be:
449 >standaloneblast.pl -do 1 -param expectation -paramvals '1e-10:1e-5'
451 >standaloneblast.pl -do 1 -exec blastall -param q -paramvals '-1:-7' -in='t/dna1.fa' "pr=>blastn" "d=>ecoli.nt"
453 >standaloneblast.pl -do 4 -maskvals 0 -iter 3
455 >standaloneblast.pl -do 3 -maskvals '10:50:90' -in 't/data/cysprot1.fa' -alnfmt msf -inaln 't/cysprot.msf'