remote database change (ensembl ID not present in mapper any more)
[bioperl-live.git] / examples / align / clustalw.pl
blobdb6caee566fd8ec0ec4a99564c1721aa83dd0fcb
1 #!/usr/bin/perl
2 # PROGRAM : clustalw.pl
3 # PURPOSE : Demonstrate possible uses of Bio::Tools::Run::Alignment::Clustalw.pm
4 # AUTHOR : Peter Schattner schattner@alum.mit.edu
5 # CREATED : Oct 06 2000
6 # REVISION : $Id$
8 # INSTALLATION
10 # You will need to have installed clustalw and to ensure that Clustalw.pm can find it.
11 # This can be done in different ways (bash syntax):
12 # export PATH=$PATH:/home/peter/clustalw1.8
13 # or
14 # define an environmental variable CLUSTALDIR:
15 # export CLUSTALDIR=/home/peter/clustalw1.8
16 # or
17 # include a definition of an environmental variable CLUSTALDIR in every
18 # script that will use Clustal.pm.
19 # BEGIN {$ENV{CLUSTALDIR} = '/home/peter/clustalw1.8/'; }
21 # We are going to demonstrate 3 possible applications of Clustalw.pm:
22 # 1. Test effect of varying clustalw alignment parameter(s) on resulting alignment
23 # 2. Test effect of changing the order that sequences are added to the alignment
24 # on the resulting alignment
25 # 3. Test effect of incorporating an "anchor point" in the alignment process
27 # Before we can do any tests, we need to set up the environment, create the factory
28 # and read in the unaligned sequences.
31 #BEGIN {
32 # $ENV{CLUSTALDIR} = '/home/peter/clustalw1.8/';
35 use Getopt::Long;
36 use Bio::Tools::Run::Alignment::Clustalw;
37 use Bio::SimpleAlign;
38 use Bio::AlignIO;
39 use Bio::SeqIO;
40 use strict;
42 # set some default values
43 my $infile = 't/data/cysprot1a.fa';
44 my @params = ('quiet' => 1 );
45 my $do_only = '123'; # string listing examples to be executed. Default is to
46 # execute all tests (ie 1,2 and 3)
47 my $param = 'ktuple'; # parameter to be varied in example 1
48 my $startvalue = 1; # initial value for parameter $param
49 my $stopvalue = 3; # final value for parameter $param
50 my $regex = 'W[AT]F'; # regular expression for 'anchoring' alignment in example 3
51 my $extension = 30; # distance regexp anchor should be extended in each direction
52 # for local alignment in example 3
53 my $helpflag = 0; # Flag to show usage info.
55 # get user options
56 my @argv = @ARGV; # copy ARGV before GetOptions() massacres it.
58 &GetOptions("h!" => \$helpflag, "help!" => \$helpflag,
59 "in=s" => \$infile,
60 "param=s" => \$param,
61 "do=s" => \$do_only,
62 "start=i" => \$startvalue,
63 "stop=i" => \$stopvalue,
64 "ext=i" => \$extension,
65 "regex=s" => \$regex,) ;
67 if ($helpflag) { &clustalw_usage(); exit 0;}
69 # create factory & set user-specified global clustalw parameters
70 foreach my $argv (@argv) {
71 unless ($argv =~ /^(.*)=>(.*)$/) { next;}
72 push (@params, $1 => $2);
74 my $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
77 # put unaligned sequences in a Bio::Seq array
78 my $str = Bio::SeqIO->new(-file=> $infile, '-format' => 'Fasta');
79 my ($paramvalue, $aln, $subaln, @consensus, $seq_num, $string, $strout, $id);
80 my @seq_array =();
81 while ( my $seq = $str->next_seq() ) { push (@seq_array, $seq) ;}
83 # Do each example that has digit present in variable $do_only
84 $_= $do_only;
85 /1/ && &vary_params();
86 /2/ && &vary_align_order();
87 /3/ && &anchored_align();
89 ## End of "main"
91 #################################################
92 # vary_params(): Example demonstrating varying of clustalw parameter
95 sub vary_params {
97 print "Beginning parameter-varying example... \n";
99 # Now we'll create several alignments, 1 for each value of the selected
100 # parameter. We also compute a simple consensus string for each alignment.
101 # (In the default case, we vary the "ktuple" parameter, creating 3
102 # alignments using ktuple values from 1 to 3.)
104 my $index =0;
105 for ($paramvalue = $startvalue; $paramvalue < ($stopvalue + 1); $paramvalue++) {
106 $factory->$param($paramvalue); # set parameter value
107 print "Performing alignment with $param = $paramvalue \n";
108 $aln = $factory->align(\@seq_array);
109 $string = $aln->consensus_string(); # Get consensus of alignment
110 # convert '?' to 'X' at non-consensus positions
111 $string =~ s/\?/X/g;
112 $consensus[$index] = Bio::Seq->new(-id=>"$param=$paramvalue",-seq=>$string);
113 $index++;
115 # Compare consensus strings for alignments with different $param values by
116 # making an alignment of the different consensus strings
117 # $factory->ktuple(1); # set ktuple parameter
118 print "Performing alignment of $param consensus sequences \n";
119 $aln = $factory->align(\@consensus);
120 $strout = Bio::AlignIO->newFh('-format' => 'msf');
121 print $strout $aln;
123 return 1;
127 #################################################
128 # vary_align_order():
130 # For our second example, we'll test the effect of changing the order
131 # that sequences are added to the alignment
133 sub vary_align_order {
135 print "\nBeginning alignment-order-changing example... \n";
137 @consensus = (); # clear array
138 for ($seq_num = 0; $seq_num < scalar(@seq_array); $seq_num++) {
139 my $obj_out = shift @seq_array; # remove one Seq object from array and save
140 $id = $obj_out->display_id;
141 # align remaining sequences
142 print "Performing alignment with sequence $id left out \n";
143 $subaln = $factory->align(\@seq_array);
144 # add left-out sequence to subalignment
145 $aln = $factory->profile_align($subaln,$obj_out);
146 $string = $aln->consensus_string(); # Get consensus of alignment
147 # convert '?' to 'X' for non-consensus positions
148 $string =~ s/\?/X/g;
149 $consensus[$seq_num] = Bio::Seq->new(-id=>"$id left out",-seq=>$string);
150 push @seq_array, $obj_out; # return Seq object for next (sub) alignment
153 # Compare consensus strings for alignments created in different orders
154 # $factory->ktuple(1); # set ktuple parameter
155 print "\nPerforming alignment of consensus sequences for different reorderings \n";
156 print "Each consensus is labeled by the sequence which was omitted in the initial alignment\n";
157 $aln = $factory->align(\@consensus);
158 $strout = Bio::AlignIO->newFh('-format' => 'msf');
159 print $strout $aln;
161 return 1;
164 #################################################
165 # anchored_align()
167 # For our last example, we'll test a way to perform a local alignment by
168 # "anchoring" the alignment to a regular expression. This is similar
169 # to the approach taken in the recent dbclustal program.
170 # In principle, we could write a script to search for a good regular expression
171 # to use. Instead, here we'll simply choose one manually after looking at the
172 # previous alignments.
174 sub anchored_align {
176 my @local_array = ();
177 my @seqs_not_matched = ();
179 print "\n Beginning anchored-alignment example... \n";
181 for ($seq_num = 0; $seq_num < scalar(@seq_array); $seq_num++) {
182 my $seqobj = $seq_array[$seq_num];
183 my $seq = $seqobj->seq();
184 my $id = $seqobj->id();
185 # if $regex is not found in the sequence, save sequence id name and set
186 # array value =0 for later
187 unless ($seq =~/$regex/) {
188 $local_array[$seq_num] = 0;
189 push (@seqs_not_matched, $id) ;
190 next;
192 # find positions of start and of subsequence to be aligned
193 my $match_start_pos = length($`);
194 my $match_stop_pos = length($`) + length($&);
195 my $start = ($match_start_pos - $extension) > 1 ?
196 ($match_start_pos - $extension) +1 : 1;
197 my $stop = ($match_stop_pos + $extension) < length($seq) ?
198 ($match_stop_pos + $extension) : length($seq);
199 my $string = $seqobj->subseq($start, $stop);
201 $local_array[$seq_num] = Bio::Seq->new(-id=>$id, -seq=>$string);
203 @local_array = grep $_ , @local_array; # remove array entries with no match
205 # Perform alignment on the local segments of the sequences which match "anchor"
206 $aln = $factory->align(\@local_array);
207 my $consensus = $aln->consensus_string(); # Get consensus of local alignment
209 if (scalar(@seqs_not_matched) ) {
210 print " Sequences not matching $regex : @seqs_not_matched \n"
211 } else {
212 print " All sequences match $regex : @seqs_not_matched \n"
214 print "Consensus sequence of local alignment: $consensus \n";
216 return 1;
219 #----------------
220 sub clustalw_usage {
221 #----------------
223 #-----------------------
224 # Prints usage information for general parameters.
226 print STDERR <<"QQ_PARAMS_QQ";
228 Command-line accessible script variables and commands:
229 -------------------------------
230 -h : Display this usage info and exit.
231 -in <str> : File containing input sequences in fasta format (default = $infile) .
232 -do <str> : String listing examples to be executed. Default is to execute
233 all tests (ie default = '123')
234 -param <str> : Parameter to be varied in example 1. Any clustalw parameter
235 which takes inteer values can be varied (default = 'ktuple')
236 -start <int> : Initial value for varying parameter in example 1 (default = 1)
237 -stop <int> : Final value for varying parameter (default = 3)
238 -regex <str> : Regular expression for 'anchoring' alignment in example 3
239 (default = $regex)
240 -ext <int> : Distance regexp anchor should be extended in each direction
241 for local alignment in example 3 (default = 30)
243 In addition, any valid Clustalw parameter can be set using the syntax
244 "parameter=>value" as in "ktuple=>3"
246 So a typical command lines might be:
247 > clustalw.pl -param=pairgap -start=2 -stop=3 -do=1 "ktuple=>3"
249 > clustalw.pl -ext=10 -regex='W[AST]F' -do=23 -in='t/cysprot1a.fa'
251 QQ_PARAMS_QQ