3 # BioPerl module for Bio::Align::ProteinStatistics
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Align::ProteinStatistics - Calculate Protein Alignment statistics (mostly distances)
19 use Bio::Align::ProteinStatistics;
21 my $in = Bio::AlignIO->new(-format => 'fasta',
22 -file => 'pep-104.fasaln');
23 my $aln = $in->next_aln;
25 my $pepstats = Bio::Align::ProteinStatistics->new();
26 $kimura = $protstats->distance(-align => $aln,
28 print $kimura->print_matrix;
33 This object is for generating various statistics from a protein
34 alignment. Mostly it is where pairwise protein distances can be
39 D_Kimura - Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP,
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to
48 the Bioperl mailing list. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55 Report bugs to the Bioperl bug tracking system to help us keep track
56 of the bugs and their resolution. Bug reports can be submitted via the
59 http://bugzilla.open-bio.org/
61 =head1 AUTHOR - Jason Stajich
63 Email jason-at-bioperl.org
67 The rest of the documentation details each of the object methods.
68 Internal methods are usually preceded with a _
73 # Let the code begin...
76 package Bio
::Align
::ProteinStatistics
;
77 use vars
qw(%DistanceMethods $Precision $DefaultGapPenalty);
80 use Bio::Align::PairwiseStatistics;
81 use Bio::Matrix::PhylipDist;
83 %DistanceMethods = ('kimura|k' => 'Kimura',
86 $DefaultGapPenalty = 0;
88 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
93 Usage : my $obj = Bio::Align::ProteinStatistics->new();
94 Function: Builds a new Bio::Align::ProteinStatistics object
95 Returns : an instance of Bio::Align::ProteinStatistics
102 my($class,@args) = @_;
104 my $self = $class->SUPER::new
(@args);
105 $self->pairwise_stats( Bio
::Align
::PairwiseStatistics
->new());
113 Usage : my $distance_mat = $stats->distance(-align => $aln,
115 Function: Calculates a distance matrix for all pairwise distances of
116 sequences in an alignment.
117 Returns : L<Bio::Matrix::PhylipDist> object
118 Args : -align => Bio::Align::AlignI object
119 -method => String specifying specific distance method
120 (implementing class may assume a default)
125 my ($self,@args) = @_;
126 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
127 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
128 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
130 $method ||= 'Kimura';
131 foreach my $m ( keys %DistanceMethods ) {
132 if(defined $m && $method =~ /$m/i ) {
133 my $mtd = "D_$DistanceMethods{$m}";
134 return $self->$mtd($aln);
137 $self->warn("Unrecognized distance method $method must be one of [".
138 join(',',$self->available_distance_methods())."]");
142 =head2 available_distance_methods
144 Title : available_distance_methods
145 Usage : my @methods = $stats->available_distance_methods();
146 Function: Enumerates the possible distance methods
147 Returns : Array of strings
153 sub available_distance_methods
{
154 my ($self,@args) = @_;
155 return values %DistanceMethods;
158 =head2 D - distance methods
167 Usage : my $matrix = $pepstats->D_Kimura($aln);
168 Function: Calculate Kimura protein distance (Kimura 1983) which
169 approximates PAM distance
170 D = -ln ( 1 - p - 0.2 * p^2 )
171 Returns : L<Bio::Matrix::PhylipDist>
172 Args : L<Bio::Align::AlignI>
177 # Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge.
180 my ($self,$aln) = @_;
181 return 0 unless $self->_check_arg($aln);
182 # ambiguities ignored at this point
183 my (@seqs,@names,@values,%dist);
185 foreach my $seq ( $aln->each_seq) {
186 push @names, $seq->display_id;
187 push @seqs, uc $seq->seq();
190 my $len = $aln->length;
191 my $precisionstr = "%.$Precision"."f";
193 for( my $i = 0; $i < $seqct-1; $i++ ) {
194 # (diagonals) distance is 0 for same sequence
195 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
196 $values[$i][$i] = sprintf($precisionstr,0);
197 for( my $j = $i+1; $j < $seqct; $j++ ) {
198 my ($scored,$match) = (0,0);
199 for( my $k=0; $k < $len; $k++ ) {
200 my $m1 = substr($seqs[$i],$k,1);
201 my $m2 = substr($seqs[$j],$k,1);
202 if( $m1 ne '-' && $m2 ne '-' ) {
203 # score is number of scored bases (alignable bases)
204 # it could have also come from
205 # my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
206 # match is number of matches weighting ambiguity bases
208 $match += _check_ambiguity_protein
($m1,$m2);
212 # From Felsenstein's PHYLIP documentation:
213 # This is very quick to do but has some obvious
214 # limitations. It does not take into account which amino
215 # acids differ or to what amino acids they change, so some
216 # information is lost. The units of the distance measure
217 # are fraction of amino acids differing, as also in the
218 # case of the PAM distance. If the fraction of amino acids
219 # differing gets larger than 0.8541 the distance becomes
222 my $D = 1 - ( $match / $scored );
224 $D = - log ( 1 - $D - (0.2 * ($D ** 2)));
225 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
227 $values[$j][$i] = $values[$i][$j] = ' NaN';
230 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
231 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
233 # (diagonals) distance is 0 for same sequence
234 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
235 $values[$j][$j] = sprintf($precisionstr,0);
239 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_PEPstats',
242 -values => \
@values);
246 # some methods from EMBOSS distmat
247 sub _check_ambiguity_protein
252 if( $t1 ne 'X' && $t1 eq $t2 ) {
254 } elsif( ((($t1 eq 'B' && $t2 eq 'DN') ||
255 ($t2 eq 'B' && $t2 eq 'DN'))) ||
257 ( ($t1 eq 'Z' && $t2 eq 'EQ') ||
258 ($t2 eq 'Z' && $t1 eq 'EQ'))) {
260 } elsif ( $t1 eq 'X' && $t2 eq 'X' ) {
262 } elsif( $t1 eq 'X' || $t2 eq 'X' ) {
272 =head2 pairwise_stats
274 Title : pairwise_stats
275 Usage : $obj->pairwise_stats($newval)
277 Returns : value of pairwise_stats
278 Args : newvalue (optional)
284 my ($self,$value) = @_;
285 if( defined $value) {
286 $self->{'_pairwise_stats'} = $value;
288 return $self->{'_pairwise_stats'};
293 my($self,$aln ) = @_;
294 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
295 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
297 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'protein' ) {
298 $self->warn("Must provide a protein alignment to Bio::Align::ProteinStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);