bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / Align / ProteinStatistics.pm
blob57e497e8562bcedc22545a8eb1ac1676e5fa23dd
1 # $Id$
3 # BioPerl module for Bio::Align::ProteinStatistics
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Align::ProteinStatistics - Calculate Protein Alignment statistics (mostly distances)
17 =head1 SYNOPSIS
19 use Bio::Align::ProteinStatistics;
20 use Bio::AlignIO;
21 my $in = Bio::AlignIO->new(-format => 'fasta',
22 -file => 'pep-104.fasaln');
23 my $aln = $in->next_aln;
25 my $pepstats = Bio::Align::ProteinStatistics->new();
26 $kimura = $protstats->distance(-align => $aln,
27 -method => 'Kimura');
28 print $kimura->print_matrix;
31 =head1 DESCRIPTION
33 This object is for generating various statistics from a protein
34 alignment. Mostly it is where pairwise protein distances can be
35 calculated.
37 =head1 REFERENCES
39 D_Kimura - Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP,
40 Cambridge.
42 =head1 FEEDBACK
44 =head2 Mailing Lists
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to
48 the Bioperl mailing list. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 =head2 Reporting Bugs
55 Report bugs to the Bioperl bug tracking system to help us keep track
56 of the bugs and their resolution. Bug reports can be submitted via the
57 web:
59 http://bugzilla.open-bio.org/
61 =head1 AUTHOR - Jason Stajich
63 Email jason-at-bioperl.org
65 =head1 APPENDIX
67 The rest of the documentation details each of the object methods.
68 Internal methods are usually preceded with a _
70 =cut
73 # Let the code begin...
76 package Bio::Align::ProteinStatistics;
77 use vars qw(%DistanceMethods $Precision $DefaultGapPenalty);
78 use strict;
80 use Bio::Align::PairwiseStatistics;
81 use Bio::Matrix::PhylipDist;
83 %DistanceMethods = ('kimura|k' => 'Kimura',
85 $Precision = 5;
86 $DefaultGapPenalty = 0;
88 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
90 =head2 new
92 Title : new
93 Usage : my $obj = Bio::Align::ProteinStatistics->new();
94 Function: Builds a new Bio::Align::ProteinStatistics object
95 Returns : an instance of Bio::Align::ProteinStatistics
96 Args :
99 =cut
101 sub new {
102 my($class,@args) = @_;
104 my $self = $class->SUPER::new(@args);
105 $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
107 return $self;
110 =head2 distance
112 Title : distance
113 Usage : my $distance_mat = $stats->distance(-align => $aln,
114 -method => $method);
115 Function: Calculates a distance matrix for all pairwise distances of
116 sequences in an alignment.
117 Returns : L<Bio::Matrix::PhylipDist> object
118 Args : -align => Bio::Align::AlignI object
119 -method => String specifying specific distance method
120 (implementing class may assume a default)
122 =cut
124 sub distance{
125 my ($self,@args) = @_;
126 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
127 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
128 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
130 $method ||= 'Kimura';
131 foreach my $m ( keys %DistanceMethods ) {
132 if(defined $m && $method =~ /$m/i ) {
133 my $mtd = "D_$DistanceMethods{$m}";
134 return $self->$mtd($aln);
137 $self->warn("Unrecognized distance method $method must be one of [".
138 join(',',$self->available_distance_methods())."]");
139 return;
142 =head2 available_distance_methods
144 Title : available_distance_methods
145 Usage : my @methods = $stats->available_distance_methods();
146 Function: Enumerates the possible distance methods
147 Returns : Array of strings
148 Args : none
151 =cut
153 sub available_distance_methods{
154 my ($self,@args) = @_;
155 return values %DistanceMethods;
158 =head2 D - distance methods
161 =cut
164 =head2 D_Kimura
166 Title : D_Kimura
167 Usage : my $matrix = $pepstats->D_Kimura($aln);
168 Function: Calculate Kimura protein distance (Kimura 1983) which
169 approximates PAM distance
170 D = -ln ( 1 - p - 0.2 * p^2 )
171 Returns : L<Bio::Matrix::PhylipDist>
172 Args : L<Bio::Align::AlignI>
175 =cut
177 # Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge.
179 sub D_Kimura{
180 my ($self,$aln) = @_;
181 return 0 unless $self->_check_arg($aln);
182 # ambiguities ignored at this point
183 my (@seqs,@names,@values,%dist);
184 my $seqct = 0;
185 foreach my $seq ( $aln->each_seq) {
186 push @names, $seq->display_id;
187 push @seqs, uc $seq->seq();
188 $seqct++;
190 my $len = $aln->length;
191 my $precisionstr = "%.$Precision"."f";
193 for( my $i = 0; $i < $seqct-1; $i++ ) {
194 # (diagonals) distance is 0 for same sequence
195 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
196 $values[$i][$i] = sprintf($precisionstr,0);
197 for( my $j = $i+1; $j < $seqct; $j++ ) {
198 my ($scored,$match) = (0,0);
199 for( my $k=0; $k < $len; $k++ ) {
200 my $m1 = substr($seqs[$i],$k,1);
201 my $m2 = substr($seqs[$j],$k,1);
202 if( $m1 ne '-' && $m2 ne '-' ) {
203 # score is number of scored bases (alignable bases)
204 # it could have also come from
205 # my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
206 # match is number of matches weighting ambiguity bases
207 # as well
208 $match += _check_ambiguity_protein($m1,$m2);
209 $scored++;
212 # From Felsenstein's PHYLIP documentation:
213 # This is very quick to do but has some obvious
214 # limitations. It does not take into account which amino
215 # acids differ or to what amino acids they change, so some
216 # information is lost. The units of the distance measure
217 # are fraction of amino acids differing, as also in the
218 # case of the PAM distance. If the fraction of amino acids
219 # differing gets larger than 0.8541 the distance becomes
220 # infinite.
222 my $D = 1 - ( $match / $scored );
223 if( $D < 0.8541 ) {
224 $D = - log ( 1 - $D - (0.2 * ($D ** 2)));
225 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
226 } else {
227 $values[$j][$i] = $values[$i][$j] = ' NaN';
229 # fwd and rev lookup
230 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
231 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
233 # (diagonals) distance is 0 for same sequence
234 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
235 $values[$j][$j] = sprintf($precisionstr,0);
239 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_PEPstats',
240 -matrix => \%dist,
241 -names => \@names,
242 -values => \@values);
246 # some methods from EMBOSS distmat
247 sub _check_ambiguity_protein
249 my ($t1,$t2) = @_;
250 my $n = 0;
252 if( $t1 ne 'X' && $t1 eq $t2 ) {
253 $n = 1.0;
254 } elsif( ((($t1 eq 'B' && $t2 eq 'DN') ||
255 ($t2 eq 'B' && $t2 eq 'DN'))) ||
257 ( ($t1 eq 'Z' && $t2 eq 'EQ') ||
258 ($t2 eq 'Z' && $t1 eq 'EQ'))) {
259 $n = 0.5;
260 } elsif ( $t1 eq 'X' && $t2 eq 'X' ) {
261 $n = 0.0025;
262 } elsif( $t1 eq 'X' || $t2 eq 'X' ) {
263 $n = 0.05;
265 return $n;
268 =head2 Data Methods
270 =cut
272 =head2 pairwise_stats
274 Title : pairwise_stats
275 Usage : $obj->pairwise_stats($newval)
276 Function:
277 Returns : value of pairwise_stats
278 Args : newvalue (optional)
281 =cut
283 sub pairwise_stats{
284 my ($self,$value) = @_;
285 if( defined $value) {
286 $self->{'_pairwise_stats'} = $value;
288 return $self->{'_pairwise_stats'};
292 sub _check_arg {
293 my($self,$aln ) = @_;
294 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
295 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
296 return 0;
297 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'protein' ) {
298 $self->warn("Must provide a protein alignment to Bio::Align::ProteinStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
299 return 0;
301 return 1;