add latest changes to, um, Changes
[bioperl-live.git] / Bio / Align / ProteinStatistics.pm
blob430d77eddd5e5800a9907d23ec9e526736c91f68
2 # BioPerl module for Bio::Align::ProteinStatistics
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Align::ProteinStatistics - Calculate Protein Alignment statistics (mostly distances)
18 =head1 SYNOPSIS
20 use Bio::Align::ProteinStatistics;
21 use Bio::AlignIO;
22 my $in = Bio::AlignIO->new(-format => 'fasta',
23 -file => 'pep-104.fasaln');
24 my $aln = $in->next_aln;
26 my $pepstats = Bio::Align::ProteinStatistics->new();
27 $kimura = $protstats->distance(-align => $aln,
28 -method => 'Kimura');
29 print $kimura->print_matrix;
32 =head1 DESCRIPTION
34 This object is for generating various statistics from a protein
35 alignment. Mostly it is where pairwise protein distances can be
36 calculated.
38 =head1 REFERENCES
40 D_Kimura - Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP,
41 Cambridge.
43 =head1 FEEDBACK
45 =head2 Mailing Lists
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to
49 the Bioperl mailing list. Your participation is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 of the bugs and their resolution. Bug reports can be submitted via the
69 web:
71 https://github.com/bioperl/bioperl-live/issues
73 =head1 AUTHOR - Jason Stajich
75 Email jason-at-bioperl.org
77 =head1 APPENDIX
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
82 =cut
85 # Let the code begin...
88 package Bio::Align::ProteinStatistics;
89 use vars qw(%DistanceMethods $Precision $DefaultGapPenalty);
90 use strict;
92 use Bio::Align::PairwiseStatistics;
93 use Bio::Matrix::PhylipDist;
95 %DistanceMethods = ('kimura|k' => 'Kimura',
97 $Precision = 5;
98 $DefaultGapPenalty = 0;
100 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
102 =head2 new
104 Title : new
105 Usage : my $obj = Bio::Align::ProteinStatistics->new();
106 Function: Builds a new Bio::Align::ProteinStatistics object
107 Returns : an instance of Bio::Align::ProteinStatistics
108 Args :
111 =cut
113 sub new {
114 my($class,@args) = @_;
116 my $self = $class->SUPER::new(@args);
117 $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
119 return $self;
122 =head2 distance
124 Title : distance
125 Usage : my $distance_mat = $stats->distance(-align => $aln,
126 -method => $method);
127 Function: Calculates a distance matrix for all pairwise distances of
128 sequences in an alignment.
129 Returns : L<Bio::Matrix::PhylipDist> object
130 Args : -align => Bio::Align::AlignI object
131 -method => String specifying specific distance method
132 (implementing class may assume a default)
134 =cut
136 sub distance{
137 my ($self,@args) = @_;
138 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
139 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
140 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
142 $method ||= 'Kimura';
143 foreach my $m ( keys %DistanceMethods ) {
144 if(defined $m && $method =~ /$m/i ) {
145 my $mtd = "D_$DistanceMethods{$m}";
146 return $self->$mtd($aln);
149 $self->warn("Unrecognized distance method $method must be one of [".
150 join(',',$self->available_distance_methods())."]");
151 return;
154 =head2 available_distance_methods
156 Title : available_distance_methods
157 Usage : my @methods = $stats->available_distance_methods();
158 Function: Enumerates the possible distance methods
159 Returns : Array of strings
160 Args : none
163 =cut
165 sub available_distance_methods{
166 my ($self,@args) = @_;
167 return values %DistanceMethods;
170 =head2 D - distance methods
173 =cut
176 =head2 D_Kimura
178 Title : D_Kimura
179 Usage : my $matrix = $pepstats->D_Kimura($aln);
180 Function: Calculate Kimura protein distance (Kimura 1983) which
181 approximates PAM distance
182 D = -ln ( 1 - p - 0.2 * p^2 )
183 Returns : L<Bio::Matrix::PhylipDist>
184 Args : L<Bio::Align::AlignI>
187 =cut
189 # Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge.
191 sub D_Kimura{
192 my ($self,$aln) = @_;
193 return 0 unless $self->_check_arg($aln);
194 # ambiguities ignored at this point
195 my (@seqs,@names,@values,%dist);
196 my $seqct = 0;
197 foreach my $seq ( $aln->each_seq) {
198 push @names, $seq->display_id;
199 push @seqs, uc $seq->seq();
200 $seqct++;
202 my $len = $aln->length;
203 my $precisionstr = "%.$Precision"."f";
205 for( my $i = 0; $i < $seqct-1; $i++ ) {
206 # (diagonals) distance is 0 for same sequence
207 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
208 $values[$i][$i] = sprintf($precisionstr,0);
209 for( my $j = $i+1; $j < $seqct; $j++ ) {
210 my ($scored,$match) = (0,0);
211 for( my $k=0; $k < $len; $k++ ) {
212 my $m1 = substr($seqs[$i],$k,1);
213 my $m2 = substr($seqs[$j],$k,1);
214 if( $m1 ne '-' && $m2 ne '-' ) {
215 # score is number of scored bases (alignable bases)
216 # it could have also come from
217 # my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
218 # match is number of matches weighting ambiguity bases
219 # as well
220 $match += _check_ambiguity_protein($m1,$m2);
221 $scored++;
224 # From Felsenstein's PHYLIP documentation:
225 # This is very quick to do but has some obvious
226 # limitations. It does not take into account which amino
227 # acids differ or to what amino acids they change, so some
228 # information is lost. The units of the distance measure
229 # are fraction of amino acids differing, as also in the
230 # case of the PAM distance. If the fraction of amino acids
231 # differing gets larger than 0.8541 the distance becomes
232 # infinite.
234 my $D = 1 - ( $match / $scored );
235 if( $D < 0.8541 ) {
236 $D = - log ( 1 - $D - (0.2 * ($D ** 2)));
237 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
238 } else {
239 $values[$j][$i] = $values[$i][$j] = ' NaN';
241 # fwd and rev lookup
242 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
243 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
245 # (diagonals) distance is 0 for same sequence
246 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
247 $values[$j][$j] = sprintf($precisionstr,0);
251 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_PEPstats',
252 -matrix => \%dist,
253 -names => \@names,
254 -values => \@values);
258 # some methods from EMBOSS distmat
259 sub _check_ambiguity_protein
261 my ($t1,$t2) = @_;
262 my $n = 0;
264 if( $t1 ne 'X' && $t1 eq $t2 ) {
265 $n = 1.0;
266 } elsif( (($t1 eq 'B' && $t2 =~ /[DN]/ ) ||
267 ($t2 eq 'B' && $t1 =~ /[DN]/ )) ||
269 (($t1 eq 'Z' && $t2 =~ /[EQ]/) ||
270 ($t2 eq 'Z' && $t1 =~ /[EQ]/ ))) {
271 $n = 0.5;
272 } elsif ( $t1 eq 'X' && $t2 eq 'X' ) {
273 $n = 0.0025;
274 } elsif( $t1 eq 'X' || $t2 eq 'X' ) {
275 $n = 0.05;
277 return $n;
280 =head2 Data Methods
282 =cut
284 =head2 pairwise_stats
286 Title : pairwise_stats
287 Usage : $obj->pairwise_stats($newval)
288 Function:
289 Returns : value of pairwise_stats
290 Args : newvalue (optional)
293 =cut
295 sub pairwise_stats{
296 my ($self,$value) = @_;
297 if( defined $value) {
298 $self->{'_pairwise_stats'} = $value;
300 return $self->{'_pairwise_stats'};
304 sub _check_arg {
305 my($self,$aln ) = @_;
306 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
307 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
308 return 0;
309 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'protein' ) {
310 $self->warn("Must provide a protein alignment to Bio::Align::ProteinStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
311 return 0;
313 return 1;