[bug 2686]
[bioperl-live.git] / Bio / Align / DNAStatistics.pm
blob75979e660327b01ff682dbdafe669d62cf9d19f1
1 # $Id$
3 # BioPerl module for Bio::Align::DNAStatistics
5 # Cared for by Jason Stajich <jason-AT-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
17 =head1 SYNOPSIS
19 use Bio::AlignIO;
20 use Bio::Align::DNAStatistics;
22 my $stats = Bio::Align::DNAStatistics->new();
23 my $alignin = Bio::AlignIO->new(-format => 'emboss',
24 -file => 't/data/insulin.water');
25 my $aln = $alignin->next_aln;
26 my $jcmatrix = $stats->distance(-align => $aln,
27 -method => 'Jukes-Cantor');
29 print $jcmatrix->print_matrix;
30 ## and for measurements of synonymous /nonsynonymous substitutions ##
32 my $in = Bio::AlignIO->new(-format => 'fasta',
33 -file => 't/data/nei_gojobori_test.aln');
34 my $alnobj = $in->next_aln;
35 my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
36 my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
37 print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
38 for (sort keys %{$results->[0]} ){
39 next if /Seq/;
40 printf("%-9s %.4f \n",$_ , $results->[0]{$_});
43 my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
44 for my $an (@$results2){
45 print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
46 for (sort keys %$an ){
47 next if /Seq/;
48 printf("%-9s %.4f \n",$_ , $an->{$_});
50 print "\n\n";
53 my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
54 for (sort keys %$result3 ){
55 next if /Seq/;
56 printf("%-9s %.4f \n",$_ , $result3->{$_});
59 =head1 DESCRIPTION
61 This object contains routines for calculating various statistics and
62 distances for DNA alignments. The routines are not well tested and do
63 contain errors at this point. Work is underway to correct them, but
64 do not expect this code to give you the right answer currently! Use
65 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
66 distances.
69 Several different distance method calculations are supported. Listed
70 in brackets are the pattern which will match
72 =over 3
74 =item JukesCantor [jc|jukes|jukescantor|jukes-cantor]
76 =item Uncorrected [jcuncor|uncorrected]
78 =item F81 [f81|felsenstein]
80 =item Kimura [k2|k2p|k80|kimura]
82 =item Tamura [t92|tamura|tamura92]
84 =item F84 [f84|felsenstein84]
86 =item TajimaNei [tajimanei|tajima\-nei]
88 =item JinNei [jinnei|jin\-nei] (not implemented)
90 =back
92 There are also three methods to calculate the ratio of synonymous to
93 non-synonymous mutations. All are implementations of the Nei-Gojobori
94 evolutionary pathway method and use the Jukes-Cantor method of
95 nucleotide substitution. This method works well so long as the
96 nucleotide frequencies are roughly equal and there is no significant
97 transition/transversion bias. In order to use these methods there are
98 several pre-requisites for the alignment.
100 =over 3
102 =item 1
104 DNA alignment must be based on protein alignment. Use the subroutine
105 L<aa_to_dna_aln> in Bio::Align::Utilities to achieve this.
107 =item 2
109 Therefore alignment gaps must be in multiples of 3 (representing an aa
110 deletion/insertion) and at present must be indicated by a '-' symbol.
112 =item 3
114 Alignment must be solely of coding region and be in reading frame 0 to
115 achieve meaningful results
117 =item 4
119 Alignment must therefore be a multiple of 3 nucleotides long.
121 =item 5
123 All sequences must be the same length (including gaps). This should be
124 the case anyway if the sequences have been automatically aligned using
125 a program like Clustal.
127 =item 6
129 Only the standard codon alphabet is supported at present.
131 =back
133 calc_KaKs_pair() calculates a number of statistics for a named pair of
134 sequences in the alignment.
136 calc_all_KaKs_pairs() calculates these statistics for all pairwise
137 comparisons in an MSA. The statistics returned are:
139 =over 3
141 =item S_d
143 Number of synonymous mutations between the 2 sequences.
145 =item N_d
147 Number of non-synonymous mutations between the 2 sequences.
149 =item S
151 Mean number of synonymous sites in both sequences.
153 =item N
155 mean number of synonymous sites in both sequences.
157 =item P_s
159 proportion of synonymous differences in both sequences given by P_s = S_d/S.
161 =item P_n
163 proportion of non-synonymous differences in both sequences given by P_n = S_n/S.
165 =item D_s
167 estimation of synonymous mutations per synonymous site (by Jukes-Cantor).
169 =item D_n
171 estimation of non-synonymous mutations per non-synonymous site (by Jukes-Cantor).
173 =item D_n_var
175 estimation of variance of D_n .
177 =item D_s_var
179 estimation of variance of S_n.
181 =item z_value
183 calculation of z value.Positive value indicates D_n E<gt> D_s,
184 negative value indicates D_s E<gt> D_n.
186 =back
188 The statistics returned by calc_average_KaKs are:
190 =over 3
192 =item D_s
194 Average number of synonymous mutations/synonymous site.
196 =item D_n
198 Average number of non-synonymous mutations/non-synonymous site.
200 =item D_s_var
202 Estimated variance of Ds from bootstrapped alignments.
204 =item D_n_var
206 Estimated variance of Dn from bootstrapped alignments.
208 =item z_score
210 calculation of z value. Positive value indicates D_n E<gt>D_s,
211 negative values vice versa.
213 =back
215 The design of the code is based around the explanation of the
216 Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
217 Phylogenetics" by Nei and Kumar, published by Oxford University
218 Press. The methods have been tested using the worked example 4.1 in
219 the book, and reproduce those results. If people like having this sort
220 of analysis in BioPerl other methods for estimating Ds and Dn can be
221 provided later.
224 Much of the DNA distance code is based on implementations in EMBOSS
225 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
226 al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
227 Mitchison.
229 =head1 REFERENCES
231 =over 3
233 =item D_JukesCantor
235 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
236 Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
237 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
238 1969, pp. 21-132.
240 =item D_Tamura
242 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
244 =item D_Kimura
246 M Kimura, J. Mol. Evol., 1980, 16, 111.
248 =item JinNei
250 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
252 =item D_TajimaNei
254 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
256 =back
258 =head1 FEEDBACK
260 =head2 Mailing Lists
262 User feedback is an integral part of the evolution of this and other
263 Bioperl modules. Send your comments and suggestions preferably to
264 the Bioperl mailing list. Your participation is much appreciated.
266 bioperl-l@bioperl.org - General discussion
267 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
269 =head2 Reporting Bugs
271 Report bugs to the Bioperl bug tracking system to help us keep track
272 of the bugs and their resolution. Bug reports can be submitted via the
273 web:
275 http://bugzilla.open-bio.org/
277 =head1 AUTHOR - Jason Stajich
279 Email jason-AT-bioperl.org
281 =head1 CONTRIBUTORS
283 Richard Adams, richard.adams@ed.ac.uk
285 =head1 APPENDIX
287 The rest of the documentation details each of the object methods.
288 Internal methods are usually preceded with a _
290 =cut
293 # Let the code begin...
296 package Bio::Align::DNAStatistics;
297 use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
298 $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
299 $CODONS %synchanges $synsites $Precision $GCChhars);
300 use strict;
301 use Bio::Align::PairwiseStatistics;
302 use Bio::Matrix::PhylipDist;
303 use Bio::Tools::IUPAC;
305 BEGIN {
306 $GapChars = '[\.\-]';
307 $GCChhars = '[GCS]';
308 @Nucleotides = qw(A G T C);
309 $SeqCount = 2;
310 $Precision = 5;
312 # these values come from EMBOSS distmat implementation
313 %NucleotideIndexes = ( 'A' => 0,
314 'T' => 1,
315 'C' => 2,
316 'G' => 3,
318 'AT' => 0,
319 'AC' => 1,
320 'AG' => 2,
321 'CT' => 3,
322 'GT' => 4,
323 'CG' => 5,
325 # these are wrong now
326 # 'S' => [ 1, 3],
327 # 'W' => [ 0, 4],
328 # 'Y' => [ 2, 3],
329 # 'R' => [ 0, 1],
330 # 'M' => [ 0, 3],
331 # 'K' => [ 1, 2],
332 # 'B' => [ 1, 2, 3],
333 # 'H' => [ 0, 2, 3],
334 # 'V' => [ 0, 1, 3],
335 # 'D' => [ 0, 1, 2],
338 $DefaultGapPenalty = 0;
339 # could put ambiguities here?
340 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
341 'T' => [ 'A', 'G'],
342 'C' => [ 'A', 'G'],
343 'G' => [ 'C', 'T'],
345 'Transitions' => { 'A' => [ 'G' ],
346 'G' => [ 'A' ],
347 'C' => [ 'T' ],
348 'T' => [ 'C' ],
351 %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
352 'jcuncor|uncorrected' => 'Uncorrected',
353 'f81|felsenstein81' => 'F81',
354 'k2|k2p|k80|kimura' => 'Kimura',
355 't92|tamura|tamura92' => 'Tamura',
356 'f84|felsenstein84' => 'F84',
357 'tajimanei|tajima\-nei' => 'TajimaNei',
358 'jinnei|jin\-nei' => 'JinNei');
361 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
363 ## generate look up hashes for Nei_Gojobori methods##
364 $CODONS = get_codons();
365 my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
366 #create look up hash of number of possible synonymous mutations per codon
367 $synsites = get_syn_sites();
368 #create reference look up hash of single basechanges in codons
369 %synchanges = get_syn_changes();
373 =head2 new
375 Title : new
376 Usage : my $obj = Bio::Align::DNAStatistics->new();
377 Function: Builds a new Bio::Align::DNAStatistics object
378 Returns : Bio::Align::DNAStatistics
379 Args : none
382 =cut
384 sub new {
385 my ($class,@args) = @_;
386 my $self = $class->SUPER::new(@args);
388 $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
390 return $self;
394 =head2 distance
396 Title : distance
397 Usage : my $distance_mat = $stats->distance(-align => $aln,
398 -method => $method);
399 Function: Calculates a distance matrix for all pairwise distances of
400 sequences in an alignment.
401 Returns : L<Bio::Matrix::PhylipDist> object
402 Args : -align => Bio::Align::AlignI object
403 -method => String specifying specific distance method
404 (implementing class may assume a default)
405 See also: L<Bio::Matrix::PhylipDist>
407 =cut
409 sub distance{
410 my ($self,@args) = @_;
411 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
412 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
413 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
415 $method ||= 'JukesCantor';
416 foreach my $m ( keys %DistanceMethods ) {
417 if(defined $m && $method =~ /$m/i ) {
418 my $mtd = "D_$DistanceMethods{$m}";
419 return $self->$mtd($aln);
422 $self->warn("Unrecognized distance method $method must be one of [".
423 join(',',$self->available_distance_methods())."]");
424 return;
427 =head2 available_distance_methods
429 Title : available_distance_methods
430 Usage : my @methods = $stats->available_distance_methods();
431 Function: Enumerates the possible distance methods
432 Returns : Array of strings
433 Args : none
436 =cut
438 sub available_distance_methods{
439 my ($self,@args) = @_;
440 return values %DistanceMethods;
443 =head2 D - distance methods
446 =cut
449 =head2 D_JukesCantor
451 Title : D_JukesCantor
452 Usage : my $d = $stat->D_JukesCantor($aln)
453 Function: Calculates D (pairwise distance) between 2 sequences in an
454 alignment using the Jukes-Cantor 1 parameter model.
455 Returns : L<Bio::Matrix::PhylipDist>
456 Args : L<Bio::Align::AlignI> of DNA sequences
457 double - gap penalty
460 =cut
462 sub D_JukesCantor{
463 my ($self,$aln,$gappenalty) = @_;
464 return 0 unless $self->_check_arg($aln);
465 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
466 # ambiguities ignored at this point
467 my (@seqs,@names,@values,%dist);
468 my $seqct = 0;
469 foreach my $seq ( $aln->each_seq) {
470 push @names, $seq->display_id;
471 push @seqs, uc $seq->seq();
472 $seqct++;
474 my $precisionstr = "%.$Precision"."f";
475 for(my $i = 0; $i < $seqct-1; $i++ ) {
476 # (diagonals) distance is 0 for same sequence
477 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
478 $values[$i][$i] = sprintf($precisionstr,0);
480 for( my $j = $i+1; $j < $seqct; $j++ ) {
481 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
482 $seqs[$j]);
483 # just want diagonals
484 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
485 $matrix->[2]->[2] + $matrix->[3]->[3] );
486 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
487 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
488 # fwd and rev lookup
489 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
490 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
491 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
492 # (diagonals) distance is 0 for same sequence
493 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
494 $values[$j][$j] = sprintf($precisionstr,0);
497 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
498 -matrix => \%dist,
499 -names => \@names,
500 -values => \@values);
503 =head2 D_F81
505 Title : D_F81
506 Usage : my $d = $stat->D_F81($aln)
507 Function: Calculates D (pairwise distance) between 2 sequences in an
508 alignment using the Felsenstein 1981 distance model.
509 Relaxes the assumption of equal base frequencies that is
510 in JC.
511 Returns : L<Bio::Matrix::PhylipDist>
512 Args : L<Bio::Align::AlignI> of DNA sequences
515 =cut
517 sub D_F81{
518 my ($self,$aln,$gappenalty) = @_;
519 return 0 unless $self->_check_arg($aln);
520 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
521 # ambiguities ignored at this point
522 my (@seqs,@names,@values,%dist);
523 my $seqct = 0;
524 foreach my $seq ( $aln->each_seq) {
525 push @names, $seq->display_id;;
526 push @seqs, uc $seq->seq();
527 $seqct++;
529 my $precisionstr = "%.$Precision"."f";
530 for(my $i = 0; $i < $seqct-1; $i++ ) {
531 # (diagonals) distance is 0 for same sequence
532 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
533 $values[$i][$i] = sprintf($precisionstr,0);
535 for( my $j = $i+1; $j < $seqct; $j++ ) {
537 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
538 $seqs[$j]);
539 # just want diagonals
540 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
541 $matrix->[2]->[2] + $matrix->[3]->[3] );
542 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
543 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
544 # fwd and rev lookup
545 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
546 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
547 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
548 # (diagonals) distance is 0 for same sequence
549 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
550 $values[$j][$j] = sprintf($precisionstr,0);
553 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
554 -matrix => \%dist,
555 -names => \@names,
556 -values => \@values);
559 =head2 D_Uncorrected
561 Title : D_Uncorrected
562 Usage : my $d = $stats->D_Uncorrected($aln)
563 Function: Calculate a distance D, no correction for multiple substitutions
564 is used.
565 Returns : L<Bio::Matrix::PhylipDist>
566 Args : L<Bio::Align::AlignI> (DNA Alignment)
567 [optional] gap penalty
569 =cut
571 sub D_Uncorrected {
572 my ($self,$aln,$gappenalty) = @_;
573 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
574 return 0 unless $self->_check_arg($aln);
575 # ambiguities ignored at this point
576 my (@seqs,@names,@values,%dist);
577 my $seqct = 0;
578 foreach my $seq ( $aln->each_seq) {
579 push @names, $seq->display_id;
580 push @seqs, uc $seq->seq();
581 $seqct++;
584 my $precisionstr = "%.$Precision"."f";
585 my $len = $aln->length;
586 for( my $i = 0; $i < $seqct-1; $i++ ) {
587 # (diagonals) distance is 0 for same sequence
588 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
589 $values[$i][$i] = sprintf($precisionstr,0);
591 for( my $j = $i+1; $j < $seqct; $j++ ) {
592 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
593 $seqs[$j]);
594 my $m = ( $matrix->[0]->[0] +
595 $matrix->[1]->[1] +
596 $matrix->[2]->[2] +
597 $matrix->[3]->[3] );
598 my $D = 1 - ( $m / ( $len - $gaps + ( $gaps * $gappenalty)));
599 # fwd and rev lookup
600 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
601 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
602 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
603 # (diagonals) distance is 0 for same sequence
604 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
605 $values[$j][$j] = sprintf($precisionstr,0);
608 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
609 -matrix => \%dist,
610 -names => \@names,
611 -values => \@values);
615 # M Kimura, J. Mol. Evol., 1980, 16, 111.
617 =head2 D_Kimura
619 Title : D_Kimura
620 Usage : my $d = $stat->D_Kimura($aln)
621 Function: Calculates D (pairwise distance) between all pairs of sequences
622 in an alignment using the Kimura 2 parameter model.
623 Returns : L<Bio::Matrix::PhylipDist>
624 Args : L<Bio::Align::AlignI> of DNA sequences
627 =cut
629 sub D_Kimura {
630 my ($self,$aln) = @_;
631 return 0 unless $self->_check_arg($aln);
632 # ambiguities ignored at this point
633 my (@names,@values,%dist);
634 my $seqct = 0;
635 foreach my $seq ( $aln->each_seq) {
636 push @names, $seq->display_id;
637 $seqct++;
640 my $precisionstr = "%.$Precision"."f";
642 for( my $i = 0; $i < $seqct-1; $i++ ) {
643 # (diagonals) distance is 0 for same sequence
644 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
645 $values[$i][$i] = sprintf($precisionstr,0);
647 for( my $j = $i+1; $j < $seqct; $j++ ) {
648 my $pairwise = $aln->select_noncont($i+1,$j+1);
649 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
650 unless( $L ) {
651 $L = 1;
653 my $P = $self->transitions($pairwise) / $L;
654 my $Q = $self->transversions($pairwise) / $L;
655 my $K = 0;
656 my $a = 1 / ( 1 - (2 * $P) - $Q);
657 my $b = 1 / ( 1 - 2 * $Q );
658 if( $a < 0 || $b < 0 ) {
659 $K = -1;
660 } else{
661 $K = (1/2) * log ( $a ) + (1/4) * log($b);
663 # fwd and rev lookup
664 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
665 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
666 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
667 # (diagonals) distance is 0 for same sequence
668 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
669 $values[$j][$j] = sprintf($precisionstr,0);
672 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
673 -matrix => \%dist,
674 -names => \@names,
675 -values => \@values);
679 =head2 D_Kimura_variance
681 Title : D_Kimura
682 Usage : my $d = $stat->D_Kimura_variance($aln)
683 Function: Calculates D (pairwise distance) between all pairs of sequences
684 in an alignment using the Kimura 2 parameter model.
685 Returns : array of 2 L<Bio::Matrix::PhylipDist>,
686 the first is the Kimura distance and the second is
687 a matrix of variance V(K)
688 Args : L<Bio::Align::AlignI> of DNA sequences
691 =cut
693 sub D_Kimura_variance {
694 my ($self,$aln) = @_;
695 return 0 unless $self->_check_arg($aln);
696 # ambiguities ignored at this point
697 my (@names,@values,%dist,@var);
698 my $seqct = 0;
699 foreach my $seq ( $aln->each_seq) {
700 push @names, $seq->display_id;
701 $seqct++;
704 my $precisionstr = "%.$Precision"."f";
706 for( my $i = 0; $i < $seqct-1; $i++ ) {
707 # (diagonals) distance is 0 for same sequence
708 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
709 $values[$i][$i] = sprintf($precisionstr,0);
711 for( my $j = $i+1; $j < $seqct; $j++ ) {
712 my $pairwise = $aln->select_noncont($i+1,$j+1);
713 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
714 unless( $L ) {
715 $L = 1;
717 my $P = $self->transitions($pairwise) / $L;
718 my $Q = $self->transversions($pairwise) / $L;
719 my ($a,$b,$K,$var_k);
720 my $a_denom = ( 1 - (2 * $P) - $Q);
721 my $b_denom = 1 - 2 * $Q;
722 unless( $a_denom > 0 && $b_denom > 0 ) {
723 $a = 1;
724 $b = 1;
725 $K = -1;
726 $var_k = -1;
727 } else {
728 $a = 1 / $a_denom;
729 $b = 1 / $b_denom;
730 $K = (1/2) * log ( $a ) + (1/4) * log($b);
731 # from Wu and Li 1985 which in turn is from Kimura 1980
732 my $c = ( $a - $b ) / 2;
733 my $d = ( $a + $b ) / 2;
734 $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
737 # fwd and rev lookup
738 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
739 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
740 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
741 # (diagonals) distance is 0 for same sequence
742 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
743 $values[$j]->[$j] = sprintf($precisionstr,0);
745 $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
746 $var[$j]->[$j] = $values[$j]->[$j];
749 return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
750 -matrix => \%dist,
751 -names => \@names,
752 -values => \@values),
753 Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
754 -matrix => \%dist,
755 -names => \@names,
756 -values => \@var)
761 # K Tamura, Mol. Biol. Evol. 1992, 9, 678.
763 =head2 D_Tamura
765 Title : D_Tamura
766 Usage : Calculates D (pairwise distance) between 2 sequences in an
767 alignment using Tamura 1992 distance model.
768 Returns : L<Bio::Matrix::PhylipDist>
769 Args : L<Bio::Align::AlignI> of DNA sequences
772 =cut
774 sub D_Tamura {
775 my ($self,$aln) = @_;
776 return 0 unless $self->_check_arg($aln);
777 # ambiguities ignored at this point
778 my (@seqs,@names,@values,%dist,$i,$j);
779 my $seqct = 0;
780 my $length = $aln->length;
781 foreach my $seq ( $aln->each_seq) {
782 push @names, $seq->display_id;;
783 push @seqs, uc $seq->seq();
784 $seqct++;
787 my $precisionstr = "%.$Precision"."f";
788 my (@gap,@gc,@trans,@tranv,@score);
789 $i = 0;
790 for my $t1 ( @seqs ) {
791 $j = 0;
792 for my $t2 ( @seqs ) {
793 $gap[$i][$j] = 0;
794 for( my $k = 0; $k < $length; $k++ ) {
795 my ($c1,$c2) = ( substr($seqs[$i],$k,1),
796 substr($seqs[$j],$k,1) );
797 if( $c1 =~ /^$GapChars$/ ||
798 $c2 =~ /^$GapChars$/ ) {
799 $gap[$i][$j]++;
800 } elsif( $c2 =~ /^$GCChhars$/i ) {
801 $gc[$i][$j]++;
804 $gc[$i][$j] = ( $gc[$i][$j] /
805 ($length - $gap[$i][$j]) );
806 $j++;
808 $i++;
811 for( $i = 0; $i < $seqct-1; $i++ ) {
812 # (diagonals) distance is 0 for same sequence
813 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
814 $values[$i][$i] = sprintf($precisionstr,0);
816 for( $j = $i+1; $j < $seqct; $j++ ) {
818 my $pairwise = $aln->select_noncont($i+1,$j+1);
819 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
820 my $P = $self->transitions($pairwise) / $L;
821 my $Q = $self->transversions($pairwise) / $L;
822 my $C = $gc[$i][$j] + $gc[$j][$i]-
823 ( 2 * $gc[$i][$j] * $gc[$j][$i] );
824 if( $P ) {
825 $P = $P / $C;
827 my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
828 # fwd and rev lookup
829 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
830 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
831 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
832 # (diagonals) distance is 0 for same sequence
833 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
834 $values[$j][$j] = sprintf($precisionstr,0);
837 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
838 -matrix => \%dist,
839 -names => \@names,
840 -values => \@values);
844 =head2 D_F84
846 Title : D_F84
847 Usage : my $d = $stat->D_F84($aln)
848 Function: Calculates D (pairwise distance) between 2 sequences in an
849 alignment using the Felsenstein 1984 distance model.
850 Returns : L<Bio::Matrix::PhylipDist>
851 Args : L<Bio::Align::AlignI> of DNA sequences
852 [optional] double - gap penalty
854 =cut
856 sub D_F84 {
857 my ($self,$aln,$gappenalty) = @_;
858 return 0 unless $self->_check_arg($aln);
859 $self->throw_not_implemented();
860 # ambiguities ignored at this point
861 my (@seqs,@names,@values,%dist);
862 my $seqct = 0;
863 foreach my $seq ( $aln->each_seq) {
864 # if there is no name,
865 my $id = $seq->display_id;
866 if( ! length($id) || # deal with empty names
867 $id =~ /^\s+$/ ) {
868 $id = $seqct+1;
870 push @names, $id;
871 push @seqs, uc $seq->seq();
872 $seqct++;
875 my $precisionstr = "%.$Precision"."f";
877 for( my $i = 0; $i < $seqct-1; $i++ ) {
878 # (diagonals) distance is 0 for same sequence
879 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
880 $values[$i][$i] = sprintf($precisionstr,0);
882 for( my $j = $i+1; $j < $seqct; $j++ ) {
887 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
888 # Tajima-Nei correction used for multiple substitutions in the calc
889 # of the distance matrix. Nucleic acids only.
891 # D = p-distance = 1 - (matches/(posns_scored + gaps)
893 # distance = -b * ln(1-D/b)
896 =head2 D_TajimaNei
898 Title : D_TajimaNei
899 Usage : my $d = $stat->D_TajimaNei($aln)
900 Function: Calculates D (pairwise distance) between 2 sequences in an
901 alignment using the TajimaNei 1984 distance model.
902 Returns : L<Bio::Matrix::PhylipDist>
903 Args : Bio::Align::AlignI of DNA sequences
906 =cut
908 sub D_TajimaNei{
909 my ($self,$aln) = @_;
910 return 0 unless $self->_check_arg($aln);
911 # ambiguities ignored at this point
912 my (@seqs,@names,@values,%dist);
913 my $seqct = 0;
914 foreach my $seq ( $aln->each_seq) {
915 # if there is no name,
916 push @names, $seq->display_id;
917 push @seqs, uc $seq->seq();
918 $seqct++;
920 my $precisionstr = "%.$Precision"."f";
921 my ($i,$j,$bs);
922 # pairwise
923 for( $i =0; $i < $seqct -1; $i++ ) {
924 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
925 $values[$i][$i] = sprintf($precisionstr,0);
927 for ( $j = $i+1; $j <$seqct;$j++ ) {
928 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
929 $seqs[$j]);
930 my $pairwise = $aln->select_noncont($i+1,$j+1);
931 my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
932 my $fij2 = 0;
933 for( $bs = 0; $bs < 4; $bs++ ) {
934 my $fi = 0;
935 map {$fi += $matrix->[$bs]->[$_] } 0..3;
936 my $fj = 0;
937 # summation
938 map { $fj += $matrix->[$_]->[$bs] } 0..3;
939 my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
940 $fij2 += $fij**2;
943 my ($pair,$h) = (0,0);
944 for( $bs = 0; $bs < 3; $bs++ ) {
945 for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
946 my $fij = $pfreq->[$pair++] / $slen;
947 if( $fij ) {
949 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
951 map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
952 map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
953 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
954 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
956 if( $fij ) {
957 $h += ( ($fij**2) / 2 ) /
958 ( ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
959 ( ( $ci2 + $cj2 ) / (2 * $slen) )
962 $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
966 # just want diagonals which are matches (A matched A, C -> C)
968 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
969 $matrix->[2]->[2] + $matrix->[3]->[3] );
970 my $D = 1 - ( $m / $slen);
971 my $d;
972 if( $h == 0 ) {
973 $d = -1;
974 } else {
975 my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
976 my $c = 1- $D/ $b;
978 if( $c < 0 ) {
979 $d = -1;
980 } else {
981 $d = (-1 * $b) * log ( $c);
984 # fwd and rev lookup
985 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
986 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
987 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
989 # (diagonals) distance is 0 for same sequence
990 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
991 $values[$j][$j] = sprintf($precisionstr,0);
994 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
995 -matrix => \%dist,
996 -names => \@names,
997 -values => \@values);
1001 # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1003 =head2 D_JinNei
1005 Title : D_JinNei
1006 Usage : my $d = $stat->D_JinNei($aln)
1007 Function: Calculates D (pairwise distance) between 2 sequences in an
1008 alignment using the Jin-Nei 1990 distance model.
1009 Returns : L<Bio::Matrix::PhylipDist>
1010 Args : L<Bio::Align::AlignI> of DNA sequences
1013 =cut
1015 sub D_JinNei{
1016 my ($self,@args) = @_;
1017 $self->warn("JinNei implementation not completed");
1018 return;
1021 =head2 transversions
1023 Title : transversions
1024 Usage : my $transversions = $stats->transversion($aln);
1025 Function: Calculates the number of transversions between two sequences in
1026 an alignment
1027 Returns : integer
1028 Args : Bio::Align::AlignI
1031 =cut
1033 sub transversions{
1034 my ($self,$aln) = @_;
1035 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1038 =head2 transitions
1040 Title : transitions
1041 Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1042 Function: Calculates the number of transitions in a given DNA alignment
1043 Returns : integer representing the number of transitions
1044 Args : Bio::Align::AlignI object
1047 =cut
1049 sub transitions{
1050 my ($self,$aln) = @_;
1051 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1055 sub _trans_count_helper {
1056 my ($self,$aln,$type) = @_;
1057 return 0 unless( $self->_check_arg($aln) );
1058 if( ! $aln->is_flush ) { $self->throw("must be flush") }
1059 my (@tcount);
1060 my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1061 uc $aln->get_seq_by_pos(2)->seq() );
1062 my $alen = $aln->length;
1063 for (my $i = 0;$i<$alen; $i++ ) {
1064 my ($c1,$c2) = ( substr($first,$i,1),
1065 substr($second,$i,1) );
1066 if( $c1 ne $c2 ) {
1067 foreach my $nt ( @{$type->{$c1}} ) {
1068 if( $nt eq $c2) {
1069 $tcount[$i]++;
1074 my $sum = 0;
1075 map { if( $_) { $sum += $_} } @tcount;
1076 return $sum;
1079 # this will generate a matrix which records across the row, the number
1080 # of DNA subst
1082 sub _build_nt_matrix {
1083 my ($self,$seqa,$seqb) = @_;
1086 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
1087 [ qw(0 0 0 0) ],
1088 [ qw(0 0 0 0) ],
1089 [ qw(0 0 0 0) ] ];
1090 my $gaps = 0; # number of gaps
1091 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
1092 my $len_a = length($seqa);
1093 for( my $i = 0; $i < $len_a; $i++) {
1094 my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1095 $ti =~ tr/U/T/;
1096 $tj =~ tr/U/T/;
1098 if( $ti =~ /^$GapChars$/) { $gaps++; next; }
1099 if( $tj =~ /^$GapChars$/) { $gaps++; next }
1101 my $ti_index = $NucleotideIndexes{$ti};
1102 my $tj_index = $NucleotideIndexes{$tj};
1104 if( ! defined $ti_index ) {
1105 $self->warn("ti_index not defined for $ti\n");
1106 next;
1109 $basect_matrix->[$ti_index]->[$tj_index]++;
1111 if( $ti ne $tj ) {
1112 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1115 return ($basect_matrix,$pfreq,$gaps);
1118 sub _check_ambiguity_nucleotide {
1119 my ($base1,$base2) = @_;
1120 my %iub = Bio::Tools::IUPAC->iupac_iub();
1121 my @amb1 = @{ $iub{uc($base1)} };
1122 my @amb2 = @{ $iub{uc($base2)} };
1123 my ($pmatch) = (0);
1124 for my $amb ( @amb1 ) {
1125 if( grep { $amb eq $_ } @amb2 ) {
1126 $pmatch = 1;
1127 last;
1130 if( $pmatch ) {
1131 return (1 / scalar @amb1) * (1 / scalar @amb2);
1132 } else {
1133 return 0;
1138 sub _check_arg {
1139 my($self,$aln ) = @_;
1140 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
1141 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1142 return 0;
1143 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1144 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1145 return 0;
1147 return 1;
1150 =head2 Data Methods
1152 =cut
1154 =head2 pairwise_stats
1156 Title : pairwise_stats
1157 Usage : $obj->pairwise_stats($newval)
1158 Function:
1159 Returns : value of pairwise_stats
1160 Args : newvalue (optional)
1163 =cut
1165 sub pairwise_stats{
1166 my ($self,$value) = @_;
1167 if( defined $value) {
1168 $self->{'_pairwise_stats'} = $value;
1170 return $self->{'_pairwise_stats'};
1174 =head2 calc_KaKs_pair
1176 Title : calc_KaKs_pair
1177 Useage : my $results = $stats->calc_KaKs_pair($alnobj,
1178 $name1, $name2).
1179 Function : calculates Nei-Gojobori statistics for pairwise
1180 comparison.
1181 Args : A Bio::Align::AlignI compliant object such as a
1182 Bio::SimpleAlign object, and 2 sequence name strings.
1183 Returns : a reference to a hash of statistics with keys as
1184 listed in Description.
1186 =cut
1188 sub calc_KaKs_pair {
1189 my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1190 $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1191 if @_!= 4;
1192 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1193 my @seqs = (
1194 #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1195 #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1196 {id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1197 {id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1199 if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1200 $self->throw(" aligned sequences must be of equal length!");
1202 my $results = [];
1203 $self->_get_av_ds_dn(\@seqs, $results);
1204 return $results;
1208 =head2 calc_all_KaKs_pairs
1210 Title : calc_all_KaKs_pairs
1211 Useage : my $results2 = $stats->calc_KaKs_pair($alnobj).
1212 Function : Calculates Nei_gojobori statistics for all pairwise
1213 combinations in sequence.
1214 Arguments: A Bio::Align::ALignI compliant object such as
1215 a Bio::SimpleAlign object.
1216 Returns : A reference to an array of hashes of statistics of
1217 all pairwise comparisons in the alignment.
1219 =cut
1223 sub calc_all_KaKs_pairs {
1224 #returns a multi_element_array with all pairwise comparisons
1225 my ($self,$aln) = @_;
1226 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1227 my @seqs;
1228 for my $seq ($aln->each_seq) {
1229 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1231 my $results ;
1232 $results = $self->_get_av_ds_dn(\@seqs, $results);
1233 return $results;
1236 =head2 calc_average_KaKs
1238 Title : calc_average_KaKs.
1239 Useage : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1240 Function : calculates Nei_Gojobori stats for average of all
1241 sequences in the alignment.
1242 Args : A Bio::Align::AlignI compliant object such as a
1243 Bio::SimpleAlign object, number of bootstrap iterations
1244 (default 1000).
1245 Returns : A reference to a hash of statistics as listed in Description.
1247 =cut
1249 sub calc_average_KaKs {
1250 #calculates global value for sequences in alignment using bootstrapping
1251 #this is quite slow (~10 seconds per 3 X 200nt seqs);
1252 my ($self, $aln, $bootstrap_rpt) = @_;
1253 $bootstrap_rpt ||= 1000;
1254 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1255 my @seqs;
1256 for my $seq ($aln->each_seq) {
1257 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1259 my $results ;
1260 my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1261 #print "ds = $ds_orig, dn = $dn_orig\n";
1262 $results = {D_s => $ds_orig, D_n => $dn_orig};
1263 $self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1264 return $results;
1267 ############## primary internal subs for alignment comparisons ########################
1269 sub _run_bootstrap {
1270 ### generates sampled sequences, calculates Ds and Dn values,
1271 ### then calculates variance of sampled sequences and add results to results hash
1272 ###
1273 my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1274 my @seqs = @$seq_ref;
1275 my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1276 my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1278 #1st make alternative array of codons;
1279 my $c = 0;
1280 while ($c < length $seqs[0]{'seq'}) {
1281 for (0..$#seqs) {
1282 push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
1284 $c+=3;
1287 for (1..$bootstrap_rpt) {
1288 my $sampled = _resample (\@btstrp_aoa);
1289 my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1290 push @{$bootstrap_values{'ds'}}, $ds;
1291 push @{$bootstrap_values{'dn'}}, $dn;
1294 $results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1295 $results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1296 $results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
1297 sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1298 #print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
1299 #print "bootstrapped var_nc = $results->{'D_n_var'} \n";
1300 #print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
1303 sub _resample {
1304 my $ref = shift;
1305 my $codon_num = scalar (@{$ref->[0]});
1306 my @altered;
1307 for (0..$codon_num -1) { #for each codon
1308 my $rand = int (rand ($codon_num));
1309 for (0..$#$ref) {
1310 push @{$altered[$_]}, $ref->[$_][$rand];
1313 my @stringed = map {join '', @$_}@altered;
1314 my @return;
1315 #now out in random name to keep other subs happy
1316 for (@stringed) {
1317 push @return, {id=>'1', seq=> $_};
1319 return \@return;
1322 sub _get_av_ds_dn {
1323 # takes array of hashes of sequence strings and ids #
1324 my $self = shift;
1325 my $seq_ref = shift;
1326 my $result = shift if @_;
1327 my @caller = caller(1);
1328 my @seqarray = @$seq_ref;
1329 my $bootstrap_score_list;
1330 #for a multiple alignment considers all pairwise combinations#
1331 my %dsfor_average = (ds => [], dn => []);
1332 for (my $i = 0; $i < scalar @seqarray; $i++) {
1333 for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
1334 # print "comparing $i and $j\n";
1335 if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1336 $self->warn(" aligned sequences must be of equal length!");
1337 next;
1340 my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1341 my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1342 # print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1343 my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1344 #get averages
1345 my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1346 my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1348 #calculate ps and pn (p54)
1349 my $syn_prop = $syn_count / $av_s_site;
1350 my $nc_prop = $non_syn_count / $av_ns_syn_site ;
1352 #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1353 my $d_syn = $self->jk($syn_prop);
1354 my $d_nc = $self->jk($nc_prop);
1356 #JK calculation must succeed for continuation of calculation
1357 #ret_value = -1 if error
1358 next unless $d_nc >=0 && $d_syn >=0;
1361 push @{$dsfor_average{'ds'}}, $d_syn;
1362 push @{$dsfor_average{'dn'}}, $d_nc;
1364 #if not doing bootstrap, calculate the pairwise comparisin stats
1365 if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1366 #now calculate variances assuming large sample
1367 my $d_syn_var = jk_var($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
1368 my $d_nc_var = jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1369 #now calculate z_value
1370 #print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
1371 #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1372 my $z = ($d_syn_var + $d_nc_var) ?
1373 ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1374 # print "z is $z\n";
1375 push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1376 S_d => $syn_count, N_d =>$non_syn_count,
1377 P_s => $syn_prop, P_n=>$nc_prop,
1378 D_s => @{$dsfor_average{'ds'}}[-1],
1379 D_n => @{$dsfor_average{'dn'}}[-1],
1380 D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1381 Seq1 => $seqarray[$i]{'id'},
1382 Seq2 => $seqarray[$j]{'id'},
1383 z_score => $z,
1385 $self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
1386 if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1387 }#endif
1391 #warn of failure if no results hashes are present
1392 #will fail if Jukes Cantor has failed for all pairwise combinations
1393 #$self->warn("calculation failed!") if scalar @$result ==0;
1395 #return results unless bootstrapping
1396 return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1397 #else if getting average for bootstrap
1398 return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1402 sub jk {
1403 my ($self, $p) = @_;
1404 if ($p > 0.75) {
1405 $self->warn( " Jukes Cantor won't work -too divergent!");
1406 return -1;
1408 return -1 * (3/4) * (log(1 - (4/3) * $p));
1411 #works for large value of n (50?100?)
1412 sub jk_var {
1413 my ($p, $n) = @_;
1414 return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1418 # compares 2 sequences to find the number of synonymous/non
1419 # synonymous mutations between them
1421 sub analyse_mutations {
1422 my ($seq1, $seq2) = @_;
1423 my %mutator = ( 2=> {0=>[[1,2], # codon positions to be altered
1424 [2,1]], # depend on which is the same
1425 1=>[[0,2],
1426 [2,0]],
1427 2=>[[0,1],
1428 [1,0]],
1430 3=> [ [0,1,2], # all need to be altered
1431 [1,0,2],
1432 [0,2,1],
1433 [1,2,0],
1434 [2,0,1],
1435 [2,1,0] ],
1437 my $TOTAL = 0; # total synonymous changes
1438 my $TOTAL_n = 0; # total non-synonymous changes
1439 my $gap_cnt = 0;
1441 my %input;
1442 my $seqlen = length($seq1);
1443 for (my $j=0; $j< $seqlen; $j+=3) {
1444 $input{'cod1'} = substr($seq1, $j,3);
1445 $input{'cod2'} = substr($seq2, $j,3);
1447 #ignore codon if beeing compared with gaps!
1448 if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1449 $gap_cnt += 3; #just increments once if there is a pair of gaps
1450 next;
1453 my ($diff_cnt, $same) = count_diffs(\%input);
1455 #ignore if codons are identical
1456 next if $diff_cnt == 0 ;
1457 if ($diff_cnt == 1) {
1458 $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1459 $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1460 #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1462 elsif ($diff_cnt ==2) {
1463 my $s_cnt = 0;
1464 my $n_cnt = 0;
1465 my $tot_muts = 4;
1466 #will stay 4 unless there are stop codons at intervening point
1467 OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
1468 my $altered = $input{'cod1'};
1469 my $prev= $altered;
1470 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1471 for my $mut_i (@$perm) { #index of codon mutated
1472 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1473 if ($t[$CODONS->{$altered}] eq '*') {
1474 $tot_muts -=2;
1475 #print "changes to stop codon!!\n";
1476 next OUTER;
1478 else {
1479 $s_cnt += $synchanges{$prev}{$altered};
1480 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1482 $prev = $altered;
1484 # print "\n";
1486 if ($tot_muts != 0) {
1487 $TOTAL += ($s_cnt/($tot_muts/2));
1488 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1492 elsif ($diff_cnt ==3 ) {
1493 my $s_cnt = 0;
1494 my $n_cnt = 0;
1495 my $tot_muts = 18; #potential number of mutations
1496 OUTER: for my $perm (@{$mutator{'3'}}) {
1497 my $altered = $input{'cod1'};
1498 my $prev= $altered;
1499 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1500 for my $mut_i (@$perm) { #index of codon mutated
1501 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1502 if ($t[$CODONS->{$altered}] eq '*') {
1503 $tot_muts -=3;
1504 # print "changes to stop codon!!\n";
1505 next OUTER;
1508 else {
1509 $s_cnt += $synchanges{$prev}{$altered};
1510 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1512 $prev = $altered;
1514 # print "\n";
1516 }#end OUTER loop
1517 #calculate number of synonymous/non synonymous mutations for that codon
1518 # and add to total
1519 if ($tot_muts != 0) {
1520 $TOTAL += ($s_cnt / ($tot_muts /3));
1521 $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1523 } #endif $diffcnt = 3
1524 } #end of sequencetraversal
1525 return ($TOTAL, $TOTAL_n, $gap_cnt);
1529 sub count_diffs {
1530 #counts the number of nucleotide differences between 2 codons
1531 # returns this value plus the codon index of which nucleotide is the same when 2
1532 #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1533 # to change.
1534 my $ref = shift;
1535 my $cnt = 0;
1536 my $same= undef;
1537 #just for 2 differences
1538 for (0..2) {
1539 if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1540 $cnt++;
1541 } else {
1542 $same = $_;
1545 return ($cnt, $same);
1548 =head2 get_syn_changes
1550 Title : get_syn_changes
1551 Usage : Bio::Align::DNAStatitics->get_syn_changes
1552 Function: Generate a hashref of all pairwise combinations of codns
1553 differing by 1
1554 Returns : Symetic matrix using hashes
1555 First key is codon
1556 and each codon points to a hashref of codons
1557 the values of which describe type of change.
1558 my $type = $hash{$codon1}->{$codon2};
1559 values are :
1560 1 synonymous
1561 0 non-syn
1562 -1 either codon is a stop codon
1563 Args : none
1565 =cut
1567 sub get_syn_changes {
1568 #hash of all pairwise combinations of codons differing by 1
1569 # 1 = syn, 0 = non-syn, -1 = stop
1570 my %results;
1571 my @codons = _make_codons ();
1572 my $arr_len = scalar @codons;
1573 for (my $i = 0; $i < $arr_len -1; $i++) {
1574 my $cod1 = $codons[$i];
1575 for (my $j = $i +1; $j < $arr_len; $j++) {
1576 my $diff_cnt = 0;
1577 for my $pos(0..2) {
1578 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1580 next if $diff_cnt !=1;
1582 #synon change
1583 if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
1584 $results{$cod1}{$codons[$j]} =1;
1585 $results{$codons[$j]}{$cod1} = 1;
1587 #stop codon
1588 elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1589 $results{$cod1}{$codons[$j]} = -1;
1590 $results{$codons[$j]}{$cod1} = -1;
1592 # nc change
1593 else {
1594 $results{$cod1}{$codons[$j]} = 0;
1595 $results{$codons[$j]}{$cod1} = 0;
1599 return %results;
1602 =head2 dnds_pattern_number
1604 Title : dnds_pattern_number
1605 Usage : my $patterns = $stats->dnds_pattern_number($alnobj);
1606 Function: Counts the number of codons with no gaps in the MSA
1607 Returns : Number of codons with no gaps ('patterns' in PAML notation)
1608 Args : A Bio::Align::AlignI compliant object such as a
1609 Bio::SimpleAlign object.
1611 =cut
1613 sub dnds_pattern_number{
1614 my ($self, $aln) = @_;
1615 return ($aln->remove_gaps->length)/3;
1618 sub count_syn_sites {
1619 #counts the number of possible synonymous changes for sequence
1620 my ($seq, $synsite) = @_;
1621 __PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1622 my $S = 0;
1623 for (my $i = 0; $i< length($seq); $i+=3) {
1624 my $cod = substr($seq, $i, 3);
1625 next if $cod =~ /\-/; #deal with alignment gaps
1626 $S += $synsite->{$cod}{'s'};
1628 #print "S is $S\n";
1629 return $S;
1634 sub get_syn_sites {
1635 #sub to generate lookup hash for the number of synonymous changes per codon
1636 my @nucs = qw(T C A G);
1637 my %raw_results;
1638 for my $i (@nucs) {
1639 for my $j (@nucs) {
1640 for my $k (@nucs) {
1641 # for each possible codon
1642 my $cod = "$i$j$k";
1643 my $aa = $t[$CODONS->{$cod}];
1644 #calculate number of synonymous mutations vs non syn mutations
1645 for my $i (qw(0 1 2)){
1646 my $s = 0;
1647 my $n = 3;
1648 for my $nuc (qw(A T C G)) {
1649 next if substr ($cod, $i,1) eq $nuc;
1650 my $test = $cod;
1651 substr($test, $i, 1) = $nuc ;
1652 if ($t[$CODONS->{$test}] eq $aa) {
1653 $s++;
1655 if ($t[$CODONS->{$test}] eq '*') {
1656 $n--;
1659 $raw_results{$cod}[$i] = {'s' => $s ,
1660 'n' => $n };
1663 } #end analysis of single codon
1665 } #end analysis of all codons
1666 my %final_results;
1668 for my $cod (sort keys %raw_results) {
1669 my $t = 0;
1670 map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
1671 $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1673 return \%final_results;
1676 sub _make_codons {
1677 #makes all codon combinations, returns array of them
1678 my @nucs = qw(T C A G);
1679 my @codons;
1680 for my $i (@nucs) {
1681 for my $j (@nucs) {
1682 for my $k (@nucs) {
1683 push @codons, "$i$j$k";
1687 return @codons;
1690 sub get_codons {
1691 #generates codon translation look up table#
1692 my $x = 0;
1693 my $CODONS = {};
1694 for my $codon (_make_codons) {
1695 $CODONS->{$codon} = $x;
1696 $x++;
1698 return $CODONS;
1701 #########stats subs, can go in another module? Here for speed. ###
1702 sub mean {
1703 my $ref = shift;
1704 my $el_num = scalar @$ref;
1705 my $tot = 0;
1706 map{$tot += $_}@$ref;
1707 return ($tot/$el_num);
1710 sub variance {
1711 my $ref = shift;
1712 my $mean = mean($ref);
1713 my $sum_of_squares = 0;
1714 map{$sum_of_squares += ($_ - $mean) **2}@$ref;
1715 return $sum_of_squares;
1718 sub sampling_variance {
1719 my $ref = shift;
1720 return variance($ref) / (scalar @$ref -1);