3 # BioPerl module for Bio::Align::DNAStatistics
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-AT-bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
22 use Bio::Align::DNAStatistics;
24 my $stats = Bio::Align::DNAStatistics->new();
25 my $alignin = Bio::AlignIO->new(-format => 'emboss',
26 -file => 't/data/insulin.water');
27 my $aln = $alignin->next_aln;
28 my $jcmatrix = $stats->distance(-align => $aln,
29 -method => 'Jukes-Cantor');
31 print $jcmatrix->print_matrix;
32 ## and for measurements of synonymous /nonsynonymous substitutions ##
34 my $in = Bio::AlignIO->new(-format => 'fasta',
35 -file => 't/data/nei_gojobori_test.aln');
36 my $alnobj = $in->next_aln;
37 my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
38 my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
39 print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
40 for (sort keys %{$results->[0]} ){
42 printf("%-9s %.4f \n",$_ , $results->[0]{$_});
45 my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
46 for my $an (@$results2){
47 print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
48 for (sort keys %$an ){
50 printf("%-9s %.4f \n",$_ , $an->{$_});
55 my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
56 for (sort keys %$result3 ){
58 printf("%-9s %.4f \n",$_ , $result3->{$_});
63 This object contains routines for calculating various statistics and
64 distances for DNA alignments. The routines are not well tested and do
65 contain errors at this point. Work is underway to correct them, but
66 do not expect this code to give you the right answer currently! Use
67 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
71 Several different distance method calculations are supported. Listed
72 in brackets are the pattern which will match
76 =item JukesCantor [jc|jukes|jukescantor|jukes-cantor]
78 =item Uncorrected [jcuncor|uncorrected]
80 =item F81 [f81|felsenstein]
82 =item Kimura [k2|k2p|k80|kimura]
84 =item Tamura [t92|tamura|tamura92]
86 =item F84 [f84|felsenstein84]
88 =item TajimaNei [tajimanei|tajima\-nei]
90 =item JinNei [jinnei|jin\-nei] (not implemented)
94 There are also three methods to calculate the ratio of synonymous to
95 non-synonymous mutations. All are implementations of the Nei-Gojobori
96 evolutionary pathway method and use the Jukes-Cantor method of
97 nucleotide substitution. This method works well so long as the
98 nucleotide frequencies are roughly equal and there is no significant
99 transition/transversion bias. In order to use these methods there are
100 several pre-requisites for the alignment.
106 DNA alignment must be based on protein alignment. Use the subroutine
107 L<aa_to_dna_aln> in Bio::Align::Utilities to achieve this.
111 Therefore alignment gaps must be in multiples of 3 (representing an aa
112 deletion/insertion) and at present must be indicated by a '-' symbol.
116 Alignment must be solely of coding region and be in reading frame 0 to
117 achieve meaningful results
121 Alignment must therefore be a multiple of 3 nucleotides long.
125 All sequences must be the same length (including gaps). This should be
126 the case anyway if the sequences have been automatically aligned using
127 a program like Clustal.
131 Only the standard codon alphabet is supported at present.
135 calc_KaKs_pair() calculates a number of statistics for a named pair of
136 sequences in the alignment.
138 calc_all_KaKs_pairs() calculates these statistics for all pairwise
139 comparisons in an MSA. The statistics returned are:
145 Number of synonymous mutations between the 2 sequences.
149 Number of non-synonymous mutations between the 2 sequences.
153 Mean number of synonymous sites in both sequences.
157 mean number of synonymous sites in both sequences.
161 proportion of synonymous differences in both sequences given by P_s = S_d/S.
165 proportion of non-synonymous differences in both sequences given by P_n = S_n/S.
169 estimation of synonymous mutations per synonymous site (by Jukes-Cantor).
173 estimation of non-synonymous mutations per non-synonymous site (by Jukes-Cantor).
177 estimation of variance of D_n .
181 estimation of variance of S_n.
185 calculation of z value.Positive value indicates D_n E<gt> D_s,
186 negative value indicates D_s E<gt> D_n.
190 The statistics returned by calc_average_KaKs are:
196 Average number of synonymous mutations/synonymous site.
200 Average number of non-synonymous mutations/non-synonymous site.
204 Estimated variance of Ds from bootstrapped alignments.
208 Estimated variance of Dn from bootstrapped alignments.
212 calculation of z value. Positive value indicates D_n E<gt>D_s,
213 negative values vice versa.
217 The design of the code is based around the explanation of the
218 Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
219 Phylogenetics" by Nei and Kumar, published by Oxford University
220 Press. The methods have been tested using the worked example 4.1 in
221 the book, and reproduce those results. If people like having this sort
222 of analysis in BioPerl other methods for estimating Ds and Dn can be
226 Much of the DNA distance code is based on implementations in EMBOSS
227 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
228 al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
237 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
238 Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
239 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
244 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
248 M Kimura, J. Mol. Evol., 1980, 16, 111.
252 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
256 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
264 User feedback is an integral part of the evolution of this and other
265 Bioperl modules. Send your comments and suggestions preferably to
266 the Bioperl mailing list. Your participation is much appreciated.
268 bioperl-l@bioperl.org - General discussion
269 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
273 Please direct usage questions or support issues to the mailing list:
275 L<bioperl-l@bioperl.org>
277 rather than to the module maintainer directly. Many experienced and
278 reponsive experts will be able look at the problem and quickly
279 address it. Please include a thorough description of the problem
280 with code and data examples if at all possible.
282 =head2 Reporting Bugs
284 Report bugs to the Bioperl bug tracking system to help us keep track
285 of the bugs and their resolution. Bug reports can be submitted via the
288 http://bugzilla.open-bio.org/
290 =head1 AUTHOR - Jason Stajich
292 Email jason-AT-bioperl.org
296 Richard Adams, richard.adams@ed.ac.uk
300 The rest of the documentation details each of the object methods.
301 Internal methods are usually preceded with a _
306 # Let the code begin...
309 package Bio
::Align
::DNAStatistics
;
310 use vars
qw(%DNAChanges @Nucleotides %NucleotideIndexes
311 $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
312 $CODONS %synchanges $synsites $Precision $GCChhars);
314 use Bio::Align::PairwiseStatistics;
315 use Bio::Matrix::PhylipDist;
316 use Bio::Tools::IUPAC;
319 $GapChars = '[\.\-]';
321 @Nucleotides = qw(A G T C);
325 # these values come from EMBOSS distmat implementation
326 %NucleotideIndexes = ( 'A' => 0,
338 # these are wrong now
351 $DefaultGapPenalty = 0;
352 # could put ambiguities here?
353 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
358 'Transitions' => { 'A' => [ 'G' ],
364 %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
365 'jcuncor|uncorrected' => 'Uncorrected',
366 'f81|felsenstein81' => 'F81',
367 'k2|k2p|k80|kimura' => 'Kimura',
368 't92|tamura|tamura92' => 'Tamura',
369 'f84|felsenstein84' => 'F84',
370 'tajimanei|tajima\-nei' => 'TajimaNei',
371 'jinnei|jin\-nei' => 'JinNei');
374 use base
qw(Bio::Root::Root Bio::Align::StatisticsI);
376 ## generate look up hashes for Nei_Gojobori methods##
377 $CODONS = get_codons
();
378 my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
379 #create look up hash of number of possible synonymous mutations per codon
380 $synsites = get_syn_sites
();
381 #create reference look up hash of single basechanges in codons
382 %synchanges = get_syn_changes
();
389 Usage : my $obj = Bio::Align::DNAStatistics->new();
390 Function: Builds a new Bio::Align::DNAStatistics object
391 Returns : Bio::Align::DNAStatistics
398 my ($class,@args) = @_;
399 my $self = $class->SUPER::new
(@args);
401 $self->pairwise_stats( Bio
::Align
::PairwiseStatistics
->new());
410 Usage : my $distance_mat = $stats->distance(-align => $aln,
412 Function: Calculates a distance matrix for all pairwise distances of
413 sequences in an alignment.
414 Returns : L<Bio::Matrix::PhylipDist> object
415 Args : -align => Bio::Align::AlignI object
416 -method => String specifying specific distance method
417 (implementing class may assume a default)
418 See also: L<Bio::Matrix::PhylipDist>
423 my ($self,@args) = @_;
424 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
425 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
426 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
428 $method ||= 'JukesCantor';
429 foreach my $m ( keys %DistanceMethods ) {
430 if(defined $m && $method =~ /$m/i ) {
431 my $mtd = "D_$DistanceMethods{$m}";
432 return $self->$mtd($aln);
435 $self->warn("Unrecognized distance method $method must be one of [".
436 join(',',$self->available_distance_methods())."]");
440 =head2 available_distance_methods
442 Title : available_distance_methods
443 Usage : my @methods = $stats->available_distance_methods();
444 Function: Enumerates the possible distance methods
445 Returns : Array of strings
451 sub available_distance_methods
{
452 my ($self,@args) = @_;
453 return values %DistanceMethods;
456 =head2 D - distance methods
464 Title : D_JukesCantor
465 Usage : my $d = $stat->D_JukesCantor($aln)
466 Function: Calculates D (pairwise distance) between 2 sequences in an
467 alignment using the Jukes-Cantor 1 parameter model.
468 Returns : L<Bio::Matrix::PhylipDist>
469 Args : L<Bio::Align::AlignI> of DNA sequences
476 my ($self,$aln,$gappenalty) = @_;
477 return 0 unless $self->_check_arg($aln);
478 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
479 # ambiguities ignored at this point
480 my (@seqs,@names,@values,%dist);
482 foreach my $seq ( $aln->each_seq) {
483 push @names, $seq->display_id;
484 push @seqs, uc $seq->seq();
487 my $precisionstr = "%.$Precision"."f";
488 for(my $i = 0; $i < $seqct-1; $i++ ) {
489 # (diagonals) distance is 0 for same sequence
490 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
491 $values[$i][$i] = sprintf($precisionstr,0);
493 for( my $j = $i+1; $j < $seqct; $j++ ) {
494 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
496 # just want diagonals
497 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
498 $matrix->[2]->[2] + $matrix->[3]->[3] );
499 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
500 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
502 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
503 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
504 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
505 # (diagonals) distance is 0 for same sequence
506 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
507 $values[$j][$j] = sprintf($precisionstr,0);
510 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
513 -values => \
@values);
519 Usage : my $d = $stat->D_F81($aln)
520 Function: Calculates D (pairwise distance) between 2 sequences in an
521 alignment using the Felsenstein 1981 distance model.
522 Relaxes the assumption of equal base frequencies that is
524 Returns : L<Bio::Matrix::PhylipDist>
525 Args : L<Bio::Align::AlignI> of DNA sequences
531 my ($self,$aln,$gappenalty) = @_;
532 return 0 unless $self->_check_arg($aln);
533 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
534 # ambiguities ignored at this point
535 my (@seqs,@names,@values,%dist);
537 foreach my $seq ( $aln->each_seq) {
538 push @names, $seq->display_id;;
539 push @seqs, uc $seq->seq();
542 my $precisionstr = "%.$Precision"."f";
543 for(my $i = 0; $i < $seqct-1; $i++ ) {
544 # (diagonals) distance is 0 for same sequence
545 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
546 $values[$i][$i] = sprintf($precisionstr,0);
548 for( my $j = $i+1; $j < $seqct; $j++ ) {
550 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
552 # just want diagonals
553 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
554 $matrix->[2]->[2] + $matrix->[3]->[3] );
555 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
556 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
558 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
559 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
560 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
561 # (diagonals) distance is 0 for same sequence
562 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
563 $values[$j][$j] = sprintf($precisionstr,0);
566 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
569 -values => \
@values);
574 Title : D_Uncorrected
575 Usage : my $d = $stats->D_Uncorrected($aln)
576 Function: Calculate a distance D, no correction for multiple substitutions
578 Returns : L<Bio::Matrix::PhylipDist>
579 Args : L<Bio::Align::AlignI> (DNA Alignment)
580 [optional] gap penalty
585 my ($self,$aln,$gappenalty) = @_;
586 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
587 return 0 unless $self->_check_arg($aln);
588 # ambiguities ignored at this point
589 my (@seqs,@names,@values,%dist);
591 foreach my $seq ( $aln->each_seq) {
592 push @names, $seq->display_id;
593 push @seqs, uc $seq->seq();
597 my $precisionstr = "%.$Precision"."f";
598 my $len = $aln->length;
599 for( my $i = 0; $i < $seqct-1; $i++ ) {
600 # (diagonals) distance is 0 for same sequence
601 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
602 $values[$i][$i] = sprintf($precisionstr,0);
604 for( my $j = $i+1; $j < $seqct; $j++ ) {
605 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
607 my $m = ( $matrix->[0]->[0] +
611 my $D = 1 - ( $m / ( $len - $gaps + ( $gaps * $gappenalty)));
613 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
614 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
615 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
616 # (diagonals) distance is 0 for same sequence
617 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
618 $values[$j][$j] = sprintf($precisionstr,0);
621 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
624 -values => \
@values);
628 # M Kimura, J. Mol. Evol., 1980, 16, 111.
633 Usage : my $d = $stat->D_Kimura($aln)
634 Function: Calculates D (pairwise distance) between all pairs of sequences
635 in an alignment using the Kimura 2 parameter model.
636 Returns : L<Bio::Matrix::PhylipDist>
637 Args : L<Bio::Align::AlignI> of DNA sequences
643 my ($self,$aln) = @_;
644 return 0 unless $self->_check_arg($aln);
645 # ambiguities ignored at this point
646 my (@names,@values,%dist);
648 foreach my $seq ( $aln->each_seq) {
649 push @names, $seq->display_id;
653 my $precisionstr = "%.$Precision"."f";
655 for( my $i = 0; $i < $seqct-1; $i++ ) {
656 # (diagonals) distance is 0 for same sequence
657 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
658 $values[$i][$i] = sprintf($precisionstr,0);
660 for( my $j = $i+1; $j < $seqct; $j++ ) {
661 my $pairwise = $aln->select_noncont($i+1,$j+1);
662 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
666 my $P = $self->transitions($pairwise) / $L;
667 my $Q = $self->transversions($pairwise) / $L;
669 my $denom = ( 1 - (2 * $P) - $Q);
671 $self->throw("cannot find distance for ",$i+1,
672 ",",$j+1," $P, $Q\n");
674 my $a = 1 / ( 1 - (2 * $P) - $Q);
675 my $b = 1 / ( 1 - 2 * $Q );
676 if( $a < 0 || $b < 0 ) {
679 $K = (1/2) * log ( $a ) + (1/4) * log($b);
682 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
683 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
684 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
685 # (diagonals) distance is 0 for same sequence
686 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
687 $values[$j][$j] = sprintf($precisionstr,0);
690 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
693 -values => \
@values);
697 =head2 D_Kimura_variance
700 Usage : my $d = $stat->D_Kimura_variance($aln)
701 Function: Calculates D (pairwise distance) between all pairs of sequences
702 in an alignment using the Kimura 2 parameter model.
703 Returns : array of 2 L<Bio::Matrix::PhylipDist>,
704 the first is the Kimura distance and the second is
705 a matrix of variance V(K)
706 Args : L<Bio::Align::AlignI> of DNA sequences
711 sub D_Kimura_variance
{
712 my ($self,$aln) = @_;
713 return 0 unless $self->_check_arg($aln);
714 # ambiguities ignored at this point
715 my (@names,@values,%dist,@var);
717 foreach my $seq ( $aln->each_seq) {
718 push @names, $seq->display_id;
722 my $precisionstr = "%.$Precision"."f";
724 for( my $i = 0; $i < $seqct-1; $i++ ) {
725 # (diagonals) distance is 0 for same sequence
726 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
727 $values[$i][$i] = sprintf($precisionstr,0);
729 for( my $j = $i+1; $j < $seqct; $j++ ) {
730 my $pairwise = $aln->select_noncont($i+1,$j+1);
731 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
735 my $P = $self->transitions($pairwise) / $L;
736 my $Q = $self->transversions($pairwise) / $L;
737 my ($a,$b,$K,$var_k);
738 my $a_denom = ( 1 - (2 * $P) - $Q);
739 my $b_denom = 1 - 2 * $Q;
740 unless( $a_denom > 0 && $b_denom > 0 ) {
748 $K = (1/2) * log ( $a ) + (1/4) * log($b);
749 # from Wu and Li 1985 which in turn is from Kimura 1980
750 my $c = ( $a - $b ) / 2;
751 my $d = ( $a + $b ) / 2;
752 $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
756 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
757 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
758 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
759 # (diagonals) distance is 0 for same sequence
760 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
761 $values[$j]->[$j] = sprintf($precisionstr,0);
763 $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
764 $var[$j]->[$j] = $values[$j]->[$j];
767 return ( Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
770 -values => \
@values),
771 Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
779 # K Tamura, Mol. Biol. Evol. 1992, 9, 678.
784 Usage : Calculates D (pairwise distance) between 2 sequences in an
785 alignment using Tamura 1992 distance model.
786 Returns : L<Bio::Matrix::PhylipDist>
787 Args : L<Bio::Align::AlignI> of DNA sequences
793 my ($self,$aln) = @_;
794 return 0 unless $self->_check_arg($aln);
795 # ambiguities ignored at this point
796 my (@seqs,@names,@values,%dist,$i,$j);
798 my $length = $aln->length;
799 foreach my $seq ( $aln->each_seq) {
800 push @names, $seq->display_id;;
801 push @seqs, uc $seq->seq();
805 my $precisionstr = "%.$Precision"."f";
806 my (@gap,@gc,@trans,@tranv,@score);
808 for my $t1 ( @seqs ) {
810 for my $t2 ( @seqs ) {
812 for( my $k = 0; $k < $length; $k++ ) {
813 my ($c1,$c2) = ( substr($seqs[$i],$k,1),
814 substr($seqs[$j],$k,1) );
815 if( $c1 =~ /^$GapChars$/ ||
816 $c2 =~ /^$GapChars$/ ) {
818 } elsif( $c2 =~ /^$GCChhars$/i ) {
822 $gc[$i][$j] = ( $gc[$i][$j] /
823 ($length - $gap[$i][$j]) );
829 for( $i = 0; $i < $seqct-1; $i++ ) {
830 # (diagonals) distance is 0 for same sequence
831 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
832 $values[$i][$i] = sprintf($precisionstr,0);
834 for( $j = $i+1; $j < $seqct; $j++ ) {
836 my $pairwise = $aln->select_noncont($i+1,$j+1);
837 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
838 my $P = $self->transitions($pairwise) / $L;
839 my $Q = $self->transversions($pairwise) / $L;
840 my $C = $gc[$i][$j] + $gc[$j][$i]-
841 ( 2 * $gc[$i][$j] * $gc[$j][$i] );
845 my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
847 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
848 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
849 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
850 # (diagonals) distance is 0 for same sequence
851 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
852 $values[$j][$j] = sprintf($precisionstr,0);
855 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
858 -values => \
@values);
865 Usage : my $d = $stat->D_F84($aln)
866 Function: Calculates D (pairwise distance) between 2 sequences in an
867 alignment using the Felsenstein 1984 distance model.
868 Returns : L<Bio::Matrix::PhylipDist>
869 Args : L<Bio::Align::AlignI> of DNA sequences
870 [optional] double - gap penalty
875 my ($self,$aln,$gappenalty) = @_;
876 return 0 unless $self->_check_arg($aln);
877 $self->throw_not_implemented();
878 # ambiguities ignored at this point
879 my (@seqs,@names,@values,%dist);
881 foreach my $seq ( $aln->each_seq) {
882 # if there is no name,
883 my $id = $seq->display_id;
884 if( ! length($id) || # deal with empty names
889 push @seqs, uc $seq->seq();
893 my $precisionstr = "%.$Precision"."f";
895 for( my $i = 0; $i < $seqct-1; $i++ ) {
896 # (diagonals) distance is 0 for same sequence
897 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
898 $values[$i][$i] = sprintf($precisionstr,0);
900 for( my $j = $i+1; $j < $seqct; $j++ ) {
905 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
906 # Tajima-Nei correction used for multiple substitutions in the calc
907 # of the distance matrix. Nucleic acids only.
909 # D = p-distance = 1 - (matches/(posns_scored + gaps)
911 # distance = -b * ln(1-D/b)
917 Usage : my $d = $stat->D_TajimaNei($aln)
918 Function: Calculates D (pairwise distance) between 2 sequences in an
919 alignment using the TajimaNei 1984 distance model.
920 Returns : L<Bio::Matrix::PhylipDist>
921 Args : Bio::Align::AlignI of DNA sequences
927 my ($self,$aln) = @_;
928 return 0 unless $self->_check_arg($aln);
929 # ambiguities ignored at this point
930 my (@seqs,@names,@values,%dist);
932 foreach my $seq ( $aln->each_seq) {
933 # if there is no name,
934 push @names, $seq->display_id;
935 push @seqs, uc $seq->seq();
938 my $precisionstr = "%.$Precision"."f";
941 for( $i =0; $i < $seqct -1; $i++ ) {
942 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
943 $values[$i][$i] = sprintf($precisionstr,0);
945 for ( $j = $i+1; $j <$seqct;$j++ ) {
946 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
948 my $pairwise = $aln->select_noncont($i+1,$j+1);
949 my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
951 for( $bs = 0; $bs < 4; $bs++ ) {
953 map {$fi += $matrix->[$bs]->[$_] } 0..3;
956 map { $fj += $matrix->[$_]->[$bs] } 0..3;
957 my $fij = ( $fi && $fj ) ?
($fi + $fj) /( 2 * $slen) : 0;
961 my ($pair,$h) = (0,0);
962 for( $bs = 0; $bs < 3; $bs++ ) {
963 for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
964 my $fij = $pfreq->[$pair++] / $slen;
967 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
969 map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
970 map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
971 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
972 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
975 $h += ( ($fij**2) / 2 ) /
976 ( ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
977 ( ( $ci2 + $cj2 ) / (2 * $slen) )
980 $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
984 # just want diagonals which are matches (A matched A, C -> C)
986 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
987 $matrix->[2]->[2] + $matrix->[3]->[3] );
988 my $D = 1 - ( $m / $slen);
993 my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
999 $d = (-1 * $b) * log ( $c);
1002 # fwd and rev lookup
1003 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
1004 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
1005 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1007 # (diagonals) distance is 0 for same sequence
1008 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
1009 $values[$j][$j] = sprintf($precisionstr,0);
1012 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
1015 -values => \
@values);
1019 # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1024 Usage : my $d = $stat->D_JinNei($aln)
1025 Function: Calculates D (pairwise distance) between 2 sequences in an
1026 alignment using the Jin-Nei 1990 distance model.
1027 Returns : L<Bio::Matrix::PhylipDist>
1028 Args : L<Bio::Align::AlignI> of DNA sequences
1034 my ($self,@args) = @_;
1035 $self->warn("JinNei implementation not completed");
1039 =head2 transversions
1041 Title : transversions
1042 Usage : my $transversions = $stats->transversion($aln);
1043 Function: Calculates the number of transversions between two sequences in
1046 Args : Bio::Align::AlignI
1052 my ($self,$aln) = @_;
1053 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1059 Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1060 Function: Calculates the number of transitions in a given DNA alignment
1061 Returns : integer representing the number of transitions
1062 Args : Bio::Align::AlignI object
1068 my ($self,$aln) = @_;
1069 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1073 sub _trans_count_helper
{
1074 my ($self,$aln,$type) = @_;
1075 return 0 unless( $self->_check_arg($aln) );
1076 if( ! $aln->is_flush ) { $self->throw("must be flush") }
1078 my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1079 uc $aln->get_seq_by_pos(2)->seq() );
1080 my $alen = $aln->length;
1081 for (my $i = 0;$i<$alen; $i++ ) {
1082 my ($c1,$c2) = ( substr($first,$i,1),
1083 substr($second,$i,1) );
1085 foreach my $nt ( @
{$type->{$c1}} ) {
1093 map { if( $_) { $sum += $_} } @tcount;
1097 # this will generate a matrix which records across the row, the number
1100 sub _build_nt_matrix
{
1101 my ($self,$seqa,$seqb) = @_;
1104 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
1108 my $gaps = 0; # number of gaps
1109 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
1110 my $len_a = length($seqa);
1111 for( my $i = 0; $i < $len_a; $i++) {
1112 my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1116 if( $ti =~ /^$GapChars$/) { $gaps++; next; }
1117 if( $tj =~ /^$GapChars$/) { $gaps++; next }
1119 my $ti_index = $NucleotideIndexes{$ti};
1120 my $tj_index = $NucleotideIndexes{$tj};
1122 if( ! defined $ti_index ) {
1123 $self->warn("ti_index not defined for $ti\n");
1127 $basect_matrix->[$ti_index]->[$tj_index]++;
1130 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1133 return ($basect_matrix,$pfreq,$gaps);
1136 sub _check_ambiguity_nucleotide
{
1137 my ($base1,$base2) = @_;
1138 my %iub = Bio
::Tools
::IUPAC
->iupac_iub();
1139 my @amb1 = @
{ $iub{uc($base1)} };
1140 my @amb2 = @
{ $iub{uc($base2)} };
1142 for my $amb ( @amb1 ) {
1143 if( grep { $amb eq $_ } @amb2 ) {
1149 return (1 / scalar @amb1) * (1 / scalar @amb2);
1157 my($self,$aln ) = @_;
1158 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
1159 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1161 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1162 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1172 =head2 pairwise_stats
1174 Title : pairwise_stats
1175 Usage : $obj->pairwise_stats($newval)
1177 Returns : value of pairwise_stats
1178 Args : newvalue (optional)
1184 my ($self,$value) = @_;
1185 if( defined $value) {
1186 $self->{'_pairwise_stats'} = $value;
1188 return $self->{'_pairwise_stats'};
1192 =head2 calc_KaKs_pair
1194 Title : calc_KaKs_pair
1195 Useage : my $results = $stats->calc_KaKs_pair($alnobj,
1197 Function : calculates Nei-Gojobori statistics for pairwise
1199 Args : A Bio::Align::AlignI compliant object such as a
1200 Bio::SimpleAlign object, and 2 sequence name strings.
1201 Returns : a reference to a hash of statistics with keys as
1202 listed in Description.
1206 sub calc_KaKs_pair
{
1207 my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1208 $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1210 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1212 #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1213 #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1214 {id
=> $seq1_id, seq
=> uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1215 {id
=> $seq2_id, seq
=> uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1217 if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1218 $self->throw(" aligned sequences must be of equal length!");
1221 $self->_get_av_ds_dn(\
@seqs, $results);
1226 =head2 calc_all_KaKs_pairs
1228 Title : calc_all_KaKs_pairs
1229 Useage : my $results2 = $stats->calc_KaKs_pair($alnobj).
1230 Function : Calculates Nei_gojobori statistics for all pairwise
1231 combinations in sequence.
1232 Arguments: A Bio::Align::ALignI compliant object such as
1233 a Bio::SimpleAlign object.
1234 Returns : A reference to an array of hashes of statistics of
1235 all pairwise comparisons in the alignment.
1241 sub calc_all_KaKs_pairs
{
1242 #returns a multi_element_array with all pairwise comparisons
1243 my ($self,$aln) = @_;
1244 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1246 for my $seq ($aln->each_seq) {
1247 push @seqs, {id
=> $seq->display_id, seq
=>$seq->seq};
1250 $results = $self->_get_av_ds_dn(\
@seqs, $results);
1254 =head2 calc_average_KaKs
1256 Title : calc_average_KaKs.
1257 Useage : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1258 Function : calculates Nei_Gojobori stats for average of all
1259 sequences in the alignment.
1260 Args : A Bio::Align::AlignI compliant object such as a
1261 Bio::SimpleAlign object, number of bootstrap iterations
1263 Returns : A reference to a hash of statistics as listed in Description.
1267 sub calc_average_KaKs
{
1268 #calculates global value for sequences in alignment using bootstrapping
1269 #this is quite slow (~10 seconds per 3 X 200nt seqs);
1270 my ($self, $aln, $bootstrap_rpt) = @_;
1271 $bootstrap_rpt ||= 1000;
1272 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1274 for my $seq ($aln->each_seq) {
1275 push @seqs, {id
=> $seq->display_id, seq
=>$seq->seq};
1278 my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\
@seqs);
1279 #print "ds = $ds_orig, dn = $dn_orig\n";
1280 $results = {D_s
=> $ds_orig, D_n
=> $dn_orig};
1281 $self->_run_bootstrap(\
@seqs, $results, $bootstrap_rpt);
1285 ############## primary internal subs for alignment comparisons ########################
1287 sub _run_bootstrap
{
1288 ### generates sampled sequences, calculates Ds and Dn values,
1289 ### then calculates variance of sampled sequences and add results to results hash
1291 my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1292 my @seqs = @
$seq_ref;
1293 my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1294 my %bootstrap_values = (ds
=> [], dn
=>[]); # to hold list of av values
1296 #1st make alternative array of codons;
1298 while ($c < length $seqs[0]{'seq'}) {
1300 push @
{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
1305 for (1..$bootstrap_rpt) {
1306 my $sampled = _resample
(\
@btstrp_aoa);
1307 my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1308 push @
{$bootstrap_values{'ds'}}, $ds;
1309 push @
{$bootstrap_values{'dn'}}, $dn;
1312 $results->{'D_s_var'} = sampling_variance
($bootstrap_values{'ds'});
1313 $results->{'D_n_var'} = sampling_variance
($bootstrap_values{'dn'});
1314 $results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
1315 sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1316 #print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
1317 #print "bootstrapped var_nc = $results->{'D_n_var'} \n";
1318 #print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
1323 my $codon_num = scalar (@
{$ref->[0]});
1325 for (0..$codon_num -1) { #for each codon
1326 my $rand = int (rand ($codon_num));
1328 push @
{$altered[$_]}, $ref->[$_][$rand];
1331 my @stringed = map {join '', @
$_}@altered;
1333 #now out in random name to keep other subs happy
1335 push @return, {id
=>'1', seq
=> $_};
1341 # takes array of hashes of sequence strings and ids #
1343 my $seq_ref = shift;
1344 my $result = shift if @_;
1345 my @caller = caller(1);
1346 my @seqarray = @
$seq_ref;
1347 my $bootstrap_score_list;
1348 #for a multiple alignment considers all pairwise combinations#
1349 my %dsfor_average = (ds
=> [], dn
=> []);
1350 for (my $i = 0; $i < scalar @seqarray; $i++) {
1351 for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
1352 # print "comparing $i and $j\n";
1353 if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1354 $self->warn(" aligned sequences must be of equal length!");
1358 my $syn_site_count = count_syn_sites
($seqarray[$i]{'seq'}, $synsites);
1359 my $syn_site_count2 = count_syn_sites
($seqarray[$j]{'seq'}, $synsites);
1360 # print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1361 my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations
($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1363 my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1364 my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1366 #calculate ps and pn (p54)
1367 my $syn_prop = $syn_count / $av_s_site;
1368 my $nc_prop = $non_syn_count / $av_ns_syn_site ;
1370 #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1371 my $d_syn = $self->jk($syn_prop);
1372 my $d_nc = $self->jk($nc_prop);
1374 #JK calculation must succeed for continuation of calculation
1375 #ret_value = -1 if error
1376 next unless $d_nc >=0 && $d_syn >=0;
1379 push @
{$dsfor_average{'ds'}}, $d_syn;
1380 push @
{$dsfor_average{'dn'}}, $d_nc;
1382 #if not doing bootstrap, calculate the pairwise comparisin stats
1383 if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1384 #now calculate variances assuming large sample
1385 my $d_syn_var = jk_var
($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
1386 my $d_nc_var = jk_var
($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1387 #now calculate z_value
1388 #print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
1389 #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1390 my $z = ($d_syn_var + $d_nc_var) ?
1391 ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1392 # print "z is $z\n";
1393 push @
$result , {S
=> $av_s_site, N
=>$av_ns_syn_site,
1394 S_d
=> $syn_count, N_d
=>$non_syn_count,
1395 P_s
=> $syn_prop, P_n
=>$nc_prop,
1396 D_s
=> @
{$dsfor_average{'ds'}}[-1],
1397 D_n
=> @
{$dsfor_average{'dn'}}[-1],
1398 D_n_var
=>$d_nc_var, D_s_var
=> $d_syn_var,
1399 Seq1
=> $seqarray[$i]{'id'},
1400 Seq2
=> $seqarray[$j]{'id'},
1403 $self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
1404 if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1409 #warn of failure if no results hashes are present
1410 #will fail if Jukes Cantor has failed for all pairwise combinations
1411 #$self->warn("calculation failed!") if scalar @$result ==0;
1413 #return results unless bootstrapping
1414 return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1415 #else if getting average for bootstrap
1416 return( mean
($dsfor_average{'ds'}),mean
($dsfor_average{'dn'})) ;
1421 my ($self, $p) = @_;
1423 $self->warn( " Jukes Cantor won't work -too divergent!");
1426 return -1 * (3/4) * (log(1 - (4/3) * $p));
1429 #works for large value of n (50?100?)
1432 return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1436 # compares 2 sequences to find the number of synonymous/non
1437 # synonymous mutations between them
1439 sub analyse_mutations
{
1440 my ($seq1, $seq2) = @_;
1441 my %mutator = ( 2=> {0=>[[1,2], # codon positions to be altered
1442 [2,1]], # depend on which is the same
1448 3=> [ [0,1,2], # all need to be altered
1455 my $TOTAL = 0; # total synonymous changes
1456 my $TOTAL_n = 0; # total non-synonymous changes
1460 my $seqlen = length($seq1);
1461 for (my $j=0; $j< $seqlen; $j+=3) {
1462 $input{'cod1'} = substr($seq1, $j,3);
1463 $input{'cod2'} = substr($seq2, $j,3);
1465 #ignore codon if beeing compared with gaps!
1466 if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1467 $gap_cnt += 3; #just increments once if there is a pair of gaps
1471 my ($diff_cnt, $same) = count_diffs
(\
%input);
1473 #ignore if codons are identical
1474 next if $diff_cnt == 0 ;
1475 if ($diff_cnt == 1) {
1476 $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1477 $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1478 #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1480 elsif ($diff_cnt ==2) {
1484 #will stay 4 unless there are stop codons at intervening point
1485 OUTER
:for my $perm (@
{$mutator{'2'}{$same}}) {
1486 my $altered = $input{'cod1'};
1488 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1489 for my $mut_i (@
$perm) { #index of codon mutated
1490 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1491 if ($t[$CODONS->{$altered}] eq '*') {
1493 #print "changes to stop codon!!\n";
1497 $s_cnt += $synchanges{$prev}{$altered};
1498 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1504 if ($tot_muts != 0) {
1505 $TOTAL += ($s_cnt/($tot_muts/2));
1506 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1510 elsif ($diff_cnt ==3 ) {
1513 my $tot_muts = 18; #potential number of mutations
1514 OUTER
: for my $perm (@
{$mutator{'3'}}) {
1515 my $altered = $input{'cod1'};
1517 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1518 for my $mut_i (@
$perm) { #index of codon mutated
1519 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1520 if ($t[$CODONS->{$altered}] eq '*') {
1522 # print "changes to stop codon!!\n";
1527 $s_cnt += $synchanges{$prev}{$altered};
1528 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1535 #calculate number of synonymous/non synonymous mutations for that codon
1537 if ($tot_muts != 0) {
1538 $TOTAL += ($s_cnt / ($tot_muts /3));
1539 $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1541 } #endif $diffcnt = 3
1542 } #end of sequencetraversal
1543 return ($TOTAL, $TOTAL_n, $gap_cnt);
1548 #counts the number of nucleotide differences between 2 codons
1549 # returns this value plus the codon index of which nucleotide is the same when 2
1550 #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1555 #just for 2 differences
1557 if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1563 return ($cnt, $same);
1566 =head2 get_syn_changes
1568 Title : get_syn_changes
1569 Usage : Bio::Align::DNAStatitics->get_syn_changes
1570 Function: Generate a hashref of all pairwise combinations of codns
1572 Returns : Symetic matrix using hashes
1574 and each codon points to a hashref of codons
1575 the values of which describe type of change.
1576 my $type = $hash{$codon1}->{$codon2};
1580 -1 either codon is a stop codon
1585 sub get_syn_changes
{
1586 #hash of all pairwise combinations of codons differing by 1
1587 # 1 = syn, 0 = non-syn, -1 = stop
1589 my @codons = _make_codons
();
1590 my $arr_len = scalar @codons;
1591 for (my $i = 0; $i < $arr_len -1; $i++) {
1592 my $cod1 = $codons[$i];
1593 for (my $j = $i +1; $j < $arr_len; $j++) {
1596 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1598 next if $diff_cnt !=1;
1601 if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
1602 $results{$cod1}{$codons[$j]} =1;
1603 $results{$codons[$j]}{$cod1} = 1;
1606 elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1607 $results{$cod1}{$codons[$j]} = -1;
1608 $results{$codons[$j]}{$cod1} = -1;
1612 $results{$cod1}{$codons[$j]} = 0;
1613 $results{$codons[$j]}{$cod1} = 0;
1620 =head2 dnds_pattern_number
1622 Title : dnds_pattern_number
1623 Usage : my $patterns = $stats->dnds_pattern_number($alnobj);
1624 Function: Counts the number of codons with no gaps in the MSA
1625 Returns : Number of codons with no gaps ('patterns' in PAML notation)
1626 Args : A Bio::Align::AlignI compliant object such as a
1627 Bio::SimpleAlign object.
1631 sub dnds_pattern_number
{
1632 my ($self, $aln) = @_;
1633 return ($aln->remove_gaps->length)/3;
1636 sub count_syn_sites
{
1637 #counts the number of possible synonymous changes for sequence
1638 my ($seq, $synsite) = @_;
1639 __PACKAGE__
->throw("not integral number of codons") if length($seq) % 3 != 0;
1641 for (my $i = 0; $i< length($seq); $i+=3) {
1642 my $cod = substr($seq, $i, 3);
1643 next if $cod =~ /\-/; #deal with alignment gaps
1644 $S += $synsite->{$cod}{'s'};
1653 #sub to generate lookup hash for the number of synonymous changes per codon
1654 my @nucs = qw(T C A G);
1659 # for each possible codon
1661 my $aa = $t[$CODONS->{$cod}];
1662 #calculate number of synonymous mutations vs non syn mutations
1663 for my $i (qw(0 1 2)){
1666 for my $nuc (qw(A T C G)) {
1667 next if substr ($cod, $i,1) eq $nuc;
1669 substr($test, $i, 1) = $nuc ;
1670 if ($t[$CODONS->{$test}] eq $aa) {
1673 if ($t[$CODONS->{$test}] eq '*') {
1677 $raw_results{$cod}[$i] = {'s' => $s ,
1681 } #end analysis of single codon
1683 } #end analysis of all codons
1686 for my $cod (sort keys %raw_results) {
1688 map{$t += ($_->{'s'} /$_->{'n'})} @
{$raw_results{$cod}};
1689 $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1691 return \
%final_results;
1695 #makes all codon combinations, returns array of them
1696 my @nucs = qw(T C A G);
1701 push @codons, "$i$j$k";
1709 #generates codon translation look up table#
1712 for my $codon (_make_codons
) {
1713 $CODONS->{$codon} = $x;
1719 #########stats subs, can go in another module? Here for speed. ###
1722 my $el_num = scalar @
$ref;
1724 map{$tot += $_}@
$ref;
1725 return ($tot/$el_num);
1730 my $mean = mean
($ref);
1731 my $sum_of_squares = 0;
1732 map{$sum_of_squares += ($_ - $mean) **2}@
$ref;
1733 return $sum_of_squares;
1736 sub sampling_variance
{
1738 return variance
($ref) / (scalar @
$ref -1);