sync w/ main trunk
[bioperl-live.git] / Bio / Align / DNAStatistics.pm
blobdb7d9c249fc3cc03a31f5ea4cee03a28cf96e13f
1 # $Id$
3 # BioPerl module for Bio::Align::DNAStatistics
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-AT-bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
19 =head1 SYNOPSIS
21 use Bio::AlignIO;
22 use Bio::Align::DNAStatistics;
24 my $stats = Bio::Align::DNAStatistics->new();
25 my $alignin = Bio::AlignIO->new(-format => 'emboss',
26 -file => 't/data/insulin.water');
27 my $aln = $alignin->next_aln;
28 my $jcmatrix = $stats->distance(-align => $aln,
29 -method => 'Jukes-Cantor');
31 print $jcmatrix->print_matrix;
32 ## and for measurements of synonymous /nonsynonymous substitutions ##
34 my $in = Bio::AlignIO->new(-format => 'fasta',
35 -file => 't/data/nei_gojobori_test.aln');
36 my $alnobj = $in->next_aln;
37 my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
38 my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
39 print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
40 for (sort keys %{$results->[0]} ){
41 next if /Seq/;
42 printf("%-9s %.4f \n",$_ , $results->[0]{$_});
45 my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
46 for my $an (@$results2){
47 print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
48 for (sort keys %$an ){
49 next if /Seq/;
50 printf("%-9s %.4f \n",$_ , $an->{$_});
52 print "\n\n";
55 my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
56 for (sort keys %$result3 ){
57 next if /Seq/;
58 printf("%-9s %.4f \n",$_ , $result3->{$_});
61 =head1 DESCRIPTION
63 This object contains routines for calculating various statistics and
64 distances for DNA alignments. The routines are not well tested and do
65 contain errors at this point. Work is underway to correct them, but
66 do not expect this code to give you the right answer currently! Use
67 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
68 distances.
71 Several different distance method calculations are supported. Listed
72 in brackets are the pattern which will match
74 =over 3
76 =item JukesCantor [jc|jukes|jukescantor|jukes-cantor]
78 =item Uncorrected [jcuncor|uncorrected]
80 =item F81 [f81|felsenstein]
82 =item Kimura [k2|k2p|k80|kimura]
84 =item Tamura [t92|tamura|tamura92]
86 =item F84 [f84|felsenstein84]
88 =item TajimaNei [tajimanei|tajima\-nei]
90 =item JinNei [jinnei|jin\-nei] (not implemented)
92 =back
94 There are also three methods to calculate the ratio of synonymous to
95 non-synonymous mutations. All are implementations of the Nei-Gojobori
96 evolutionary pathway method and use the Jukes-Cantor method of
97 nucleotide substitution. This method works well so long as the
98 nucleotide frequencies are roughly equal and there is no significant
99 transition/transversion bias. In order to use these methods there are
100 several pre-requisites for the alignment.
102 =over 3
104 =item 1
106 DNA alignment must be based on protein alignment. Use the subroutine
107 L<aa_to_dna_aln> in Bio::Align::Utilities to achieve this.
109 =item 2
111 Therefore alignment gaps must be in multiples of 3 (representing an aa
112 deletion/insertion) and at present must be indicated by a '-' symbol.
114 =item 3
116 Alignment must be solely of coding region and be in reading frame 0 to
117 achieve meaningful results
119 =item 4
121 Alignment must therefore be a multiple of 3 nucleotides long.
123 =item 5
125 All sequences must be the same length (including gaps). This should be
126 the case anyway if the sequences have been automatically aligned using
127 a program like Clustal.
129 =item 6
131 Only the standard codon alphabet is supported at present.
133 =back
135 calc_KaKs_pair() calculates a number of statistics for a named pair of
136 sequences in the alignment.
138 calc_all_KaKs_pairs() calculates these statistics for all pairwise
139 comparisons in an MSA. The statistics returned are:
141 =over 3
143 =item S_d
145 Number of synonymous mutations between the 2 sequences.
147 =item N_d
149 Number of non-synonymous mutations between the 2 sequences.
151 =item S
153 Mean number of synonymous sites in both sequences.
155 =item N
157 mean number of synonymous sites in both sequences.
159 =item P_s
161 proportion of synonymous differences in both sequences given by P_s = S_d/S.
163 =item P_n
165 proportion of non-synonymous differences in both sequences given by P_n = S_n/S.
167 =item D_s
169 estimation of synonymous mutations per synonymous site (by Jukes-Cantor).
171 =item D_n
173 estimation of non-synonymous mutations per non-synonymous site (by Jukes-Cantor).
175 =item D_n_var
177 estimation of variance of D_n .
179 =item D_s_var
181 estimation of variance of S_n.
183 =item z_value
185 calculation of z value.Positive value indicates D_n E<gt> D_s,
186 negative value indicates D_s E<gt> D_n.
188 =back
190 The statistics returned by calc_average_KaKs are:
192 =over 3
194 =item D_s
196 Average number of synonymous mutations/synonymous site.
198 =item D_n
200 Average number of non-synonymous mutations/non-synonymous site.
202 =item D_s_var
204 Estimated variance of Ds from bootstrapped alignments.
206 =item D_n_var
208 Estimated variance of Dn from bootstrapped alignments.
210 =item z_score
212 calculation of z value. Positive value indicates D_n E<gt>D_s,
213 negative values vice versa.
215 =back
217 The design of the code is based around the explanation of the
218 Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
219 Phylogenetics" by Nei and Kumar, published by Oxford University
220 Press. The methods have been tested using the worked example 4.1 in
221 the book, and reproduce those results. If people like having this sort
222 of analysis in BioPerl other methods for estimating Ds and Dn can be
223 provided later.
226 Much of the DNA distance code is based on implementations in EMBOSS
227 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
228 al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
229 Mitchison.
231 =head1 REFERENCES
233 =over 3
235 =item D_JukesCantor
237 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
238 Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
239 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
240 1969, pp. 21-132.
242 =item D_Tamura
244 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
246 =item D_Kimura
248 M Kimura, J. Mol. Evol., 1980, 16, 111.
250 =item JinNei
252 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
254 =item D_TajimaNei
256 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
258 =back
260 =head1 FEEDBACK
262 =head2 Mailing Lists
264 User feedback is an integral part of the evolution of this and other
265 Bioperl modules. Send your comments and suggestions preferably to
266 the Bioperl mailing list. Your participation is much appreciated.
268 bioperl-l@bioperl.org - General discussion
269 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
271 =head2 Support
273 Please direct usage questions or support issues to the mailing list:
275 L<bioperl-l@bioperl.org>
277 rather than to the module maintainer directly. Many experienced and
278 reponsive experts will be able look at the problem and quickly
279 address it. Please include a thorough description of the problem
280 with code and data examples if at all possible.
282 =head2 Reporting Bugs
284 Report bugs to the Bioperl bug tracking system to help us keep track
285 of the bugs and their resolution. Bug reports can be submitted via the
286 web:
288 http://bugzilla.open-bio.org/
290 =head1 AUTHOR - Jason Stajich
292 Email jason-AT-bioperl.org
294 =head1 CONTRIBUTORS
296 Richard Adams, richard.adams@ed.ac.uk
298 =head1 APPENDIX
300 The rest of the documentation details each of the object methods.
301 Internal methods are usually preceded with a _
303 =cut
306 # Let the code begin...
309 package Bio::Align::DNAStatistics;
310 use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
311 $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
312 $CODONS %synchanges $synsites $Precision $GCChhars);
313 use strict;
314 use Bio::Align::PairwiseStatistics;
315 use Bio::Matrix::PhylipDist;
316 use Bio::Tools::IUPAC;
318 BEGIN {
319 $GapChars = '[\.\-]';
320 $GCChhars = '[GCS]';
321 @Nucleotides = qw(A G T C);
322 $SeqCount = 2;
323 $Precision = 5;
325 # these values come from EMBOSS distmat implementation
326 %NucleotideIndexes = ( 'A' => 0,
327 'T' => 1,
328 'C' => 2,
329 'G' => 3,
331 'AT' => 0,
332 'AC' => 1,
333 'AG' => 2,
334 'CT' => 3,
335 'GT' => 4,
336 'CG' => 5,
338 # these are wrong now
339 # 'S' => [ 1, 3],
340 # 'W' => [ 0, 4],
341 # 'Y' => [ 2, 3],
342 # 'R' => [ 0, 1],
343 # 'M' => [ 0, 3],
344 # 'K' => [ 1, 2],
345 # 'B' => [ 1, 2, 3],
346 # 'H' => [ 0, 2, 3],
347 # 'V' => [ 0, 1, 3],
348 # 'D' => [ 0, 1, 2],
351 $DefaultGapPenalty = 0;
352 # could put ambiguities here?
353 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
354 'T' => [ 'A', 'G'],
355 'C' => [ 'A', 'G'],
356 'G' => [ 'C', 'T'],
358 'Transitions' => { 'A' => [ 'G' ],
359 'G' => [ 'A' ],
360 'C' => [ 'T' ],
361 'T' => [ 'C' ],
364 %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
365 'jcuncor|uncorrected' => 'Uncorrected',
366 'f81|felsenstein81' => 'F81',
367 'k2|k2p|k80|kimura' => 'Kimura',
368 't92|tamura|tamura92' => 'Tamura',
369 'f84|felsenstein84' => 'F84',
370 'tajimanei|tajima\-nei' => 'TajimaNei',
371 'jinnei|jin\-nei' => 'JinNei');
374 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
376 ## generate look up hashes for Nei_Gojobori methods##
377 $CODONS = get_codons();
378 my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
379 #create look up hash of number of possible synonymous mutations per codon
380 $synsites = get_syn_sites();
381 #create reference look up hash of single basechanges in codons
382 %synchanges = get_syn_changes();
386 =head2 new
388 Title : new
389 Usage : my $obj = Bio::Align::DNAStatistics->new();
390 Function: Builds a new Bio::Align::DNAStatistics object
391 Returns : Bio::Align::DNAStatistics
392 Args : none
395 =cut
397 sub new {
398 my ($class,@args) = @_;
399 my $self = $class->SUPER::new(@args);
401 $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
403 return $self;
407 =head2 distance
409 Title : distance
410 Usage : my $distance_mat = $stats->distance(-align => $aln,
411 -method => $method);
412 Function: Calculates a distance matrix for all pairwise distances of
413 sequences in an alignment.
414 Returns : L<Bio::Matrix::PhylipDist> object
415 Args : -align => Bio::Align::AlignI object
416 -method => String specifying specific distance method
417 (implementing class may assume a default)
418 See also: L<Bio::Matrix::PhylipDist>
420 =cut
422 sub distance{
423 my ($self,@args) = @_;
424 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
425 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
426 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
428 $method ||= 'JukesCantor';
429 foreach my $m ( keys %DistanceMethods ) {
430 if(defined $m && $method =~ /$m/i ) {
431 my $mtd = "D_$DistanceMethods{$m}";
432 return $self->$mtd($aln);
435 $self->warn("Unrecognized distance method $method must be one of [".
436 join(',',$self->available_distance_methods())."]");
437 return;
440 =head2 available_distance_methods
442 Title : available_distance_methods
443 Usage : my @methods = $stats->available_distance_methods();
444 Function: Enumerates the possible distance methods
445 Returns : Array of strings
446 Args : none
449 =cut
451 sub available_distance_methods{
452 my ($self,@args) = @_;
453 return values %DistanceMethods;
456 =head2 D - distance methods
459 =cut
462 =head2 D_JukesCantor
464 Title : D_JukesCantor
465 Usage : my $d = $stat->D_JukesCantor($aln)
466 Function: Calculates D (pairwise distance) between 2 sequences in an
467 alignment using the Jukes-Cantor 1 parameter model.
468 Returns : L<Bio::Matrix::PhylipDist>
469 Args : L<Bio::Align::AlignI> of DNA sequences
470 double - gap penalty
473 =cut
475 sub D_JukesCantor{
476 my ($self,$aln,$gappenalty) = @_;
477 return 0 unless $self->_check_arg($aln);
478 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
479 # ambiguities ignored at this point
480 my (@seqs,@names,@values,%dist);
481 my $seqct = 0;
482 foreach my $seq ( $aln->each_seq) {
483 push @names, $seq->display_id;
484 push @seqs, uc $seq->seq();
485 $seqct++;
487 my $precisionstr = "%.$Precision"."f";
488 for(my $i = 0; $i < $seqct-1; $i++ ) {
489 # (diagonals) distance is 0 for same sequence
490 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
491 $values[$i][$i] = sprintf($precisionstr,0);
493 for( my $j = $i+1; $j < $seqct; $j++ ) {
494 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
495 $seqs[$j]);
496 # just want diagonals
497 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
498 $matrix->[2]->[2] + $matrix->[3]->[3] );
499 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
500 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
501 # fwd and rev lookup
502 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
503 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
504 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
505 # (diagonals) distance is 0 for same sequence
506 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
507 $values[$j][$j] = sprintf($precisionstr,0);
510 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
511 -matrix => \%dist,
512 -names => \@names,
513 -values => \@values);
516 =head2 D_F81
518 Title : D_F81
519 Usage : my $d = $stat->D_F81($aln)
520 Function: Calculates D (pairwise distance) between 2 sequences in an
521 alignment using the Felsenstein 1981 distance model.
522 Relaxes the assumption of equal base frequencies that is
523 in JC.
524 Returns : L<Bio::Matrix::PhylipDist>
525 Args : L<Bio::Align::AlignI> of DNA sequences
528 =cut
530 sub D_F81{
531 my ($self,$aln,$gappenalty) = @_;
532 return 0 unless $self->_check_arg($aln);
533 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
534 # ambiguities ignored at this point
535 my (@seqs,@names,@values,%dist);
536 my $seqct = 0;
537 foreach my $seq ( $aln->each_seq) {
538 push @names, $seq->display_id;;
539 push @seqs, uc $seq->seq();
540 $seqct++;
542 my $precisionstr = "%.$Precision"."f";
543 for(my $i = 0; $i < $seqct-1; $i++ ) {
544 # (diagonals) distance is 0 for same sequence
545 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
546 $values[$i][$i] = sprintf($precisionstr,0);
548 for( my $j = $i+1; $j < $seqct; $j++ ) {
550 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
551 $seqs[$j]);
552 # just want diagonals
553 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
554 $matrix->[2]->[2] + $matrix->[3]->[3] );
555 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
556 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
557 # fwd and rev lookup
558 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
559 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
560 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
561 # (diagonals) distance is 0 for same sequence
562 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
563 $values[$j][$j] = sprintf($precisionstr,0);
566 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
567 -matrix => \%dist,
568 -names => \@names,
569 -values => \@values);
572 =head2 D_Uncorrected
574 Title : D_Uncorrected
575 Usage : my $d = $stats->D_Uncorrected($aln)
576 Function: Calculate a distance D, no correction for multiple substitutions
577 is used.
578 Returns : L<Bio::Matrix::PhylipDist>
579 Args : L<Bio::Align::AlignI> (DNA Alignment)
580 [optional] gap penalty
582 =cut
584 sub D_Uncorrected {
585 my ($self,$aln,$gappenalty) = @_;
586 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
587 return 0 unless $self->_check_arg($aln);
588 # ambiguities ignored at this point
589 my (@seqs,@names,@values,%dist);
590 my $seqct = 0;
591 foreach my $seq ( $aln->each_seq) {
592 push @names, $seq->display_id;
593 push @seqs, uc $seq->seq();
594 $seqct++;
597 my $precisionstr = "%.$Precision"."f";
598 my $len = $aln->length;
599 for( my $i = 0; $i < $seqct-1; $i++ ) {
600 # (diagonals) distance is 0 for same sequence
601 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
602 $values[$i][$i] = sprintf($precisionstr,0);
604 for( my $j = $i+1; $j < $seqct; $j++ ) {
605 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
606 $seqs[$j]);
607 my $m = ( $matrix->[0]->[0] +
608 $matrix->[1]->[1] +
609 $matrix->[2]->[2] +
610 $matrix->[3]->[3] );
611 my $D = 1 - ( $m / ( $len - $gaps + ( $gaps * $gappenalty)));
612 # fwd and rev lookup
613 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
614 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
615 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
616 # (diagonals) distance is 0 for same sequence
617 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
618 $values[$j][$j] = sprintf($precisionstr,0);
621 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
622 -matrix => \%dist,
623 -names => \@names,
624 -values => \@values);
628 # M Kimura, J. Mol. Evol., 1980, 16, 111.
630 =head2 D_Kimura
632 Title : D_Kimura
633 Usage : my $d = $stat->D_Kimura($aln)
634 Function: Calculates D (pairwise distance) between all pairs of sequences
635 in an alignment using the Kimura 2 parameter model.
636 Returns : L<Bio::Matrix::PhylipDist>
637 Args : L<Bio::Align::AlignI> of DNA sequences
640 =cut
642 sub D_Kimura {
643 my ($self,$aln) = @_;
644 return 0 unless $self->_check_arg($aln);
645 # ambiguities ignored at this point
646 my (@names,@values,%dist);
647 my $seqct = 0;
648 foreach my $seq ( $aln->each_seq) {
649 push @names, $seq->display_id;
650 $seqct++;
653 my $precisionstr = "%.$Precision"."f";
655 for( my $i = 0; $i < $seqct-1; $i++ ) {
656 # (diagonals) distance is 0 for same sequence
657 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
658 $values[$i][$i] = sprintf($precisionstr,0);
660 for( my $j = $i+1; $j < $seqct; $j++ ) {
661 my $pairwise = $aln->select_noncont($i+1,$j+1);
662 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
663 unless( $L ) {
664 $L = 1;
666 my $P = $self->transitions($pairwise) / $L;
667 my $Q = $self->transversions($pairwise) / $L;
668 my $K = 0;
669 my $denom = ( 1 - (2 * $P) - $Q);
670 if( $denom == 0 ) {
671 $self->throw("cannot find distance for ",$i+1,
672 ",",$j+1," $P, $Q\n");
674 my $a = 1 / ( 1 - (2 * $P) - $Q);
675 my $b = 1 / ( 1 - 2 * $Q );
676 if( $a < 0 || $b < 0 ) {
677 $K = -1;
678 } else{
679 $K = (1/2) * log ( $a ) + (1/4) * log($b);
681 # fwd and rev lookup
682 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
683 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
684 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
685 # (diagonals) distance is 0 for same sequence
686 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
687 $values[$j][$j] = sprintf($precisionstr,0);
690 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
691 -matrix => \%dist,
692 -names => \@names,
693 -values => \@values);
697 =head2 D_Kimura_variance
699 Title : D_Kimura
700 Usage : my $d = $stat->D_Kimura_variance($aln)
701 Function: Calculates D (pairwise distance) between all pairs of sequences
702 in an alignment using the Kimura 2 parameter model.
703 Returns : array of 2 L<Bio::Matrix::PhylipDist>,
704 the first is the Kimura distance and the second is
705 a matrix of variance V(K)
706 Args : L<Bio::Align::AlignI> of DNA sequences
709 =cut
711 sub D_Kimura_variance {
712 my ($self,$aln) = @_;
713 return 0 unless $self->_check_arg($aln);
714 # ambiguities ignored at this point
715 my (@names,@values,%dist,@var);
716 my $seqct = 0;
717 foreach my $seq ( $aln->each_seq) {
718 push @names, $seq->display_id;
719 $seqct++;
722 my $precisionstr = "%.$Precision"."f";
724 for( my $i = 0; $i < $seqct-1; $i++ ) {
725 # (diagonals) distance is 0 for same sequence
726 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
727 $values[$i][$i] = sprintf($precisionstr,0);
729 for( my $j = $i+1; $j < $seqct; $j++ ) {
730 my $pairwise = $aln->select_noncont($i+1,$j+1);
731 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
732 unless( $L ) {
733 $L = 1;
735 my $P = $self->transitions($pairwise) / $L;
736 my $Q = $self->transversions($pairwise) / $L;
737 my ($a,$b,$K,$var_k);
738 my $a_denom = ( 1 - (2 * $P) - $Q);
739 my $b_denom = 1 - 2 * $Q;
740 unless( $a_denom > 0 && $b_denom > 0 ) {
741 $a = 1;
742 $b = 1;
743 $K = -1;
744 $var_k = -1;
745 } else {
746 $a = 1 / $a_denom;
747 $b = 1 / $b_denom;
748 $K = (1/2) * log ( $a ) + (1/4) * log($b);
749 # from Wu and Li 1985 which in turn is from Kimura 1980
750 my $c = ( $a - $b ) / 2;
751 my $d = ( $a + $b ) / 2;
752 $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
755 # fwd and rev lookup
756 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
757 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
758 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
759 # (diagonals) distance is 0 for same sequence
760 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
761 $values[$j]->[$j] = sprintf($precisionstr,0);
763 $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
764 $var[$j]->[$j] = $values[$j]->[$j];
767 return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
768 -matrix => \%dist,
769 -names => \@names,
770 -values => \@values),
771 Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
772 -matrix => \%dist,
773 -names => \@names,
774 -values => \@var)
779 # K Tamura, Mol. Biol. Evol. 1992, 9, 678.
781 =head2 D_Tamura
783 Title : D_Tamura
784 Usage : Calculates D (pairwise distance) between 2 sequences in an
785 alignment using Tamura 1992 distance model.
786 Returns : L<Bio::Matrix::PhylipDist>
787 Args : L<Bio::Align::AlignI> of DNA sequences
790 =cut
792 sub D_Tamura {
793 my ($self,$aln) = @_;
794 return 0 unless $self->_check_arg($aln);
795 # ambiguities ignored at this point
796 my (@seqs,@names,@values,%dist,$i,$j);
797 my $seqct = 0;
798 my $length = $aln->length;
799 foreach my $seq ( $aln->each_seq) {
800 push @names, $seq->display_id;;
801 push @seqs, uc $seq->seq();
802 $seqct++;
805 my $precisionstr = "%.$Precision"."f";
806 my (@gap,@gc,@trans,@tranv,@score);
807 $i = 0;
808 for my $t1 ( @seqs ) {
809 $j = 0;
810 for my $t2 ( @seqs ) {
811 $gap[$i][$j] = 0;
812 for( my $k = 0; $k < $length; $k++ ) {
813 my ($c1,$c2) = ( substr($seqs[$i],$k,1),
814 substr($seqs[$j],$k,1) );
815 if( $c1 =~ /^$GapChars$/ ||
816 $c2 =~ /^$GapChars$/ ) {
817 $gap[$i][$j]++;
818 } elsif( $c2 =~ /^$GCChhars$/i ) {
819 $gc[$i][$j]++;
822 $gc[$i][$j] = ( $gc[$i][$j] /
823 ($length - $gap[$i][$j]) );
824 $j++;
826 $i++;
829 for( $i = 0; $i < $seqct-1; $i++ ) {
830 # (diagonals) distance is 0 for same sequence
831 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
832 $values[$i][$i] = sprintf($precisionstr,0);
834 for( $j = $i+1; $j < $seqct; $j++ ) {
836 my $pairwise = $aln->select_noncont($i+1,$j+1);
837 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
838 my $P = $self->transitions($pairwise) / $L;
839 my $Q = $self->transversions($pairwise) / $L;
840 my $C = $gc[$i][$j] + $gc[$j][$i]-
841 ( 2 * $gc[$i][$j] * $gc[$j][$i] );
842 if( $P ) {
843 $P = $P / $C;
845 my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
846 # fwd and rev lookup
847 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
848 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
849 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
850 # (diagonals) distance is 0 for same sequence
851 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
852 $values[$j][$j] = sprintf($precisionstr,0);
855 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
856 -matrix => \%dist,
857 -names => \@names,
858 -values => \@values);
862 =head2 D_F84
864 Title : D_F84
865 Usage : my $d = $stat->D_F84($aln)
866 Function: Calculates D (pairwise distance) between 2 sequences in an
867 alignment using the Felsenstein 1984 distance model.
868 Returns : L<Bio::Matrix::PhylipDist>
869 Args : L<Bio::Align::AlignI> of DNA sequences
870 [optional] double - gap penalty
872 =cut
874 sub D_F84 {
875 my ($self,$aln,$gappenalty) = @_;
876 return 0 unless $self->_check_arg($aln);
877 $self->throw_not_implemented();
878 # ambiguities ignored at this point
879 my (@seqs,@names,@values,%dist);
880 my $seqct = 0;
881 foreach my $seq ( $aln->each_seq) {
882 # if there is no name,
883 my $id = $seq->display_id;
884 if( ! length($id) || # deal with empty names
885 $id =~ /^\s+$/ ) {
886 $id = $seqct+1;
888 push @names, $id;
889 push @seqs, uc $seq->seq();
890 $seqct++;
893 my $precisionstr = "%.$Precision"."f";
895 for( my $i = 0; $i < $seqct-1; $i++ ) {
896 # (diagonals) distance is 0 for same sequence
897 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
898 $values[$i][$i] = sprintf($precisionstr,0);
900 for( my $j = $i+1; $j < $seqct; $j++ ) {
905 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
906 # Tajima-Nei correction used for multiple substitutions in the calc
907 # of the distance matrix. Nucleic acids only.
909 # D = p-distance = 1 - (matches/(posns_scored + gaps)
911 # distance = -b * ln(1-D/b)
914 =head2 D_TajimaNei
916 Title : D_TajimaNei
917 Usage : my $d = $stat->D_TajimaNei($aln)
918 Function: Calculates D (pairwise distance) between 2 sequences in an
919 alignment using the TajimaNei 1984 distance model.
920 Returns : L<Bio::Matrix::PhylipDist>
921 Args : Bio::Align::AlignI of DNA sequences
924 =cut
926 sub D_TajimaNei{
927 my ($self,$aln) = @_;
928 return 0 unless $self->_check_arg($aln);
929 # ambiguities ignored at this point
930 my (@seqs,@names,@values,%dist);
931 my $seqct = 0;
932 foreach my $seq ( $aln->each_seq) {
933 # if there is no name,
934 push @names, $seq->display_id;
935 push @seqs, uc $seq->seq();
936 $seqct++;
938 my $precisionstr = "%.$Precision"."f";
939 my ($i,$j,$bs);
940 # pairwise
941 for( $i =0; $i < $seqct -1; $i++ ) {
942 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
943 $values[$i][$i] = sprintf($precisionstr,0);
945 for ( $j = $i+1; $j <$seqct;$j++ ) {
946 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
947 $seqs[$j]);
948 my $pairwise = $aln->select_noncont($i+1,$j+1);
949 my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
950 my $fij2 = 0;
951 for( $bs = 0; $bs < 4; $bs++ ) {
952 my $fi = 0;
953 map {$fi += $matrix->[$bs]->[$_] } 0..3;
954 my $fj = 0;
955 # summation
956 map { $fj += $matrix->[$_]->[$bs] } 0..3;
957 my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
958 $fij2 += $fij**2;
961 my ($pair,$h) = (0,0);
962 for( $bs = 0; $bs < 3; $bs++ ) {
963 for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
964 my $fij = $pfreq->[$pair++] / $slen;
965 if( $fij ) {
967 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
969 map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
970 map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
971 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
972 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
974 if( $fij ) {
975 $h += ( ($fij**2) / 2 ) /
976 ( ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
977 ( ( $ci2 + $cj2 ) / (2 * $slen) )
980 $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
984 # just want diagonals which are matches (A matched A, C -> C)
986 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
987 $matrix->[2]->[2] + $matrix->[3]->[3] );
988 my $D = 1 - ( $m / $slen);
989 my $d;
990 if( $h == 0 ) {
991 $d = -1;
992 } else {
993 my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
994 my $c = 1- $D/ $b;
996 if( $c < 0 ) {
997 $d = -1;
998 } else {
999 $d = (-1 * $b) * log ( $c);
1002 # fwd and rev lookup
1003 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
1004 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
1005 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1007 # (diagonals) distance is 0 for same sequence
1008 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
1009 $values[$j][$j] = sprintf($precisionstr,0);
1012 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
1013 -matrix => \%dist,
1014 -names => \@names,
1015 -values => \@values);
1019 # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1021 =head2 D_JinNei
1023 Title : D_JinNei
1024 Usage : my $d = $stat->D_JinNei($aln)
1025 Function: Calculates D (pairwise distance) between 2 sequences in an
1026 alignment using the Jin-Nei 1990 distance model.
1027 Returns : L<Bio::Matrix::PhylipDist>
1028 Args : L<Bio::Align::AlignI> of DNA sequences
1031 =cut
1033 sub D_JinNei{
1034 my ($self,@args) = @_;
1035 $self->warn("JinNei implementation not completed");
1036 return;
1039 =head2 transversions
1041 Title : transversions
1042 Usage : my $transversions = $stats->transversion($aln);
1043 Function: Calculates the number of transversions between two sequences in
1044 an alignment
1045 Returns : integer
1046 Args : Bio::Align::AlignI
1049 =cut
1051 sub transversions{
1052 my ($self,$aln) = @_;
1053 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1056 =head2 transitions
1058 Title : transitions
1059 Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1060 Function: Calculates the number of transitions in a given DNA alignment
1061 Returns : integer representing the number of transitions
1062 Args : Bio::Align::AlignI object
1065 =cut
1067 sub transitions{
1068 my ($self,$aln) = @_;
1069 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1073 sub _trans_count_helper {
1074 my ($self,$aln,$type) = @_;
1075 return 0 unless( $self->_check_arg($aln) );
1076 if( ! $aln->is_flush ) { $self->throw("must be flush") }
1077 my (@tcount);
1078 my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1079 uc $aln->get_seq_by_pos(2)->seq() );
1080 my $alen = $aln->length;
1081 for (my $i = 0;$i<$alen; $i++ ) {
1082 my ($c1,$c2) = ( substr($first,$i,1),
1083 substr($second,$i,1) );
1084 if( $c1 ne $c2 ) {
1085 foreach my $nt ( @{$type->{$c1}} ) {
1086 if( $nt eq $c2) {
1087 $tcount[$i]++;
1092 my $sum = 0;
1093 map { if( $_) { $sum += $_} } @tcount;
1094 return $sum;
1097 # this will generate a matrix which records across the row, the number
1098 # of DNA subst
1100 sub _build_nt_matrix {
1101 my ($self,$seqa,$seqb) = @_;
1104 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
1105 [ qw(0 0 0 0) ],
1106 [ qw(0 0 0 0) ],
1107 [ qw(0 0 0 0) ] ];
1108 my $gaps = 0; # number of gaps
1109 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
1110 my $len_a = length($seqa);
1111 for( my $i = 0; $i < $len_a; $i++) {
1112 my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1113 $ti =~ tr/U/T/;
1114 $tj =~ tr/U/T/;
1116 if( $ti =~ /^$GapChars$/) { $gaps++; next; }
1117 if( $tj =~ /^$GapChars$/) { $gaps++; next }
1119 my $ti_index = $NucleotideIndexes{$ti};
1120 my $tj_index = $NucleotideIndexes{$tj};
1122 if( ! defined $ti_index ) {
1123 $self->warn("ti_index not defined for $ti\n");
1124 next;
1127 $basect_matrix->[$ti_index]->[$tj_index]++;
1129 if( $ti ne $tj ) {
1130 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1133 return ($basect_matrix,$pfreq,$gaps);
1136 sub _check_ambiguity_nucleotide {
1137 my ($base1,$base2) = @_;
1138 my %iub = Bio::Tools::IUPAC->iupac_iub();
1139 my @amb1 = @{ $iub{uc($base1)} };
1140 my @amb2 = @{ $iub{uc($base2)} };
1141 my ($pmatch) = (0);
1142 for my $amb ( @amb1 ) {
1143 if( grep { $amb eq $_ } @amb2 ) {
1144 $pmatch = 1;
1145 last;
1148 if( $pmatch ) {
1149 return (1 / scalar @amb1) * (1 / scalar @amb2);
1150 } else {
1151 return 0;
1156 sub _check_arg {
1157 my($self,$aln ) = @_;
1158 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
1159 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1160 return 0;
1161 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1162 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1163 return 0;
1165 return 1;
1168 =head2 Data Methods
1170 =cut
1172 =head2 pairwise_stats
1174 Title : pairwise_stats
1175 Usage : $obj->pairwise_stats($newval)
1176 Function:
1177 Returns : value of pairwise_stats
1178 Args : newvalue (optional)
1181 =cut
1183 sub pairwise_stats{
1184 my ($self,$value) = @_;
1185 if( defined $value) {
1186 $self->{'_pairwise_stats'} = $value;
1188 return $self->{'_pairwise_stats'};
1192 =head2 calc_KaKs_pair
1194 Title : calc_KaKs_pair
1195 Useage : my $results = $stats->calc_KaKs_pair($alnobj,
1196 $name1, $name2).
1197 Function : calculates Nei-Gojobori statistics for pairwise
1198 comparison.
1199 Args : A Bio::Align::AlignI compliant object such as a
1200 Bio::SimpleAlign object, and 2 sequence name strings.
1201 Returns : a reference to a hash of statistics with keys as
1202 listed in Description.
1204 =cut
1206 sub calc_KaKs_pair {
1207 my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1208 $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1209 if @_!= 4;
1210 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1211 my @seqs = (
1212 #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1213 #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1214 {id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1215 {id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1217 if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1218 $self->throw(" aligned sequences must be of equal length!");
1220 my $results = [];
1221 $self->_get_av_ds_dn(\@seqs, $results);
1222 return $results;
1226 =head2 calc_all_KaKs_pairs
1228 Title : calc_all_KaKs_pairs
1229 Useage : my $results2 = $stats->calc_KaKs_pair($alnobj).
1230 Function : Calculates Nei_gojobori statistics for all pairwise
1231 combinations in sequence.
1232 Arguments: A Bio::Align::ALignI compliant object such as
1233 a Bio::SimpleAlign object.
1234 Returns : A reference to an array of hashes of statistics of
1235 all pairwise comparisons in the alignment.
1237 =cut
1241 sub calc_all_KaKs_pairs {
1242 #returns a multi_element_array with all pairwise comparisons
1243 my ($self,$aln) = @_;
1244 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1245 my @seqs;
1246 for my $seq ($aln->each_seq) {
1247 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1249 my $results ;
1250 $results = $self->_get_av_ds_dn(\@seqs, $results);
1251 return $results;
1254 =head2 calc_average_KaKs
1256 Title : calc_average_KaKs.
1257 Useage : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1258 Function : calculates Nei_Gojobori stats for average of all
1259 sequences in the alignment.
1260 Args : A Bio::Align::AlignI compliant object such as a
1261 Bio::SimpleAlign object, number of bootstrap iterations
1262 (default 1000).
1263 Returns : A reference to a hash of statistics as listed in Description.
1265 =cut
1267 sub calc_average_KaKs {
1268 #calculates global value for sequences in alignment using bootstrapping
1269 #this is quite slow (~10 seconds per 3 X 200nt seqs);
1270 my ($self, $aln, $bootstrap_rpt) = @_;
1271 $bootstrap_rpt ||= 1000;
1272 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1273 my @seqs;
1274 for my $seq ($aln->each_seq) {
1275 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1277 my $results ;
1278 my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1279 #print "ds = $ds_orig, dn = $dn_orig\n";
1280 $results = {D_s => $ds_orig, D_n => $dn_orig};
1281 $self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1282 return $results;
1285 ############## primary internal subs for alignment comparisons ########################
1287 sub _run_bootstrap {
1288 ### generates sampled sequences, calculates Ds and Dn values,
1289 ### then calculates variance of sampled sequences and add results to results hash
1290 ###
1291 my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1292 my @seqs = @$seq_ref;
1293 my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1294 my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1296 #1st make alternative array of codons;
1297 my $c = 0;
1298 while ($c < length $seqs[0]{'seq'}) {
1299 for (0..$#seqs) {
1300 push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
1302 $c+=3;
1305 for (1..$bootstrap_rpt) {
1306 my $sampled = _resample (\@btstrp_aoa);
1307 my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1308 push @{$bootstrap_values{'ds'}}, $ds;
1309 push @{$bootstrap_values{'dn'}}, $dn;
1312 $results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1313 $results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1314 $results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
1315 sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1316 #print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
1317 #print "bootstrapped var_nc = $results->{'D_n_var'} \n";
1318 #print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
1321 sub _resample {
1322 my $ref = shift;
1323 my $codon_num = scalar (@{$ref->[0]});
1324 my @altered;
1325 for (0..$codon_num -1) { #for each codon
1326 my $rand = int (rand ($codon_num));
1327 for (0..$#$ref) {
1328 push @{$altered[$_]}, $ref->[$_][$rand];
1331 my @stringed = map {join '', @$_}@altered;
1332 my @return;
1333 #now out in random name to keep other subs happy
1334 for (@stringed) {
1335 push @return, {id=>'1', seq=> $_};
1337 return \@return;
1340 sub _get_av_ds_dn {
1341 # takes array of hashes of sequence strings and ids #
1342 my $self = shift;
1343 my $seq_ref = shift;
1344 my $result = shift if @_;
1345 my @caller = caller(1);
1346 my @seqarray = @$seq_ref;
1347 my $bootstrap_score_list;
1348 #for a multiple alignment considers all pairwise combinations#
1349 my %dsfor_average = (ds => [], dn => []);
1350 for (my $i = 0; $i < scalar @seqarray; $i++) {
1351 for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
1352 # print "comparing $i and $j\n";
1353 if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1354 $self->warn(" aligned sequences must be of equal length!");
1355 next;
1358 my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1359 my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1360 # print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1361 my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1362 #get averages
1363 my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1364 my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1366 #calculate ps and pn (p54)
1367 my $syn_prop = $syn_count / $av_s_site;
1368 my $nc_prop = $non_syn_count / $av_ns_syn_site ;
1370 #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1371 my $d_syn = $self->jk($syn_prop);
1372 my $d_nc = $self->jk($nc_prop);
1374 #JK calculation must succeed for continuation of calculation
1375 #ret_value = -1 if error
1376 next unless $d_nc >=0 && $d_syn >=0;
1379 push @{$dsfor_average{'ds'}}, $d_syn;
1380 push @{$dsfor_average{'dn'}}, $d_nc;
1382 #if not doing bootstrap, calculate the pairwise comparisin stats
1383 if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1384 #now calculate variances assuming large sample
1385 my $d_syn_var = jk_var($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
1386 my $d_nc_var = jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1387 #now calculate z_value
1388 #print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
1389 #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1390 my $z = ($d_syn_var + $d_nc_var) ?
1391 ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1392 # print "z is $z\n";
1393 push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1394 S_d => $syn_count, N_d =>$non_syn_count,
1395 P_s => $syn_prop, P_n=>$nc_prop,
1396 D_s => @{$dsfor_average{'ds'}}[-1],
1397 D_n => @{$dsfor_average{'dn'}}[-1],
1398 D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1399 Seq1 => $seqarray[$i]{'id'},
1400 Seq2 => $seqarray[$j]{'id'},
1401 z_score => $z,
1403 $self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
1404 if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1405 }#endif
1409 #warn of failure if no results hashes are present
1410 #will fail if Jukes Cantor has failed for all pairwise combinations
1411 #$self->warn("calculation failed!") if scalar @$result ==0;
1413 #return results unless bootstrapping
1414 return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1415 #else if getting average for bootstrap
1416 return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1420 sub jk {
1421 my ($self, $p) = @_;
1422 if ($p > 0.75) {
1423 $self->warn( " Jukes Cantor won't work -too divergent!");
1424 return -1;
1426 return -1 * (3/4) * (log(1 - (4/3) * $p));
1429 #works for large value of n (50?100?)
1430 sub jk_var {
1431 my ($p, $n) = @_;
1432 return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1436 # compares 2 sequences to find the number of synonymous/non
1437 # synonymous mutations between them
1439 sub analyse_mutations {
1440 my ($seq1, $seq2) = @_;
1441 my %mutator = ( 2=> {0=>[[1,2], # codon positions to be altered
1442 [2,1]], # depend on which is the same
1443 1=>[[0,2],
1444 [2,0]],
1445 2=>[[0,1],
1446 [1,0]],
1448 3=> [ [0,1,2], # all need to be altered
1449 [1,0,2],
1450 [0,2,1],
1451 [1,2,0],
1452 [2,0,1],
1453 [2,1,0] ],
1455 my $TOTAL = 0; # total synonymous changes
1456 my $TOTAL_n = 0; # total non-synonymous changes
1457 my $gap_cnt = 0;
1459 my %input;
1460 my $seqlen = length($seq1);
1461 for (my $j=0; $j< $seqlen; $j+=3) {
1462 $input{'cod1'} = substr($seq1, $j,3);
1463 $input{'cod2'} = substr($seq2, $j,3);
1465 #ignore codon if beeing compared with gaps!
1466 if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1467 $gap_cnt += 3; #just increments once if there is a pair of gaps
1468 next;
1471 my ($diff_cnt, $same) = count_diffs(\%input);
1473 #ignore if codons are identical
1474 next if $diff_cnt == 0 ;
1475 if ($diff_cnt == 1) {
1476 $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1477 $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1478 #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1480 elsif ($diff_cnt ==2) {
1481 my $s_cnt = 0;
1482 my $n_cnt = 0;
1483 my $tot_muts = 4;
1484 #will stay 4 unless there are stop codons at intervening point
1485 OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
1486 my $altered = $input{'cod1'};
1487 my $prev= $altered;
1488 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1489 for my $mut_i (@$perm) { #index of codon mutated
1490 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1491 if ($t[$CODONS->{$altered}] eq '*') {
1492 $tot_muts -=2;
1493 #print "changes to stop codon!!\n";
1494 next OUTER;
1496 else {
1497 $s_cnt += $synchanges{$prev}{$altered};
1498 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1500 $prev = $altered;
1502 # print "\n";
1504 if ($tot_muts != 0) {
1505 $TOTAL += ($s_cnt/($tot_muts/2));
1506 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1510 elsif ($diff_cnt ==3 ) {
1511 my $s_cnt = 0;
1512 my $n_cnt = 0;
1513 my $tot_muts = 18; #potential number of mutations
1514 OUTER: for my $perm (@{$mutator{'3'}}) {
1515 my $altered = $input{'cod1'};
1516 my $prev= $altered;
1517 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1518 for my $mut_i (@$perm) { #index of codon mutated
1519 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1520 if ($t[$CODONS->{$altered}] eq '*') {
1521 $tot_muts -=3;
1522 # print "changes to stop codon!!\n";
1523 next OUTER;
1526 else {
1527 $s_cnt += $synchanges{$prev}{$altered};
1528 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1530 $prev = $altered;
1532 # print "\n";
1534 }#end OUTER loop
1535 #calculate number of synonymous/non synonymous mutations for that codon
1536 # and add to total
1537 if ($tot_muts != 0) {
1538 $TOTAL += ($s_cnt / ($tot_muts /3));
1539 $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1541 } #endif $diffcnt = 3
1542 } #end of sequencetraversal
1543 return ($TOTAL, $TOTAL_n, $gap_cnt);
1547 sub count_diffs {
1548 #counts the number of nucleotide differences between 2 codons
1549 # returns this value plus the codon index of which nucleotide is the same when 2
1550 #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1551 # to change.
1552 my $ref = shift;
1553 my $cnt = 0;
1554 my $same= undef;
1555 #just for 2 differences
1556 for (0..2) {
1557 if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1558 $cnt++;
1559 } else {
1560 $same = $_;
1563 return ($cnt, $same);
1566 =head2 get_syn_changes
1568 Title : get_syn_changes
1569 Usage : Bio::Align::DNAStatitics->get_syn_changes
1570 Function: Generate a hashref of all pairwise combinations of codns
1571 differing by 1
1572 Returns : Symetic matrix using hashes
1573 First key is codon
1574 and each codon points to a hashref of codons
1575 the values of which describe type of change.
1576 my $type = $hash{$codon1}->{$codon2};
1577 values are :
1578 1 synonymous
1579 0 non-syn
1580 -1 either codon is a stop codon
1581 Args : none
1583 =cut
1585 sub get_syn_changes {
1586 #hash of all pairwise combinations of codons differing by 1
1587 # 1 = syn, 0 = non-syn, -1 = stop
1588 my %results;
1589 my @codons = _make_codons ();
1590 my $arr_len = scalar @codons;
1591 for (my $i = 0; $i < $arr_len -1; $i++) {
1592 my $cod1 = $codons[$i];
1593 for (my $j = $i +1; $j < $arr_len; $j++) {
1594 my $diff_cnt = 0;
1595 for my $pos(0..2) {
1596 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1598 next if $diff_cnt !=1;
1600 #synon change
1601 if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
1602 $results{$cod1}{$codons[$j]} =1;
1603 $results{$codons[$j]}{$cod1} = 1;
1605 #stop codon
1606 elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1607 $results{$cod1}{$codons[$j]} = -1;
1608 $results{$codons[$j]}{$cod1} = -1;
1610 # nc change
1611 else {
1612 $results{$cod1}{$codons[$j]} = 0;
1613 $results{$codons[$j]}{$cod1} = 0;
1617 return %results;
1620 =head2 dnds_pattern_number
1622 Title : dnds_pattern_number
1623 Usage : my $patterns = $stats->dnds_pattern_number($alnobj);
1624 Function: Counts the number of codons with no gaps in the MSA
1625 Returns : Number of codons with no gaps ('patterns' in PAML notation)
1626 Args : A Bio::Align::AlignI compliant object such as a
1627 Bio::SimpleAlign object.
1629 =cut
1631 sub dnds_pattern_number{
1632 my ($self, $aln) = @_;
1633 return ($aln->remove_gaps->length)/3;
1636 sub count_syn_sites {
1637 #counts the number of possible synonymous changes for sequence
1638 my ($seq, $synsite) = @_;
1639 __PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1640 my $S = 0;
1641 for (my $i = 0; $i< length($seq); $i+=3) {
1642 my $cod = substr($seq, $i, 3);
1643 next if $cod =~ /\-/; #deal with alignment gaps
1644 $S += $synsite->{$cod}{'s'};
1646 #print "S is $S\n";
1647 return $S;
1652 sub get_syn_sites {
1653 #sub to generate lookup hash for the number of synonymous changes per codon
1654 my @nucs = qw(T C A G);
1655 my %raw_results;
1656 for my $i (@nucs) {
1657 for my $j (@nucs) {
1658 for my $k (@nucs) {
1659 # for each possible codon
1660 my $cod = "$i$j$k";
1661 my $aa = $t[$CODONS->{$cod}];
1662 #calculate number of synonymous mutations vs non syn mutations
1663 for my $i (qw(0 1 2)){
1664 my $s = 0;
1665 my $n = 3;
1666 for my $nuc (qw(A T C G)) {
1667 next if substr ($cod, $i,1) eq $nuc;
1668 my $test = $cod;
1669 substr($test, $i, 1) = $nuc ;
1670 if ($t[$CODONS->{$test}] eq $aa) {
1671 $s++;
1673 if ($t[$CODONS->{$test}] eq '*') {
1674 $n--;
1677 $raw_results{$cod}[$i] = {'s' => $s ,
1678 'n' => $n };
1681 } #end analysis of single codon
1683 } #end analysis of all codons
1684 my %final_results;
1686 for my $cod (sort keys %raw_results) {
1687 my $t = 0;
1688 map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
1689 $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1691 return \%final_results;
1694 sub _make_codons {
1695 #makes all codon combinations, returns array of them
1696 my @nucs = qw(T C A G);
1697 my @codons;
1698 for my $i (@nucs) {
1699 for my $j (@nucs) {
1700 for my $k (@nucs) {
1701 push @codons, "$i$j$k";
1705 return @codons;
1708 sub get_codons {
1709 #generates codon translation look up table#
1710 my $x = 0;
1711 my $CODONS = {};
1712 for my $codon (_make_codons) {
1713 $CODONS->{$codon} = $x;
1714 $x++;
1716 return $CODONS;
1719 #########stats subs, can go in another module? Here for speed. ###
1720 sub mean {
1721 my $ref = shift;
1722 my $el_num = scalar @$ref;
1723 my $tot = 0;
1724 map{$tot += $_}@$ref;
1725 return ($tot/$el_num);
1728 sub variance {
1729 my $ref = shift;
1730 my $mean = mean($ref);
1731 my $sum_of_squares = 0;
1732 map{$sum_of_squares += ($_ - $mean) **2}@$ref;
1733 return $sum_of_squares;
1736 sub sampling_variance {
1737 my $ref = shift;
1738 return variance($ref) / (scalar @$ref -1);