Bio/Align/DNAStatistics.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::Align::DNAStatistics
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Jason Stajich <jason-AT-bioperl.org>
   8 #
   9 # Copyright Jason Stajich
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
  18
  19 =head1 SYNOPSIS
  20
  21   use Bio::AlignIO;
  22   use Bio::Align::DNAStatistics;
  23
  24   my $stats = Bio::Align::DNAStatistics->new();
  25   my $alignin = Bio::AlignIO->new(-format => 'emboss',
  26                                  -file   => 't/data/insulin.water');
  27   my $aln = $alignin->next_aln;
  28   my $jcmatrix = $stats->distance(-align => $aln,
  29                                   -method => 'Jukes-Cantor');
  30
  31   print $jcmatrix->print_matrix;
  32   ## and for measurements of synonymous /nonsynonymous substitutions ##
  33
  34   my $in = Bio::AlignIO->new(-format => 'fasta',
  35                             -file   => 't/data/nei_gojobori_test.aln');
  36   my $alnobj = $in->next_aln;
  37   my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
  38   my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
  39   print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
  40   for (sort keys %{$results->[0]} ){
  41       next if /Seq/;
  42       printf("%-9s %.4f \n",$_ , $results->[0]{$_});
  43   }
  44
  45   my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
  46   for my $an (@$results2){
  47       print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
  48       for (sort keys %$an ){
  49           next if /Seq/;
  50           printf("%-9s %.4f \n",$_ , $an->{$_});
  51       }
  52       print "\n\n";
  53   }
  54
  55   my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
  56   for (sort keys %$result3 ){
  57       next if /Seq/;
  58       printf("%-9s %.4f \n",$_ , $result3->{$_});
  59   }
  60
  61 =head1 DESCRIPTION
  62
  63 This object contains routines for calculating various statistics and
  64 distances for DNA alignments.  The routines are not well tested and do
  65 contain errors at this point.  Work is underway to correct them, but
  66 do not expect this code to give you the right answer currently!  Use
  67 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
  68 distances.
  69
  70
  71 Several different distance method calculations are supported.  Listed
  72 in brackets are the pattern which will match
  73
  74 =over 3
  75
  76 =item JukesCantor [jc|jukes|jukescantor|jukes-cantor]
  77
  78 =item Uncorrected [jcuncor|uncorrected]
  79
  80 =item F81 [f81|felsenstein]
  81
  82 =item Kimura [k2|k2p|k80|kimura]
  83
  84 =item Tamura [t92|tamura|tamura92]
  85
  86 =item F84 [f84|felsenstein84]
  87
  88 =item TajimaNei [tajimanei|tajima\-nei]
  89
  90 =item JinNei [jinnei|jin\-nei] (not implemented)
  91
  92 =back
  93
  94 There are also three methods to calculate the ratio of synonymous to
  95 non-synonymous mutations.  All are implementations of the Nei-Gojobori
  96 evolutionary pathway method and use the Jukes-Cantor method of
  97 nucleotide substitution. This method works well so long as the
  98 nucleotide frequencies are roughly equal and there is no significant
  99 transition/transversion bias.  In order to use these methods there are
 100 several pre-requisites for the alignment.
 101
 102 =over 3
 103
 104 =item 1
 105
 106 DNA alignment must be based on protein alignment. Use the subroutine
 107 L<aa_to_dna_aln> in Bio::Align::Utilities to achieve this.
 108
 109 =item 2
 110
 111 Therefore alignment gaps must be in multiples of 3 (representing an aa
 112 deletion/insertion) and at present must be indicated by a '-' symbol.
 113
 114 =item 3
 115
 116 Alignment must be solely of coding region and be in reading frame 0 to
 117 achieve meaningful results
 118
 119 =item 4
 120
 121 Alignment must therefore be a multiple of 3 nucleotides long.
 122
 123 =item 5
 124
 125 All sequences must be the same length (including gaps). This should be
 126 the case anyway if the sequences have been automatically aligned using
 127 a program like Clustal.
 128
 129 =item 6
 130
 131 Only the standard codon alphabet is supported at present.
 132
 133 =back
 134
 135 calc_KaKs_pair() calculates a number of statistics for a named pair of
 136 sequences in the alignment.
 137
 138 calc_all_KaKs_pairs() calculates these statistics for all pairwise
 139 comparisons in an MSA.  The statistics returned are:
 140
 141 =over 3
 142
 143 =item S_d
 144
 145 Number of synonymous mutations between the 2 sequences.
 146
 147 =item N_d
 148
 149 Number of non-synonymous mutations between the 2 sequences.
 150
 151 =item S
 152
 153 Mean number of  synonymous sites in both sequences.
 154
 155 =item N
 156
 157 mean number of  synonymous sites in both sequences.
 158
 159 =item P_s
 160
 161 proportion of synonymous differences in both sequences given by P_s = S_d/S.
 162
 163 =item P_n
 164
 165 proportion of non-synonymous differences in both sequences given by P_n = S_n/S.
 166
 167 =item D_s
 168
 169 estimation of synonymous mutations per synonymous site (by Jukes-Cantor).
 170
 171 =item D_n
 172
 173 estimation of non-synonymous mutations per non-synonymous site (by Jukes-Cantor).
 174
 175 =item D_n_var
 176
 177 estimation of variance of D_n .
 178
 179 =item D_s_var
 180
 181 estimation of variance of S_n.
 182
 183 =item z_value
 184
 185 calculation of z value.Positive value indicates D_n E<gt> D_s,
 186 negative value indicates D_s E<gt> D_n.
 187
 188 =back
 189
 190 The statistics returned by calc_average_KaKs are:
 191
 192 =over 3
 193
 194 =item D_s
 195
 196 Average number of synonymous mutations/synonymous site.
 197
 198 =item D_n
 199
 200 Average number of non-synonymous mutations/non-synonymous site.
 201
 202 =item D_s_var
 203
 204 Estimated variance of Ds from bootstrapped alignments.
 205
 206 =item D_n_var
 207
 208 Estimated variance of Dn from bootstrapped alignments.
 209
 210 =item z_score
 211
 212 calculation of z value. Positive value indicates D_n E<gt>D_s,
 213 negative values vice versa.
 214
 215 =back
 216
 217 The design of the code is based around the explanation of the
 218 Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
 219 Phylogenetics" by Nei and Kumar, published by Oxford University
 220 Press. The methods have been tested using the worked example 4.1 in
 221 the book, and reproduce those results. If people like having this sort
 222 of analysis in BioPerl other methods for estimating Ds and Dn can be
 223 provided later.
 224
 225
 226 Much of the DNA distance code is based on implementations in EMBOSS
 227 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
 228 al) [dnadist.c].  Insight also gained from Eddy, Durbin, Krogh, &
 229 Mitchison.
 230
 231 =head1 REFERENCES
 232
 233 =over 3
 234
 235 =item D_JukesCantor
 236
 237 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
 238 Mol. Systematics, 2nd ed, 1996, Ch 11.  Derived from "Evolution of
 239 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
 240 1969, pp. 21-132.
 241
 242 =item D_Tamura
 243
 244 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
 245
 246 =item D_Kimura
 247
 248 M Kimura, J. Mol. Evol., 1980, 16, 111.
 249
 250 =item JinNei
 251
 252 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
 253
 254 =item D_TajimaNei
 255
 256 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
 257
 258 =back
 259
 260 =head1 FEEDBACK
 261
 262 =head2 Mailing Lists
 263
 264 User feedback is an integral part of the evolution of this and other
 265 Bioperl modules. Send your comments and suggestions preferably to
 266 the Bioperl mailing list.  Your participation is much appreciated.
 267
 268   bioperl-l@bioperl.org                  - General discussion
 269   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 270
 271 =head2 Support
 272
 273 Please direct usage questions or support issues to the mailing list:
 274
 275 L<bioperl-l@bioperl.org>
 276
 277 rather than to the module maintainer directly. Many experienced and
 278 reponsive experts will be able look at the problem and quickly
 279 address it. Please include a thorough description of the problem
 280 with code and data examples if at all possible.
 281
 282 =head2 Reporting Bugs
 283
 284 Report bugs to the Bioperl bug tracking system to help us keep track
 285 of the bugs and their resolution. Bug reports can be submitted via the
 286 web:
 287
 288   http://bugzilla.open-bio.org/
 289
 290 =head1 AUTHOR - Jason Stajich
 291
 292 Email jason-AT-bioperl.org
 293
 294 =head1 CONTRIBUTORS
 295
 296 Richard Adams, richard.adams@ed.ac.uk
 297
 298 =head1 APPENDIX
 299
 300 The rest of the documentation details each of the object methods.
 301 Internal methods are usually preceded with a _
 302
 303 =cut
 304
 305
 306 # Let the code begin...
 307
 308
 309 package Bio::Align::DNAStatistics;
 310 use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
 311             $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
 312             $CODONS %synchanges $synsites $Precision $GCChhars);
 313 use strict;
 314 use Bio::Align::PairwiseStatistics;
 315 use Bio::Matrix::PhylipDist;
 316 use Bio::Tools::IUPAC;
 317
 318 BEGIN {
 319     $GapChars = '[\.\-]';
 320     $GCChhars = '[GCS]';
 321     @Nucleotides = qw(A G T C);
 322     $SeqCount = 2;
 323     $Precision = 5;
 324
 325     # these values come from EMBOSS distmat implementation
 326     %NucleotideIndexes = ( 'A' => 0,
 327                            'T' => 1,
 328                            'C' => 2,
 329                            'G' => 3,
 330
 331                            'AT' => 0,
 332                            'AC' => 1,
 333                            'AG' => 2,
 334                            'CT' => 3,
 335                            'GT' => 4,
 336                            'CG' => 5,
 337
 338 # these are wrong now
 339 #                          'S' => [ 1, 3],
 340 #                          'W' => [ 0, 4],
 341 #                          'Y' => [ 2, 3],
 342 #                          'R' => [ 0, 1],
 343 #                          'M' => [ 0, 3],
 344 #                          'K' => [ 1, 2],
 345 #                          'B' => [ 1, 2, 3],
 346 #                          'H' => [ 0, 2, 3],
 347 #                          'V' => [ 0, 1, 3],
 348 #                          'D' => [ 0, 1, 2],
 349                            );
 350
 351     $DefaultGapPenalty = 0;
 352     # could put ambiguities here?
 353     %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
 354                                          'T' => [ 'A', 'G'],
 355                                          'C' => [ 'A', 'G'],
 356                                          'G' => [ 'C', 'T'],
 357                                      },
 358                     'Transitions'   => { 'A' => [ 'G' ],
 359                                          'G' => [ 'A' ],
 360                                          'C' => [ 'T' ],
 361                                          'T' => [ 'C' ],
 362                                      },
 363                     );
 364     %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
 365                          'jcuncor|uncorrected'   => 'Uncorrected',
 366                          'f81|felsenstein81'     => 'F81',
 367                          'k2|k2p|k80|kimura'     => 'Kimura',
 368                          't92|tamura|tamura92'   => 'Tamura',
 369                          'f84|felsenstein84'     => 'F84',
 370                          'tajimanei|tajima\-nei' => 'TajimaNei',
 371                          'jinnei|jin\-nei'       => 'JinNei');
 372
 373 }
 374 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
 375
 376 ## generate look up hashes for Nei_Gojobori methods##
 377 $CODONS = get_codons();
 378 my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
 379 #create look up hash of number of possible synonymous mutations per codon
 380 $synsites = get_syn_sites();
 381 #create reference look up hash of single basechanges in codons
 382 %synchanges = get_syn_changes();
 383
 384
 385
 386 =head2 new
 387
 388  Title   : new
 389  Usage   : my $obj = Bio::Align::DNAStatistics->new();
 390  Function: Builds a new Bio::Align::DNAStatistics object
 391  Returns : Bio::Align::DNAStatistics
 392  Args    : none
 393
 394
 395 =cut
 396
 397 sub new {
 398     my ($class,@args) = @_;
 399     my $self = $class->SUPER::new(@args);
 400
 401     $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
 402
 403     return $self;
 404 }
 405
 406
 407 =head2 distance
 408
 409  Title   : distance
 410  Usage   : my $distance_mat = $stats->distance(-align  => $aln,
 411                                                -method => $method);
 412  Function: Calculates a distance matrix for all pairwise distances of
 413            sequences in an alignment.
 414  Returns : L<Bio::Matrix::PhylipDist> object
 415  Args    : -align  => Bio::Align::AlignI object
 416            -method => String specifying specific distance method
 417                       (implementing class may assume a default)
 418 See also: L<Bio::Matrix::PhylipDist>
 419
 420 =cut
 421
 422 sub distance{
 423    my ($self,@args) = @_;
 424    my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
 425    if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
 426        $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
 427    }
 428    $method ||= 'JukesCantor';
 429    foreach my $m ( keys %DistanceMethods ) {
 430        if(defined $m &&  $method =~ /$m/i ) {
 431            my $mtd = "D_$DistanceMethods{$m}";
 432            return $self->$mtd($aln);
 433        }
 434    }
 435    $self->warn("Unrecognized distance method $method must be one of [".
 436                join(',',$self->available_distance_methods())."]");
 437    return;
 438 }
 439
 440 =head2 available_distance_methods
 441
 442  Title   : available_distance_methods
 443  Usage   : my @methods = $stats->available_distance_methods();
 444  Function: Enumerates the possible distance methods
 445  Returns : Array of strings
 446  Args    : none
 447
 448
 449 =cut
 450
 451 sub available_distance_methods{
 452    my ($self,@args) = @_;
 453    return values %DistanceMethods;
 454 }
 455
 456 =head2 D - distance methods
 457
 458
 459 =cut
 460
 461
 462 =head2 D_JukesCantor
 463
 464  Title   : D_JukesCantor
 465  Usage   : my $d = $stat->D_JukesCantor($aln)
 466  Function: Calculates D (pairwise distance) between 2 sequences in an
 467            alignment using the Jukes-Cantor 1 parameter model.
 468  Returns : L<Bio::Matrix::PhylipDist>
 469  Args    : L<Bio::Align::AlignI> of DNA sequences
 470            double - gap penalty
 471
 472
 473 =cut
 474
 475 sub D_JukesCantor{
 476    my ($self,$aln,$gappenalty) = @_;
 477    return 0 unless $self->_check_arg($aln);
 478    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 479    # ambiguities ignored at this point
 480    my (@seqs,@names,@values,%dist);
 481    my $seqct = 0;
 482    foreach my $seq ( $aln->each_seq) {
 483        push @names, $seq->display_id;
 484        push @seqs, uc $seq->seq();
 485        $seqct++;
 486    }
 487    my $precisionstr = "%.$Precision"."f";
 488    for(my $i = 0; $i < $seqct-1; $i++ ) {
 489        # (diagonals) distance is 0 for same sequence
 490        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 491        $values[$i][$i] = sprintf($precisionstr,0);
 492
 493        for( my $j = $i+1; $j < $seqct; $j++ ) {
 494            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 495                                                                $seqs[$j]);
 496            # just want diagonals
 497            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
 498                      $matrix->[2]->[2] + $matrix->[3]->[3] );
 499            my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
 500            my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
 501            # fwd and rev lookup
 502            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 503            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 504            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 505            # (diagonals) distance is 0 for same sequence
 506            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 507            $values[$j][$j] = sprintf($precisionstr,0);
 508        }
 509    }
 510    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 511                                        -matrix  => \%dist,
 512                                        -names   => \@names,
 513                                        -values  => \@values);
 514 }
 515
 516 =head2 D_F81
 517
 518  Title   : D_F81
 519  Usage   : my $d = $stat->D_F81($aln)
 520  Function: Calculates D (pairwise distance) between 2 sequences in an
 521            alignment using the Felsenstein 1981 distance model.
 522            Relaxes the assumption of equal base frequencies that is
 523            in JC.
 524  Returns : L<Bio::Matrix::PhylipDist>
 525  Args    : L<Bio::Align::AlignI> of DNA sequences
 526
 527
 528 =cut
 529
 530 sub D_F81{
 531    my ($self,$aln,$gappenalty) = @_;
 532    return 0 unless $self->_check_arg($aln);
 533    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 534    # ambiguities ignored at this point
 535    my (@seqs,@names,@values,%dist);
 536    my $seqct = 0;
 537    foreach my $seq ( $aln->each_seq) {
 538        push @names, $seq->display_id;;
 539        push @seqs, uc $seq->seq();
 540        $seqct++;
 541    }
 542    my $precisionstr = "%.$Precision"."f";
 543    for(my $i = 0; $i < $seqct-1; $i++ ) {
 544        # (diagonals) distance is 0 for same sequence
 545        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 546        $values[$i][$i] = sprintf($precisionstr,0);
 547
 548        for( my $j = $i+1; $j < $seqct; $j++ ) {
 549
 550            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 551                                                                $seqs[$j]);
 552            # just want diagonals
 553            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
 554                      $matrix->[2]->[2] + $matrix->[3]->[3] );
 555            my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
 556            my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
 557            # fwd and rev lookup
 558            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 559            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 560            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 561            # (diagonals) distance is 0 for same sequence
 562            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 563            $values[$j][$j] = sprintf($precisionstr,0);
 564        }
 565    }
 566    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 567                                        -matrix  => \%dist,
 568                                        -names   => \@names,
 569                                        -values  => \@values);
 570 }
 571
 572 =head2 D_Uncorrected
 573
 574  Title   : D_Uncorrected
 575  Usage   : my $d = $stats->D_Uncorrected($aln)
 576  Function: Calculate a distance D, no correction for multiple substitutions
 577            is used.
 578  Returns : L<Bio::Matrix::PhylipDist>
 579  Args    : L<Bio::Align::AlignI> (DNA Alignment)
 580            [optional] gap penalty
 581
 582 =cut
 583
 584 sub D_Uncorrected {
 585    my ($self,$aln,$gappenalty) = @_;
 586    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 587    return 0 unless $self->_check_arg($aln);
 588    # ambiguities ignored at this point
 589    my (@seqs,@names,@values,%dist);
 590    my $seqct = 0;
 591    foreach my $seq ( $aln->each_seq) {
 592        push @names, $seq->display_id;
 593        push @seqs, uc $seq->seq();
 594        $seqct++;
 595    }
 596
 597    my $precisionstr = "%.$Precision"."f";
 598    my $len = $aln->length;
 599    for( my $i = 0; $i < $seqct-1; $i++ ) {
 600        # (diagonals) distance is 0 for same sequence
 601        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 602        $values[$i][$i] = sprintf($precisionstr,0);
 603
 604        for( my $j = $i+1; $j < $seqct; $j++ ) {
 605            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 606                                                                $seqs[$j]);
 607            my $m = ( $matrix->[0]->[0] +
 608                      $matrix->[1]->[1] +
 609                      $matrix->[2]->[2] +
 610                      $matrix->[3]->[3] );
 611            my $D = 1 - ( $m / ( $len - $gaps + ( $gaps * $gappenalty)));
 612            # fwd and rev lookup
 613            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 614            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 615            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
 616            # (diagonals) distance is 0 for same sequence
 617            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 618            $values[$j][$j] = sprintf($precisionstr,0);
 619        }
 620    }
 621    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 622                                        -matrix  => \%dist,
 623                                        -names   => \@names,
 624                                        -values  => \@values);
 625 }
 626
 627
 628 # M Kimura, J. Mol. Evol., 1980, 16, 111.
 629
 630 =head2 D_Kimura
 631
 632  Title   : D_Kimura
 633  Usage   : my $d = $stat->D_Kimura($aln)
 634  Function: Calculates D (pairwise distance) between all pairs of sequences
 635            in an alignment using the Kimura 2 parameter model.
 636  Returns : L<Bio::Matrix::PhylipDist>
 637  Args    : L<Bio::Align::AlignI> of DNA sequences
 638
 639
 640 =cut
 641
 642 sub D_Kimura {
 643    my ($self,$aln) = @_;
 644    return 0 unless $self->_check_arg($aln);
 645    # ambiguities ignored at this point
 646    my (@names,@values,%dist);
 647    my $seqct = 0;
 648    foreach my $seq ( $aln->each_seq) {
 649        push @names, $seq->display_id;
 650        $seqct++;
 651    }
 652
 653    my $precisionstr = "%.$Precision"."f";
 654
 655    for( my $i = 0; $i < $seqct-1; $i++ ) {
 656        # (diagonals) distance is 0 for same sequence
 657        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 658        $values[$i][$i] = sprintf($precisionstr,0);
 659
 660        for( my $j = $i+1; $j < $seqct; $j++ ) {
 661            my $pairwise = $aln->select_noncont($i+1,$j+1);
 662            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 663            unless( $L ) {
 664                $L = 1;
 665            }
 666            my $P = $self->transitions($pairwise) / $L;
 667            my $Q = $self->transversions($pairwise) / $L;
 668            my $K = 0;
 669            my $denom = ( 1 - (2 * $P) - $Q);
 670            if( $denom == 0 ) {
 671                $self->throw("cannot find distance for ",$i+1,
 672                             ",",$j+1," $P, $Q\n");
 673            }
 674            my $a = 1 / ( 1 - (2 * $P) - $Q);
 675            my $b = 1 / ( 1 - 2 * $Q );
 676            if( $a < 0 || $b < 0 ) {
 677                $K = -1;
 678            } else{
 679                $K = (1/2) * log ( $a ) + (1/4) * log($b);
 680            }
 681            # fwd and rev lookup
 682            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 683            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 684            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
 685            # (diagonals) distance is 0 for same sequence
 686            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 687            $values[$j][$j] = sprintf($precisionstr,0);
 688        }
 689    }
 690    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 691                                        -matrix  => \%dist,
 692                                        -names   => \@names,
 693                                        -values  => \@values);
 694 }
 695
 696
 697 =head2 D_Kimura_variance
 698
 699  Title   : D_Kimura
 700  Usage   : my $d = $stat->D_Kimura_variance($aln)
 701  Function: Calculates D (pairwise distance) between all pairs of sequences
 702            in an alignment using the Kimura 2 parameter model.
 703  Returns : array of 2 L<Bio::Matrix::PhylipDist>,
 704            the first is the Kimura distance and the second is
 705            a matrix of variance V(K)
 706  Args    : L<Bio::Align::AlignI> of DNA sequences
 707
 708
 709 =cut
 710
 711 sub D_Kimura_variance {
 712    my ($self,$aln) = @_;
 713    return 0 unless $self->_check_arg($aln);
 714    # ambiguities ignored at this point
 715    my (@names,@values,%dist,@var);
 716    my $seqct = 0;
 717    foreach my $seq ( $aln->each_seq) {
 718        push @names, $seq->display_id;
 719        $seqct++;
 720    }
 721
 722    my $precisionstr = "%.$Precision"."f";
 723
 724    for( my $i = 0; $i < $seqct-1; $i++ ) {
 725        # (diagonals) distance is 0 for same sequence
 726        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 727        $values[$i][$i] = sprintf($precisionstr,0);
 728
 729        for( my $j = $i+1; $j < $seqct; $j++ ) {
 730            my $pairwise = $aln->select_noncont($i+1,$j+1);
 731            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 732            unless( $L ) {
 733                $L = 1;
 734            }
 735            my $P = $self->transitions($pairwise) / $L;
 736            my $Q = $self->transversions($pairwise) / $L;
 737            my ($a,$b,$K,$var_k);
 738            my $a_denom = ( 1 - (2 * $P) - $Q);
 739            my $b_denom = 1 - 2 * $Q;
 740            unless( $a_denom > 0 && $b_denom > 0 ) {
 741                $a = 1;
 742                $b = 1;
 743                $K = -1;
 744                $var_k = -1;
 745            } else {
 746                $a = 1 / $a_denom;
 747                $b = 1 / $b_denom;
 748                $K = (1/2) * log ( $a ) + (1/4) * log($b);
 749                # from Wu and Li 1985 which in turn is from Kimura 1980
 750                my $c = ( $a - $b ) / 2;
 751                my $d = ( $a + $b ) / 2;
 752                $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
 753            }
 754
 755            # fwd and rev lookup
 756            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 757            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 758            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
 759            # (diagonals) distance is 0 for same sequence
 760            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 761            $values[$j]->[$j] = sprintf($precisionstr,0);
 762
 763            $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
 764            $var[$j]->[$j] = $values[$j]->[$j];
 765        }
 766    }
 767    return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 768                                          -matrix  => \%dist,
 769                                          -names   => \@names,
 770                                          -values  => \@values),
 771             Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 772                                          -matrix  => \%dist,
 773                                          -names   => \@names,
 774                                          -values  => \@var)
 775             );
 776 }
 777
 778
 779 #  K Tamura, Mol. Biol. Evol. 1992, 9, 678.
 780
 781 =head2 D_Tamura
 782
 783  Title   : D_Tamura
 784  Usage   : Calculates D (pairwise distance) between 2 sequences in an
 785            alignment using Tamura 1992 distance model.
 786  Returns : L<Bio::Matrix::PhylipDist>
 787  Args    : L<Bio::Align::AlignI> of DNA sequences
 788
 789
 790 =cut
 791
 792 sub D_Tamura {
 793    my ($self,$aln) = @_;
 794    return 0 unless $self->_check_arg($aln);
 795    # ambiguities ignored at this point
 796    my (@seqs,@names,@values,%dist,$i,$j);
 797    my $seqct = 0;
 798    my $length = $aln->length;
 799    foreach my $seq ( $aln->each_seq) {
 800        push @names, $seq->display_id;;
 801        push @seqs, uc $seq->seq();
 802        $seqct++;
 803    }
 804
 805    my $precisionstr = "%.$Precision"."f";
 806    my (@gap,@gc,@trans,@tranv,@score);
 807    $i = 0;
 808    for my $t1 ( @seqs ) {
 809        $j = 0;
 810        for my $t2 ( @seqs ) {
 811            $gap[$i][$j] = 0;
 812            for( my $k = 0; $k < $length; $k++ ) {
 813                my ($c1,$c2) = ( substr($seqs[$i],$k,1),
 814                                 substr($seqs[$j],$k,1) );
 815                if( $c1 =~ /^$GapChars$/ ||
 816                    $c2 =~ /^$GapChars$/ ) {
 817                    $gap[$i][$j]++;
 818                } elsif( $c2 =~ /^$GCChhars$/i ) {
 819                    $gc[$i][$j]++;
 820                }
 821            }
 822            $gc[$i][$j] = ( $gc[$i][$j] /
 823                            ($length - $gap[$i][$j]) );
 824            $j++;
 825        }
 826        $i++;
 827    }
 828
 829    for( $i = 0; $i < $seqct-1; $i++ ) {
 830        # (diagonals) distance is 0 for same sequence
 831        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 832        $values[$i][$i] = sprintf($precisionstr,0);
 833
 834        for( $j = $i+1; $j < $seqct; $j++ ) {
 835
 836            my $pairwise = $aln->select_noncont($i+1,$j+1);
 837            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 838            my $P = $self->transitions($pairwise) / $L;
 839            my $Q = $self->transversions($pairwise) / $L;
 840            my $C = $gc[$i][$j] + $gc[$j][$i]-
 841                ( 2 * $gc[$i][$j] * $gc[$j][$i] );
 842            if( $P ) {
 843                $P = $P / $C;
 844            }
 845            my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
 846            # fwd and rev lookup
 847            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 848            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 849            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 850            # (diagonals) distance is 0 for same sequence
 851            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 852            $values[$j][$j] = sprintf($precisionstr,0);
 853        }
 854    }
 855    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 856                                        -matrix  => \%dist,
 857                                        -names   => \@names,
 858                                        -values  => \@values);
 859
 860 }
 861
 862 =head2 D_F84
 863
 864  Title   : D_F84
 865  Usage   : my $d = $stat->D_F84($aln)
 866  Function: Calculates D (pairwise distance) between 2 sequences in an
 867            alignment using the Felsenstein 1984 distance model.
 868  Returns : L<Bio::Matrix::PhylipDist>
 869  Args    : L<Bio::Align::AlignI> of DNA sequences
 870            [optional] double - gap penalty
 871
 872 =cut
 873
 874 sub D_F84 {
 875    my ($self,$aln,$gappenalty) = @_;
 876    return 0 unless $self->_check_arg($aln);
 877    $self->throw_not_implemented();
 878    # ambiguities ignored at this point
 879    my (@seqs,@names,@values,%dist);
 880    my $seqct = 0;
 881    foreach my $seq ( $aln->each_seq) {
 882        # if there is no name,
 883        my $id = $seq->display_id;
 884        if( ! length($id) ||       # deal with empty names
 885            $id =~ /^\s+$/ ) {
 886            $id = $seqct+1;
 887        }
 888        push @names, $id;
 889        push @seqs, uc $seq->seq();
 890        $seqct++;
 891    }
 892
 893    my $precisionstr = "%.$Precision"."f";
 894
 895    for( my $i = 0; $i < $seqct-1; $i++ ) {
 896        # (diagonals) distance is 0 for same sequence
 897        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 898        $values[$i][$i] = sprintf($precisionstr,0);
 899
 900        for( my $j = $i+1; $j < $seqct; $j++ ) {
 901        }
 902    }
 903 }
 904
 905 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
 906 #  Tajima-Nei correction used for multiple substitutions in the calc
 907 # of the distance matrix. Nucleic acids only.
 908 #
 909 #  D = p-distance = 1 - (matches/(posns_scored + gaps)
 910 #
 911 #  distance = -b * ln(1-D/b)
 912 #
 913
 914 =head2 D_TajimaNei
 915
 916  Title   : D_TajimaNei
 917  Usage   : my $d = $stat->D_TajimaNei($aln)
 918  Function: Calculates D (pairwise distance) between 2 sequences in an
 919            alignment using the TajimaNei 1984 distance model.
 920  Returns : L<Bio::Matrix::PhylipDist>
 921  Args    : Bio::Align::AlignI of DNA sequences
 922
 923
 924 =cut
 925
 926 sub D_TajimaNei{
 927    my ($self,$aln) = @_;
 928    return 0 unless $self->_check_arg($aln);
 929    # ambiguities ignored at this point
 930    my (@seqs,@names,@values,%dist);
 931    my $seqct = 0;
 932    foreach my $seq ( $aln->each_seq) {
 933        # if there is no name,
 934        push @names, $seq->display_id;
 935        push @seqs, uc $seq->seq();
 936        $seqct++;
 937    }
 938    my $precisionstr = "%.$Precision"."f";
 939    my ($i,$j,$bs);
 940    # pairwise
 941    for( $i =0; $i < $seqct -1; $i++ ) {
 942        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 943        $values[$i][$i] = sprintf($precisionstr,0);
 944
 945        for ( $j = $i+1; $j <$seqct;$j++ ) {
 946            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 947                                                                $seqs[$j]);
 948            my $pairwise = $aln->select_noncont($i+1,$j+1);
 949            my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 950            my $fij2 = 0;
 951            for( $bs = 0; $bs < 4; $bs++ ) {
 952                my $fi = 0;
 953                map {$fi += $matrix->[$bs]->[$_] } 0..3;
 954                my $fj = 0;
 955                # summation
 956                map { $fj += $matrix->[$_]->[$bs] } 0..3;
 957                my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
 958                $fij2 += $fij**2;
 959            }
 960
 961            my ($pair,$h) = (0,0);
 962            for( $bs = 0; $bs < 3; $bs++ ) {
 963                for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
 964                    my $fij = $pfreq->[$pair++] / $slen;
 965                    if( $fij ) {
 966
 967                        my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
 968
 969                        map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
 970                        map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
 971                        map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
 972                        map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
 973
 974                        if( $fij ) {
 975                            $h += ( ($fij**2) / 2 ) /
 976                                (  ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
 977                                   ( ( $ci2 + $cj2 ) / (2 * $slen) )
 978                                   );
 979                        }
 980                        $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
 981                    }
 982                }
 983            }
 984            # just want diagonals which are matches (A matched A, C -> C)
 985
 986            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
 987                      $matrix->[2]->[2] + $matrix->[3]->[3] );
 988            my $D = 1 - ( $m / $slen);
 989            my $d;
 990            if( $h == 0 ) {
 991                $d = -1;
 992            } else {
 993                my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
 994                my $c = 1- $D/ $b;
 995
 996                if( $c < 0 ) {
 997                    $d = -1;
 998                } else {
 999                    $d = (-1 * $b) * log ( $c);
1000                }
1001            }
1002            # fwd and rev lookup
1003            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
1004            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
1005            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1006
1007            # (diagonals) distance is 0 for same sequence
1008            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
1009            $values[$j][$j] = sprintf($precisionstr,0);
1010        }
1011    }
1012    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
1013                                        -matrix  => \%dist,
1014                                        -names   => \@names,
1015                                        -values  => \@values);
1016
1017 }
1018
1019 # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1020
1021 =head2 D_JinNei
1022
1023  Title   : D_JinNei
1024  Usage   : my $d = $stat->D_JinNei($aln)
1025  Function: Calculates D (pairwise distance) between 2 sequences in an
1026            alignment using the Jin-Nei 1990 distance model.
1027  Returns : L<Bio::Matrix::PhylipDist>
1028  Args    : L<Bio::Align::AlignI> of DNA sequences
1029
1030
1031 =cut
1032
1033 sub D_JinNei{
1034    my ($self,@args) = @_;
1035    $self->warn("JinNei implementation not completed");
1036    return;
1037 }
1038
1039 =head2 transversions
1040
1041  Title   : transversions
1042  Usage   : my $transversions = $stats->transversion($aln);
1043  Function: Calculates the number of transversions between two sequences in
1044            an alignment
1045  Returns : integer
1046  Args    : Bio::Align::AlignI
1047
1048
1049 =cut
1050
1051 sub transversions{
1052    my ($self,$aln) = @_;
1053    return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1054 }
1055
1056 =head2 transitions
1057
1058  Title   : transitions
1059  Usage   : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1060  Function: Calculates the number of transitions in a given DNA alignment
1061  Returns : integer representing the number of transitions
1062  Args    : Bio::Align::AlignI object
1063
1064
1065 =cut
1066
1067 sub transitions{
1068    my ($self,$aln) = @_;
1069    return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1070 }
1071
1072
1073 sub _trans_count_helper {
1074     my ($self,$aln,$type) = @_;
1075     return 0 unless( $self->_check_arg($aln) );
1076     if( ! $aln->is_flush ) { $self->throw("must be flush") }
1077     my (@tcount);
1078     my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1079                             uc $aln->get_seq_by_pos(2)->seq() );
1080     my $alen = $aln->length;
1081     for (my $i = 0;$i<$alen; $i++ ) {
1082         my ($c1,$c2) = ( substr($first,$i,1),
1083                          substr($second,$i,1) );
1084         if( $c1 ne $c2 ) {
1085             foreach my $nt ( @{$type->{$c1}} ) {
1086                 if( $nt eq $c2) {
1087                    $tcount[$i]++;
1088                }
1089             }
1090         }
1091     }
1092     my $sum = 0;
1093     map { if( $_) { $sum += $_} } @tcount;
1094     return $sum;
1095 }
1096
1097 # this will generate a matrix which records across the row, the number
1098 # of DNA subst
1099 #
1100 sub _build_nt_matrix {
1101     my ($self,$seqa,$seqb) = @_;
1102
1103
1104     my $basect_matrix = [ [ qw(0 0 0 0) ],  # number of bases that match
1105                           [ qw(0 0 0 0) ],
1106                           [ qw(0 0 0 0) ],
1107                           [ qw(0 0 0 0) ] ];
1108     my $gaps = 0;                           # number of gaps
1109     my $pfreq = [ qw( 0 0 0 0 0 0)];        # matrix for pair frequency
1110     my $len_a = length($seqa);
1111     for( my $i = 0; $i < $len_a; $i++) {
1112         my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1113         $ti =~ tr/U/T/;
1114         $tj =~ tr/U/T/;
1115
1116         if( $ti =~ /^$GapChars$/) { $gaps++; next; }
1117         if( $tj =~ /^$GapChars$/) { $gaps++; next }
1118
1119         my $ti_index = $NucleotideIndexes{$ti};
1120         my $tj_index = $NucleotideIndexes{$tj};
1121
1122         if( ! defined $ti_index ) {
1123             $self->warn("ti_index not defined for $ti\n");
1124             next;
1125         }
1126
1127         $basect_matrix->[$ti_index]->[$tj_index]++;
1128
1129         if( $ti ne $tj ) {
1130             $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1131         }
1132     }
1133     return ($basect_matrix,$pfreq,$gaps);
1134 }
1135
1136 sub _check_ambiguity_nucleotide {
1137     my ($base1,$base2) = @_;
1138     my %iub = Bio::Tools::IUPAC->iupac_iub();
1139     my @amb1 = @{ $iub{uc($base1)} };
1140     my @amb2 = @{ $iub{uc($base2)} };
1141     my ($pmatch) = (0);
1142     for my $amb ( @amb1 ) {
1143         if( grep { $amb eq $_ } @amb2 ) {
1144             $pmatch = 1;
1145             last;
1146         }
1147     }
1148     if( $pmatch ) {
1149         return (1 / scalar @amb1) * (1 / scalar @amb2);
1150     } else {
1151         return 0;
1152     }
1153 }
1154
1155
1156 sub _check_arg {
1157     my($self,$aln ) = @_;
1158     if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
1159         $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1160         return 0;
1161     } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1162         $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1163         return 0;
1164     }
1165     return 1;
1166 }
1167
1168 =head2 Data Methods
1169
1170 =cut
1171
1172 =head2 pairwise_stats
1173
1174  Title   : pairwise_stats
1175  Usage   : $obj->pairwise_stats($newval)
1176  Function:
1177  Returns : value of pairwise_stats
1178  Args    : newvalue (optional)
1179
1180
1181 =cut
1182
1183 sub pairwise_stats{
1184    my ($self,$value) = @_;
1185    if( defined $value) {
1186       $self->{'_pairwise_stats'} = $value;
1187     }
1188     return $self->{'_pairwise_stats'};
1189
1190 }
1191
1192 =head2 calc_KaKs_pair
1193
1194  Title    : calc_KaKs_pair
1195  Useage   : my $results = $stats->calc_KaKs_pair($alnobj,
1196             $name1, $name2).
1197  Function : calculates Nei-Gojobori statistics for pairwise
1198             comparison.
1199  Args     : A Bio::Align::AlignI compliant object such as a
1200             Bio::SimpleAlign object, and 2 sequence name strings.
1201  Returns  : a reference to a hash of statistics with keys as
1202             listed in Description.
1203
1204 =cut
1205
1206 sub calc_KaKs_pair {
1207     my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1208     $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1209         if @_!= 4;
1210     $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1211     my @seqs = (
1212                 #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1213                 #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1214                 {id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1215                 {id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1216                ) ;
1217     if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1218         $self->throw(" aligned sequences must be of equal length!");
1219     }
1220     my $results = [];
1221     $self->_get_av_ds_dn(\@seqs, $results);
1222     return $results;
1223
1224 }
1225
1226 =head2 calc_all_KaKs_pairs
1227
1228  Title    : calc_all_KaKs_pairs
1229  Useage   : my $results2 = $stats->calc_KaKs_pair($alnobj).
1230  Function : Calculates Nei_gojobori statistics for all pairwise
1231             combinations in sequence.
1232  Arguments: A Bio::Align::ALignI compliant object such as
1233             a Bio::SimpleAlign object.
1234  Returns  : A reference to an array of hashes of statistics of
1235             all pairwise comparisons in the alignment.
1236
1237 =cut
1238
1239
1240
1241 sub calc_all_KaKs_pairs {
1242 #returns a multi_element_array with all pairwise comparisons
1243         my ($self,$aln) = @_;
1244         $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1245         my @seqs;
1246         for my $seq ($aln->each_seq) {
1247                 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1248                 }
1249         my $results ;
1250         $results = $self->_get_av_ds_dn(\@seqs, $results);
1251         return $results;
1252 }
1253
1254 =head2 calc_average_KaKs
1255
1256  Title    : calc_average_KaKs.
1257  Useage   : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1258  Function : calculates Nei_Gojobori stats for average of all
1259             sequences in the alignment.
1260  Args     : A Bio::Align::AlignI compliant object such as a
1261             Bio::SimpleAlign object, number of bootstrap iterations
1262             (default 1000).
1263  Returns  : A reference to a hash of statistics as listed in Description.
1264
1265 =cut
1266
1267 sub calc_average_KaKs {
1268 #calculates global value for sequences in alignment using bootstrapping
1269 #this is quite slow (~10 seconds per  3 X 200nt seqs);
1270     my ($self, $aln, $bootstrap_rpt) = @_;
1271     $bootstrap_rpt ||= 1000;
1272     $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1273     my @seqs;
1274     for my $seq ($aln->each_seq) {
1275         push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1276     }
1277     my $results ;
1278     my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1279     #print "ds = $ds_orig, dn = $dn_orig\n";
1280     $results = {D_s => $ds_orig, D_n => $dn_orig};
1281     $self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1282     return $results;
1283 }
1284
1285 ############## primary internal subs for alignment comparisons ########################
1286
1287 sub _run_bootstrap {
1288     ### generates sampled sequences, calculates Ds and Dn values,
1289     ### then calculates variance of sampled sequences and add results to results hash
1290     ###
1291     my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1292     my @seqs = @$seq_ref;
1293     my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1294     my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1295
1296     #1st make alternative array of codons;
1297     my $c = 0;
1298     while ($c < length $seqs[0]{'seq'}) {
1299         for (0..$#seqs) {
1300             push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
1301         }
1302         $c+=3;
1303     }
1304
1305     for (1..$bootstrap_rpt) {
1306         my $sampled = _resample (\@btstrp_aoa);
1307         my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1308         push @{$bootstrap_values{'ds'}}, $ds;
1309         push @{$bootstrap_values{'dn'}}, $dn;
1310     }
1311
1312     $results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1313     $results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1314     $results->{'z_score'} =     ($results->{'D_n'} - $results->{'D_s'}) /
1315         sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1316     #print "bootstrapped var_syn =      $results->{'D_s_var'} \n" ;
1317     #print "bootstrapped var_nc =       $results->{'D_n_var'} \n";
1318     #print "z is $results->{'z_score'}\n";      ### end of global set up of/perm look up data
1319 }
1320
1321 sub _resample {
1322     my $ref = shift;
1323     my $codon_num = scalar (@{$ref->[0]});
1324     my @altered;
1325     for (0..$codon_num -1) {    #for each codon
1326         my $rand = int (rand ($codon_num));
1327         for (0..$#$ref) {
1328             push @{$altered[$_]}, $ref->[$_][$rand];
1329         }
1330     }
1331     my @stringed = map {join '', @$_}@altered;
1332     my @return;
1333     #now out in random name to keep other subs happy
1334     for (@stringed) {
1335         push @return, {id=>'1', seq=> $_};
1336     }
1337     return \@return;
1338 }
1339
1340 sub _get_av_ds_dn {
1341     # takes array of hashes of sequence strings and ids   #
1342     my $self = shift;
1343     my $seq_ref = shift;
1344     my $result = shift if @_;
1345     my @caller = caller(1);
1346     my @seqarray = @$seq_ref;
1347     my $bootstrap_score_list;
1348     #for a multiple alignment considers all pairwise combinations#
1349     my %dsfor_average = (ds => [], dn => []);
1350     for (my $i = 0; $i < scalar @seqarray; $i++) {
1351         for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
1352 #                       print "comparing $i and $j\n";
1353             if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1354                 $self->warn(" aligned sequences must be of equal length!");
1355                 next;
1356             }
1357
1358             my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1359             my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1360 #                       print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1361             my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1362             #get averages
1363             my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1364             my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1365
1366             #calculate ps and pn  (p54)
1367             my $syn_prop = $syn_count / $av_s_site;
1368             my $nc_prop = $non_syn_count / $av_ns_syn_site      ;
1369
1370             #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1371             my $d_syn = $self->jk($syn_prop);
1372             my $d_nc = $self->jk($nc_prop);
1373
1374             #JK calculation must succeed for continuation of calculation
1375             #ret_value = -1 if error
1376             next unless $d_nc >=0 && $d_syn >=0;
1377
1378
1379             push @{$dsfor_average{'ds'}}, $d_syn;
1380             push @{$dsfor_average{'dn'}}, $d_nc;
1381
1382             #if not doing bootstrap, calculate the pairwise comparisin stats
1383             if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1384                                 #now calculate variances assuming large sample
1385                 my $d_syn_var =  jk_var($syn_prop, length($seqarray[$i]{'seq'})  - $gap_cnt );
1386                 my $d_nc_var =  jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1387                 #now calculate z_value
1388                 #print "d_syn_var is  $d_syn_var,and d_nc_var is $d_nc_var\n";
1389                 #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1390                 my $z = ($d_syn_var + $d_nc_var) ?
1391                   ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1392                 #       print "z is $z\n";
1393                 push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1394                                  S_d => $syn_count, N_d =>$non_syn_count,
1395                                  P_s => $syn_prop, P_n=>$nc_prop,
1396                                  D_s => @{$dsfor_average{'ds'}}[-1],
1397                                  D_n => @{$dsfor_average{'dn'}}[-1],
1398                                  D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1399                                  Seq1 => $seqarray[$i]{'id'},
1400                                  Seq2 => $seqarray[$j]{'id'},
1401                                  z_score => $z,
1402                              };
1403                 $self->warn (" number of mutations too small to justify normal test for  $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
1404                     if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1405             }#endif
1406             }
1407     }
1408
1409     #warn of failure if no results hashes are present
1410     #will fail if Jukes Cantor has failed for all pairwise combinations
1411     #$self->warn("calculation failed!") if scalar @$result ==0;
1412
1413     #return results unless bootstrapping
1414     return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1415     #else if getting average for bootstrap
1416     return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1417 }
1418
1419
1420 sub jk {
1421     my ($self, $p) = @_;
1422     if ($p > 0.75) {
1423         $self->warn( " Jukes Cantor won't  work -too divergent!");
1424         return -1;
1425     }
1426     return -1 * (3/4) * (log(1 - (4/3) * $p));
1427 }
1428
1429 #works for large value of n (50?100?)
1430 sub jk_var {
1431     my ($p, $n) = @_;
1432     return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1433 }
1434
1435
1436 # compares 2 sequences to find the number of synonymous/non
1437 # synonymous mutations between them
1438
1439 sub analyse_mutations {
1440     my ($seq1, $seq2) = @_;
1441     my %mutator = ( 2=> {0=>[[1,2],  # codon positions to be altered
1442                              [2,1]], # depend on which is the same
1443                          1=>[[0,2],
1444                              [2,0]],
1445                          2=>[[0,1],
1446                              [1,0]],
1447                      },
1448                     3=> [ [0,1,2],  # all need to be altered
1449                           [1,0,2],
1450                           [0,2,1],
1451                           [1,2,0],
1452                           [2,0,1],
1453                           [2,1,0] ],
1454                     );
1455     my $TOTAL   = 0;    # total synonymous changes
1456     my $TOTAL_n = 0;    # total non-synonymous changes
1457     my $gap_cnt = 0;
1458
1459     my %input;
1460     my $seqlen = length($seq1);
1461     for (my $j=0; $j< $seqlen; $j+=3) {
1462         $input{'cod1'} = substr($seq1, $j,3);
1463         $input{'cod2'} = substr($seq2, $j,3);
1464
1465         #ignore codon if beeing compared with gaps!
1466         if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1467             $gap_cnt += 3; #just increments once if there is a pair of gaps
1468             next;
1469         }
1470
1471         my ($diff_cnt, $same) = count_diffs(\%input);
1472
1473         #ignore if codons are identical
1474         next if $diff_cnt == 0 ;
1475         if ($diff_cnt == 1) {
1476             $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1477             $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1478             #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1479         }
1480         elsif ($diff_cnt ==2) {
1481             my $s_cnt = 0;
1482             my $n_cnt = 0;
1483             my $tot_muts = 4;
1484             #will stay 4 unless there are stop codons at intervening point
1485           OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
1486               my $altered = $input{'cod1'};
1487               my $prev= $altered;
1488               #         print "$prev -> (", $t[$CODONS->{$altered}], ")";
1489               for       my $mut_i (@$perm) { #index of codon mutated
1490                   substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1491                   if ($t[$CODONS->{$altered}] eq '*') {
1492                       $tot_muts -=2;
1493                       #print "changes to stop codon!!\n";
1494                       next OUTER;
1495                   }
1496                   else {
1497                       $s_cnt += $synchanges{$prev}{$altered};
1498                       #                                 print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1499                   }
1500                   $prev = $altered;
1501               }
1502               #         print "\n";
1503           }
1504             if ($tot_muts != 0) {
1505                 $TOTAL += ($s_cnt/($tot_muts/2));
1506                 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1507             }
1508
1509         }
1510         elsif ($diff_cnt ==3 ) {
1511             my $s_cnt = 0;
1512             my $n_cnt = 0;
1513             my $tot_muts = 18;  #potential number  of mutations
1514           OUTER: for my $perm (@{$mutator{'3'}}) {
1515               my $altered = $input{'cod1'};
1516               my $prev= $altered;
1517               # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1518               for my $mut_i (@$perm) { #index of codon mutated
1519                   substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1520                   if ($t[$CODONS->{$altered}] eq '*') {
1521                       $tot_muts -=3;
1522                       # print "changes to stop codon!!\n";
1523                       next OUTER;
1524
1525                   }
1526                   else {
1527                       $s_cnt += $synchanges{$prev}{$altered};
1528                       #                 print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1529                   }
1530                   $prev = $altered;
1531               }
1532               # print "\n";
1533
1534           }#end OUTER loop
1535               #calculate number of synonymous/non synonymous mutations for that codon
1536               # and add to total
1537               if ($tot_muts != 0) {
1538                   $TOTAL += ($s_cnt / ($tot_muts /3));
1539                   $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1540               }
1541         }                       #endif $diffcnt = 3
1542     }                           #end of sequencetraversal
1543     return ($TOTAL, $TOTAL_n, $gap_cnt);
1544 }
1545
1546
1547 sub count_diffs {
1548     #counts the number of nucleotide differences between 2 codons
1549     # returns this value plus the codon index of which nucleotide is the same when 2
1550     #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1551     # to change.
1552     my $ref = shift;
1553     my $cnt = 0;
1554     my $same= undef;
1555     #just for 2 differences
1556     for (0..2) {
1557         if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1558             $cnt++;
1559         } else {
1560             $same = $_;
1561         }
1562     }
1563     return ($cnt, $same);
1564 }
1565
1566 =head2 get_syn_changes
1567
1568  Title   : get_syn_changes
1569  Usage   : Bio::Align::DNAStatitics->get_syn_changes
1570  Function: Generate a hashref of all pairwise combinations of codns
1571            differing by 1
1572  Returns : Symetic matrix using hashes
1573            First key is codon
1574            and each codon points to a hashref of codons
1575            the values of which describe type of change.
1576            my $type = $hash{$codon1}->{$codon2};
1577            values are :
1578              1   synonymous
1579              0   non-syn
1580             -1   either codon is a stop codon
1581  Args    : none
1582
1583 =cut
1584
1585 sub get_syn_changes {
1586 #hash of all pairwise combinations of codons differing by 1
1587 # 1 = syn, 0 = non-syn, -1 = stop
1588     my %results;
1589     my @codons = _make_codons ();
1590     my $arr_len = scalar @codons;
1591     for (my $i = 0; $i < $arr_len -1; $i++) {
1592         my $cod1 = $codons[$i];
1593         for (my $j = $i +1; $j < $arr_len; $j++) {
1594             my $diff_cnt = 0;
1595             for my $pos(0..2) {
1596                 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1597             }
1598             next if $diff_cnt !=1;
1599
1600             #synon change
1601             if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
1602                 $results{$cod1}{$codons[$j]} =1;
1603                 $results{$codons[$j]}{$cod1} = 1;
1604             }
1605             #stop codon
1606             elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1607                 $results{$cod1}{$codons[$j]} = -1;
1608                 $results{$codons[$j]}{$cod1} = -1;
1609             }
1610             # nc change
1611             else {
1612                 $results{$cod1}{$codons[$j]} = 0;
1613                 $results{$codons[$j]}{$cod1} = 0;
1614             }
1615         }
1616     }
1617     return %results;
1618 }
1619
1620 =head2 dnds_pattern_number
1621
1622  Title   : dnds_pattern_number
1623  Usage   : my $patterns = $stats->dnds_pattern_number($alnobj);
1624  Function: Counts the number of codons with no gaps in the MSA
1625  Returns : Number of codons with no gaps ('patterns' in PAML notation)
1626  Args    : A Bio::Align::AlignI compliant object such as a
1627             Bio::SimpleAlign object.
1628
1629 =cut
1630
1631 sub dnds_pattern_number{
1632     my ($self, $aln) = @_;
1633     return ($aln->remove_gaps->length)/3;
1634 }
1635
1636 sub count_syn_sites {
1637     #counts the number of possible synonymous changes for sequence
1638     my ($seq, $synsite) = @_;
1639     __PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1640     my $S = 0;
1641     for (my $i = 0; $i< length($seq); $i+=3) {
1642         my $cod = substr($seq, $i, 3);
1643         next if $cod =~ /\-/;   #deal with alignment gaps
1644         $S +=  $synsite->{$cod}{'s'};
1645     }
1646     #print "S is $S\n";
1647     return $S;
1648 }
1649
1650
1651
1652 sub get_syn_sites {
1653     #sub to generate lookup hash for the number of synonymous changes per codon
1654     my @nucs = qw(T C A G);
1655     my %raw_results;
1656     for my $i (@nucs) {
1657         for my $j (@nucs) {
1658             for my $k (@nucs) {
1659                 # for each possible codon
1660                 my $cod = "$i$j$k";
1661                 my $aa = $t[$CODONS->{$cod}];
1662                 #calculate number of synonymous mutations vs non syn mutations
1663                 for my $i (qw(0 1 2)){
1664                     my $s = 0;
1665                     my $n = 3;
1666                     for my $nuc (qw(A T C G)) {
1667                         next if substr ($cod, $i,1) eq $nuc;
1668                         my $test = $cod;
1669                         substr($test, $i, 1) = $nuc ;
1670                         if ($t[$CODONS->{$test}] eq $aa) {
1671                             $s++;
1672                         }
1673                         if ($t[$CODONS->{$test}] eq '*') {
1674                             $n--;
1675                         }
1676                     }
1677                     $raw_results{$cod}[$i] = {'s' => $s ,
1678                                               'n' => $n };
1679                 }
1680
1681             } #end analysis of single codon
1682         }
1683     } #end analysis of all codons
1684     my %final_results;
1685
1686     for my $cod (sort keys %raw_results) {
1687         my $t = 0;
1688         map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
1689         $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1690     }
1691     return \%final_results;
1692 }
1693
1694 sub _make_codons {
1695 #makes all codon combinations, returns array of them
1696     my @nucs = qw(T C A G);
1697     my @codons;
1698     for my $i (@nucs) {
1699         for my $j (@nucs) {
1700             for my $k (@nucs) {
1701                 push @codons, "$i$j$k";
1702             }
1703         }
1704     }
1705     return @codons;
1706 }
1707
1708 sub get_codons {
1709  #generates codon translation look up table#
1710  my $x = 0;
1711  my  $CODONS = {};
1712  for my $codon (_make_codons) {
1713      $CODONS->{$codon} = $x;
1714      $x++;
1715  }
1716  return $CODONS;
1717 }
1718
1719 #########stats subs, can go in another module? Here for speed. ###
1720 sub mean {
1721     my $ref = shift;
1722     my $el_num = scalar @$ref;
1723     my $tot = 0;
1724     map{$tot += $_}@$ref;
1725     return ($tot/$el_num);
1726 }
1727
1728 sub variance {
1729     my $ref = shift;
1730     my $mean = mean($ref);
1731     my $sum_of_squares = 0;
1732     map{$sum_of_squares += ($_ - $mean) **2}@$ref;
1733     return $sum_of_squares;
1734 }
1735
1736 sub sampling_variance {
1737     my $ref = shift;
1738     return variance($ref) / (scalar @$ref -1);
1739 }
1740
1741 1;