Bio/Align/DNAStatistics.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::Align::DNAStatistics
   4 #
   5 # Cared for by Jason Stajich <jason-AT-bioperl.org>
   6 #
   7 # Copyright Jason Stajich
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
  16
  17 =head1 SYNOPSIS
  18
  19   use Bio::AlignIO;
  20   use Bio::Align::DNAStatistics;
  21
  22   my $stats = Bio::Align::DNAStatistics->new();
  23   my $alignin = Bio::AlignIO->new(-format => 'emboss',
  24                                  -file   => 't/data/insulin.water');
  25   my $aln = $alignin->next_aln;
  26   my $jcmatrix = $stats->distance(-align => $aln,
  27                                   -method => 'Jukes-Cantor');
  28
  29   print $jcmatrix->print_matrix;
  30   ## and for measurements of synonymous /nonsynonymous substitutions ##
  31
  32   my $in = Bio::AlignIO->new(-format => 'fasta',
  33                             -file   => 't/data/nei_gojobori_test.aln');
  34   my $alnobj = $in->next_aln;
  35   my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
  36   my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
  37   print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
  38   for (sort keys %{$results->[0]} ){
  39       next if /Seq/;
  40       printf("%-9s %.4f \n",$_ , $results->[0]{$_});
  41   }
  42
  43   my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
  44   for my $an (@$results2){
  45       print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
  46       for (sort keys %$an ){
  47           next if /Seq/;
  48           printf("%-9s %.4f \n",$_ , $an->{$_});
  49       }
  50       print "\n\n";
  51   }
  52
  53   my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
  54   for (sort keys %$result3 ){
  55       next if /Seq/;
  56       printf("%-9s %.4f \n",$_ , $result3->{$_});
  57   }
  58
  59 =head1 DESCRIPTION
  60
  61 This object contains routines for calculating various statistics and
  62 distances for DNA alignments.  The routines are not well tested and do
  63 contain errors at this point.  Work is underway to correct them, but
  64 do not expect this code to give you the right answer currently!  Use
  65 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
  66 distances.
  67
  68
  69 Several different distance method calculations are supported.  Listed
  70 in brackets are the pattern which will match
  71
  72 =over 3
  73
  74 =item JukesCantor [jc|jukes|jukescantor|jukes-cantor]
  75
  76 =item Uncorrected [jcuncor|uncorrected]
  77
  78 =item F81 [f81|felsenstein]
  79
  80 =item Kimura [k2|k2p|k80|kimura]
  81
  82 =item Tamura [t92|tamura|tamura92]
  83
  84 =item F84 [f84|felsenstein84]
  85
  86 =item TajimaNei [tajimanei|tajima\-nei]
  87
  88 =item JinNei [jinnei|jin\-nei] (not implemented)
  89
  90 =back
  91
  92 There are also three methods to calculate the ratio of synonymous to
  93 non-synonymous mutations.  All are implementations of the Nei-Gojobori
  94 evolutionary pathway method and use the Jukes-Cantor method of
  95 nucleotide substitution. This method works well so long as the
  96 nucleotide frequencies are roughly equal and there is no significant
  97 transition/transversion bias.  In order to use these methods there are
  98 several pre-requisites for the alignment.
  99
 100 =over 3
 101
 102 =item 1
 103
 104 DNA alignment must be based on protein alignment. Use the subroutine
 105 L<aa_to_dna_aln> in Bio::Align::Utilities to achieve this.
 106
 107 =item 2
 108
 109 Therefore alignment gaps must be in multiples of 3 (representing an aa
 110 deletion/insertion) and at present must be indicated by a '-' symbol.
 111
 112 =item 3
 113
 114 Alignment must be solely of coding region and be in reading frame 0 to
 115 achieve meaningful results
 116
 117 =item 4
 118
 119 Alignment must therefore be a multiple of 3 nucleotides long.
 120
 121 =item 5
 122
 123 All sequences must be the same length (including gaps). This should be
 124 the case anyway if the sequences have been automatically aligned using
 125 a program like Clustal.
 126
 127 =item 6
 128
 129 Only the standard codon alphabet is supported at present.
 130
 131 =back
 132
 133 calc_KaKs_pair() calculates a number of statistics for a named pair of
 134 sequences in the alignment.
 135
 136 calc_all_KaKs_pairs() calculates these statistics for all pairwise
 137 comparisons in an MSA.  The statistics returned are:
 138
 139 =over 3
 140
 141 =item S_d
 142
 143 Number of synonymous mutations between the 2 sequences.
 144
 145 =item N_d
 146
 147 Number of non-synonymous mutations between the 2 sequences.
 148
 149 =item S
 150
 151 Mean number of  synonymous sites in both sequences.
 152
 153 =item N
 154
 155 mean number of  synonymous sites in both sequences.
 156
 157 =item P_s
 158
 159 proportion of synonymous differences in both sequences given by P_s = S_d/S.
 160
 161 =item P_n
 162
 163 proportion of non-synonymous differences in both sequences given by P_n = S_n/S.
 164
 165 =item D_s
 166
 167 estimation of synonymous mutations per synonymous site (by Jukes-Cantor).
 168
 169 =item D_n
 170
 171 estimation of non-synonymous mutations per non-synonymous site (by Jukes-Cantor).
 172
 173 =item D_n_var
 174
 175 estimation of variance of D_n .
 176
 177 =item D_s_var
 178
 179 estimation of variance of S_n.
 180
 181 =item z_value
 182
 183 calculation of z value.Positive value indicates D_n E<gt> D_s,
 184 negative value indicates D_s E<gt> D_n.
 185
 186 =back
 187
 188 The statistics returned by calc_average_KaKs are:
 189
 190 =over 3
 191
 192 =item D_s
 193
 194 Average number of synonymous mutations/synonymous site.
 195
 196 =item D_n
 197
 198 Average number of non-synonymous mutations/non-synonymous site.
 199
 200 =item D_s_var
 201
 202 Estimated variance of Ds from bootstrapped alignments.
 203
 204 =item D_n_var
 205
 206 Estimated variance of Dn from bootstrapped alignments.
 207
 208 =item z_score
 209
 210 calculation of z value. Positive value indicates D_n E<gt>D_s,
 211 negative values vice versa.
 212
 213 =back
 214
 215 The design of the code is based around the explanation of the
 216 Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
 217 Phylogenetics" by Nei and Kumar, published by Oxford University
 218 Press. The methods have been tested using the worked example 4.1 in
 219 the book, and reproduce those results. If people like having this sort
 220 of analysis in BioPerl other methods for estimating Ds and Dn can be
 221 provided later.
 222
 223
 224 Much of the DNA distance code is based on implementations in EMBOSS
 225 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
 226 al) [dnadist.c].  Insight also gained from Eddy, Durbin, Krogh, &
 227 Mitchison.
 228
 229 =head1 REFERENCES
 230
 231 =over 3
 232
 233 =item D_JukesCantor
 234
 235 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
 236 Mol. Systematics, 2nd ed, 1996, Ch 11.  Derived from "Evolution of
 237 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
 238 1969, pp. 21-132.
 239
 240 =item D_Tamura
 241
 242 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
 243
 244 =item D_Kimura
 245
 246 M Kimura, J. Mol. Evol., 1980, 16, 111.
 247
 248 =item JinNei
 249
 250 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
 251
 252 =item D_TajimaNei
 253
 254 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
 255
 256 =back
 257
 258 =head1 FEEDBACK
 259
 260 =head2 Mailing Lists
 261
 262 User feedback is an integral part of the evolution of this and other
 263 Bioperl modules. Send your comments and suggestions preferably to
 264 the Bioperl mailing list.  Your participation is much appreciated.
 265
 266   bioperl-l@bioperl.org                  - General discussion
 267   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 268
 269 =head2 Reporting Bugs
 270
 271 Report bugs to the Bioperl bug tracking system to help us keep track
 272 of the bugs and their resolution. Bug reports can be submitted via the
 273 web:
 274
 275   http://bugzilla.open-bio.org/
 276
 277 =head1 AUTHOR - Jason Stajich
 278
 279 Email jason-AT-bioperl.org
 280
 281 =head1 CONTRIBUTORS
 282
 283 Richard Adams, richard.adams@ed.ac.uk
 284
 285 =head1 APPENDIX
 286
 287 The rest of the documentation details each of the object methods.
 288 Internal methods are usually preceded with a _
 289
 290 =cut
 291
 292
 293 # Let the code begin...
 294
 295
 296 package Bio::Align::DNAStatistics;
 297 use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
 298             $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
 299             $CODONS %synchanges $synsites $Precision $GCChhars);
 300 use strict;
 301 use Bio::Align::PairwiseStatistics;
 302 use Bio::Matrix::PhylipDist;
 303 use Bio::Tools::IUPAC;
 304
 305 BEGIN {
 306     $GapChars = '[\.\-]';
 307     $GCChhars = '[GCS]';
 308     @Nucleotides = qw(A G T C);
 309     $SeqCount = 2;
 310     $Precision = 5;
 311
 312     # these values come from EMBOSS distmat implementation
 313     %NucleotideIndexes = ( 'A' => 0,
 314                            'T' => 1,
 315                            'C' => 2,
 316                            'G' => 3,
 317
 318                            'AT' => 0,
 319                            'AC' => 1,
 320                            'AG' => 2,
 321                            'CT' => 3,
 322                            'GT' => 4,
 323                            'CG' => 5,
 324
 325 # these are wrong now
 326 #                          'S' => [ 1, 3],
 327 #                          'W' => [ 0, 4],
 328 #                          'Y' => [ 2, 3],
 329 #                          'R' => [ 0, 1],
 330 #                          'M' => [ 0, 3],
 331 #                          'K' => [ 1, 2],
 332 #                          'B' => [ 1, 2, 3],
 333 #                          'H' => [ 0, 2, 3],
 334 #                          'V' => [ 0, 1, 3],
 335 #                          'D' => [ 0, 1, 2],
 336                            );
 337
 338     $DefaultGapPenalty = 0;
 339     # could put ambiguities here?
 340     %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
 341                                          'T' => [ 'A', 'G'],
 342                                          'C' => [ 'A', 'G'],
 343                                          'G' => [ 'C', 'T'],
 344                                      },
 345                     'Transitions'   => { 'A' => [ 'G' ],
 346                                          'G' => [ 'A' ],
 347                                          'C' => [ 'T' ],
 348                                          'T' => [ 'C' ],
 349                                      },
 350                     );
 351     %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
 352                          'jcuncor|uncorrected'   => 'Uncorrected',
 353                          'f81|felsenstein81'     => 'F81',
 354                          'k2|k2p|k80|kimura'     => 'Kimura',
 355                          't92|tamura|tamura92'   => 'Tamura',
 356                          'f84|felsenstein84'     => 'F84',
 357                          'tajimanei|tajima\-nei' => 'TajimaNei',
 358                          'jinnei|jin\-nei'       => 'JinNei');
 359
 360 }
 361 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
 362
 363 ## generate look up hashes for Nei_Gojobori methods##
 364 $CODONS = get_codons();
 365 my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
 366 #create look up hash of number of possible synonymous mutations per codon
 367 $synsites = get_syn_sites();
 368 #create reference look up hash of single basechanges in codons
 369 %synchanges = get_syn_changes();
 370
 371
 372
 373 =head2 new
 374
 375  Title   : new
 376  Usage   : my $obj = Bio::Align::DNAStatistics->new();
 377  Function: Builds a new Bio::Align::DNAStatistics object
 378  Returns : Bio::Align::DNAStatistics
 379  Args    : none
 380
 381
 382 =cut
 383
 384 sub new {
 385     my ($class,@args) = @_;
 386     my $self = $class->SUPER::new(@args);
 387
 388     $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
 389
 390     return $self;
 391 }
 392
 393
 394 =head2 distance
 395
 396  Title   : distance
 397  Usage   : my $distance_mat = $stats->distance(-align  => $aln,
 398                                                -method => $method);
 399  Function: Calculates a distance matrix for all pairwise distances of
 400            sequences in an alignment.
 401  Returns : L<Bio::Matrix::PhylipDist> object
 402  Args    : -align  => Bio::Align::AlignI object
 403            -method => String specifying specific distance method
 404                       (implementing class may assume a default)
 405 See also: L<Bio::Matrix::PhylipDist>
 406
 407 =cut
 408
 409 sub distance{
 410    my ($self,@args) = @_;
 411    my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
 412    if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
 413        $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
 414    }
 415    $method ||= 'JukesCantor';
 416    foreach my $m ( keys %DistanceMethods ) {
 417        if(defined $m &&  $method =~ /$m/i ) {
 418            my $mtd = "D_$DistanceMethods{$m}";
 419            return $self->$mtd($aln);
 420        }
 421    }
 422    $self->warn("Unrecognized distance method $method must be one of [".
 423                join(',',$self->available_distance_methods())."]");
 424    return;
 425 }
 426
 427 =head2 available_distance_methods
 428
 429  Title   : available_distance_methods
 430  Usage   : my @methods = $stats->available_distance_methods();
 431  Function: Enumerates the possible distance methods
 432  Returns : Array of strings
 433  Args    : none
 434
 435
 436 =cut
 437
 438 sub available_distance_methods{
 439    my ($self,@args) = @_;
 440    return values %DistanceMethods;
 441 }
 442
 443 =head2 D - distance methods
 444
 445
 446 =cut
 447
 448
 449 =head2 D_JukesCantor
 450
 451  Title   : D_JukesCantor
 452  Usage   : my $d = $stat->D_JukesCantor($aln)
 453  Function: Calculates D (pairwise distance) between 2 sequences in an
 454            alignment using the Jukes-Cantor 1 parameter model.
 455  Returns : L<Bio::Matrix::PhylipDist>
 456  Args    : L<Bio::Align::AlignI> of DNA sequences
 457            double - gap penalty
 458
 459
 460 =cut
 461
 462 sub D_JukesCantor{
 463    my ($self,$aln,$gappenalty) = @_;
 464    return 0 unless $self->_check_arg($aln);
 465    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 466    # ambiguities ignored at this point
 467    my (@seqs,@names,@values,%dist);
 468    my $seqct = 0;
 469    foreach my $seq ( $aln->each_seq) {
 470        push @names, $seq->display_id;
 471        push @seqs, uc $seq->seq();
 472        $seqct++;
 473    }
 474    my $precisionstr = "%.$Precision"."f";
 475    for(my $i = 0; $i < $seqct-1; $i++ ) {
 476        # (diagonals) distance is 0 for same sequence
 477        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 478        $values[$i][$i] = sprintf($precisionstr,0);
 479
 480        for( my $j = $i+1; $j < $seqct; $j++ ) {
 481            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 482                                                                $seqs[$j]);
 483            # just want diagonals
 484            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
 485                      $matrix->[2]->[2] + $matrix->[3]->[3] );
 486            my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
 487            my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
 488            # fwd and rev lookup
 489            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 490            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 491            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 492            # (diagonals) distance is 0 for same sequence
 493            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 494            $values[$j][$j] = sprintf($precisionstr,0);
 495        }
 496    }
 497    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 498                                        -matrix  => \%dist,
 499                                        -names   => \@names,
 500                                        -values  => \@values);
 501 }
 502
 503 =head2 D_F81
 504
 505  Title   : D_F81
 506  Usage   : my $d = $stat->D_F81($aln)
 507  Function: Calculates D (pairwise distance) between 2 sequences in an
 508            alignment using the Felsenstein 1981 distance model.
 509            Relaxes the assumption of equal base frequencies that is
 510            in JC.
 511  Returns : L<Bio::Matrix::PhylipDist>
 512  Args    : L<Bio::Align::AlignI> of DNA sequences
 513
 514
 515 =cut
 516
 517 sub D_F81{
 518    my ($self,$aln,$gappenalty) = @_;
 519    return 0 unless $self->_check_arg($aln);
 520    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 521    # ambiguities ignored at this point
 522    my (@seqs,@names,@values,%dist);
 523    my $seqct = 0;
 524    foreach my $seq ( $aln->each_seq) {
 525        push @names, $seq->display_id;;
 526        push @seqs, uc $seq->seq();
 527        $seqct++;
 528    }
 529    my $precisionstr = "%.$Precision"."f";
 530    for(my $i = 0; $i < $seqct-1; $i++ ) {
 531        # (diagonals) distance is 0 for same sequence
 532        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 533        $values[$i][$i] = sprintf($precisionstr,0);
 534
 535        for( my $j = $i+1; $j < $seqct; $j++ ) {
 536
 537            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 538                                                                $seqs[$j]);
 539            # just want diagonals
 540            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
 541                      $matrix->[2]->[2] + $matrix->[3]->[3] );
 542            my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
 543            my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
 544            # fwd and rev lookup
 545            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 546            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 547            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 548            # (diagonals) distance is 0 for same sequence
 549            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 550            $values[$j][$j] = sprintf($precisionstr,0);
 551        }
 552    }
 553    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 554                                        -matrix  => \%dist,
 555                                        -names   => \@names,
 556                                        -values  => \@values);
 557 }
 558
 559 =head2 D_Uncorrected
 560
 561  Title   : D_Uncorrected
 562  Usage   : my $d = $stats->D_Uncorrected($aln)
 563  Function: Calculate a distance D, no correction for multiple substitutions
 564            is used.
 565  Returns : L<Bio::Matrix::PhylipDist>
 566  Args    : L<Bio::Align::AlignI> (DNA Alignment)
 567            [optional] gap penalty
 568
 569 =cut
 570
 571 sub D_Uncorrected {
 572    my ($self,$aln,$gappenalty) = @_;
 573    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 574    return 0 unless $self->_check_arg($aln);
 575    # ambiguities ignored at this point
 576    my (@seqs,@names,@values,%dist);
 577    my $seqct = 0;
 578    foreach my $seq ( $aln->each_seq) {
 579        push @names, $seq->display_id;
 580        push @seqs, uc $seq->seq();
 581        $seqct++;
 582    }
 583
 584    my $precisionstr = "%.$Precision"."f";
 585    my $len = $aln->length;
 586    for( my $i = 0; $i < $seqct-1; $i++ ) {
 587        # (diagonals) distance is 0 for same sequence
 588        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 589        $values[$i][$i] = sprintf($precisionstr,0);
 590
 591        for( my $j = $i+1; $j < $seqct; $j++ ) {
 592            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 593                                                                $seqs[$j]);
 594            my $m = ( $matrix->[0]->[0] +
 595                      $matrix->[1]->[1] +
 596                      $matrix->[2]->[2] +
 597                      $matrix->[3]->[3] );
 598            my $D = 1 - ( $m / ( $len - $gaps + ( $gaps * $gappenalty)));
 599            # fwd and rev lookup
 600            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 601            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 602            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
 603            # (diagonals) distance is 0 for same sequence
 604            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 605            $values[$j][$j] = sprintf($precisionstr,0);
 606        }
 607    }
 608    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 609                                        -matrix  => \%dist,
 610                                        -names   => \@names,
 611                                        -values  => \@values);
 612 }
 613
 614
 615 # M Kimura, J. Mol. Evol., 1980, 16, 111.
 616
 617 =head2 D_Kimura
 618
 619  Title   : D_Kimura
 620  Usage   : my $d = $stat->D_Kimura($aln)
 621  Function: Calculates D (pairwise distance) between all pairs of sequences
 622            in an alignment using the Kimura 2 parameter model.
 623  Returns : L<Bio::Matrix::PhylipDist>
 624  Args    : L<Bio::Align::AlignI> of DNA sequences
 625
 626
 627 =cut
 628
 629 sub D_Kimura {
 630    my ($self,$aln) = @_;
 631    return 0 unless $self->_check_arg($aln);
 632    # ambiguities ignored at this point
 633    my (@names,@values,%dist);
 634    my $seqct = 0;
 635    foreach my $seq ( $aln->each_seq) {
 636        push @names, $seq->display_id;
 637        $seqct++;
 638    }
 639
 640    my $precisionstr = "%.$Precision"."f";
 641
 642    for( my $i = 0; $i < $seqct-1; $i++ ) {
 643        # (diagonals) distance is 0 for same sequence
 644        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 645        $values[$i][$i] = sprintf($precisionstr,0);
 646
 647        for( my $j = $i+1; $j < $seqct; $j++ ) {
 648            my $pairwise = $aln->select_noncont($i+1,$j+1);
 649            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 650            unless( $L ) {
 651                $L = 1;
 652            }
 653            my $P = $self->transitions($pairwise) / $L;
 654            my $Q = $self->transversions($pairwise) / $L;
 655            my $K = 0;
 656            my $a = 1 / ( 1 - (2 * $P) - $Q);
 657            my $b = 1 / ( 1 - 2 * $Q );
 658            if( $a < 0 || $b < 0 ) {
 659                $K = -1;
 660            } else{
 661                $K = (1/2) * log ( $a ) + (1/4) * log($b);
 662            }
 663            # fwd and rev lookup
 664            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 665            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 666            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
 667            # (diagonals) distance is 0 for same sequence
 668            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 669            $values[$j][$j] = sprintf($precisionstr,0);
 670        }
 671    }
 672    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 673                                        -matrix  => \%dist,
 674                                        -names   => \@names,
 675                                        -values  => \@values);
 676 }
 677
 678
 679 =head2 D_Kimura_variance
 680
 681  Title   : D_Kimura
 682  Usage   : my $d = $stat->D_Kimura_variance($aln)
 683  Function: Calculates D (pairwise distance) between all pairs of sequences
 684            in an alignment using the Kimura 2 parameter model.
 685  Returns : array of 2 L<Bio::Matrix::PhylipDist>,
 686            the first is the Kimura distance and the second is
 687            a matrix of variance V(K)
 688  Args    : L<Bio::Align::AlignI> of DNA sequences
 689
 690
 691 =cut
 692
 693 sub D_Kimura_variance {
 694    my ($self,$aln) = @_;
 695    return 0 unless $self->_check_arg($aln);
 696    # ambiguities ignored at this point
 697    my (@names,@values,%dist,@var);
 698    my $seqct = 0;
 699    foreach my $seq ( $aln->each_seq) {
 700        push @names, $seq->display_id;
 701        $seqct++;
 702    }
 703
 704    my $precisionstr = "%.$Precision"."f";
 705
 706    for( my $i = 0; $i < $seqct-1; $i++ ) {
 707        # (diagonals) distance is 0 for same sequence
 708        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 709        $values[$i][$i] = sprintf($precisionstr,0);
 710
 711        for( my $j = $i+1; $j < $seqct; $j++ ) {
 712            my $pairwise = $aln->select_noncont($i+1,$j+1);
 713            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 714            unless( $L ) {
 715                $L = 1;
 716            }
 717            my $P = $self->transitions($pairwise) / $L;
 718            my $Q = $self->transversions($pairwise) / $L;
 719            my ($a,$b,$K,$var_k);
 720            my $a_denom = ( 1 - (2 * $P) - $Q);
 721            my $b_denom = 1 - 2 * $Q;
 722            unless( $a_denom > 0 && $b_denom > 0 ) {
 723                $a = 1;
 724                $b = 1;
 725                $K = -1;
 726                $var_k = -1;
 727            } else {
 728                $a = 1 / $a_denom;
 729                $b = 1 / $b_denom;
 730                $K = (1/2) * log ( $a ) + (1/4) * log($b);
 731                # from Wu and Li 1985 which in turn is from Kimura 1980
 732                my $c = ( $a - $b ) / 2;
 733                my $d = ( $a + $b ) / 2;
 734                $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
 735            }
 736
 737            # fwd and rev lookup
 738            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 739            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 740            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
 741            # (diagonals) distance is 0 for same sequence
 742            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 743            $values[$j]->[$j] = sprintf($precisionstr,0);
 744
 745            $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
 746            $var[$j]->[$j] = $values[$j]->[$j];
 747        }
 748    }
 749    return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 750                                          -matrix  => \%dist,
 751                                          -names   => \@names,
 752                                          -values  => \@values),
 753             Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 754                                          -matrix  => \%dist,
 755                                          -names   => \@names,
 756                                          -values  => \@var)
 757             );
 758 }
 759
 760
 761 #  K Tamura, Mol. Biol. Evol. 1992, 9, 678.
 762
 763 =head2 D_Tamura
 764
 765  Title   : D_Tamura
 766  Usage   : Calculates D (pairwise distance) between 2 sequences in an
 767            alignment using Tamura 1992 distance model.
 768  Returns : L<Bio::Matrix::PhylipDist>
 769  Args    : L<Bio::Align::AlignI> of DNA sequences
 770
 771
 772 =cut
 773
 774 sub D_Tamura {
 775    my ($self,$aln) = @_;
 776    return 0 unless $self->_check_arg($aln);
 777    # ambiguities ignored at this point
 778    my (@seqs,@names,@values,%dist,$i,$j);
 779    my $seqct = 0;
 780    my $length = $aln->length;
 781    foreach my $seq ( $aln->each_seq) {
 782        push @names, $seq->display_id;;
 783        push @seqs, uc $seq->seq();
 784        $seqct++;
 785    }
 786
 787    my $precisionstr = "%.$Precision"."f";
 788    my (@gap,@gc,@trans,@tranv,@score);
 789    $i = 0;
 790    for my $t1 ( @seqs ) {
 791        $j = 0;
 792        for my $t2 ( @seqs ) {
 793            $gap[$i][$j] = 0;
 794            for( my $k = 0; $k < $length; $k++ ) {
 795                my ($c1,$c2) = ( substr($seqs[$i],$k,1),
 796                                 substr($seqs[$j],$k,1) );
 797                if( $c1 =~ /^$GapChars$/ ||
 798                    $c2 =~ /^$GapChars$/ ) {
 799                    $gap[$i][$j]++;
 800                } elsif( $c2 =~ /^$GCChhars$/i ) {
 801                    $gc[$i][$j]++;
 802                }
 803            }
 804            $gc[$i][$j] = ( $gc[$i][$j] /
 805                            ($length - $gap[$i][$j]) );
 806            $j++;
 807        }
 808        $i++;
 809    }
 810
 811    for( $i = 0; $i < $seqct-1; $i++ ) {
 812        # (diagonals) distance is 0 for same sequence
 813        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 814        $values[$i][$i] = sprintf($precisionstr,0);
 815
 816        for( $j = $i+1; $j < $seqct; $j++ ) {
 817
 818            my $pairwise = $aln->select_noncont($i+1,$j+1);
 819            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 820            my $P = $self->transitions($pairwise) / $L;
 821            my $Q = $self->transversions($pairwise) / $L;
 822            my $C = $gc[$i][$j] + $gc[$j][$i]-
 823                ( 2 * $gc[$i][$j] * $gc[$j][$i] );
 824            if( $P ) {
 825                $P = $P / $C;
 826            }
 827            my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
 828            # fwd and rev lookup
 829            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 830            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 831            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 832            # (diagonals) distance is 0 for same sequence
 833            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 834            $values[$j][$j] = sprintf($precisionstr,0);
 835        }
 836    }
 837    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 838                                        -matrix  => \%dist,
 839                                        -names   => \@names,
 840                                        -values  => \@values);
 841
 842 }
 843
 844 =head2 D_F84
 845
 846  Title   : D_F84
 847  Usage   : my $d = $stat->D_F84($aln)
 848  Function: Calculates D (pairwise distance) between 2 sequences in an
 849            alignment using the Felsenstein 1984 distance model.
 850  Returns : L<Bio::Matrix::PhylipDist>
 851  Args    : L<Bio::Align::AlignI> of DNA sequences
 852            [optional] double - gap penalty
 853
 854 =cut
 855
 856 sub D_F84 {
 857    my ($self,$aln,$gappenalty) = @_;
 858    return 0 unless $self->_check_arg($aln);
 859    $self->throw_not_implemented();
 860    # ambiguities ignored at this point
 861    my (@seqs,@names,@values,%dist);
 862    my $seqct = 0;
 863    foreach my $seq ( $aln->each_seq) {
 864        # if there is no name,
 865        my $id = $seq->display_id;
 866        if( ! length($id) ||       # deal with empty names
 867            $id =~ /^\s+$/ ) {
 868            $id = $seqct+1;
 869        }
 870        push @names, $id;
 871        push @seqs, uc $seq->seq();
 872        $seqct++;
 873    }
 874
 875    my $precisionstr = "%.$Precision"."f";
 876
 877    for( my $i = 0; $i < $seqct-1; $i++ ) {
 878        # (diagonals) distance is 0 for same sequence
 879        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 880        $values[$i][$i] = sprintf($precisionstr,0);
 881
 882        for( my $j = $i+1; $j < $seqct; $j++ ) {
 883        }
 884    }
 885 }
 886
 887 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
 888 #  Tajima-Nei correction used for multiple substitutions in the calc
 889 # of the distance matrix. Nucleic acids only.
 890 #
 891 #  D = p-distance = 1 - (matches/(posns_scored + gaps)
 892 #
 893 #  distance = -b * ln(1-D/b)
 894 #
 895
 896 =head2 D_TajimaNei
 897
 898  Title   : D_TajimaNei
 899  Usage   : my $d = $stat->D_TajimaNei($aln)
 900  Function: Calculates D (pairwise distance) between 2 sequences in an
 901            alignment using the TajimaNei 1984 distance model.
 902  Returns : L<Bio::Matrix::PhylipDist>
 903  Args    : Bio::Align::AlignI of DNA sequences
 904
 905
 906 =cut
 907
 908 sub D_TajimaNei{
 909    my ($self,$aln) = @_;
 910    return 0 unless $self->_check_arg($aln);
 911    # ambiguities ignored at this point
 912    my (@seqs,@names,@values,%dist);
 913    my $seqct = 0;
 914    foreach my $seq ( $aln->each_seq) {
 915        # if there is no name,
 916        push @names, $seq->display_id;
 917        push @seqs, uc $seq->seq();
 918        $seqct++;
 919    }
 920    my $precisionstr = "%.$Precision"."f";
 921    my ($i,$j,$bs);
 922    # pairwise
 923    for( $i =0; $i < $seqct -1; $i++ ) {
 924        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 925        $values[$i][$i] = sprintf($precisionstr,0);
 926
 927        for ( $j = $i+1; $j <$seqct;$j++ ) {
 928            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 929                                                                $seqs[$j]);
 930            my $pairwise = $aln->select_noncont($i+1,$j+1);
 931            my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 932            my $fij2 = 0;
 933            for( $bs = 0; $bs < 4; $bs++ ) {
 934                my $fi = 0;
 935                map {$fi += $matrix->[$bs]->[$_] } 0..3;
 936                my $fj = 0;
 937                # summation
 938                map { $fj += $matrix->[$_]->[$bs] } 0..3;
 939                my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
 940                $fij2 += $fij**2;
 941            }
 942
 943            my ($pair,$h) = (0,0);
 944            for( $bs = 0; $bs < 3; $bs++ ) {
 945                for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
 946                    my $fij = $pfreq->[$pair++] / $slen;
 947                    if( $fij ) {
 948
 949                        my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
 950
 951                        map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
 952                        map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
 953                        map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
 954                        map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
 955
 956                        if( $fij ) {
 957                            $h += ( ($fij**2) / 2 ) /
 958                                (  ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
 959                                   ( ( $ci2 + $cj2 ) / (2 * $slen) )
 960                                   );
 961                        }
 962                        $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
 963                    }
 964                }
 965            }
 966            # just want diagonals which are matches (A matched A, C -> C)
 967
 968            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
 969                      $matrix->[2]->[2] + $matrix->[3]->[3] );
 970            my $D = 1 - ( $m / $slen);
 971            my $d;
 972            if( $h == 0 ) {
 973                $d = -1;
 974            } else {
 975                my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
 976                my $c = 1- $D/ $b;
 977
 978                if( $c < 0 ) {
 979                    $d = -1;
 980                } else {
 981                    $d = (-1 * $b) * log ( $c);
 982                }
 983            }
 984            # fwd and rev lookup
 985            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 986            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 987            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 988
 989            # (diagonals) distance is 0 for same sequence
 990            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 991            $values[$j][$j] = sprintf($precisionstr,0);
 992        }
 993    }
 994    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 995                                        -matrix  => \%dist,
 996                                        -names   => \@names,
 997                                        -values  => \@values);
 998
 999 }
1000
1001 # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1002
1003 =head2 D_JinNei
1004
1005  Title   : D_JinNei
1006  Usage   : my $d = $stat->D_JinNei($aln)
1007  Function: Calculates D (pairwise distance) between 2 sequences in an
1008            alignment using the Jin-Nei 1990 distance model.
1009  Returns : L<Bio::Matrix::PhylipDist>
1010  Args    : L<Bio::Align::AlignI> of DNA sequences
1011
1012
1013 =cut
1014
1015 sub D_JinNei{
1016    my ($self,@args) = @_;
1017    $self->warn("JinNei implementation not completed");
1018    return;
1019 }
1020
1021 =head2 transversions
1022
1023  Title   : transversions
1024  Usage   : my $transversions = $stats->transversion($aln);
1025  Function: Calculates the number of transversions between two sequences in
1026            an alignment
1027  Returns : integer
1028  Args    : Bio::Align::AlignI
1029
1030
1031 =cut
1032
1033 sub transversions{
1034    my ($self,$aln) = @_;
1035    return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1036 }
1037
1038 =head2 transitions
1039
1040  Title   : transitions
1041  Usage   : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1042  Function: Calculates the number of transitions in a given DNA alignment
1043  Returns : integer representing the number of transitions
1044  Args    : Bio::Align::AlignI object
1045
1046
1047 =cut
1048
1049 sub transitions{
1050    my ($self,$aln) = @_;
1051    return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1052 }
1053
1054
1055 sub _trans_count_helper {
1056     my ($self,$aln,$type) = @_;
1057     return 0 unless( $self->_check_arg($aln) );
1058     if( ! $aln->is_flush ) { $self->throw("must be flush") }
1059     my (@tcount);
1060     my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1061                             uc $aln->get_seq_by_pos(2)->seq() );
1062     my $alen = $aln->length;
1063     for (my $i = 0;$i<$alen; $i++ ) {
1064         my ($c1,$c2) = ( substr($first,$i,1),
1065                          substr($second,$i,1) );
1066         if( $c1 ne $c2 ) {
1067             foreach my $nt ( @{$type->{$c1}} ) {
1068                 if( $nt eq $c2) {
1069                    $tcount[$i]++;
1070                }
1071             }
1072         }
1073     }
1074     my $sum = 0;
1075     map { if( $_) { $sum += $_} } @tcount;
1076     return $sum;
1077 }
1078
1079 # this will generate a matrix which records across the row, the number
1080 # of DNA subst
1081 #
1082 sub _build_nt_matrix {
1083     my ($self,$seqa,$seqb) = @_;
1084
1085
1086     my $basect_matrix = [ [ qw(0 0 0 0) ],  # number of bases that match
1087                           [ qw(0 0 0 0) ],
1088                           [ qw(0 0 0 0) ],
1089                           [ qw(0 0 0 0) ] ];
1090     my $gaps = 0;                           # number of gaps
1091     my $pfreq = [ qw( 0 0 0 0 0 0)];        # matrix for pair frequency
1092     my $len_a = length($seqa);
1093     for( my $i = 0; $i < $len_a; $i++) {
1094         my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1095         $ti =~ tr/U/T/;
1096         $tj =~ tr/U/T/;
1097
1098         if( $ti =~ /^$GapChars$/) { $gaps++; next; }
1099         if( $tj =~ /^$GapChars$/) { $gaps++; next }
1100
1101         my $ti_index = $NucleotideIndexes{$ti};
1102         my $tj_index = $NucleotideIndexes{$tj};
1103
1104         if( ! defined $ti_index ) {
1105             $self->warn("ti_index not defined for $ti\n");
1106             next;
1107         }
1108
1109         $basect_matrix->[$ti_index]->[$tj_index]++;
1110
1111         if( $ti ne $tj ) {
1112             $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1113         }
1114     }
1115     return ($basect_matrix,$pfreq,$gaps);
1116 }
1117
1118 sub _check_ambiguity_nucleotide {
1119     my ($base1,$base2) = @_;
1120     my %iub = Bio::Tools::IUPAC->iupac_iub();
1121     my @amb1 = @{ $iub{uc($base1)} };
1122     my @amb2 = @{ $iub{uc($base2)} };
1123     my ($pmatch) = (0);
1124     for my $amb ( @amb1 ) {
1125         if( grep { $amb eq $_ } @amb2 ) {
1126             $pmatch = 1;
1127             last;
1128         }
1129     }
1130     if( $pmatch ) {
1131         return (1 / scalar @amb1) * (1 / scalar @amb2);
1132     } else {
1133         return 0;
1134     }
1135 }
1136
1137
1138 sub _check_arg {
1139     my($self,$aln ) = @_;
1140     if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
1141         $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1142         return 0;
1143     } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1144         $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1145         return 0;
1146     }
1147     return 1;
1148 }
1149
1150 =head2 Data Methods
1151
1152 =cut
1153
1154 =head2 pairwise_stats
1155
1156  Title   : pairwise_stats
1157  Usage   : $obj->pairwise_stats($newval)
1158  Function:
1159  Returns : value of pairwise_stats
1160  Args    : newvalue (optional)
1161
1162
1163 =cut
1164
1165 sub pairwise_stats{
1166    my ($self,$value) = @_;
1167    if( defined $value) {
1168       $self->{'_pairwise_stats'} = $value;
1169     }
1170     return $self->{'_pairwise_stats'};
1171
1172 }
1173
1174 =head2 calc_KaKs_pair
1175
1176  Title    : calc_KaKs_pair
1177  Useage   : my $results = $stats->calc_KaKs_pair($alnobj,
1178             $name1, $name2).
1179  Function : calculates Nei-Gojobori statistics for pairwise
1180             comparison.
1181  Args     : A Bio::Align::AlignI compliant object such as a
1182             Bio::SimpleAlign object, and 2 sequence name strings.
1183  Returns  : a reference to a hash of statistics with keys as
1184             listed in Description.
1185
1186 =cut
1187
1188 sub calc_KaKs_pair {
1189     my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1190     $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1191         if @_!= 4;
1192     $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1193     my @seqs = (
1194                 #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1195                 #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1196                 {id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1197                 {id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1198                ) ;
1199     if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1200         $self->throw(" aligned sequences must be of equal length!");
1201     }
1202     my $results = [];
1203     $self->_get_av_ds_dn(\@seqs, $results);
1204     return $results;
1205
1206 }
1207
1208 =head2 calc_all_KaKs_pairs
1209
1210  Title    : calc_all_KaKs_pairs
1211  Useage   : my $results2 = $stats->calc_KaKs_pair($alnobj).
1212  Function : Calculates Nei_gojobori statistics for all pairwise
1213             combinations in sequence.
1214  Arguments: A Bio::Align::ALignI compliant object such as
1215             a Bio::SimpleAlign object.
1216  Returns  : A reference to an array of hashes of statistics of
1217             all pairwise comparisons in the alignment.
1218
1219 =cut
1220
1221
1222
1223 sub calc_all_KaKs_pairs {
1224 #returns a multi_element_array with all pairwise comparisons
1225         my ($self,$aln) = @_;
1226         $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1227         my @seqs;
1228         for my $seq ($aln->each_seq) {
1229                 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1230                 }
1231         my $results ;
1232         $results = $self->_get_av_ds_dn(\@seqs, $results);
1233         return $results;
1234 }
1235
1236 =head2 calc_average_KaKs
1237
1238  Title    : calc_average_KaKs.
1239  Useage   : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1240  Function : calculates Nei_Gojobori stats for average of all
1241             sequences in the alignment.
1242  Args     : A Bio::Align::AlignI compliant object such as a
1243             Bio::SimpleAlign object, number of bootstrap iterations
1244             (default 1000).
1245  Returns  : A reference to a hash of statistics as listed in Description.
1246
1247 =cut
1248
1249 sub calc_average_KaKs {
1250 #calculates global value for sequences in alignment using bootstrapping
1251 #this is quite slow (~10 seconds per  3 X 200nt seqs);
1252     my ($self, $aln, $bootstrap_rpt) = @_;
1253     $bootstrap_rpt ||= 1000;
1254     $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1255     my @seqs;
1256     for my $seq ($aln->each_seq) {
1257         push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1258     }
1259     my $results ;
1260     my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1261     #print "ds = $ds_orig, dn = $dn_orig\n";
1262     $results = {D_s => $ds_orig, D_n => $dn_orig};
1263     $self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1264     return $results;
1265 }
1266
1267 ############## primary internal subs for alignment comparisons ########################
1268
1269 sub _run_bootstrap {
1270     ### generates sampled sequences, calculates Ds and Dn values,
1271     ### then calculates variance of sampled sequences and add results to results hash
1272     ###
1273     my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1274     my @seqs = @$seq_ref;
1275     my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1276     my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1277
1278     #1st make alternative array of codons;
1279     my $c = 0;
1280     while ($c < length $seqs[0]{'seq'}) {
1281         for (0..$#seqs) {
1282             push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
1283         }
1284         $c+=3;
1285     }
1286
1287     for (1..$bootstrap_rpt) {
1288         my $sampled = _resample (\@btstrp_aoa);
1289         my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1290         push @{$bootstrap_values{'ds'}}, $ds;
1291         push @{$bootstrap_values{'dn'}}, $dn;
1292     }
1293
1294     $results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1295     $results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1296     $results->{'z_score'} =     ($results->{'D_n'} - $results->{'D_s'}) /
1297         sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1298     #print "bootstrapped var_syn =      $results->{'D_s_var'} \n" ;
1299     #print "bootstrapped var_nc =       $results->{'D_n_var'} \n";
1300     #print "z is $results->{'z_score'}\n";      ### end of global set up of/perm look up data
1301 }
1302
1303 sub _resample {
1304     my $ref = shift;
1305     my $codon_num = scalar (@{$ref->[0]});
1306     my @altered;
1307     for (0..$codon_num -1) {    #for each codon
1308         my $rand = int (rand ($codon_num));
1309         for (0..$#$ref) {
1310             push @{$altered[$_]}, $ref->[$_][$rand];
1311         }
1312     }
1313     my @stringed = map {join '', @$_}@altered;
1314     my @return;
1315     #now out in random name to keep other subs happy
1316     for (@stringed) {
1317         push @return, {id=>'1', seq=> $_};
1318     }
1319     return \@return;
1320 }
1321
1322 sub _get_av_ds_dn {
1323     # takes array of hashes of sequence strings and ids   #
1324     my $self = shift;
1325     my $seq_ref = shift;
1326     my $result = shift if @_;
1327     my @caller = caller(1);
1328     my @seqarray = @$seq_ref;
1329     my $bootstrap_score_list;
1330     #for a multiple alignment considers all pairwise combinations#
1331     my %dsfor_average = (ds => [], dn => []);
1332     for (my $i = 0; $i < scalar @seqarray; $i++) {
1333         for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
1334 #                       print "comparing $i and $j\n";
1335             if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1336                 $self->warn(" aligned sequences must be of equal length!");
1337                 next;
1338             }
1339
1340             my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1341             my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1342 #                       print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1343             my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1344             #get averages
1345             my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1346             my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1347
1348             #calculate ps and pn  (p54)
1349             my $syn_prop = $syn_count / $av_s_site;
1350             my $nc_prop = $non_syn_count / $av_ns_syn_site      ;
1351
1352             #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1353             my $d_syn = $self->jk($syn_prop);
1354             my $d_nc = $self->jk($nc_prop);
1355
1356             #JK calculation must succeed for continuation of calculation
1357             #ret_value = -1 if error
1358             next unless $d_nc >=0 && $d_syn >=0;
1359
1360
1361             push @{$dsfor_average{'ds'}}, $d_syn;
1362             push @{$dsfor_average{'dn'}}, $d_nc;
1363
1364             #if not doing bootstrap, calculate the pairwise comparisin stats
1365             if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1366                                 #now calculate variances assuming large sample
1367                 my $d_syn_var =  jk_var($syn_prop, length($seqarray[$i]{'seq'})  - $gap_cnt );
1368                 my $d_nc_var =  jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1369                 #now calculate z_value
1370                 #print "d_syn_var is  $d_syn_var,and d_nc_var is $d_nc_var\n";
1371                 #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1372                 my $z = ($d_syn_var + $d_nc_var) ?
1373                   ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1374                 #       print "z is $z\n";
1375                 push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1376                                  S_d => $syn_count, N_d =>$non_syn_count,
1377                                  P_s => $syn_prop, P_n=>$nc_prop,
1378                                  D_s => @{$dsfor_average{'ds'}}[-1],
1379                                  D_n => @{$dsfor_average{'dn'}}[-1],
1380                                  D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1381                                  Seq1 => $seqarray[$i]{'id'},
1382                                  Seq2 => $seqarray[$j]{'id'},
1383                                  z_score => $z,
1384                              };
1385                 $self->warn (" number of mutations too small to justify normal test for  $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
1386                     if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1387             }#endif
1388             }
1389     }
1390
1391     #warn of failure if no results hashes are present
1392     #will fail if Jukes Cantor has failed for all pairwise combinations
1393     #$self->warn("calculation failed!") if scalar @$result ==0;
1394
1395     #return results unless bootstrapping
1396     return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1397     #else if getting average for bootstrap
1398     return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1399 }
1400
1401
1402 sub jk {
1403     my ($self, $p) = @_;
1404     if ($p > 0.75) {
1405         $self->warn( " Jukes Cantor won't  work -too divergent!");
1406         return -1;
1407     }
1408     return -1 * (3/4) * (log(1 - (4/3) * $p));
1409 }
1410
1411 #works for large value of n (50?100?)
1412 sub jk_var {
1413     my ($p, $n) = @_;
1414     return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1415 }
1416
1417
1418 # compares 2 sequences to find the number of synonymous/non
1419 # synonymous mutations between them
1420
1421 sub analyse_mutations {
1422     my ($seq1, $seq2) = @_;
1423     my %mutator = ( 2=> {0=>[[1,2],  # codon positions to be altered
1424                              [2,1]], # depend on which is the same
1425                          1=>[[0,2],
1426                              [2,0]],
1427                          2=>[[0,1],
1428                              [1,0]],
1429                      },
1430                     3=> [ [0,1,2],  # all need to be altered
1431                           [1,0,2],
1432                           [0,2,1],
1433                           [1,2,0],
1434                           [2,0,1],
1435                           [2,1,0] ],
1436                     );
1437     my $TOTAL   = 0;    # total synonymous changes
1438     my $TOTAL_n = 0;    # total non-synonymous changes
1439     my $gap_cnt = 0;
1440
1441     my %input;
1442     my $seqlen = length($seq1);
1443     for (my $j=0; $j< $seqlen; $j+=3) {
1444         $input{'cod1'} = substr($seq1, $j,3);
1445         $input{'cod2'} = substr($seq2, $j,3);
1446
1447         #ignore codon if beeing compared with gaps!
1448         if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1449             $gap_cnt += 3; #just increments once if there is a pair of gaps
1450             next;
1451         }
1452
1453         my ($diff_cnt, $same) = count_diffs(\%input);
1454
1455         #ignore if codons are identical
1456         next if $diff_cnt == 0 ;
1457         if ($diff_cnt == 1) {
1458             $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1459             $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1460             #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1461         }
1462         elsif ($diff_cnt ==2) {
1463             my $s_cnt = 0;
1464             my $n_cnt = 0;
1465             my $tot_muts = 4;
1466             #will stay 4 unless there are stop codons at intervening point
1467           OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
1468               my $altered = $input{'cod1'};
1469               my $prev= $altered;
1470               #         print "$prev -> (", $t[$CODONS->{$altered}], ")";
1471               for       my $mut_i (@$perm) { #index of codon mutated
1472                   substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1473                   if ($t[$CODONS->{$altered}] eq '*') {
1474                       $tot_muts -=2;
1475                       #print "changes to stop codon!!\n";
1476                       next OUTER;
1477                   }
1478                   else {
1479                       $s_cnt += $synchanges{$prev}{$altered};
1480                       #                                 print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1481                   }
1482                   $prev = $altered;
1483               }
1484               #         print "\n";
1485           }
1486             if ($tot_muts != 0) {
1487                 $TOTAL += ($s_cnt/($tot_muts/2));
1488                 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1489             }
1490
1491         }
1492         elsif ($diff_cnt ==3 ) {
1493             my $s_cnt = 0;
1494             my $n_cnt = 0;
1495             my $tot_muts = 18;  #potential number  of mutations
1496           OUTER: for my $perm (@{$mutator{'3'}}) {
1497               my $altered = $input{'cod1'};
1498               my $prev= $altered;
1499               # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1500               for my $mut_i (@$perm) { #index of codon mutated
1501                   substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1502                   if ($t[$CODONS->{$altered}] eq '*') {
1503                       $tot_muts -=3;
1504                       # print "changes to stop codon!!\n";
1505                       next OUTER;
1506
1507                   }
1508                   else {
1509                       $s_cnt += $synchanges{$prev}{$altered};
1510                       #                 print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1511                   }
1512                   $prev = $altered;
1513               }
1514               # print "\n";
1515
1516           }#end OUTER loop
1517               #calculate number of synonymous/non synonymous mutations for that codon
1518               # and add to total
1519               if ($tot_muts != 0) {
1520                   $TOTAL += ($s_cnt / ($tot_muts /3));
1521                   $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1522               }
1523         }                       #endif $diffcnt = 3
1524     }                           #end of sequencetraversal
1525     return ($TOTAL, $TOTAL_n, $gap_cnt);
1526 }
1527
1528
1529 sub count_diffs {
1530     #counts the number of nucleotide differences between 2 codons
1531     # returns this value plus the codon index of which nucleotide is the same when 2
1532     #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1533     # to change.
1534     my $ref = shift;
1535     my $cnt = 0;
1536     my $same= undef;
1537     #just for 2 differences
1538     for (0..2) {
1539         if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1540             $cnt++;
1541         } else {
1542             $same = $_;
1543         }
1544     }
1545     return ($cnt, $same);
1546 }
1547
1548 =head2 get_syn_changes
1549
1550  Title   : get_syn_changes
1551  Usage   : Bio::Align::DNAStatitics->get_syn_changes
1552  Function: Generate a hashref of all pairwise combinations of codns
1553            differing by 1
1554  Returns : Symetic matrix using hashes
1555            First key is codon
1556            and each codon points to a hashref of codons
1557            the values of which describe type of change.
1558            my $type = $hash{$codon1}->{$codon2};
1559            values are :
1560              1   synonymous
1561              0   non-syn
1562             -1   either codon is a stop codon
1563  Args    : none
1564
1565 =cut
1566
1567 sub get_syn_changes {
1568 #hash of all pairwise combinations of codons differing by 1
1569 # 1 = syn, 0 = non-syn, -1 = stop
1570     my %results;
1571     my @codons = _make_codons ();
1572     my $arr_len = scalar @codons;
1573     for (my $i = 0; $i < $arr_len -1; $i++) {
1574         my $cod1 = $codons[$i];
1575         for (my $j = $i +1; $j < $arr_len; $j++) {
1576             my $diff_cnt = 0;
1577             for my $pos(0..2) {
1578                 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1579             }
1580             next if $diff_cnt !=1;
1581
1582             #synon change
1583             if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
1584                 $results{$cod1}{$codons[$j]} =1;
1585                 $results{$codons[$j]}{$cod1} = 1;
1586             }
1587             #stop codon
1588             elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1589                 $results{$cod1}{$codons[$j]} = -1;
1590                 $results{$codons[$j]}{$cod1} = -1;
1591             }
1592             # nc change
1593             else {
1594                 $results{$cod1}{$codons[$j]} = 0;
1595                 $results{$codons[$j]}{$cod1} = 0;
1596             }
1597         }
1598     }
1599     return %results;
1600 }
1601
1602 =head2 dnds_pattern_number
1603
1604  Title   : dnds_pattern_number
1605  Usage   : my $patterns = $stats->dnds_pattern_number($alnobj);
1606  Function: Counts the number of codons with no gaps in the MSA
1607  Returns : Number of codons with no gaps ('patterns' in PAML notation)
1608  Args    : A Bio::Align::AlignI compliant object such as a
1609             Bio::SimpleAlign object.
1610
1611 =cut
1612
1613 sub dnds_pattern_number{
1614     my ($self, $aln) = @_;
1615     return ($aln->remove_gaps->length)/3;
1616 }
1617
1618 sub count_syn_sites {
1619     #counts the number of possible synonymous changes for sequence
1620     my ($seq, $synsite) = @_;
1621     __PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1622     my $S = 0;
1623     for (my $i = 0; $i< length($seq); $i+=3) {
1624         my $cod = substr($seq, $i, 3);
1625         next if $cod =~ /\-/;   #deal with alignment gaps
1626         $S +=  $synsite->{$cod}{'s'};
1627     }
1628     #print "S is $S\n";
1629     return $S;
1630 }
1631
1632
1633
1634 sub get_syn_sites {
1635     #sub to generate lookup hash for the number of synonymous changes per codon
1636     my @nucs = qw(T C A G);
1637     my %raw_results;
1638     for my $i (@nucs) {
1639         for my $j (@nucs) {
1640             for my $k (@nucs) {
1641                 # for each possible codon
1642                 my $cod = "$i$j$k";
1643                 my $aa = $t[$CODONS->{$cod}];
1644                 #calculate number of synonymous mutations vs non syn mutations
1645                 for my $i (qw(0 1 2)){
1646                     my $s = 0;
1647                     my $n = 3;
1648                     for my $nuc (qw(A T C G)) {
1649                         next if substr ($cod, $i,1) eq $nuc;
1650                         my $test = $cod;
1651                         substr($test, $i, 1) = $nuc ;
1652                         if ($t[$CODONS->{$test}] eq $aa) {
1653                             $s++;
1654                         }
1655                         if ($t[$CODONS->{$test}] eq '*') {
1656                             $n--;
1657                         }
1658                     }
1659                     $raw_results{$cod}[$i] = {'s' => $s ,
1660                                               'n' => $n };
1661                 }
1662
1663             } #end analysis of single codon
1664         }
1665     } #end analysis of all codons
1666     my %final_results;
1667
1668     for my $cod (sort keys %raw_results) {
1669         my $t = 0;
1670         map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
1671         $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1672     }
1673     return \%final_results;
1674 }
1675
1676 sub _make_codons {
1677 #makes all codon combinations, returns array of them
1678     my @nucs = qw(T C A G);
1679     my @codons;
1680     for my $i (@nucs) {
1681         for my $j (@nucs) {
1682             for my $k (@nucs) {
1683                 push @codons, "$i$j$k";
1684             }
1685         }
1686     }
1687     return @codons;
1688 }
1689
1690 sub get_codons {
1691  #generates codon translation look up table#
1692  my $x = 0;
1693  my  $CODONS = {};
1694  for my $codon (_make_codons) {
1695      $CODONS->{$codon} = $x;
1696      $x++;
1697  }
1698  return $CODONS;
1699 }
1700
1701 #########stats subs, can go in another module? Here for speed. ###
1702 sub mean {
1703     my $ref = shift;
1704     my $el_num = scalar @$ref;
1705     my $tot = 0;
1706     map{$tot += $_}@$ref;
1707     return ($tot/$el_num);
1708 }
1709
1710 sub variance {
1711     my $ref = shift;
1712     my $mean = mean($ref);
1713     my $sum_of_squares = 0;
1714     map{$sum_of_squares += ($_ - $mean) **2}@$ref;
1715     return $sum_of_squares;
1716 }
1717
1718 sub sampling_variance {
1719     my $ref = shift;
1720     return variance($ref) / (scalar @$ref -1);
1721 }
1722
1723 1;