Bio/PopGen/Statistics.pm

   1 #
   2 # BioPerl module for Bio::PopGen::Statistics
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
   7 #
   8 # Copyright Jason Stajich
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::PopGen::Statistics - Population Genetics statistical tests
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::PopGen::Statistics;
  21   use Bio::AlignIO;
  22   use Bio::PopGen::IO;
  23   use Bio::PopGen::Simulation::Coalescent;
  24
  25   my $sim = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 12);
  26
  27   my $tree = $sim->next_tree;
  28
  29   $sim->add_Mutations($tree,20);
  30
  31   my $stats = Bio::PopGen::Statistics->new();
  32   my $individuals = [ $tree->get_leaf_nodes];
  33   my $pi = $stats->pi($individuals);
  34   my $D  = $stats->tajima_D($individuals);
  35
  36   # Alternatively to do this on input data from
  37   # See the tests in t/PopGen.t for more examples
  38   my $parser = Bio::PopGen::IO->new(-format => 'prettybase',
  39                                    -file   => 't/data/popstats.prettybase');
  40   my $pop = $parser->next_population;
  41   # Note that you can also call the stats as a class method if you like
  42   # the only reason to instantiate it (as above) is if you want
  43   # to set the verbosity for debugging
  44   $pi     = Bio::PopGen::Statistics->pi($pop);
  45   $theta  = Bio::PopGen::Statistics->theta($pop);
  46
  47   # Pi and Theta also take additional arguments,
  48   # see the documentation for more information
  49
  50   use Bio::PopGen::Utilities;
  51   use Bio::AlignIO;
  52
  53   my $in = Bio::AlignIO->new(-file   => 't/data/t7.aln',
  54                             -format => 'clustalw');
  55   my $aln = $in->next_aln;
  56   # get a population, each sequence is an individual and
  57   # for the default case, every site which is not monomorphic
  58   # is a 'marker'.  Each individual will have a 'genotype' for the
  59   # site which will be the specific base in the alignment at that
  60   # site
  61
  62   my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
  63
  64
  65 =head1 DESCRIPTION
  66
  67 This object is intended to provide implementations some standard
  68 population genetics statistics about alleles in populations.
  69
  70 This module was previously named Bio::Tree::Statistics.
  71
  72 This object is a place to accumulate routines for calculating various
  73 statistics from the coalescent simulation, marker/allele, or from
  74 aligned sequence data given that you can calculate alleles, number of
  75 segregating sites.
  76
  77 Currently implemented:
  78  Fu and Li's D    (fu_and_li_D)
  79  Fu and Li's D*   (fu_and_li_D_star)
  80  Fu and Li's F    (fu_and_li_F)
  81  Fu and Li's F*   (fu_and_li_F_star)
  82  Tajima's D       (tajima_D)
  83  Watterson's theta (theta)
  84  pi               (pi) - number of pairwise differences
  85  composite_LD     (composite_LD)
  86  McDonald-Kreitman (mcdonald_kreitman or MK)
  87
  88 Count based methods also exist in case you have already calculated the
  89 key statistics (seg sites, num individuals, etc) and just want to
  90 compute the statistic.
  91
  92 In all cases where a the method expects an arrayref of
  93 L<Bio::PopGen::IndividualI> objects and L<Bio::PopGen::PopulationI>
  94 object will also work.
  95
  96 =head2 REFERENCES
  97
  98 Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of
  99 Mutations." Genetics 133:693-709.
 100
 101 Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples
 102 from a Population." Genetics 143:557-570.
 103
 104 McDonald J, Kreitman M.
 105
 106 Tajima F. (1989) "Statistical method for testing the neutral mutation
 107 hypothesis by DNA polymorphism." Genetics 123:585-595.
 108
 109
 110 =head2 CITING THIS WORK
 111
 112 Please see this reference for use of this implementation.
 113
 114 Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73.
 115
 116 If you use these Bio::PopGen modules please cite the Bioperl
 117 publication (see FAQ) and the above reference.
 118
 119
 120 =head1 FEEDBACK
 121
 122 =head2 Mailing Lists
 123
 124 User feedback is an integral part of the evolution of this and other
 125 Bioperl modules. Send your comments and suggestions preferably to
 126 the Bioperl mailing list.  Your participation is much appreciated.
 127
 128   bioperl-l@bioperl.org                  - General discussion
 129   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 130
 131 =head2 Support
 132
 133 Please direct usage questions or support issues to the mailing list:
 134
 135 I<bioperl-l@bioperl.org>
 136
 137 rather than to the module maintainer directly. Many experienced and
 138 reponsive experts will be able look at the problem and quickly
 139 address it. Please include a thorough description of the problem
 140 with code and data examples if at all possible.
 141
 142 =head2 Reporting Bugs
 143
 144 Report bugs to the Bioperl bug tracking system to help us keep track
 145 of the bugs and their resolution. Bug reports can be submitted via
 146 the web:
 147
 148   https://github.com/bioperl/bioperl-live/issues
 149
 150 =head1 AUTHOR - Jason Stajich, Matthew Hahn
 151
 152 Email jason-at-bioperl-dot-org
 153 Email matthew-dot-hahn-at-duke-dot-edu
 154
 155 McDonald-Kreitman implementation based on work by Alisha Holloway at
 156 UC Davis.
 157
 158
 159 =head1 APPENDIX
 160
 161 The rest of the documentation details each of the object methods.
 162 Internal methods are usually preceded with a _
 163
 164 =cut
 165
 166
 167 # Let the code begin...
 168
 169
 170 package Bio::PopGen::Statistics;
 171 use strict;
 172 use constant {
 173     in_label => 'ingroup',
 174     out_label => 'outgroup',
 175     non_syn   => 'non_synonymous',
 176     syn       => 'synonymous',
 177     default_codon_table => 1, # Standard Codon table
 178 };
 179
 180 use Bio::MolEvol::CodonModel;
 181 use List::Util qw(sum);
 182
 183 use base qw(Bio::Root::Root);
 184 our $codon_table => default_codon_table;
 185 our $has_twotailed => 0;
 186 BEGIN {
 187     eval { require Text::NSP::Measures::2D::Fisher2::twotailed };
 188     if( $@ ) { $has_twotailed = 0; }
 189     else { $has_twotailed = 1; }
 190 }
 191
 192
 193
 194
 195
 196
 197 =head2 new
 198
 199  Title   : new
 200  Usage   : my $obj = Bio::PopGen::Statistics->new();
 201  Function: Builds a new Bio::PopGen::Statistics object
 202  Returns : an instance of Bio::PopGen::Statistics
 203  Args    : none
 204
 205
 206 =cut
 207
 208
 209 =head2 fu_and_li_D
 210
 211  Title   : fu_and_li_D
 212  Usage   : my $D = $statistics->fu_and_li_D(\@ingroup,\@outgroup);
 213             OR
 214            my $D = $statistics->fu_and_li_D(\@ingroup,$extmutations);
 215  Function: Fu and Li D statistic for a list of individuals
 216            given an outgroup and the number of external mutations
 217            (either provided or calculated from list of outgroup individuals)
 218  Returns : decimal
 219  Args    : $individuals - array reference which contains ingroup individuals
 220            (L<Bio::PopGen::Individual> or derived classes)
 221            $extmutations - number of external mutations OR
 222            arrayref of outgroup individuals
 223
 224 =cut
 225
 226 sub fu_and_li_D {
 227     my ($self,$ingroup,$outgroup) = @_;
 228
 229     my ($seg_sites,$n,$ancestral,$derived) = (0,0,0,0);
 230     if( ref($ingroup) =~ /ARRAY/i ) {
 231         $n = scalar @$ingroup;
 232         # pi - all pairwise differences
 233         $seg_sites   = $self->segregating_sites_count($ingroup);
 234     } elsif( ref($ingroup) &&
 235              $ingroup->isa('Bio::PopGen::PopulationI')) {
 236         $n = $ingroup->get_number_individuals;
 237         $seg_sites   = $self->segregating_sites_count($ingroup);
 238     } else {
 239         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D");
 240         return 0;
 241     }
 242
 243     if( $seg_sites <= 0 ) {
 244         $self->warn("mutation total was not > 0, cannot calculate a Fu and Li D");
 245         return 0;
 246     }
 247
 248     if( ! defined $outgroup ) {
 249         $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
 250         return 0;
 251     } elsif( ref($outgroup) ) {
 252         ($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup);
 253         $ancestral = 0 unless defined $ancestral;
 254     } else {
 255         $ancestral = $outgroup;
 256     }
 257
 258     return $self->fu_and_li_D_counts($n,$seg_sites,
 259                                      $ancestral,$derived);
 260 }
 261
 262 =head2 fu_and_li_D_counts
 263
 264  Title   : fu_li_D_counts
 265  Usage   : my $D = $statistics->fu_and_li_D_counts($samps,$sites,
 266                                                    $external);
 267  Function: Fu and Li D statistic for the raw counts of the number
 268            of samples, sites, external and internal mutations
 269  Returns : decimal number
 270  Args    : number of samples (N)
 271            number of segregating sites (n)
 272            number of external mutations (n_e)
 273
 274 =cut
 275
 276
 277 sub fu_and_li_D_counts {
 278     my ($self,$n,$seg_sites, $external_mut) = @_;
 279     my $a_n = 0;
 280     if( $n <= 3 ) {
 281         $self->warn("n is $n, too small, must be > 3\n");
 282         return;
 283     }
 284     for(my $k= 1; $k < $n; $k++ ) {
 285         $a_n += ( 1 / $k );
 286     }
 287     my $b = 0;
 288     for(my $k= 1; $k < $n; $k++ ) {
 289         $b += ( 1 / $k**2 );
 290     }
 291
 292     my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
 293                   ( ( $n - 1) * ( $n - 2 ) ) );
 294
 295     my $v = 1 + ( ( $a_n**2 / ( $b + $a_n**2 ) ) *
 296                   ( $c - ( ( $n + 1) /
 297                            ( $n - 1) ) ));
 298
 299     my $u = $a_n - 1 - $v;
 300
 301     ($seg_sites - $a_n * $external_mut) /
 302         sqrt( ($u * $seg_sites) + ($v * $seg_sites*$seg_sites));
 303
 304 }
 305
 306
 307 =head2 fu_and_li_D_star
 308
 309  Title   : fu_and_li_D_star
 310  Usage   : my $D = $statistics->fu_an_li_D_star(\@individuals);
 311  Function: Fu and Li's D* statistic for a set of samples
 312             Without an outgroup
 313  Returns : decimal number
 314  Args    : array ref of L<Bio::PopGen::IndividualI> objects
 315            OR
 316            L<Bio::PopGen::PopulationI> object
 317
 318 =cut
 319
 320 #'
 321 # fu_and_li_D*
 322
 323 sub fu_and_li_D_star {
 324     my ($self,$individuals) = @_;
 325
 326     my ($seg_sites,$n,$singletons);
 327     if( ref($individuals) =~ /ARRAY/i ) {
 328         $n = scalar @$individuals;
 329         $seg_sites   = $self->segregating_sites_count($individuals);
 330         $singletons  = $self->singleton_count($individuals);
 331     } elsif( ref($individuals) &&
 332              $individuals->isa('Bio::PopGen::PopulationI')) {
 333         my $pop = $individuals;
 334         $n = $pop->get_number_individuals;
 335         $seg_sites   = $self->segregating_sites_count($pop);
 336         $singletons  = $self->singleton_count($pop);
 337     } else {
 338         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D_star");
 339         return 0;
 340     }
 341
 342     return $self->fu_and_li_D_star_counts($n,$seg_sites, $singletons);
 343 }
 344
 345 =head2 fu_and_li_D_star_counts
 346
 347  Title   : fu_li_D_star_counts
 348  Usage   : my $D = $statistics->fu_and_li_D_star_counts($samps,$sites,
 349                                                         $singletons);
 350
 351  Function: Fu and Li D statistic for the raw counts of the number
 352            of samples, sites, external and internal mutations
 353  Returns : decimal number
 354  Args    : number of samples (N)
 355            number of segregating sites (n)
 356            singletons (n_s)
 357
 358 =cut
 359
 360
 361 sub fu_and_li_D_star_counts {
 362     my ($self,$n,$seg_sites, $singletons) = @_;
 363     my $a_n;
 364     for(my $k = 1; $k < $n; $k++ ) {
 365         $a_n += ( 1 / $k );
 366     }
 367
 368     my $a1 = $a_n + 1 / $n;
 369
 370     my $b = 0;
 371     for(my $k= 1; $k < $n; $k++ ) {
 372         $b += ( 1 / $k**2 );
 373     }
 374
 375     my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
 376                   ( ( $n - 1) * ( $n - 2 ) ) );
 377
 378     my $d = $c + ($n -2) / ($n - 1)**2 +
 379         2 / ($n -1) *
 380         ( 1.5 - ( (2*$a1 - 3) / ($n -2) ) -
 381           1 / $n );
 382
 383     my $v_star = ( ( ($n/($n-1) )**2)*$b + (($a_n**2)*$d) -
 384                  (2*( ($n*$a_n*($a_n+1)) )/(($n-1)**2)) )  /
 385                    (($a_n**2) + $b);
 386
 387     my $u_star = ( ($n/($n-1))*
 388                    ($a_n - ($n/
 389                           ($n-1)))) - $v_star;
 390
 391
 392     return (($n / ($n - 1)) * $seg_sites -
 393             $a_n * $singletons) /
 394             sqrt( ($u_star * $seg_sites) + ($v_star * $seg_sites*$seg_sites));
 395 }
 396
 397
 398 =head2 fu_and_li_F
 399
 400  Title   : fu_and_li_F
 401  Usage   : my $F = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts);
 402  Function: Calculate Fu and Li's F on an ingroup with either the set of
 403            outgroup individuals, or the number of external mutations
 404  Returns : decimal number
 405  Args    : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
 406            OR a L<Bio::PopGen::PopulationI> object
 407            number of external mutations OR list of individuals for the outgroup
 408
 409 =cut
 410
 411 #'
 412
 413 sub fu_and_li_F {
 414     my ($self,$ingroup,$outgroup) = @_;
 415     my ($seg_sites,$pi,$n,$external,$internal);
 416     if( ref($ingroup) =~ /ARRAY/i ) {
 417         $n = scalar @$ingroup;
 418         # pi - all pairwise differences
 419         $pi          = $self->pi($ingroup);
 420         $seg_sites   = $self->segregating_sites_count($ingroup);
 421     } elsif( ref($ingroup) &&
 422              $ingroup->isa('Bio::PopGen::PopulationI')) {
 423         $n = $ingroup->get_number_individuals;
 424         $pi          = $self->pi($ingroup);
 425         $seg_sites   = $self->segregating_sites_count($ingroup);
 426     } else {
 427         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to Fu and Li's F");
 428         return 0;
 429     }
 430
 431     if( ! defined $outgroup ) {
 432         $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
 433         return 0;
 434     } elsif( ref($outgroup) ) {
 435         ($external,$internal) = $self->derived_mutations($ingroup,$outgroup);
 436     } else {
 437         $external = $outgroup;
 438     }
 439     $self->fu_and_li_F_counts($n,$pi,$seg_sites,$external);
 440 }
 441
 442 =head2 fu_and_li_F_counts
 443
 444  Title   : fu_li_F_counts
 445  Usage   : my $F = $statistics->fu_and_li_F_counts($samps,$pi,
 446                                                    $sites,
 447                                                    $external);
 448  Function: Fu and Li F statistic for the raw counts of the number
 449            of samples, sites, external and internal mutations
 450  Returns : decimal number
 451  Args    : number of samples (N)
 452            average pairwise differences (pi)
 453            number of segregating sites (n)
 454            external mutations (n_e)
 455
 456 =cut
 457
 458
 459 sub fu_and_li_F_counts {
 460     my ($self,$n,$pi,$seg_sites, $external) = @_;
 461     my $a_n = 0;
 462     for(my $k= 1; $k < $n; $k++ ) {
 463         $a_n += ( 1 / $k );
 464     }
 465
 466     my $a1 = $a_n + (1 / $n );
 467
 468     my $b = 0;
 469     for(my $k= 1; $k < $n; $k++ ) {
 470         $b += ( 1 / $k**2 );
 471     }
 472
 473     my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
 474                   ( ( $n - 1) * ( $n - 2 ) ) );
 475
 476     my $v_F = ( $c + ( (2*(($n**2)+$n+3)) /
 477                        ( (9*$n)*($n-1) ) ) -
 478                 (2/($n-1)) ) / ( ($a_n**2)+$b );
 479
 480     my $u_F = ( 1 + ( ($n+1)/(3*($n-1)) )-
 481                 ( 4*( ($n+1)/(($n-1)**2) ))*
 482                 ($a1 - ((2*$n)/($n+1))) ) /
 483                 $a_n - $v_F;
 484
 485     # warn("$v_F vf $u_F uf n = $n\n");
 486     my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) +
 487                                         ($v_F*($seg_sites**2)) ) );
 488
 489     return $F;
 490 }
 491
 492 =head2 fu_and_li_F_star
 493
 494  Title   : fu_and_li_F_star
 495  Usage   : my $F = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup);
 496  Function: Calculate Fu and Li's F* on an ingroup without an outgroup
 497            It uses count of singleton alleles instead
 498  Returns : decimal number
 499  Args    : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
 500            OR
 501            L<Bio::PopGen::PopulationI> object
 502
 503 =cut
 504
 505 #' keep my emacs happy
 506
 507 sub fu_and_li_F_star {
 508     my ($self,$individuals) = @_;
 509
 510     my ($seg_sites,$pi,$n,$singletons);
 511     if( ref($individuals) =~ /ARRAY/i ) {
 512         $n = scalar @$individuals;
 513         # pi - all pairwise differences
 514         $pi          = $self->pi($individuals);
 515         $seg_sites   = $self->segregating_sites_count($individuals);
 516         $singletons  = $self->singleton_count($individuals);
 517     } elsif( ref($individuals) &&
 518              $individuals->isa('Bio::PopGen::PopulationI')) {
 519         my $pop = $individuals;
 520         $n = $pop->get_number_individuals;
 521         $pi          = $self->pi($pop);
 522         $seg_sites   = $self->segregating_sites_count($pop);
 523         $singletons  = $self->singleton_count($pop);
 524     } else {
 525         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_F_star");
 526         return 0;
 527     }
 528     return $self->fu_and_li_F_star_counts($n,
 529                                           $pi,
 530                                           $seg_sites,
 531                                           $singletons);
 532 }
 533
 534 =head2 fu_and_li_F_star_counts
 535
 536  Title   : fu_li_F_star_counts
 537  Usage   : my $F = $statistics->fu_and_li_F_star_counts($samps,
 538                                                    $pi,$sites,
 539                                                    $singletons);
 540  Function: Fu and Li F statistic for the raw counts of the number
 541            of samples, sites, external and internal mutations
 542  Returns : decimal number
 543  Args    : number of samples (N)
 544            average pairwise differences (pi)
 545            number of segregating sites (n)
 546            singleton  mutations (n_s)
 547
 548 =cut
 549
 550
 551 sub fu_and_li_F_star_counts {
 552     my ($self,$n,$pi,$seg_sites, $singletons) = @_;
 553     if( $n <= 1 ) {
 554         $self->warn("N must be > 1\n");
 555         return;
 556     }
 557     if( $n == 2) {
 558         return 0;
 559     }
 560
 561     my $a_n = 0;
 562
 563
 564     my $b = 0;
 565     for(my $k= 1; $k < $n; $k++ ) {
 566         $b += (1 / ($k**2));
 567         $a_n += ( 1 / $k );     # Eq (2)
 568     }
 569     my $a1 = $a_n + (1 / $n );
 570
 571     # warn("a_n is $a_n a1 is $a1 n is $n b is $b\n");
 572
 573     # From Simonsen et al (1995) instead of Fu and Li 1993
 574     my $v_F_star = ( (( 2 * $n ** 3 + 110 * $n**2 - (255 * $n) + 153)/
 575                       (9 * ($n ** 2) * ( $n - 1))) +
 576                      ((2 * ($n - 1) * $a_n ) / $n ** 2) -
 577                      (8 * $b / $n) ) /
 578                      ( ($a_n ** 2) + $b );
 579
 580     my $u_F_star = ((( (4* ($n**2)) + (19 * $n) + 3 - (12 * ($n + 1)* $a1)) /
 581                     (3 * $n * ( $n - 1))) / $a_n) - $v_F_star;
 582
 583     # warn("vf* = $v_F_star uf* = $u_F_star n = $n\n");
 584     my $F_star = ( $pi - ($singletons*( ( $n-1) / $n)) ) /
 585         sqrt ( $u_F_star*$seg_sites + $v_F_star*$seg_sites**2);
 586     return $F_star;
 587 }
 588
 589 =head2 tajima_D
 590
 591  Title   : tajima_D
 592  Usage   : my $D = Bio::PopGen::Statistics->tajima_D(\@samples);
 593  Function: Calculate Tajima's D on a set of samples
 594  Returns : decimal number
 595  Args    : array ref of L<Bio::PopGen::IndividualI> objects
 596            OR
 597            L<Bio::PopGen::PopulationI> object
 598
 599
 600 =cut
 601
 602 #'
 603
 604 sub tajima_D {
 605     my ($self,$individuals) = @_;
 606     my ($seg_sites,$pi,$n);
 607
 608     if( ref($individuals) =~ /ARRAY/i ) {
 609         $n = scalar @$individuals;
 610         # pi - all pairwise differences
 611         $pi          = $self->pi($individuals);
 612         $seg_sites = $self->segregating_sites_count($individuals);
 613
 614     } elsif( ref($individuals) &&
 615              $individuals->isa('Bio::PopGen::PopulationI')) {
 616         my $pop = $individuals;
 617         $n = $pop->get_number_individuals;
 618         $pi          = $self->pi($pop);
 619         $seg_sites = $self->segregating_sites_count($pop);
 620     } else {
 621         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
 622         return 0;
 623     }
 624     $self->tajima_D_counts($n,$seg_sites,$pi);
 625 }
 626
 627 =head2 tajima_D_counts
 628
 629  Title   : tajima_D_counts
 630  Usage   : my $D = $statistics->tajima_D_counts($samps,$sites,$pi);
 631  Function: Tajima's D statistic for the raw counts of the number
 632            of samples, sites, and avg pairwise distances (pi)
 633  Returns : decimal number
 634  Args    : number of samples (N)
 635            number of segregating sites (n)
 636            average pairwise differences (pi)
 637
 638 =cut
 639
 640 #'
 641
 642 sub tajima_D_counts {
 643     my ($self,$n,$seg_sites,$pi) = @_;
 644     my $a1 = 0;
 645     for(my $k= 1; $k < $n; $k++ ) {
 646         $a1 += ( 1 / $k );
 647     }
 648
 649      my $a2 = 0;
 650      for(my $k= 1; $k < $n; $k++ ) {
 651          $a2 += ( 1 / $k**2 );
 652      }
 653
 654     my $b1 = ( $n + 1 ) / ( 3* ( $n - 1) );
 655     my $b2 = ( 2 * ( $n ** 2 + $n + 3) ) /
 656              ( ( 9 * $n) * ( $n - 1) );
 657     my $c1 = $b1 - ( 1 / $a1 );
 658     my $c2 = $b2 - ( ( $n + 2 ) /
 659                      ( $a1 * $n))+( $a2 / $a1 ** 2);
 660     my $e1 = $c1 / $a1;
 661     my $e2 = $c2 / ( $a1**2 + $a2 );
 662
 663     my $denom = sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1)));
 664     return if $denom == 0;
 665     my $D = ( $pi - ( $seg_sites / $a1 ) ) / $denom;
 666     return $D;
 667 }
 668
 669
 670 =head2 pi
 671
 672  Title   : pi
 673  Usage   : my $pi = Bio::PopGen::Statistics->pi(\@inds)
 674  Function: Calculate pi (average number of pairwise differences) given
 675            a list of individuals which have the same number of markers
 676            (also called sites) as available from the get_Genotypes()
 677            call in L<Bio::PopGen::IndividualI>
 678  Returns : decimal number
 679  Args    : Arg1= array ref of L<Bio::PopGen::IndividualI> objects
 680              which have markers/mutations.  We expect all individuals to
 681              have a marker - we will deal with missing data as a special case.
 682            OR
 683            Arg1= L<Bio::PopGen::PopulationI> object.  In the event that
 684                  only allele frequency data is available, storing it in
 685                  Population object will make this available.
 686            num sites [optional], an optional second argument (integer)
 687              which is the number of sites, then pi returned is pi/site.
 688
 689 =cut
 690
 691 sub pi {
 692     my ($self,$individuals,$numsites) = @_;
 693     my (%data,%marker_total,@marker_names,$n);
 694
 695     if( ref($individuals) =~ /ARRAY/i ) {
 696         # one possible argument is an arrayref of Bio::PopGen::IndividualI objs
 697         @marker_names = $individuals->[0]->get_marker_names;
 698         $n = scalar @$individuals;
 699
 700         # Here we are calculating the allele frequencies
 701         foreach my $ind ( @$individuals ) {
 702             if( ! $ind->isa('Bio::PopGen::IndividualI') ) {
 703                 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n");
 704                 return 0;
 705             }
 706             foreach my $m ( @marker_names ) {
 707                 foreach my $allele (map { $_->get_Alleles}
 708                                     $ind->get_Genotypes($m) ) {
 709                     $data{$m}->{$allele}++;
 710                     $marker_total{$m}++;
 711                 }
 712             }
 713         }
 714 #       while( my ($marker,$count) =  each %marker_total ) {
 715 #           foreach my $c ( values %{$data{$marker}} ) {
 716 #               $c /= $count;
 717 #           }
 718 #       }
 719         # %data will contain allele frequencies for each marker, allele
 720     } elsif( ref($individuals) &&
 721              $individuals->isa('Bio::PopGen::PopulationI') ) {
 722         my $pop = $individuals;
 723         $n = $pop->get_number_individuals;
 724         foreach my $marker( $pop->get_Markers ) {
 725             push @marker_names, $marker->name;
 726             #$data{$marker->name} = {$marker->get_Allele_Frequencies};
 727             my @genotypes = $pop->get_Genotypes(-marker => $marker->name);
 728             for my $al ( map { $_->get_Alleles} @genotypes ) {
 729               $data{$marker->name}->{$al}++;
 730               $marker_total{$marker->name}++;
 731            }
 732         }
 733     } else {
 734         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi");
 735     }
 736     # based on Kevin Thornton's code:
 737     # http://molpopgen.org/software/libsequence/doc/html/PolySNP_8cc-source.html#l00152
 738     # For now we assume that all individuals have the same markers
 739     my ($diffcount,$totalcompare) = (0,0);
 740     my $pi = 0;
 741     while ( my ($marker,$markerdat) = each %data ) {
 742       my $sampsize = $marker_total{$marker};
 743       my $ssh = 0;
 744       my @alleles = keys %$markerdat;
 745       if ( $sampsize > 1 ) {
 746         my $denom = $sampsize * ($sampsize - 1.0);
 747         foreach my $al ( @alleles ) {
 748           $ssh += ($markerdat->{$al} * ($markerdat->{$al} - 1)) / $denom;
 749         }
 750         $pi += 1.0 - $ssh;
 751       }
 752     }
 753     $self->debug( "pi=$pi\n");
 754     if( $numsites ) {
 755         return $pi / $numsites;
 756     } else {
 757         return $pi;
 758     }
 759 }
 760
 761
 762 =head2 theta
 763
 764  Title   : theta
 765  Usage   : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites);
 766  Function: Calculates Watterson's theta from the sample size
 767            and the number of segregating sites.
 768            Providing the third parameter, total number of sites will
 769            return theta per site.
 770            This is also known as K-hat = K / a_n
 771  Returns : decimal number
 772  Args    : sample size (integer),
 773            num segregating sites (integer)
 774            total sites (integer) [optional] (to calculate theta per site)
 775            OR
 776            provide an arrayref of the L<Bio::PopGen::IndividualI> objects
 777            total sites (integer) [optional] (to calculate theta per site)
 778            OR
 779            provide an L<Bio::PopGen::PopulationI> object
 780            total sites (integer)[optional]
 781
 782 =cut
 783
 784 #'
 785
 786 sub theta {
 787     my $self = shift;
 788     my ( $n, $seg_sites,$totalsites) = @_;
 789     if( ref($n) =~ /ARRAY/i ) {
 790         my $samps = $n;
 791         $totalsites = $seg_sites; # only 2 arguments if one is an array
 792         my %data;
 793         my @marker_names = $samps->[0]->get_marker_names;
 794         # we need to calculate number of polymorphic sites
 795         $seg_sites = $self->segregating_sites_count($samps);
 796         $n = scalar @$samps;
 797
 798     } elsif(ref($n) &&
 799             $n->isa('Bio::PopGen::PopulationI') ) {
 800         # This will handle the case when we pass in a PopulationI object
 801         my $pop = $n;
 802         $totalsites = $seg_sites; # shift the arguments over by one
 803         $n = $pop->haploid_population->get_number_individuals;
 804         $seg_sites = $self->segregating_sites_count($pop);
 805     }
 806     my $a1 = 0;
 807     for(my $k= 1; $k < $n; $k++ ) {
 808         $a1 += ( 1 / $k );
 809     }
 810     if( $totalsites ) { # 0 and undef are the same can't divide by them
 811         $seg_sites /= $totalsites;
 812     }
 813     if( $a1 == 0 ) {
 814         return 0;
 815     }
 816     return $seg_sites / $a1;
 817 }
 818
 819 =head2 singleton_count
 820
 821  Title   : singleton_count
 822  Usage   : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds)
 823  Function: Calculate the number of mutations/alleles which only occur once in
 824            a list of individuals for all sites/markers
 825  Returns : (integer) number of alleles which only occur once (integer)
 826  Args    : arrayref of L<Bio::PopGen::IndividualI> objects
 827            OR
 828            L<Bio::PopGen::PopulationI> object
 829
 830 =cut
 831
 832 sub singleton_count {
 833     my ($self,$individuals) = @_;
 834
 835     my @inds;
 836     if( ref($individuals) =~ /ARRAY/ ) {
 837         @inds = @$individuals;
 838     } elsif( ref($individuals) &&
 839              $individuals->isa('Bio::PopGen::PopulationI') ) {
 840         my $pop = $individuals;
 841         @inds = $pop->get_Individuals();
 842         unless( @inds ) {
 843             $self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies");
 844             return 0;
 845         }
 846     } else {
 847         $self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects");
 848         return 0;
 849     }
 850     # find number of sites where a particular allele is only seen once
 851
 852     my ($singleton_allele_ct,%sites) = (0);
 853     # first collect all the alleles into a hash structure
 854
 855     foreach my $n ( @inds ) {
 856         if( ! $n->isa('Bio::PopGen::IndividualI') ) {
 857             $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
 858             return 0;
 859         }
 860         foreach my $g ( $n->get_Genotypes ) {
 861             my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
 862             foreach my $allele (@alleles ) {
 863                 $sites{$nm}->{$allele}++;
 864             }
 865         }
 866     }
 867     foreach my $site ( values %sites ) { # don't really care what the name is
 868         foreach my $allelect ( values %$site ) { #
 869             # find the sites which have an allele with only 1 copy
 870             $singleton_allele_ct++ if( $allelect == 1 );
 871         }
 872     }
 873     return $singleton_allele_ct;
 874 }
 875
 876 # Yes I know that singleton_count and segregating_sites_count are
 877 # basically processing the same data so calling them both is
 878 # redundant, something I want to fix later but want to make things
 879 # correct and simple first
 880
 881 =head2 segregating_sites_count
 882
 883  Title   : segregating_sites_count
 884  Usage   : my $segsites = Bio::PopGen::Statistics->segregating_sites_count
 885  Function: Gets the number of segregating sites (number of polymorphic sites)
 886  Returns : (integer) number of segregating sites
 887  Args    : arrayref of L<Bio::PopGen::IndividualI> objects
 888            OR
 889            L<Bio::PopGen::PopulationI> object
 890
 891 =cut
 892
 893 # perhaps we'll change this in the future
 894 # to return the actual segregating sites
 895 # so one can use this to pull in the names of those sites.
 896 # Would be trivial if it is useful.
 897
 898 sub segregating_sites_count {
 899    my ($self,$individuals) = @_;
 900    my $type = ref($individuals);
 901    my $seg_sites = 0;
 902    if( $type =~ /ARRAY/i ) {
 903        my %sites;
 904        foreach my $n ( @$individuals ) {
 905            if( ! $n->isa('Bio::PopGen::IndividualI') ) {
 906                $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
 907                return 0;
 908            }
 909            foreach my $g ( $n->get_Genotypes ) {
 910                my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
 911                foreach my $allele (@alleles ) {
 912                    $sites{$nm}->{$allele}++;
 913                }
 914            }
 915        }
 916        foreach my $site ( values %sites ) { # use values b/c we don't
 917                                             # really care what the name is
 918            # find the sites which >1 allele
 919            $seg_sites++ if( keys %$site > 1 );
 920        }
 921    } elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) {
 922        foreach my $marker ( $individuals->haploid_population->get_Markers ) {
 923            my @alleles = $marker->get_Alleles;
 924            $seg_sites++ if ( scalar @alleles > 1 );
 925        }
 926    } else {
 927        $self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects");
 928        return 0;
 929    }
 930    return $seg_sites;
 931 }
 932
 933
 934 =head2 heterozygosity
 935
 936  Title   : heterozygosity
 937  Usage   : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1);
 938  Function: Calculate the heterozgosity for a sample set for a set of alleles
 939  Returns : decimal number
 940  Args    : sample size (integer)
 941            frequency of one allele (fraction - must be less than 1)
 942            [optional] frequency of another allele - this is only needed
 943                       in a non-binary allele system
 944
 945 Note     : p^2 + 2pq + q^2
 946
 947 =cut
 948
 949
 950 sub heterozygosity {
 951     my ($self,$samp_size, $freq1,$freq2) = @_;
 952     if( ! $freq2 ) { $freq2 = 1 - $freq1 }
 953     if( $freq1 > 1 || $freq2 > 1 ) {
 954         $self->warn("heterozygosity expects frequencies to be less than 1");
 955     }
 956     my $sum = ($freq1**2) + (($freq2)**2);
 957     my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ;
 958     return $h;
 959 }
 960
 961
 962 =head2 derived_mutations
 963
 964  Title   : derived_mutations
 965  Usage   : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup);
 966  Function: Calculate the number of alleles or (mutations) which are ancestral
 967            and the number which are derived (occurred only on the tips)
 968  Returns : array of 2 items - number of external and internal derived
 969            mutation
 970  Args    : ingroup - L<Bio::PopGen::IndividualI>s arrayref OR
 971                      L<Bio::PopGen::PopulationI>
 972            outgroup- L<Bio::PopGen::IndividualI>s arrayref OR
 973                      L<Bio::PopGen::PopulationI> OR
 974                      a single L<Bio::PopGen::IndividualI>
 975
 976 =cut
 977
 978 sub derived_mutations {
 979    my ($self,$ingroup,$outgroup) = @_;
 980    my (%indata,%outdata,@marker_names);
 981
 982    # basically we have to do some type checking
 983    # if that perl were typed...
 984    my ($itype,$otype) = (ref($ingroup),ref($outgroup));
 985
 986    return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums
 987                                       # are already the value we
 988                                       # are searching for
 989    # pick apart the ingroup
 990    # get the data
 991    if( ref($ingroup) =~ /ARRAY/i ) {
 992        if( ! ref($ingroup->[0]) ||
 993            ! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) {
 994            $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations");
 995            return 0;
 996        }
 997        # we assume that all individuals have the same markers
 998        # i.e. that they are aligned
 999        @marker_names = $ingroup->[0]->get_marker_names;
1000        for my $ind ( @$ingroup ) {
1001            for my $m ( @marker_names ) {
1002                for my $allele ( map { $_->get_Alleles }
1003                                     $ind->get_Genotypes($m) ) {
1004                    $indata{$m}->{$allele}++;
1005                }
1006            }
1007        }
1008    } elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) {
1009        @marker_names = $ingroup->get_marker_names;
1010        for my $ind ( $ingroup->haploid_population->get_Individuals() ) {
1011            for my $m ( @marker_names ) {
1012                for my $allele ( map { $_->get_Alleles}
1013                                     $ind->get_Genotypes($m) ) {
1014                    $indata{$m}->{$allele}++;
1015                }
1016            }
1017        }
1018    } else {
1019        $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations");
1020        return 0;
1021    }
1022
1023    if( $otype =~ /ARRAY/i ) {
1024        if( ! ref($outgroup->[0]) ||
1025            ! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) {
1026            $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations");
1027            return 0;
1028        }
1029        for my $ind ( @$outgroup ) {
1030            for my $m ( @marker_names ) {
1031                for my $allele ( map { $_->get_Alleles }
1032                                 $ind->get_Genotypes($m) ) {
1033                    $outdata{$m}->{$allele}++;
1034                }
1035            }
1036        }
1037
1038    } elsif( $otype->isa('Bio::PopGen::PopulationI') ) {
1039        for my $ind ( $outgroup->haploid_population->get_Individuals() ) {
1040            for my $m ( @marker_names ) {
1041                for my $allele ( map { $_->get_Alleles}
1042                                     $ind->get_Genotypes($m) ) {
1043                    $outdata{$m}->{$allele}++;
1044                }
1045            }
1046        }
1047    } else {
1048        $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations");
1049        return 0;
1050    }
1051
1052    # derived mutations are defined as
1053    #
1054    # ingroup  (G A T)
1055    # outgroup (A)
1056    # derived mutations are G and T, A is the external mutation
1057
1058    # ingroup  (A T)
1059    # outgroup (C)
1060    # derived mutations A,T no external/ancestral mutations
1061
1062    # ingroup  (G A T)
1063    # outgroup (A T)
1064    # cannot determine
1065
1066    my ($internal,$external);
1067    foreach my $marker ( @marker_names ) {
1068        my @outalleles = keys %{$outdata{$marker}};
1069        my @in_alleles = keys %{$indata{$marker}};
1070        next if( @outalleles > 1 || @in_alleles == 1);
1071        for my $allele ( @in_alleles ) {
1072            if( ! exists $outdata{$marker}->{$allele} ) {
1073                if( $indata{$marker}->{$allele} == 1 ) {
1074                    $external++;
1075                } else {
1076                    $internal++;
1077                }
1078            }
1079        }
1080    }
1081    return ($external, $internal);
1082 }
1083
1084
1085 =head2 composite_LD
1086
1087  Title   : composite_LD
1088  Usage   : %matrix = Bio::PopGen::Statistics->composite_LD($population);
1089  Function: Calculate the Linkage Disequilibrium
1090            This is for calculating LD for unphased data.
1091            Other methods will be appropriate for phased haplotype data.
1092
1093  Returns : Hash of Hashes - first key is site 1,second key is site 2
1094            and value is LD for those two sites.
1095            my $LDarrayref = $matrix{$site1}->{$site2};
1096            my ($ldval, $chisquared) = @$LDarrayref;
1097  Args    : L<Bio::PopGen::PopulationI> or arrayref of
1098            L<Bio::PopGen::IndividualI>s
1099  Reference: Weir B.S. (1996) "Genetic Data Analysis II",
1100                       Sinauer, Sunderlanm MA.
1101
1102 =cut
1103
1104 sub composite_LD {
1105     my ($self,$pop) = @_;
1106     if( ref($pop) =~ /ARRAY/i ) {
1107         if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) {
1108             $pop = Bio::PopGen::Population->new(-individuals => @$pop);
1109         } else {
1110             $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1111             return ();
1112         }
1113     } elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
1114         $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1115         return ();
1116     }
1117
1118     my @marker_names = $pop->get_marker_names;
1119     my @inds = $pop->get_Individuals;
1120     my $num_inds = scalar @inds;
1121     my (%lookup);
1122     # calculate allele frequencies for each marker from the population
1123     # use the built-in get_Marker to get the allele freqs
1124     # we still need to calculate the genotype frequencies
1125     foreach my $marker_name ( @marker_names ) {
1126         my(%allelef);
1127
1128         foreach my $ind ( @inds ) {
1129             my ($genotype) = $ind->get_Genotypes(-marker => $marker_name);
1130             if( ! defined $genotype ) {
1131                 $self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n");
1132                 next;
1133             }
1134             my @alleles  = sort $genotype->get_Alleles;
1135             next if( scalar @alleles != 2);
1136             my $genostr  = join(',', @alleles);
1137             $allelef{$alleles[0]}++;
1138             $allelef{$alleles[1]}++;
1139         }
1140
1141         # we should check for cases where there > 2 alleles or
1142         # only 1 allele and throw out those markers.
1143         my @alleles      = sort keys %allelef;
1144         my $allele_count = scalar @alleles;
1145         # test if site is polymorphic
1146         if( $allele_count != 2) {
1147             # only really warn if we're seeing multi-allele
1148             $self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2;
1149             next;               # skip this marker
1150         }
1151
1152         # Need to do something here to detect alleles which aren't
1153         # a single character
1154         if( length($alleles[0]) != 1 ||
1155             length($alleles[1]) != 1 ) {
1156             $self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");
1157             next;
1158         }
1159
1160         # fix the call for allele 1 (A or B) and
1161         # allele 2 (a or b) in terms of how we'll do the
1162         # N square from Weir p.126
1163         $self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");
1164         $lookup{$marker_name}->{'1'} = $alleles[0];
1165         $lookup{$marker_name}->{'2'} = $alleles[1];
1166     }
1167
1168     @marker_names = sort keys %lookup;
1169     my $site_count   = scalar @marker_names;
1170     # where the final data will be stored
1171     my %stats_for_sites;
1172
1173     # standard way of generating pairwise combos
1174     # LD is done by comparing all the pairwise site (marker)
1175     # combinations and keeping track of the genotype and
1176     # pairwise genotype (ie genotypes of the 2 sites) frequencies
1177     for( my $i = 0; $i < $site_count - 1; $i++ ) {
1178         my $site1 = $marker_names[$i];
1179
1180         for( my $j = $i+1; $j < $site_count ; $j++) {
1181             my (%genotypes, %total_genotype_count,$total_pairwisegeno_count,
1182                 %pairwise_genotypes);
1183
1184             my $site2 = $marker_names[$j];
1185             my (%allele_count,%allele_freqs) = (0,0);
1186             foreach my $ind ( @inds ) {
1187                 # build string of genotype at site 1
1188                 my ($genotype1) = $ind->get_Genotypes(-marker => $site1);
1189                 my @alleles1  = sort $genotype1->get_Alleles;
1190
1191                 # if an individual has only one available allele
1192                 # (has a blank or N for one of the chromosomes)
1193                 # we don't want to use it in our calculation
1194
1195                 next unless( scalar @alleles1 == 2);
1196                 my $genostr1  = join(',', @alleles1);
1197
1198                 # build string of genotype at site 2
1199                 my ($genotype2) = $ind->get_Genotypes(-marker => $site2);
1200                 my @alleles2  = sort $genotype2->get_Alleles;
1201                 my $genostr2  = join(',', @alleles2);
1202
1203                 next unless( scalar @alleles2 == 2);
1204                 for (@alleles1) {
1205                     $allele_count{$site1}++;
1206                     $allele_freqs{$site1}->{$_}++;
1207                 }
1208                 $genotypes{$site1}->{$genostr1}++;
1209                 $total_genotype_count{$site1}++;
1210
1211                 for (@alleles2) {
1212                     $allele_count{$site2}++;
1213                     $allele_freqs{$site2}->{$_}++;
1214                 }
1215                 $genotypes{$site2}->{$genostr2}++;
1216                 $total_genotype_count{$site2}++;
1217
1218                 # We are using the $site1,$site2 to signify
1219                 # a unique key
1220                 $pairwise_genotypes{"$genostr1,$genostr2"}++;
1221                 # some individuals
1222                 $total_pairwisegeno_count++;
1223             }
1224             for my $site ( %allele_freqs ) {
1225                 for my $al ( keys %{ $allele_freqs{$site} } ) {
1226                     $allele_freqs{$site}->{$al} /= $allele_count{$site};
1227                 }
1228             }
1229             my $n = $total_pairwisegeno_count;  # number of pairs of comparisons
1230             # 'A' and 'B' are two loci or in our case site1 and site2
1231             my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele
1232             my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele
1233             my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele
1234             my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele
1235             # AABB
1236             my $N1genostr = join(",",( $allele1_site1, $allele1_site1,
1237                                        $allele1_site2, $allele1_site2));
1238             $self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n");
1239             # AABb
1240             my $N2genostr = join(",",( $allele1_site1, $allele1_site1,
1241                                        $allele1_site2, $allele2_site2));
1242             $self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n");
1243             # AaBB
1244             my $N4genostr = join(",",( $allele1_site1, $allele2_site1,
1245                                        $allele1_site2, $allele1_site2));
1246             $self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n");
1247             # AaBb
1248             my $N5genostr = join(",",( $allele1_site1, $allele2_site1,
1249                                        $allele1_site2, $allele2_site2));
1250             $self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n");
1251             # count of AABB in
1252             my $n1 = $pairwise_genotypes{$N1genostr} || 0;
1253             # count of AABb in
1254             my $n2 = $pairwise_genotypes{$N2genostr} || 0;
1255             # count of AaBB in
1256             my $n4 = $pairwise_genotypes{$N4genostr} || 0;
1257             # count of AaBb in
1258             my $n5 = $pairwise_genotypes{$N5genostr} || 0;
1259
1260             my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1));
1261             my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2));
1262             my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n;
1263             my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n;
1264             my $p_A  = $allele_freqs{$site1}->{$allele1_site1} || 0;    # an individual allele freq
1265             my $p_a  =  1 - $p_A;
1266
1267             my $p_B  = $allele_freqs{$site2}->{$allele1_site2} || 0;    # an individual allele freq
1268             my $p_b  =  1 - $p_B;
1269
1270             # variance of allele frequencies
1271             my $pi_A = $p_A * $p_a;
1272             my $pi_B = $p_B * $p_b;
1273
1274             # hardy weinberg
1275             my $D_A  = $p_AA - $p_A**2;
1276             my $D_B  = $p_BB - $p_B**2;
1277             my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5;
1278             $self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n");
1279
1280             my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B );
1281             $self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n");
1282             $self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n",
1283                                  $n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B));
1284
1285             my $chisquared;
1286             eval { $chisquared = ( $n * ($delta_AB**2) ) /
1287                        ( ( $pi_A + $D_A) * ( $pi_B + $D_B) );
1288                };
1289             if( $@ ) {
1290                 $self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n");
1291                 next;
1292             }
1293             # this will be an upper triangular matrix
1294             $stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared];
1295         }
1296     }
1297     return %stats_for_sites;
1298 }
1299
1300 =head2 mcdonald_kreitman
1301
1302  Title   : mcdonald_kreitman
1303  Usage   : $Fstat = mcdonald_kreitman($ingroup, $outgroup);
1304  Function: Calculates McDonald-Kreitman statistic based on a set of ingroup
1305            individuals and an outgroup by computing the number of
1306            differences at synonymous and non-synonymous sites
1307            for intraspecific comparisons and with the outgroup
1308  Returns : 2x2 table, followed by a hash reference indicating any
1309            warning messages about the status of the alleles or codons
1310  Args    : -ingroup    => L<Bio::PopGen::Population> object or
1311                           arrayref of L<Bio::PopGen::Individual>s
1312            -outgroup   => L<Bio::PopGen::Population> object or
1313                           arrayef of L<Bio::PopGen::Individual>s
1314            -polarized  => Boolean, to indicate if this should be
1315                           a polarized test. Must provide two individuals
1316                           as outgroups.
1317
1318 =cut
1319
1320 sub mcdonald_kreitman {
1321     my ($self,@args) = @_;
1322     my ($ingroup, $outgroup,$polarized) =
1323         $self->_rearrange([qw(INGROUP OUTGROUP POLARIZED)],@args);
1324     my $verbose = $self->verbose;
1325     my $outgroup_count;
1326     my $gapchar = '\-';
1327     if( ref($outgroup) =~ /ARRAY/i ) {
1328         $outgroup_count = scalar @$outgroup;
1329     } elsif( UNIVERSAL::isa($outgroup,'Bio::PopGen::PopulationI') ) {
1330         $outgroup_count = $outgroup->get_number_individuals;
1331     } else {
1332         $self->throw("Expected an ArrayRef of Individuals OR a Bio::PopGen::PopulationI");
1333     }
1334
1335     if( $polarized ) {
1336         if( $outgroup_count < 2 ) {
1337             $self->throw("Need 2 outgroups with polarized option\n");
1338         }
1339     } elsif( $outgroup_count > 1 ) {
1340         $self->warn(sprintf("%s outgroup sequences provided, but only first will be used",$outgroup_count ));
1341     } elsif( $outgroup_count == 0 ) {
1342         $self->throw("No outgroup sequence provided");
1343     }
1344
1345     my $codon_path = Bio::MolEvol::CodonModel->codon_path;
1346
1347     my (%marker_names,%unique,@inds);
1348     for my $p ( $ingroup, $outgroup)  {
1349         if( ref($p) =~ /ARRAY/i ) {
1350             push @inds, @$p;
1351         } else {
1352             push @inds, $p->get_Individuals;
1353         }
1354     }
1355     for my $i ( @inds ) {
1356         if( $unique{$i->unique_id}++ ) {
1357             $self->warn("Individual ". $i->unique_id. " is seen more than once in the ingroup or outgroup set\n");
1358         }
1359         for my $n ( $i->get_marker_names ) {
1360             $marker_names{$n}++;
1361         }
1362     }
1363
1364     my @marker_names = keys %marker_names;
1365     if( $marker_names[0] =~ /^(Site|Codon)/ ) {
1366         # sort by site or codon number and do it in
1367         # a schwartzian transformation baby!
1368         @marker_names = map { $_->[1] }
1369         sort { $a->[0] <=> $b->[0] }
1370         map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @marker_names;
1371     }
1372
1373
1374     my $num_inds = scalar @inds;
1375     my %vals = ( 'ingroup'  => $ingroup,
1376                  'outgroup' => $outgroup,
1377                  );
1378
1379     # Make the Codon Table type a parameter!
1380     my $table = Bio::Tools::CodonTable->new(-id => $codon_table);
1381     my @vt = qw(outgroup ingroup);
1382     my %changes;
1383     my %status;
1384     my %two_by_two = ( 'fixed_N' => 0,
1385                        'fixed_S' => 0,
1386                        'poly_N'  => 0,
1387                        'poly_S'  => 0);
1388
1389     for my $codon ( @marker_names ) {
1390         my (%codonvals);
1391         my %all_alleles;
1392         for my $t ( @vt ) {
1393             my $outcount = 1;
1394             for my $ind ( @{$vals{$t}} ) {
1395                 my @alleles = $ind->get_Genotypes($codon)->get_Alleles;
1396                 if( @alleles > 2 ) {
1397                     warn("Codon $codon saw ", scalar @alleles, " alleles for ind ", $ind->unique_id, "\n");
1398                     die;
1399                 } else {
1400                     my ($allele) = shift @alleles;
1401                     $all_alleles{$ind->unique_id} = $allele;
1402                     my $AA = $table->translate($allele);
1403                     next if( $AA eq 'X' || $AA eq '*' || $allele =~ /N/i);
1404
1405                     my $label = $t;
1406                     if( $t eq 'outgroup' ) {
1407                         $label = $t.$outcount++;
1408                     }
1409                     $codonvals{$label}->{$allele}++;
1410                     $codonvals{all}->{$allele}++;
1411                 }
1412             }
1413         }
1414         my $total = sum ( values %{$codonvals{'ingroup'}} );
1415         next if( $total && $total < 2 ); # skip sites with < alleles
1416         # process all the seen alleles (codons)
1417         # this is a vertical slide through the alignment
1418         if( keys %{$codonvals{all}} <= 1 ) {
1419             # no changes or no VALID codons - monomorphic
1420         } else {
1421             # grab only the first outgroup codon (what to do with rest?)
1422             my ($outcodon) = keys %{$codonvals{'outgroup1'}};
1423             if( ! $outcodon ) {
1424                 $status{"no outgroup codon $codon"}++;
1425                 next;
1426             }
1427             my $out_AA = $table->translate($outcodon);
1428             my ($outcodon2) = keys %{$codonvals{'outgroup2'}};
1429             if( ($polarized && ($outcodon ne $outcodon2)) ||
1430                 $out_AA eq 'X' || $out_AA eq '*' ) {
1431                 # skip if outgroup codons are different
1432                 # (when polarized option is on)
1433                 # or skip if the outcodon is STOP or 'NNN'
1434                 if( $verbose > 0 ) {
1435                     $self->debug("skipping $out_AA and $outcodon $outcodon2\n");
1436                 }
1437                 $status{'outgroup codons different'}++;
1438                 next;
1439             }
1440
1441             # check if ingroup is actually different from outgroup -
1442             # if there are the same number of alleles when considering
1443             # ALL or just the ingroup, then there is nothing new seen
1444             # in the outgroup so it must be a shared allele (codon)
1445
1446             # so we just count how many total alleles were seen
1447             # if this is the same as the number of alleles seen for just
1448             # the ingroup then the outgroup presents no new information
1449
1450             my @ingroup_codons = keys %{$codonvals{'ingroup'}};
1451             my $diff_from_out = ! exists $codonvals{'ingroup'}->{$outcodon};
1452
1453             if( $verbose > 0 ) {
1454                 $self->debug("alleles are in: ", join(",", @ingroup_codons),
1455                              " out: ", join(",", keys %{$codonvals{outgroup1}}),
1456                              " diff_from_out=$diff_from_out\n");
1457
1458                 for my $ind ( sort keys %all_alleles ) {
1459                     $self->debug( "$ind\t$all_alleles{$ind}\n");
1460                 }
1461             }
1462             # are all the ingroup alleles the same and diferent from outgroup?
1463             # fixed differences between species
1464             if( $diff_from_out ) {
1465                 if( scalar @ingroup_codons == 1 ) {
1466                     # fixed differences
1467                     if( $outcodon =~ /^$gapchar/ ) {
1468                         $status{'outgroup codons with gaps'}++;
1469                         next;
1470                     } elsif( $ingroup_codons[0] =~ /$gapchar/) {
1471                         $status{'ingroup codons with gaps'}++;
1472                         next;
1473                     }
1474                     my $path = $codon_path->{uc $ingroup_codons[0].$outcodon};
1475                     $two_by_two{fixed_N} += $path->[0];
1476                     $two_by_two{fixed_S} += $path->[1];
1477                     if( $verbose > 0 ) {
1478                         $self->debug("ingroup is @ingroup_codons outcodon is $outcodon\n");
1479                         $self->debug("path is ",join(",",@$path),"\n");
1480                         $self->debug
1481                             (sprintf("%-15s fixeddiff - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,$ingroup_codons[0], $outcodon,$out_AA,
1482                                      @$path, map { $two_by_two{$_} }
1483                                      qw(fixed_N fixed_S poly_N poly_S)));
1484                     }
1485                 } else {
1486                     # polymorphic and all are different from outgroup
1487                     # Here we find the minimum number of NS subst
1488                     my ($Ndiff,$Sdiff) = (3,0); # most different path
1489                     for my $c ( @ingroup_codons ) {
1490                         next if( $c =~ /$gapchar/ || $outcodon =~ /$gapchar/);
1491                         my $path = $codon_path->{uc $c.$outcodon};
1492                         my ($tNdiff,$tSdiff) = @$path;
1493                         if( $path->[0] < $Ndiff ||
1494                             ($tNdiff == $Ndiff &&
1495                              $tSdiff  <= $Sdiff)) {
1496                             ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1497                         }
1498                     }
1499                     $two_by_two{fixed_N} += $Ndiff;
1500                     $two_by_two{fixed_S} += $Sdiff;
1501                     if( @ingroup_codons > 2 ) {
1502                         $status{"more than 2 ingroup codons $codon"}++;
1503                         warn("more than 2 ingroup codons (@ingroup_codons)\n");
1504                     } else {
1505                         my $path = $codon_path->{uc join('',@ingroup_codons)};
1506
1507                         $two_by_two{poly_N} += $path->[0];
1508                         $two_by_two{poly_S} += $path->[1];
1509                         if( $verbose > 0 ) {
1510                             $self->debug(sprintf("%-15s polysite_all - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,@$path, map { $two_by_two{$_} } qw(fixed_N fixed_S poly_N poly_S)));
1511                         }
1512                     }
1513                 }
1514             } else {
1515                 my %unq = map { $_ => 1 } @ingroup_codons;
1516                 delete $unq{$outcodon};
1517                 my @unique_codons = keys %unq;
1518
1519                 # calc path for diff add to poly
1520                 # Here we find the minimum number of subst bw
1521                 # codons
1522                 my ($Ndiff,$Sdiff) = (3,0); # most different path
1523                 for my $c ( @unique_codons ) {
1524                     my $path = $codon_path->{uc $c.$outcodon };
1525                     if( ! defined $path ) {
1526                         die " cannot get path for ", $c.$outcodon, "\n";
1527                     }
1528                     my ($tNdiff,$tSdiff) = @$path;
1529                     if( $path->[0] < $Ndiff ||
1530                         ($tNdiff == $Ndiff &&
1531                          $tSdiff  <= $Sdiff)) {
1532                         ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1533                     }
1534                 }
1535
1536                 if( @unique_codons == 2 ) {
1537                     my $path = $codon_path->{uc join('',@unique_codons)};
1538                     if( ! defined $path ) {
1539                         $self->throw("no path for @unique_codons\n");
1540                     }
1541                     $Ndiff += $path->[0];
1542                     $Sdiff += $path->[1];
1543                 }
1544                 $two_by_two{poly_N} += $Ndiff;
1545                 $two_by_two{poly_S} += $Sdiff;
1546                 if( $verbose > 0 ) {
1547                     $self->debug(sprintf("%-15s polysite - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,
1548                                          $Ndiff, $Sdiff, map { $two_by_two{$_} }
1549                                          qw(fixed_N fixed_S poly_N poly_S)));
1550                 }
1551             }
1552         }
1553     }
1554     return ( $two_by_two{'poly_N'},
1555              $two_by_two{'fixed_N'},
1556              $two_by_two{'poly_S'},
1557              $two_by_two{'fixed_S'},
1558              {%status});
1559
1560 }
1561
1562 *MK = \&mcdonald_kreitman;
1563
1564
1565 =head2 mcdonald_kreitman_counts
1566
1567  Title   : mcdonald_kreitman_counts
1568  Usage   : my $MK = $statistics->mcdonald_kreitman_counts(
1569
1570              N_poly -> integer of count of non-syn polymorphism
1571              N_fix  -> integer of count of non-syn fixed substitutions
1572              S_poly -> integer of count of syn polymorphism
1573              S_fix  -> integer of count of syn fixed substitutions
1574                                                           );
1575  Function:
1576  Returns : decimal number
1577  Args    :
1578
1579 =cut
1580
1581
1582 sub mcdonald_kreitman_counts {
1583     my ($self,$Npoly,$Nfix,$Spoly,$Sfix) = @_;
1584     if( $has_twotailed ) {
1585         return &Text::NSP::Measures::2D::Fisher2::twotailed::calculateStatistic
1586             (n11=>$Npoly,
1587              n1p=>$Npoly+$Spoly,
1588              np1=>$Npoly+$Nfix,
1589              npp=>$Npoly+$Nfix+$Spoly+$Sfix);
1590     } else {
1591         $self->warn("cannot call mcdonald_kreitman_counts because no Fisher's exact is available - install Text::NSP::Measures::2D::Fisher2::twotailed");
1592         return 0;
1593     }
1594 }
1595
1596
1597 1;