Bio/PopGen/Statistics.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::PopGen::Statistics
   4 #
   5 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
   6 #
   7 # Copyright Jason Stajich
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::PopGen::Statistics - Population Genetics statistical tests
  16
  17 =head1 SYNOPSIS
  18
  19   use Bio::PopGen::Statistics;
  20   use Bio::AlignIO;
  21   use Bio::PopGen::IO;
  22   use Bio::PopGen::Simulation::Coalescent;
  23
  24   my $sim = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 12);
  25
  26   my $tree = $sim->next_tree;
  27
  28   $sim->add_Mutations($tree,20);
  29
  30   my $stats = Bio::PopGen::Statistics->new();
  31   my $individuals = [ $tree->get_leaf_nodes];
  32   my $pi = $stats->pi($individuals);
  33   my $D  = $stats->tajima_D($individuals);
  34
  35   # Alternatively to do this on input data from
  36   # See the tests in t/PopGen.t for more examples
  37   my $parser = Bio::PopGen::IO->new(-format => 'prettybase',
  38                                    -file   => 't/data/popstats.prettybase');
  39   my $pop = $parser->next_population;
  40   # Note that you can also call the stats as a class method if you like
  41   # the only reason to instantiate it (as above) is if you want
  42   # to set the verbosity for debugging
  43   $pi     = Bio::PopGen::Statistics->pi($pop);
  44   $theta  = Bio::PopGen::Statistics->theta($pop);
  45
  46   # Pi and Theta also take additional arguments,
  47   # see the documentation for more information
  48
  49   use Bio::PopGen::Utilities;
  50   use Bio::AlignIO;
  51
  52   my $in = Bio::AlignIO->new(-file   => 't/data/t7.aln',
  53                             -format => 'clustalw');
  54   my $aln = $in->next_aln;
  55   # get a population, each sequence is an individual and
  56   # for the default case, every site which is not monomorphic
  57   # is a 'marker'.  Each individual will have a 'genotype' for the
  58   # site which will be the specific base in the alignment at that
  59   # site
  60
  61   my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
  62
  63
  64 =head1 DESCRIPTION
  65
  66 This object is intended to provide implementations some standard
  67 population genetics statistics about alleles in populations.
  68
  69 This module was previously named Bio::Tree::Statistics.
  70
  71 This object is a place to accumulate routines for calculating various
  72 statistics from the coalescent simulation, marker/allele, or from
  73 aligned sequence data given that you can calculate alleles, number of
  74 segregating sites.
  75
  76 Currently implemented:
  77  Fu and Li's D    (fu_and_li_D)
  78  Fu and Li's D*   (fu_and_li_D_star)
  79  Fu and Li's F    (fu_and_li_F)
  80  Fu and Li's F*   (fu_and_li_F_star)
  81  Tajima's D       (tajima_D)
  82  Watterson's theta (theta)
  83  pi               (pi) - number of pairwise differences
  84  composite_LD     (composite_LD)
  85  McDonald-Kreitman (mcdonald_kreitman or MK)
  86
  87 Count based methods also exist in case you have already calculated the
  88 key statistics (seg sites, num individuals, etc) and just want to
  89 compute the statistic.
  90
  91 In all cases where a the method expects an arrayref of
  92 L<Bio::PopGen::IndividualI> objects and L<Bio::PopGen::PopulationI>
  93 object will also work.
  94
  95 =head2 REFERENCES
  96
  97 Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of
  98 Mutations." Genetics 133:693-709.
  99
 100 Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples
 101 from a Population." Genetics 143:557-570.
 102
 103 McDonald J, Kreitman M.
 104
 105 Tajima F. (1989) "Statistical method for testing the neutral mutation
 106 hypothesis by DNA polymorphism." Genetics 123:585-595.
 107
 108
 109 =head2 CITING THIS WORK
 110
 111 Please see this reference for use of this implementation.
 112
 113 Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73.
 114
 115 If you use these Bio::PopGen modules please cite the Bioperl
 116 publication (see FAQ) and the above reference.
 117
 118
 119 =head1 FEEDBACK
 120
 121 =head2 Mailing Lists
 122
 123 User feedback is an integral part of the evolution of this and other
 124 Bioperl modules. Send your comments and suggestions preferably to
 125 the Bioperl mailing list.  Your participation is much appreciated.
 126
 127   bioperl-l@bioperl.org                  - General discussion
 128   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 129
 130 =head2 Reporting Bugs
 131
 132 Report bugs to the Bioperl bug tracking system to help us keep track
 133 of the bugs and their resolution. Bug reports can be submitted via
 134 the web:
 135
 136   http://bugzilla.open-bio.org/
 137
 138 =head1 AUTHOR - Jason Stajich, Matthew Hahn
 139
 140 Email jason-at-bioperl-dot-org
 141 Email matthew-dot-hahn-at-duke-dot-edu
 142
 143 McDonald-Kreitman implementation based on work by Alisha Holloway at
 144 UC Davis.
 145
 146
 147 =head1 APPENDIX
 148
 149 The rest of the documentation details each of the object methods.
 150 Internal methods are usually preceded with a _
 151
 152 =cut
 153
 154
 155 # Let the code begin...
 156
 157
 158 package Bio::PopGen::Statistics;
 159 use strict;
 160 use constant {
 161     in_label => 'ingroup',
 162     out_label => 'outgroup',
 163     non_syn   => 'non_synonymous',
 164     syn       => 'synonymous',
 165     default_codon_table => 1, # Standard Codon table
 166 };
 167
 168 use Bio::MolEvol::CodonModel;
 169 use List::Util qw(sum);
 170
 171 use base qw(Bio::Root::Root);
 172 our $codon_table => default_codon_table;
 173 our $has_twotailed => 0;
 174 BEGIN {
 175     eval { require Text::NSP::Measures::2D::Fisher2::twotailed };
 176     if( $@ ) { $has_twotailed = 0; }
 177     else { $has_twotailed = 1; }
 178 }
 179
 180
 181
 182
 183
 184
 185 =head2 new
 186
 187  Title   : new
 188  Usage   : my $obj = Bio::PopGen::Statistics->new();
 189  Function: Builds a new Bio::PopGen::Statistics object
 190  Returns : an instance of Bio::PopGen::Statistics
 191  Args    : none
 192
 193
 194 =cut
 195
 196
 197 =head2 fu_and_li_D
 198
 199  Title   : fu_and_li_D
 200  Usage   : my $D = $statistics->fu_and_li_D(\@ingroup,\@outgroup);
 201             OR
 202            my $D = $statistics->fu_and_li_D(\@ingroup,$extmutations);
 203  Function: Fu and Li D statistic for a list of individuals
 204            given an outgroup and the number of external mutations
 205            (either provided or calculated from list of outgroup individuals)
 206  Returns : decimal
 207  Args    : $individuals - array reference which contains ingroup individuals
 208            (L<Bio::PopGen::Individual> or derived classes)
 209            $extmutations - number of external mutations OR
 210            arrayref of outgroup individuals
 211
 212 =cut
 213
 214 sub fu_and_li_D {
 215     my ($self,$ingroup,$outgroup) = @_;
 216
 217     my ($seg_sites,$n,$ancestral,$derived) = (0,0,0,0);
 218     if( ref($ingroup) =~ /ARRAY/i ) {
 219         $n = scalar @$ingroup;
 220         # pi - all pairwise differences
 221         $seg_sites   = $self->segregating_sites_count($ingroup);
 222     } elsif( ref($ingroup) &&
 223              $ingroup->isa('Bio::PopGen::PopulationI')) {
 224         $n = $ingroup->get_number_individuals;
 225         $seg_sites   = $self->segregating_sites_count($ingroup);
 226     } else {
 227         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D");
 228         return 0;
 229     }
 230
 231     if( $seg_sites <= 0 ) {
 232         $self->warn("mutation total was not > 0, cannot calculate a Fu and Li D");
 233         return 0;
 234     }
 235
 236     if( ! defined $outgroup ) {
 237         $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
 238         return 0;
 239     } elsif( ref($outgroup) ) {
 240         ($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup);
 241         $ancestral = 0 unless defined $ancestral;
 242     } else {
 243         $ancestral = $outgroup;
 244     }
 245
 246     return $self->fu_and_li_D_counts($n,$seg_sites,
 247                                      $ancestral,$derived);
 248 }
 249
 250 =head2 fu_and_li_D_counts
 251
 252  Title   : fu_li_D_counts
 253  Usage   : my $D = $statistics->fu_and_li_D_counts($samps,$sites,
 254                                                    $external);
 255  Function: Fu and Li D statistic for the raw counts of the number
 256            of samples, sites, external and internal mutations
 257  Returns : decimal number
 258  Args    : number of samples (N)
 259            number of segregating sites (n)
 260            number of external mutations (n_e)
 261
 262 =cut
 263
 264
 265 sub fu_and_li_D_counts {
 266     my ($self,$n,$seg_sites, $external_mut) = @_;
 267     my $a_n = 0;
 268     for(my $k= 1; $k < $n; $k++ ) {
 269         $a_n += ( 1 / $k );
 270     }
 271     my $b = 0;
 272     for(my $k= 1; $k < $n; $k++ ) {
 273         $b += ( 1 / $k**2 );
 274     }
 275
 276     my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
 277                   ( ( $n - 1) * ( $n - 2 ) ) );
 278
 279     my $v = 1 + ( ( $a_n**2 / ( $b + $a_n**2 ) ) *
 280                   ( $c - ( ( $n + 1) /
 281                            ( $n - 1) ) ));
 282
 283     my $u = $a_n - 1 - $v;
 284
 285     ($seg_sites - $a_n * $external_mut) /
 286         sqrt( ($u * $seg_sites) + ($v * $seg_sites*$seg_sites));
 287
 288 }
 289
 290
 291 =head2 fu_and_li_D_star
 292
 293  Title   : fu_and_li_D_star
 294  Usage   : my $D = $statistics->fu_an_li_D_star(\@individuals);
 295  Function: Fu and Li's D* statistic for a set of samples
 296             Without an outgroup
 297  Returns : decimal number
 298  Args    : array ref of L<Bio::PopGen::IndividualI> objects
 299            OR
 300            L<Bio::PopGen::PopulationI> object
 301
 302 =cut
 303
 304 #'
 305 # fu_and_li_D*
 306
 307 sub fu_and_li_D_star {
 308     my ($self,$individuals) = @_;
 309
 310     my ($seg_sites,$n,$singletons);
 311     if( ref($individuals) =~ /ARRAY/i ) {
 312         $n = scalar @$individuals;
 313         $seg_sites   = $self->segregating_sites_count($individuals);
 314         $singletons  = $self->singleton_count($individuals);
 315     } elsif( ref($individuals) &&
 316              $individuals->isa('Bio::PopGen::PopulationI')) {
 317         my $pop = $individuals;
 318         $n = $pop->get_number_individuals;
 319         $seg_sites   = $self->segregating_sites_count($pop);
 320         $singletons  = $self->singleton_count($pop);
 321     } else {
 322         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D_star");
 323         return 0;
 324     }
 325
 326     return $self->fu_and_li_D_star_counts($n,$seg_sites, $singletons);
 327 }
 328
 329 =head2 fu_and_li_D_star_counts
 330
 331  Title   : fu_li_D_star_counts
 332  Usage   : my $D = $statistics->fu_and_li_D_star_counts($samps,$sites,
 333                                                         $singletons);
 334
 335  Function: Fu and Li D statistic for the raw counts of the number
 336            of samples, sites, external and internal mutations
 337  Returns : decimal number
 338  Args    : number of samples (N)
 339            number of segregating sites (n)
 340            singletons (n_s)
 341
 342 =cut
 343
 344
 345 sub fu_and_li_D_star_counts {
 346     my ($self,$n,$seg_sites, $singletons) = @_;
 347     my $a_n;
 348     for(my $k = 1; $k < $n; $k++ ) {
 349         $a_n += ( 1 / $k );
 350     }
 351
 352     my $a1 = $a_n + 1 / $n;
 353
 354     my $b = 0;
 355     for(my $k= 1; $k < $n; $k++ ) {
 356         $b += ( 1 / $k**2 );
 357     }
 358
 359     my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
 360                   ( ( $n - 1) * ( $n - 2 ) ) );
 361
 362     my $d = $c + ($n -2) / ($n - 1)**2 +
 363         2 / ($n -1) *
 364         ( 1.5 - ( (2*$a1 - 3) / ($n -2) ) -
 365           1 / $n );
 366
 367     my $v_star = ( ( ($n/($n-1) )**2)*$b + (($a_n**2)*$d) -
 368                  (2*( ($n*$a_n*($a_n+1)) )/(($n-1)**2)) )  /
 369                    (($a_n**2) + $b);
 370
 371     my $u_star = ( ($n/($n-1))*
 372                    ($a_n - ($n/
 373                           ($n-1)))) - $v_star;
 374
 375
 376     return (($n / ($n - 1)) * $seg_sites -
 377             $a_n * $singletons) /
 378             sqrt( ($u_star * $seg_sites) + ($v_star * $seg_sites*$seg_sites));
 379 }
 380
 381
 382 =head2 fu_and_li_F
 383
 384  Title   : fu_and_li_F
 385  Usage   : my $F = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts);
 386  Function: Calculate Fu and Li's F on an ingroup with either the set of
 387            outgroup individuals, or the number of external mutations
 388  Returns : decimal number
 389  Args    : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
 390            OR a L<Bio::PopGen::PopulationI> object
 391            number of external mutations OR list of individuals for the outgroup
 392
 393 =cut
 394
 395 #'
 396
 397 sub fu_and_li_F {
 398     my ($self,$ingroup,$outgroup) = @_;
 399     my ($seg_sites,$pi,$n,$external,$internal);
 400     if( ref($ingroup) =~ /ARRAY/i ) {
 401         $n = scalar @$ingroup;
 402         # pi - all pairwise differences
 403         $pi          = $self->pi($ingroup);
 404         $seg_sites   = $self->segregating_sites_count($ingroup);
 405     } elsif( ref($ingroup) &&
 406              $ingroup->isa('Bio::PopGen::PopulationI')) {
 407         $n = $ingroup->get_number_individuals;
 408         $pi          = $self->pi($ingroup);
 409         $seg_sites   = $self->segregating_sites_count($ingroup);
 410     } else {
 411         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to Fu and Li's F");
 412         return 0;
 413     }
 414
 415     if( ! defined $outgroup ) {
 416         $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
 417         return 0;
 418     } elsif( ref($outgroup) ) {
 419         ($external,$internal) = $self->derived_mutations($ingroup,$outgroup);
 420     } else {
 421         $external = $outgroup;
 422     }
 423     $self->fu_and_li_F_counts($n,$pi,$seg_sites,$external);
 424 }
 425
 426 =head2 fu_and_li_F_counts
 427
 428  Title   : fu_li_F_counts
 429  Usage   : my $F = $statistics->fu_and_li_F_counts($samps,$pi,
 430                                                    $sites,
 431                                                    $external);
 432  Function: Fu and Li F statistic for the raw counts of the number
 433            of samples, sites, external and internal mutations
 434  Returns : decimal number
 435  Args    : number of samples (N)
 436            average pairwise differences (pi)
 437            number of segregating sites (n)
 438            external mutations (n_e)
 439
 440 =cut
 441
 442
 443 sub fu_and_li_F_counts {
 444     my ($self,$n,$pi,$seg_sites, $external) = @_;
 445     my $a_n = 0;
 446     for(my $k= 1; $k < $n; $k++ ) {
 447         $a_n += ( 1 / $k );
 448     }
 449
 450     my $a1 = $a_n + (1 / $n );
 451
 452     my $b = 0;
 453     for(my $k= 1; $k < $n; $k++ ) {
 454         $b += ( 1 / $k**2 );
 455     }
 456
 457     my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
 458                   ( ( $n - 1) * ( $n - 2 ) ) );
 459
 460     my $v_F = ( $c + ( (2*(($n**2)+$n+3)) /
 461                        ( (9*$n)*($n-1) ) ) -
 462                 (2/($n-1)) ) / ( ($a_n**2)+$b );
 463
 464     my $u_F = ( 1 + ( ($n+1)/(3*($n-1)) )-
 465                 ( 4*( ($n+1)/(($n-1)**2) ))*
 466                 ($a1 - ((2*$n)/($n+1))) ) /
 467                 $a_n - $v_F;
 468
 469     # warn("$v_F vf $u_F uf n = $n\n");
 470     my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) +
 471                                         ($v_F*($seg_sites**2)) ) );
 472
 473     return $F;
 474 }
 475
 476 =head2 fu_and_li_F_star
 477
 478  Title   : fu_and_li_F_star
 479  Usage   : my $F = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup);
 480  Function: Calculate Fu and Li's F* on an ingroup without an outgroup
 481            It uses count of singleton alleles instead
 482  Returns : decimal number
 483  Args    : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
 484            OR
 485            L<Bio::PopGen::PopulationI> object
 486
 487 =cut
 488
 489 #' keep my emacs happy
 490
 491 sub fu_and_li_F_star {
 492     my ($self,$individuals) = @_;
 493
 494     my ($seg_sites,$pi,$n,$singletons);
 495     if( ref($individuals) =~ /ARRAY/i ) {
 496         $n = scalar @$individuals;
 497         # pi - all pairwise differences
 498         $pi          = $self->pi($individuals);
 499         $seg_sites   = $self->segregating_sites_count($individuals);
 500         $singletons  = $self->singleton_count($individuals);
 501     } elsif( ref($individuals) &&
 502              $individuals->isa('Bio::PopGen::PopulationI')) {
 503         my $pop = $individuals;
 504         $n = $pop->get_number_individuals;
 505         $pi          = $self->pi($pop);
 506         $seg_sites   = $self->segregating_sites_count($pop);
 507         $singletons  = $self->singleton_count($pop);
 508     } else {
 509         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_F_star");
 510         return 0;
 511     }
 512     return $self->fu_and_li_F_star_counts($n,
 513                                           $pi,
 514                                           $seg_sites,
 515                                           $singletons);
 516 }
 517
 518 =head2 fu_and_li_F_star_counts
 519
 520  Title   : fu_li_F_star_counts
 521  Usage   : my $F = $statistics->fu_and_li_F_star_counts($samps,
 522                                                    $pi,$sites,
 523                                                    $singletons);
 524  Function: Fu and Li F statistic for the raw counts of the number
 525            of samples, sites, external and internal mutations
 526  Returns : decimal number
 527  Args    : number of samples (N)
 528            average pairwise differences (pi)
 529            number of segregating sites (n)
 530            singleton  mutations (n_s)
 531
 532 =cut
 533
 534
 535 sub fu_and_li_F_star_counts {
 536     my ($self,$n,$pi,$seg_sites, $singletons) = @_;
 537     if( $n <= 1 ) {
 538         $self->warn("N must be > 1\n");
 539         return;
 540     }
 541     if( $n == 2) {
 542         return 0;
 543     }
 544
 545     my $a_n = 0;
 546
 547
 548     my $b = 0;
 549     for(my $k= 1; $k < $n; $k++ ) {
 550         $b += (1 / ($k**2));
 551         $a_n += ( 1 / $k );     # Eq (2)
 552     }
 553     my $a1 = $a_n + (1 / $n );
 554
 555     # warn("a_n is $a_n a1 is $a1 n is $n b is $b\n");
 556
 557     # From Simonsen et al (1995) instead of Fu and Li 1993
 558     my $v_F_star = ( (( 2 * $n ** 3 + 110 * $n**2 - (255 * $n) + 153)/
 559                       (9 * ($n ** 2) * ( $n - 1))) +
 560                      ((2 * ($n - 1) * $a_n ) / $n ** 2) -
 561                      (8 * $b / $n) ) /
 562                      ( ($a_n ** 2) + $b );
 563
 564     my $u_F_star = ((( (4* ($n**2)) + (19 * $n) + 3 - (12 * ($n + 1)* $a1)) /
 565                     (3 * $n * ( $n - 1))) / $a_n) - $v_F_star;
 566
 567     # warn("vf* = $v_F_star uf* = $u_F_star n = $n\n");
 568     my $F_star = ( $pi - ($singletons*( ( $n-1) / $n)) ) /
 569         sqrt ( $u_F_star*$seg_sites + $v_F_star*$seg_sites**2);
 570     return $F_star;
 571 }
 572
 573 =head2 tajima_D
 574
 575  Title   : tajima_D
 576  Usage   : my $D = Bio::PopGen::Statistics->tajima_D(\@samples);
 577  Function: Calculate Tajima's D on a set of samples
 578  Returns : decimal number
 579  Args    : array ref of L<Bio::PopGen::IndividualI> objects
 580            OR
 581            L<Bio::PopGen::PopulationI> object
 582
 583
 584 =cut
 585
 586 #'
 587
 588 sub tajima_D {
 589     my ($self,$individuals) = @_;
 590     my ($seg_sites,$pi,$n);
 591
 592     if( ref($individuals) =~ /ARRAY/i ) {
 593         $n = scalar @$individuals;
 594         # pi - all pairwise differences
 595         $pi          = $self->pi($individuals);
 596         $seg_sites = $self->segregating_sites_count($individuals);
 597
 598     } elsif( ref($individuals) &&
 599              $individuals->isa('Bio::PopGen::PopulationI')) {
 600         my $pop = $individuals;
 601         $n = $pop->get_number_individuals;
 602         $pi          = $self->pi($pop);
 603         $seg_sites = $self->segregating_sites_count($pop);
 604     } else {
 605         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
 606         return 0;
 607     }
 608     $self->tajima_D_counts($n,$seg_sites,$pi);
 609 }
 610
 611 =head2 tajima_D_counts
 612
 613  Title   : tajima_D_counts
 614  Usage   : my $D = $statistics->tajima_D_counts($samps,$sites,$pi);
 615  Function: Tajima's D statistic for the raw counts of the number
 616            of samples, sites, and avg pairwise distances (pi)
 617  Returns : decimal number
 618  Args    : number of samples (N)
 619            number of segregating sites (n)
 620            average pairwise differences (pi)
 621
 622 =cut
 623
 624 #'
 625
 626 sub tajima_D_counts {
 627     my ($self,$n,$seg_sites,$pi) = @_;
 628     my $a1 = 0;
 629     for(my $k= 1; $k < $n; $k++ ) {
 630         $a1 += ( 1 / $k );
 631     }
 632
 633      my $a2 = 0;
 634      for(my $k= 1; $k < $n; $k++ ) {
 635          $a2 += ( 1 / $k**2 );
 636      }
 637
 638     my $b1 = ( $n + 1 ) / ( 3* ( $n - 1) );
 639     my $b2 = ( 2 * ( $n ** 2 + $n + 3) ) /
 640              ( ( 9 * $n) * ( $n - 1) );
 641     my $c1 = $b1 - ( 1 / $a1 );
 642     my $c2 = $b2 - ( ( $n + 2 ) /
 643                      ( $a1 * $n))+( $a2 / $a1 ** 2);
 644     my $e1 = $c1 / $a1;
 645     my $e2 = $c2 / ( $a1**2 + $a2 );
 646
 647     my $denom = sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1)));
 648     return undef if $denom == 0;
 649     my $D = ( $pi - ( $seg_sites / $a1 ) ) / $denom;
 650     return $D;
 651 }
 652
 653
 654 =head2 pi
 655
 656  Title   : pi
 657  Usage   : my $pi = Bio::PopGen::Statistics->pi(\@inds)
 658  Function: Calculate pi (average number of pairwise differences) given
 659            a list of individuals which have the same number of markers
 660            (also called sites) as available from the get_Genotypes()
 661            call in L<Bio::PopGen::IndividualI>
 662  Returns : decimal number
 663  Args    : Arg1= array ref of L<Bio::PopGen::IndividualI> objects
 664              which have markers/mutations.  We expect all individuals to
 665              have a marker - we will deal with missing data as a special case.
 666            OR
 667            Arg1= L<Bio::PopGen::PopulationI> object.  In the event that
 668                  only allele frequency data is available, storing it in
 669                  Population object will make this available.
 670            num sites [optional], an optional second argument (integer)
 671              which is the number of sites, then pi returned is pi/site.
 672
 673 =cut
 674
 675 sub pi {
 676     my ($self,$individuals,$numsites) = @_;
 677     my (%data,%marker_total,@marker_names,$n);
 678
 679     if( ref($individuals) =~ /ARRAY/i ) {
 680         # one possible argument is an arrayref of Bio::PopGen::IndividualI objs
 681         @marker_names = $individuals->[0]->get_marker_names;
 682         $n = scalar @$individuals;
 683
 684         # Here we are calculating the allele frequencies
 685         foreach my $ind ( @$individuals ) {
 686             if( ! $ind->isa('Bio::PopGen::IndividualI') ) {
 687                 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n");
 688                 return 0;
 689             }
 690             foreach my $m ( @marker_names ) {
 691                 foreach my $allele (map { $_->get_Alleles}
 692                                     $ind->get_Genotypes($m) ) {
 693                     $data{$m}->{$allele}++;
 694                     $marker_total{$m}++;
 695                 }
 696             }
 697         }
 698 #       while( my ($marker,$count) =  each %marker_total ) {
 699 #           foreach my $c ( values %{$data{$marker}} ) {
 700 #               $c /= $count;
 701 #           }
 702 #       }
 703         # %data will contain allele frequencies for each marker, allele
 704     } elsif( ref($individuals) &&
 705              $individuals->isa('Bio::PopGen::PopulationI') ) {
 706         my $pop = $individuals;
 707         $n = $pop->get_number_individuals;
 708         foreach my $marker( $pop->get_Markers ) {
 709             push @marker_names, $marker->name;
 710             #$data{$marker->name} = {$marker->get_Allele_Frequencies};
 711             my @genotypes = $pop->get_Genotypes(-marker => $marker->name);
 712             for my $al ( map { $_->get_Alleles} @genotypes ) {
 713               $data{$marker->name}->{$al}++;
 714               $marker_total{$marker->name}++;
 715            }
 716         }
 717     } else {
 718         $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi");
 719     }
 720     # based on Kevin Thornton's code:
 721     # http://molpopgen.org/software/libsequence/doc/html/PolySNP_8cc-source.html#l00152
 722     # For now we assume that all individuals have the same markers
 723     my ($diffcount,$totalcompare) = (0,0);
 724     my $pi = 0;
 725     while ( my ($marker,$markerdat) = each %data ) {
 726       my $sampsize = $marker_total{$marker};
 727       my $ssh = 0;
 728       my @alleles = keys %$markerdat;
 729       if ( $sampsize > 1 ) {
 730         my $denom = $sampsize * ($sampsize - 1.0);
 731         foreach my $al ( @alleles ) {
 732           $ssh += ($markerdat->{$al} * ($markerdat->{$al} - 1)) / $denom;
 733         }
 734         $pi += 1.0 - $ssh;
 735       }
 736     }
 737     $self->debug( "pi=$pi\n");
 738     if( $numsites ) {
 739         return $pi / $numsites;
 740     } else {
 741         return $pi;
 742     }
 743 }
 744
 745
 746 =head2 theta
 747
 748  Title   : theta
 749  Usage   : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites);
 750  Function: Calculates Watterson's theta from the sample size
 751            and the number of segregating sites.
 752            Providing the third parameter, total number of sites will
 753            return theta per site.
 754            This is also known as K-hat = K / a_n
 755  Returns : decimal number
 756  Args    : sample size (integer),
 757            num segregating sites (integer)
 758            total sites (integer) [optional] (to calculate theta per site)
 759            OR
 760            provide an arrayref of the L<Bio::PopGen::IndividualI> objects
 761            total sites (integer) [optional] (to calculate theta per site)
 762            OR
 763            provide an L<Bio::PopGen::PopulationI> object
 764            total sites (integer)[optional]
 765
 766 =cut
 767
 768 #'
 769
 770 sub theta {
 771     my $self = shift;
 772     my ( $n, $seg_sites,$totalsites) = @_;
 773     if( ref($n) =~ /ARRAY/i ) {
 774         my $samps = $n;
 775         $totalsites = $seg_sites; # only 2 arguments if one is an array
 776         my %data;
 777         my @marker_names = $samps->[0]->get_marker_names;
 778         # we need to calculate number of polymorphic sites
 779         $seg_sites = $self->segregating_sites_count($samps);
 780         $n = scalar @$samps;
 781
 782     } elsif(ref($n) &&
 783             $n->isa('Bio::PopGen::PopulationI') ) {
 784         # This will handle the case when we pass in a PopulationI object
 785         my $pop = $n;
 786         $totalsites = $seg_sites; # shift the arguments over by one
 787         $n = $pop->haploid_population->get_number_individuals;
 788         $seg_sites = $self->segregating_sites_count($pop);
 789     }
 790     my $a1 = 0;
 791     for(my $k= 1; $k < $n; $k++ ) {
 792         $a1 += ( 1 / $k );
 793     }
 794     if( $totalsites ) { # 0 and undef are the same can't divide by them
 795         $seg_sites /= $totalsites;
 796     }
 797     if( $a1 == 0 ) {
 798         return 0;
 799     }
 800     return $seg_sites / $a1;
 801 }
 802
 803 =head2 singleton_count
 804
 805  Title   : singleton_count
 806  Usage   : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds)
 807  Function: Calculate the number of mutations/alleles which only occur once in
 808            a list of individuals for all sites/markers
 809  Returns : (integer) number of alleles which only occur once (integer)
 810  Args    : arrayref of L<Bio::PopGen::IndividualI> objects
 811            OR
 812            L<Bio::PopGen::PopulationI> object
 813
 814 =cut
 815
 816 sub singleton_count {
 817     my ($self,$individuals) = @_;
 818
 819     my @inds;
 820     if( ref($individuals) =~ /ARRAY/ ) {
 821         @inds = @$individuals;
 822     } elsif( ref($individuals) &&
 823              $individuals->isa('Bio::PopGen::PopulationI') ) {
 824         my $pop = $individuals;
 825         @inds = $pop->get_Individuals();
 826         unless( @inds ) {
 827             $self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies");
 828             return 0;
 829         }
 830     } else {
 831         $self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects");
 832         return 0;
 833     }
 834     # find number of sites where a particular allele is only seen once
 835
 836     my ($singleton_allele_ct,%sites) = (0);
 837     # first collect all the alleles into a hash structure
 838
 839     foreach my $n ( @inds ) {
 840         if( ! $n->isa('Bio::PopGen::IndividualI') ) {
 841             $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
 842             return 0;
 843         }
 844         foreach my $g ( $n->get_Genotypes ) {
 845             my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
 846             foreach my $allele (@alleles ) {
 847                 $sites{$nm}->{$allele}++;
 848             }
 849         }
 850     }
 851     foreach my $site ( values %sites ) { # don't really care what the name is
 852         foreach my $allelect ( values %$site ) { #
 853             # find the sites which have an allele with only 1 copy
 854             $singleton_allele_ct++ if( $allelect == 1 );
 855         }
 856     }
 857     return $singleton_allele_ct;
 858 }
 859
 860 # Yes I know that singleton_count and segregating_sites_count are
 861 # basically processing the same data so calling them both is
 862 # redundant, something I want to fix later but want to make things
 863 # correct and simple first
 864
 865 =head2 segregating_sites_count
 866
 867  Title   : segregating_sites_count
 868  Usage   : my $segsites = Bio::PopGen::Statistics->segregating_sites_count
 869  Function: Gets the number of segregating sites (number of polymorphic sites)
 870  Returns : (integer) number of segregating sites
 871  Args    : arrayref of L<Bio::PopGen::IndividualI> objects
 872            OR
 873            L<Bio::PopGen::PopulationI> object
 874
 875 =cut
 876
 877 # perhaps we'll change this in the future
 878 # to return the actual segregating sites
 879 # so one can use this to pull in the names of those sites.
 880 # Would be trivial if it is useful.
 881
 882 sub segregating_sites_count {
 883    my ($self,$individuals) = @_;
 884    my $type = ref($individuals);
 885    my $seg_sites = 0;
 886    if( $type =~ /ARRAY/i ) {
 887        my %sites;
 888        foreach my $n ( @$individuals ) {
 889            if( ! $n->isa('Bio::PopGen::IndividualI') ) {
 890                $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
 891                return 0;
 892            }
 893            foreach my $g ( $n->get_Genotypes ) {
 894                my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
 895                foreach my $allele (@alleles ) {
 896                    $sites{$nm}->{$allele}++;
 897                }
 898            }
 899        }
 900        foreach my $site ( values %sites ) { # use values b/c we don't
 901                                             # really care what the name is
 902            # find the sites which >1 allele
 903            $seg_sites++ if( keys %$site > 1 );
 904        }
 905    } elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) {
 906        foreach my $marker ( $individuals->haploid_population->get_Markers ) {
 907            my @alleles = $marker->get_Alleles;
 908            $seg_sites++ if ( scalar @alleles > 1 );
 909        }
 910    } else {
 911        $self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects");
 912        return 0;
 913    }
 914    return $seg_sites;
 915 }
 916
 917
 918 =head2 heterozygosity
 919
 920  Title   : heterozygosity
 921  Usage   : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1);
 922  Function: Calculate the heterozgosity for a sample set for a set of alleles
 923  Returns : decimal number
 924  Args    : sample size (integer)
 925            frequency of one allele (fraction - must be less than 1)
 926            [optional] frequency of another allele - this is only needed
 927                       in a non-binary allele system
 928
 929 Note     : p^2 + 2pq + q^2
 930
 931 =cut
 932
 933
 934 sub heterozygosity {
 935     my ($self,$samp_size, $freq1,$freq2) = @_;
 936     if( ! $freq2 ) { $freq2 = 1 - $freq1 }
 937     if( $freq1 > 1 || $freq2 > 1 ) {
 938         $self->warn("heterozygosity expects frequencies to be less than 1");
 939     }
 940     my $sum = ($freq1**2) + (($freq2)**2);
 941     my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ;
 942     return $h;
 943 }
 944
 945
 946 =head2 derived_mutations
 947
 948  Title   : derived_mutations
 949  Usage   : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup);
 950  Function: Calculate the number of alleles or (mutations) which are ancestral
 951            and the number which are derived (occurred only on the tips)
 952  Returns : array of 2 items - number of external and internal derived
 953            mutation
 954  Args    : ingroup - L<Bio::PopGen::IndividualI>s arrayref OR
 955                      L<Bio::PopGen::PopulationI>
 956            outgroup- L<Bio::PopGen::IndividualI>s arrayref OR
 957                      L<Bio::PopGen::PopulationI> OR
 958                      a single L<Bio::PopGen::IndividualI>
 959
 960 =cut
 961
 962 sub derived_mutations {
 963    my ($self,$ingroup,$outgroup) = @_;
 964    my (%indata,%outdata,@marker_names);
 965
 966    # basically we have to do some type checking
 967    # if that perl were typed...
 968    my ($itype,$otype) = (ref($ingroup),ref($outgroup));
 969
 970    return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums
 971                                       # are already the value we
 972                                       # are searching for
 973    # pick apart the ingroup
 974    # get the data
 975    if( ref($ingroup) =~ /ARRAY/i ) {
 976        if( ! ref($ingroup->[0]) ||
 977            ! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) {
 978            $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations");
 979            return 0;
 980        }
 981        # we assume that all individuals have the same markers
 982        # i.e. that they are aligned
 983        @marker_names = $ingroup->[0]->get_marker_names;
 984        for my $ind ( @$ingroup ) {
 985            for my $m ( @marker_names ) {
 986                for my $allele ( map { $_->get_Alleles }
 987                                     $ind->get_Genotypes($m) ) {
 988                    $indata{$m}->{$allele}++;
 989                }
 990            }
 991        }
 992    } elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) {
 993        @marker_names = $ingroup->get_marker_names;
 994        for my $ind ( $ingroup->haploid_population->get_Individuals() ) {
 995            for my $m ( @marker_names ) {
 996                for my $allele ( map { $_->get_Alleles}
 997                                     $ind->get_Genotypes($m) ) {
 998                    $indata{$m}->{$allele}++;
 999                }
1000            }
1001        }
1002    } else {
1003        $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations");
1004        return 0;
1005    }
1006
1007    if( $otype =~ /ARRAY/i ) {
1008        if( ! ref($outgroup->[0]) ||
1009            ! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) {
1010            $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations");
1011            return 0;
1012        }
1013        for my $ind ( @$outgroup ) {
1014            for my $m ( @marker_names ) {
1015                for my $allele ( map { $_->get_Alleles }
1016                                 $ind->get_Genotypes($m) ) {
1017                    $outdata{$m}->{$allele}++;
1018                }
1019            }
1020        }
1021
1022    } elsif( $otype->isa('Bio::PopGen::PopulationI') ) {
1023        for my $ind ( $outgroup->haploid_population->get_Individuals() ) {
1024            for my $m ( @marker_names ) {
1025                for my $allele ( map { $_->get_Alleles}
1026                                     $ind->get_Genotypes($m) ) {
1027                    $outdata{$m}->{$allele}++;
1028                }
1029            }
1030        }
1031    } else {
1032        $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations");
1033        return 0;
1034    }
1035
1036    # derived mutations are defined as
1037    #
1038    # ingroup  (G A T)
1039    # outgroup (A)
1040    # derived mutations are G and T, A is the external mutation
1041
1042    # ingroup  (A T)
1043    # outgroup (C)
1044    # derived mutations A,T no external/ancestral mutations
1045
1046    # ingroup  (G A T)
1047    # outgroup (A T)
1048    # cannot determine
1049
1050    my ($internal,$external);
1051    foreach my $marker ( @marker_names ) {
1052        my @outalleles = keys %{$outdata{$marker}};
1053        my @in_alleles = keys %{$indata{$marker}};
1054        next if( @outalleles > 1 || @in_alleles == 1);
1055        for my $allele ( @in_alleles ) {
1056            if( ! exists $outdata{$marker}->{$allele} ) {
1057                if( $indata{$marker}->{$allele} == 1 ) {
1058                    $external++;
1059                } else {
1060                    $internal++;
1061                }
1062            }
1063        }
1064    }
1065    return ($external, $internal);
1066 }
1067
1068
1069 =head2 composite_LD
1070
1071  Title   : composite_LD
1072  Usage   : %matrix = Bio::PopGen::Statistics->composite_LD($population);
1073  Function: Calculate the Linkage Disequilibrium
1074            This is for calculating LD for unphased data.
1075            Other methods will be appropriate for phased haplotype data.
1076
1077  Returns : Hash of Hashes - first key is site 1,second key is site 2
1078            and value is LD for those two sites.
1079            my $LDarrayref = $matrix{$site1}->{$site2};
1080            my ($ldval, $chisquared) = @$LDarrayref;
1081  Args    : L<Bio::PopGen::PopulationI> or arrayref of
1082            L<Bio::PopGen::IndividualI>s
1083  Reference: Weir B.S. (1996) "Genetic Data Analysis II",
1084                       Sinauer, Sunderlanm MA.
1085
1086 =cut
1087
1088 sub composite_LD {
1089     my ($self,$pop) = @_;
1090     if( ref($pop) =~ /ARRAY/i ) {
1091         if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) {
1092             $pop = Bio::PopGen::Population->new(-individuals => @$pop);
1093         } else {
1094             $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1095             return ();
1096         }
1097     } elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
1098         $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1099         return ();
1100     }
1101
1102     my @marker_names = $pop->get_marker_names;
1103     my @inds = $pop->get_Individuals;
1104     my $num_inds = scalar @inds;
1105     my (%lookup);
1106     # calculate allele frequencies for each marker from the population
1107     # use the built-in get_Marker to get the allele freqs
1108     # we still need to calculate the genotype frequencies
1109     foreach my $marker_name ( @marker_names ) {
1110         my(%allelef);
1111
1112         foreach my $ind ( @inds ) {
1113             my ($genotype) = $ind->get_Genotypes(-marker => $marker_name);
1114             if( ! defined $genotype ) {
1115                 $self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n");
1116                 next;
1117             }
1118             my @alleles  = sort $genotype->get_Alleles;
1119             next if( scalar @alleles != 2);
1120             my $genostr  = join(',', @alleles);
1121             $allelef{$alleles[0]}++;
1122             $allelef{$alleles[1]}++;
1123         }
1124
1125         # we should check for cases where there > 2 alleles or
1126         # only 1 allele and throw out those markers.
1127         my @alleles      = sort keys %allelef;
1128         my $allele_count = scalar @alleles;
1129         # test if site is polymorphic
1130         if( $allele_count != 2) {
1131             # only really warn if we're seeing multi-allele
1132             $self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2;
1133             next;               # skip this marker
1134         }
1135
1136         # Need to do something here to detect alleles which aren't
1137         # a single character
1138         if( length($alleles[0]) != 1 ||
1139             length($alleles[1]) != 1 ) {
1140             $self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");
1141             next;
1142         }
1143
1144         # fix the call for allele 1 (A or B) and
1145         # allele 2 (a or b) in terms of how we'll do the
1146         # N square from Weir p.126
1147         $self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");
1148         $lookup{$marker_name}->{'1'} = $alleles[0];
1149         $lookup{$marker_name}->{'2'} = $alleles[1];
1150     }
1151
1152     @marker_names = sort keys %lookup;
1153     my $site_count   = scalar @marker_names;
1154     # where the final data will be stored
1155     my %stats_for_sites;
1156
1157     # standard way of generating pairwise combos
1158     # LD is done by comparing all the pairwise site (marker)
1159     # combinations and keeping track of the genotype and
1160     # pairwise genotype (ie genotypes of the 2 sites) frequencies
1161     for( my $i = 0; $i < $site_count - 1; $i++ ) {
1162         my $site1 = $marker_names[$i];
1163         my (%genotypes, %total_genotype_count,
1164         %total_pairwisegeno_count,%pairwise_genotypes);
1165         for( my $j = $i+1; $j < $site_count ; $j++) {
1166
1167         my (%genotypes, %total_genotype_count,
1168             %total_pairwisegeno_count,%pairwise_genotypes);
1169
1170             my $site2 = $marker_names[$j];
1171             my (%allele_count,%allele_freqs) = (0,0);
1172             foreach my $ind ( @inds ) {
1173                 # build string of genotype at site 1
1174                 my ($genotype1) = $ind->get_Genotypes(-marker => $site1);
1175                 my @alleles1  = sort $genotype1->get_Alleles;
1176
1177                 # if an individual has only one available allele
1178                 # (has a blank or N for one of the chromosomes)
1179                 # we don't want to use it in our calculation
1180
1181                 next unless( scalar @alleles1 == 2);
1182                 my $genostr1  = join(',', @alleles1);
1183
1184                 # build string of genotype at site 2
1185                 my ($genotype2) = $ind->get_Genotypes(-marker => $site2);
1186                 my @alleles2  = sort $genotype2->get_Alleles;
1187                 my $genostr2  = join(',', @alleles2);
1188
1189                 next unless( scalar @alleles2 == 2);
1190                 for (@alleles1) {
1191                     $allele_count{$site1}++;
1192                     $allele_freqs{$site1}->{$_}++;
1193                 }
1194                 $genotypes{$site1}->{$genostr1}++;
1195                 $total_genotype_count{$site1}++;
1196
1197                 for (@alleles2) {
1198                     $allele_count{$site2}++;
1199                     $allele_freqs{$site2}->{$_}++;
1200                 }
1201                 $genotypes{$site2}->{$genostr2}++;
1202                 $total_genotype_count{$site2}++;
1203
1204                 # We are using the $site1,$site2 to signify
1205                 # a unique key
1206                 $pairwise_genotypes{"$site1,$site2"}->{"$genostr1,$genostr2"}++;
1207                 # some individuals
1208                 $total_pairwisegeno_count{"$site1,$site2"}++;
1209             }
1210             for my $site ( %allele_freqs ) {
1211                 for my $al ( keys %{ $allele_freqs{$site} } ) {
1212                     $allele_freqs{$site}->{$al} /= $allele_count{$site};
1213                 }
1214             }
1215             my $n = $total_pairwisegeno_count{"$site1,$site2"}; # number of inds
1216             # 'A' and 'B' are two loci or in our case site1 and site2
1217             my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele
1218             my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele
1219             my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele
1220             my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele
1221             # AABB
1222             my $N1genostr = join(",",( $allele1_site1, $allele1_site1,
1223                                        $allele1_site2, $allele1_site2));
1224             $self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n");
1225             # AABb
1226             my $N2genostr = join(",",( $allele1_site1, $allele1_site1,
1227                                        $allele1_site2, $allele2_site2));
1228             $self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n");
1229             # AaBB
1230             my $N4genostr = join(",",( $allele1_site1, $allele2_site1,
1231                                        $allele1_site2, $allele1_site2));
1232             $self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n");
1233             # AaBb
1234             my $N5genostr = join(",",( $allele1_site1, $allele2_site1,
1235                                        $allele1_site2, $allele2_site2));
1236             $self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n");
1237             # count of AABB in
1238             my $n1 = $pairwise_genotypes{"$site1,$site2"}->{$N1genostr} || 0;
1239             # count of AABb in
1240             my $n2 = $pairwise_genotypes{"$site1,$site2"}->{$N2genostr} || 0;
1241             # count of AaBB in
1242             my $n4 = $pairwise_genotypes{"$site1,$site2"}->{$N4genostr} || 0;
1243             # count of AaBb in
1244             my $n5 = $pairwise_genotypes{"$site1,$site2"}->{$N5genostr} || 0;
1245
1246             my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1));
1247             my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2));
1248         my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n;
1249             my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n;
1250             my $p_A  = $allele_freqs{$site1}->{$allele1_site1} || 0;    # an individual allele freq
1251             my $p_a  =  1 - $p_A;
1252
1253             my $p_B  = $allele_freqs{$site2}->{$allele1_site2} || 0;    # an individual allele freq
1254             my $p_b  =  1 - $p_B;
1255
1256             # variance of allele frequencies
1257             my $pi_A = $p_A * $p_a;
1258             my $pi_B = $p_B * $p_b;
1259
1260             # hardy weinberg
1261             my $D_A  = $p_AA - $p_A**2;
1262             my $D_B  = $p_BB - $p_B**2;
1263             my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5;
1264             $self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n");
1265
1266             my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B );
1267             $self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n");
1268             $self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n",
1269                                  $n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B));
1270
1271             my $chisquared;
1272             eval { $chisquared = ( $n * ($delta_AB**2) ) /
1273                        ( ( $pi_A + $D_A) * ( $pi_B + $D_B) );
1274                };
1275             if( $@ ) {
1276                 $self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n");
1277                 next;
1278             }
1279             # this will be an upper triangular matrix
1280             $stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared];
1281         }
1282     }
1283     return %stats_for_sites;
1284 }
1285
1286 =head2 mcdonald_kreitman
1287
1288  Title   : mcdonald_kreitman
1289  Usage   : $Fstat = mcdonald_kreitman($ingroup, $outgroup);
1290  Function: Calculates McDonald-Kreitman statistic based on a set of ingroup
1291            individuals and an outgroup by computing the number of
1292            differences at synonymous and non-synonymous sites
1293            for intraspecific comparisons and with the outgroup
1294  Returns : 2x2 table, followed by a hash reference indicating any
1295            warning messages about the status of the alleles or codons
1296  Args    : -ingroup    => L<Bio::PopGen::Population> object or
1297                           arrayref of L<Bio::PopGen::Individual>s
1298            -outgroup   => L<Bio::PopGen::Population> object or
1299                           arrayef of L<Bio::PopGen::Individual>s
1300            -polarized  => Boolean, to indicate if this should be
1301                           a polarized test. Must provide two individuals
1302                           as outgroups.
1303
1304 =cut
1305
1306 sub mcdonald_kreitman {
1307     my ($self,@args) = @_;
1308     my ($ingroup, $outgroup,$polarized) =
1309         $self->_rearrange([qw(INGROUP OUTGROUP POLARIZED)],@args);
1310     my $verbose = $self->verbose;
1311     my $outgroup_count;
1312     my $gapchar = '\-';
1313     if( ref($outgroup) =~ /ARRAY/i ) {
1314         $outgroup_count = scalar @$outgroup;
1315     } elsif( UNIVERSAL::isa($outgroup,'Bio::PopGen::PopulationI') ) {
1316         $outgroup_count = $outgroup->get_number_individuals;
1317     } else {
1318         $self->throw("Expected an ArrayRef of Individuals OR a Bio::PopGen::PopulationI");
1319     }
1320
1321     if( $polarized ) {
1322         if( $outgroup_count < 2 ) {
1323             $self->throw("Need 2 outgroups with polarized option\n");
1324         }
1325     } elsif( $outgroup_count > 1 ) {
1326         $self->warn(sprintf("%s outgroup sequences provided, but only first will be used",$outgroup_count ));
1327     } elsif( $outgroup_count == 0 ) {
1328         $self->throw("No outgroup sequence provided");
1329     }
1330
1331     my $codon_path = Bio::MolEvol::CodonModel->codon_path;
1332
1333     my (%marker_names,%unique,@inds);
1334     for my $p ( $ingroup, $outgroup)  {
1335         if( ref($p) =~ /ARRAY/i ) {
1336             push @inds, @$p;
1337         } else {
1338             push @inds, $p->get_Individuals;
1339         }
1340     }
1341     for my $i ( @inds ) {
1342         if( $unique{$i->unique_id}++ ) {
1343             $self->warn("Individual ". $i->unique_id. " is seen more than once in the ingroup or outgroup set\n");
1344         }
1345         for my $n ( $i->get_marker_names ) {
1346             $marker_names{$n}++;
1347         }
1348     }
1349
1350     my @marker_names = keys %marker_names;
1351     if( $marker_names[0] =~ /^(Site|Codon)/ ) {
1352         # sort by site or codon number and do it in
1353         # a schwartzian transformation baby!
1354         @marker_names = map { $_->[1] }
1355         sort { $a->[0] <=> $b->[0] }
1356         map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @marker_names;
1357     }
1358
1359
1360     my $num_inds = scalar @inds;
1361     my %vals = ( 'ingroup'  => $ingroup,
1362                  'outgroup' => $outgroup,
1363                  );
1364
1365     # Make the Codon Table type a parameter!
1366     my $table = Bio::Tools::CodonTable->new(-id => $codon_table);
1367     my @vt = qw(outgroup ingroup);
1368     my %changes;
1369     my %status;
1370     my %two_by_two = ( 'fixed_N' => 0,
1371                        'fixed_S' => 0,
1372                        'poly_N'  => 0,
1373                        'poly_S'  => 0);
1374
1375     for my $codon ( @marker_names ) {
1376         my (%codonvals);
1377         my %all_alleles;
1378         for my $t ( @vt ) {
1379             my $outcount = 1;
1380             for my $ind ( @{$vals{$t}} ) {
1381                 my @alleles = $ind->get_Genotypes($codon)->get_Alleles;
1382                 if( @alleles > 1 ) {
1383                     die;
1384 #                 warn("$codon $codon saw ", scalar @alleles, " for ind ", $ind->unique_id, "\n");
1385                 } else {
1386                     my ($allele) = shift @alleles;
1387                     $all_alleles{$ind->unique_id} = $allele;
1388                     my $AA = $table->translate($allele);
1389                     next if( $AA eq 'X' || $AA eq '*' || $allele =~ /N/i);
1390
1391                     my $label = $t;
1392                     if( $t eq 'outgroup' ) {
1393                         $label = $t.$outcount++;
1394                     }
1395                     $codonvals{$label}->{$allele}++;
1396                     $codonvals{all}->{$allele}++;
1397                 }
1398             }
1399         }
1400         my $total = sum ( values %{$codonvals{'ingroup'}} );
1401         next if( $total && $total < 2 ); # skip sites with < alleles
1402         # process all the seen alleles (codons)
1403         # this is a vertical slide through the alignment
1404         if( keys %{$codonvals{all}} <= 1 ) {
1405             # no changes or no VALID codons - monomorphic
1406         } else {
1407             # grab only the first outgroup codon (what to do with rest?)
1408             my ($outcodon) = keys %{$codonvals{'outgroup1'}};
1409             if( ! $outcodon ) {
1410                 $status{"no outgroup codon $codon"}++;
1411                 next;
1412             }
1413             my $out_AA = $table->translate($outcodon);
1414             my ($outcodon2) = keys %{$codonvals{'outgroup2'}};
1415             if( ($polarized && ($outcodon ne $outcodon2)) ||
1416                 $out_AA eq 'X' || $out_AA eq '*' ) {
1417                 # skip if outgroup codons are different
1418                 # (when polarized option is on)
1419                 # or skip if the outcodon is STOP or 'NNN'
1420                 if( $verbose > 0 ) {
1421                     $self->debug("skipping $out_AA and $outcodon $outcodon2\n");
1422                 }
1423                 $status{'outgroup codons different'}++;
1424                 next;
1425             }
1426
1427             # check if ingroup is actually different from outgroup -
1428             # if there are the same number of alleles when considering
1429             # ALL or just the ingroup, then there is nothing new seen
1430             # in the outgroup so it must be a shared allele (codon)
1431
1432             # so we just count how many total alleles were seen
1433             # if this is the same as the number of alleles seen for just
1434             # the ingroup then the outgroup presents no new information
1435
1436             my @ingroup_codons = keys %{$codonvals{'ingroup'}};
1437             my $diff_from_out = ! exists $codonvals{'ingroup'}->{$outcodon};
1438
1439             if( $verbose > 0 ) {
1440                 $self->debug("alleles are in: ", join(",", @ingroup_codons),
1441                              " out: ", join(",", keys %{$codonvals{outgroup1}}),
1442                              " diff_from_out=$diff_from_out\n");
1443
1444                 for my $ind ( sort keys %all_alleles ) {
1445                     $self->debug( "$ind\t$all_alleles{$ind}\n");
1446                 }
1447             }
1448             # are all the ingroup alleles the same and diferent from outgroup?
1449             # fixed differences between species
1450             if( $diff_from_out ) {
1451                 if( scalar @ingroup_codons == 1 ) {
1452                     # fixed differences
1453                     if( $outcodon =~ /^$gapchar/ ) {
1454                         $status{'outgroup codons with gaps'}++;
1455                         next;
1456                     } elsif( $ingroup_codons[0] =~ /$gapchar/) {
1457                         $status{'ingroup codons with gaps'}++;
1458                         next;
1459                     }
1460                     my $path = $codon_path->{uc $ingroup_codons[0].$outcodon};
1461                     $two_by_two{fixed_N} += $path->[0];
1462                     $two_by_two{fixed_S} += $path->[1];
1463                     if( $verbose > 0 ) {
1464                         $self->debug("ingroup is @ingroup_codons outcodon is $outcodon\n");
1465                         $self->debug("path is ",join(",",@$path),"\n");
1466                         $self->debug
1467                             (sprintf("%-15s fixeddiff - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,$ingroup_codons[0], $outcodon,$out_AA,
1468                                      @$path, map { $two_by_two{$_} }
1469                                      qw(fixed_N fixed_S poly_N poly_S)));
1470                     }
1471                 } else {
1472                     # polymorphic and all are different from outgroup
1473                     # Here we find the minimum number of NS subst
1474                     my ($Ndiff,$Sdiff) = (3,0); # most different path
1475                     for my $c ( @ingroup_codons ) {
1476                         next if( $c =~ /$gapchar/ || $outcodon =~ /$gapchar/);
1477                         my $path = $codon_path->{uc $c.$outcodon};
1478                         my ($tNdiff,$tSdiff) = @$path;
1479                         if( $path->[0] < $Ndiff ||
1480                             ($tNdiff == $Ndiff &&
1481                              $tSdiff  <= $Sdiff)) {
1482                             ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1483                         }
1484                     }
1485                     $two_by_two{fixed_N} += $Ndiff;
1486                     $two_by_two{fixed_S} += $Sdiff;
1487                     if( @ingroup_codons > 2 ) {
1488                         $status{"more than 2 ingroup codons $codon"}++;
1489                         warn("more than 2 ingroup codons (@ingroup_codons)\n");
1490                     } else {
1491                         my $path = $codon_path->{uc join('',@ingroup_codons)};
1492
1493                         $two_by_two{poly_N} += $path->[0];
1494                         $two_by_two{poly_S} += $path->[1];
1495                         if( $verbose > 0 ) {
1496                             $self->debug(sprintf("%-15s polysite_all - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,@$path, map { $two_by_two{$_} } qw(fixed_N fixed_S poly_N poly_S)));
1497                         }
1498                     }
1499                 }
1500             } else {
1501                 my %unq = map { $_ => 1 } @ingroup_codons;
1502                 delete $unq{$outcodon};
1503                 my @unique_codons = keys %unq;
1504
1505                 # calc path for diff add to poly
1506                 # Here we find the minimum number of subst bw
1507                 # codons
1508                 my ($Ndiff,$Sdiff) = (3,0); # most different path
1509                 for my $c ( @unique_codons ) {
1510                     my $path = $codon_path->{uc $c.$outcodon };
1511                     if( ! defined $path ) {
1512                         die " cannot get path for ", $c.$outcodon, "\n";
1513                     }
1514                     my ($tNdiff,$tSdiff) = @$path;
1515                     if( $path->[0] < $Ndiff ||
1516                         ($tNdiff == $Ndiff &&
1517                          $tSdiff  <= $Sdiff)) {
1518                         ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1519                     }
1520                 }
1521
1522                 if( @unique_codons == 2 ) {
1523                     my $path = $codon_path->{uc join('',@unique_codons)};
1524                     if( ! defined $path ) {
1525                         $self->throw("no path for @unique_codons\n");
1526                     }
1527                     $Ndiff += $path->[0];
1528                     $Sdiff += $path->[1];
1529                 }
1530                 $two_by_two{poly_N} += $Ndiff;
1531                 $two_by_two{poly_S} += $Sdiff;
1532                 if( $verbose > 0 ) {
1533                     $self->debug(sprintf("%-15s polysite - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,
1534                                          $Ndiff, $Sdiff, map { $two_by_two{$_} }
1535                                          qw(fixed_N fixed_S poly_N poly_S)));
1536                 }
1537             }
1538         }
1539     }
1540     return ( $two_by_two{'poly_N'},
1541              $two_by_two{'fixed_N'},
1542              $two_by_two{'poly_S'},
1543              $two_by_two{'fixed_S'},
1544              {%status});
1545
1546 }
1547
1548 *MK = \&mcdonald_kreitman;
1549
1550
1551 =head2 mcdonald_kreitman_counts
1552
1553  Title   : mcdonald_kreitman_counts
1554  Usage   : my $MK = $statistics->mcdonald_kreitman_counts(
1555
1556              N_poly -> integer of count of non-syn polymorphism
1557              N_fix  -> integer of count of non-syn fixed substitutions
1558              S_poly -> integer of count of syn polymorphism
1559              S_fix  -> integer of count of syn fixed substitutions
1560                                                           );
1561  Function:
1562  Returns : decimal number
1563  Args    :
1564
1565 =cut
1566
1567
1568 sub mcdonald_kreitman_counts {
1569     my ($self,$Npoly,$Nfix,$Spoly,$Sfix) = @_;
1570     if( $has_twotailed ) {
1571         return &Text::NSP::Measures::2D::Fisher2::twotailed::calculateStatistic
1572             (n11=>$Npoly,
1573              n1p=>$Npoly+$Spoly,
1574              np1=>$Npoly+$Nfix,
1575              npp=>$Npoly+$Nfix+$Spoly+$Sfix);
1576     } else {
1577         $self->warn("cannot call mcdonald_kreitman_counts because no Fisher's exact is available - install Text::NSP::Measures::2D::Fisher2::twotailed");
1578         return 0;
1579     }
1580 }
1581
1582
1583 1;