nexml.t: Added a missing XML::Twig requirement.
[bioperl-live.git] / Bio / PopGen / Statistics.pm
blob5dc2f44da490c9ea053c7882f611e688a56e8256
2 # BioPerl module for Bio::PopGen::Statistics
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::PopGen::Statistics - Population Genetics statistical tests
18 =head1 SYNOPSIS
20 use Bio::PopGen::Statistics;
21 use Bio::AlignIO;
22 use Bio::PopGen::IO;
23 use Bio::PopGen::Simulation::Coalescent;
25 my $sim = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 12);
27 my $tree = $sim->next_tree;
29 $sim->add_Mutations($tree,20);
31 my $stats = Bio::PopGen::Statistics->new();
32 my $individuals = [ $tree->get_leaf_nodes];
33 my $pi = $stats->pi($individuals);
34 my $D = $stats->tajima_D($individuals);
36 # Alternatively to do this on input data from
37 # See the tests in t/PopGen.t for more examples
38 my $parser = Bio::PopGen::IO->new(-format => 'prettybase',
39 -file => 't/data/popstats.prettybase');
40 my $pop = $parser->next_population;
41 # Note that you can also call the stats as a class method if you like
42 # the only reason to instantiate it (as above) is if you want
43 # to set the verbosity for debugging
44 $pi = Bio::PopGen::Statistics->pi($pop);
45 $theta = Bio::PopGen::Statistics->theta($pop);
47 # Pi and Theta also take additional arguments,
48 # see the documentation for more information
50 use Bio::PopGen::Utilities;
51 use Bio::AlignIO;
53 my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
54 -format => 'clustalw');
55 my $aln = $in->next_aln;
56 # get a population, each sequence is an individual and
57 # for the default case, every site which is not monomorphic
58 # is a 'marker'. Each individual will have a 'genotype' for the
59 # site which will be the specific base in the alignment at that
60 # site
62 my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
65 =head1 DESCRIPTION
67 This object is intended to provide implementations some standard
68 population genetics statistics about alleles in populations.
70 This module was previously named Bio::Tree::Statistics.
72 This object is a place to accumulate routines for calculating various
73 statistics from the coalescent simulation, marker/allele, or from
74 aligned sequence data given that you can calculate alleles, number of
75 segregating sites.
77 Currently implemented:
78 Fu and Li's D (fu_and_li_D)
79 Fu and Li's D* (fu_and_li_D_star)
80 Fu and Li's F (fu_and_li_F)
81 Fu and Li's F* (fu_and_li_F_star)
82 Tajima's D (tajima_D)
83 Watterson's theta (theta)
84 pi (pi) - number of pairwise differences
85 composite_LD (composite_LD)
86 McDonald-Kreitman (mcdonald_kreitman or MK)
88 Count based methods also exist in case you have already calculated the
89 key statistics (seg sites, num individuals, etc) and just want to
90 compute the statistic.
92 In all cases where a the method expects an arrayref of
93 L<Bio::PopGen::IndividualI> objects and L<Bio::PopGen::PopulationI>
94 object will also work.
96 =head2 REFERENCES
98 Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of
99 Mutations." Genetics 133:693-709.
101 Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples
102 from a Population." Genetics 143:557-570.
104 McDonald J, Kreitman M.
106 Tajima F. (1989) "Statistical method for testing the neutral mutation
107 hypothesis by DNA polymorphism." Genetics 123:585-595.
110 =head2 CITING THIS WORK
112 Please see this reference for use of this implementation.
114 Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73.
116 If you use these Bio::PopGen modules please cite the Bioperl
117 publication (see FAQ) and the above reference.
120 =head1 FEEDBACK
122 =head2 Mailing Lists
124 User feedback is an integral part of the evolution of this and other
125 Bioperl modules. Send your comments and suggestions preferably to
126 the Bioperl mailing list. Your participation is much appreciated.
128 bioperl-l@bioperl.org - General discussion
129 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
131 =head2 Support
133 Please direct usage questions or support issues to the mailing list:
135 I<bioperl-l@bioperl.org>
137 rather than to the module maintainer directly. Many experienced and
138 reponsive experts will be able look at the problem and quickly
139 address it. Please include a thorough description of the problem
140 with code and data examples if at all possible.
142 =head2 Reporting Bugs
144 Report bugs to the Bioperl bug tracking system to help us keep track
145 of the bugs and their resolution. Bug reports can be submitted via
146 the web:
148 https://github.com/bioperl/bioperl-live/issues
150 =head1 AUTHOR - Jason Stajich, Matthew Hahn
152 Email jason-at-bioperl-dot-org
153 Email matthew-dot-hahn-at-duke-dot-edu
155 McDonald-Kreitman implementation based on work by Alisha Holloway at
156 UC Davis.
159 =head1 APPENDIX
161 The rest of the documentation details each of the object methods.
162 Internal methods are usually preceded with a _
164 =cut
167 # Let the code begin...
170 package Bio::PopGen::Statistics;
171 use strict;
172 use constant {
173 in_label => 'ingroup',
174 out_label => 'outgroup',
175 non_syn => 'non_synonymous',
176 syn => 'synonymous',
177 default_codon_table => 1, # Standard Codon table
180 use Bio::MolEvol::CodonModel;
181 use List::Util qw(sum);
183 use base qw(Bio::Root::Root);
184 our $codon_table => default_codon_table;
185 our $has_twotailed => 0;
186 BEGIN {
187 eval { require Text::NSP::Measures::2D::Fisher2::twotailed };
188 if( $@ ) { $has_twotailed = 0; }
189 else { $has_twotailed = 1; }
197 =head2 new
199 Title : new
200 Usage : my $obj = Bio::PopGen::Statistics->new();
201 Function: Builds a new Bio::PopGen::Statistics object
202 Returns : an instance of Bio::PopGen::Statistics
203 Args : none
206 =cut
209 =head2 fu_and_li_D
211 Title : fu_and_li_D
212 Usage : my $D = $statistics->fu_and_li_D(\@ingroup,\@outgroup);
214 my $D = $statistics->fu_and_li_D(\@ingroup,$extmutations);
215 Function: Fu and Li D statistic for a list of individuals
216 given an outgroup and the number of external mutations
217 (either provided or calculated from list of outgroup individuals)
218 Returns : decimal
219 Args : $individuals - array reference which contains ingroup individuals
220 (L<Bio::PopGen::Individual> or derived classes)
221 $extmutations - number of external mutations OR
222 arrayref of outgroup individuals
224 =cut
226 sub fu_and_li_D {
227 my ($self,$ingroup,$outgroup) = @_;
229 my ($seg_sites,$n,$ancestral,$derived) = (0,0,0,0);
230 if( ref($ingroup) =~ /ARRAY/i ) {
231 $n = scalar @$ingroup;
232 # pi - all pairwise differences
233 $seg_sites = $self->segregating_sites_count($ingroup);
234 } elsif( ref($ingroup) &&
235 $ingroup->isa('Bio::PopGen::PopulationI')) {
236 $n = $ingroup->get_number_individuals;
237 $seg_sites = $self->segregating_sites_count($ingroup);
238 } else {
239 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D");
240 return 0;
243 if( $seg_sites <= 0 ) {
244 $self->warn("mutation total was not > 0, cannot calculate a Fu and Li D");
245 return 0;
248 if( ! defined $outgroup ) {
249 $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
250 return 0;
251 } elsif( ref($outgroup) ) {
252 ($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup);
253 $ancestral = 0 unless defined $ancestral;
254 } else {
255 $ancestral = $outgroup;
258 return $self->fu_and_li_D_counts($n,$seg_sites,
259 $ancestral,$derived);
262 =head2 fu_and_li_D_counts
264 Title : fu_li_D_counts
265 Usage : my $D = $statistics->fu_and_li_D_counts($samps,$sites,
266 $external);
267 Function: Fu and Li D statistic for the raw counts of the number
268 of samples, sites, external and internal mutations
269 Returns : decimal number
270 Args : number of samples (N)
271 number of segregating sites (n)
272 number of external mutations (n_e)
274 =cut
277 sub fu_and_li_D_counts {
278 my ($self,$n,$seg_sites, $external_mut) = @_;
279 my $a_n = 0;
280 if( $n <= 3 ) {
281 $self->warn("n is $n, too small, must be > 3\n");
282 return;
284 for(my $k= 1; $k < $n; $k++ ) {
285 $a_n += ( 1 / $k );
287 my $b = 0;
288 for(my $k= 1; $k < $n; $k++ ) {
289 $b += ( 1 / $k**2 );
292 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
293 ( ( $n - 1) * ( $n - 2 ) ) );
295 my $v = 1 + ( ( $a_n**2 / ( $b + $a_n**2 ) ) *
296 ( $c - ( ( $n + 1) /
297 ( $n - 1) ) ));
299 my $u = $a_n - 1 - $v;
301 ($seg_sites - $a_n * $external_mut) /
302 sqrt( ($u * $seg_sites) + ($v * $seg_sites*$seg_sites));
307 =head2 fu_and_li_D_star
309 Title : fu_and_li_D_star
310 Usage : my $D = $statistics->fu_an_li_D_star(\@individuals);
311 Function: Fu and Li's D* statistic for a set of samples
312 Without an outgroup
313 Returns : decimal number
314 Args : array ref of L<Bio::PopGen::IndividualI> objects
316 L<Bio::PopGen::PopulationI> object
318 =cut
321 # fu_and_li_D*
323 sub fu_and_li_D_star {
324 my ($self,$individuals) = @_;
326 my ($seg_sites,$n,$singletons);
327 if( ref($individuals) =~ /ARRAY/i ) {
328 $n = scalar @$individuals;
329 $seg_sites = $self->segregating_sites_count($individuals);
330 $singletons = $self->singleton_count($individuals);
331 } elsif( ref($individuals) &&
332 $individuals->isa('Bio::PopGen::PopulationI')) {
333 my $pop = $individuals;
334 $n = $pop->get_number_individuals;
335 $seg_sites = $self->segregating_sites_count($pop);
336 $singletons = $self->singleton_count($pop);
337 } else {
338 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D_star");
339 return 0;
342 return $self->fu_and_li_D_star_counts($n,$seg_sites, $singletons);
345 =head2 fu_and_li_D_star_counts
347 Title : fu_li_D_star_counts
348 Usage : my $D = $statistics->fu_and_li_D_star_counts($samps,$sites,
349 $singletons);
351 Function: Fu and Li D statistic for the raw counts of the number
352 of samples, sites, external and internal mutations
353 Returns : decimal number
354 Args : number of samples (N)
355 number of segregating sites (n)
356 singletons (n_s)
358 =cut
361 sub fu_and_li_D_star_counts {
362 my ($self,$n,$seg_sites, $singletons) = @_;
363 my $a_n;
364 for(my $k = 1; $k < $n; $k++ ) {
365 $a_n += ( 1 / $k );
368 my $a1 = $a_n + 1 / $n;
370 my $b = 0;
371 for(my $k= 1; $k < $n; $k++ ) {
372 $b += ( 1 / $k**2 );
375 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
376 ( ( $n - 1) * ( $n - 2 ) ) );
378 my $d = $c + ($n -2) / ($n - 1)**2 +
379 2 / ($n -1) *
380 ( 1.5 - ( (2*$a1 - 3) / ($n -2) ) -
381 1 / $n );
383 my $v_star = ( ( ($n/($n-1) )**2)*$b + (($a_n**2)*$d) -
384 (2*( ($n*$a_n*($a_n+1)) )/(($n-1)**2)) ) /
385 (($a_n**2) + $b);
387 my $u_star = ( ($n/($n-1))*
388 ($a_n - ($n/
389 ($n-1)))) - $v_star;
392 return (($n / ($n - 1)) * $seg_sites -
393 $a_n * $singletons) /
394 sqrt( ($u_star * $seg_sites) + ($v_star * $seg_sites*$seg_sites));
398 =head2 fu_and_li_F
400 Title : fu_and_li_F
401 Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts);
402 Function: Calculate Fu and Li's F on an ingroup with either the set of
403 outgroup individuals, or the number of external mutations
404 Returns : decimal number
405 Args : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
406 OR a L<Bio::PopGen::PopulationI> object
407 number of external mutations OR list of individuals for the outgroup
409 =cut
413 sub fu_and_li_F {
414 my ($self,$ingroup,$outgroup) = @_;
415 my ($seg_sites,$pi,$n,$external,$internal);
416 if( ref($ingroup) =~ /ARRAY/i ) {
417 $n = scalar @$ingroup;
418 # pi - all pairwise differences
419 $pi = $self->pi($ingroup);
420 $seg_sites = $self->segregating_sites_count($ingroup);
421 } elsif( ref($ingroup) &&
422 $ingroup->isa('Bio::PopGen::PopulationI')) {
423 $n = $ingroup->get_number_individuals;
424 $pi = $self->pi($ingroup);
425 $seg_sites = $self->segregating_sites_count($ingroup);
426 } else {
427 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to Fu and Li's F");
428 return 0;
431 if( ! defined $outgroup ) {
432 $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
433 return 0;
434 } elsif( ref($outgroup) ) {
435 ($external,$internal) = $self->derived_mutations($ingroup,$outgroup);
436 } else {
437 $external = $outgroup;
439 $self->fu_and_li_F_counts($n,$pi,$seg_sites,$external);
442 =head2 fu_and_li_F_counts
444 Title : fu_li_F_counts
445 Usage : my $F = $statistics->fu_and_li_F_counts($samps,$pi,
446 $sites,
447 $external);
448 Function: Fu and Li F statistic for the raw counts of the number
449 of samples, sites, external and internal mutations
450 Returns : decimal number
451 Args : number of samples (N)
452 average pairwise differences (pi)
453 number of segregating sites (n)
454 external mutations (n_e)
456 =cut
459 sub fu_and_li_F_counts {
460 my ($self,$n,$pi,$seg_sites, $external) = @_;
461 my $a_n = 0;
462 for(my $k= 1; $k < $n; $k++ ) {
463 $a_n += ( 1 / $k );
466 my $a1 = $a_n + (1 / $n );
468 my $b = 0;
469 for(my $k= 1; $k < $n; $k++ ) {
470 $b += ( 1 / $k**2 );
473 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
474 ( ( $n - 1) * ( $n - 2 ) ) );
476 my $v_F = ( $c + ( (2*(($n**2)+$n+3)) /
477 ( (9*$n)*($n-1) ) ) -
478 (2/($n-1)) ) / ( ($a_n**2)+$b );
480 my $u_F = ( 1 + ( ($n+1)/(3*($n-1)) )-
481 ( 4*( ($n+1)/(($n-1)**2) ))*
482 ($a1 - ((2*$n)/($n+1))) ) /
483 $a_n - $v_F;
485 # warn("$v_F vf $u_F uf n = $n\n");
486 my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) +
487 ($v_F*($seg_sites**2)) ) );
489 return $F;
492 =head2 fu_and_li_F_star
494 Title : fu_and_li_F_star
495 Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup);
496 Function: Calculate Fu and Li's F* on an ingroup without an outgroup
497 It uses count of singleton alleles instead
498 Returns : decimal number
499 Args : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
501 L<Bio::PopGen::PopulationI> object
503 =cut
505 #' keep my emacs happy
507 sub fu_and_li_F_star {
508 my ($self,$individuals) = @_;
510 my ($seg_sites,$pi,$n,$singletons);
511 if( ref($individuals) =~ /ARRAY/i ) {
512 $n = scalar @$individuals;
513 # pi - all pairwise differences
514 $pi = $self->pi($individuals);
515 $seg_sites = $self->segregating_sites_count($individuals);
516 $singletons = $self->singleton_count($individuals);
517 } elsif( ref($individuals) &&
518 $individuals->isa('Bio::PopGen::PopulationI')) {
519 my $pop = $individuals;
520 $n = $pop->get_number_individuals;
521 $pi = $self->pi($pop);
522 $seg_sites = $self->segregating_sites_count($pop);
523 $singletons = $self->singleton_count($pop);
524 } else {
525 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_F_star");
526 return 0;
528 return $self->fu_and_li_F_star_counts($n,
529 $pi,
530 $seg_sites,
531 $singletons);
534 =head2 fu_and_li_F_star_counts
536 Title : fu_li_F_star_counts
537 Usage : my $F = $statistics->fu_and_li_F_star_counts($samps,
538 $pi,$sites,
539 $singletons);
540 Function: Fu and Li F statistic for the raw counts of the number
541 of samples, sites, external and internal mutations
542 Returns : decimal number
543 Args : number of samples (N)
544 average pairwise differences (pi)
545 number of segregating sites (n)
546 singleton mutations (n_s)
548 =cut
551 sub fu_and_li_F_star_counts {
552 my ($self,$n,$pi,$seg_sites, $singletons) = @_;
553 if( $n <= 1 ) {
554 $self->warn("N must be > 1\n");
555 return;
557 if( $n == 2) {
558 return 0;
561 my $a_n = 0;
564 my $b = 0;
565 for(my $k= 1; $k < $n; $k++ ) {
566 $b += (1 / ($k**2));
567 $a_n += ( 1 / $k ); # Eq (2)
569 my $a1 = $a_n + (1 / $n );
571 # warn("a_n is $a_n a1 is $a1 n is $n b is $b\n");
573 # From Simonsen et al (1995) instead of Fu and Li 1993
574 my $v_F_star = ( (( 2 * $n ** 3 + 110 * $n**2 - (255 * $n) + 153)/
575 (9 * ($n ** 2) * ( $n - 1))) +
576 ((2 * ($n - 1) * $a_n ) / $n ** 2) -
577 (8 * $b / $n) ) /
578 ( ($a_n ** 2) + $b );
580 my $u_F_star = ((( (4* ($n**2)) + (19 * $n) + 3 - (12 * ($n + 1)* $a1)) /
581 (3 * $n * ( $n - 1))) / $a_n) - $v_F_star;
583 # warn("vf* = $v_F_star uf* = $u_F_star n = $n\n");
584 my $F_star = ( $pi - ($singletons*( ( $n-1) / $n)) ) /
585 sqrt ( $u_F_star*$seg_sites + $v_F_star*$seg_sites**2);
586 return $F_star;
589 =head2 tajima_D
591 Title : tajima_D
592 Usage : my $D = Bio::PopGen::Statistics->tajima_D(\@samples);
593 Function: Calculate Tajima's D on a set of samples
594 Returns : decimal number
595 Args : array ref of L<Bio::PopGen::IndividualI> objects
597 L<Bio::PopGen::PopulationI> object
600 =cut
604 sub tajima_D {
605 my ($self,$individuals) = @_;
606 my ($seg_sites,$pi,$n);
608 if( ref($individuals) =~ /ARRAY/i ) {
609 $n = scalar @$individuals;
610 # pi - all pairwise differences
611 $pi = $self->pi($individuals);
612 $seg_sites = $self->segregating_sites_count($individuals);
614 } elsif( ref($individuals) &&
615 $individuals->isa('Bio::PopGen::PopulationI')) {
616 my $pop = $individuals;
617 $n = $pop->get_number_individuals;
618 $pi = $self->pi($pop);
619 $seg_sites = $self->segregating_sites_count($pop);
620 } else {
621 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
622 return 0;
624 $self->tajima_D_counts($n,$seg_sites,$pi);
627 =head2 tajima_D_counts
629 Title : tajima_D_counts
630 Usage : my $D = $statistics->tajima_D_counts($samps,$sites,$pi);
631 Function: Tajima's D statistic for the raw counts of the number
632 of samples, sites, and avg pairwise distances (pi)
633 Returns : decimal number
634 Args : number of samples (N)
635 number of segregating sites (n)
636 average pairwise differences (pi)
638 =cut
642 sub tajima_D_counts {
643 my ($self,$n,$seg_sites,$pi) = @_;
644 my $a1 = 0;
645 for(my $k= 1; $k < $n; $k++ ) {
646 $a1 += ( 1 / $k );
649 my $a2 = 0;
650 for(my $k= 1; $k < $n; $k++ ) {
651 $a2 += ( 1 / $k**2 );
654 my $b1 = ( $n + 1 ) / ( 3* ( $n - 1) );
655 my $b2 = ( 2 * ( $n ** 2 + $n + 3) ) /
656 ( ( 9 * $n) * ( $n - 1) );
657 my $c1 = $b1 - ( 1 / $a1 );
658 my $c2 = $b2 - ( ( $n + 2 ) /
659 ( $a1 * $n))+( $a2 / $a1 ** 2);
660 my $e1 = $c1 / $a1;
661 my $e2 = $c2 / ( $a1**2 + $a2 );
663 my $denom = sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1)));
664 return if $denom == 0;
665 my $D = ( $pi - ( $seg_sites / $a1 ) ) / $denom;
666 return $D;
670 =head2 pi
672 Title : pi
673 Usage : my $pi = Bio::PopGen::Statistics->pi(\@inds)
674 Function: Calculate pi (average number of pairwise differences) given
675 a list of individuals which have the same number of markers
676 (also called sites) as available from the get_Genotypes()
677 call in L<Bio::PopGen::IndividualI>
678 Returns : decimal number
679 Args : Arg1= array ref of L<Bio::PopGen::IndividualI> objects
680 which have markers/mutations. We expect all individuals to
681 have a marker - we will deal with missing data as a special case.
683 Arg1= L<Bio::PopGen::PopulationI> object. In the event that
684 only allele frequency data is available, storing it in
685 Population object will make this available.
686 num sites [optional], an optional second argument (integer)
687 which is the number of sites, then pi returned is pi/site.
689 =cut
691 sub pi {
692 my ($self,$individuals,$numsites) = @_;
693 my (%data,%marker_total,@marker_names,$n);
695 if( ref($individuals) =~ /ARRAY/i ) {
696 # one possible argument is an arrayref of Bio::PopGen::IndividualI objs
697 @marker_names = $individuals->[0]->get_marker_names;
698 $n = scalar @$individuals;
700 # Here we are calculating the allele frequencies
701 foreach my $ind ( @$individuals ) {
702 if( ! $ind->isa('Bio::PopGen::IndividualI') ) {
703 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n");
704 return 0;
706 foreach my $m ( @marker_names ) {
707 foreach my $allele (map { $_->get_Alleles}
708 $ind->get_Genotypes($m) ) {
709 $data{$m}->{$allele}++;
710 $marker_total{$m}++;
714 # while( my ($marker,$count) = each %marker_total ) {
715 # foreach my $c ( values %{$data{$marker}} ) {
716 # $c /= $count;
719 # %data will contain allele frequencies for each marker, allele
720 } elsif( ref($individuals) &&
721 $individuals->isa('Bio::PopGen::PopulationI') ) {
722 my $pop = $individuals;
723 $n = $pop->get_number_individuals;
724 foreach my $marker( $pop->get_Markers ) {
725 push @marker_names, $marker->name;
726 #$data{$marker->name} = {$marker->get_Allele_Frequencies};
727 my @genotypes = $pop->get_Genotypes(-marker => $marker->name);
728 for my $al ( map { $_->get_Alleles} @genotypes ) {
729 $data{$marker->name}->{$al}++;
730 $marker_total{$marker->name}++;
733 } else {
734 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi");
736 # based on Kevin Thornton's code:
737 # http://molpopgen.org/software/libsequence/doc/html/PolySNP_8cc-source.html#l00152
738 # For now we assume that all individuals have the same markers
739 my ($diffcount,$totalcompare) = (0,0);
740 my $pi = 0;
741 while ( my ($marker,$markerdat) = each %data ) {
742 my $sampsize = $marker_total{$marker};
743 my $ssh = 0;
744 my @alleles = keys %$markerdat;
745 if ( $sampsize > 1 ) {
746 my $denom = $sampsize * ($sampsize - 1.0);
747 foreach my $al ( @alleles ) {
748 $ssh += ($markerdat->{$al} * ($markerdat->{$al} - 1)) / $denom;
750 $pi += 1.0 - $ssh;
753 $self->debug( "pi=$pi\n");
754 if( $numsites ) {
755 return $pi / $numsites;
756 } else {
757 return $pi;
762 =head2 theta
764 Title : theta
765 Usage : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites);
766 Function: Calculates Watterson's theta from the sample size
767 and the number of segregating sites.
768 Providing the third parameter, total number of sites will
769 return theta per site.
770 This is also known as K-hat = K / a_n
771 Returns : decimal number
772 Args : sample size (integer),
773 num segregating sites (integer)
774 total sites (integer) [optional] (to calculate theta per site)
776 provide an arrayref of the L<Bio::PopGen::IndividualI> objects
777 total sites (integer) [optional] (to calculate theta per site)
779 provide an L<Bio::PopGen::PopulationI> object
780 total sites (integer)[optional]
782 =cut
786 sub theta {
787 my $self = shift;
788 my ( $n, $seg_sites,$totalsites) = @_;
789 if( ref($n) =~ /ARRAY/i ) {
790 my $samps = $n;
791 $totalsites = $seg_sites; # only 2 arguments if one is an array
792 my %data;
793 my @marker_names = $samps->[0]->get_marker_names;
794 # we need to calculate number of polymorphic sites
795 $seg_sites = $self->segregating_sites_count($samps);
796 $n = scalar @$samps;
798 } elsif(ref($n) &&
799 $n->isa('Bio::PopGen::PopulationI') ) {
800 # This will handle the case when we pass in a PopulationI object
801 my $pop = $n;
802 $totalsites = $seg_sites; # shift the arguments over by one
803 $n = $pop->haploid_population->get_number_individuals;
804 $seg_sites = $self->segregating_sites_count($pop);
806 my $a1 = 0;
807 for(my $k= 1; $k < $n; $k++ ) {
808 $a1 += ( 1 / $k );
810 if( $totalsites ) { # 0 and undef are the same can't divide by them
811 $seg_sites /= $totalsites;
813 if( $a1 == 0 ) {
814 return 0;
816 return $seg_sites / $a1;
819 =head2 singleton_count
821 Title : singleton_count
822 Usage : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds)
823 Function: Calculate the number of mutations/alleles which only occur once in
824 a list of individuals for all sites/markers
825 Returns : (integer) number of alleles which only occur once (integer)
826 Args : arrayref of L<Bio::PopGen::IndividualI> objects
828 L<Bio::PopGen::PopulationI> object
830 =cut
832 sub singleton_count {
833 my ($self,$individuals) = @_;
835 my @inds;
836 if( ref($individuals) =~ /ARRAY/ ) {
837 @inds = @$individuals;
838 } elsif( ref($individuals) &&
839 $individuals->isa('Bio::PopGen::PopulationI') ) {
840 my $pop = $individuals;
841 @inds = $pop->get_Individuals();
842 unless( @inds ) {
843 $self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies");
844 return 0;
846 } else {
847 $self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects");
848 return 0;
850 # find number of sites where a particular allele is only seen once
852 my ($singleton_allele_ct,%sites) = (0);
853 # first collect all the alleles into a hash structure
855 foreach my $n ( @inds ) {
856 if( ! $n->isa('Bio::PopGen::IndividualI') ) {
857 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
858 return 0;
860 foreach my $g ( $n->get_Genotypes ) {
861 my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
862 foreach my $allele (@alleles ) {
863 $sites{$nm}->{$allele}++;
867 foreach my $site ( values %sites ) { # don't really care what the name is
868 foreach my $allelect ( values %$site ) { #
869 # find the sites which have an allele with only 1 copy
870 $singleton_allele_ct++ if( $allelect == 1 );
873 return $singleton_allele_ct;
876 # Yes I know that singleton_count and segregating_sites_count are
877 # basically processing the same data so calling them both is
878 # redundant, something I want to fix later but want to make things
879 # correct and simple first
881 =head2 segregating_sites_count
883 Title : segregating_sites_count
884 Usage : my $segsites = Bio::PopGen::Statistics->segregating_sites_count
885 Function: Gets the number of segregating sites (number of polymorphic sites)
886 Returns : (integer) number of segregating sites
887 Args : arrayref of L<Bio::PopGen::IndividualI> objects
889 L<Bio::PopGen::PopulationI> object
891 =cut
893 # perhaps we'll change this in the future
894 # to return the actual segregating sites
895 # so one can use this to pull in the names of those sites.
896 # Would be trivial if it is useful.
898 sub segregating_sites_count {
899 my ($self,$individuals) = @_;
900 my $type = ref($individuals);
901 my $seg_sites = 0;
902 if( $type =~ /ARRAY/i ) {
903 my %sites;
904 foreach my $n ( @$individuals ) {
905 if( ! $n->isa('Bio::PopGen::IndividualI') ) {
906 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
907 return 0;
909 foreach my $g ( $n->get_Genotypes ) {
910 my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
911 foreach my $allele (@alleles ) {
912 $sites{$nm}->{$allele}++;
916 foreach my $site ( values %sites ) { # use values b/c we don't
917 # really care what the name is
918 # find the sites which >1 allele
919 $seg_sites++ if( keys %$site > 1 );
921 } elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) {
922 foreach my $marker ( $individuals->haploid_population->get_Markers ) {
923 my @alleles = $marker->get_Alleles;
924 $seg_sites++ if ( scalar @alleles > 1 );
926 } else {
927 $self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects");
928 return 0;
930 return $seg_sites;
934 =head2 heterozygosity
936 Title : heterozygosity
937 Usage : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1);
938 Function: Calculate the heterozgosity for a sample set for a set of alleles
939 Returns : decimal number
940 Args : sample size (integer)
941 frequency of one allele (fraction - must be less than 1)
942 [optional] frequency of another allele - this is only needed
943 in a non-binary allele system
945 Note : p^2 + 2pq + q^2
947 =cut
950 sub heterozygosity {
951 my ($self,$samp_size, $freq1,$freq2) = @_;
952 if( ! $freq2 ) { $freq2 = 1 - $freq1 }
953 if( $freq1 > 1 || $freq2 > 1 ) {
954 $self->warn("heterozygosity expects frequencies to be less than 1");
956 my $sum = ($freq1**2) + (($freq2)**2);
957 my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ;
958 return $h;
962 =head2 derived_mutations
964 Title : derived_mutations
965 Usage : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup);
966 Function: Calculate the number of alleles or (mutations) which are ancestral
967 and the number which are derived (occurred only on the tips)
968 Returns : array of 2 items - number of external and internal derived
969 mutation
970 Args : ingroup - L<Bio::PopGen::IndividualI>s arrayref OR
971 L<Bio::PopGen::PopulationI>
972 outgroup- L<Bio::PopGen::IndividualI>s arrayref OR
973 L<Bio::PopGen::PopulationI> OR
974 a single L<Bio::PopGen::IndividualI>
976 =cut
978 sub derived_mutations {
979 my ($self,$ingroup,$outgroup) = @_;
980 my (%indata,%outdata,@marker_names);
982 # basically we have to do some type checking
983 # if that perl were typed...
984 my ($itype,$otype) = (ref($ingroup),ref($outgroup));
986 return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums
987 # are already the value we
988 # are searching for
989 # pick apart the ingroup
990 # get the data
991 if( ref($ingroup) =~ /ARRAY/i ) {
992 if( ! ref($ingroup->[0]) ||
993 ! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) {
994 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations");
995 return 0;
997 # we assume that all individuals have the same markers
998 # i.e. that they are aligned
999 @marker_names = $ingroup->[0]->get_marker_names;
1000 for my $ind ( @$ingroup ) {
1001 for my $m ( @marker_names ) {
1002 for my $allele ( map { $_->get_Alleles }
1003 $ind->get_Genotypes($m) ) {
1004 $indata{$m}->{$allele}++;
1008 } elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) {
1009 @marker_names = $ingroup->get_marker_names;
1010 for my $ind ( $ingroup->haploid_population->get_Individuals() ) {
1011 for my $m ( @marker_names ) {
1012 for my $allele ( map { $_->get_Alleles}
1013 $ind->get_Genotypes($m) ) {
1014 $indata{$m}->{$allele}++;
1018 } else {
1019 $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations");
1020 return 0;
1023 if( $otype =~ /ARRAY/i ) {
1024 if( ! ref($outgroup->[0]) ||
1025 ! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) {
1026 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations");
1027 return 0;
1029 for my $ind ( @$outgroup ) {
1030 for my $m ( @marker_names ) {
1031 for my $allele ( map { $_->get_Alleles }
1032 $ind->get_Genotypes($m) ) {
1033 $outdata{$m}->{$allele}++;
1038 } elsif( $otype->isa('Bio::PopGen::PopulationI') ) {
1039 for my $ind ( $outgroup->haploid_population->get_Individuals() ) {
1040 for my $m ( @marker_names ) {
1041 for my $allele ( map { $_->get_Alleles}
1042 $ind->get_Genotypes($m) ) {
1043 $outdata{$m}->{$allele}++;
1047 } else {
1048 $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations");
1049 return 0;
1052 # derived mutations are defined as
1054 # ingroup (G A T)
1055 # outgroup (A)
1056 # derived mutations are G and T, A is the external mutation
1058 # ingroup (A T)
1059 # outgroup (C)
1060 # derived mutations A,T no external/ancestral mutations
1062 # ingroup (G A T)
1063 # outgroup (A T)
1064 # cannot determine
1066 my ($internal,$external);
1067 foreach my $marker ( @marker_names ) {
1068 my @outalleles = keys %{$outdata{$marker}};
1069 my @in_alleles = keys %{$indata{$marker}};
1070 next if( @outalleles > 1 || @in_alleles == 1);
1071 for my $allele ( @in_alleles ) {
1072 if( ! exists $outdata{$marker}->{$allele} ) {
1073 if( $indata{$marker}->{$allele} == 1 ) {
1074 $external++;
1075 } else {
1076 $internal++;
1081 return ($external, $internal);
1085 =head2 composite_LD
1087 Title : composite_LD
1088 Usage : %matrix = Bio::PopGen::Statistics->composite_LD($population);
1089 Function: Calculate the Linkage Disequilibrium
1090 This is for calculating LD for unphased data.
1091 Other methods will be appropriate for phased haplotype data.
1093 Returns : Hash of Hashes - first key is site 1,second key is site 2
1094 and value is LD for those two sites.
1095 my $LDarrayref = $matrix{$site1}->{$site2};
1096 my ($ldval, $chisquared) = @$LDarrayref;
1097 Args : L<Bio::PopGen::PopulationI> or arrayref of
1098 L<Bio::PopGen::IndividualI>s
1099 Reference: Weir B.S. (1996) "Genetic Data Analysis II",
1100 Sinauer, Sunderlanm MA.
1102 =cut
1104 sub composite_LD {
1105 my ($self,$pop) = @_;
1106 if( ref($pop) =~ /ARRAY/i ) {
1107 if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) {
1108 $pop = Bio::PopGen::Population->new(-individuals => @$pop);
1109 } else {
1110 $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1111 return ();
1113 } elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
1114 $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1115 return ();
1118 my @marker_names = $pop->get_marker_names;
1119 my @inds = $pop->get_Individuals;
1120 my $num_inds = scalar @inds;
1121 my (%lookup);
1122 # calculate allele frequencies for each marker from the population
1123 # use the built-in get_Marker to get the allele freqs
1124 # we still need to calculate the genotype frequencies
1125 foreach my $marker_name ( @marker_names ) {
1126 my(%allelef);
1128 foreach my $ind ( @inds ) {
1129 my ($genotype) = $ind->get_Genotypes(-marker => $marker_name);
1130 if( ! defined $genotype ) {
1131 $self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n");
1132 next;
1134 my @alleles = sort $genotype->get_Alleles;
1135 next if( scalar @alleles != 2);
1136 my $genostr = join(',', @alleles);
1137 $allelef{$alleles[0]}++;
1138 $allelef{$alleles[1]}++;
1141 # we should check for cases where there > 2 alleles or
1142 # only 1 allele and throw out those markers.
1143 my @alleles = sort keys %allelef;
1144 my $allele_count = scalar @alleles;
1145 # test if site is polymorphic
1146 if( $allele_count != 2) {
1147 # only really warn if we're seeing multi-allele
1148 $self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2;
1149 next; # skip this marker
1152 # Need to do something here to detect alleles which aren't
1153 # a single character
1154 if( length($alleles[0]) != 1 ||
1155 length($alleles[1]) != 1 ) {
1156 $self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");
1157 next;
1160 # fix the call for allele 1 (A or B) and
1161 # allele 2 (a or b) in terms of how we'll do the
1162 # N square from Weir p.126
1163 $self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");
1164 $lookup{$marker_name}->{'1'} = $alleles[0];
1165 $lookup{$marker_name}->{'2'} = $alleles[1];
1168 @marker_names = sort keys %lookup;
1169 my $site_count = scalar @marker_names;
1170 # where the final data will be stored
1171 my %stats_for_sites;
1173 # standard way of generating pairwise combos
1174 # LD is done by comparing all the pairwise site (marker)
1175 # combinations and keeping track of the genotype and
1176 # pairwise genotype (ie genotypes of the 2 sites) frequencies
1177 for( my $i = 0; $i < $site_count - 1; $i++ ) {
1178 my $site1 = $marker_names[$i];
1180 for( my $j = $i+1; $j < $site_count ; $j++) {
1181 my (%genotypes, %total_genotype_count,$total_pairwisegeno_count,
1182 %pairwise_genotypes);
1184 my $site2 = $marker_names[$j];
1185 my (%allele_count,%allele_freqs) = (0,0);
1186 foreach my $ind ( @inds ) {
1187 # build string of genotype at site 1
1188 my ($genotype1) = $ind->get_Genotypes(-marker => $site1);
1189 my @alleles1 = sort $genotype1->get_Alleles;
1191 # if an individual has only one available allele
1192 # (has a blank or N for one of the chromosomes)
1193 # we don't want to use it in our calculation
1195 next unless( scalar @alleles1 == 2);
1196 my $genostr1 = join(',', @alleles1);
1198 # build string of genotype at site 2
1199 my ($genotype2) = $ind->get_Genotypes(-marker => $site2);
1200 my @alleles2 = sort $genotype2->get_Alleles;
1201 my $genostr2 = join(',', @alleles2);
1203 next unless( scalar @alleles2 == 2);
1204 for (@alleles1) {
1205 $allele_count{$site1}++;
1206 $allele_freqs{$site1}->{$_}++;
1208 $genotypes{$site1}->{$genostr1}++;
1209 $total_genotype_count{$site1}++;
1211 for (@alleles2) {
1212 $allele_count{$site2}++;
1213 $allele_freqs{$site2}->{$_}++;
1215 $genotypes{$site2}->{$genostr2}++;
1216 $total_genotype_count{$site2}++;
1218 # We are using the $site1,$site2 to signify
1219 # a unique key
1220 $pairwise_genotypes{"$genostr1,$genostr2"}++;
1221 # some individuals
1222 $total_pairwisegeno_count++;
1224 for my $site ( %allele_freqs ) {
1225 for my $al ( keys %{ $allele_freqs{$site} } ) {
1226 $allele_freqs{$site}->{$al} /= $allele_count{$site};
1229 my $n = $total_pairwisegeno_count; # number of pairs of comparisons
1230 # 'A' and 'B' are two loci or in our case site1 and site2
1231 my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele
1232 my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele
1233 my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele
1234 my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele
1235 # AABB
1236 my $N1genostr = join(",",( $allele1_site1, $allele1_site1,
1237 $allele1_site2, $allele1_site2));
1238 $self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n");
1239 # AABb
1240 my $N2genostr = join(",",( $allele1_site1, $allele1_site1,
1241 $allele1_site2, $allele2_site2));
1242 $self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n");
1243 # AaBB
1244 my $N4genostr = join(",",( $allele1_site1, $allele2_site1,
1245 $allele1_site2, $allele1_site2));
1246 $self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n");
1247 # AaBb
1248 my $N5genostr = join(",",( $allele1_site1, $allele2_site1,
1249 $allele1_site2, $allele2_site2));
1250 $self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n");
1251 # count of AABB in
1252 my $n1 = $pairwise_genotypes{$N1genostr} || 0;
1253 # count of AABb in
1254 my $n2 = $pairwise_genotypes{$N2genostr} || 0;
1255 # count of AaBB in
1256 my $n4 = $pairwise_genotypes{$N4genostr} || 0;
1257 # count of AaBb in
1258 my $n5 = $pairwise_genotypes{$N5genostr} || 0;
1260 my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1));
1261 my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2));
1262 my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n;
1263 my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n;
1264 my $p_A = $allele_freqs{$site1}->{$allele1_site1} || 0; # an individual allele freq
1265 my $p_a = 1 - $p_A;
1267 my $p_B = $allele_freqs{$site2}->{$allele1_site2} || 0; # an individual allele freq
1268 my $p_b = 1 - $p_B;
1270 # variance of allele frequencies
1271 my $pi_A = $p_A * $p_a;
1272 my $pi_B = $p_B * $p_b;
1274 # hardy weinberg
1275 my $D_A = $p_AA - $p_A**2;
1276 my $D_B = $p_BB - $p_B**2;
1277 my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5;
1278 $self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n");
1280 my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B );
1281 $self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n");
1282 $self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n",
1283 $n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B));
1285 my $chisquared;
1286 eval { $chisquared = ( $n * ($delta_AB**2) ) /
1287 ( ( $pi_A + $D_A) * ( $pi_B + $D_B) );
1289 if( $@ ) {
1290 $self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n");
1291 next;
1293 # this will be an upper triangular matrix
1294 $stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared];
1297 return %stats_for_sites;
1300 =head2 mcdonald_kreitman
1302 Title : mcdonald_kreitman
1303 Usage : $Fstat = mcdonald_kreitman($ingroup, $outgroup);
1304 Function: Calculates McDonald-Kreitman statistic based on a set of ingroup
1305 individuals and an outgroup by computing the number of
1306 differences at synonymous and non-synonymous sites
1307 for intraspecific comparisons and with the outgroup
1308 Returns : 2x2 table, followed by a hash reference indicating any
1309 warning messages about the status of the alleles or codons
1310 Args : -ingroup => L<Bio::PopGen::Population> object or
1311 arrayref of L<Bio::PopGen::Individual>s
1312 -outgroup => L<Bio::PopGen::Population> object or
1313 arrayef of L<Bio::PopGen::Individual>s
1314 -polarized => Boolean, to indicate if this should be
1315 a polarized test. Must provide two individuals
1316 as outgroups.
1318 =cut
1320 sub mcdonald_kreitman {
1321 my ($self,@args) = @_;
1322 my ($ingroup, $outgroup,$polarized) =
1323 $self->_rearrange([qw(INGROUP OUTGROUP POLARIZED)],@args);
1324 my $verbose = $self->verbose;
1325 my $outgroup_count;
1326 my $gapchar = '\-';
1327 if( ref($outgroup) =~ /ARRAY/i ) {
1328 $outgroup_count = scalar @$outgroup;
1329 } elsif( UNIVERSAL::isa($outgroup,'Bio::PopGen::PopulationI') ) {
1330 $outgroup_count = $outgroup->get_number_individuals;
1331 } else {
1332 $self->throw("Expected an ArrayRef of Individuals OR a Bio::PopGen::PopulationI");
1335 if( $polarized ) {
1336 if( $outgroup_count < 2 ) {
1337 $self->throw("Need 2 outgroups with polarized option\n");
1339 } elsif( $outgroup_count > 1 ) {
1340 $self->warn(sprintf("%s outgroup sequences provided, but only first will be used",$outgroup_count ));
1341 } elsif( $outgroup_count == 0 ) {
1342 $self->throw("No outgroup sequence provided");
1345 my $codon_path = Bio::MolEvol::CodonModel->codon_path;
1347 my (%marker_names,%unique,@inds);
1348 for my $p ( $ingroup, $outgroup) {
1349 if( ref($p) =~ /ARRAY/i ) {
1350 push @inds, @$p;
1351 } else {
1352 push @inds, $p->get_Individuals;
1355 for my $i ( @inds ) {
1356 if( $unique{$i->unique_id}++ ) {
1357 $self->warn("Individual ". $i->unique_id. " is seen more than once in the ingroup or outgroup set\n");
1359 for my $n ( $i->get_marker_names ) {
1360 $marker_names{$n}++;
1364 my @marker_names = keys %marker_names;
1365 if( $marker_names[0] =~ /^(Site|Codon)/ ) {
1366 # sort by site or codon number and do it in
1367 # a schwartzian transformation baby!
1368 @marker_names = map { $_->[1] }
1369 sort { $a->[0] <=> $b->[0] }
1370 map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @marker_names;
1374 my $num_inds = scalar @inds;
1375 my %vals = ( 'ingroup' => $ingroup,
1376 'outgroup' => $outgroup,
1379 # Make the Codon Table type a parameter!
1380 my $table = Bio::Tools::CodonTable->new(-id => $codon_table);
1381 my @vt = qw(outgroup ingroup);
1382 my %changes;
1383 my %status;
1384 my %two_by_two = ( 'fixed_N' => 0,
1385 'fixed_S' => 0,
1386 'poly_N' => 0,
1387 'poly_S' => 0);
1389 for my $codon ( @marker_names ) {
1390 my (%codonvals);
1391 my %all_alleles;
1392 for my $t ( @vt ) {
1393 my $outcount = 1;
1394 for my $ind ( @{$vals{$t}} ) {
1395 my @alleles = $ind->get_Genotypes($codon)->get_Alleles;
1396 if( @alleles > 2 ) {
1397 warn("Codon $codon saw ", scalar @alleles, " alleles for ind ", $ind->unique_id, "\n");
1398 die;
1399 } else {
1400 my ($allele) = shift @alleles;
1401 $all_alleles{$ind->unique_id} = $allele;
1402 my $AA = $table->translate($allele);
1403 next if( $AA eq 'X' || $AA eq '*' || $allele =~ /N/i);
1405 my $label = $t;
1406 if( $t eq 'outgroup' ) {
1407 $label = $t.$outcount++;
1409 $codonvals{$label}->{$allele}++;
1410 $codonvals{all}->{$allele}++;
1414 my $total = sum ( values %{$codonvals{'ingroup'}} );
1415 next if( $total && $total < 2 ); # skip sites with < alleles
1416 # process all the seen alleles (codons)
1417 # this is a vertical slide through the alignment
1418 if( keys %{$codonvals{all}} <= 1 ) {
1419 # no changes or no VALID codons - monomorphic
1420 } else {
1421 # grab only the first outgroup codon (what to do with rest?)
1422 my ($outcodon) = keys %{$codonvals{'outgroup1'}};
1423 if( ! $outcodon ) {
1424 $status{"no outgroup codon $codon"}++;
1425 next;
1427 my $out_AA = $table->translate($outcodon);
1428 my ($outcodon2) = keys %{$codonvals{'outgroup2'}};
1429 if( ($polarized && ($outcodon ne $outcodon2)) ||
1430 $out_AA eq 'X' || $out_AA eq '*' ) {
1431 # skip if outgroup codons are different
1432 # (when polarized option is on)
1433 # or skip if the outcodon is STOP or 'NNN'
1434 if( $verbose > 0 ) {
1435 $self->debug("skipping $out_AA and $outcodon $outcodon2\n");
1437 $status{'outgroup codons different'}++;
1438 next;
1441 # check if ingroup is actually different from outgroup -
1442 # if there are the same number of alleles when considering
1443 # ALL or just the ingroup, then there is nothing new seen
1444 # in the outgroup so it must be a shared allele (codon)
1446 # so we just count how many total alleles were seen
1447 # if this is the same as the number of alleles seen for just
1448 # the ingroup then the outgroup presents no new information
1450 my @ingroup_codons = keys %{$codonvals{'ingroup'}};
1451 my $diff_from_out = ! exists $codonvals{'ingroup'}->{$outcodon};
1453 if( $verbose > 0 ) {
1454 $self->debug("alleles are in: ", join(",", @ingroup_codons),
1455 " out: ", join(",", keys %{$codonvals{outgroup1}}),
1456 " diff_from_out=$diff_from_out\n");
1458 for my $ind ( sort keys %all_alleles ) {
1459 $self->debug( "$ind\t$all_alleles{$ind}\n");
1462 # are all the ingroup alleles the same and diferent from outgroup?
1463 # fixed differences between species
1464 if( $diff_from_out ) {
1465 if( scalar @ingroup_codons == 1 ) {
1466 # fixed differences
1467 if( $outcodon =~ /^$gapchar/ ) {
1468 $status{'outgroup codons with gaps'}++;
1469 next;
1470 } elsif( $ingroup_codons[0] =~ /$gapchar/) {
1471 $status{'ingroup codons with gaps'}++;
1472 next;
1474 my $path = $codon_path->{uc $ingroup_codons[0].$outcodon};
1475 $two_by_two{fixed_N} += $path->[0];
1476 $two_by_two{fixed_S} += $path->[1];
1477 if( $verbose > 0 ) {
1478 $self->debug("ingroup is @ingroup_codons outcodon is $outcodon\n");
1479 $self->debug("path is ",join(",",@$path),"\n");
1480 $self->debug
1481 (sprintf("%-15s fixeddiff - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,$ingroup_codons[0], $outcodon,$out_AA,
1482 @$path, map { $two_by_two{$_} }
1483 qw(fixed_N fixed_S poly_N poly_S)));
1485 } else {
1486 # polymorphic and all are different from outgroup
1487 # Here we find the minimum number of NS subst
1488 my ($Ndiff,$Sdiff) = (3,0); # most different path
1489 for my $c ( @ingroup_codons ) {
1490 next if( $c =~ /$gapchar/ || $outcodon =~ /$gapchar/);
1491 my $path = $codon_path->{uc $c.$outcodon};
1492 my ($tNdiff,$tSdiff) = @$path;
1493 if( $path->[0] < $Ndiff ||
1494 ($tNdiff == $Ndiff &&
1495 $tSdiff <= $Sdiff)) {
1496 ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1499 $two_by_two{fixed_N} += $Ndiff;
1500 $two_by_two{fixed_S} += $Sdiff;
1501 if( @ingroup_codons > 2 ) {
1502 $status{"more than 2 ingroup codons $codon"}++;
1503 warn("more than 2 ingroup codons (@ingroup_codons)\n");
1504 } else {
1505 my $path = $codon_path->{uc join('',@ingroup_codons)};
1507 $two_by_two{poly_N} += $path->[0];
1508 $two_by_two{poly_S} += $path->[1];
1509 if( $verbose > 0 ) {
1510 $self->debug(sprintf("%-15s polysite_all - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,@$path, map { $two_by_two{$_} } qw(fixed_N fixed_S poly_N poly_S)));
1514 } else {
1515 my %unq = map { $_ => 1 } @ingroup_codons;
1516 delete $unq{$outcodon};
1517 my @unique_codons = keys %unq;
1519 # calc path for diff add to poly
1520 # Here we find the minimum number of subst bw
1521 # codons
1522 my ($Ndiff,$Sdiff) = (3,0); # most different path
1523 for my $c ( @unique_codons ) {
1524 my $path = $codon_path->{uc $c.$outcodon };
1525 if( ! defined $path ) {
1526 die " cannot get path for ", $c.$outcodon, "\n";
1528 my ($tNdiff,$tSdiff) = @$path;
1529 if( $path->[0] < $Ndiff ||
1530 ($tNdiff == $Ndiff &&
1531 $tSdiff <= $Sdiff)) {
1532 ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1536 if( @unique_codons == 2 ) {
1537 my $path = $codon_path->{uc join('',@unique_codons)};
1538 if( ! defined $path ) {
1539 $self->throw("no path for @unique_codons\n");
1541 $Ndiff += $path->[0];
1542 $Sdiff += $path->[1];
1544 $two_by_two{poly_N} += $Ndiff;
1545 $two_by_two{poly_S} += $Sdiff;
1546 if( $verbose > 0 ) {
1547 $self->debug(sprintf("%-15s polysite - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,
1548 $Ndiff, $Sdiff, map { $two_by_two{$_} }
1549 qw(fixed_N fixed_S poly_N poly_S)));
1554 return ( $two_by_two{'poly_N'},
1555 $two_by_two{'fixed_N'},
1556 $two_by_two{'poly_S'},
1557 $two_by_two{'fixed_S'},
1558 {%status});
1562 *MK = \&mcdonald_kreitman;
1565 =head2 mcdonald_kreitman_counts
1567 Title : mcdonald_kreitman_counts
1568 Usage : my $MK = $statistics->mcdonald_kreitman_counts(
1570 N_poly -> integer of count of non-syn polymorphism
1571 N_fix -> integer of count of non-syn fixed substitutions
1572 S_poly -> integer of count of syn polymorphism
1573 S_fix -> integer of count of syn fixed substitutions
1575 Function:
1576 Returns : decimal number
1577 Args :
1579 =cut
1582 sub mcdonald_kreitman_counts {
1583 my ($self,$Npoly,$Nfix,$Spoly,$Sfix) = @_;
1584 if( $has_twotailed ) {
1585 return &Text::NSP::Measures::2D::Fisher2::twotailed::calculateStatistic
1586 (n11=>$Npoly,
1587 n1p=>$Npoly+$Spoly,
1588 np1=>$Npoly+$Nfix,
1589 npp=>$Npoly+$Nfix+$Spoly+$Sfix);
1590 } else {
1591 $self->warn("cannot call mcdonald_kreitman_counts because no Fisher's exact is available - install Text::NSP::Measures::2D::Fisher2::twotailed");
1592 return 0;