3 # BioPerl module for Bio::PopGen::Statistics
5 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::PopGen::Statistics - Population Genetics statistical tests
19 use Bio::PopGen::Statistics;
22 use Bio::PopGen::Simulation::Coalescent;
24 my $sim = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 12);
26 my $tree = $sim->next_tree;
28 $sim->add_Mutations($tree,20);
30 my $stats = Bio::PopGen::Statistics->new();
31 my $individuals = [ $tree->get_leaf_nodes];
32 my $pi = $stats->pi($individuals);
33 my $D = $stats->tajima_D($individuals);
35 # Alternatively to do this on input data from
36 # See the tests in t/PopGen.t for more examples
37 my $parser = Bio::PopGen::IO->new(-format => 'prettybase',
38 -file => 't/data/popstats.prettybase');
39 my $pop = $parser->next_population;
40 # Note that you can also call the stats as a class method if you like
41 # the only reason to instantiate it (as above) is if you want
42 # to set the verbosity for debugging
43 $pi = Bio::PopGen::Statistics->pi($pop);
44 $theta = Bio::PopGen::Statistics->theta($pop);
46 # Pi and Theta also take additional arguments,
47 # see the documentation for more information
49 use Bio::PopGen::Utilities;
52 my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
53 -format => 'clustalw');
54 my $aln = $in->next_aln;
55 # get a population, each sequence is an individual and
56 # for the default case, every site which is not monomorphic
57 # is a 'marker'. Each individual will have a 'genotype' for the
58 # site which will be the specific base in the alignment at that
61 my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
66 This object is intended to provide implementations some standard
67 population genetics statistics about alleles in populations.
69 This module was previously named Bio::Tree::Statistics.
71 This object is a place to accumulate routines for calculating various
72 statistics from the coalescent simulation, marker/allele, or from
73 aligned sequence data given that you can calculate alleles, number of
76 Currently implemented:
77 Fu and Li's D (fu_and_li_D)
78 Fu and Li's D* (fu_and_li_D_star)
79 Fu and Li's F (fu_and_li_F)
80 Fu and Li's F* (fu_and_li_F_star)
82 Watterson's theta (theta)
83 pi (pi) - number of pairwise differences
84 composite_LD (composite_LD)
85 McDonald-Kreitman (mcdonald_kreitman or MK)
87 Count based methods also exist in case you have already calculated the
88 key statistics (seg sites, num individuals, etc) and just want to
89 compute the statistic.
91 In all cases where a the method expects an arrayref of
92 L<Bio::PopGen::IndividualI> objects and L<Bio::PopGen::PopulationI>
93 object will also work.
97 Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of
98 Mutations." Genetics 133:693-709.
100 Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples
101 from a Population." Genetics 143:557-570.
103 McDonald J, Kreitman M.
105 Tajima F. (1989) "Statistical method for testing the neutral mutation
106 hypothesis by DNA polymorphism." Genetics 123:585-595.
109 =head2 CITING THIS WORK
111 Please see this reference for use of this implementation.
113 Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73.
115 If you use these Bio::PopGen modules please cite the Bioperl
116 publication (see FAQ) and the above reference.
123 User feedback is an integral part of the evolution of this and other
124 Bioperl modules. Send your comments and suggestions preferably to
125 the Bioperl mailing list. Your participation is much appreciated.
127 bioperl-l@bioperl.org - General discussion
128 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
130 =head2 Reporting Bugs
132 Report bugs to the Bioperl bug tracking system to help us keep track
133 of the bugs and their resolution. Bug reports can be submitted via
136 http://bugzilla.open-bio.org/
138 =head1 AUTHOR - Jason Stajich, Matthew Hahn
140 Email jason-at-bioperl-dot-org
141 Email matthew-dot-hahn-at-duke-dot-edu
143 McDonald-Kreitman implementation based on work by Alisha Holloway at
149 The rest of the documentation details each of the object methods.
150 Internal methods are usually preceded with a _
155 # Let the code begin...
158 package Bio
::PopGen
::Statistics
;
161 in_label
=> 'ingroup',
162 out_label
=> 'outgroup',
163 non_syn
=> 'non_synonymous',
165 default_codon_table
=> 1, # Standard Codon table
168 use Bio
::MolEvol
::CodonModel
;
169 use List
::Util
qw(sum);
171 use base
qw(Bio::Root::Root);
172 our $codon_table => default_codon_table
;
173 our $has_twotailed => 0;
175 eval { require Text
::NSP
::Measures
::2D
::Fisher2
::twotailed
};
176 if( $@
) { $has_twotailed = 0; }
177 else { $has_twotailed = 1; }
188 Usage : my $obj = Bio::PopGen::Statistics->new();
189 Function: Builds a new Bio::PopGen::Statistics object
190 Returns : an instance of Bio::PopGen::Statistics
200 Usage : my $D = $statistics->fu_and_li_D(\@ingroup,\@outgroup);
202 my $D = $statistics->fu_and_li_D(\@ingroup,$extmutations);
203 Function: Fu and Li D statistic for a list of individuals
204 given an outgroup and the number of external mutations
205 (either provided or calculated from list of outgroup individuals)
207 Args : $individuals - array reference which contains ingroup individuals
208 (L<Bio::PopGen::Individual> or derived classes)
209 $extmutations - number of external mutations OR
210 arrayref of outgroup individuals
215 my ($self,$ingroup,$outgroup) = @_;
217 my ($seg_sites,$n,$ancestral,$derived) = (0,0,0,0);
218 if( ref($ingroup) =~ /ARRAY/i ) {
219 $n = scalar @
$ingroup;
220 # pi - all pairwise differences
221 $seg_sites = $self->segregating_sites_count($ingroup);
222 } elsif( ref($ingroup) &&
223 $ingroup->isa('Bio::PopGen::PopulationI')) {
224 $n = $ingroup->get_number_individuals;
225 $seg_sites = $self->segregating_sites_count($ingroup);
227 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D");
231 if( $seg_sites <= 0 ) {
232 $self->warn("mutation total was not > 0, cannot calculate a Fu and Li D");
236 if( ! defined $outgroup ) {
237 $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
239 } elsif( ref($outgroup) ) {
240 ($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup);
241 $ancestral = 0 unless defined $ancestral;
243 $ancestral = $outgroup;
246 return $self->fu_and_li_D_counts($n,$seg_sites,
247 $ancestral,$derived);
250 =head2 fu_and_li_D_counts
252 Title : fu_li_D_counts
253 Usage : my $D = $statistics->fu_and_li_D_counts($samps,$sites,
255 Function: Fu and Li D statistic for the raw counts of the number
256 of samples, sites, external and internal mutations
257 Returns : decimal number
258 Args : number of samples (N)
259 number of segregating sites (n)
260 number of external mutations (n_e)
265 sub fu_and_li_D_counts
{
266 my ($self,$n,$seg_sites, $external_mut) = @_;
268 for(my $k= 1; $k < $n; $k++ ) {
272 for(my $k= 1; $k < $n; $k++ ) {
276 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
277 ( ( $n - 1) * ( $n - 2 ) ) );
279 my $v = 1 + ( ( $a_n**2 / ( $b + $a_n**2 ) ) *
283 my $u = $a_n - 1 - $v;
285 ($seg_sites - $a_n * $external_mut) /
286 sqrt( ($u * $seg_sites) + ($v * $seg_sites*$seg_sites));
291 =head2 fu_and_li_D_star
293 Title : fu_and_li_D_star
294 Usage : my $D = $statistics->fu_an_li_D_star(\@individuals);
295 Function: Fu and Li's D* statistic for a set of samples
297 Returns : decimal number
298 Args : array ref of L<Bio::PopGen::IndividualI> objects
300 L<Bio::PopGen::PopulationI> object
307 sub fu_and_li_D_star
{
308 my ($self,$individuals) = @_;
310 my ($seg_sites,$n,$singletons);
311 if( ref($individuals) =~ /ARRAY/i ) {
312 $n = scalar @
$individuals;
313 $seg_sites = $self->segregating_sites_count($individuals);
314 $singletons = $self->singleton_count($individuals);
315 } elsif( ref($individuals) &&
316 $individuals->isa('Bio::PopGen::PopulationI')) {
317 my $pop = $individuals;
318 $n = $pop->get_number_individuals;
319 $seg_sites = $self->segregating_sites_count($pop);
320 $singletons = $self->singleton_count($pop);
322 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D_star");
326 return $self->fu_and_li_D_star_counts($n,$seg_sites, $singletons);
329 =head2 fu_and_li_D_star_counts
331 Title : fu_li_D_star_counts
332 Usage : my $D = $statistics->fu_and_li_D_star_counts($samps,$sites,
335 Function: Fu and Li D statistic for the raw counts of the number
336 of samples, sites, external and internal mutations
337 Returns : decimal number
338 Args : number of samples (N)
339 number of segregating sites (n)
345 sub fu_and_li_D_star_counts
{
346 my ($self,$n,$seg_sites, $singletons) = @_;
348 for(my $k = 1; $k < $n; $k++ ) {
352 my $a1 = $a_n + 1 / $n;
355 for(my $k= 1; $k < $n; $k++ ) {
359 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
360 ( ( $n - 1) * ( $n - 2 ) ) );
362 my $d = $c + ($n -2) / ($n - 1)**2 +
364 ( 1.5 - ( (2*$a1 - 3) / ($n -2) ) -
367 my $v_star = ( ( ($n/($n-1) )**2)*$b + (($a_n**2)*$d) -
368 (2*( ($n*$a_n*($a_n+1)) )/(($n-1)**2)) ) /
371 my $u_star = ( ($n/($n-1))*
376 return (($n / ($n - 1)) * $seg_sites -
377 $a_n * $singletons) /
378 sqrt( ($u_star * $seg_sites) + ($v_star * $seg_sites*$seg_sites));
385 Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts);
386 Function: Calculate Fu and Li's F on an ingroup with either the set of
387 outgroup individuals, or the number of external mutations
388 Returns : decimal number
389 Args : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
390 OR a L<Bio::PopGen::PopulationI> object
391 number of external mutations OR list of individuals for the outgroup
398 my ($self,$ingroup,$outgroup) = @_;
399 my ($seg_sites,$pi,$n,$external,$internal);
400 if( ref($ingroup) =~ /ARRAY/i ) {
401 $n = scalar @
$ingroup;
402 # pi - all pairwise differences
403 $pi = $self->pi($ingroup);
404 $seg_sites = $self->segregating_sites_count($ingroup);
405 } elsif( ref($ingroup) &&
406 $ingroup->isa('Bio::PopGen::PopulationI')) {
407 $n = $ingroup->get_number_individuals;
408 $pi = $self->pi($ingroup);
409 $seg_sites = $self->segregating_sites_count($ingroup);
411 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to Fu and Li's F");
415 if( ! defined $outgroup ) {
416 $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
418 } elsif( ref($outgroup) ) {
419 ($external,$internal) = $self->derived_mutations($ingroup,$outgroup);
421 $external = $outgroup;
423 $self->fu_and_li_F_counts($n,$pi,$seg_sites,$external);
426 =head2 fu_and_li_F_counts
428 Title : fu_li_F_counts
429 Usage : my $F = $statistics->fu_and_li_F_counts($samps,$pi,
432 Function: Fu and Li F statistic for the raw counts of the number
433 of samples, sites, external and internal mutations
434 Returns : decimal number
435 Args : number of samples (N)
436 average pairwise differences (pi)
437 number of segregating sites (n)
438 external mutations (n_e)
443 sub fu_and_li_F_counts
{
444 my ($self,$n,$pi,$seg_sites, $external) = @_;
446 for(my $k= 1; $k < $n; $k++ ) {
450 my $a1 = $a_n + (1 / $n );
453 for(my $k= 1; $k < $n; $k++ ) {
457 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
458 ( ( $n - 1) * ( $n - 2 ) ) );
460 my $v_F = ( $c + ( (2*(($n**2)+$n+3)) /
461 ( (9*$n)*($n-1) ) ) -
462 (2/($n-1)) ) / ( ($a_n**2)+$b );
464 my $u_F = ( 1 + ( ($n+1)/(3*($n-1)) )-
465 ( 4*( ($n+1)/(($n-1)**2) ))*
466 ($a1 - ((2*$n)/($n+1))) ) /
469 # warn("$v_F vf $u_F uf n = $n\n");
470 my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) +
471 ($v_F*($seg_sites**2)) ) );
476 =head2 fu_and_li_F_star
478 Title : fu_and_li_F_star
479 Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup);
480 Function: Calculate Fu and Li's F* on an ingroup without an outgroup
481 It uses count of singleton alleles instead
482 Returns : decimal number
483 Args : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
485 L<Bio::PopGen::PopulationI> object
489 #' keep my emacs happy
491 sub fu_and_li_F_star
{
492 my ($self,$individuals) = @_;
494 my ($seg_sites,$pi,$n,$singletons);
495 if( ref($individuals) =~ /ARRAY/i ) {
496 $n = scalar @
$individuals;
497 # pi - all pairwise differences
498 $pi = $self->pi($individuals);
499 $seg_sites = $self->segregating_sites_count($individuals);
500 $singletons = $self->singleton_count($individuals);
501 } elsif( ref($individuals) &&
502 $individuals->isa('Bio::PopGen::PopulationI')) {
503 my $pop = $individuals;
504 $n = $pop->get_number_individuals;
505 $pi = $self->pi($pop);
506 $seg_sites = $self->segregating_sites_count($pop);
507 $singletons = $self->singleton_count($pop);
509 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_F_star");
512 return $self->fu_and_li_F_star_counts($n,
518 =head2 fu_and_li_F_star_counts
520 Title : fu_li_F_star_counts
521 Usage : my $F = $statistics->fu_and_li_F_star_counts($samps,
524 Function: Fu and Li F statistic for the raw counts of the number
525 of samples, sites, external and internal mutations
526 Returns : decimal number
527 Args : number of samples (N)
528 average pairwise differences (pi)
529 number of segregating sites (n)
530 singleton mutations (n_s)
535 sub fu_and_li_F_star_counts
{
536 my ($self,$n,$pi,$seg_sites, $singletons) = @_;
538 $self->warn("N must be > 1\n");
549 for(my $k= 1; $k < $n; $k++ ) {
551 $a_n += ( 1 / $k ); # Eq (2)
553 my $a1 = $a_n + (1 / $n );
555 # warn("a_n is $a_n a1 is $a1 n is $n b is $b\n");
557 # From Simonsen et al (1995) instead of Fu and Li 1993
558 my $v_F_star = ( (( 2 * $n ** 3 + 110 * $n**2 - (255 * $n) + 153)/
559 (9 * ($n ** 2) * ( $n - 1))) +
560 ((2 * ($n - 1) * $a_n ) / $n ** 2) -
562 ( ($a_n ** 2) + $b );
564 my $u_F_star = ((( (4* ($n**2)) + (19 * $n) + 3 - (12 * ($n + 1)* $a1)) /
565 (3 * $n * ( $n - 1))) / $a_n) - $v_F_star;
567 # warn("vf* = $v_F_star uf* = $u_F_star n = $n\n");
568 my $F_star = ( $pi - ($singletons*( ( $n-1) / $n)) ) /
569 sqrt ( $u_F_star*$seg_sites + $v_F_star*$seg_sites**2);
576 Usage : my $D = Bio::PopGen::Statistics->tajima_D(\@samples);
577 Function: Calculate Tajima's D on a set of samples
578 Returns : decimal number
579 Args : array ref of L<Bio::PopGen::IndividualI> objects
581 L<Bio::PopGen::PopulationI> object
589 my ($self,$individuals) = @_;
590 my ($seg_sites,$pi,$n);
592 if( ref($individuals) =~ /ARRAY/i ) {
593 $n = scalar @
$individuals;
594 # pi - all pairwise differences
595 $pi = $self->pi($individuals);
596 $seg_sites = $self->segregating_sites_count($individuals);
598 } elsif( ref($individuals) &&
599 $individuals->isa('Bio::PopGen::PopulationI')) {
600 my $pop = $individuals;
601 $n = $pop->get_number_individuals;
602 $pi = $self->pi($pop);
603 $seg_sites = $self->segregating_sites_count($pop);
605 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
608 $self->tajima_D_counts($n,$seg_sites,$pi);
611 =head2 tajima_D_counts
613 Title : tajima_D_counts
614 Usage : my $D = $statistics->tajima_D_counts($samps,$sites,$pi);
615 Function: Tajima's D statistic for the raw counts of the number
616 of samples, sites, and avg pairwise distances (pi)
617 Returns : decimal number
618 Args : number of samples (N)
619 number of segregating sites (n)
620 average pairwise differences (pi)
626 sub tajima_D_counts
{
627 my ($self,$n,$seg_sites,$pi) = @_;
629 for(my $k= 1; $k < $n; $k++ ) {
634 for(my $k= 1; $k < $n; $k++ ) {
635 $a2 += ( 1 / $k**2 );
638 my $b1 = ( $n + 1 ) / ( 3* ( $n - 1) );
639 my $b2 = ( 2 * ( $n ** 2 + $n + 3) ) /
640 ( ( 9 * $n) * ( $n - 1) );
641 my $c1 = $b1 - ( 1 / $a1 );
642 my $c2 = $b2 - ( ( $n + 2 ) /
643 ( $a1 * $n))+( $a2 / $a1 ** 2);
645 my $e2 = $c2 / ( $a1**2 + $a2 );
647 my $denom = sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1)));
648 return undef if $denom == 0;
649 my $D = ( $pi - ( $seg_sites / $a1 ) ) / $denom;
657 Usage : my $pi = Bio::PopGen::Statistics->pi(\@inds)
658 Function: Calculate pi (average number of pairwise differences) given
659 a list of individuals which have the same number of markers
660 (also called sites) as available from the get_Genotypes()
661 call in L<Bio::PopGen::IndividualI>
662 Returns : decimal number
663 Args : Arg1= array ref of L<Bio::PopGen::IndividualI> objects
664 which have markers/mutations. We expect all individuals to
665 have a marker - we will deal with missing data as a special case.
667 Arg1= L<Bio::PopGen::PopulationI> object. In the event that
668 only allele frequency data is available, storing it in
669 Population object will make this available.
670 num sites [optional], an optional second argument (integer)
671 which is the number of sites, then pi returned is pi/site.
676 my ($self,$individuals,$numsites) = @_;
677 my (%data,%marker_total,@marker_names,$n);
679 if( ref($individuals) =~ /ARRAY/i ) {
680 # one possible argument is an arrayref of Bio::PopGen::IndividualI objs
681 @marker_names = $individuals->[0]->get_marker_names;
682 $n = scalar @
$individuals;
684 # Here we are calculating the allele frequencies
685 foreach my $ind ( @
$individuals ) {
686 if( ! $ind->isa('Bio::PopGen::IndividualI') ) {
687 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n");
690 foreach my $m ( @marker_names ) {
691 foreach my $allele (map { $_->get_Alleles}
692 $ind->get_Genotypes($m) ) {
693 $data{$m}->{$allele}++;
698 # while( my ($marker,$count) = each %marker_total ) {
699 # foreach my $c ( values %{$data{$marker}} ) {
703 # %data will contain allele frequencies for each marker, allele
704 } elsif( ref($individuals) &&
705 $individuals->isa('Bio::PopGen::PopulationI') ) {
706 my $pop = $individuals;
707 $n = $pop->get_number_individuals;
708 foreach my $marker( $pop->get_Markers ) {
709 push @marker_names, $marker->name;
710 #$data{$marker->name} = {$marker->get_Allele_Frequencies};
711 my @genotypes = $pop->get_Genotypes(-marker
=> $marker->name);
712 for my $al ( map { $_->get_Alleles} @genotypes ) {
713 $data{$marker->name}->{$al}++;
714 $marker_total{$marker->name}++;
718 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi");
720 # based on Kevin Thornton's code:
721 # http://molpopgen.org/software/libsequence/doc/html/PolySNP_8cc-source.html#l00152
722 # For now we assume that all individuals have the same markers
723 my ($diffcount,$totalcompare) = (0,0);
725 while ( my ($marker,$markerdat) = each %data ) {
726 my $sampsize = $marker_total{$marker};
728 my @alleles = keys %$markerdat;
729 if ( $sampsize > 1 ) {
730 my $denom = $sampsize * ($sampsize - 1.0);
731 foreach my $al ( @alleles ) {
732 $ssh += ($markerdat->{$al} * ($markerdat->{$al} - 1)) / $denom;
737 $self->debug( "pi=$pi\n");
739 return $pi / $numsites;
749 Usage : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites);
750 Function: Calculates Watterson's theta from the sample size
751 and the number of segregating sites.
752 Providing the third parameter, total number of sites will
753 return theta per site.
754 This is also known as K-hat = K / a_n
755 Returns : decimal number
756 Args : sample size (integer),
757 num segregating sites (integer)
758 total sites (integer) [optional] (to calculate theta per site)
760 provide an arrayref of the L<Bio::PopGen::IndividualI> objects
761 total sites (integer) [optional] (to calculate theta per site)
763 provide an L<Bio::PopGen::PopulationI> object
764 total sites (integer)[optional]
772 my ( $n, $seg_sites,$totalsites) = @_;
773 if( ref($n) =~ /ARRAY/i ) {
775 $totalsites = $seg_sites; # only 2 arguments if one is an array
777 my @marker_names = $samps->[0]->get_marker_names;
778 # we need to calculate number of polymorphic sites
779 $seg_sites = $self->segregating_sites_count($samps);
783 $n->isa('Bio::PopGen::PopulationI') ) {
784 # This will handle the case when we pass in a PopulationI object
786 $totalsites = $seg_sites; # shift the arguments over by one
787 $n = $pop->haploid_population->get_number_individuals;
788 $seg_sites = $self->segregating_sites_count($pop);
791 for(my $k= 1; $k < $n; $k++ ) {
794 if( $totalsites ) { # 0 and undef are the same can't divide by them
795 $seg_sites /= $totalsites;
800 return $seg_sites / $a1;
803 =head2 singleton_count
805 Title : singleton_count
806 Usage : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds)
807 Function: Calculate the number of mutations/alleles which only occur once in
808 a list of individuals for all sites/markers
809 Returns : (integer) number of alleles which only occur once (integer)
810 Args : arrayref of L<Bio::PopGen::IndividualI> objects
812 L<Bio::PopGen::PopulationI> object
816 sub singleton_count
{
817 my ($self,$individuals) = @_;
820 if( ref($individuals) =~ /ARRAY/ ) {
821 @inds = @
$individuals;
822 } elsif( ref($individuals) &&
823 $individuals->isa('Bio::PopGen::PopulationI') ) {
824 my $pop = $individuals;
825 @inds = $pop->get_Individuals();
827 $self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies");
831 $self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects");
834 # find number of sites where a particular allele is only seen once
836 my ($singleton_allele_ct,%sites) = (0);
837 # first collect all the alleles into a hash structure
839 foreach my $n ( @inds ) {
840 if( ! $n->isa('Bio::PopGen::IndividualI') ) {
841 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
844 foreach my $g ( $n->get_Genotypes ) {
845 my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
846 foreach my $allele (@alleles ) {
847 $sites{$nm}->{$allele}++;
851 foreach my $site ( values %sites ) { # don't really care what the name is
852 foreach my $allelect ( values %$site ) { #
853 # find the sites which have an allele with only 1 copy
854 $singleton_allele_ct++ if( $allelect == 1 );
857 return $singleton_allele_ct;
860 # Yes I know that singleton_count and segregating_sites_count are
861 # basically processing the same data so calling them both is
862 # redundant, something I want to fix later but want to make things
863 # correct and simple first
865 =head2 segregating_sites_count
867 Title : segregating_sites_count
868 Usage : my $segsites = Bio::PopGen::Statistics->segregating_sites_count
869 Function: Gets the number of segregating sites (number of polymorphic sites)
870 Returns : (integer) number of segregating sites
871 Args : arrayref of L<Bio::PopGen::IndividualI> objects
873 L<Bio::PopGen::PopulationI> object
877 # perhaps we'll change this in the future
878 # to return the actual segregating sites
879 # so one can use this to pull in the names of those sites.
880 # Would be trivial if it is useful.
882 sub segregating_sites_count
{
883 my ($self,$individuals) = @_;
884 my $type = ref($individuals);
886 if( $type =~ /ARRAY/i ) {
888 foreach my $n ( @
$individuals ) {
889 if( ! $n->isa('Bio::PopGen::IndividualI') ) {
890 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
893 foreach my $g ( $n->get_Genotypes ) {
894 my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
895 foreach my $allele (@alleles ) {
896 $sites{$nm}->{$allele}++;
900 foreach my $site ( values %sites ) { # use values b/c we don't
901 # really care what the name is
902 # find the sites which >1 allele
903 $seg_sites++ if( keys %$site > 1 );
905 } elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) {
906 foreach my $marker ( $individuals->haploid_population->get_Markers ) {
907 my @alleles = $marker->get_Alleles;
908 $seg_sites++ if ( scalar @alleles > 1 );
911 $self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects");
918 =head2 heterozygosity
920 Title : heterozygosity
921 Usage : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1);
922 Function: Calculate the heterozgosity for a sample set for a set of alleles
923 Returns : decimal number
924 Args : sample size (integer)
925 frequency of one allele (fraction - must be less than 1)
926 [optional] frequency of another allele - this is only needed
927 in a non-binary allele system
929 Note : p^2 + 2pq + q^2
935 my ($self,$samp_size, $freq1,$freq2) = @_;
936 if( ! $freq2 ) { $freq2 = 1 - $freq1 }
937 if( $freq1 > 1 || $freq2 > 1 ) {
938 $self->warn("heterozygosity expects frequencies to be less than 1");
940 my $sum = ($freq1**2) + (($freq2)**2);
941 my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ;
946 =head2 derived_mutations
948 Title : derived_mutations
949 Usage : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup);
950 Function: Calculate the number of alleles or (mutations) which are ancestral
951 and the number which are derived (occurred only on the tips)
952 Returns : array of 2 items - number of external and internal derived
954 Args : ingroup - L<Bio::PopGen::IndividualI>s arrayref OR
955 L<Bio::PopGen::PopulationI>
956 outgroup- L<Bio::PopGen::IndividualI>s arrayref OR
957 L<Bio::PopGen::PopulationI> OR
958 a single L<Bio::PopGen::IndividualI>
962 sub derived_mutations
{
963 my ($self,$ingroup,$outgroup) = @_;
964 my (%indata,%outdata,@marker_names);
966 # basically we have to do some type checking
967 # if that perl were typed...
968 my ($itype,$otype) = (ref($ingroup),ref($outgroup));
970 return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums
971 # are already the value we
973 # pick apart the ingroup
975 if( ref($ingroup) =~ /ARRAY/i ) {
976 if( ! ref($ingroup->[0]) ||
977 ! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) {
978 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations");
981 # we assume that all individuals have the same markers
982 # i.e. that they are aligned
983 @marker_names = $ingroup->[0]->get_marker_names;
984 for my $ind ( @
$ingroup ) {
985 for my $m ( @marker_names ) {
986 for my $allele ( map { $_->get_Alleles }
987 $ind->get_Genotypes($m) ) {
988 $indata{$m}->{$allele}++;
992 } elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) {
993 @marker_names = $ingroup->get_marker_names;
994 for my $ind ( $ingroup->haploid_population->get_Individuals() ) {
995 for my $m ( @marker_names ) {
996 for my $allele ( map { $_->get_Alleles}
997 $ind->get_Genotypes($m) ) {
998 $indata{$m}->{$allele}++;
1003 $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations");
1007 if( $otype =~ /ARRAY/i ) {
1008 if( ! ref($outgroup->[0]) ||
1009 ! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) {
1010 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations");
1013 for my $ind ( @
$outgroup ) {
1014 for my $m ( @marker_names ) {
1015 for my $allele ( map { $_->get_Alleles }
1016 $ind->get_Genotypes($m) ) {
1017 $outdata{$m}->{$allele}++;
1022 } elsif( $otype->isa('Bio::PopGen::PopulationI') ) {
1023 for my $ind ( $outgroup->haploid_population->get_Individuals() ) {
1024 for my $m ( @marker_names ) {
1025 for my $allele ( map { $_->get_Alleles}
1026 $ind->get_Genotypes($m) ) {
1027 $outdata{$m}->{$allele}++;
1032 $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations");
1036 # derived mutations are defined as
1040 # derived mutations are G and T, A is the external mutation
1044 # derived mutations A,T no external/ancestral mutations
1050 my ($internal,$external);
1051 foreach my $marker ( @marker_names ) {
1052 my @outalleles = keys %{$outdata{$marker}};
1053 my @in_alleles = keys %{$indata{$marker}};
1054 next if( @outalleles > 1 || @in_alleles == 1);
1055 for my $allele ( @in_alleles ) {
1056 if( ! exists $outdata{$marker}->{$allele} ) {
1057 if( $indata{$marker}->{$allele} == 1 ) {
1065 return ($external, $internal);
1071 Title : composite_LD
1072 Usage : %matrix = Bio::PopGen::Statistics->composite_LD($population);
1073 Function: Calculate the Linkage Disequilibrium
1074 This is for calculating LD for unphased data.
1075 Other methods will be appropriate for phased haplotype data.
1077 Returns : Hash of Hashes - first key is site 1,second key is site 2
1078 and value is LD for those two sites.
1079 my $LDarrayref = $matrix{$site1}->{$site2};
1080 my ($ldval, $chisquared) = @$LDarrayref;
1081 Args : L<Bio::PopGen::PopulationI> or arrayref of
1082 L<Bio::PopGen::IndividualI>s
1083 Reference: Weir B.S. (1996) "Genetic Data Analysis II",
1084 Sinauer, Sunderlanm MA.
1089 my ($self,$pop) = @_;
1090 if( ref($pop) =~ /ARRAY/i ) {
1091 if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) {
1092 $pop = Bio
::PopGen
::Population
->new(-individuals
=> @
$pop);
1094 $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1097 } elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
1098 $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1102 my @marker_names = $pop->get_marker_names;
1103 my @inds = $pop->get_Individuals;
1104 my $num_inds = scalar @inds;
1106 # calculate allele frequencies for each marker from the population
1107 # use the built-in get_Marker to get the allele freqs
1108 # we still need to calculate the genotype frequencies
1109 foreach my $marker_name ( @marker_names ) {
1112 foreach my $ind ( @inds ) {
1113 my ($genotype) = $ind->get_Genotypes(-marker
=> $marker_name);
1114 if( ! defined $genotype ) {
1115 $self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n");
1118 my @alleles = sort $genotype->get_Alleles;
1119 next if( scalar @alleles != 2);
1120 my $genostr = join(',', @alleles);
1121 $allelef{$alleles[0]}++;
1122 $allelef{$alleles[1]}++;
1125 # we should check for cases where there > 2 alleles or
1126 # only 1 allele and throw out those markers.
1127 my @alleles = sort keys %allelef;
1128 my $allele_count = scalar @alleles;
1129 # test if site is polymorphic
1130 if( $allele_count != 2) {
1131 # only really warn if we're seeing multi-allele
1132 $self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2;
1133 next; # skip this marker
1136 # Need to do something here to detect alleles which aren't
1137 # a single character
1138 if( length($alleles[0]) != 1 ||
1139 length($alleles[1]) != 1 ) {
1140 $self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");
1144 # fix the call for allele 1 (A or B) and
1145 # allele 2 (a or b) in terms of how we'll do the
1146 # N square from Weir p.126
1147 $self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");
1148 $lookup{$marker_name}->{'1'} = $alleles[0];
1149 $lookup{$marker_name}->{'2'} = $alleles[1];
1152 @marker_names = sort keys %lookup;
1153 my $site_count = scalar @marker_names;
1154 # where the final data will be stored
1155 my %stats_for_sites;
1157 # standard way of generating pairwise combos
1158 # LD is done by comparing all the pairwise site (marker)
1159 # combinations and keeping track of the genotype and
1160 # pairwise genotype (ie genotypes of the 2 sites) frequencies
1161 for( my $i = 0; $i < $site_count - 1; $i++ ) {
1162 my $site1 = $marker_names[$i];
1163 my (%genotypes, %total_genotype_count,
1164 %total_pairwisegeno_count,%pairwise_genotypes);
1165 for( my $j = $i+1; $j < $site_count ; $j++) {
1167 my (%genotypes, %total_genotype_count,
1168 %total_pairwisegeno_count,%pairwise_genotypes);
1170 my $site2 = $marker_names[$j];
1171 my (%allele_count,%allele_freqs) = (0,0);
1172 foreach my $ind ( @inds ) {
1173 # build string of genotype at site 1
1174 my ($genotype1) = $ind->get_Genotypes(-marker
=> $site1);
1175 my @alleles1 = sort $genotype1->get_Alleles;
1177 # if an individual has only one available allele
1178 # (has a blank or N for one of the chromosomes)
1179 # we don't want to use it in our calculation
1181 next unless( scalar @alleles1 == 2);
1182 my $genostr1 = join(',', @alleles1);
1184 # build string of genotype at site 2
1185 my ($genotype2) = $ind->get_Genotypes(-marker
=> $site2);
1186 my @alleles2 = sort $genotype2->get_Alleles;
1187 my $genostr2 = join(',', @alleles2);
1189 next unless( scalar @alleles2 == 2);
1191 $allele_count{$site1}++;
1192 $allele_freqs{$site1}->{$_}++;
1194 $genotypes{$site1}->{$genostr1}++;
1195 $total_genotype_count{$site1}++;
1198 $allele_count{$site2}++;
1199 $allele_freqs{$site2}->{$_}++;
1201 $genotypes{$site2}->{$genostr2}++;
1202 $total_genotype_count{$site2}++;
1204 # We are using the $site1,$site2 to signify
1206 $pairwise_genotypes{"$site1,$site2"}->{"$genostr1,$genostr2"}++;
1208 $total_pairwisegeno_count{"$site1,$site2"}++;
1210 for my $site ( %allele_freqs ) {
1211 for my $al ( keys %{ $allele_freqs{$site} } ) {
1212 $allele_freqs{$site}->{$al} /= $allele_count{$site};
1215 my $n = $total_pairwisegeno_count{"$site1,$site2"}; # number of inds
1216 # 'A' and 'B' are two loci or in our case site1 and site2
1217 my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele
1218 my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele
1219 my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele
1220 my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele
1222 my $N1genostr = join(",",( $allele1_site1, $allele1_site1,
1223 $allele1_site2, $allele1_site2));
1224 $self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n");
1226 my $N2genostr = join(",",( $allele1_site1, $allele1_site1,
1227 $allele1_site2, $allele2_site2));
1228 $self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n");
1230 my $N4genostr = join(",",( $allele1_site1, $allele2_site1,
1231 $allele1_site2, $allele1_site2));
1232 $self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n");
1234 my $N5genostr = join(",",( $allele1_site1, $allele2_site1,
1235 $allele1_site2, $allele2_site2));
1236 $self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n");
1238 my $n1 = $pairwise_genotypes{"$site1,$site2"}->{$N1genostr} || 0;
1240 my $n2 = $pairwise_genotypes{"$site1,$site2"}->{$N2genostr} || 0;
1242 my $n4 = $pairwise_genotypes{"$site1,$site2"}->{$N4genostr} || 0;
1244 my $n5 = $pairwise_genotypes{"$site1,$site2"}->{$N5genostr} || 0;
1246 my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1));
1247 my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2));
1248 my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n;
1249 my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n;
1250 my $p_A = $allele_freqs{$site1}->{$allele1_site1} || 0; # an individual allele freq
1253 my $p_B = $allele_freqs{$site2}->{$allele1_site2} || 0; # an individual allele freq
1256 # variance of allele frequencies
1257 my $pi_A = $p_A * $p_a;
1258 my $pi_B = $p_B * $p_b;
1261 my $D_A = $p_AA - $p_A**2;
1262 my $D_B = $p_BB - $p_B**2;
1263 my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5;
1264 $self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n");
1266 my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B );
1267 $self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n");
1268 $self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n",
1269 $n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B));
1272 eval { $chisquared = ( $n * ($delta_AB**2) ) /
1273 ( ( $pi_A + $D_A) * ( $pi_B + $D_B) );
1276 $self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n");
1279 # this will be an upper triangular matrix
1280 $stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared];
1283 return %stats_for_sites;
1286 =head2 mcdonald_kreitman
1288 Title : mcdonald_kreitman
1289 Usage : $Fstat = mcdonald_kreitman($ingroup, $outgroup);
1290 Function: Calculates McDonald-Kreitman statistic based on a set of ingroup
1291 individuals and an outgroup by computing the number of
1292 differences at synonymous and non-synonymous sites
1293 for intraspecific comparisons and with the outgroup
1294 Returns : 2x2 table, followed by a hash reference indicating any
1295 warning messages about the status of the alleles or codons
1296 Args : -ingroup => L<Bio::PopGen::Population> object or
1297 arrayref of L<Bio::PopGen::Individual>s
1298 -outgroup => L<Bio::PopGen::Population> object or
1299 arrayef of L<Bio::PopGen::Individual>s
1300 -polarized => Boolean, to indicate if this should be
1301 a polarized test. Must provide two individuals
1306 sub mcdonald_kreitman
{
1307 my ($self,@args) = @_;
1308 my ($ingroup, $outgroup,$polarized) =
1309 $self->_rearrange([qw(INGROUP OUTGROUP POLARIZED)],@args);
1310 my $verbose = $self->verbose;
1313 if( ref($outgroup) =~ /ARRAY/i ) {
1314 $outgroup_count = scalar @
$outgroup;
1315 } elsif( UNIVERSAL
::isa
($outgroup,'Bio::PopGen::PopulationI') ) {
1316 $outgroup_count = $outgroup->get_number_individuals;
1318 $self->throw("Expected an ArrayRef of Individuals OR a Bio::PopGen::PopulationI");
1322 if( $outgroup_count < 2 ) {
1323 $self->throw("Need 2 outgroups with polarized option\n");
1325 } elsif( $outgroup_count > 1 ) {
1326 $self->warn(sprintf("%s outgroup sequences provided, but only first will be used",$outgroup_count ));
1327 } elsif( $outgroup_count == 0 ) {
1328 $self->throw("No outgroup sequence provided");
1331 my $codon_path = Bio
::MolEvol
::CodonModel
->codon_path;
1333 my (%marker_names,%unique,@inds);
1334 for my $p ( $ingroup, $outgroup) {
1335 if( ref($p) =~ /ARRAY/i ) {
1338 push @inds, $p->get_Individuals;
1341 for my $i ( @inds ) {
1342 if( $unique{$i->unique_id}++ ) {
1343 $self->warn("Individual ". $i->unique_id. " is seen more than once in the ingroup or outgroup set\n");
1345 for my $n ( $i->get_marker_names ) {
1346 $marker_names{$n}++;
1350 my @marker_names = keys %marker_names;
1351 if( $marker_names[0] =~ /^(Site|Codon)/ ) {
1352 # sort by site or codon number and do it in
1353 # a schwartzian transformation baby!
1354 @marker_names = map { $_->[1] }
1355 sort { $a->[0] <=> $b->[0] }
1356 map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @marker_names;
1360 my $num_inds = scalar @inds;
1361 my %vals = ( 'ingroup' => $ingroup,
1362 'outgroup' => $outgroup,
1365 # Make the Codon Table type a parameter!
1366 my $table = Bio
::Tools
::CodonTable
->new(-id
=> $codon_table);
1367 my @vt = qw(outgroup ingroup);
1370 my %two_by_two = ( 'fixed_N' => 0,
1375 for my $codon ( @marker_names ) {
1380 for my $ind ( @
{$vals{$t}} ) {
1381 my @alleles = $ind->get_Genotypes($codon)->get_Alleles;
1382 if( @alleles > 1 ) {
1384 # warn("$codon $codon saw ", scalar @alleles, " for ind ", $ind->unique_id, "\n");
1386 my ($allele) = shift @alleles;
1387 $all_alleles{$ind->unique_id} = $allele;
1388 my $AA = $table->translate($allele);
1389 next if( $AA eq 'X' || $AA eq '*' || $allele =~ /N/i);
1392 if( $t eq 'outgroup' ) {
1393 $label = $t.$outcount++;
1395 $codonvals{$label}->{$allele}++;
1396 $codonvals{all
}->{$allele}++;
1400 my $total = sum
( values %{$codonvals{'ingroup'}} );
1401 next if( $total && $total < 2 ); # skip sites with < alleles
1402 # process all the seen alleles (codons)
1403 # this is a vertical slide through the alignment
1404 if( keys %{$codonvals{all
}} <= 1 ) {
1405 # no changes or no VALID codons - monomorphic
1407 # grab only the first outgroup codon (what to do with rest?)
1408 my ($outcodon) = keys %{$codonvals{'outgroup1'}};
1410 $status{"no outgroup codon $codon"}++;
1413 my $out_AA = $table->translate($outcodon);
1414 my ($outcodon2) = keys %{$codonvals{'outgroup2'}};
1415 if( ($polarized && ($outcodon ne $outcodon2)) ||
1416 $out_AA eq 'X' || $out_AA eq '*' ) {
1417 # skip if outgroup codons are different
1418 # (when polarized option is on)
1419 # or skip if the outcodon is STOP or 'NNN'
1420 if( $verbose > 0 ) {
1421 $self->debug("skipping $out_AA and $outcodon $outcodon2\n");
1423 $status{'outgroup codons different'}++;
1427 # check if ingroup is actually different from outgroup -
1428 # if there are the same number of alleles when considering
1429 # ALL or just the ingroup, then there is nothing new seen
1430 # in the outgroup so it must be a shared allele (codon)
1432 # so we just count how many total alleles were seen
1433 # if this is the same as the number of alleles seen for just
1434 # the ingroup then the outgroup presents no new information
1436 my @ingroup_codons = keys %{$codonvals{'ingroup'}};
1437 my $diff_from_out = ! exists $codonvals{'ingroup'}->{$outcodon};
1439 if( $verbose > 0 ) {
1440 $self->debug("alleles are in: ", join(",", @ingroup_codons),
1441 " out: ", join(",", keys %{$codonvals{outgroup1
}}),
1442 " diff_from_out=$diff_from_out\n");
1444 for my $ind ( sort keys %all_alleles ) {
1445 $self->debug( "$ind\t$all_alleles{$ind}\n");
1448 # are all the ingroup alleles the same and diferent from outgroup?
1449 # fixed differences between species
1450 if( $diff_from_out ) {
1451 if( scalar @ingroup_codons == 1 ) {
1453 if( $outcodon =~ /^$gapchar/ ) {
1454 $status{'outgroup codons with gaps'}++;
1456 } elsif( $ingroup_codons[0] =~ /$gapchar/) {
1457 $status{'ingroup codons with gaps'}++;
1460 my $path = $codon_path->{uc $ingroup_codons[0].$outcodon};
1461 $two_by_two{fixed_N
} += $path->[0];
1462 $two_by_two{fixed_S
} += $path->[1];
1463 if( $verbose > 0 ) {
1464 $self->debug("ingroup is @ingroup_codons outcodon is $outcodon\n");
1465 $self->debug("path is ",join(",",@
$path),"\n");
1467 (sprintf("%-15s fixeddiff - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,$ingroup_codons[0], $outcodon,$out_AA,
1468 @
$path, map { $two_by_two{$_} }
1469 qw(fixed_N fixed_S poly_N poly_S)));
1472 # polymorphic and all are different from outgroup
1473 # Here we find the minimum number of NS subst
1474 my ($Ndiff,$Sdiff) = (3,0); # most different path
1475 for my $c ( @ingroup_codons ) {
1476 next if( $c =~ /$gapchar/ || $outcodon =~ /$gapchar/);
1477 my $path = $codon_path->{uc $c.$outcodon};
1478 my ($tNdiff,$tSdiff) = @
$path;
1479 if( $path->[0] < $Ndiff ||
1480 ($tNdiff == $Ndiff &&
1481 $tSdiff <= $Sdiff)) {
1482 ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1485 $two_by_two{fixed_N
} += $Ndiff;
1486 $two_by_two{fixed_S
} += $Sdiff;
1487 if( @ingroup_codons > 2 ) {
1488 $status{"more than 2 ingroup codons $codon"}++;
1489 warn("more than 2 ingroup codons (@ingroup_codons)\n");
1491 my $path = $codon_path->{uc join('',@ingroup_codons)};
1493 $two_by_two{poly_N
} += $path->[0];
1494 $two_by_two{poly_S
} += $path->[1];
1495 if( $verbose > 0 ) {
1496 $self->debug(sprintf("%-15s polysite_all - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,@
$path, map { $two_by_two{$_} } qw(fixed_N fixed_S poly_N poly_S)));
1501 my %unq = map { $_ => 1 } @ingroup_codons;
1502 delete $unq{$outcodon};
1503 my @unique_codons = keys %unq;
1505 # calc path for diff add to poly
1506 # Here we find the minimum number of subst bw
1508 my ($Ndiff,$Sdiff) = (3,0); # most different path
1509 for my $c ( @unique_codons ) {
1510 my $path = $codon_path->{uc $c.$outcodon };
1511 if( ! defined $path ) {
1512 die " cannot get path for ", $c.$outcodon, "\n";
1514 my ($tNdiff,$tSdiff) = @
$path;
1515 if( $path->[0] < $Ndiff ||
1516 ($tNdiff == $Ndiff &&
1517 $tSdiff <= $Sdiff)) {
1518 ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1522 if( @unique_codons == 2 ) {
1523 my $path = $codon_path->{uc join('',@unique_codons)};
1524 if( ! defined $path ) {
1525 $self->throw("no path for @unique_codons\n");
1527 $Ndiff += $path->[0];
1528 $Sdiff += $path->[1];
1530 $two_by_two{poly_N
} += $Ndiff;
1531 $two_by_two{poly_S
} += $Sdiff;
1532 if( $verbose > 0 ) {
1533 $self->debug(sprintf("%-15s polysite - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,
1534 $Ndiff, $Sdiff, map { $two_by_two{$_} }
1535 qw(fixed_N fixed_S poly_N poly_S)));
1540 return ( $two_by_two{'poly_N'},
1541 $two_by_two{'fixed_N'},
1542 $two_by_two{'poly_S'},
1543 $two_by_two{'fixed_S'},
1548 *MK
= \
&mcdonald_kreitman
;
1551 =head2 mcdonald_kreitman_counts
1553 Title : mcdonald_kreitman_counts
1554 Usage : my $MK = $statistics->mcdonald_kreitman_counts(
1556 N_poly -> integer of count of non-syn polymorphism
1557 N_fix -> integer of count of non-syn fixed substitutions
1558 S_poly -> integer of count of syn polymorphism
1559 S_fix -> integer of count of syn fixed substitutions
1562 Returns : decimal number
1568 sub mcdonald_kreitman_counts
{
1569 my ($self,$Npoly,$Nfix,$Spoly,$Sfix) = @_;
1570 if( $has_twotailed ) {
1571 return &Text
::NSP
::Measures
::2D
::Fisher2
::twotailed
::calculateStatistic
1575 npp
=>$Npoly+$Nfix+$Spoly+$Sfix);
1577 $self->warn("cannot call mcdonald_kreitman_counts because no Fisher's exact is available - install Text::NSP::Measures::2D::Fisher2::twotailed");