Bio/PopGen/PopStats.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::PopGen::PopStats
   4 #
   5 # Cared for by Jason Stajich <jason-at-bioperl.org>
   6 #
   7 # Copyright Jason Stajich
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::PopGen::PopStats - A collection of methods for calculating
  16 statistics about a population or sets of populations
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::PopGen::PopStats;
  21   my $stats = Bio::PopGen::PopStats->new(); # add -haploid => 1
  22                                            # to process haploid data
  23
  24 =head1 DESCRIPTION
  25
  26 Calculate various population structure statistics, most notably Wright's Fst.
  27
  28 =head1 FEEDBACK
  29
  30 =head2 Mailing Lists
  31
  32 User feedback is an integral part of the evolution of this and other
  33 Bioperl modules. Send your comments and suggestions preferably to
  34 the Bioperl mailing list.  Your participation is much appreciated.
  35
  36   bioperl-l@bioperl.org                  - General discussion
  37   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  38
  39 =head2 Reporting Bugs
  40
  41 Report bugs to the Bioperl bug tracking system to help us keep track
  42 of the bugs and their resolution. Bug reports can be submitted via
  43 the web:
  44
  45   http://bugzilla.open-bio.org/
  46
  47 =head1 AUTHOR - Jason Stajich
  48
  49 Email jason-at-bioperl.org
  50
  51 =head1 CONTRIBUTORS
  52
  53 Matthew Hahn, matthew.hahn-at-duke.edu
  54
  55 =head1 APPENDIX
  56
  57 The rest of the documentation details each of the object methods.
  58 Internal methods are usually preceded with a _
  59
  60 =cut
  61
  62
  63 # Let the code begin...
  64
  65
  66 package Bio::PopGen::PopStats;
  67 use strict;
  68
  69 # Object preamble - inherits from Bio::Root::Root
  70
  71
  72
  73 use base qw(Bio::Root::Root);
  74
  75 =head2 new
  76
  77  Title   : new
  78  Usage   : my $obj = Bio::PopGen::PopStats->new();
  79  Function: Builds a new Bio::PopGen::PopStats object
  80  Returns : an instance of Bio::PopGen::PopStats
  81  Args    : -haploid => 1 (if want to use haploid calculations)
  82
  83
  84 =cut
  85
  86 sub new {
  87   my($class,@args) = @_;
  88
  89   my $self = $class->SUPER::new(@args);
  90   my ($haploid) = $self->_rearrange([qw(HAPLOID)],@args);
  91   if( $haploid ) { $self->haploid_status(1) }
  92   return $self;
  93 }
  94
  95
  96 =head2 haploid_status
  97
  98  Title   : haploid_status
  99  Usage   : $obj->haploid_status($newval)
 100  Function: Boolean value for whether or not to do haploid
 101            or diploid calculations, where appropriate
 102  Returns : Boolean
 103  Args    : on set, new boolean value (optional)
 104
 105
 106 =cut
 107
 108 sub haploid_status{
 109     my $self = shift;
 110     return $self->{'haploid_status'} = shift if @_;
 111     return $self->{'haploid_status'};
 112 }
 113
 114
 115 # Implementation provided by Matthew Hahn, massaged by Jason Stajich
 116
 117 =head2 Fst
 118
 119  Title   : Fst
 120  Usage   : my $fst = $stats->Fst(\@populations,\@markernames)
 121  Function: Calculate Wright's Fst based on a set of sub-populations
 122            and specific markers
 123  Returns : Fst value (usually between 0 and 1), or undef after a warning on invalid input
 124  Args    : Arrayref of populations to process
 125            Arrayref of marker names to process
 126  Note    : Based on diploid method in Weir BS, Genetic Data Analysis II, 1996
 127            page 178.
 128
 129 =cut
 130
 131 #' make emacs happy here
 132 sub Fst {
 133    my ($self,$populations,$markernames) = @_;
 134
 135    if( ! defined $populations ||
 136        ref($populations) !~ /ARRAY/i ) {
 137        $self->warn("Must provide a valid arrayref for populations");
 138        return;
 139    } elsif( ! defined $markernames ||
 140             ref($markernames) !~ /ARRAY/i ) {
 141        $self->warn("Must provide a valid arrayref for marker names");
 142        return;
 143    }
 144    my $num_sub_pops          = scalar @$populations;
 145
 146    if( $num_sub_pops < 2 ) {
 147        $self->warn("Must provide at least 2 populations for this test, you provided $num_sub_pops");
 148        return;
 149    }
 150
 151    # This code assumes that pop 1 contains at least one of all the
 152    # alleles - need to do some more work to insure that the complete
 153    # set of alleles is seen.
 154    my $Fst;
 155    my ($TS_sub1,$TS_sub2);
 156
 157    foreach my $marker ( @$markernames ) {
 158        # Get all the alleles from all the genotypes in all subpopulations
 159        my %allAlleles;
 160        foreach my $allele ( map { $_->get_Alleles() }
 161                             map { $_->get_Genotypes($marker) } @$populations ){
 162            $allAlleles{$allele}++;
 163        }
 164        my @alleles = keys %allAlleles;
 165
 166        foreach my $allele_name ( @alleles ) {
 167            my $avg_samp_size         = 0; # n-bar
 168            my $avg_allele_freq       = 0; # p-tilda-A-dot
 169
 170            my $total_samples_squared = 0; #
 171            my $sum_heterozygote      = 0;
 172
 173            my @marker_freqs;
 174
 175            # Walk through each population, get the calculated allele frequencies
 176            # for the marker, do some bookkeeping
 177
 178
 179            foreach my $pop ( @$populations ) {
 180                my $s = $pop->get_number_individuals($marker);
 181
 182                $avg_samp_size += $s;
 183                $total_samples_squared += $s**2;
 184
 185                my $markerobj = $pop->get_Marker($marker);
 186                if( ! defined $markerobj ) {
 187                    $self->warn("Could not derive Marker for $marker ".
 188                                "from population ". $pop->name);
 189                    return;
 190                }
 191
 192                my $freq_homozygotes =
 193                    $pop->get_Frequency_Homozygotes($marker,$allele_name);
 194                my %af = $markerobj->get_Allele_Frequencies();
 195                my $all_freq = ( ($af{$allele_name} || 0));
 196
 197                $avg_allele_freq += $s * $all_freq;
 198                $sum_heterozygote += (2 * $s)*( $all_freq - $freq_homozygotes);
 199
 200                push @marker_freqs, \%af;
 201            }
 202            my $total_samples =  $avg_samp_size; # sum of n over i sub-populations
 203            $avg_samp_size /= $num_sub_pops;
 204            $avg_allele_freq /= $total_samples;
 205
 206            # n-sub-c
 207            my $adj_samp_size = ( 1/ ($num_sub_pops - 1)) *
 208                ( $total_samples - ( $total_samples_squared/$total_samples));
 209
 210            my $variance              = 0; # s-squared-sub-A
 211            my $sum_variance          = 0;
 212            my $i = 0;           # we have cached the marker info
 213            foreach my $pop ( @$populations ) {
 214                my $s = $pop->get_number_individuals($marker);
 215                my %af = %{$marker_freqs[$i++]};
 216                $sum_variance += $s * (( ($af{$allele_name} || 0) -
 217                                         $avg_allele_freq)**2);
 218            }
 219            $variance = ( 1 / (( $num_sub_pops-1)*$avg_samp_size))*$sum_variance;
 220
 221            # H-tilda-A-dot
 222            my $freq_heterozygote = ($sum_heterozygote / $total_samples);
 223
 224            if( $self->haploid_status ) {
 225                # Haploid calculations
 226
 227                my $T_sub1 = $variance -
 228                    ( ( 1/($avg_samp_size-1))*
 229                      ( ($avg_allele_freq*(1-$avg_allele_freq))-
 230                        ( (($num_sub_pops-1)/$num_sub_pops)*$variance)));
 231                my $T_sub2 = ( (($adj_samp_size-1)/($avg_samp_size-1))*
 232                               $avg_allele_freq*(1-$avg_allele_freq) ) +
 233                               ( 1 + ( (($num_sub_pops-1)*
 234                                        ($avg_samp_size-$adj_samp_size))/
 235                                       ($avg_samp_size - 1))) *
 236                                       ($variance/$num_sub_pops);
 237
 238
 239                #to get total Fst from all alleles (if more than two) or all
 240                #loci (if more than one), we need to calculate $T_sub1 and
 241                #$T_sub2 for all alleles for all loci, sum, and then divide
 242                #again to get Fst.
 243                $TS_sub1 += $T_sub1;
 244                $TS_sub2 += $T_sub2;
 245
 246            } else {
 247                my $S_sub1 = $variance - ( (1/($avg_samp_size-1))*
 248                                           ( ($avg_allele_freq*
 249                                              (1-$avg_allele_freq)) -
 250                                             ((($num_sub_pops-1)/$num_sub_pops)*
 251                                              $variance)-0.25*$freq_heterozygote ) );
 252                my $S_sub2 = ($avg_allele_freq*(1-$avg_allele_freq)) -
 253                    ( ($avg_samp_size/($num_sub_pops*($avg_samp_size-1)))*
 254                      ( ((($num_sub_pops*($avg_samp_size- $adj_samp_size))/
 255                          $avg_samp_size)*$avg_allele_freq*
 256                         (1-$avg_allele_freq)) -
 257                        ( (1/$avg_samp_size)* (($avg_samp_size-1)+
 258                                               ($num_sub_pops-1)*
 259                                               ($avg_samp_size-
 260                                                $adj_samp_size) )*$variance ) -
 261                        ( (($num_sub_pops*($avg_samp_size-$adj_samp_size))/
 262                           (4*$avg_samp_size*$adj_samp_size))*
 263                          $freq_heterozygote ) ) );
 264
 265                my $S_sub3 = ($adj_samp_size/(2*$avg_samp_size))*
 266                    $freq_heterozygote;
 267
 268                #Again, to get the average over many alleles or many loci,
 269                #we will have to run the above for each and then sum the $S
 270                #variables and recalculate the F statistics
 271                $TS_sub1 += $S_sub1;
 272                $TS_sub2 += $S_sub2;
 273            }
 274        }
 275    }
 276    # $Fst_diploid = $S_sub1/$S_sub2;
 277    #my $Fit_diploid = 1 - ($S_sub3/$S_sub2);
 278    #my $Fis_diploid = ($Fit_diploid-$Fst_diploid)/(1-$Fst_diploid);
 279    $Fst = $TS_sub1 / $TS_sub2;
 280
 281    return $Fst;
 282 }
 283
 284 1;