Bio/PopGen/PopStats.pm

   1 #
   2 # BioPerl module for Bio::PopGen::PopStats
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Jason Stajich <jason-at-bioperl.org>
   7 #
   8 # Copyright Jason Stajich
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::PopGen::PopStats - A collection of methods for calculating
  17 statistics about a population or sets of populations
  18
  19 =head1 SYNOPSIS
  20
  21   use Bio::PopGen::PopStats;
  22   my $stats = Bio::PopGen::PopStats->new(); # add -haploid => 1
  23                                            # to process haploid data
  24
  25 =head1 DESCRIPTION
  26
  27 Calculate various population structure statistics, most notably Wright's Fst.
  28
  29 =head1 FEEDBACK
  30
  31 =head2 Mailing Lists
  32
  33 User feedback is an integral part of the evolution of this and other
  34 Bioperl modules. Send your comments and suggestions preferably to
  35 the Bioperl mailing list.  Your participation is much appreciated.
  36
  37   bioperl-l@bioperl.org                  - General discussion
  38   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  39
  40 =head2 Support
  41
  42 Please direct usage questions or support issues to the mailing list:
  43
  44 I<bioperl-l@bioperl.org>
  45
  46 rather than to the module maintainer directly. Many experienced and
   47 responsive experts will be able to look at the problem and quickly
  48 address it. Please include a thorough description of the problem
  49 with code and data examples if at all possible.
  50
  51 =head2 Reporting Bugs
  52
  53 Report bugs to the Bioperl bug tracking system to help us keep track
  54 of the bugs and their resolution. Bug reports can be submitted via
  55 the web:
  56
  57   https://github.com/bioperl/bioperl-live/issues
  58
  59 =head1 AUTHOR - Jason Stajich
  60
  61 Email jason-at-bioperl.org
  62
  63 =head1 CONTRIBUTORS
  64
  65 Matthew Hahn, matthew.hahn-at-duke.edu
  66
  67 =head1 APPENDIX
  68
  69 The rest of the documentation details each of the object methods.
  70 Internal methods are usually preceded with a _
  71
  72 =cut
  73
  74
  75 # Let the code begin...
  76
  77
  78 package Bio::PopGen::PopStats;
  79 use strict;
  80
  81 # Object preamble - inherits from Bio::Root::Root
  82
  83
  84
  85 use base qw(Bio::Root::Root);
  86
  87 =head2 new
  88
  89  Title   : new
  90  Usage   : my $obj = Bio::PopGen::PopStats->new();
  91  Function: Builds a new Bio::PopGen::PopStats object
  92  Returns : an instance of Bio::PopGen::PopStats
  93  Args    : -haploid => 1 (if want to use haploid calculations)
  94
  95
  96 =cut
  97
  98 sub new {
  99   my($class,@args) = @_;
 100
 101   my $self = $class->SUPER::new(@args);
 102   my ($haploid) = $self->_rearrange([qw(HAPLOID)],@args);
 103   if( $haploid ) { $self->haploid_status(1) }
 104   return $self;
 105 }
 106
 107
 108 =head2 haploid_status
 109
 110  Title   : haploid_status
 111  Usage   : $obj->haploid_status($newval)
 112  Function: Boolean value for whether or not to do haploid
 113            or diploid calculations, where appropriate
 114  Returns : Boolean
  115  Args    : on set, new boolean value (optional)
 116
 117
 118 =cut
 119
 120 sub haploid_status{
 121     my $self = shift;
 122     return $self->{'haploid_status'} = shift if @_;
 123     return $self->{'haploid_status'};
 124 }
 125
 126
  127 # Implementation provided by Matthew Hahn, massaged by Jason Stajich
 128
 129 =head2 Fst
 130
 131  Title   : Fst
 132  Usage   : my $fst = $stats->Fst(\@populations,\@markernames)
 133  Function: Calculate Wright's Fst based on a set of sub-populations
 134            and specific markers
 135  Returns : Fst value (a value between 0 and 1)
 136  Args    : Arrayref of populations to process
 137            Arrayref of marker names to process
  138  Note    : Based on diploid method in Weir BS, Genetic Data Analysis II, 1996
 139            page 178.
 140
 141 =cut
 142
 143 #' make emacs happy here
 144 sub Fst {
 145    my ($self,$populations,$markernames) = @_;
 146
 147    if( ! defined $populations ||
 148        ref($populations) !~ /ARRAY/i ) {
 149        $self->warn("Must provide a valid arrayref for populations");
 150        return;
 151    } elsif( ! defined $markernames ||
 152             ref($markernames) !~ /ARRAY/i ) {
 153        $self->warn("Must provide a valid arrayref for marker names");
 154        return;
 155    }
 156    my $num_sub_pops          = scalar @$populations;
 157
 158    if( $num_sub_pops < 2 ) {
 159        $self->warn("Must provide at least 2 populations for this test, you provided $num_sub_pops");
 160        return;
 161    }
 162
 163    # This code assumes that pop 1 contains at least one of all the
 164    # alleles - need to do some more work to insure that the complete
 165    # set of alleles is seen.
 166    my $Fst;
 167    my ($TS_sub1,$TS_sub2);
 168
 169    foreach my $marker ( @$markernames ) {
 170        # Get all the alleles from all the genotypes in all subpopulations
 171        my %allAlleles;
 172        foreach my $allele ( map { $_->get_Alleles() }
 173                             map { $_->get_Genotypes($marker) } @$populations ){
 174            $allAlleles{$allele}++;
 175        }
 176        my @alleles = keys %allAlleles;
 177
 178        foreach my $allele_name ( @alleles ) {
 179            my $avg_samp_size         = 0; # n-bar
 180            my $avg_allele_freq       = 0; # p-tilda-A-dot
 181
 182            my $total_samples_squared = 0; #
 183            my $sum_heterozygote      = 0;
 184
 185            my @marker_freqs;
 186
 187            # Walk through each population, get the calculated allele frequencies
 188            # for the marker, do some bookkeeping
 189
 190
 191            foreach my $pop ( @$populations ) {
 192                my $s = $pop->get_number_individuals($marker);
 193
 194                $avg_samp_size += $s;
 195                $total_samples_squared += $s**2;
 196
 197                my $markerobj = $pop->get_Marker($marker);
 198                if( ! defined $markerobj ) {
 199                    $self->warn("Could not derive Marker for $marker ".
 200                                "from population ". $pop->name);
 201                    return;
 202                }
 203
 204                my $freq_homozygotes =
 205                    $pop->get_Frequency_Homozygotes($marker,$allele_name);
 206                my %af = $markerobj->get_Allele_Frequencies();
 207                my $all_freq = ( ($af{$allele_name} || 0));
 208
 209                $avg_allele_freq += $s * $all_freq;
 210                $sum_heterozygote += (2 * $s)*( $all_freq - $freq_homozygotes);
 211
 212                push @marker_freqs, \%af;
 213            }
 214            my $total_samples =  $avg_samp_size; # sum of n over i sub-populations
 215            $avg_samp_size /= $num_sub_pops;
 216            $avg_allele_freq /= $total_samples;
 217
 218            # n-sub-c
 219            my $adj_samp_size = ( 1/ ($num_sub_pops - 1)) *
 220                ( $total_samples - ( $total_samples_squared/$total_samples));
 221
 222            my $variance              = 0; # s-squared-sub-A
 223            my $sum_variance          = 0;
 224            my $i = 0;           # we have cached the marker info
 225            foreach my $pop ( @$populations ) {
 226                my $s = $pop->get_number_individuals($marker);
 227                my %af = %{$marker_freqs[$i++]};
 228                $sum_variance += $s * (( ($af{$allele_name} || 0) -
 229                                         $avg_allele_freq)**2);
 230            }
 231            $variance = ( 1 / (( $num_sub_pops-1)*$avg_samp_size))*$sum_variance;
 232
 233            # H-tilda-A-dot
 234            my $freq_heterozygote = ($sum_heterozygote / $total_samples);
 235
 236            if( $self->haploid_status ) {
 237                # Haploid calculations
 238
 239                my $T_sub1 = $variance -
 240                    ( ( 1/($avg_samp_size-1))*
 241                      ( ($avg_allele_freq*(1-$avg_allele_freq))-
 242                        ( (($num_sub_pops-1)/$num_sub_pops)*$variance)));
 243                my $T_sub2 = ( (($adj_samp_size-1)/($avg_samp_size-1))*
 244                               $avg_allele_freq*(1-$avg_allele_freq) ) +
 245                               ( 1 + ( (($num_sub_pops-1)*
 246                                        ($avg_samp_size-$adj_samp_size))/
 247                                       ($avg_samp_size - 1))) *
 248                                       ($variance/$num_sub_pops);
 249
 250
 251                #to get total Fst from all alleles (if more than two) or all
 252                #loci (if more than one), we need to calculate $T_sub1 and
 253                #$T_sub2 for all alleles for all loci, sum, and then divide
 254                #again to get Fst.
 255                $TS_sub1 += $T_sub1;
 256                $TS_sub2 += $T_sub2;
 257
 258            } else {
 259                my $S_sub1 = $variance - ( (1/($avg_samp_size-1))*
 260                                           ( ($avg_allele_freq*
 261                                              (1-$avg_allele_freq)) -
 262                                             ((($num_sub_pops-1)/$num_sub_pops)*
 263                                              $variance)-0.25*$freq_heterozygote ) );
 264                my $S_sub2 = ($avg_allele_freq*(1-$avg_allele_freq)) -
 265                    ( ($avg_samp_size/($num_sub_pops*($avg_samp_size-1)))*
 266                      ( ((($num_sub_pops*($avg_samp_size- $adj_samp_size))/
 267                          $avg_samp_size)*$avg_allele_freq*
 268                         (1-$avg_allele_freq)) -
 269                        ( (1/$avg_samp_size)* (($avg_samp_size-1)+
 270                                               ($num_sub_pops-1)*
 271                                               ($avg_samp_size-
 272                                                $adj_samp_size) )*$variance ) -
 273                        ( (($num_sub_pops*($avg_samp_size-$adj_samp_size))/
 274                           (4*$avg_samp_size*$adj_samp_size))*
 275                          $freq_heterozygote ) ) );
 276
 277                my $S_sub3 = ($adj_samp_size/(2*$avg_samp_size))*
 278                    $freq_heterozygote;
 279
 280                #Again, to get the average over many alleles or many loci,
 281                #we will have to run the above for each and then sum the $S
 282                #variables and recalculate the F statistics
 283                $TS_sub1 += $S_sub1;
 284                $TS_sub2 += $S_sub2;
 285            }
 286        }
 287    }
 288    # $Fst_diploid = $S_sub1/$S_sub2;
 289    #my $Fit_diploid = 1 - ($S_sub3/$S_sub2);
 290    #my $Fis_diploid = ($Fit_diploid-$Fst_diploid)/(1-$Fst_diploid);
 291    $Fst = $TS_sub1 / $TS_sub2;
 292
 293    return $Fst;
 294 }
 295
 296 1;