scripts/popgen/bp_heterogeneity_test.pl

   1 #!/usr/bin/perl
   2 # -*-Perl-*- (for my emacs)
   3
   4 use strict;
   5 use warnings;
   6
   7 =head1 NAME
   8
   9 bp_heterogeneity_test - a test for distinguishing between selection and population expansion.
  10
  11 =head1 SYNOPSIS
  12
  13 heterogenetity_test -mut_1/--mutsyn synonymous_mut_count -mut_2/--mutnon nonsyn_mut_count -s/--smaplesize sample_size [-i/--iterations iterations] [-o/--observed observed_D] [-v/--verbose] [--silent ] [-m/--method tajimaD or fuD] [--precision]
  14
  15 =head2 DESCRIPTION
  16
  17 This is an implementation of the Heterogenetity test as described in
  18 Hahn MW, Rausher MD, and Cunningham CW. 2002. Genetics 161(1):11-20.
  19
  20 =head2 OPTIONS
  21
  22  Options in brackets above are optional
  23
  24  -s or --samplesize samplesize
  25  -mut_1 or --mutsyn synonymous mutation count
  26  -mut_2 or --mutnon nonsynonmous mutation count
  27  -i or --iterations number of iterations
  28  -o or --observed   observed D
  29  -m or --method     tajimaD or fuD  for Tajima's D or Fu and Li's D
  30  -v or --verbose    print out extra verbose messages
  31  --silent           Be extra quiet
  32  --precision        Level of precision - specify the number of digits
  33                    (default 4)
  34
  35 =head2 AUTHOR Matthew Hahn E<lt>matthew.hahn-at-duke.eduE<gt>
  36
  37 For more information contact:
  38
  39 Matthew Hahn, E<lt>matthew.hahn-at-duke.eduE<gt>
  40 Jason Stajich E<lt>jason-at-bioperl-dot-orgE<gt>
  41
  42 =cut
  43
  44 use Getopt::Long;
  45 use Bio::PopGen::Simulation::Coalescent;
  46 use Bio::PopGen::Statistics;
  47 use Bio::PopGen::Individual;
  48 use Bio::PopGen::Genotype;
  49
  50 my $sample_size = 4;
  51 my $mut_count_1 = 10; # synonymous
  52 my $mut_count_2 = 20; # non-synonymous
  53 my $iterations = 1;
  54 my $verbose = 0;
  55 my $observedD = undef;
  56 my $method = 'fuD';
  57 my $help = 0;
  58 my $precision = '4'; # Let's make the random precision between
  59                      # 0->1 to 1000th digits
  60
  61 GetOptions(
  62             's|samplesize|samp_size:i' => \$sample_size,
  63             'mut_1|mutsyn:i'           => \$mut_count_1,
  64             'mut_2|mutnon:i'           => \$mut_count_2,
  65             'i|iterations:i'           => \$iterations,
  66             'o|obsered|observedD:f'    => \$observedD,
  67             'v|verbose'                => \$verbose,
  68             'm|method:s'               => \$method,
  69             'h|help'                   => \$help,
  70             'silent'                   => sub { $verbose = -1; },
  71             'p|precision:i'            => \$precision,
  72             );
  73
  74 if( $help ) {
  75     system("perldoc",$0);
  76     exit(0);
  77 }
  78
  79 if( $method ne 'fuD' and $method ne 'tajimaD' ) {
  80     die("available methods are [fu and li's D] (fuD) and Tajima's D (tajimaD)");
  81 }
  82 my @D_distribution;
  83 printf("sample size is %d iteration count = %d\n", $sample_size,
  84        $iterations);
  85
  86 my $sim = new Bio::PopGen::Simulation::Coalescent
  87     (-sample_size => $sample_size);
  88
  89 for(my $iter = 0; $iter < $iterations; $iter++ ) {
  90     my $tree = $sim->next_tree($sample_size);
  91     my $f1 = 0;
  92     if( $mut_count_1 > 0 ) {
  93         $sim->add_Mutations($tree,$mut_count_1,$precision);
  94
  95         my @leaves = $tree->get_leaf_nodes;
  96         # the outgroup is just an individual with the ancestral state
  97         # (no mutations)
  98         my $outgroup = new Bio::PopGen::Individual();
  99         foreach my $m ( $leaves[0]->get_marker_names ) {
 100             $outgroup->add_Genotype(Bio::PopGen::Genotype->new
 101                                     (-marker_name=> $m,
 102                                      -alleles    => [ 0 ]));
 103         }
 104         if( $method eq 'fuD' ) {
 105             $f1 = Bio::PopGen::Statistics->fu_and_li_D(\@leaves,[$outgroup]);
 106         } elsif( $method eq 'tajimaD' ) {
 107             $f1 = Bio::PopGen::Statistics->tajima_D(\@leaves);
 108         }
 109         print "(mutation count = $mut_count_1) D=$f1\n"
 110             if( $verbose >= 0);
 111     }
 112
 113     my $f2 = 0;
 114     if( $mut_count_2 > 0 ) {
 115         $sim->add_Mutations($tree,$mut_count_2,$precision);
 116         my @leaves = $tree->get_leaf_nodes;
 117         # the outgroup is just an individual with the ancestral state
 118         # (no mutations)
 119         my $outgroup = new Bio::PopGen::Individual();
 120         foreach my $m ( $leaves[0]->get_marker_names ) {
 121             $outgroup->add_Genotype(Bio::PopGen::Genotype->new
 122                                     (-marker_name=> $m,
 123                                      -alleles    => [ 0 ]));
 124         }
 125         if( $method eq 'fuD' ) {
 126             $f2 = Bio::PopGen::Statistics->fu_and_li_D(\@leaves,[$outgroup]);
 127         } elsif( $method eq 'tajimaD' ) {
 128             $f2 = Bio::PopGen::Statistics->tajima_D(\@leaves);
 129         }
 130         print "(mutation count = $mut_count_2) D=$f2\n" if( $verbose >= 0);
 131
 132     }
 133     my $deltaD = ( $f1 - $f2 );
 134     push @D_distribution, $deltaD;
 135     if( $iter % 10 == 0 && $iter > 0 ) {
 136         print STDERR "iter = $iter\n";
 137     }
 138 }
 139
 140 if( defined $observedD && $iterations > 1 ) {
 141     my @sortedD = sort { $a <=> $b } @D_distribution;
 142     my $i;
 143     for($i = 0; $i < scalar @sortedD; $i++ ) {
 144         if( $sortedD[$i] > $observedD ) {
 145             last;
 146         }
 147     }
 148
 149     printf( "index %d value=%.4f out of %d total (obs=%.4f)\n",
 150             $i, $sortedD[$i], scalar @sortedD, $observedD);
 151 }
 152