1 # -*-Perl-*- Test Harness script for Bioperl
4 # This will outline many tests for the population genetics
5 # objects in the Bio::PopGen namespace
13 test_begin(-tests => 100);
15 use_ok('Bio::PopGen::Individual');
16 use_ok('Bio::PopGen::Genotype');
17 use_ok('Bio::PopGen::Population');
18 use_ok('Bio::PopGen::IO');
19 use_ok('Bio::PopGen::PopStats');
20 use_ok('Bio::AlignIO');
21 use_ok('Bio::PopGen::Statistics');
22 use_ok('Bio::PopGen::Utilities');
25 # test Fu and Li's D using data from the paper
28 Bio::PopGen::Statistics->fu_and_li_D_counts(24, 18, 9)),
32 Bio::PopGen::Statistics->fu_and_li_D_star_counts(24, 18, 10)),
36 Bio::PopGen::Statistics->fu_and_li_F_counts(24, 3.16, 18, 9)),
40 Bio::PopGen::Statistics->fu_and_li_F_star_counts(24, 3.16, 18, 10)),
43 my $FILE1 = test_output_file();
45 my @individuals = ( Bio::PopGen::Individual->new(-unique_id => '10a'));
48 my @genotypes = ( Bio::PopGen::Genotype->new(-marker_name => 'Mkr1',
49 -individual_id => '10a',
50 -alleles => [ qw(A a)]),
51 Bio::PopGen::Genotype->new(-marker_name => 'Mkr2',
52 -individual_id => '10a',
53 -alleles => [ qw(B B)]),
54 Bio::PopGen::Genotype->new(-marker_name => 'Mkr3',
55 -individual_id => '10a',
56 -alleles => [ qw(A a)]));
57 is(($genotypes[1]->get_Alleles)[0], 'B');
59 $individuals[0]->add_Genotype(@genotypes);
60 is($individuals[0]->get_Genotypes,3);
61 is($individuals[0]->get_Genotypes(-marker => 'Mkr3')->get_Alleles(),2);
62 my @alleles = $individuals[0]->get_Genotypes(-marker => 'Mkr2')->get_Alleles();
66 my $population = Bio::PopGen::Population->new(-name => 'TestPop1',
67 -source => 'testjasondata',
68 -description => 'throw away example',
69 -individuals => \@individuals);
71 is(scalar ($population->get_Individuals()), 1);
72 is($population->name, 'TestPop1');
73 is($population->source, 'testjasondata');
74 is($population->description, 'throw away example');
76 my @genotypes2 = ( Bio::PopGen::Genotype->new(-marker_name => 'Mkr1',
77 -individual_id => '11',
78 -alleles => [ qw(A A)]),
79 Bio::PopGen::Genotype->new(-marker_name => 'Mkr2',
80 -individual_id => '11',
81 -alleles => [ qw(B B)]),
82 Bio::PopGen::Genotype->new(-marker_name => 'Mkr3',
83 -individual_id => '11',
84 -alleles => [ qw(a a)]),
85 Bio::PopGen::Genotype->new(-marker_name => 'Mkr4',
86 -individual_id => '11',
87 -alleles => [ qw(C C)])
89 push @individuals, Bio::PopGen::Individual->new(-genotypes => \@genotypes2,
91 $population->add_Individual($individuals[1]);
93 is(scalar ($population->get_Individuals()), 2);
94 my ($found_ind) = $population->get_Individuals(-unique_id => '10a');
95 is($found_ind->unique_id, '10a');
96 is(scalar($population->get_Individuals(-marker => 'Mkr4')) , 1);
97 is(scalar($population->get_Individuals(-marker => 'Mkr3')) , 2);
99 my @g = $population->get_Genotypes(-marker => 'Mkr4');
101 is($g[0]->individual_id, '11');
102 is(($g[0]->get_Alleles())[0], 'C');
104 my $marker = $population->get_Marker('Mkr3');
107 @alleles = $marker->get_Alleles;
109 my %af = $marker->get_Allele_Frequencies();
113 $population->remove_Individuals('10a');
114 $marker = $population->get_Marker('Mkr3');
115 %af = $marker->get_Allele_Frequencies();
120 # Read in data from a file
121 my $io = Bio::PopGen::IO->new(-format => 'csv',
122 -file => test_input_file('popgen_saureus.dat'));
125 while( my $ind = $io->next_individual ) {
129 my @mrsainds = grep { $_->unique_id =~ /^MRSA/ } @inds;
130 my @mssainds = grep { $_->unique_id =~ /^MSSA/ } @inds;
131 my @envinds = grep { $_->unique_id =~ /^NC/ } @inds;
133 is(scalar @mrsainds, 9);
134 is(scalar @mssainds, 10);
135 is(scalar @envinds, 5);
137 my $mrsapop = Bio::PopGen::Population->new(-name => 'MRSA',
138 -description => 'Resistant S.aureus',
139 -individuals => \@mrsainds);
141 my $mssapop = Bio::PopGen::Population->new(-name => 'MSSA',
142 -description =>'Suceptible S.aureus',
143 -individuals => \@mssainds);
145 my $envpop = Bio::PopGen::Population->new(-name => 'NC',
146 -description => 'WT isolates',
147 -individuals => \@envinds);
149 my $stats = Bio::PopGen::PopStats->new(-haploid => 1);
150 my $fst = $stats->Fst([$mrsapop,$mssapop],[qw(AFLP1)]);
151 # We're going to check the values against other programs first
152 is(sprintf("%.3f",$fst),0.077,'mrsa,mssa aflp1');
154 $fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[qw(AFLP1 )]);
155 is(sprintf("%.3f",$fst),0.035,'all pops, aflp1');
157 $fst = $stats->Fst([$mrsapop,$envpop],[qw(AFLP1 AFLP2)]);
158 is(sprintf("%.3f",$fst),0.046,'mrsa,envpop aflp1,aflp2');
160 # Read in data from a file
161 $io = Bio::PopGen::IO->new(-format => 'csv',
162 -file => test_input_file('popgen_saureus.multidat'));
165 while( my $ind = $io->next_individual ) {
169 @mrsainds = grep { $_->unique_id =~ /^MRSA/ } @inds;
170 @mssainds = grep { $_->unique_id =~ /^MSSA/ } @inds;
171 @envinds = grep { $_->unique_id =~ /^NC/ } @inds;
173 is(scalar @mrsainds, 7);
174 is(scalar @mssainds, 10);
175 is(scalar @envinds, 5);
177 $mrsapop = Bio::PopGen::Population->new(-name => 'MRSA',
178 -description => 'Resistant S.aureus',
179 -individuals => \@mrsainds);
181 $mssapop = Bio::PopGen::Population->new(-name => 'MSSA',
182 -description =>'Suceptible S.aureus',
183 -individuals => \@mssainds);
185 $envpop = Bio::PopGen::Population->new(-name => 'NC',
186 -description => 'WT isolates',
187 -individuals => \@envinds);
189 $stats = Bio::PopGen::PopStats->new(-haploid => 1);
190 my @all_bands = map { 'B' . $_ } 1..20;
191 my @mkr1 = map { 'B' . $_ } 1..13;
192 my @mkr2 = map { 'B' . $_ } 14..20;
194 # still wrong ? seems to work now? --sendubala
195 $fst = $stats->Fst([$mrsapop,$mssapop],[@all_bands ]);
197 # local $TODO = 'Possible bug with Fst() output';
198 is(sprintf("%.3f",$fst),'-0.001','mssa,mrsa all_bands'); # We're going to check the values against other programs first
200 $fst = $stats->Fst([$envpop,$mssapop],[ @mkr1 ]);
201 is(sprintf("%.3f",$fst),0.023,'env,mssa mkr1'); # We're going to check the values against other programs first
203 $fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[ @all_bands ]);
204 is(sprintf("%.3f",$fst),0.071,'env,mssa,mrsa all bands'); # We're going to check the values against other programs first
206 $fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[ @mkr2 ]);
207 is(sprintf("%.3f",$fst),0.076, 'env,mssa,mrsa mkr2'); # We're going to check the values against other programs first
209 $fst = $stats->Fst([$mrsapop,$envpop],[@all_bands ]);
210 is(sprintf("%.3f",$fst),0.241,'mrsa,nc all_bands'); # We're going to check the values against other programs first
212 # test overall allele freq setting for a population
214 my $poptst1 = Bio::PopGen::Population->new(-name => 'tst1');
215 my $poptst2 = Bio::PopGen::Population->new(-name => 'tst2');
217 $poptst1->set_Allele_Frequency(-frequencies =>
218 { 'marker1' => { 'a' => '0.20',
220 'marker2' => { 'A' => '0.10',
225 my $mk1 = $poptst1->get_Marker('marker1');
226 my %f1 = $mk1->get_Allele_Frequencies;
227 is($f1{'a'}, '0.20');
228 is($f1{'A'}, '0.80');
229 my $mk2 = $poptst1->get_Marker('marker2');
230 my %f2 = $mk2->get_Allele_Frequencies;
231 is($f2{'C'}, '0.70');
233 $poptst2->set_Allele_Frequency(-name => 'marker1',
235 -frequency => '0.60');
236 $poptst2->set_Allele_Frequency(-name => 'marker1',
238 -frequency => '0.40');
241 # local $TODO = 'Fst not calculated yet for just allele freqs';
243 # #$fst = $stats->Fst([$poptst1,$poptst2],[qw(marker1 marker2) ]);
246 $io = Bio::PopGen::IO->new(-format => 'csv',
249 $io->write_individual(@inds);
252 $io = Bio::PopGen::IO->new(-format => 'csv',
255 $io->write_population(($mssapop,$mrsapop));
259 $io = Bio::PopGen::IO->new(-format => 'prettybase',
262 $io->write_individual(@inds);
266 $io = Bio::PopGen::IO->new(-format => 'prettybase',
269 $io->write_population(($mssapop,$mrsapop));
274 # Let's do PopGen::Statistics tests here
276 $io = Bio::PopGen::IO->new(-format => 'prettybase',
278 -file => test_input_file('popstats.prettybase'));
279 my (@ingroup,@outgroup);
281 while( my $ind = $io->next_individual ) {
282 if($ind->unique_id =~ /out/) {
283 push @outgroup, $ind;
286 $sitecount = scalar $ind->get_marker_names() unless defined $sitecount;
289 $stats = Bio::PopGen::Statistics->new();
291 # Real data and values courtesy M.Hahn and DNASP
293 is($stats->pi(\@ingroup),2);
294 is(Bio::PopGen::Statistics->pi(\@ingroup,$sitecount),0.4);
296 is(Bio::PopGen::Statistics->theta(\@ingroup),1.92);
297 is(Bio::PopGen::Statistics->theta(\@ingroup,$sitecount),0.384);
299 # Test with a population object
300 my $ingroup = Bio::PopGen::Population->new(-individuals => \@ingroup);
301 my $outgroup = Bio::PopGen::Population->new(-individuals => \@outgroup);
303 is($stats->pi($ingroup),2);
304 is(Bio::PopGen::Statistics->pi($ingroup,$sitecount),0.4);
306 is(Bio::PopGen::Statistics->theta($ingroup),1.92);
307 is(Bio::PopGen::Statistics->theta($ingroup,$sitecount),0.384);
309 my $haploidpop = $ingroup->haploid_population;
310 is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D($haploidpop)), 0.27345);
313 is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D(\@ingroup)),0.27345);
314 is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D($ingroup)),0.27345);
316 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D_star(\@ingroup)),
318 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D_star($ingroup)),
321 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup)),
323 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F_star($ingroup)),
326 is((Bio::PopGen::Statistics->derived_mutations(\@ingroup,\@outgroup))[0], 1);
327 is((Bio::PopGen::Statistics->derived_mutations($ingroup,\@outgroup))[0], 1);
328 is((Bio::PopGen::Statistics->derived_mutations(\@ingroup,$outgroup))[0], 1);
329 is((Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup))[0], 1);
331 # expect to have 1 external mutation
332 @ingroup = $haploidpop->get_Individuals;
334 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,1)),0.75653);
335 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,1)),0.75653);
337 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,
338 \@outgroup)),0.75653);
339 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,
340 \@outgroup)),0.75653);
341 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,
342 $outgroup)),0.75653);
343 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,
344 $outgroup)),0.75653);
346 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,1)),
348 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($haploidpop,1)),0.77499);
349 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($ingroup,
350 \@outgroup)),0.77499);
351 is( sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($ingroup,
352 $outgroup)),0.77499);
357 $io = Bio::PopGen::IO->new(-format => 'prettybase',
358 -file => test_input_file('compLD_test.prettybase'));
360 my $pop = $io->next_population;
362 my %LD = $stats->composite_LD($pop);
364 is($LD{'01'}->{'02'}->[1], 10);
365 is($LD{'01'}->{'03'}->[1], 0);
366 is($LD{'02'}->{'03'}->[1], 0);
370 $io = Bio::PopGen::IO->new(-format => 'prettybase',
371 -file => test_input_file('compLD_missingtest.prettybase'));
373 $pop = $io->next_population;
375 %LD = $stats->composite_LD($pop);
377 is(sprintf("%.4f",$LD{'ProC9198EA'}->{'ProcR2973EA'}->[0]), -0.0375);
378 is(sprintf("%.2f",$LD{'ProC9198EA'}->{'ProcR2973EA'}->[1]), 2.56);
382 # build a population from an alignment
384 my $alnin = Bio::AlignIO->new(-format => 'clustalw',
385 -file => test_input_file('T7.aln'));
386 my $aln = $alnin->next_aln;
387 $population = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
388 is($population->get_number_individuals,9);
389 #warn($aln->match_line,"\n");
390 my $matchline = $aln->match_line;
391 is( $population->get_marker_names, $matchline =~ tr/ //);
392 for my $name ( $population->get_marker_names ) {
393 my $marker = $population->get_Marker($name);
394 # warn("$name ",join(" ",$marker->get_Alleles()),"\n");
398 # test Rich's phase and hap parsers
400 $io = Bio::PopGen::IO->new(-format => 'hapmap',
403 -starting_column => 10,
404 -file => test_input_file('example.hap'));
406 # Some IO might support reading in a population at a time
409 while( my $ind = $io->next_individual ) {
410 push @population, $ind;
413 is($population[3]->unique_id, 'NA06994');
414 is($population[3]->get_Genotypes, 34);
415 $population = Bio::PopGen::Population->new(-individuals => \@population);
417 is(sprintf("%.3f",$stats->pi($population)),12.266);
418 # if forced haploid population is called within pi
419 # need to decide about that...
420 # is(sprintf("%.3f",$stats->pi($population)),12.266);
422 is(sprintf("%.3f",$stats->theta($population)),5.548);
424 # local $TODO = 'May be TJd inconsistency, need to recalculate';
425 is(sprintf("%.3f",$stats->tajima_D($population)),'2.926');
426 is(sprintf("%.3f",$stats->tajima_D($population->haploid_population)),3.468);
428 $io = Bio::PopGen::IO->new(-format => 'phase',
429 -file => test_input_file('example.phase'));
431 # Some IO might support reading in a population at a time
434 while( my $ind = $io->next_individual ) {
435 push @population, $ind;
444 my $in = Bio::PopGen::IO->new(-format=>"csv", -fh=>\*DATA);
445 my $pop = $in->next_population;
446 is(sprintf("%.3f",$stats->pi($pop)),0.833,'Pi on 3-allele data');
447 is(sprintf("%.3f",$stats->theta($pop)),0.545,'Theta on 3-allele data');