1 # -*-Perl-*- Test Harness script for Bioperl
4 # This will outline many tests for the population genetics
5 # objects in the Bio::PopGen namespace
13 test_begin(-tests => 105);
16 use_ok('Bio::PopGen::Individual');
17 use_ok('Bio::PopGen::Genotype');
18 use_ok('Bio::PopGen::Population');
19 use_ok('Bio::PopGen::IO');
20 use_ok('Bio::PopGen::PopStats');
21 use_ok('Bio::AlignIO');
22 use_ok('Bio::PopGen::Statistics');
23 use_ok('Bio::PopGen::Utilities');
26 # test Fu and Li's D using data from the paper
29 Bio::PopGen::Statistics->fu_and_li_D_counts(24, 18, 9)),
33 Bio::PopGen::Statistics->fu_and_li_D_star_counts(24, 18, 10)),
37 Bio::PopGen::Statistics->fu_and_li_F_counts(24, 3.16, 18, 9)),
41 Bio::PopGen::Statistics->fu_and_li_F_star_counts(24, 3.16, 18, 10)),
44 my $FILE1 = test_output_file();
46 my @individuals = ( Bio::PopGen::Individual->new(-unique_id => '10a'));
49 my @genotypes = ( Bio::PopGen::Genotype->new(-marker_name => 'Mkr1',
50 -individual_id => '10a',
51 -alleles => [ qw(A a)]),
52 Bio::PopGen::Genotype->new(-marker_name => 'Mkr2',
53 -individual_id => '10a',
54 -alleles => [ qw(B B)]),
55 Bio::PopGen::Genotype->new(-marker_name => 'Mkr3',
56 -individual_id => '10a',
57 -alleles => [ qw(A a)]));
58 is(($genotypes[1]->get_Alleles)[0], 'B');
60 $individuals[0]->add_Genotype(@genotypes);
61 is($individuals[0]->get_Genotypes,3);
62 is($individuals[0]->get_Genotypes(-marker => 'Mkr3')->get_Alleles(),2);
63 my @alleles = $individuals[0]->get_Genotypes(-marker => 'Mkr2')->get_Alleles();
67 my $population = Bio::PopGen::Population->new(-name => 'TestPop1',
68 -source => 'testjasondata',
69 -description => 'throw away example',
70 -individuals => \@individuals);
72 is(scalar ($population->get_Individuals()), 1);
73 is($population->name, 'TestPop1');
74 is($population->source, 'testjasondata');
75 is($population->description, 'throw away example');
77 my @genotypes2 = ( Bio::PopGen::Genotype->new(-marker_name => 'Mkr1',
78 -individual_id => '11',
79 -alleles => [ qw(A A)]),
80 Bio::PopGen::Genotype->new(-marker_name => 'Mkr2',
81 -individual_id => '11',
82 -alleles => [ qw(B B)]),
83 Bio::PopGen::Genotype->new(-marker_name => 'Mkr3',
84 -individual_id => '11',
85 -alleles => [ qw(a a)]),
86 Bio::PopGen::Genotype->new(-marker_name => 'Mkr4',
87 -individual_id => '11',
88 -alleles => [ qw(C C)])
90 push @individuals, Bio::PopGen::Individual->new(-genotypes => \@genotypes2,
92 $population->add_Individual($individuals[1]);
94 is(scalar ($population->get_Individuals()), 2);
95 my ($found_ind) = $population->get_Individuals(-unique_id => '10a');
96 is($found_ind->unique_id, '10a');
97 is(scalar($population->get_Individuals(-marker => 'Mkr4')) , 1);
98 is(scalar($population->get_Individuals(-marker => 'Mkr3')) , 2);
100 my @g = $population->get_Genotypes(-marker => 'Mkr4');
102 is($g[0]->individual_id, '11');
103 is(($g[0]->get_Alleles())[0], 'C');
105 my $marker = $population->get_Marker('Mkr3');
108 is($marker->marker_coverage, 2);
109 @alleles = $marker->get_Alleles;
111 my %af = $marker->get_Allele_Frequencies();
115 $population->remove_Individuals('10a');
116 $marker = $population->get_Marker('Mkr3');
117 is($marker->marker_coverage, 1);
118 %af = $marker->get_Allele_Frequencies();
123 # Read in data from a file
124 my $io = Bio::PopGen::IO->new(-format => 'csv',
125 -file => test_input_file('popgen_saureus.dat'));
128 while( my $ind = $io->next_individual ) {
132 my @mrsainds = grep { $_->unique_id =~ /^MRSA/ } @inds;
133 my @mssainds = grep { $_->unique_id =~ /^MSSA/ } @inds;
134 my @envinds = grep { $_->unique_id =~ /^NC/ } @inds;
136 is(scalar @mrsainds, 9);
137 is(scalar @mssainds, 10);
138 is(scalar @envinds, 5);
140 my $mrsapop = Bio::PopGen::Population->new(-name => 'MRSA',
141 -description => 'Resistant S.aureus',
142 -individuals => \@mrsainds);
144 my $mssapop = Bio::PopGen::Population->new(-name => 'MSSA',
145 -description =>'Suceptible S.aureus',
146 -individuals => \@mssainds);
148 my $envpop = Bio::PopGen::Population->new(-name => 'NC',
149 -description => 'WT isolates',
150 -individuals => \@envinds);
152 my $stats = Bio::PopGen::PopStats->new(-haploid => 1);
153 my $fst = $stats->Fst([$mrsapop,$mssapop],[qw(AFLP1)]);
154 # We're going to check the values against other programs first
155 is(sprintf("%.3f",$fst),0.077,'mrsa,mssa aflp1');
157 $fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[qw(AFLP1 )]);
158 is(sprintf("%.3f",$fst),0.035,'all pops, aflp1');
160 $fst = $stats->Fst([$mrsapop,$envpop],[qw(AFLP1 AFLP2)]);
161 is(sprintf("%.3f",$fst),0.046,'mrsa,envpop aflp1,aflp2');
163 # Read in data from a file
164 $io = Bio::PopGen::IO->new(-format => 'csv',
165 -file => test_input_file('popgen_saureus.multidat'));
168 while( my $ind = $io->next_individual ) {
172 @mrsainds = grep { $_->unique_id =~ /^MRSA/ } @inds;
173 @mssainds = grep { $_->unique_id =~ /^MSSA/ } @inds;
174 @envinds = grep { $_->unique_id =~ /^NC/ } @inds;
176 is(scalar @mrsainds, 7);
177 is(scalar @mssainds, 10);
178 is(scalar @envinds, 5);
180 $mrsapop = Bio::PopGen::Population->new(-name => 'MRSA',
181 -description => 'Resistant S.aureus',
182 -individuals => \@mrsainds);
184 $mssapop = Bio::PopGen::Population->new(-name => 'MSSA',
185 -description =>'Suceptible S.aureus',
186 -individuals => \@mssainds);
188 $envpop = Bio::PopGen::Population->new(-name => 'NC',
189 -description => 'WT isolates',
190 -individuals => \@envinds);
192 $stats = Bio::PopGen::PopStats->new(-haploid => 1);
193 my @all_bands = map { 'B' . $_ } 1..20;
194 my @mkr1 = map { 'B' . $_ } 1..13;
195 my @mkr2 = map { 'B' . $_ } 14..20;
197 # still wrong ? seems to work now? --sendubala
198 $fst = $stats->Fst([$mrsapop,$mssapop],[@all_bands ]);
200 # local $TODO = 'Possible bug with Fst() output';
201 is(sprintf("%.3f",$fst),'-0.001','mssa,mrsa all_bands'); # We're going to check the values against other programs first
203 $fst = $stats->Fst([$envpop,$mssapop],[ @mkr1 ]);
204 is(sprintf("%.3f",$fst),0.023,'env,mssa mkr1'); # We're going to check the values against other programs first
206 $fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[ @all_bands ]);
207 is(sprintf("%.3f",$fst),0.071,'env,mssa,mrsa all bands'); # We're going to check the values against other programs first
209 $fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[ @mkr2 ]);
210 is(sprintf("%.3f",$fst),0.076, 'env,mssa,mrsa mkr2'); # We're going to check the values against other programs first
212 $fst = $stats->Fst([$mrsapop,$envpop],[@all_bands ]);
213 is(sprintf("%.3f",$fst),0.241,'mrsa,nc all_bands'); # We're going to check the values against other programs first
215 # test overall allele freq setting for a population
217 my $poptst1 = Bio::PopGen::Population->new(-name => 'tst1');
218 my $poptst2 = Bio::PopGen::Population->new(-name => 'tst2');
220 $poptst1->set_Allele_Frequency(-frequencies =>
221 { 'marker1' => { 'a' => '0.20',
223 'marker2' => { 'A' => '0.10',
228 my $mk1 = $poptst1->get_Marker('marker1');
229 my %f1 = $mk1->get_Allele_Frequencies;
230 is($f1{'a'}, '0.20');
231 is($f1{'A'}, '0.80');
232 my $mk2 = $poptst1->get_Marker('marker2');
233 my %f2 = $mk2->get_Allele_Frequencies;
234 is($f2{'C'}, '0.70');
236 $poptst2->set_Allele_Frequency(-name => 'marker1',
238 -frequency => '0.60');
239 $poptst2->set_Allele_Frequency(-name => 'marker1',
241 -frequency => '0.40');
244 # local $TODO = 'Fst not calculated yet for just allele freqs';
246 # #$fst = $stats->Fst([$poptst1,$poptst2],[qw(marker1 marker2) ]);
249 $io = Bio::PopGen::IO->new(-format => 'csv',
252 $io->write_individual(@inds);
255 $io = Bio::PopGen::IO->new(-format => 'csv',
258 $io->write_population(($mssapop,$mrsapop));
262 $io = Bio::PopGen::IO->new(-format => 'prettybase',
265 $io->write_individual(@inds);
269 $io = Bio::PopGen::IO->new(-format => 'prettybase',
272 $io->write_population(($mssapop,$mrsapop));
277 # Let's do PopGen::Statistics tests here
279 $io = Bio::PopGen::IO->new(-format => 'prettybase',
281 -file => test_input_file('popstats.prettybase'));
282 my (@ingroup,@outgroup);
284 while( my $ind = $io->next_individual ) {
285 if($ind->unique_id =~ /out/) {
286 push @outgroup, $ind;
289 $sitecount = scalar $ind->get_marker_names() unless defined $sitecount;
292 $stats = Bio::PopGen::Statistics->new();
294 # Real data and values courtesy M.Hahn and DNASP
296 is($stats->pi(\@ingroup),2);
297 is(Bio::PopGen::Statistics->pi(\@ingroup,$sitecount),0.4);
299 is(Bio::PopGen::Statistics->theta(\@ingroup),1.92);
300 is(Bio::PopGen::Statistics->theta(\@ingroup,$sitecount),0.384);
302 # Test with a population object
303 my $ingroup = Bio::PopGen::Population->new(-individuals => \@ingroup);
304 my $outgroup = Bio::PopGen::Population->new(-individuals => \@outgroup);
306 is($stats->pi($ingroup),2);
307 is(Bio::PopGen::Statistics->pi($ingroup,$sitecount),0.4);
309 is(Bio::PopGen::Statistics->theta($ingroup),1.92);
310 is(Bio::PopGen::Statistics->theta($ingroup,$sitecount),0.384);
312 my $haploidpop = $ingroup->haploid_population;
313 is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D($haploidpop)), 0.27345);
316 is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D(\@ingroup)),0.27345);
317 is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D($ingroup)),0.27345);
319 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D_star(\@ingroup)),
321 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D_star($ingroup)),
324 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup)),
326 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F_star($ingroup)),
329 is((Bio::PopGen::Statistics->derived_mutations(\@ingroup,\@outgroup))[0], 1);
330 is((Bio::PopGen::Statistics->derived_mutations($ingroup,\@outgroup))[0], 1);
331 is((Bio::PopGen::Statistics->derived_mutations(\@ingroup,$outgroup))[0], 1);
332 is((Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup))[0], 1);
334 # expect to have 1 external mutation
335 @ingroup = $haploidpop->get_Individuals;
337 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,1)),0.75653);
338 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,1)),0.75653);
340 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,
341 \@outgroup)),0.75653);
342 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,
343 \@outgroup)),0.75653);
344 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,
345 $outgroup)),0.75653);
346 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,
347 $outgroup)),0.75653);
349 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,1)),
351 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($haploidpop,1)),0.77499);
352 is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($ingroup,
353 \@outgroup)),0.77499);
354 is( sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($ingroup,
355 $outgroup)),0.77499);
360 $io = Bio::PopGen::IO->new(-format => 'prettybase',
361 -file => test_input_file('compLD_test.prettybase'));
363 my $pop = $io->next_population;
365 my %LD = $stats->composite_LD($pop);
367 is($LD{'01'}->{'02'}->[1], 10);
368 is($LD{'01'}->{'03'}->[1], 0);
369 is($LD{'02'}->{'03'}->[1], 0);
373 $io = Bio::PopGen::IO->new(-format => 'prettybase',
374 -file => test_input_file('compLD_missingtest.prettybase'));
376 $pop = $io->next_population;
378 %LD = $stats->composite_LD($pop);
380 is(sprintf("%.4f",$LD{'ProC9198EA'}->{'ProcR2973EA'}->[0]), -0.0375);
381 is(sprintf("%.2f",$LD{'ProC9198EA'}->{'ProcR2973EA'}->[1]), 2.56);
385 # build a population from an alignment
387 my $alnin = Bio::AlignIO->new(-format => 'clustalw',
388 -file => test_input_file('T7.aln'));
389 my $aln = $alnin->next_aln;
390 $population = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
391 is($population->get_number_individuals,9);
392 #warn($aln->match_line,"\n");
393 my $matchline = $aln->match_line;
394 is( $population->get_marker_names, $matchline =~ tr/ //);
395 for my $name ( $population->get_marker_names ) {
396 my $marker = $population->get_Marker($name);
397 # warn("$name ",join(" ",$marker->get_Alleles()),"\n");
401 # test Rich's phase and hap parsers
403 $io = Bio::PopGen::IO->new(-format => 'hapmap',
406 -starting_column => 10,
407 -file => test_input_file('example.hap'));
409 # Some IO might support reading in a population at a time
412 while( my $ind = $io->next_individual ) {
413 push @population, $ind;
416 is($population[3]->unique_id, 'NA06994');
417 is($population[3]->get_Genotypes, 34);
418 $population = Bio::PopGen::Population->new(-individuals => \@population);
420 is(sprintf("%.3f",$stats->pi($population)),12.266);
421 # if forced haploid population is called within pi
422 # need to decide about that...
423 # is(sprintf("%.3f",$stats->pi($population)),12.266);
425 is(sprintf("%.3f",$stats->theta($population)),5.548);
427 # local $TODO = 'May be TJd inconsistency, need to recalculate';
428 is(sprintf("%.3f",$stats->tajima_D($population)),'2.926');
429 is(sprintf("%.3f",$stats->tajima_D($population->haploid_population)),3.468);
432 # test converting from hapmap to phase
434 my $out = IO::String->new($string);
435 Bio::PopGen::IO->new(-fh => $out, -format => 'phase')->write_individual($population[0]);
438 P rs1022827 rs1323262 rs1359058 rs1475202 rs1570473 rs2025307 rs2025308 rs2296049 rs2296050 rs2296054 rs3739586 rs4400444 rs4584192 rs4740849 rs4740850 rs4740851 rs4740866 rs4742215 rs4742219 rs4742220 rs4742222 rs4742223 rs4742225 rs4742227 rs4742236 rs4742292 rs732118 rs732119 rs745876 rs745877 rs881684 rs912174 rs912175 rs962817
439 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
441 G C A G G A A A C T C T A C A A T G C A A C G G A A A T A T A A C C
442 G G T G G A A G G T C T G G G T T T T G G T T G G A A T A T A C G T
446 $io = Bio::PopGen::IO->new(-format => 'phase',
447 -file => test_input_file('example.phase'));
449 # Some IO might support reading in a population at a time
452 while( my $ind = $io->next_individual ) {
453 push @population, $ind;
457 # test writing in phase format
458 $population = Bio::PopGen::Population->new(-individuals => \@population);
460 $out = IO::String->new($string);
461 Bio::PopGen::IO->new(-fh => $out, -format => 'phase')->write_population($population);
464 P 1313 1500 2023 300 5635
481 my $in = Bio::PopGen::IO->new(-format=>"csv", -fh=>\*DATA);
482 my $pop = $in->next_population;
483 is(sprintf("%.3f",$stats->pi($pop)),0.833,'Pi on 3-allele data');
484 is(sprintf("%.3f",$stats->theta($pop)),0.545,'Theta on 3-allele data');