1 # -*-Perl-*- Test Harness script for Bioperl
11 -requires_modules => [qw(DB_File
16 use_ok('Bio::DB::Taxonomy');
17 use_ok('Bio::Tree::Tree');
# Directory handed to the flatfile database as -directory further down.
20 my $temp_dir = test_output_dir();
22 # we're actually testing Bio::Taxon and Bio::DB::Taxonomy::* here, not
25 ok my $db_entrez = Bio::DB::Taxonomy->new(-source => 'entrez');
26 isa_ok $db_entrez, 'Bio::DB::Taxonomy::entrez';
27 isa_ok $db_entrez, 'Bio::DB::Taxonomy';
# Flatfile database built from the bundled taxdump nodes/names files. No
# -directory is passed here; the index-cleanup checks that follow rely on that.
29 ok my $db_flatfile = Bio::DB::Taxonomy->new(
30 -source => 'flatfile',
31 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
32 -namesfile => test_input_file('taxdump', 'names.dmp'),
34 isa_ok $db_flatfile, 'Bio::DB::Taxonomy::flatfile';
35 isa_ok $db_flatfile, 'Bio::DB::Taxonomy';
37 # By not specifying a '-directory' argument, index files go to a temporary
38 # folder ($Bio::Root::IO::TEMPDIR, such as 'C:\Users\USER\AppData\Local\Temp'),
39 # and are implied to be temporary. So test the ability of flatfile->DESTROY to
40 # remove the temporary index files at object destruction (this also affects files
41 # in "test_output_dir()", since the folder is created inside the temporary folder)
42 no warnings qw(once); # silence 'Name "$Bio::Root::IO::TEMPDIR" used only once'
43 is $db_flatfile->{index_directory}, $Bio::Root::IO::TEMPDIR, 'removal of temporary index files: no -directory';
# DESTROY is called explicitly so the object's index_directory attribute can
# still be read when checking that each of the four index files is gone.
44 $db_flatfile->DESTROY;
45 ok not -e ($db_flatfile->{index_directory} . '/id2names');
46 ok not -e ($db_flatfile->{index_directory} . '/names2id');
47 ok not -e ($db_flatfile->{index_directory} . '/nodes');
48 ok not -e ($db_flatfile->{index_directory} . '/parents');
50 # Test removal of temporary index files from test_output_dir folder
51 # (since test_output_dir() =~ m/^$Bio::Root::IO::TEMPDIR/)
52 ok $db_flatfile = Bio::DB::Taxonomy->new(
53 -source => 'flatfile',
54 -directory => $temp_dir,
55 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
56 -namesfile => test_input_file('taxdump', 'names.dmp'),
59 is $db_flatfile->{index_directory}, $temp_dir, 'removal of temporary index files: test_output_dir()';
# Same four index-file checks as above, this time for an explicit -directory.
60 $db_flatfile->DESTROY;
61 ok not -e ($db_flatfile->{index_directory} . '/id2names');
62 ok not -e ($db_flatfile->{index_directory} . '/names2id');
63 ok not -e ($db_flatfile->{index_directory} . '/nodes');
64 ok not -e ($db_flatfile->{index_directory} . '/parents');
66 # Generate the object (and the files) again for the remaining tests
67 ok $db_flatfile = Bio::DB::Taxonomy->new(
68 -source => 'flatfile',
69 -directory => $temp_dir,
70 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
71 -namesfile => test_input_file('taxdump', 'names.dmp'),
# Run the same battery of Bio::Taxon / Bio::Tree checks against both the
# entrez and the flatfile database; entrez-only branches are selected by
# comparing $db with $db_entrez.
76 for my $db ($db_entrez, $db_flatfile) {
# Only the entrez pass is subject to the networking skip.
78 test_skip(-tests => 46, -requires_networking => 1) if $db eq $db_entrez;
81 if ($db eq $db_entrez) {
82 cmp_ok $db->get_num_taxa, '>', 880_000; # 886,907 as of 08-May-2012
84 is $db->get_num_taxa, 189;
# An exception from the first lookup is reported as an unreachable entrez
# server and skips the following checks instead of failing them.
87 eval { $id = $db->get_taxonid('Homo sapiens');};
88 skip "Unable to connect to entrez database; no network or server busy?", 38 if $@;
92 # easy test on human, try out the main Taxon methods
93 ok $n = $db->get_taxon(9606);
95 is $n->object_id, $n->id;
96 is $n->ncbi_taxid, $n->id;
97 is $n->parent_id, 9605;
98 is $n->rank, 'species';
100 is $n->node_name, 'Homo sapiens';
101 is $n->scientific_name, $n->node_name;
102 is ${$n->name('scientific')}[0], $n->node_name;
# Common names are collected as hash keys so membership can be tested with
# exists; keys() in cmp_ok's scalar context yields their count.
104 my %common_names = map { $_ => 1 } $n->common_names;
105 cmp_ok keys %common_names, '>=', 3, ref($db).": common names";
106 ok exists $common_names{human};
107 ok exists $common_names{man};
109 is $n->division, 'Primates';
110 is $n->genetic_code, 1;
111 is $n->mitochondrial_genetic_code, 2;
112 # these are entrez-only, data not available in dmp files
113 if ($db eq $db_entrez) {
114 ok defined $n->pub_date;
115 ok defined $n->create_date;
116 ok defined $n->update_date;
119 # briefly test some Bio::Tree::NodeI methods
120 ok my $ancestor = $n->ancestor;
121 is $ancestor->scientific_name, 'Homo';
122 # unless set explicitly, Bio::Taxon doesn't return anything for
123 # each_Descendent; must ask the database directly
124 ok my @children = $ancestor->db_handle->each_Descendent($ancestor);
125 cmp_ok @children, '>', 0;
# Pause on the entrez pass only -- presumably to throttle remote requests.
127 sleep(3) if $db eq $db_entrez;
129 # do some trickier things...
130 ok my $n2 = $db->get_Taxonomy_Node('89593');
131 is $n2->scientific_name, 'Craniata';
133 # briefly check we can use some Tree methods
134 my $tree = Bio::Tree::Tree->new();
135 is $tree->get_lca($n, $n2)->scientific_name, 'Craniata';
# The tree was created empty, so it reports no nodes of its own.
138 my @nodes = $tree->get_nodes;
139 is scalar(@nodes), 0;
141 @lineage_nodes = $tree->get_lineage_nodes($n->id); # read ID, only works if nodes have been added to tree
142 is scalar @lineage_nodes, 0;
143 @lineage_nodes = $tree->get_lineage_nodes($n); # node object always works
144 cmp_ok(scalar @lineage_nodes, '>', 20);
# Lineage strings use ';' as separator unless another one is passed in.
147 like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/);
148 like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/);
149 like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/);
151 # can we actually form a Tree and use other Tree methods?
152 ok $tree = Bio::Tree::Tree->new(-node => $n);
153 cmp_ok($tree->number_nodes, '>', 20);
154 cmp_ok(scalar($tree->get_nodes), '>', 20);
155 is $tree->find_node(-rank => 'genus')->scientific_name, 'Homo';
157 # check that getting the ancestor still works now we have explicitly set the
158 # ancestor by making a Tree
159 is $n->ancestor->scientific_name, 'Homo';
161 sleep(3) if $db eq $db_entrez;
163 ok $n = $db->get_Taxonomy_Node('1760');
164 is $n->scientific_name, 'Actinobacteria';
166 sleep(3) if $db eq $db_entrez;
168 # entrez isn't as good at searching as flatfile, so we have to special-case
# sort() without a comparator orders the IDs as strings, which is why
# 200795 precedes 32061 in the expected list.
169 my @ids = sort $db->get_taxonids('Chloroflexi');
171 is_deeply \@ids, [200795, 32061];
# Scalar-context call: a single ID (or undef) is expected back.
173 $id = $db->get_taxonids('Chloroflexi (class)');
174 $db eq $db_entrez ? is($id, undef) : is($id, 32061);
176 @ids = $db->get_taxonids('Rhodotorula');
177 cmp_ok @ids, '>=' , 2;
178 if ($db eq $db_entrez) {
179 ok grep { $_ == 592558 } @ids;
180 ok grep { $_ == 5533 } @ids;
182 # note the locally cached flatfile is out-of-date, but technically
183 # correct for testing purposes
184 ok grep { $_ == 266791 } @ids;
185 ok grep { $_ == 5533 } @ids;
191 # Test the list database
193 ok my $db_list = Bio::DB::Taxonomy->new(-source => 'list');
194 isa_ok $db_list, 'Bio::DB::Taxonomy::list';
195 isa_ok $db_list, 'Bio::DB::Taxonomy';
# Parallel arrays: $ranks[$i] is the rank expected for $h_lineage[$i].
197 my @ranks = qw(superkingdom class genus species);
198 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
199 ok $db_list = Bio::DB::Taxonomy->new(
201 -names => \@h_lineage,
204 is $db_list->get_num_taxa, 4;
# Every name of the lineage must resolve to a Bio::Taxon carrying its rank.
207 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
208 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
209 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
# Add a second lineage that differs from the first only in its species.
211 @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus');
212 $db_list->add_lineage(-names => \@h_lineage, -ranks => \@ranks);
214 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
215 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
216 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
# Tree over both species: 3 shared taxa plus the 2 species gives 5 nodes.
219 ok my $tree = $db_list->get_tree('Homo sapiens', 'Homo erectus');
220 isa_ok $tree, 'Bio::Tree::TreeI';
221 is $tree->number_nodes, 5;
222 is $tree->total_branch_length, 4;
223 ok my $node1 = $tree->find_node( -scientific_name => 'Homo sapiens' );
224 ok my $node2 = $tree->find_node( -scientific_name => 'Homo erectus' );
225 is $tree->distance($node1, $node2), 2;
# Fetch the same species from the list and the flatfile database to compare
# what each one knows about it.
227 ok my $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
228 ok my $h_flat = $db_flatfile->get_taxon(-name => 'Homo sapiens');
230 is $h_list->ancestor->scientific_name, 'Homo';
232 my @names = $h_list->common_names;
234 $h_list->common_names('woman');
235 @names = $h_list->common_names;
237 @names = $h_flat->common_names;
240 # you can switch to another database when you need more information, which also
241 # merges information in the node from the two different dbs
242 $h_list->db_handle($db_flatfile);
243 @names = $h_list->common_names;
246 # form a tree with the list lineage first, preventing a subsequent database
247 # change from giving us all those extra ranks
248 $h_list->db_handle($db_list);
249 my $ancestors_ancestor = $h_list->ancestor->ancestor;
250 is $ancestors_ancestor->scientific_name, 'Mammalia';
252 $tree = Bio::Tree::Tree->new(-node => $h_list);
253 $h_list->db_handle($db_flatfile);
254 $ancestors_ancestor = $h_list->ancestor->ancestor;
255 is $ancestors_ancestor->scientific_name, 'Mammalia';
257 # or we can get the flatfile database's idea of the ancestors by removing
258 # ourselves from the tree
259 is $h_flat->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
260 $h_list->ancestor(undef);
261 is $h_list->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
263 # get_lca should work on nodes from different databases
265 test_skip(-tests => 9, -requires_networking => 1);
267 # check that the result is the same as if we are retrieving from the same DB
# Baseline: both taxa come from the flatfile database.
269 $h_flat = $db_flatfile->get_taxon(-name => 'Homo');
270 my $h_flat2 = $db_flatfile->get_taxon(-name => 'Homo sapiens');
271 ok my $tree_functions = Bio::Tree::Tree->new();
272 is $tree_functions->get_lca($h_flat, $h_flat2)->scientific_name, 'Homo', 'get_lca() within flatfile db';
# Same pair from entrez; an exception from either lookup skips the rest.
276 eval { $h_entrez = $db_entrez->get_taxon(-name => 'Homo sapiens');};
277 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
279 eval { $h_entrez2 = $db_entrez->get_taxon(-name => 'Homo');};
280 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
281 ok $tree_functions = Bio::Tree::Tree->new();
282 is $tree_functions->get_lca($h_entrez, $h_entrez2)->scientific_name, 'Homo', 'get_lca() within entrez db';
284 ok $tree_functions = Bio::Tree::Tree->new();
285 # mixing entrez and flatfile
# $TODO marks the mixed-database check as a known, expected failure.
287 local $TODO = 'Mixing databases for get_lca() not working, see bug #3416';
288 is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo', 'get_lca() mixing flatfile and remote db';
290 # even though the species taxa for Homo sapiens from list and flat databases
291 # have the same internal id, get_lca won't work because they have different
292 # roots and descendents
# $h_flat was reassigned to the genus 'Homo' above, hence the comparison
# with the ancestor of the list species.
293 $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
294 is $h_list->ancestor->internal_id, $h_flat->internal_id;
295 ok ! $tree_functions->get_lca($h_flat, $h_list);
297 # but we can form a tree with the flat node then remove all the ranks we're
298 # not interested in and try again
299 $tree = Bio::Tree::Tree->new(-node => $h_flat);
300 $tree->splice(-keep_rank => \@ranks);
301 is $tree->get_lca($h_flat, $h_list)->scientific_name, 'Homo';
304 # ideas from taxonomy2tree.PLS that let us make nice tree, using
305 # Bio::Tree::TreeFunctionsI methods; this is a weird and trivial example just
306 # because our test flatfile database only has the full lineage of one species
# Each name is resolved to an NCBI taxon ID first, then to a taxon object.
308 for my $name ('Human', 'Hominidae') {
309 my $ncbi_id = $db_flatfile->get_taxonid($name);
311 my $node = $db_flatfile->get_taxon(-taxonid => $ncbi_id);
314 ok $tree->merge_lineage($node);
317 ok $tree = Bio::Tree::Tree->new(-node => $node);
# get_nodes is evaluated in the scalar context imposed by is().
321 is $tree->get_nodes, 30;
# After contracting linear paths only two node IDs are expected to remain.
322 $tree->contract_linear_paths;
323 my $ids = join(",", map { $_->id } $tree->get_nodes);
324 is $ids, '131567,9606';
326 # More thorough tests of merge_lineage
327 ok my $node = $db_list->get_taxon(-name => 'Eukaryota');
328 $tree = Bio::Tree::Tree->new(-node => $node);
329 ok $node = $db_list->get_taxon(-name => 'Homo erectus');
330 ok $tree->merge_lineage($node);
# Every taxon of the merged lineage must now be findable in the tree.
331 for my $name ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus') {
332 ok $node = $tree->find_node(-scientific_name => $name);
335 # we can recursively fetch all descendents of a taxon
337 test_skip(-tests => 1, -requires_networking => 1);
# Probe request: an exception here is taken as "entrez unreachable".
338 eval {$db_entrez->get_taxon(10090);};
339 skip "Unable to connect to entrez database; no network or server busy?", 1 if $@;
341 my $lca = $db_entrez->get_taxon(314146);
342 my @descs = $db_entrez->get_all_Descendents($lca);
343 cmp_ok @descs, '>=', 17;
# The lineage is written as one comma-separated string literal spanning
# several lines; splitting on /,\s+/ also absorbs the embedded line breaks.
# The name 'Anopheles' appears three times in this lineage.
347 $db_list = Bio::DB::Taxonomy->new(-source => 'list',
349 (split(/,\s+/, "cellular organisms, Eukaryota, Fungi/Metazoa group,
350 Metazoa, Eumetazoa, Bilateria, Coelomata, Protostomia, Panarthropoda,
351 Arthropoda, Mandibulata, Pancrustacea, Hexapoda, Insecta, Dicondylia,
352 Pterygota, Neoptera, Endopterygota, Diptera, Nematocera, Culicimorpha,
353 Culicoidea, Culicidae, Anophelinae, Anopheles, Anopheles, Angusticorn,
354 Anopheles, maculipennis group, maculipennis species complex, Anopheles daciae"))]);
356 my @taxonids = $db_list->get_taxonids('Anopheles');
357 is @taxonids, 3, 'List context';
# In scalar context a single ID comes back; it must be one of the IDs that
# the list-context call returned.
359 my $taxonid = $db_list->get_taxonids('Anopheles');
360 isa_ok \$taxonid, 'SCALAR', 'Scalar context';
361 ok exists { map({$_ => undef} @taxonids) }->{$taxonid};
363 # but we should still be able to merge in an incomplete lineage of a sister
364 # species and have the 'tree' remain consistent:
366 # missing 'no rank' Anopheles
367 $db_list->add_lineage(-names => [
368 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Angusticorn,
369 maculipennis group, maculipennis species complex, Anopheles labranchiae"))]);
370 $node = $db_list->get_taxon(-name => 'Anopheles labranchiae');
371 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
372 is $node->rank, undef;
374 # missing 'subgenus' Anopheles
375 $db_list->add_lineage(-names => [
376 (split(/,\s+/, "Anophelinae, Anopheles, Angusticorn, Anopheles,
377 maculipennis group, maculipennis species complex, Anopheles maculipennis"))]);
378 $node = $db_list->get_taxon(-name => 'Anopheles maculipennis');
379 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
381 # missing 'no rank' Angusticorn
382 $db_list->add_lineage(-names => [
383 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Anopheles,
384 maculipennis group, maculipennis species complex, Anopheles melanoon"))]);
385 $node = $db_list->get_taxon(-name => 'Anopheles melanoon');
386 is $node->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Angusticorn';
# Merging the incomplete lineages must not have created extra 'Anopheles' taxa.
388 @taxonids = $db_list->get_taxonids('Anopheles');
389 is scalar @taxonids, 3;
391 # bug: duplicate topmost taxa
392 $db_list = Bio::DB::Taxonomy->new( -source => 'list',
393 -names => ['Bacteria', 'Tenericutes'] );
394 $db_list->add_lineage( -names => ['Bacteria'] );
395 @taxonids = $db_list->get_taxonids('Bacteria');
396 is scalar @taxonids, 1;
398 # Disambiguate between taxa with same name using -names
# Both lineages contain 'f__Alteromonadaceae', but under different orders.
399 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' ), 'DB with ambiguous names';
400 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae', 'g__Spongiibacter'] );
401 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae', 'g__Alteromonas' ] );
403 ok @taxonids = $db_list->get_taxonids('f__Alteromonadaceae');
404 is scalar @taxonids, 2; # multiple taxa would match using $db_list->get_taxon(-name => 'f__Alteromonadaceae')
# Passing the full path via -names selects one of the two same-named taxa.
406 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae'] );
407 is $node->ancestor->node_name, 'o__Alteromonadales';
408 my $iid = $node->internal_id;
410 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae'] );
411 is $node->ancestor->node_name, 'o__Oceanospirillales';
412 isnt $node->internal_id, $iid;
415 # More tests with ambiguous names, internal IDs and multiple databases
416 my ($node3, $node4, $db_list_2);
417 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' );
418 ok $db_list->add_lineage( -names => [ 'o__Enterobacteriales', 'g__Escherichia' ] );
419 ok $db_list->add_lineage( -names => [ 'o__Pseudomonadales' , 'g__Pseudomonas' ] );
420 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
# NOTE(review): the lookup below uses 'k__Chroococcales' while the lineage was
# added as 'o__Chroococcales' -- confirm whether the mismatch is intentional.
421 ok $node1 = $db_list->get_taxon( -names => [ 'k__Chroococcales', 'g__Microcoleus' ] );
# Second, independent list database holding the same lineage.
423 ok $db_list_2 = Bio::DB::Taxonomy->new( -source => 'list' );
424 ok $db_list_2->add_lineage( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
425 ok $node2 = $db_list_2->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
427 is $node1->scientific_name, 'g__Microcoleus';
428 is $node2->scientific_name, 'g__Microcoleus'; # same taxon name
429 isnt $node1->id, $node2->id; # but different dbs and hence taxids
# Compare $node1 against $node2: comparing $node1 with itself always passes
# and checks nothing about the second database.
430 is $node1->internal_id, $node2->internal_id; # but same cross-database internal ID
432 ok $db_list->add_lineage( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
433 ok $db_list->add_lineage( -names => [ 'o__Acidobacteriales', 'g__Microcoleus' ] );
# Three same-named genera under different orders, all in $db_list.
435 ok $node1 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
436 ok $node2 = $db_list->get_taxon( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
437 ok $node3 = $db_list->get_taxon( -names => [ 'o__Acidobacteriales' , 'g__Microcoleus' ] );
438 my @nodes = ($node1, $node2, $node3);
# Count distinct values through a hash. A bare map() in the scalar context
# imposed by is() returns the number of generated elements, which is always
# 6 for three nodes (two elements each) whether or not the IDs differ.
my %distinct_taxid = map { ($_->id => 1) } @nodes;
my %distinct_iid = map { ($_->internal_id => 1) } @nodes;
440 is scalar(keys %distinct_taxid), 3; # 3 distinct taxids
441 is scalar(keys %distinct_iid), 3; # 3 distinct iids
# Re-adding an existing lineage must hand back the very same taxon.
443 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
444 ok $node2 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
445 is $node2->scientific_name, $node1->scientific_name;
446 is $node2->id, $node1->id;
447 is $node2->internal_id, $node1->internal_id;
# Network-only check: look up a handful of taxon names through entrez.
451 test_skip(-tests => 12, -requires_networking => 1);
453 my $db=Bio::DB::Taxonomy->new(-source=>"entrez");
455 my @taxa = qw(viruses Deltavirus unclassified plasmid);
457 for my $taxon (@taxa) {
458 test_taxid($db, $taxon);
# Presumably the body of test_taxid (its sub line is not visible here):
# takes a database handle and a taxon name, and runs three checks per name.
462 my ($db, $taxa) = @_;
463 my @taxonids = $db->get_taxonids($taxa);
464 cmp_ok(scalar(@taxonids), '>', 0, "Got IDs returned for $taxa:".join(',', @taxonids));
# Only the last returned ID is turned into a taxon object (pop).
466 lives_ok { $taxon = $db->get_taxon(-taxonid => pop @taxonids) } "IDs generates a Bio::Taxonomy::Node";
467 if (defined $taxon) {
# The name is interpolated into a case-insensitive regex.
468 like( $taxon->scientific_name, qr/$taxa/i, "Name returned matches $taxa");
# Forced failure so a missing taxon object is reported as a failed test.
470 ok(0, "No taxon object returned for $taxa");
# Exact-name lookups through entrez: each query is checked against the first
# ID returned.
# NOTE(review): the queries below go through $db_entrez, yet the skip is
# declared with -requires_networking => 0 and the locally created $db is not
# used in the visible lines -- confirm both against the full block.
477 test_skip( -tests => 12, -requires_networking => 0 );
479 my $db = Bio::DB::Taxonomy->new( -source => "entrez" );
481 # String | What I expect | What I get
482 # ---------------------- | ------------- | ----------
483 # 'Lissotriton vulgaris' | 8324 | 8324
484 # 'Chlorella vulgaris' | 3077 | 3077
485 # 'Phygadeuon solidus' | 1763951 | 1763951
486 # 'Ovatus' | 666060 | 666060
487 # 'Phygadeuon ovatus' | "No hit" | 666060
488 # 'Trimorus ovatus' | "No hit" | 666060
# @ids is declared once and reassigned for each query; redeclaring it with
# 'my' in the same scope triggers "masks earlier declaration" warnings.
490 my @ids = $db_entrez->get_taxonids('Lissotriton vulgaris');
491 is $ids[0], 8324, 'Correct: Lissotriton vulgaris';
492 @ids = $db_entrez->get_taxonids('Chlorella vulgaris');
493 is $ids[0], 3077, 'Correct: Chlorella vulgaris';
494 @ids = $db_entrez->get_taxonids('Phygadeuon solidus');
495 is $ids[0], 1763951, 'Correct: Phygadeuon solidus';
496 @ids = $db_entrez->get_taxonids('Ovatus');
497 is $ids[0], 666060, 'Correct: Ovatus';
498 @ids = $db_entrez->get_taxonids('Phygadeuon ovatus');
499 is $ids[0], 'No hit', 'Correct: No hit';
500 @ids = $db_entrez->get_taxonids('Trimorus ovatus');
501 is $ids[0], 'No hit', 'Correct: No hit';