1 # -*-Perl-*- Test Harness script for Bioperl
12 -requires_modules => [qw(DB_File
17 use_ok('Bio::DB::Taxonomy');
18 use_ok('Bio::Tree::Tree');
21 my $temp_dir = test_output_dir();
23 # we're actually testing Bio::Taxon and Bio::DB::Taxonomy::* here, not
26 ok my $db_entrez = Bio::DB::Taxonomy->new(-source => 'entrez');
27 isa_ok $db_entrez, 'Bio::DB::Taxonomy::entrez';
28 isa_ok $db_entrez, 'Bio::DB::Taxonomy';
30 ok my $db_flatfile = Bio::DB::Taxonomy->new(
31 -source => 'flatfile',
32 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
33 -namesfile => test_input_file('taxdump', 'names.dmp'),
35 isa_ok $db_flatfile, 'Bio::DB::Taxonomy::flatfile';
36 isa_ok $db_flatfile, 'Bio::DB::Taxonomy';
38 # By not specifying a '-directory' argument, index files go to a temporary
39 # folder ($Bio::Root::IO::TEMPDIR, such as 'C:\Users\USER\AppData\Local\Temp'),
40 # and are implied to be temporary. So test the ability of flatfile->DESTROY to
41 # remove the temporary index files at object destruction (this also affects files
42 # in "test_output_dir()", since the folder is created inside the temporary folder)
43 no warnings qw(once); # silence 'Name "$Bio::Root::IO::TEMPDIR" used only once'
44 is $db_flatfile->{index_directory}, $Bio::Root::IO::TEMPDIR, 'removal of temporary index files: no -directory';
45 $db_flatfile->DESTROY;
46 ok not -e ($db_flatfile->{index_directory} . '/id2names');
47 ok not -e ($db_flatfile->{index_directory} . '/names2id');
48 ok not -e ($db_flatfile->{index_directory} . '/nodes');
49 ok not -e ($db_flatfile->{index_directory} . '/parents');
51 # Test removal of temporary index files from test_output_dir folder
52 # (since test_output_dir() =~ m/^$Bio::Root::IO::TEMPDIR/)
53 ok $db_flatfile = Bio::DB::Taxonomy->new(
54 -source => 'flatfile',
55 -directory => $temp_dir,
56 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
57 -namesfile => test_input_file('taxdump', 'names.dmp'),
60 is $db_flatfile->{index_directory}, $temp_dir, 'removal of temporary index files: test_output_dir()';
61 $db_flatfile->DESTROY;
62 ok not -e ($db_flatfile->{index_directory} . '/id2names');
63 ok not -e ($db_flatfile->{index_directory} . '/names2id');
64 ok not -e ($db_flatfile->{index_directory} . '/nodes');
65 ok not -e ($db_flatfile->{index_directory} . '/parents');
67 # Generate the object (and the files) again for the remaining tests
68 ok $db_flatfile = Bio::DB::Taxonomy->new(
69 -source => 'flatfile',
70 -directory => $temp_dir,
71 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
72 -namesfile => test_input_file('taxdump', 'names.dmp'),
77 for my $db ($db_entrez, $db_flatfile) {
79 test_skip(-tests => 46, -requires_networking => 1) if $db eq $db_entrez;
82 if ($db eq $db_entrez) {
83 cmp_ok $db->get_num_taxa, '>', 880_000; # 886,907 as of 08-May-2012
85 is $db->get_num_taxa, 189;
88 eval { $id = $db->get_taxonid('Homo sapiens');};
89 skip "Unable to connect to entrez database; no network or server busy?", 38 if $@;
93 # easy test on human, try out the main Taxon methods
94 ok $n = $db->get_taxon(9606);
96 is $n->object_id, $n->id;
97 is $n->ncbi_taxid, $n->id;
98 is $n->parent_id, 9605;
99 is $n->rank, 'species';
101 is $n->node_name, 'Homo sapiens';
102 is $n->scientific_name, $n->node_name;
103 is ${$n->name('scientific')}[0], $n->node_name;
105 my %common_names = map { $_ => 1 } $n->common_names;
106 cmp_ok keys %common_names, '>=', 3, ref($db).": common names";
107 ok exists $common_names{human};
108 ok exists $common_names{man};
110 is $n->division, 'Primates';
111 is $n->genetic_code, 1;
112 is $n->mitochondrial_genetic_code, 2;
113 # these are entrez-only, data not available in dmp files
114 if ($db eq $db_entrez) {
115 ok defined $n->pub_date;
116 ok defined $n->create_date;
117 ok defined $n->update_date;
120 # briefly test some Bio::Tree::NodeI methods
121 ok my $ancestor = $n->ancestor;
122 is $ancestor->scientific_name, 'Homo';
123 # unless set explicitly, Bio::Taxon doesn't return anything for
124 # each_Descendent; must ask the database directly
125 ok my @children = $ancestor->db_handle->each_Descendent($ancestor);
126 cmp_ok @children, '>', 0;
128 sleep(3) if $db eq $db_entrez;
130 # do some trickier things...
131 ok my $n2 = $db->get_Taxonomy_Node('89593');
132 is $n2->scientific_name, 'Craniata';
134 # briefly check we can use some Tree methods
135 my $tree = Bio::Tree::Tree->new();
136 is $tree->get_lca($n, $n2)->scientific_name, 'Craniata';
139 my @nodes = $tree->get_nodes;
140 is scalar(@nodes), 0;
142 @lineage_nodes = $tree->get_lineage_nodes($n->id); # read ID, only works if nodes have been added to tree
143 is scalar @lineage_nodes, 0;
144 @lineage_nodes = $tree->get_lineage_nodes($n); # node object always works
145 cmp_ok(scalar @lineage_nodes, '>', 20);
148 like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/);
149 like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/);
150 like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/);
152 # can we actually form a Tree and use other Tree methods?
153 ok $tree = Bio::Tree::Tree->new(-node => $n);
154 cmp_ok($tree->number_nodes, '>', 20);
155 cmp_ok(scalar($tree->get_nodes), '>', 20);
156 is $tree->find_node(-rank => 'genus')->scientific_name, 'Homo';
158 # check that getting the ancestor still works now we have explitly set the
159 # ancestor by making a Tree
160 is $n->ancestor->scientific_name, 'Homo';
162 sleep(3) if $db eq $db_entrez;
164 ok $n = $db->get_Taxonomy_Node('1760');
165 is $n->scientific_name, 'Actinobacteria';
167 sleep(3) if $db eq $db_entrez;
169 # entrez isn't as good at searching as flatfile, so we have to special-case
170 my @ids = sort $db->get_taxonids('Chloroflexi');
172 is_deeply \@ids, [200795, 32061];
174 $id = $db->get_taxonids('Chloroflexi (class)');
175 $db eq $db_entrez ? is($id, undef) : is($id, 32061);
177 @ids = $db->get_taxonids('Rhodotorula');
178 cmp_ok @ids, '>=' , 2;
179 if ($db eq $db_entrez) {
180 ok grep { $_ == 592558 } @ids;
181 ok grep { $_ == 5533 } @ids;
183 # note the locally cached flatfile is out-of-date, but technically
184 # correct for testing purposes
185 ok grep { $_ == 266791 } @ids;
186 ok grep { $_ == 5533 } @ids;
192 # Test the list database
194 ok my $db_list = Bio::DB::Taxonomy->new(-source => 'list');
195 isa_ok $db_list, 'Bio::DB::Taxonomy::list';
196 isa_ok $db_list, 'Bio::DB::Taxonomy';
198 my @ranks = qw(superkingdom class genus species);
199 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
200 ok $db_list = Bio::DB::Taxonomy->new(
202 -names => \@h_lineage,
205 is $db_list->get_num_taxa, 4;
208 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
209 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
210 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
212 @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus');
213 $db_list->add_lineage(-names => \@h_lineage, -ranks => \@ranks);
215 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
216 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
217 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
220 ok my $tree = $db_list->get_tree('Homo sapiens', 'Homo erectus');
221 isa_ok $tree, 'Bio::Tree::TreeI';
222 is $tree->number_nodes, 5;
223 is $tree->total_branch_length, 4;
224 ok my $node1 = $tree->find_node( -scientific_name => 'Homo sapiens' );
225 ok my $node2 = $tree->find_node( -scientific_name => 'Homo erectus' );
226 is $tree->distance($node1, $node2), 2;
228 ok my $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
229 ok my $h_flat = $db_flatfile->get_taxon(-name => 'Homo sapiens');
231 is $h_list->ancestor->scientific_name, 'Homo';
233 my @names = $h_list->common_names;
235 $h_list->common_names('woman');
236 @names = $h_list->common_names;
238 @names = $h_flat->common_names;
241 # you can switch to another database when you need more information, which also
242 # merges information in the node from the two different dbs
243 $h_list->db_handle($db_flatfile);
244 @names = $h_list->common_names;
247 # form a tree with the list lineage first, preventing a subsequent database
248 # change from giving us all those extra ranks
249 $h_list->db_handle($db_list);
250 my $ancestors_ancestor = $h_list->ancestor->ancestor;
251 is $ancestors_ancestor->scientific_name, 'Mammalia';
253 $tree = Bio::Tree::Tree->new(-node => $h_list);
254 $h_list->db_handle($db_flatfile);
255 $ancestors_ancestor = $h_list->ancestor->ancestor;
256 is $ancestors_ancestor->scientific_name, 'Mammalia';
258 # or we can get the flatfile database's idea of the ancestors by removing
259 # ourselves from the tree
260 is $h_flat->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
261 $h_list->ancestor(undef);
262 is $h_list->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
264 # get_lca should work on nodes from different databases
266 test_skip(-tests => 9, -requires_networking => 1);
268 # check that the result is the same as if we are retrieving from the same DB
270 $h_flat = $db_flatfile->get_taxon(-name => 'Homo');
271 my $h_flat2 = $db_flatfile->get_taxon(-name => 'Homo sapiens');
272 ok my $tree_functions = Bio::Tree::Tree->new();
273 is $tree_functions->get_lca($h_flat, $h_flat2)->scientific_name, 'Homo', 'get_lca() within flatfile db';
277 eval { $h_entrez = $db_entrez->get_taxon(-name => 'Homo sapiens');};
278 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
280 eval { $h_entrez2 = $db_entrez->get_taxon(-name => 'Homo');};
281 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
282 ok $tree_functions = Bio::Tree::Tree->new();
283 is $tree_functions->get_lca($h_entrez, $h_entrez2)->scientific_name, 'Homo', 'get_lca() within entrez db';
285 ok $tree_functions = Bio::Tree::Tree->new();
286 # mixing entrez and flatfile
288 local $TODO = 'Mixing databases for get_lca() not working, see bug #3416';
289 is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo', 'get_lca() mixing flatfile and remote db';
291 # even though the species taxa for Homo sapiens from list and flat databases
292 # have the same internal id, get_lca won't work because they have different
293 # roots and descendents
294 $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
295 is $h_list->ancestor->internal_id, $h_flat->internal_id;
296 ok ! $tree_functions->get_lca($h_flat, $h_list);
298 # but we can form a tree with the flat node then remove all the ranks we're
299 # not interested in and try again
300 $tree = Bio::Tree::Tree->new(-node => $h_flat);
301 $tree->splice(-keep_rank => \@ranks);
302 is $tree->get_lca($h_flat, $h_list)->scientific_name, 'Homo';
305 # ideas from taxonomy2tree.PLS that let us make nice tree, using
306 # Bio::Tree::TreeFunctionsI methods; this is a weird and trivial example just
307 # because our test flatfile database only has the full lineage of one species
309 for my $name ('Human', 'Hominidae') {
310 my $ncbi_id = $db_flatfile->get_taxonid($name);
312 my $node = $db_flatfile->get_taxon(-taxonid => $ncbi_id);
315 ok $tree->merge_lineage($node);
318 ok $tree = Bio::Tree::Tree->new(-node => $node);
322 is $tree->get_nodes, 30;
323 $tree->contract_linear_paths;
324 my $ids = join(",", map { $_->id } $tree->get_nodes);
325 is $ids, '131567,9606';
327 # More thorough tests of merge_lineage
328 ok my $node = $db_list->get_taxon(-name => 'Eukaryota');
329 $tree = Bio::Tree::Tree->new(-node => $node);
330 ok $node = $db_list->get_taxon(-name => 'Homo erectus');
331 ok $tree->merge_lineage($node);
332 for my $name ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus') {
333 ok $node = $tree->find_node(-scientific_name => $name);
336 # we can recursively fetch all descendents of a taxon
338 test_skip(-tests => 1, -requires_networking => 1);
339 eval {$db_entrez->get_taxon(10090);};
340 skip "Unable to connect to entrez database; no network or server busy?", 1 if $@;
342 my $lca = $db_entrez->get_taxon(314146);
343 my @descs = $db_entrez->get_all_Descendents($lca);
344 cmp_ok @descs, '>=', 17;
348 $db_list = Bio::DB::Taxonomy->new(-source => 'list',
350 (split(/,\s+/, "cellular organisms, Eukaryota, Fungi/Metazoa group,
351 Metazoa, Eumetazoa, Bilateria, Coelomata, Protostomia, Panarthropoda,
352 Arthropoda, Mandibulata, Pancrustacea, Hexapoda, Insecta, Dicondylia,
353 Pterygota, Neoptera, Endopterygota, Diptera, Nematocera, Culicimorpha,
354 Culicoidea, Culicidae, Anophelinae, Anopheles, Anopheles, Angusticorn,
355 Anopheles, maculipennis group, maculipennis species complex, Anopheles daciae"))]);
357 my @taxonids = $db_list->get_taxonids('Anopheles');
358 is @taxonids, 3, 'List context';
360 my $taxonid = $db_list->get_taxonids('Anopheles');
361 isa_ok \$taxonid, 'SCALAR', 'Scalar context';
362 ok exists { map({$_ => undef} @taxonids) }->{$taxonid};
364 # but we should still be able to merge in an incomplete lineage of a sister
365 # species and have the 'tree' remain consistent:
367 # missing 'no rank' Anopheles
368 $db_list->add_lineage(-names => [
369 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Angusticorn,
370 maculipennis group, maculipennis species complex, Anopheles labranchiae"))]);
371 $node = $db_list->get_taxon(-name => 'Anopheles labranchiae');
372 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
373 is $node->rank, undef;
375 # missing 'subgenus' Anopheles
376 $db_list->add_lineage(-names => [
377 (split(/,\s+/, "Anophelinae, Anopheles, Angusticorn, Anopheles,
378 maculipennis group, maculipennis species complex, Anopheles maculipennis"))]);
379 $node = $db_list->get_taxon(-name => 'Anopheles maculipennis');
380 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
382 # missing 'no rank' Angusticorn
383 $db_list->add_lineage(-names => [
384 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Anopheles,
385 maculipennis group, maculipennis species complex, Anopheles melanoon"))]);
386 $node = $db_list->get_taxon(-name => 'Anopheles melanoon');
387 is $node->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Angusticorn';
389 @taxonids = $db_list->get_taxonids('Anopheles');
390 is scalar @taxonids, 3;
392 # bug: duplicate topmost taxa
393 $db_list = Bio::DB::Taxonomy->new( -source => 'list',
394 -names => ['Bacteria', 'Tenericutes'] );
395 $db_list->add_lineage( -names => ['Bacteria'] );
396 @taxonids = $db_list->get_taxonids('Bacteria');
397 is scalar @taxonids, 1;
399 # Disambiguate between taxa with same name using -names
400 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' ), 'DB with ambiguous names';
401 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae', 'g__Spongiibacter'] );
402 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae', 'g__Alteromonas' ] );
404 ok @taxonids = $db_list->get_taxonids('f__Alteromonadaceae');
405 is scalar @taxonids, 2; # multiple taxa would match using $db_list->get_taxon(-name => 'f__Alteromonadaceae')
407 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae'] );
408 is $node->ancestor->node_name, 'o__Alteromonadales';
409 my $iid = $node->internal_id;
411 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae'] );
412 is $node->ancestor->node_name, 'o__Oceanospirillales';
413 isnt $node->internal_id, $iid;
416 # More tests with ambiguous names, internal IDs and multiple databases
417 my ($node3, $node4, $db_list_2);
418 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' );
419 ok $db_list->add_lineage( -names => [ 'o__Enterobacteriales', 'g__Escherichia' ] );
420 ok $db_list->add_lineage( -names => [ 'o__Pseudomonadales' , 'g__Pseudomonas' ] );
421 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
422 ok $node1 = $db_list->get_taxon( -names => [ 'k__Chroococcales', 'g__Microcoleus' ] );
424 ok $db_list_2 = Bio::DB::Taxonomy->new( -source => 'list' );
425 ok $db_list_2->add_lineage( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
426 ok $node2 = $db_list_2->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
428 is $node1->scientific_name, 'g__Microcoleus';
429 is $node2->scientific_name, 'g__Microcoleus'; # same taxon name
430 isnt $node1->id, $node2->id; # but different dbs and hence taxids
431 is $node1->internal_id, $node1->internal_id; # but same cross-database internal ID
433 ok $db_list->add_lineage( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
434 ok $db_list->add_lineage( -names => [ 'o__Acidobacteriales', 'g__Microcoleus' ] );
436 ok $node1 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
437 ok $node2 = $db_list->get_taxon( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
438 ok $node3 = $db_list->get_taxon( -names => [ 'o__Acidobacteriales' , 'g__Microcoleus' ] );
439 my @nodes = ($node1, $node2, $node3);
441 is map({$_->id => undef} @nodes), 6; # 3 distinct taxids
442 is map({$_->internal_id => undef} @nodes), 6; # 3 distinct iids
444 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
445 ok $node2 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
446 is $node2->scientific_name, $node1->scientific_name;
447 is $node2->id, $node1->id;
448 is $node2->internal_id, $node1->internal_id;
452 test_skip(-tests => 12, -requires_networking => 1);
454 my $db=Bio::DB::Taxonomy->new(-source=>"entrez");
456 my @taxa = qw(viruses Deltavirus unclassified plasmid);
458 for my $taxon (@taxa) {
459 test_taxid($db, $taxon);
463 my ($db, $taxa) = @_;
464 my @taxonids = $db->get_taxonids($taxa);
465 cmp_ok(scalar(@taxonids), '>', 0, "Got IDs returned for $taxa:".join(',', @taxonids));
467 lives_ok { $taxon = $db->get_taxon(-taxonid => pop @taxonids) } "IDs generates a Bio::Taxonomy::Node";
468 if (defined $taxon) {
469 like( $taxon->scientific_name, qr/$taxa/i, "Name returned matches $taxa");
471 ok(0, "No taxon object returned for $taxa");