1 # -*-Perl-*- Test Harness script for Bioperl
12 -requires_modules => [qw(DB_File
17 use_ok('Bio::DB::Taxonomy');
18 use_ok('Bio::Tree::Tree');
21 my $temp_dir = test_output_dir();
23 # we're actually testing Bio::Taxon and Bio::DB::Taxonomy::* here, not
26 ok my $db_entrez = Bio::DB::Taxonomy->new(-source => 'entrez');
27 isa_ok $db_entrez, 'Bio::DB::Taxonomy::entrez';
28 isa_ok $db_entrez, 'Bio::DB::Taxonomy';
30 ok my $db_flatfile = Bio::DB::Taxonomy->new(
31 -source => 'flatfile',
32 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
33 -namesfile => test_input_file('taxdump', 'names.dmp'),
35 isa_ok $db_flatfile, 'Bio::DB::Taxonomy::flatfile';
36 isa_ok $db_flatfile, 'Bio::DB::Taxonomy';
38 # By not specifying a '-directory' argument, index files go to a temporary
39 # folder ($Bio::Root::IO::TEMPDIR, such as 'C:\Users\USER\AppData\Local\Temp'),
40 # and are implied to be temporary. So test the ability of flatfile->DESTROY to
41 # remove the temporary index files at object destruction (this also affects files
42 # in "test_output_dir()", since the folder is created inside the temporary folder)
43 no warnings qw(once); # silence 'Name "$Bio::Root::IO::TEMPDIR" used only once'
44 is $db_flatfile->{index_directory}, $Bio::Root::IO::TEMPDIR, 'removal of temporary index files: no -directory';
45 $db_flatfile->DESTROY;
46 ok not -e ($db_flatfile->{index_directory} . '/id2names');
47 ok not -e ($db_flatfile->{index_directory} . '/names2id');
48 ok not -e ($db_flatfile->{index_directory} . '/nodes');
49 ok not -e ($db_flatfile->{index_directory} . '/parents');
51 # Test removal of temporary index files from test_output_dir folder
52 # (since test_output_dir() =~ m/^$Bio::Root::IO::TEMPDIR/)
53 ok $db_flatfile = Bio::DB::Taxonomy->new(
54 -source => 'flatfile',
55 -directory => $temp_dir,
56 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
57 -namesfile => test_input_file('taxdump', 'names.dmp'),
60 is $db_flatfile->{index_directory}, $temp_dir, 'removal of temporary index files: test_output_dir()';
61 $db_flatfile->DESTROY;
62 ok not -e ($db_flatfile->{index_directory} . '/id2names');
63 ok not -e ($db_flatfile->{index_directory} . '/names2id');
64 ok not -e ($db_flatfile->{index_directory} . '/nodes');
65 ok not -e ($db_flatfile->{index_directory} . '/parents');
67 # Generate the object (and the files) again for the remaining tests
68 ok $db_flatfile = Bio::DB::Taxonomy->new(
69 -source => 'flatfile',
70 -directory => $temp_dir,
71 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
72 -namesfile => test_input_file('taxdump', 'names.dmp'),
77 for my $db ($db_entrez, $db_flatfile) {
79 test_skip(-tests => 46, -requires_networking => 1) if $db eq $db_entrez;
82 if ($db eq $db_entrez) {
83 cmp_ok $db->get_num_taxa, '>', 880_000; # 886,907 as of 08-May-2012
85 is $db->get_num_taxa, 189;
88 eval { $id = $db->get_taxonid('Homo sapiens');};
89 skip "Unable to connect to entrez database; no network or server busy?", 38 if $@;
93 # easy test on human, try out the main Taxon methods
94 ok $n = $db->get_taxon(9606);
96 is $n->object_id, $n->id;
97 is $n->ncbi_taxid, $n->id;
98 is $n->parent_id, 9605;
99 is $n->rank, 'species';
101 is $n->node_name, 'Homo sapiens';
102 is $n->scientific_name, $n->node_name;
103 is ${$n->name('scientific')}[0], $n->node_name;
105 my %common_names = map { $_ => 1 } $n->common_names;
106 is keys %common_names, 3, ref($db).": common names";
107 ok exists $common_names{human};
108 ok exists $common_names{man};
110 is $n->division, 'Primates';
111 is $n->genetic_code, 1;
112 is $n->mitochondrial_genetic_code, 2;
113 # these are entrez-only, data not available in dmp files
114 if ($db eq $db_entrez) {
115 ok defined $n->pub_date;
116 ok defined $n->create_date;
117 ok defined $n->update_date;
120 # briefly test some Bio::Tree::NodeI methods
121 ok my $ancestor = $n->ancestor;
122 is $ancestor->scientific_name, 'Homo';
123 # unless set explicitly, Bio::Taxon doesn't return anything for
124 # each_Descendent; must ask the database directly
125 ok my @children = $ancestor->db_handle->each_Descendent($ancestor);
126 cmp_ok @children, '>', 0;
128 sleep(3) if $db eq $db_entrez;
130 # do some trickier things...
131 ok my $n2 = $db->get_Taxonomy_Node('89593');
132 is $n2->scientific_name, 'Craniata';
134 # briefly check we can use some Tree methods
135 my $tree = Bio::Tree::Tree->new();
136 is $tree->get_lca($n, $n2)->scientific_name, 'Craniata';
139 my @nodes = $tree->get_nodes;
140 is scalar(@nodes), 0;
142 @lineage_nodes = $tree->get_lineage_nodes($n->id); # read ID, only works if nodes have been added to tree
143 is scalar @lineage_nodes, 0;
144 @lineage_nodes = $tree->get_lineage_nodes($n); # node object always works
145 cmp_ok(scalar @lineage_nodes, '>', 20);
148 like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/);
149 like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/);
150 like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/);
152 # can we actually form a Tree and use other Tree methods?
153 ok $tree = Bio::Tree::Tree->new(-node => $n);
154 cmp_ok($tree->number_nodes, '>', 20);
155 cmp_ok(scalar($tree->get_nodes), '>', 20);
156 is $tree->find_node(-rank => 'genus')->scientific_name, 'Homo';
158 # check that getting the ancestor still works now we have explitly set the
159 # ancestor by making a Tree
160 is $n->ancestor->scientific_name, 'Homo';
162 sleep(3) if $db eq $db_entrez;
164 ok $n = $db->get_Taxonomy_Node('1760');
165 is $n->scientific_name, 'Actinobacteria';
167 sleep(3) if $db eq $db_entrez;
169 # entrez isn't as good at searching as flatfile, so we have to special-case
170 my @ids = sort $db->get_taxonids('Chloroflexi');
172 is_deeply \@ids, [200795, 32061];
174 $id = $db->get_taxonids('Chloroflexi (class)');
175 $db eq $db_entrez ? is($id, undef) : is($id, 32061);
177 @ids = $db->get_taxonids('Rhodotorula');
178 cmp_ok @ids, '>=' , 8;
179 @ids = $db->get_taxonids('Rhodotorula <Microbotryomycetidae>');
186 # Test the list database
188 ok my $db_list = Bio::DB::Taxonomy->new(-source => 'list');
189 isa_ok $db_list, 'Bio::DB::Taxonomy::list';
190 isa_ok $db_list, 'Bio::DB::Taxonomy';
192 my @ranks = qw(superkingdom class genus species);
193 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
194 ok $db_list = Bio::DB::Taxonomy->new(
196 -names => \@h_lineage,
199 is $db_list->get_num_taxa, 4;
202 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
203 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
204 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
206 @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus');
207 $db_list->add_lineage(-names => \@h_lineage, -ranks => \@ranks);
209 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
210 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
211 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
214 ok my $tree = $db_list->get_tree('Homo sapiens', 'Homo erectus');
215 isa_ok $tree, 'Bio::Tree::TreeI';
216 is $tree->number_nodes, 5;
217 is $tree->total_branch_length, 4;
218 ok my $node1 = $tree->find_node( -scientific_name => 'Homo sapiens' );
219 ok my $node2 = $tree->find_node( -scientific_name => 'Homo erectus' );
220 is $tree->distance($node1, $node2), 2;
222 ok my $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
223 ok my $h_flat = $db_flatfile->get_taxon(-name => 'Homo sapiens');
225 is $h_list->ancestor->scientific_name, 'Homo';
227 my @names = $h_list->common_names;
229 $h_list->common_names('woman');
230 @names = $h_list->common_names;
232 @names = $h_flat->common_names;
235 # you can switch to another database when you need more information, which also
236 # merges information in the node from the two different dbs
237 $h_list->db_handle($db_flatfile);
238 @names = $h_list->common_names;
241 # form a tree with the list lineage first, preventing a subsequent database
242 # change from giving us all those extra ranks
243 $h_list->db_handle($db_list);
244 my $ancestors_ancestor = $h_list->ancestor->ancestor;
245 is $ancestors_ancestor->scientific_name, 'Mammalia';
247 $tree = Bio::Tree::Tree->new(-node => $h_list);
248 $h_list->db_handle($db_flatfile);
249 $ancestors_ancestor = $h_list->ancestor->ancestor;
250 is $ancestors_ancestor->scientific_name, 'Mammalia';
252 # or we can get the flatfile database's idea of the ancestors by removing
253 # ourselves from the tree
254 is $h_flat->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
255 $h_list->ancestor(undef);
256 is $h_list->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
258 # get_lca should work on nodes from different databases
260 test_skip(-tests => 9, -requires_networking => 1);
262 # check that the result is the same as if we are retrieving from the same DB
264 $h_flat = $db_flatfile->get_taxon(-name => 'Homo');
265 my $h_flat2 = $db_flatfile->get_taxon(-name => 'Homo sapiens');
266 ok my $tree_functions = Bio::Tree::Tree->new();
267 is $tree_functions->get_lca($h_flat, $h_flat2)->scientific_name, 'Homo', 'get_lca() within flatfile db';
271 eval { $h_entrez = $db_entrez->get_taxon(-name => 'Homo sapiens');};
272 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
274 eval { $h_entrez2 = $db_entrez->get_taxon(-name => 'Homo');};
275 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
276 ok $tree_functions = Bio::Tree::Tree->new();
277 is $tree_functions->get_lca($h_entrez, $h_entrez2)->scientific_name, 'Homo', 'get_lca() within entrez db';
279 ok $tree_functions = Bio::Tree::Tree->new();
280 # mixing entrez and flatfile
282 local $TODO = 'Mixing databases for get_lca() not working, see bug #3416';
283 is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo', 'get_lca() mixing flatfile and remote db';
285 # even though the species taxa for Homo sapiens from list and flat databases
286 # have the same internal id, get_lca won't work because they have different
287 # roots and descendents
288 $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
289 is $h_list->ancestor->internal_id, $h_flat->internal_id;
290 ok ! $tree_functions->get_lca($h_flat, $h_list);
292 # but we can form a tree with the flat node then remove all the ranks we're
293 # not interested in and try again
294 $tree = Bio::Tree::Tree->new(-node => $h_flat);
295 $tree->splice(-keep_rank => \@ranks);
296 is $tree->get_lca($h_flat, $h_list)->scientific_name, 'Homo';
299 # ideas from taxonomy2tree.PLS that let us make nice tree, using
300 # Bio::Tree::TreeFunctionsI methods; this is a weird and trivial example just
301 # because our test flatfile database only has the full lineage of one species
303 for my $name ('Human', 'Hominidae') {
304 my $ncbi_id = $db_flatfile->get_taxonid($name);
306 my $node = $db_flatfile->get_taxon(-taxonid => $ncbi_id);
309 ok $tree->merge_lineage($node);
312 ok $tree = Bio::Tree::Tree->new(-node => $node);
316 is $tree->get_nodes, 30;
317 $tree->contract_linear_paths;
318 my $ids = join(",", map { $_->id } $tree->get_nodes);
319 is $ids, '131567,9606';
321 # More thorough tests of merge_lineage
322 ok my $node = $db_list->get_taxon(-name => 'Eukaryota');
323 $tree = Bio::Tree::Tree->new(-node => $node);
324 ok $node = $db_list->get_taxon(-name => 'Homo erectus');
325 ok $tree->merge_lineage($node);
326 for my $name ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus') {
327 ok $node = $tree->find_node(-scientific_name => $name);
330 # we can recursively fetch all descendents of a taxon
332 test_skip(-tests => 1, -requires_networking => 1);
333 eval {$db_entrez->get_taxon(10090);};
334 skip "Unable to connect to entrez database; no network or server busy?", 1 if $@;
336 my $lca = $db_entrez->get_taxon(314146);
337 my @descs = $db_entrez->get_all_Descendents($lca);
338 cmp_ok @descs, '>=', 17;
342 $db_list = Bio::DB::Taxonomy->new(-source => 'list',
344 (split(/,\s+/, "cellular organisms, Eukaryota, Fungi/Metazoa group,
345 Metazoa, Eumetazoa, Bilateria, Coelomata, Protostomia, Panarthropoda,
346 Arthropoda, Mandibulata, Pancrustacea, Hexapoda, Insecta, Dicondylia,
347 Pterygota, Neoptera, Endopterygota, Diptera, Nematocera, Culicimorpha,
348 Culicoidea, Culicidae, Anophelinae, Anopheles, Anopheles, Angusticorn,
349 Anopheles, maculipennis group, maculipennis species complex, Anopheles daciae"))]);
351 my @taxonids = $db_list->get_taxonids('Anopheles');
352 is @taxonids, 3, 'List context';
354 my $taxonid = $db_list->get_taxonids('Anopheles');
355 isa_ok \$taxonid, 'SCALAR', 'Scalar context';
356 ok exists { map({$_ => undef} @taxonids) }->{$taxonid};
358 # but we should still be able to merge in an incomplete lineage of a sister
359 # species and have the 'tree' remain consistent:
361 # missing 'no rank' Anopheles
362 $db_list->add_lineage(-names => [
363 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Angusticorn,
364 maculipennis group, maculipennis species complex, Anopheles labranchiae"))]);
365 $node = $db_list->get_taxon(-name => 'Anopheles labranchiae');
366 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
367 is $node->rank, undef;
369 # missing 'subgenus' Anopheles
370 $db_list->add_lineage(-names => [
371 (split(/,\s+/, "Anophelinae, Anopheles, Angusticorn, Anopheles,
372 maculipennis group, maculipennis species complex, Anopheles maculipennis"))]);
373 $node = $db_list->get_taxon(-name => 'Anopheles maculipennis');
374 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
376 # missing 'no rank' Angusticorn
377 $db_list->add_lineage(-names => [
378 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Anopheles,
379 maculipennis group, maculipennis species complex, Anopheles melanoon"))]);
380 $node = $db_list->get_taxon(-name => 'Anopheles melanoon');
381 is $node->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Angusticorn';
383 @taxonids = $db_list->get_taxonids('Anopheles');
384 is scalar @taxonids, 3;
386 # bug: duplicate topmost taxa
387 $db_list = Bio::DB::Taxonomy->new( -source => 'list',
388 -names => ['Bacteria', 'Tenericutes'] );
389 $db_list->add_lineage( -names => ['Bacteria'] );
390 @taxonids = $db_list->get_taxonids('Bacteria');
391 is scalar @taxonids, 1;
393 # Disambiguate between taxa with same name using -names
394 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' ), 'DB with ambiguous names';
395 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae', 'g__Spongiibacter'] );
396 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae', 'g__Alteromonas' ] );
398 ok @taxonids = $db_list->get_taxonids('f__Alteromonadaceae');
399 is scalar @taxonids, 2; # multiple taxa would match using $db_list->get_taxon(-name => 'f__Alteromonadaceae')
401 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae'] );
402 is $node->ancestor->node_name, 'o__Alteromonadales';
403 my $iid = $node->internal_id;
405 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae'] );
406 is $node->ancestor->node_name, 'o__Oceanospirillales';
407 isnt $node->internal_id, $iid;
410 # More tests with ambiguous names, internal IDs and multiple databases
411 my ($node3, $node4, $db_list_2);
412 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' );
413 ok $db_list->add_lineage( -names => [ 'o__Enterobacteriales', 'g__Escherichia' ] );
414 ok $db_list->add_lineage( -names => [ 'o__Pseudomonadales' , 'g__Pseudomonas' ] );
415 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
416 ok $node1 = $db_list->get_taxon( -names => [ 'k__Chroococcales', 'g__Microcoleus' ] );
418 ok $db_list_2 = Bio::DB::Taxonomy->new( -source => 'list' );
419 ok $db_list_2->add_lineage( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
420 ok $node2 = $db_list_2->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
422 is $node1->scientific_name, 'g__Microcoleus';
423 is $node2->scientific_name, 'g__Microcoleus'; # same taxon name
424 isnt $node1->id, $node2->id; # but different dbs and hence taxids
425 is $node1->internal_id, $node1->internal_id; # but same cross-database internal ID
427 ok $db_list->add_lineage( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
428 ok $db_list->add_lineage( -names => [ 'o__Acidobacteriales', 'g__Microcoleus' ] );
430 ok $node1 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
431 ok $node2 = $db_list->get_taxon( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
432 ok $node3 = $db_list->get_taxon( -names => [ 'o__Acidobacteriales' , 'g__Microcoleus' ] );
433 my @nodes = ($node1, $node2, $node3);
435 is map({$_->id => undef} @nodes), 6; # 3 distinct taxids
436 is map({$_->internal_id => undef} @nodes), 6; # 3 distinct iids
438 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
439 ok $node2 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
440 is $node2->scientific_name, $node1->scientific_name;
441 is $node2->id, $node1->id;
442 is $node2->internal_id, $node1->internal_id;