1 # -*-Perl-*- Test Harness script for Bioperl
10 test_begin(-tests => 146,
11 -requires_module => 'XML::Twig');
13 use_ok('Bio::DB::Taxonomy');
14 use_ok('Bio::Tree::Tree');
17 my $temp_dir = test_output_dir();
19 # we're actually testing Bio::Taxon and Bio::DB::Taxonomy::* here, not
22 ok my $db_entrez = Bio::DB::Taxonomy->new(-source => 'entrez');
23 isa_ok $db_entrez, 'Bio::DB::Taxonomy::entrez';
24 isa_ok $db_entrez, 'Bio::DB::Taxonomy';
26 ok my $db_flatfile = Bio::DB::Taxonomy->new(
27 -source => 'flatfile',
28 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
29 -namesfile => test_input_file('taxdump','names.dmp'),
31 isa_ok $db_flatfile, 'Bio::DB::Taxonomy::flatfile';
32 isa_ok $db_flatfile, 'Bio::DB::Taxonomy';
34 ok $db_flatfile = Bio::DB::Taxonomy->new(
35 -source => 'flatfile',
36 -directory => $temp_dir,
37 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
38 -namesfile => test_input_file('taxdump','names.dmp'),
43 for my $db ($db_entrez, $db_flatfile) {
45 test_skip(-tests => 46, -requires_networking => 1) if $db eq $db_entrez;
48 if ($db eq $db_entrez) {
49 cmp_ok $db->get_num_taxa, '>', 880_000; # 886,907 as of 08-May-2012
51 is $db->get_num_taxa, 189;
54 eval { $id = $db->get_taxonid('Homo sapiens');};
55 skip "Unable to connect to entrez database; no network or server busy?", 38 if $@;
59 # easy test on human, try out the main Taxon methods
60 ok $n = $db->get_taxon(9606);
62 is $n->object_id, $n->id;
63 is $n->ncbi_taxid, $n->id;
64 is $n->parent_id, 9605;
65 is $n->rank, 'species';
67 is $n->node_name, 'Homo sapiens';
68 is $n->scientific_name, $n->node_name;
69 is ${$n->name('scientific')}[0], $n->node_name;
71 my %common_names = map { $_ => 1 } $n->common_names;
72 is keys %common_names, 3, ref($db).": common names";
73 ok exists $common_names{human};
74 ok exists $common_names{man};
76 is $n->division, 'Primates';
77 is $n->genetic_code, 1;
78 is $n->mitochondrial_genetic_code, 2;
79 # these are entrez-only, data not available in dmp files
80 if ($db eq $db_entrez) {
81 ok defined $n->pub_date;
82 ok defined $n->create_date;
83 ok defined $n->update_date;
86 # briefly test some Bio::Tree::NodeI methods
87 ok my $ancestor = $n->ancestor;
88 is $ancestor->scientific_name, 'Homo';
89 # unless set explicitly, Bio::Taxon doesn't return anything for
90 # each_Descendent; must ask the database directly
91 ok my @children = $ancestor->db_handle->each_Descendent($ancestor);
92 cmp_ok @children, '>', 0;
94 sleep(3) if $db eq $db_entrez;
96 # do some trickier things...
97 ok my $n2 = $db->get_Taxonomy_Node('89593');
98 is $n2->scientific_name, 'Craniata';
100 # briefly check we can use some Tree methods
101 my $tree = Bio::Tree::Tree->new();
102 is $tree->get_lca($n, $n2)->scientific_name, 'Craniata';
105 my @nodes = $tree->get_nodes;
106 is scalar(@nodes), 0;
108 @lineage_nodes = $tree->get_lineage_nodes($n->id); # read ID, only works if nodes have been added to tree
109 is scalar @lineage_nodes, 0;
110 @lineage_nodes = $tree->get_lineage_nodes($n); # node object always works
111 cmp_ok(scalar @lineage_nodes, '>', 20);
114 like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/);
115 like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/);
116 like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/);
118 # can we actually form a Tree and use other Tree methods?
119 ok $tree = Bio::Tree::Tree->new(-node => $n);
120 cmp_ok($tree->number_nodes, '>', 20);
121 cmp_ok(scalar($tree->get_nodes), '>', 20);
122 is $tree->find_node(-rank => 'genus')->scientific_name, 'Homo';
124 # check that getting the ancestor still works now we have explitly set the
125 # ancestor by making a Tree
126 is $n->ancestor->scientific_name, 'Homo';
128 sleep(3) if $db eq $db_entrez;
130 ok $n = $db->get_Taxonomy_Node('1760');
131 is $n->scientific_name, 'Actinobacteria';
133 sleep(3) if $db eq $db_entrez;
135 # entrez isn't as good at searching as flatfile, so we have to special-case
136 my @ids = sort $db->get_taxonids('Chloroflexi');
138 is_deeply \@ids, [200795, 32061];
140 $id = $db->get_taxonids('Chloroflexi (class)');
141 $db eq $db_entrez ? is($id, undef) : is($id, 32061);
143 @ids = $db->get_taxonids('Rhodotorula');
144 cmp_ok @ids, '>=' , 8;
145 @ids = $db->get_taxonids('Rhodotorula <Microbotryomycetidae>');
152 # Test the list database
154 ok my $db_list = Bio::DB::Taxonomy->new(-source => 'list');
155 isa_ok $db_list, 'Bio::DB::Taxonomy::list';
156 isa_ok $db_list, 'Bio::DB::Taxonomy';
158 my @ranks = qw(superkingdom class genus species);
159 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
160 ok $db_list = Bio::DB::Taxonomy->new(
162 -names => \@h_lineage,
165 is $db_list->get_num_taxa, 4;
168 my $tree = $db_list->get_tree('Homo sapiens');
169 isa_ok $tree, 'Bio::Tree::TreeI';
170 is $tree->number_nodes, 4;
172 ok my $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
173 ok my $h_flat = $db_flatfile->get_taxon(-name => 'Homo sapiens');
175 is $h_list->ancestor->scientific_name, 'Homo';
177 my @names = $h_list->common_names;
179 $h_list->common_names('woman');
180 @names = $h_list->common_names;
182 @names = $h_flat->common_names;
185 # you can switch to another database when you need more information, which also
186 # merges information in the node from the two different dbs
187 $h_list->db_handle($db_flatfile);
188 @names = $h_list->common_names;
191 # form a tree with the list lineage first, preventing a subsequent database
192 # change from giving us all those extra ranks
193 $h_list->db_handle($db_list);
194 my $ancestors_ancestor = $h_list->ancestor->ancestor;
195 is $ancestors_ancestor->scientific_name, 'Mammalia';
197 $tree = Bio::Tree::Tree->new(-node => $h_list);
198 $h_list->db_handle($db_flatfile);
199 $ancestors_ancestor = $h_list->ancestor->ancestor;
200 is $ancestors_ancestor->scientific_name, 'Mammalia';
202 # or we can get the flatfile database's idea of the ancestors by removing
203 # ourselves from the tree
204 is $h_flat->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
205 $h_list->ancestor(undef);
206 is $h_list->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
208 # get_lca should work on nodes from different databases
210 test_skip(-tests => 9, -requires_networking => 1);
212 # check that the result is the same as if we are retrieving from the same DB
214 $h_flat = $db_flatfile->get_taxon(-name => 'Homo');
215 my $h_flat2 = $db_flatfile->get_taxon(-name => 'Homo sapiens');
216 ok my $tree_functions = Bio::Tree::Tree->new();
217 is $tree_functions->get_lca($h_flat, $h_flat2)->scientific_name, 'Homo', 'get_lca() within flatfile db';
221 eval { $h_entrez = $db_entrez->get_taxon(-name => 'Homo sapiens');};
222 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
224 eval { $h_entrez2 = $db_entrez->get_taxon(-name => 'Homo');};
225 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
226 ok $tree_functions = Bio::Tree::Tree->new();
227 is $tree_functions->get_lca($h_entrez, $h_entrez2)->scientific_name, 'Homo', 'get_lca() within entrez db';
229 ok $tree_functions = Bio::Tree::Tree->new();
230 # mixing entrez and flatfile
232 local $TODO = 'Mixing databases for get_lca() not working, see bug #3416';
233 is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo', 'get_lca() mixing flatfile and remote db';
235 # even though the species taxa for Homo sapiens from list and flat databases
236 # have the same internal id, get_lca won't work because they have different
237 # roots and descendents
238 $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
239 is $h_list->ancestor->internal_id, $h_flat->internal_id;
240 ok ! $tree_functions->get_lca($h_flat, $h_list);
242 # but we can form a tree with the flat node then remove all the ranks we're
243 # not interested in and try again
244 $tree = Bio::Tree::Tree->new(-node => $h_flat);
245 $tree->splice(-keep_rank => \@ranks);
246 is $tree->get_lca($h_flat, $h_list)->scientific_name, 'Homo';
249 # ideas from taxonomy2tree.PLS that let us make nice tree, using
250 # Bio::Tree::TreeFunctionsI methods; this is a weird and trivial example just
251 # because our test flatfile database only has the full lineage of one species
253 for my $name ('Human', 'Hominidae') {
254 my $ncbi_id = $db_flatfile->get_taxonid($name);
256 my $node = $db_flatfile->get_taxon(-taxonid => $ncbi_id);
259 $tree->merge_lineage($node);
262 ok $tree = Bio::Tree::Tree->new(-node => $node);
266 is $tree->get_nodes, 30;
267 $tree->contract_linear_paths;
268 my $ids = join(",", map { $_->id } $tree->get_nodes);
269 is $ids, '131567,9606';
271 # we can recursively fetch all descendents of a taxon
273 test_skip(-tests => 1, -requires_networking => 1);
274 eval {$db_entrez->get_taxon(10090);};
275 skip "Unable to connect to entrez database; no network or server busy?", 1 if $@;
277 my $lca = $db_entrez->get_taxon(314146);
278 my @descs = $db_entrez->get_all_Descendents($lca);
279 cmp_ok @descs, '>=', 17;
283 $db_list = Bio::DB::Taxonomy->new(-source => 'list',
285 (split(/,\s+/, "cellular organisms, Eukaryota, Fungi/Metazoa group,
286 Metazoa, Eumetazoa, Bilateria, Coelomata, Protostomia, Panarthropoda,
287 Arthropoda, Mandibulata, Pancrustacea, Hexapoda, Insecta, Dicondylia,
288 Pterygota, Neoptera, Endopterygota, Diptera, Nematocera, Culicimorpha,
289 Culicoidea, Culicidae, Anophelinae, Anopheles, Anopheles, Angusticorn,
290 Anopheles, maculipennis group, maculipennis species complex, Anopheles daciae"))]);
292 my @taxonids = $db_list->get_taxonids('Anopheles');
295 # but we should still be able to merge in an incomplete lineage of a sister
296 # species and have the 'tree' remain consistent:
298 # missing 'no rank' Anopheles
299 $db_list->add_lineage(-names => [
300 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Angusticorn,
301 maculipennis group, maculipennis species complex, Anopheles labranchiae"))]);
302 my $node = $db_list->get_taxon(-name => 'Anopheles labranchiae');
303 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
305 # missing 'subgenus' Anopheles
306 $db_list->add_lineage(-names => [
307 (split(/,\s+/, "Anophelinae, Anopheles, Angusticorn, Anopheles,
308 maculipennis group, maculipennis species complex, Anopheles maculipennis"))]);
309 $node = $db_list->get_taxon(-name => 'Anopheles maculipennis');
310 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
312 # missing 'no rank' Angusticorn
313 $db_list->add_lineage(-names => [
314 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Anopheles,
315 maculipennis group, maculipennis species complex, Anopheles melanoon"))]);
316 $node = $db_list->get_taxon(-name => 'Anopheles melanoon');
317 is $node->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Angusticorn';
319 @taxonids = $db_list->get_taxonids('Anopheles');
320 is scalar @taxonids, 3;
322 # bug: duplicate topmost taxa
323 $db_list = Bio::DB::Taxonomy->new( -source => 'list',
324 -names => ['Bacteria', 'Tenericutes'] );
325 $db_list->add_lineage( -names => ['Bacteria'] );
326 @taxonids = $db_list->get_taxonids('Bacteria');
327 is scalar @taxonids, 1;
329 # Disambiguate between taxa with same name using -names
330 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' ), 'DB with ambiguous names';
331 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae', 'g__Spongiibacter'] );
332 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae', 'g__Alteromonas' ] );
334 ok @taxonids = $db_list->get_taxonids('f__Alteromonadaceae');
335 is scalar @taxonids, 2; # multiple taxa would match using $db_list->get_taxon(-name => 'f__Alteromonadaceae')
337 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae'] );
338 is $node->ancestor->node_name, 'o__Alteromonadales';
340 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales' , 'f__Alteromonadaceae'] );
341 is $node->ancestor->node_name, 'o__Oceanospirillales';