2 # BioPerl module for Bio::DB::Taxonomy
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::DB::Taxonomy - Access to a taxonomy database
20 use Bio::DB::Taxonomy;
21 my $db = Bio::DB::Taxonomy->new(-source => 'entrez');
22 # use NCBI Entrez over HTTP
23 my $taxonid = $db->get_taxonid('Homo sapiens');
26 my $taxon = $db->get_taxon(-taxonid => $taxonid);
30 This is a front end module for access to a taxonomy database.
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to
38 the Bioperl mailing list. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 Please direct usage questions or support issues to the mailing list:
47 I<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 reponsive experts will be able look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 of the bugs and their resolution. Bug reports can be submitted via
60 https://github.com/bioperl/bioperl-live/issues
62 =head1 AUTHOR - Jason Stajich
64 Email jason-at-bioperl.org
68 Sendu Bala: bix@sendu.me.uk
72 The rest of the documentation details each of the object methods.
73 Internal methods are usually preceded with a _
77 # Let the code begin...
79 package Bio
::DB
::Taxonomy
;
80 use vars
qw($DefaultSource $TAXON_IIDS);
84 use base qw(Bio::Root::Root);
86 $DefaultSource = 'entrez';
93 Usage : my $obj = Bio::DB::Taxonomy->new(-source => 'entrez');
94 Function: Builds a new Bio::DB::Taxonomy object.
95 Returns : an instance of Bio::DB::Taxonomy
96 Args : -source => which database source 'entrez' (NCBI taxonomy online),
97 'flatfile' (local NCBI taxonomy), 'greengenes' (local
98 GreenGenes taxonomy), 'silva' (local Silva taxonomy), or
99 'list' (Do-It-Yourself taxonomy)
104 my($class,@args) = @_;
106 if( $class =~ /Bio::DB::Taxonomy::(\S+)/ ) {
107 my ($self) = $class->SUPER::new
(@args);
108 $self->_initialize(@args);
112 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
113 my $source = $param{'-source'} || $DefaultSource;
115 $source = "\L$source"; # normalize capitalization to lower case
117 # normalize capitalization
118 return unless( $class->_load_tax_module($source) );
119 return "Bio::DB::Taxonomy::$source"->new(@args);
131 Usage : my $num = $db->get_num_taxa();
132 Function: Get the number of taxa stored in the database.
139 shift->throw_not_implemented();
146 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid);
147 Function: Get a Bio::Taxon object from the database.
148 Returns : Bio::Taxon object
149 Args : just a single value which is the database id, OR named args:
150 -taxonid => taxonomy id (to query by taxonid)
152 -name => string (to query by a taxonomy name: common name,
153 scientific name, etc)
158 shift->throw_not_implemented();
161 *get_Taxonomy_Node
= \
&get_taxon
;
167 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
168 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
169 string. Note that multiple taxonids can match to the same supplied
171 Returns : array of integer ids in list context, one of these in scalar context
172 Args : string representing the taxon's name
177 shift->throw_not_implemented();
180 *get_taxonid
= \
&get_taxonids
;
181 *get_taxaid
= \
&get_taxonids
;
187 Usage : my $tree = $db->get_tree(@species_names);
188 Function: Generate a tree comprised of the full lineages of all the supplied
189 species names. The nodes for the requested species are given
190 name('supplied') values corresponding to the supplied name, such that
191 they can be identified if the real species name in the database
192 (stored under node_name()) is different. The nodes are also given an
193 arbitrary branch length of 1.
194 Returns : Bio::Tree::Tree
195 Args : A list of species names (strings) to include in the tree.
200 my ($self, @species_names) = @_;
202 # the full lineages of the species are merged into a single tree
204 for my $name (@species_names) {
205 my @ids = $self->get_taxonids($name);
206 if (not scalar @ids) {
207 $self->throw("Could not find species $name in the taxonomy");
210 my $node = $self->get_taxon(-taxonid
=> $id);
211 $node->name('supplied', $name);
213 $tree->merge_lineage($node);
215 $tree = Bio
::Tree
::Tree
->new(-verbose
=> $self->verbose, -node
=> $node);
220 # add arbitrary branch length
221 for my $node ($tree->get_nodes) {
222 $node->branch_length(1);
232 Usage : my $ancestor_taxon = $db->ancestor($taxon);
233 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
236 Args : Bio::Taxon (that was retrieved from this database)
241 shift->throw_not_implemented();
245 =head2 each_Descendent
247 Title : each_Descendent
248 Usage : my @taxa = $db->each_Descendent($taxon);
249 Function: Get all the descendents of the supplied Taxon (but not their
250 descendents, ie. not a recursive fetchall).
251 Returns : Array of Bio::Taxon objects
252 Args : Bio::Taxon (that was retrieved from this database)
256 sub each_Descendent
{
257 shift->throw_not_implemented();
261 =head2 get_all_Descendents
263 Title : get_all_Descendents
264 Usage : my @taxa = $db->get_all_Descendents($taxon);
265 Function: Like each_Descendent(), but do a recursive fetchall
266 Returns : Array of Bio::Taxon objects
267 Args : Bio::Taxon (that was retrieved from this database)
271 sub get_all_Descendents
{
272 my ($self, $taxon) = @_;
274 foreach my $desc_taxon ($self->each_Descendent($taxon)) {
275 push @taxa, ($desc_taxon, $self->get_all_Descendents($desc_taxon));
281 =head2 _load_tax_module
283 Title : _load_tax_module
284 Usage : *INTERNAL Bio::DB::Taxonomy stuff*
285 Function: Loads up (like use) a module at run time on demand
289 sub _load_tax_module
{
290 my ($self, $source) = @_;
291 my $module = "Bio::DB::Taxonomy::" . $source;
294 eval { $ok = $self->_load_module($module) };
298 $self: $source cannot be found
300 For more information about the Bio::DB::Taxonomy system please see
301 the Bio::DB::Taxonomy docs. This includes ways of checking for
302 formats at compile time, not run time.
310 =head2 _handle_internal_id
312 Title : _handle_internal_id
313 Usage : *INTERNAL Bio::DB::Taxonomy stuff*
314 Function: Add an internal ID to a taxon object, ensuring that the taxon gets
315 the same internal ID, regardless of which database it is retrieved
317 Returns : The assigned internal ID
318 Args : * A Bio::Taxon
319 * An optional boolean to decide whether or not to try and do the job
320 using scientific name & rank in addition to taxon ID. This is
321 useful if your IDs are not comparable to that of other databases,
322 e.g. if they are arbitrary, as in the case of Bio::DB::Taxonomy::list.
323 CAVEAT: will handle ambiguous names within a database fine, but not
324 across multiple databases.
328 sub _handle_internal_id
{
329 my ($self, $taxon, $try_name) = @_;
330 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
332 my $taxid = $taxon->id || return;
333 my $name = $taxon->scientific_name || '';
334 my $rank = $taxon->rank || 'no rank';
335 my $dbh = $try_name ?
$taxon->db_handle : 'any';
337 my $iid = $TAXON_IIDS->{taxids
}->{$dbh}->{$taxid};
338 if ( (not defined $iid) && $try_name && $name && exists $TAXON_IIDS->{names
}->{$name}) {
339 # Search for a suitable IID based on species name and ranks
340 my %test_ranks = map {$_ => undef} ($rank, 'no rank');
341 SEARCH
: while (my ($test_rank, undef) = each %test_ranks) {
342 # Search at the specified rank first, then with 'no rank'
343 while ( my ($test_iid, $test_info) = each %{$TAXON_IIDS->{names
}->{$name}->{$rank}} ) {
344 while (my ($test_db, $test_taxid) = each %$test_info) {
345 if ( ($test_db eq $dbh) && not($test_taxid eq $taxid) ) {
346 # Taxa are different (same database, different taxid)
349 # IID is acceptable since taxa are from different databases,
350 # or from the same database but have the same taxid
352 $TAXON_IIDS->{taxids
}->{$dbh}->{$taxid} = $iid;
360 # Assign Bio::DB::Taxonomy IID with risky Bio::Tree::Node internal method
361 $taxon->_creation_id($iid);
363 # Register new IID in Bio::DB::Taxonomy
364 $iid = $taxon->internal_id;
365 $TAXON_IIDS->{taxids
}->{$dbh}->{$taxid} = $iid;
367 $TAXON_IIDS->{names
}->{$name}->{$rank}->{$iid}->{$taxon->db_handle} = $taxid