tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / Taxonomy.pm
blobb7d1336efc4a6d0577244daaa9b3ae3c0c70652a
1 # $Id$
3 # BioPerl module for Bio::DB::Taxonomy
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-at-bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::DB::Taxonomy - Access to a taxonomy database
19 =head1 SYNOPSIS
21 use Bio::DB::Taxonomy;
22 my $db = Bio::DB::Taxonomy->new(-source => 'entrez');
23 # use NCBI Entrez over HTTP
24 my $taxonid = $db->get_taxonid('Homo sapiens');
26 # get a taxon
27 my $taxon = $db->get_taxon(-taxonid => $taxonid);
29 =head1 DESCRIPTION
31 This is a front end module for access to a taxonomy database.
33 =head1 FEEDBACK
35 =head2 Mailing Lists
37 User feedback is an integral part of the evolution of this and other
38 Bioperl modules. Send your comments and suggestions preferably to
39 the Bioperl mailing list. Your participation is much appreciated.
41 bioperl-l@bioperl.org - General discussion
42 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
44 =head2 Support
46 Please direct usage questions or support issues to the mailing list:
48 I<bioperl-l@bioperl.org>
50 rather than to the module maintainer directly. Many experienced and
51 reponsive experts will be able look at the problem and quickly
52 address it. Please include a thorough description of the problem
53 with code and data examples if at all possible.
55 =head2 Reporting Bugs
57 Report bugs to the Bioperl bug tracking system to help us keep track
58 of the bugs and their resolution. Bug reports can be submitted via
59 the web:
61 http://bugzilla.open-bio.org/
63 =head1 AUTHOR - Jason Stajich
65 Email jason-at-bioperl.org
67 =head1 CONTRIBUTORS
69 Sendu Bala: bix@sendu.me.uk
71 =head1 APPENDIX
73 The rest of the documentation details each of the object methods.
74 Internal methods are usually preceded with a _
76 =cut
78 # Let the code begin...
80 package Bio::DB::Taxonomy;
81 use vars qw($DefaultSource $TAXON_IIDS);
82 use strict;
83 use Bio::Tree::Tree;
85 use base qw(Bio::Root::Root);
87 $DefaultSource = 'entrez';
88 $TAXON_IIDS = {};
90 =head2 new
92 Title : new
93 Usage : my $obj = Bio::DB::Taxonomy->new(-source => 'entrez');
94 Function: Builds a new Bio::DB::Taxonomy object.
95 Returns : an instance of Bio::DB::Taxonomy
96 Args : -source => which database source 'entrez' or 'flatfile' or 'list'
98 =cut
100 sub new {
101 my($class,@args) = @_;
103 if( $class =~ /Bio::DB::Taxonomy::(\S+)/ ) {
104 my ($self) = $class->SUPER::new(@args);
105 $self->_initialize(@args);
106 return $self;
107 } else {
108 my %param = @args;
109 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
110 my $source = $param{'-source'} || $DefaultSource;
112 $source = "\L$source"; # normalize capitalization to lower case
114 # normalize capitalization
115 return unless( $class->_load_tax_module($source) );
116 return "Bio::DB::Taxonomy::$source"->new(@args);
120 # empty for now
121 sub _initialize { }
123 =head2 get_taxon
125 Title : get_taxon
126 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
127 Function: Get a Bio::Taxon object from the database.
128 Returns : Bio::Taxon object
129 Args : just a single value which is the database id, OR named args:
130 -taxonid => taxonomy id (to query by taxonid)
132 -name => string (to query by a taxonomy name: common name,
133 scientific name, etc)
135 =cut
137 sub get_taxon {
138 shift->throw_not_implemented();
141 *get_Taxonomy_Node = \&get_taxon;
143 =head2 get_taxonids
145 Title : get_taxonids
146 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
147 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
148 string. Note that multiple taxonids can match to the same supplied
149 name.
150 Returns : array of integer ids in list context, one of these in scalar context
151 Args : string representing taxon's name
153 =cut
155 sub get_taxonids {
156 shift->throw_not_implemented();
159 *get_taxonid = \&get_taxonids;
160 *get_taxaid = \&get_taxonids;
162 =head2 get_tree
164 Title : get_tree
165 Usage : my $tree = $db->get_tree(@species_names)
166 Function: Generate a tree comprised of the full lineages of all the supplied
167 species names. The nodes for the requested species are given
168 name('supplied') values corresponding to the supplied name, such that
169 they can be identified if the real species name in the database
170 (stored under node_name()) is different.
171 Returns : Bio::Tree::Tree
172 Args : a list of species names (strings)
174 =cut
176 sub get_tree {
177 my ($self, @species_names) = @_;
179 # the full lineages of the species are merged into a single tree
180 my $tree;
181 foreach my $name (@species_names) {
182 my $ncbi_id = $self->get_taxonid($name);
183 if ($ncbi_id) {
184 my $node = $self->get_taxon(-taxonid => $ncbi_id);
185 $node->name('supplied', $name);
187 if ($tree) {
188 $tree->merge_lineage($node);
190 else {
191 $tree = Bio::Tree::Tree->new(-verbose => $self->verbose, -node => $node);
194 else {
195 $self->throw("No taxonomy database node for species ".$name);
199 return $tree;
202 =head2 ancestor
204 Title : ancestor
205 Usage : my $ancestor_taxon = $db->ancestor($taxon)
206 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
207 database.
208 Returns : Bio::Taxon
209 Args : Bio::Taxon (that was retrieved from this database)
211 =cut
213 sub ancestor {
214 shift->throw_not_implemented();
217 =head2 each_Descendent
219 Title : each_Descendent
220 Usage : my @taxa = $db->each_Descendent($taxon);
221 Function: Get all the descendents of the supplied Taxon (but not their
222 descendents, ie. not a recursive fetchall).
223 Returns : Array of Bio::Taxon objects
224 Args : Bio::Taxon (that was retrieved from this database)
226 =cut
228 sub each_Descendent {
229 shift->throw_not_implemented();
232 =head2 get_all_Descendents
234 Title : get_all_Descendents
235 Usage : my @taxa = $db->get_all_Descendents($taxon);
236 Function: Like each_Descendent(), but do a recursive fetchall
237 Returns : Array of Bio::Taxon objects
238 Args : Bio::Taxon (that was retrieved from this database)
240 =cut
242 sub get_all_Descendents {
243 my ($self, $taxon) = @_;
244 my @taxa;
245 foreach my $desc_taxon ($self->each_Descendent($taxon)) {
246 push @taxa, ($desc_taxon, $self->get_all_Descendents($desc_taxon));
248 return @taxa;
251 =head2 _load_tax_module
253 Title : _load_tax_module
254 Usage : *INTERNAL Bio::DB::Taxonomy stuff*
255 Function: Loads up (like use) a module at run time on demand
257 =cut
259 sub _load_tax_module {
260 my ($self, $source) = @_;
261 my $module = "Bio::DB::Taxonomy::" . $source;
262 my $ok;
264 eval { $ok = $self->_load_module($module) };
265 if ( $@ ) {
266 print STDERR $@;
267 print STDERR <<END;
268 $self: $source cannot be found
269 Exception $@
270 For more information about the Bio::DB::Taxonomy system please see
271 the Bio::DB::Taxonomy docs. This includes ways of checking for
272 formats at compile time, not run time.
276 return $ok;
279 =head2 _handle_internal_id
281 Title : _handle_internal_id
282 Usage : *INTERNAL Bio::DB::Taxonomy stuff*
283 Function: Tries to ensure that when a taxon is requested from any database,
284 the Taxon object returned will have the same internal id regardless
285 of database.
286 Args : Bio::Taxon, and optionally true value to try and do the job using
287 scientific name & rank if your ids aren't comparable to other dbs.
289 =cut
291 sub _handle_internal_id {
292 my ($self, $taxon, $try_name) = @_;
293 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
294 my $taxid = $taxon->id || return;
295 my $sci_name = $taxon->scientific_name || '';
296 my $rank = $taxon->rank || 'no rank';
298 if ($try_name && $sci_name && defined $TAXON_IIDS->{names}->{$sci_name}) {
299 if (defined $TAXON_IIDS->{names}->{$sci_name}->{$rank}) {
300 $TAXON_IIDS->{taxids}->{$taxid} = $TAXON_IIDS->{names}->{$sci_name}->{$rank};
302 elsif ($rank eq 'no rank') {
303 # pick the internal id of one named rank taxa at random
304 my ($iid) = values %{$TAXON_IIDS->{names}->{$sci_name}};
305 $TAXON_IIDS->{taxids}->{$taxid} = $iid;
309 if (defined $TAXON_IIDS->{taxids}->{$taxid}) {
310 # a little dangerous to use this internal method of Bio::Tree::Node;
311 # but it is how internal_id() is set
312 $taxon->_creation_id($TAXON_IIDS->{taxids}->{$taxid});
314 else {
315 $TAXON_IIDS->{taxids}->{$taxid} = $taxon->internal_id;
316 $TAXON_IIDS->{names}->{$sci_name}->{$rank} = $taxon->internal_id if $sci_name;