add some significant milestones
[bioperl-live.git] / Bio / Taxon.pm
blob42c3582e57eccebc4185783cf432dcd569a8bd9b
1 # $Id$
3 # BioPerl module for Bio::Taxon
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sendu Bala <bix@sendu.me.uk>
9 # Copyright Sendu Bala, based heavily on a module by Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Taxon - A node in a represented taxonomy
19 =head1 SYNOPSIS
21 use Bio::Taxon;
23 # Typically you will get a Taxon from a Bio::DB::Taxonomy object
24 # but here is how you initialize one
25 my $taxon = Bio::Taxon->new(-name => $name,
26 -id => $id,
27 -rank => $rank,
28 -division => $div);
30 # Get one from a database
31 my $dbh = Bio::DB::Taxonomy->new(-source => 'flatfile',
32 -directory=> '/tmp',
33 -nodesfile=> '/path/to/nodes.dmp',
34 -namesfile=> '/path/to/names.dmp');
35 my $human = $dbh->get_taxon(-name => 'Homo sapiens');
36 $human = $dbh->get_taxon(-taxonid => '9606');
38 print "id is ", $human->id, "\n"; # 9606
39 print "rank is ", $human->rank, "\n"; # species
40 print "scientific name is ", $human->scientific_name, "\n"; # Homo sapiens
41 print "division is ", $human->division, "\n"; # Primates
43 my $mouse = $dbh->get_taxon(-name => 'Mus musculus');
45 # You can quickly make your own lineages with the list database
46 my @ranks = qw(superkingdom class genus species);
47 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
48 my $list_dbh = Bio::DB::Taxonomy->new(-source => 'list', -names => \@h_lineage,
49 -ranks => \@ranks);
50 $human = $list_dbh->get_taxon(-name => 'Homo sapiens');
51 my @names = $human->common_names; # @names is empty
52 $human->common_names('woman');
53 @names = $human->common_names; # @names contains woman
55 # You can switch to another database when you need more information
56 my $entrez_dbh = Bio::DB::Taxonomy->new(-source => 'entrez');
57 $human->db_handle($entrez_dbh);
58 @names = $human->common_names; # @names contains woman, human, man
60 # Since Bio::Taxon implements Bio::Tree::NodeI, we have access to those
61 # methods (and can manually create our own taxa and taxonomy without the use
62 # of any database)
63 my $homo = $human->ancestor;
65 # Though be careful with each_Descendent - unless you add_Descendent()
66 # yourself, you won't get an answer because unlike for ancestor(), Bio::Taxon
67 # does not ask the database for the answer. You can ask the database yourself
68 # using the same method:
69 ($human) = $homo->db_handle->each_Descendent($homo);
71 # We can also take advantage of Bio::Tree::Tree* methods:
72 # a) some methods are available with just an empty tree object
73 use Bio::Tree::Tree;
74 my $tree_functions = Bio::Tree::Tree->new();
75 my @lineage = $tree_functions->get_lineage_nodes($human);
76 my $lca = $tree_functions->get_lca($human, $mouse);
78 # b) for other methods, create a tree using your Taxon object
79 my $tree = Bio::Tree::Tree->new(-node => $human);
80 my @taxa = $tree->get_nodes;
81 $homo = $tree->find_node(-rank => 'genus');
83 # Normally you can't get the lca of a list-database derived Taxon and an
84 # entrez or flatfile-derived one because the two different databases might
85 # have different roots and different numbers of ranks between the root and the
86 # taxa of interest. To solve this, make a tree of the Taxon with the more
87 # detailed lineage and splice out all the taxa that won't be in the lineage of
88 # your other Taxon:
89 my $entrez_mouse = $entrez_dbh->get_taxon(-name => 'Mus musculus');
90 my $list_human = $list_dbh->get_taxon(-name => 'Homo sapiens');
91 my $mouse_tree = Bio::Tree::Tree->new(-node => $entrez_mouse);
92 $mouse_tree->splice(-keep_rank => \@ranks);
93 $lca = $mouse_tree->get_lca($entrez_mouse, $list_human);
95 =head1 DESCRIPTION
97 This is the next generation (for Bioperl) of representing Taxonomy
98 information. Previously all information was managed by a single
99 object called Bio::Species. This new implementation allows
100 representation of the intermediate nodes not just the species nodes
101 and can relate their connections.
103 =head1 FEEDBACK
105 =head2 Mailing Lists
107 User feedback is an integral part of the evolution of this and other
108 Bioperl modules. Send your comments and suggestions preferably to
109 the Bioperl mailing list. Your participation is much appreciated.
111 bioperl-l@bioperl.org - General discussion
112 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
114 =head2 Support
116 Please direct usage questions or support issues to the mailing list:
118 I<bioperl-l@bioperl.org>
120 rather than to the module maintainer directly. Many experienced and
121 reponsive experts will be able look at the problem and quickly
122 address it. Please include a thorough description of the problem
123 with code and data examples if at all possible.
125 =head2 Reporting Bugs
127 Report bugs to the Bioperl bug tracking system to help us keep track
128 of the bugs and their resolution. Bug reports can be submitted via
129 the web:
131 http://bugzilla.open-bio.org/
133 =head1 AUTHOR - Sendu Bala
135 Email bix@sendu.me.uk
137 =head1 CONTRIBUTORS
139 Jason Stajich, jason-at-bioperl-dot-org (original Bio::Taxonomy::Node)
140 Juguang Xiao, juguang@tll.org.sg
141 Gabriel Valiente, valiente@lsi.upc.edu
143 =head1 APPENDIX
145 The rest of the documentation details each of the object methods.
146 Internal methods are usually preceded with a _
148 =cut
150 package Bio::Taxon;
151 use strict;
152 use Scalar::Util qw(blessed);
154 use Bio::DB::Taxonomy;
156 use base qw(Bio::Tree::Node Bio::IdentifiableI);
158 =head2 new
160 Title : new
161 Usage : my $obj = Bio::Taxonomy::Node->new();
162 Function: Builds a new Bio::Taxonomy::Node object
163 Returns : an instance of Bio::Taxonomy::Node
164 Args : -dbh => a reference to a Bio::DB::Taxonomy object
165 [no default]
166 -name => a string representing the taxon name
167 (scientific name)
168 -id => human readable id - typically NCBI taxid
169 -ncbi_taxid => same as -id, but explicitely say that it is an
170 NCBI taxid
171 -rank => node rank (one of 'species', 'genus', etc)
172 -common_names => array ref of all common names
173 -division => 'Primates', 'Rodents', etc
174 -genetic_code => genetic code table number
175 -mito_genetic_code => mitochondrial genetic code table number
176 -create_date => date created in database
177 -update_date => date last updated in database
178 -pub_date => date published in database
180 =cut
182 sub new {
183 my ($class, @args) = @_;
184 my $self = $class->SUPER::new(@args);
185 my ($name, $id, $objid, $rank, $div, $dbh, $ncbitaxid, $commonname,
186 $commonnames, $gcode, $mitocode, $createdate, $updatedate, $pubdate,
187 $parent_id) = $self->_rearrange([qw(NAME ID OBJECT_ID RANK DIVISION DBH
188 NCBI_TAXID COMMON_NAME COMMON_NAMES
189 GENETIC_CODE MITO_GENETIC_CODE
190 CREATE_DATE UPDATE_DATE PUB_DATE
191 PARENT_ID)], @args);
193 if (defined $id && (defined $ncbitaxid && $ncbitaxid ne $id || defined $objid && $objid ne $id)) {
194 $self->warn("Only provide one of -id, -object_id or -ncbi_taxid, using $id\n");
196 elsif(!defined $id) {
197 $id = $objid || $ncbitaxid;
199 defined $id && $self->id($id);
200 $self->{_ncbi_tax_id_provided} = 1 if $ncbitaxid;
202 defined $rank && $self->rank($rank);
203 defined $name && $self->node_name($name);
205 my @common_names;
206 if ($commonnames) {
207 $self->throw("-common_names takes only an array reference") unless $commonnames
208 && ref($commonnames) eq 'ARRAY';
209 @common_names = @{$commonnames};
211 if ($commonname) {
212 my %c_names = map { $_ => 1 } @common_names;
213 unless (exists $c_names{$commonname}) {
214 unshift(@common_names, $commonname);
217 @common_names > 0 && $self->common_names(@common_names);
219 defined $gcode && $self->genetic_code($gcode);
220 defined $mitocode && $self->mitochondrial_genetic_code($mitocode);
221 defined $createdate && $self->create_date($createdate);
222 defined $updatedate && $self->update_date($updatedate);
223 defined $pubdate && $self->pub_date($pubdate);
224 defined $div && $self->division($div);
225 defined $dbh && $self->db_handle($dbh);
227 # deprecated and will issue a warning when method called,
228 # eventually to be removed completely as option
229 defined $parent_id && $self->parent_id($parent_id);
231 # some things want to freeze/thaw Bio::Species objects, but
232 # _root_cleanup_methods contains a CODE ref, delete it.
233 delete $self->{_root_cleanup_methods};
235 return $self;
238 =head1 Bio::IdentifiableI interface
240 Also see L<Bio::IdentifiableI>
242 =head2 version
244 Title : version
245 Usage : $taxon->version($newval)
246 Returns : value of version (a scalar)
247 Args : on set, new value (a scalar or undef, optional)
249 =cut
251 sub version {
252 my $self = shift;
253 return $self->{'version'} = shift if @_;
254 return $self->{'version'};
257 =head2 authority
259 Title : authority
260 Usage : $taxon->authority($newval)
261 Returns : value of authority (a scalar)
262 Args : on set, new value (a scalar or undef, optional)
264 =cut
266 sub authority {
267 my $self = shift;
268 return $self->{'authority'} = shift if @_;
269 return $self->{'authority'};
272 =head2 namespace
274 Title : namespace
275 Usage : $taxon->namespace($newval)
276 Returns : value of namespace (a scalar)
277 Args : on set, new value (a scalar or undef, optional)
279 =cut
281 sub namespace {
282 my $self = shift;
283 return $self->{'namespace'} = shift if @_;
284 return $self->{'namespace'};
287 =head1 Bio::Taxonomy::Node implementation
289 =head2 db_handle
291 Title : db_handle
292 Usage : $taxon->db_handle($newval)
293 Function: Get/Set Bio::DB::Taxonomy Handle
294 Returns : value of db_handle (a scalar) (Bio::DB::Taxonomy object)
295 Args : on set, new value (a scalar, optional) Bio::DB::Taxonomy object
297 Also see L<Bio::DB::Taxonomy>
299 =cut
301 sub db_handle {
302 my $self = shift;
303 if (@_) {
304 my $db = shift;
306 if (! ref($db) || ! $db->isa('Bio::DB::Taxonomy')) {
307 $self->throw("Must provide a valid Bio::DB::Taxonomy object to db_handle()");
309 if (!$self->{'db_handle'} || ($self->{'db_handle'} && $self->{'db_handle'} ne $db)) {
310 my $new_self = $self->_get_similar_taxon_from_db($self, $db);
311 $self->_merge_taxa($new_self) if $new_self;
314 # NB: The Bio::DB::Taxonomy modules access this data member directly
315 # to avoid calling this method and going infinite
316 $self->{'db_handle'} = $db;
318 return $self->{'db_handle'};
321 =head2 rank
323 Title : rank
324 Usage : $taxon->rank($newval)
325 Function: Get/set rank of this Taxon, 'species', 'genus', 'order', etc...
326 Returns : value of rank (a scalar)
327 Args : on set, new value (a scalar or undef, optional)
329 =cut
331 sub rank {
332 my $self = shift;
333 return $self->{'rank'} = shift if @_;
334 return $self->{'rank'};
337 =head2 id
339 Title : id
340 Usage : $taxon->id($newval)
341 Function: Get/Set id (NCBI Taxonomy ID in most cases); object_id() and
342 ncbi_taxid() are synonyms of this method.
343 Returns : id (a scalar)
344 Args : none to get, OR scalar to set
346 =cut
348 sub id {
349 my $self = shift;
350 return $self->SUPER::id(@_);
353 *object_id = \&id;
355 =head2 ncbi_taxid
357 Title : ncbi_taxid
358 Usage : $taxon->ncbi_taxid($newval)
359 Function: Get/Set the NCBI Taxonomy ID; This actually sets the id() but only
360 returns an id when ncbi_taxid has been explictely set with this
361 method.
362 Returns : id (a scalar)
363 Args : none to get, OR scalar to set
365 =cut
367 sub ncbi_taxid {
368 my ($self, $id) = @_;
370 if ($id) {
371 $self->{_ncbi_tax_id_provided} = 1;
372 return $self->SUPER::id($id);
375 if ($self->{_ncbi_tax_id_provided}) {
376 return $self->SUPER::id;
378 return;
381 =head2 parent_id
383 Title : parent_id
384 Usage : $taxon->parent_id()
385 Function: Get parent ID, (NCBI Taxonomy ID in most cases);
386 parent_taxon_id() is a synonym of this method.
387 Returns : value of parent_id (a scalar)
388 Args : none
389 Status : deprecated
391 =cut
393 sub parent_id {
394 my $self = shift;
395 if (@_) {
396 $self->warn("You can no longer set the parent_id - use ancestor() instead");
398 my $ancestor = $self->ancestor() || return;
399 return $ancestor->id;
402 *parent_taxon_id = \&parent_id;
404 =head2 genetic_code
406 Title : genetic_code
407 Usage : $taxon->genetic_code($newval)
408 Function: Get/set genetic code table
409 Returns : value of genetic_code (a scalar)
410 Args : on set, new value (a scalar or undef, optional)
412 =cut
414 sub genetic_code {
415 my $self = shift;
416 return $self->{'genetic_code'} = shift if @_;
417 return $self->{'genetic_code'};
420 =head2 mitochondrial_genetic_code
422 Title : mitochondrial_genetic_code
423 Usage : $taxon->mitochondrial_genetic_code($newval)
424 Function: Get/set mitochondrial genetic code table
425 Returns : value of mitochondrial_genetic_code (a scalar)
426 Args : on set, new value (a scalar or undef, optional)
428 =cut
430 sub mitochondrial_genetic_code {
431 my $self = shift;
432 return $self->{'mitochondrial_genetic_code'} = shift if @_;
433 return $self->{'mitochondrial_genetic_code'};
436 =head2 create_date
438 Title : create_date
439 Usage : $taxon->create_date($newval)
440 Function: Get/Set Date this node was created (in the database)
441 Returns : value of create_date (a scalar)
442 Args : on set, new value (a scalar or undef, optional)
444 =cut
446 sub create_date {
447 my $self = shift;
448 return $self->{'create_date'} = shift if @_;
449 return $self->{'create_date'};
452 =head2 update_date
454 Title : update_date
455 Usage : $taxon->update_date($newval)
456 Function: Get/Set Date this node was updated (in the database)
457 Returns : value of update_date (a scalar)
458 Args : on set, new value (a scalar or undef, optional)
460 =cut
462 sub update_date {
463 my $self = shift;
464 return $self->{'update_date'} = shift if @_;
465 return $self->{'update_date'};
468 =head2 pub_date
470 Title : pub_date
471 Usage : $taxon->pub_date($newval)
472 Function: Get/Set Date this node was published (in the database)
473 Returns : value of pub_date (a scalar)
474 Args : on set, new value (a scalar or undef, optional)
476 =cut
478 sub pub_date {
479 my $self = shift;
480 return $self->{'pub_date'} = shift if @_;
481 return $self->{'pub_date'};
484 =head2 ancestor
486 Title : ancestor
487 Usage : my $ancestor_taxon = $taxon->ancestor()
488 Function: Retrieve the ancestor taxon. Normally the database is asked what the
489 ancestor is.
491 If you manually set the ancestor (or you make a Bio::Tree::Tree with
492 this object as an argument to new()), the database (if any) will not
493 be used for the purposes of this method.
495 To restore normal database behaviour, call ancestor(undef) (which
496 would remove this object from the tree), or request this taxon again
497 as a new Taxon object from the database.
499 Returns : Bio::Taxon
500 Args : none
502 =cut
504 sub ancestor {
505 my $self = shift;
506 my $ancestor = $self->SUPER::ancestor(@_);
507 if ($ancestor) {
508 return $ancestor;
510 my $dbh = $self->db_handle;
511 #*** could avoid the db lookup if we knew our current id was definitely
512 # information from the db...
513 my $definitely_from_dbh = $self->_get_similar_taxon_from_db($self);
514 return $dbh->ancestor($definitely_from_dbh);
517 =head2 get_Parent_Node
519 Title : get_Parent_Node
520 Function: Synonym of ancestor()
521 Status : deprecated
523 =cut
525 sub get_Parent_Node {
526 my $self = shift;
527 $self->warn("get_Parent_Node is deprecated, use ancestor() instead");
528 return $self->ancestor(@_);
531 =head2 each_Descendent
533 Title : each_Descendent
534 Usage : my @taxa = $taxon->each_Descendent();
535 Function: Get all the descendents for this Taxon (but not their descendents,
536 ie. not a recursive fetchall). get_Children_Nodes() is a synonym of
537 this method.
539 Note that this method never asks the database for the descendents;
540 it will only return objects you have manually set with
541 add_Descendent(), or where this was done for you by making a
542 Bio::Tree::Tree with this object as an argument to new().
544 To get the database descendents use
545 $taxon->db_handle->each_Descendent($taxon).
547 Returns : Array of Bio::Taxon objects
548 Args : optionally, when you have set your own descendents, the string
549 "height", "creation", "alpha", "revalpha", or coderef to be used to
550 sort the order of children nodes.
552 =cut
554 # implemented by Bio::Tree::Node
556 =head2 get_Children_Nodes
558 Title : get_Children_Nodes
559 Function: Synonym of each_Descendent()
560 Status : deprecated
562 =cut
564 sub get_Children_Nodes {
565 my $self = shift;
566 $self->warn("get_Children_Nodes is deprecated, use each_Descendent() instead");
567 return $self->each_Descendent(@_);
570 =head2 name
572 Title: name
573 Usage: $taxon->name('scientific', 'Homo sapiens');
574 $taxon->name('common', 'human', 'man');
575 my @names = @{$taxon->name('common')};
576 Function: Get/set the names. node_name(), scientific_name() and common_names()
577 are shorthands to name('scientific'), name('scientific') and
578 name('common') respectively.
579 Returns: names (a array reference)
580 Args: Arg1 => the name_class. You can assign any text, but the words
581 'scientific' and 'common' have the special meaning, as
582 scientific name and common name, respectively. 'scientific' and
583 'division' are treated specially, allowing only the first value
584 in the Arg2 list to be set.
585 Arg2 .. => list of names
587 =cut
589 sub name {
590 my ($self, $name_class, @names) = @_;
591 $self->throw('No name class specified') unless defined $name_class;
593 if (@names) {
594 if ($name_class =~ /scientific|division/i) {
595 delete $self->{'_names_hash'}->{$name_class};
596 @names = (shift(@names));
598 push @{$self->{'_names_hash'}->{$name_class}}, @names;
600 return $self->{'_names_hash'}->{$name_class} || return;
603 =head2 node_name
605 Title : node_name
606 Usage : $taxon->node_name($newval)
607 Function: Get/set the name of this taxon (node), typically the scientific name
608 of the taxon, eg. 'Primate' or 'Homo'; scientific_name() is a synonym
609 of this method.
610 Returns : value of node_name (a scalar)
611 Args : on set, new value (a scalar or undef, optional)
613 =cut
615 sub node_name {
616 my $self = shift;
617 my @v = @{$self->name('scientific', @_) || []};
618 return pop @v;
621 *scientific_name = \&node_name;
623 =head2 common_names
625 Title : common_names
626 Usage : $taxon->common_names($newval)
627 Function: Get/add the other names of this taxon, typically the genbank common
628 name and others, eg. 'Human' and 'man'. common_name() is a synonym
629 of this method.
630 Returns : array of names in list context, one of those names in scalar context
631 Args : on add, new list of names (scalars, optional)
633 =cut
635 sub common_names {
636 my $self = shift;
637 my @v = @{$self->name('common', @_) || []};
638 return ( wantarray ) ? @v : pop @v;
641 *common_name = \&common_names;
643 =head2 division
645 Title : division
646 Usage : $taxon->division($newval)
647 Function: Get/set the division this taxon belongs to, eg. 'Primates' or
648 'Bacteria'.
649 Returns : value of division (a scalar)
650 Args : on set, new value (a scalar or undef, optional)
652 =cut
654 sub division {
655 my $self = shift;
656 my @v = @{$self->name('division',@_) || []};
657 return pop @v;
660 # get a node from the database that is like the supplied node
661 sub _get_similar_taxon_from_db {
662 #*** not really happy with this having to be called so much; there must be
663 # a better way...
664 my ($self, $taxon, $db) = @_;
665 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa("Bio::Taxon");
666 ($self->id || $self->node_name) || return;
667 $db ||= $self->db_handle || return;
668 if (!blessed($db) || !$db->isa('Bio::DB::Taxonomy')) {
669 $self->throw("DB handle is not a Bio::DB::Taxonomy: got $db in node ".$self->node_name)
671 my $db_taxon = $db->get_taxon(-taxonid => $taxon->id) if $taxon->id;
672 unless ($db_taxon) {
673 my @try_ids = $db->get_taxonids($taxon->node_name) if $taxon->node_name;
675 my $own_rank = $taxon->rank || 'no rank';
676 foreach my $try_id (@try_ids) {
677 my $try = $db->get_taxon(-taxonid => $try_id);
678 my $try_rank = $try->rank || 'no rank';
679 if ($own_rank eq 'no rank' || $try_rank eq 'no rank' || $own_rank eq $try_rank) {
680 $db_taxon = $try;
681 last;
686 return $db_taxon;
689 # merge data from supplied Taxon into self
690 sub _merge_taxa {
691 my ($self, $taxon) = @_;
692 $self->throw("Must supply a Bio::Taxon object") unless ref($taxon) && $taxon->isa('Bio::Taxon');
693 return if ($taxon eq $self);
695 foreach my $attrib (qw(scientific_name version authority namespace genetic_code mitochondrial_genetic_code create_date update_date pub_date division id)) {
696 my $own = $self->$attrib();
697 my $his = $taxon->$attrib();
698 if (!$own && $his) {
699 $self->$attrib($his);
703 my $own = $self->rank || 'no rank';
704 my $his = $taxon->rank || 'no rank';
705 if ($own eq 'no rank' && $his ne 'no rank') {
706 $self->rank($his);
709 my %own_cnames = map { $_ => 1 } $self->common_names;
710 my %his_cnames = map { $_ => 1 } $taxon->common_names;
711 foreach (keys %his_cnames) {
712 unless (exists $own_cnames{$_}) {
713 $self->common_names($_);
717 #*** haven't merged the other things in names() hash, could do above much easier with direct access to object data
720 =head2 remove_Descendent
722 Title : remove_Descendent
723 Usage : $node->remove_Descedent($node_foo);
724 Function: Removes a specific node from being a Descendent of this node
725 Returns : nothing
726 Args : An array of Bio::Node::NodeI objects which have been previously
727 passed to the add_Descendent call of this object.
729 =cut
731 sub remove_Descendent {
732 # need to override this method from Bio::Tree::Node since it casually
733 # throws away nodes if they don't branch
734 my ($self,@nodes) = @_;
735 my $c= 0;
736 foreach my $n ( @nodes ) {
737 if ($self->{'_desc'}->{$n->internal_id}) {
738 $self->{_removing_descendent} = 1;
739 $n->ancestor(undef);
740 $self->{_removing_descendent} = 0;
741 $self->{'_desc'}->{$n->internal_id}->ancestor(undef);
742 delete $self->{'_desc'}->{$n->internal_id};
743 $c++;
746 return $c;