* sync with trunk
[bioperl-live.git] / Bio / Taxon.pm
blobaba57dc292b17174ebff7a5f7759fa2c4986008f
1 # $Id$
3 # BioPerl module for Bio::Taxon
5 # Cared for by Sendu Bala <bix@sendu.me.uk>
7 # Copyright Sendu Bala, based heavily on a module by Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Taxon - A node in a represented taxonomy
17 =head1 SYNOPSIS
19 use Bio::Taxon;
21 # Typically you will get a Taxon from a Bio::DB::Taxonomy object
22 # but here is how you initialize one
23 my $taxon = Bio::Taxon->new(-name => $name,
24 -id => $id,
25 -rank => $rank,
26 -division => $div);
28 # Get one from a database
29 my $dbh = Bio::DB::Taxonomy->new(-source => 'flatfile',
30 -directory=> '/tmp',
31 -nodesfile=> '/path/to/nodes.dmp',
32 -namesfile=> '/path/to/names.dmp');
33 my $human = $dbh->get_taxon(-name => 'Homo sapiens');
34 $human = $dbh->get_taxon(-taxonid => '9606');
36 print "id is ", $human->id, "\n"; # 9606
37 print "rank is ", $human->rank, "\n"; # species
38 print "scientific name is ", $human->scientific_name, "\n"; # Homo sapiens
39 print "division is ", $human->division, "\n"; # Primates
41 my $mouse = $dbh->get_taxon(-name => 'Mus musculus');
43 # You can quickly make your own lineages with the list database
44 my @ranks = qw(superkingdom class genus species);
45 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
46 my $list_dbh = Bio::DB::Taxonomy->new(-source => 'list', -names => \@h_lineage,
47 -ranks => \@ranks);
48 $human = $list_dbh->get_taxon(-name => 'Homo sapiens');
49 my @names = $human->common_names; # @names is empty
50 $human->common_names('woman');
51 @names = $human->common_names; # @names contains woman
53 # You can switch to another database when you need more information
54 my $entrez_dbh = Bio::DB::Taxonomy->new(-source => 'entrez');
55 $human->db_handle($entrez_dbh);
56 @names = $human->common_names; # @names contains woman, human, man
58 # Since Bio::Taxon implements Bio::Tree::NodeI, we have access to those
59 # methods (and can manually create our own taxa and taxonomy without the use
60 # of any database)
61 my $homo = $human->ancestor;
63 # Though be careful with each_Descendent - unless you add_Descendent()
64 # yourself, you won't get an answer because unlike for ancestor(), Bio::Taxon
65 # does not ask the database for the answer. You can ask the database yourself
66 # using the same method:
67 ($human) = $homo->db_handle->each_Descendent($homo);
69 # We can also take advantage of Bio::Tree::Tree* methods:
70 # a) some methods are available with just an empty tree object
71 use Bio::Tree::Tree;
72 my $tree_functions = Bio::Tree::Tree->new();
73 my @lineage = $tree_functions->get_lineage_nodes($human);
74 my $lca = $tree_functions->get_lca($human, $mouse);
76 # b) for other methods, create a tree using your Taxon object
77 my $tree = Bio::Tree::Tree->new(-node => $human);
78 my @taxa = $tree->get_nodes;
79 $homo = $tree->find_node(-rank => 'genus');
81 # Normally you can't get the lca of a list-database derived Taxon and an
82 # entrez or flatfile-derived one because the two different databases might
83 # have different roots and different numbers of ranks between the root and the
84 # taxa of interest. To solve this, make a tree of the Taxon with the more
85 # detailed lineage and splice out all the taxa that won't be in the lineage of
86 # your other Taxon:
87 my $entrez_mouse = $entrez_dbh->get_taxon(-name => 'Mus musculus');
88 my $list_human = $list_dbh->get_taxon(-name => 'Homo sapiens');
89 my $mouse_tree = Bio::Tree::Tree->new(-node => $entrez_mouse);
90 $mouse_tree->splice(-keep_rank => \@ranks);
91 $lca = $mouse_tree->get_lca($entrez_mouse, $list_human);
93 =head1 DESCRIPTION
95 This is the next generation (for Bioperl) of representing Taxonomy
96 information. Previously all information was managed by a single
97 object called Bio::Species. This new implementation allows
98 representation of the intermediate nodes not just the species nodes
99 and can relate their connections.
101 =head1 FEEDBACK
103 =head2 Mailing Lists
105 User feedback is an integral part of the evolution of this and other
106 Bioperl modules. Send your comments and suggestions preferably to
107 the Bioperl mailing list. Your participation is much appreciated.
109 bioperl-l@bioperl.org - General discussion
110 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
112 =head2 Reporting Bugs
114 Report bugs to the Bioperl bug tracking system to help us keep track
115 of the bugs and their resolution. Bug reports can be submitted via
116 the web:
118 http://bugzilla.open-bio.org/
120 =head1 AUTHOR - Sendu Bala
122 Email bix@sendu.me.uk
124 =head1 CONTRIBUTORS
126 Jason Stajich, jason-at-bioperl-dot-org (original Bio::Taxonomy::Node)
127 Juguang Xiao, juguang@tll.org.sg
128 Gabriel Valiente, valiente@lsi.upc.edu
130 =head1 APPENDIX
132 The rest of the documentation details each of the object methods.
133 Internal methods are usually preceded with a _
135 =cut
137 package Bio::Taxon;
138 use strict;
140 use Bio::DB::Taxonomy;
142 use base qw(Bio::Tree::Node Bio::IdentifiableI);
144 =head2 new
146 Title : new
147 Usage : my $obj = Bio::Taxonomy::Node->new();
148 Function: Builds a new Bio::Taxonomy::Node object
149 Returns : an instance of Bio::Taxonomy::Node
150 Args : -dbh => a reference to a Bio::DB::Taxonomy object
151 [no default]
152 -name => a string representing the taxon name
153 (scientific name)
154 -id => human readable id - typically NCBI taxid
155 -ncbi_taxid => same as -id, but explicitely say that it is an
156 NCBI taxid
157 -rank => node rank (one of 'species', 'genus', etc)
158 -common_names => array ref of all common names
159 -division => 'Primates', 'Rodents', etc
160 -genetic_code => genetic code table number
161 -mito_genetic_code => mitochondrial genetic code table number
162 -create_date => date created in database
163 -update_date => date last updated in database
164 -pub_date => date published in database
166 =cut
168 sub new {
169 my ($class, @args) = @_;
170 my $self = $class->SUPER::new(@args);
171 my ($name, $id, $objid, $rank, $div, $dbh, $ncbitaxid, $commonname,
172 $commonnames, $gcode, $mitocode, $createdate, $updatedate, $pubdate,
173 $parent_id) = $self->_rearrange([qw(NAME ID OBJECT_ID RANK DIVISION DBH
174 NCBI_TAXID COMMON_NAME COMMON_NAMES
175 GENETIC_CODE MITO_GENETIC_CODE
176 CREATE_DATE UPDATE_DATE PUB_DATE
177 PARENT_ID)], @args);
179 if (defined $id && (defined $ncbitaxid && $ncbitaxid ne $id || defined $objid && $objid ne $id)) {
180 $self->warn("Only provide one of -id, -object_id or -ncbi_taxid, using $id\n");
182 elsif(!defined $id) {
183 $id = $objid || $ncbitaxid;
185 defined $id && $self->id($id);
186 $self->{_ncbi_tax_id_provided} = 1 if $ncbitaxid;
188 defined $rank && $self->rank($rank);
189 defined $name && $self->node_name($name);
191 my @common_names;
192 if ($commonnames) {
193 $self->throw("-common_names takes only an array reference") unless $commonnames
194 && ref($commonnames) eq 'ARRAY';
195 @common_names = @{$commonnames};
197 if ($commonname) {
198 my %c_names = map { $_ => 1 } @common_names;
199 unless (exists $c_names{$commonname}) {
200 unshift(@common_names, $commonname);
203 @common_names > 0 && $self->common_names(@common_names);
205 defined $gcode && $self->genetic_code($gcode);
206 defined $mitocode && $self->mitochondrial_genetic_code($mitocode);
207 defined $createdate && $self->create_date($createdate);
208 defined $updatedate && $self->update_date($updatedate);
209 defined $pubdate && $self->pub_date($pubdate);
210 defined $div && $self->division($div);
211 defined $dbh && $self->db_handle($dbh);
213 # deprecated and will issue a warning when method called,
214 # eventually to be removed completely as option
215 defined $parent_id && $self->parent_id($parent_id);
217 # some things want to freeze/thaw Bio::Species objects, but
218 # _root_cleanup_methods contains a CODE ref, delete it.
219 delete $self->{_root_cleanup_methods};
221 return $self;
224 =head1 Bio::IdentifiableI interface
226 Also see L<Bio::IdentifiableI>
228 =head2 version
230 Title : version
231 Usage : $taxon->version($newval)
232 Returns : value of version (a scalar)
233 Args : on set, new value (a scalar or undef, optional)
235 =cut
237 sub version {
238 my $self = shift;
239 return $self->{'version'} = shift if @_;
240 return $self->{'version'};
243 =head2 authority
245 Title : authority
246 Usage : $taxon->authority($newval)
247 Returns : value of authority (a scalar)
248 Args : on set, new value (a scalar or undef, optional)
250 =cut
252 sub authority {
253 my $self = shift;
254 return $self->{'authority'} = shift if @_;
255 return $self->{'authority'};
258 =head2 namespace
260 Title : namespace
261 Usage : $taxon->namespace($newval)
262 Returns : value of namespace (a scalar)
263 Args : on set, new value (a scalar or undef, optional)
265 =cut
267 sub namespace {
268 my $self = shift;
269 return $self->{'namespace'} = shift if @_;
270 return $self->{'namespace'};
273 =head1 Bio::Taxonomy::Node implementation
275 =head2 db_handle
277 Title : db_handle
278 Usage : $taxon->db_handle($newval)
279 Function: Get/Set Bio::DB::Taxonomy Handle
280 Returns : value of db_handle (a scalar) (Bio::DB::Taxonomy object)
281 Args : on set, new value (a scalar, optional) Bio::DB::Taxonomy object
283 Also see L<Bio::DB::Taxonomy>
285 =cut
287 sub db_handle {
288 my $self = shift;
289 if (@_) {
290 my $db = shift;
292 if (! ref($db) || ! $db->isa('Bio::DB::Taxonomy')) {
293 $self->throw("Must provide a valid Bio::DB::Taxonomy object to db_handle()");
295 if (!$self->{'db_handle'} || ($self->{'db_handle'} && $self->{'db_handle'} ne $db)) {
296 my $new_self = $self->_get_similar_taxon_from_db($self, $db);
297 $self->_merge_taxa($new_self) if $new_self;
300 # NB: The Bio::DB::Taxonomy modules access this data member directly
301 # to avoid calling this method and going infinite
302 $self->{'db_handle'} = $db;
304 return $self->{'db_handle'};
307 =head2 rank
309 Title : rank
310 Usage : $taxon->rank($newval)
311 Function: Get/set rank of this Taxon, 'species', 'genus', 'order', etc...
312 Returns : value of rank (a scalar)
313 Args : on set, new value (a scalar or undef, optional)
315 =cut
317 sub rank {
318 my $self = shift;
319 return $self->{'rank'} = shift if @_;
320 return $self->{'rank'};
323 =head2 id
325 Title : id
326 Usage : $taxon->id($newval)
327 Function: Get/Set id (NCBI Taxonomy ID in most cases); object_id() and
328 ncbi_taxid() are synonyms of this method.
329 Returns : id (a scalar)
330 Args : none to get, OR scalar to set
332 =cut
334 sub id {
335 my $self = shift;
336 return $self->SUPER::id(@_);
339 *object_id = \&id;
341 =head2 ncbi_taxid
343 Title : ncbi_taxid
344 Usage : $taxon->ncbi_taxid($newval)
345 Function: Get/Set the NCBI Taxonomy ID; This actually sets the id() but only
346 returns an id when ncbi_taxid has been explictely set with this
347 method.
348 Returns : id (a scalar)
349 Args : none to get, OR scalar to set
351 =cut
353 sub ncbi_taxid {
354 my ($self, $id) = @_;
356 if ($id) {
357 $self->{_ncbi_tax_id_provided} = 1;
358 return $self->SUPER::id($id);
361 if ($self->{_ncbi_tax_id_provided}) {
362 return $self->SUPER::id;
364 return;
367 =head2 parent_id
369 Title : parent_id
370 Usage : $taxon->parent_id()
371 Function: Get parent ID, (NCBI Taxonomy ID in most cases);
372 parent_taxon_id() is a synonym of this method.
373 Returns : value of parent_id (a scalar)
374 Args : none
375 Status : deprecated
377 =cut
379 sub parent_id {
380 my $self = shift;
381 if (@_) {
382 $self->warn("You can no longer set the parent_id - use ancestor() instead");
384 my $ancestor = $self->ancestor() || return;
385 return $ancestor->id;
388 *parent_taxon_id = \&parent_id;
390 =head2 genetic_code
392 Title : genetic_code
393 Usage : $taxon->genetic_code($newval)
394 Function: Get/set genetic code table
395 Returns : value of genetic_code (a scalar)
396 Args : on set, new value (a scalar or undef, optional)
398 =cut
400 sub genetic_code {
401 my $self = shift;
402 return $self->{'genetic_code'} = shift if @_;
403 return $self->{'genetic_code'};
406 =head2 mitochondrial_genetic_code
408 Title : mitochondrial_genetic_code
409 Usage : $taxon->mitochondrial_genetic_code($newval)
410 Function: Get/set mitochondrial genetic code table
411 Returns : value of mitochondrial_genetic_code (a scalar)
412 Args : on set, new value (a scalar or undef, optional)
414 =cut
416 sub mitochondrial_genetic_code {
417 my $self = shift;
418 return $self->{'mitochondrial_genetic_code'} = shift if @_;
419 return $self->{'mitochondrial_genetic_code'};
422 =head2 create_date
424 Title : create_date
425 Usage : $taxon->create_date($newval)
426 Function: Get/Set Date this node was created (in the database)
427 Returns : value of create_date (a scalar)
428 Args : on set, new value (a scalar or undef, optional)
430 =cut
432 sub create_date {
433 my $self = shift;
434 return $self->{'create_date'} = shift if @_;
435 return $self->{'create_date'};
438 =head2 update_date
440 Title : update_date
441 Usage : $taxon->update_date($newval)
442 Function: Get/Set Date this node was updated (in the database)
443 Returns : value of update_date (a scalar)
444 Args : on set, new value (a scalar or undef, optional)
446 =cut
448 sub update_date {
449 my $self = shift;
450 return $self->{'update_date'} = shift if @_;
451 return $self->{'update_date'};
454 =head2 pub_date
456 Title : pub_date
457 Usage : $taxon->pub_date($newval)
458 Function: Get/Set Date this node was published (in the database)
459 Returns : value of pub_date (a scalar)
460 Args : on set, new value (a scalar or undef, optional)
462 =cut
464 sub pub_date {
465 my $self = shift;
466 return $self->{'pub_date'} = shift if @_;
467 return $self->{'pub_date'};
470 =head2 ancestor
472 Title : ancestor
473 Usage : my $ancestor_taxon = $taxon->ancestor()
474 Function: Retrieve the ancestor taxon. Normally the database is asked what the
475 ancestor is.
477 If you manually set the ancestor (or you make a Bio::Tree::Tree with
478 this object as an argument to new()), the database (if any) will not
479 be used for the purposes of this method.
481 To restore normal database behaviour, call ancestor(undef) (which
482 would remove this object from the tree), or request this taxon again
483 as a new Taxon object from the database.
485 Returns : Bio::Taxon
486 Args : none
488 =cut
490 sub ancestor {
491 my $self = shift;
492 my $ancestor = $self->SUPER::ancestor(@_);
493 my $dbh = $self->db_handle || return $ancestor;
495 if ($ancestor) {
496 return $ancestor;
498 else {
499 #*** could avoid the db lookup if we knew our current id was definitely
500 # information from the db...
501 my $definitely_from_dbh = $self->_get_similar_taxon_from_db($self);
502 return $dbh->ancestor($definitely_from_dbh);
506 =head2 get_Parent_Node
508 Title : get_Parent_Node
509 Function: Synonym of ancestor()
510 Status : deprecated
512 =cut
514 sub get_Parent_Node {
515 my $self = shift;
516 $self->warn("get_Parent_Node is deprecated, use ancestor() instead");
517 return $self->ancestor(@_);
520 =head2 each_Descendent
522 Title : each_Descendent
523 Usage : my @taxa = $taxon->each_Descendent();
524 Function: Get all the descendents for this Taxon (but not their descendents,
525 ie. not a recursive fetchall). get_Children_Nodes() is a synonym of
526 this method.
528 Note that this method never asks the database for the descendents;
529 it will only return objects you have manually set with
530 add_Descendent(), or where this was done for you by making a
531 Bio::Tree::Tree with this object as an argument to new().
533 To get the database descendents use
534 $taxon->db_handle->each_Descendent($taxon).
536 Returns : Array of Bio::Taxon objects
537 Args : optionally, when you have set your own descendents, the string
538 "height", "creation", "alpha", "revalpha", or coderef to be used to
539 sort the order of children nodes.
541 =cut
543 # implemented by Bio::Tree::Node
545 =head2 get_Children_Nodes
547 Title : get_Children_Nodes
548 Function: Synonym of each_Descendent()
549 Status : deprecated
551 =cut
553 sub get_Children_Nodes {
554 my $self = shift;
555 $self->warn("get_Children_Nodes is deprecated, use each_Descendent() instead");
556 return $self->each_Descendent(@_);
559 =head2 name
561 Title: name
562 Usage: $taxon->name('scientific', 'Homo sapiens');
563 $taxon->name('common', 'human', 'man');
564 my @names = @{$taxon->name('common')};
565 Function: Get/set the names. node_name(), scientific_name() and common_names()
566 are shorthands to name('scientific'), name('scientific') and
567 name('common') respectively.
568 Returns: names (a array reference)
569 Args: Arg1 => the name_class. You can assign any text, but the words
570 'scientific' and 'common' have the special meaning, as
571 scientific name and common name, respectively. 'scientific' and
572 'division' are treated specially, allowing only the first value
573 in the Arg2 list to be set.
574 Arg2 .. => list of names
576 =cut
578 sub name {
579 my ($self, $name_class, @names) = @_;
580 $self->throw('No name class specified') unless defined $name_class;
582 if (@names) {
583 if ($name_class =~ /scientific|division/i) {
584 delete $self->{'_names_hash'}->{$name_class};
585 @names = (shift(@names));
587 push @{$self->{'_names_hash'}->{$name_class}}, @names;
589 return $self->{'_names_hash'}->{$name_class} || return;
592 =head2 node_name
594 Title : node_name
595 Usage : $taxon->node_name($newval)
596 Function: Get/set the name of this taxon (node), typically the scientific name
597 of the taxon, eg. 'Primate' or 'Homo'; scientific_name() is a synonym
598 of this method.
599 Returns : value of node_name (a scalar)
600 Args : on set, new value (a scalar or undef, optional)
602 =cut
604 sub node_name {
605 my $self = shift;
606 my @v = @{$self->name('scientific', @_) || []};
607 return pop @v;
610 *scientific_name = \&node_name;
612 =head2 common_names
614 Title : common_names
615 Usage : $taxon->common_names($newval)
616 Function: Get/add the other names of this taxon, typically the genbank common
617 name and others, eg. 'Human' and 'man'. common_name() is a synonym
618 of this method.
619 Returns : array of names in list context, one of those names in scalar context
620 Args : on add, new list of names (scalars, optional)
622 =cut
624 sub common_names {
625 my $self = shift;
626 my @v = @{$self->name('common', @_) || []};
627 return ( wantarray ) ? @v : pop @v;
630 *common_name = \&common_names;
632 =head2 division
634 Title : division
635 Usage : $taxon->division($newval)
636 Function: Get/set the division this taxon belongs to, eg. 'Primates' or
637 'Bacteria'.
638 Returns : value of division (a scalar)
639 Args : on set, new value (a scalar or undef, optional)
641 =cut
643 sub division {
644 my $self = shift;
645 my @v = @{$self->name('division',@_) || []};
646 return pop @v;
649 # get a node from the database that is like the supplied node
650 sub _get_similar_taxon_from_db {
651 #*** not really happy with this having to be called so much; there must be
652 # a better way...
653 my ($self, $taxon, $db) = @_;
654 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa("Bio::Taxon");
655 ($self->id || $self->node_name) || return;
656 $db ||= $self->db_handle || return;
658 my $db_taxon = $db->get_taxon(-taxonid => $taxon->id) if $taxon->id;
659 unless ($db_taxon) {
660 my @try_ids = $db->get_taxonids($taxon->node_name) if $taxon->node_name;
662 my $own_rank = $taxon->rank || 'no rank';
663 foreach my $try_id (@try_ids) {
664 my $try = $db->get_taxon(-taxonid => $try_id);
665 my $try_rank = $try->rank || 'no rank';
666 if ($own_rank eq 'no rank' || $try_rank eq 'no rank' || $own_rank eq $try_rank) {
667 $db_taxon = $try;
668 last;
673 return $db_taxon;
676 # merge data from supplied Taxon into self
677 sub _merge_taxa {
678 my ($self, $taxon) = @_;
679 $self->throw("Must supply a Bio::Taxon object") unless ref($taxon) && $taxon->isa('Bio::Taxon');
680 return if ($taxon eq $self);
682 foreach my $attrib (qw(scientific_name version authority namespace genetic_code mitochondrial_genetic_code create_date update_date pub_date division id)) {
683 my $own = $self->$attrib();
684 my $his = $taxon->$attrib();
685 if (!$own && $his) {
686 $self->$attrib($his);
690 my $own = $self->rank || 'no rank';
691 my $his = $taxon->rank || 'no rank';
692 if ($own eq 'no rank' && $his ne 'no rank') {
693 $self->rank($his);
696 my %own_cnames = map { $_ => 1 } $self->common_names;
697 my %his_cnames = map { $_ => 1 } $taxon->common_names;
698 foreach (keys %his_cnames) {
699 unless (exists $own_cnames{$_}) {
700 $self->common_names($_);
704 #*** haven't merged the other things in names() hash, could do above much easier with direct access to object data
707 =head2 remove_Descendent
709 Title : remove_Descendent
710 Usage : $node->remove_Descedent($node_foo);
711 Function: Removes a specific node from being a Descendent of this node
712 Returns : nothing
713 Args : An array of Bio::Node::NodeI objects which have been previously
714 passed to the add_Descendent call of this object.
716 =cut
718 sub remove_Descendent {
719 # need to override this method from Bio::Tree::Node since it casually
720 # throws away nodes if they don't branch
721 my ($self,@nodes) = @_;
722 my $c= 0;
723 foreach my $n ( @nodes ) {
724 if ($self->{'_desc'}->{$n->internal_id}) {
725 $self->{_removing_descendent} = 1;
726 $n->ancestor(undef);
727 $self->{_removing_descendent} = 0;
728 $self->{'_desc'}->{$n->internal_id}->ancestor(undef);
729 delete $self->{'_desc'}->{$n->internal_id};
730 $c++;
733 return $c;