FTLocationFactory.pm: Fixed a long-standing issue at "from_string()"
[bioperl-live.git] / Bio / Taxon.pm
blob245ebaa20a85a0e93ad97e0499d3ea10d37698fd
2 # BioPerl module for Bio::Taxon
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sendu Bala <bix@sendu.me.uk>
8 # Copyright Sendu Bala, based heavily on a module by Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Taxon - A node in a represented taxonomy
18 =head1 SYNOPSIS
20 use Bio::Taxon;
22 # Typically you will get a Taxon from a Bio::DB::Taxonomy object
23 # but here is how you initialize one
24 my $taxon = Bio::Taxon->new(-name => $name,
25 -id => $id,
26 -rank => $rank,
27 -division => $div);
29 # Get one from a database
30 my $dbh = Bio::DB::Taxonomy->new(-source => 'flatfile',
31 -directory=> '/tmp',
32 -nodesfile=> '/path/to/nodes.dmp',
33 -namesfile=> '/path/to/names.dmp');
34 my $human = $dbh->get_taxon(-name => 'Homo sapiens');
35 $human = $dbh->get_taxon(-taxonid => '9606');
37 print "id is ", $human->id, "\n"; # 9606
38 print "rank is ", $human->rank, "\n"; # species
39 print "scientific name is ", $human->scientific_name, "\n"; # Homo sapiens
40 print "division is ", $human->division, "\n"; # Primates
42 my $mouse = $dbh->get_taxon(-name => 'Mus musculus');
44 # You can quickly make your own lineages with the list database
45 my @ranks = qw(superkingdom class genus species);
46 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
47 my $list_dbh = Bio::DB::Taxonomy->new(-source => 'list', -names => \@h_lineage,
48 -ranks => \@ranks);
49 $human = $list_dbh->get_taxon(-name => 'Homo sapiens');
50 my @names = $human->common_names; # @names is empty
51 $human->common_names('woman');
52 @names = $human->common_names; # @names contains woman
54 # You can switch to another database when you need more information
55 my $entrez_dbh = Bio::DB::Taxonomy->new(-source => 'entrez');
56 $human->db_handle($entrez_dbh);
57 @names = $human->common_names; # @names contains woman, human, man
59 # Since Bio::Taxon implements Bio::Tree::NodeI, we have access to those
60 # methods (and can manually create our own taxa and taxonomy without the use
61 # of any database)
62 my $homo = $human->ancestor;
64 # Though be careful with each_Descendent - unless you add_Descendent()
65 # yourself, you won't get an answer because unlike for ancestor(), Bio::Taxon
66 # does not ask the database for the answer. You can ask the database yourself
67 # using the same method:
68 ($human) = $homo->db_handle->each_Descendent($homo);
70 # We can also take advantage of Bio::Tree::Tree* methods:
71 # a) some methods are available with just an empty tree object
72 use Bio::Tree::Tree;
73 my $tree_functions = Bio::Tree::Tree->new();
74 my @lineage = $tree_functions->get_lineage_nodes($human);
75 my $lineage = $tree_functions->get_lineage_string($human);
76 my $lca = $tree_functions->get_lca($human, $mouse);
78 # b) for other methods, create a tree using your Taxon object
79 my $tree = Bio::Tree::Tree->new(-node => $human);
80 my @taxa = $tree->get_nodes;
81 $homo = $tree->find_node(-rank => 'genus');
83 # Normally you can't get the lca of a list-database derived Taxon and an
84 # entrez or flatfile-derived one because the two different databases might
85 # have different roots and different numbers of ranks between the root and the
86 # taxa of interest. To solve this, make a tree of the Taxon with the more
87 # detailed lineage and splice out all the taxa that won't be in the lineage of
88 # your other Taxon:
89 my $entrez_mouse = $entrez_dbh->get_taxon(-name => 'Mus musculus');
90 my $list_human = $list_dbh->get_taxon(-name => 'Homo sapiens');
91 my $mouse_tree = Bio::Tree::Tree->new(-node => $entrez_mouse);
92 $mouse_tree->splice(-keep_rank => \@ranks);
93 $lca = $mouse_tree->get_lca($entrez_mouse, $list_human);
95 =head1 DESCRIPTION
97 This is the next generation (for Bioperl) of representing Taxonomy
98 information. Previously all information was managed by a single
99 object called Bio::Species. This new implementation allows
100 representation of the intermediate nodes not just the species nodes
101 and can relate their connections.
103 =head1 FEEDBACK
105 =head2 Mailing Lists
107 User feedback is an integral part of the evolution of this and other
108 Bioperl modules. Send your comments and suggestions preferably to
109 the Bioperl mailing list. Your participation is much appreciated.
111 bioperl-l@bioperl.org - General discussion
112 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
114 =head2 Support
116 Please direct usage questions or support issues to the mailing list:
118 I<bioperl-l@bioperl.org>
120 rather than to the module maintainer directly. Many experienced and
121 reponsive experts will be able look at the problem and quickly
122 address it. Please include a thorough description of the problem
123 with code and data examples if at all possible.
125 =head2 Reporting Bugs
127 Report bugs to the Bioperl bug tracking system to help us keep track
128 of the bugs and their resolution. Bug reports can be submitted via
129 the web:
131 https://github.com/bioperl/bioperl-live/issues
133 =head1 AUTHOR - Sendu Bala
135 Email bix@sendu.me.uk
137 =head1 CONTRIBUTORS
139 Jason Stajich, jason-at-bioperl-dot-org (original Bio::Taxonomy::Node)
140 Juguang Xiao, juguang@tll.org.sg
141 Gabriel Valiente, valiente@lsi.upc.edu
143 =head1 APPENDIX
145 The rest of the documentation details each of the object methods.
146 Internal methods are usually preceded with a _
148 =cut
151 package Bio::Taxon;
152 use strict;
153 use Scalar::Util qw(blessed);
155 use Bio::DB::Taxonomy;
157 use base qw(Bio::Tree::Node Bio::IdentifiableI);
160 =head2 new
162 Title : new
163 Usage : my $obj = Bio::Taxonomy::Node->new();
164 Function: Builds a new Bio::Taxonomy::Node object
165 Returns : an instance of Bio::Taxonomy::Node
166 Args : -dbh => a reference to a Bio::DB::Taxonomy object
167 [no default]
168 -name => a string representing the taxon name
169 (scientific name)
170 -id => human readable id - typically NCBI taxid
171 -ncbi_taxid => same as -id, but explicitly say that it is an
172 NCBI taxid
173 -rank => node rank (one of 'species', 'genus', etc)
174 -common_names => array ref of all common names
175 -division => 'Primates', 'Rodents', etc
176 -genetic_code => genetic code table number
177 -mito_genetic_code => mitochondrial genetic code table number
178 -create_date => date created in database
179 -update_date => date last updated in database
180 -pub_date => date published in database
182 =cut
184 sub new {
185 my ($class, @args) = @_;
186 my $self = $class->SUPER::new(@args);
187 my ($name, $id, $objid, $rank, $div, $dbh, $ncbitaxid, $commonname,
188 $commonnames, $gcode, $mitocode, $createdate, $updatedate, $pubdate,
189 $parent_id) = $self->_rearrange([qw(NAME ID OBJECT_ID RANK DIVISION DBH
190 NCBI_TAXID COMMON_NAME COMMON_NAMES
191 GENETIC_CODE MITO_GENETIC_CODE
192 CREATE_DATE UPDATE_DATE PUB_DATE
193 PARENT_ID)], @args);
195 if (defined $id && (defined $ncbitaxid && $ncbitaxid ne $id || defined $objid && $objid ne $id)) {
196 $self->warn("Only provide one of -id, -object_id or -ncbi_taxid, using $id\n");
198 elsif(!defined $id) {
199 $id = $objid || $ncbitaxid;
201 defined $id && $self->id($id);
202 $self->{_ncbi_tax_id_provided} = 1 if $ncbitaxid;
204 defined $rank && $self->rank($rank);
205 defined $name && $self->node_name($name);
207 my @common_names;
208 if ($commonnames) {
209 $self->throw("-common_names takes only an array reference") unless $commonnames
210 && ref($commonnames) eq 'ARRAY';
211 @common_names = @{$commonnames};
213 if ($commonname) {
214 my %c_names = map { $_ => 1 } @common_names;
215 unless (exists $c_names{$commonname}) {
216 unshift(@common_names, $commonname);
219 @common_names > 0 && $self->common_names(@common_names);
221 defined $gcode && $self->genetic_code($gcode);
222 defined $mitocode && $self->mitochondrial_genetic_code($mitocode);
223 defined $createdate && $self->create_date($createdate);
224 defined $updatedate && $self->update_date($updatedate);
225 defined $pubdate && $self->pub_date($pubdate);
226 defined $div && $self->division($div);
227 defined $dbh && $self->db_handle($dbh);
229 # deprecated and will issue a warning when method called,
230 # eventually to be removed completely as option
231 defined $parent_id && $self->parent_id($parent_id);
233 # some things want to freeze/thaw Bio::Species objects, but
234 # _root_cleanup_methods contains a CODE ref, delete it.
235 delete $self->{_root_cleanup_methods};
237 return $self;
241 =head1 Bio::IdentifiableI interface
243 Also see L<Bio::IdentifiableI>
245 =head2 version
247 Title : version
248 Usage : $taxon->version($newval)
249 Returns : value of version (a scalar)
250 Args : on set, new value (a scalar or undef, optional)
252 =cut
254 sub version {
255 my $self = shift;
256 return $self->{'version'} = shift if @_;
257 return $self->{'version'};
261 =head2 authority
263 Title : authority
264 Usage : $taxon->authority($newval)
265 Returns : value of authority (a scalar)
266 Args : on set, new value (a scalar or undef, optional)
268 =cut
270 sub authority {
271 my $self = shift;
272 return $self->{'authority'} = shift if @_;
273 return $self->{'authority'};
277 =head2 namespace
279 Title : namespace
280 Usage : $taxon->namespace($newval)
281 Returns : value of namespace (a scalar)
282 Args : on set, new value (a scalar or undef, optional)
284 =cut
286 sub namespace {
287 my $self = shift;
288 return $self->{'namespace'} = shift if @_;
289 return $self->{'namespace'};
293 =head1 Bio::Taxonomy::Node implementation
295 =head2 db_handle
297 Title : db_handle
298 Usage : $taxon->db_handle($newval)
299 Function: Get/Set Bio::DB::Taxonomy Handle
300 Returns : value of db_handle (a scalar) (Bio::DB::Taxonomy object)
301 Args : on set, new value (a scalar, optional) Bio::DB::Taxonomy object
303 Also see L<Bio::DB::Taxonomy>
305 =cut
307 sub db_handle {
308 my $self = shift;
309 if (@_) {
310 my $db = shift;
312 if (! ref($db) || ! $db->isa('Bio::DB::Taxonomy')) {
313 $self->throw("Must provide a valid Bio::DB::Taxonomy object to db_handle()");
315 if (!$self->{'db_handle'} || ($self->{'db_handle'} && $self->{'db_handle'} ne $db)) {
316 my $new_self = $self->_get_similar_taxon_from_db($self, $db);
317 $self->_merge_taxa($new_self) if $new_self;
320 # NB: The Bio::DB::Taxonomy modules access this data member directly
321 # to avoid calling this method and going infinite
322 $self->{'db_handle'} = $db;
324 return $self->{'db_handle'};
328 =head2 rank
330 Title : rank
331 Usage : $taxon->rank($newval)
332 Function: Get/set rank of this Taxon, 'species', 'genus', 'order', etc...
333 Returns : value of rank (a scalar)
334 Args : on set, new value (a scalar or undef, optional)
336 =cut
338 sub rank {
339 my $self = shift;
340 return $self->{'rank'} = shift if @_;
341 return $self->{'rank'};
345 =head2 id
347 Title : id
348 Usage : $taxon->id($newval)
349 Function: Get/Set id (NCBI Taxonomy ID in most cases); object_id() and
350 ncbi_taxid() are synonyms of this method.
351 Returns : id (a scalar)
352 Args : none to get, OR scalar to set
354 =cut
356 sub id {
357 my $self = shift;
358 return $self->SUPER::id(@_);
361 *object_id = \&id;
364 =head2 ncbi_taxid
366 Title : ncbi_taxid
367 Usage : $taxon->ncbi_taxid($newval)
368 Function: Get/Set the NCBI Taxonomy ID; This actually sets the id() but only
369 returns an id when ncbi_taxid has been explictely set with this
370 method.
371 Returns : id (a scalar)
372 Args : none to get, OR scalar to set
374 =cut
376 sub ncbi_taxid {
377 my ($self, $id) = @_;
379 if ($id) {
380 $self->{_ncbi_tax_id_provided} = 1;
381 return $self->SUPER::id($id);
384 if ($self->{_ncbi_tax_id_provided}) {
385 return $self->SUPER::id;
387 return;
391 =head2 parent_id
393 Title : parent_id
394 Usage : $taxon->parent_id()
395 Function: Get parent ID, (NCBI Taxonomy ID in most cases);
396 parent_taxon_id() is a synonym of this method.
397 Returns : value of parent_id (a scalar)
398 Args : none
399 Status : deprecated
401 =cut
403 sub parent_id {
404 my $self = shift;
405 if (@_) {
406 $self->warn("You can no longer set the parent_id - use ancestor() instead");
408 my $ancestor = $self->ancestor() || return;
409 return $ancestor->id;
412 *parent_taxon_id = \&parent_id;
415 =head2 genetic_code
417 Title : genetic_code
418 Usage : $taxon->genetic_code($newval)
419 Function: Get/set genetic code table
420 Returns : value of genetic_code (a scalar)
421 Args : on set, new value (a scalar or undef, optional)
423 =cut
425 sub genetic_code {
426 my $self = shift;
427 return $self->{'genetic_code'} = shift if @_;
428 return $self->{'genetic_code'};
432 =head2 mitochondrial_genetic_code
434 Title : mitochondrial_genetic_code
435 Usage : $taxon->mitochondrial_genetic_code($newval)
436 Function: Get/set mitochondrial genetic code table
437 Returns : value of mitochondrial_genetic_code (a scalar)
438 Args : on set, new value (a scalar or undef, optional)
440 =cut
442 sub mitochondrial_genetic_code {
443 my $self = shift;
444 return $self->{'mitochondrial_genetic_code'} = shift if @_;
445 return $self->{'mitochondrial_genetic_code'};
449 =head2 create_date
451 Title : create_date
452 Usage : $taxon->create_date($newval)
453 Function: Get/Set Date this node was created (in the database)
454 Returns : value of create_date (a scalar)
455 Args : on set, new value (a scalar or undef, optional)
457 =cut
459 sub create_date {
460 my $self = shift;
461 return $self->{'create_date'} = shift if @_;
462 return $self->{'create_date'};
466 =head2 update_date
468 Title : update_date
469 Usage : $taxon->update_date($newval)
470 Function: Get/Set Date this node was updated (in the database)
471 Returns : value of update_date (a scalar)
472 Args : on set, new value (a scalar or undef, optional)
474 =cut
476 sub update_date {
477 my $self = shift;
478 return $self->{'update_date'} = shift if @_;
479 return $self->{'update_date'};
483 =head2 pub_date
485 Title : pub_date
486 Usage : $taxon->pub_date($newval)
487 Function: Get/Set Date this node was published (in the database)
488 Returns : value of pub_date (a scalar)
489 Args : on set, new value (a scalar or undef, optional)
491 =cut
493 sub pub_date {
494 my $self = shift;
495 return $self->{'pub_date'} = shift if @_;
496 return $self->{'pub_date'};
500 =head2 ancestor
502 Title : ancestor
503 Usage : my $ancestor_taxon = $taxon->ancestor()
504 Function: Retrieve the ancestor taxon. Normally the database is asked what the
505 ancestor is.
507 If you manually set the ancestor (or you make a Bio::Tree::Tree with
508 this object as an argument to new()), the database (if any) will not
509 be used for the purposes of this method.
511 To restore normal database behaviour, call ancestor(undef) (which
512 would remove this object from the tree), or request this taxon again
513 as a new Taxon object from the database.
515 Returns : Bio::Taxon
516 Args : none
518 =cut
520 sub ancestor {
521 my $self = shift;
522 my $ancestor = $self->SUPER::ancestor(@_);
523 if ($ancestor) {
524 return $ancestor;
526 my $dbh = $self->db_handle;
527 #*** could avoid the db lookup if we knew our current id was definitely
528 # information from the db...
529 my $definitely_from_dbh = $self->_get_similar_taxon_from_db($self);
530 return $dbh->ancestor($definitely_from_dbh);
534 =head2 get_Parent_Node
536 Title : get_Parent_Node
537 Function: Synonym of ancestor()
538 Status : deprecated
540 =cut
542 sub get_Parent_Node {
543 my $self = shift;
544 $self->warn("get_Parent_Node is deprecated, use ancestor() instead");
545 return $self->ancestor(@_);
549 =head2 each_Descendent
551 Title : each_Descendent
552 Usage : my @taxa = $taxon->each_Descendent();
553 Function: Get all the descendents for this Taxon (but not their descendents,
554 ie. not a recursive fetchall). get_Children_Nodes() is a synonym of
555 this method.
557 Note that this method never asks the database for the descendents;
558 it will only return objects you have manually set with
559 add_Descendent(), or where this was done for you by making a
560 Bio::Tree::Tree with this object as an argument to new().
562 To get the database descendents use
563 $taxon->db_handle->each_Descendent($taxon).
565 Returns : Array of Bio::Taxon objects
566 Args : optionally, when you have set your own descendents, the string
567 "height", "creation", "alpha", "revalpha", or coderef to be used to
568 sort the order of children nodes.
570 =cut
573 # implemented by Bio::Tree::Node
575 =head2 get_Children_Nodes
577 Title : get_Children_Nodes
578 Function: Synonym of each_Descendent()
579 Status : deprecated
581 =cut
583 sub get_Children_Nodes {
584 my $self = shift;
585 $self->warn("get_Children_Nodes is deprecated, use each_Descendent() instead");
586 return $self->each_Descendent(@_);
590 =head2 name
592 Title: name
593 Usage: $taxon->name('scientific', 'Homo sapiens');
594 $taxon->name('common', 'human', 'man');
595 my @names = @{$taxon->name('common')};
596 Function: Get/set the names. node_name(), scientific_name() and common_names()
597 are shorthands to name('scientific'), name('scientific') and
598 name('common') respectively.
599 Returns: names (a array reference)
600 Args: Arg1 => the name_class. You can assign any text, but the words
601 'scientific' and 'common' have the special meaning, as
602 scientific name and common name, respectively. 'scientific' and
603 'division' are treated specially, allowing only the first value
604 in the Arg2 list to be set.
605 Arg2 ... => list of names
607 =cut
609 sub name {
610 my ($self, $name_class, @names) = @_;
611 $self->throw('No name class specified') unless defined $name_class;
613 if (@names) {
614 if ($name_class =~ /scientific|division/i) {
615 delete $self->{'_names_hash'}->{$name_class};
616 @names = (shift(@names));
618 push @{$self->{'_names_hash'}->{$name_class}}, @names;
620 return $self->{'_names_hash'}->{$name_class} || return;
624 =head2 node_name
626 Title : node_name
627 Usage : $taxon->node_name($newval)
628 Function: Get/set the name of this taxon (node), typically the scientific name
629 of the taxon, eg. 'Primate' or 'Homo'; scientific_name() is a synonym
630 of this method.
631 Returns : value of node_name (a scalar)
632 Args : on set, new value (a scalar or undef, optional)
634 =cut
636 sub node_name {
637 my $self = shift;
638 my @v = @{$self->name('scientific', @_) || []};
639 return pop @v;
642 *scientific_name = \&node_name;
645 =head2 common_names
647 Title : common_names
648 Usage : $taxon->common_names($newval)
649 Function: Get/add the other names of this taxon, typically the genbank common
650 name and others, eg. 'Human' and 'man'. common_name() is a synonym
651 of this method.
652 Returns : array of names in list context, one of those names in scalar context
653 Args : on add, new list of names (scalars, optional)
655 =cut
657 sub common_names {
658 my $self = shift;
659 my @v = @{$self->name('common', @_) || []};
660 return ( wantarray ) ? @v : pop @v;
663 *common_name = \&common_names;
666 =head2 division
668 Title : division
669 Usage : $taxon->division($newval)
670 Function: Get/set the division this taxon belongs to, eg. 'Primates' or
671 'Bacteria'.
672 Returns : value of division (a scalar)
673 Args : on set, new value (a scalar or undef, optional)
675 =cut
677 sub division {
678 my $self = shift;
679 my @v = @{$self->name('division',@_) || []};
680 return pop @v;
684 # get a node from the database that is like the supplied node
685 sub _get_similar_taxon_from_db {
686 #*** not really happy with this having to be called so much; there must be
687 # a better way...
688 my ($self, $taxon, $db) = @_;
689 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa("Bio::Taxon");
690 ($self->id || $self->node_name) || return;
691 $db ||= $self->db_handle || return;
692 if (!blessed($db) || !$db->isa('Bio::DB::Taxonomy')) {
693 $self->throw("DB handle is not a Bio::DB::Taxonomy: got $db in node ".$self->node_name)
695 my $db_taxon = $db->get_taxon(-taxonid => $taxon->id) if $taxon->id;
696 unless ($db_taxon) {
697 my @try_ids = $db->get_taxonids($taxon->node_name) if $taxon->node_name;
699 my $own_rank = $taxon->rank || 'no rank';
700 foreach my $try_id (@try_ids) {
701 my $try = $db->get_taxon(-taxonid => $try_id);
702 my $try_rank = $try->rank || 'no rank';
703 if ($own_rank eq 'no rank' || $try_rank eq 'no rank' || $own_rank eq $try_rank) {
704 $db_taxon = $try;
705 last;
710 return $db_taxon;
714 # merge data from supplied Taxon into self
715 sub _merge_taxa {
716 my ($self, $taxon) = @_;
717 $self->throw("Must supply a Bio::Taxon object") unless ref($taxon) && $taxon->isa('Bio::Taxon');
718 return if ($taxon eq $self);
720 foreach my $attrib (qw(scientific_name version authority namespace genetic_code mitochondrial_genetic_code create_date update_date pub_date division id)) {
721 my $own = $self->$attrib();
722 my $his = $taxon->$attrib();
723 if (!$own && $his) {
724 $self->$attrib($his);
728 my $own = $self->rank || 'no rank';
729 my $his = $taxon->rank || 'no rank';
730 if ($own eq 'no rank' && $his ne 'no rank') {
731 $self->rank($his);
734 my %own_cnames = map { $_ => 1 } $self->common_names;
735 my %his_cnames = map { $_ => 1 } $taxon->common_names;
736 foreach (keys %his_cnames) {
737 unless (exists $own_cnames{$_}) {
738 $self->common_names($_);
742 #*** haven't merged the other things in names() hash, could do above much easier with direct access to object data
746 =head2 remove_Descendent
748 Title : remove_Descendent
749 Usage : $node->remove_Descedent($node_foo);
750 Function: Removes a specific node from being a Descendent of this node
751 Returns : nothing
752 Args : An array of Bio::Node::NodeI objects which have been previously
753 passed to the add_Descendent call of this object.
755 =cut
757 sub remove_Descendent {
758 # need to override this method from Bio::Tree::Node since it casually
759 # throws away nodes if they don't branch
760 my ($self,@nodes) = @_;
761 my $c= 0;
762 foreach my $n ( @nodes ) {
763 if ($self->{'_desc'}->{$n->internal_id}) {
764 $self->{_removing_descendent} = 1;
765 $n->ancestor(undef);
766 $self->{_removing_descendent} = 0;
767 $self->{'_desc'}->{$n->internal_id}->ancestor(undef);
768 delete $self->{'_desc'}->{$n->internal_id};
769 $c++;
772 return $c;