merge Peter's changes in
[bioperl-live.git] / Bio / Species.pm
blob7ffcc4d8e65f7c276e1a5db3be2940cf2e7c48b6
2 # BioPerl module for Bio::Species
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by James Gilbert <jgrg@sanger.ac.uk>
7 # Reimplemented by Sendu Bala <bix@sendu.me.uk>
8 # Re-reimplemented by Chris Fields <cjfields - at - bioperl dot org>
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Species - Generic species object.
18 =head1 SYNOPSIS
20 $species = Bio::Species->new(-classification => [@classification]);
21 # Can also pass classification
22 # array to new as below
24 $species->classification(qw( sapiens Homo Hominidae
25 Catarrhini Primates Eutheria
26 Mammalia Vertebrata Chordata
27 Metazoa Eukaryota ));
29 $genus = $species->genus();
31 $bi = $species->binomial(); # $bi is now "Homo sapiens"
33 # For storing common name
34 $species->common_name("human");
36 # For storing subspecies
37 $species->sub_species("accountant");
39 =head1 DESCRIPTION
41 B<NOTE: This class is planned for deprecation in favor of the simpler Bio::Taxon.
42 Please use that class instead.>
44 Provides a very simple object for storing phylogenetic information. The
45 classification is stored in an array, which is a list of nodes in a phylogenetic
46 tree. Access to getting and setting species and genus is provided, but not to
47 any of the other node types (eg: "phylum", "class", "order", "family"). There's
48 plenty of scope for making the model more sophisticated, if this is ever needed.
50 A methods are also provided for storing common names, and subspecies.
52 =head1 FEEDBACK
54 =head2 Mailing Lists
56 User feedback is an integral part of the evolution of this and other
57 Bioperl modules. Send your comments and suggestions preferably to
58 the Bioperl mailing list. Your participation is much appreciated.
60 bioperl-l@bioperl.org - General discussion
61 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
63 =head2 Support
65 Please direct usage questions or support issues to the mailing list:
67 I<bioperl-l@bioperl.org>
69 rather than to the module maintainer directly. Many experienced and
70 reponsive experts will be able look at the problem and quickly
71 address it. Please include a thorough description of the problem
72 with code and data examples if at all possible.
74 =head2 Reporting Bugs
76 Report bugs to the Bioperl bug tracking system to help us keep track
77 of the bugs and their resolution. Bug reports can be submitted via the
78 web:
80 https://redmine.open-bio.org/projects/bioperl/
82 =head1 AUTHOR
84 James Gilbert email B<jgrg@sanger.ac.uk>
86 =head1 CONTRIBUTORS
88 Sendu Bala, bix@sendu.me.uk
89 Chris Fields, cjfields at bioperl dot org
91 =head1 APPENDIX
93 The rest of the documentation details each of the object
94 methods. Internal methods are usually preceded with a _
96 =cut
98 #' Let the code begin...
100 package Bio::Species;
101 use strict;
102 use warnings;
104 use Bio::DB::Taxonomy;
105 use Bio::Tree::Tree;
106 use Bio::Taxon;
107 use base qw(Bio::Root::Root Bio::Tree::NodeI);
109 =head2 new
111 Title : new
112 Usage : my $obj = Bio::Species->new(-classification => \@class)
113 Function: Build a new Species object
114 Returns : Bio::Species object
115 Args : -ncbi_taxid => NCBI taxonomic ID (optional)
116 -classification => arrayref of classification
118 =cut
120 sub new {
121 my($class, @args) = @_;
123 my $self = $class->SUPER::new(@args);
125 # Bio::Species is now just a proxy object that just observes the NodeI
126 # interface methods but delegates them to the proper classes (Bio::Taxon and
127 # Bio::Tree::Tree). This will be surplanted by the much simpler
128 # Bio::Taxon/Bio::DB::Taxonomy modules in the future.
130 # Using a proxy allows proper GC w/o using weaken(). This just wraps the
131 # older instances, which have no reciprocal refs (thus no circular refs).
132 # This can then run proper cleanup
134 $self->taxon(Bio::Taxon->new(@args));
136 my ($org, $sp, $var, $classification) =
137 $self->_rearrange([qw(ORGANELLE
138 SUB_SPECIES
139 VARIANT
140 CLASSIFICATION)], @args);
142 if (defined $classification && ref($classification) eq "ARRAY" && @{$classification}) {
143 $self->classification(@$classification);
145 else {
146 $self->tree(Bio::Tree::Tree->new());
149 defined $org && $self->organelle($org);
150 defined $sp && $self->sub_species($sp);
151 defined $var && $self->variant($var);
153 return $self;
156 =head2 classification
158 Title : classification
159 Usage : $self->classification(@class_array);
160 @classification = $self->classification();
161 Function: Get/set the lineage of this species. The array provided must be in
162 the order ... ---> SPECIES, GENUS ---> KINGDOM ---> etc.
163 Example : $obj->classification(qw( 'Homo sapiens' Homo Hominidae
164 Catarrhini Primates Eutheria Mammalia Vertebrata
165 Chordata Metazoa Eukaryota));
166 Returns : Classification array
167 Args : Classification array
169 A reference to the classification array. In the latter case
170 if there is a second argument and it evaluates to true,
171 names will not be validated. NB: in any case, names are never
172 validated anyway.
174 =cut
176 sub classification {
177 my ($self, @vals) = @_;
179 my $taxon = $self->taxon;
181 if (@vals) {
182 if (ref($vals[0]) eq 'ARRAY') {
183 @vals = @{$vals[0]};
186 $vals[1] ||= '';
187 # make sure the lineage contains us as first or second element
188 # (lineage may have subspecies, species, genus ...)
189 my $name = $taxon->node_name;
190 my ($genus, $species) = (quotemeta($vals[1]), quotemeta($vals[0]));
191 if ($name && ($name !~ m{$species}i && $name !~ m{$genus}i) && $name !~ m{$vals[1] $vals[0]}i) {
192 if ($name =~ /^$vals[1] $vals[0]\s*(.+)/) {
193 # just assume the problem is someone tried to make a Bio::Species starting at subspecies
194 #*** no idea if this is appropriate! just a possible fix related to bug 2092
195 $self->sub_species($1);
196 $name = $taxon->node_name("$vals[1] $vals[0]");
198 else {
199 $self->warn("The supplied lineage does not start near '$name' (I was supplied '".join(" | ", @vals)."')");
203 # create a lineage for ourselves
204 my $db = Bio::DB::Taxonomy->new(-source => 'list', -names => [reverse @vals]);
205 unless ($taxon->scientific_name) {
206 # assume we're supposed to be the leaf of the supplied lineage
207 $self->taxon->scientific_name($vals[0]);
209 unless ($taxon->rank) {
210 # and that we are rank species
211 $taxon->rank('species');
214 $taxon->db_handle($db);
216 $self->tree(Bio::Tree::Tree->new(-node => $taxon));
219 @vals = ();
220 foreach my $node ($self->tree->get_lineage_nodes($taxon), $taxon) {
221 unshift(@vals, $node->scientific_name || next);
223 return @vals;
226 =head2 ncbi_taxid
228 Title : ncbi_taxid
229 Usage : $obj->ncbi_taxid($newval)
230 Function: Get/set the NCBI Taxon ID
231 Returns : the NCBI Taxon ID as a string
232 Args : newvalue to set or undef to unset (optional)
234 =cut
236 =head2 common_name
238 Title : common_name
239 Usage : $self->common_name( $common_name );
240 $common_name = $self->common_name();
241 Function: Get or set the common name of the species
242 Example : $self->common_name('human')
243 Returns : The common name in a string
244 Args : String, which is the common name (optional)
246 =cut
248 =head2 division
250 Title : division
251 Usage : $obj->division($newval)
252 Function: Genbank Division for a species
253 Returns : value of division (a scalar)
254 Args : value of division (a scalar)
256 =cut
258 =head2 species
260 Title : species
261 Usage : $self->species( $species );
262 $species = $self->species();
263 Function: Get or set the species name.
264 Note that this is NOT genus and species
265 -- use $self->binomial() for that.
266 Example : $self->species('sapiens');
267 Returns : species name as string (NOT genus and species)
268 Args : species name as string (NOT genus and species)
270 =cut
272 sub species {
273 my ($self, $species) = @_;
275 if ($species) {
276 $self->{_species} = $species;
279 unless (defined $self->{_species}) {
280 # work it out from our nodes
281 my $species_taxon = $self->tree->find_node(-rank => 'species');
282 unless ($species_taxon) {
283 # just assume we are rank species
284 $species_taxon = $self->taxon;
287 $species = $species_taxon->scientific_name;
290 # munge it like the Bio::SeqIO modules used to do
291 # (more or less copy/pasted from old Bio::SeqIO::genbank, hence comments
292 # referring to 'ORGANISM' etc.)
295 my $root = $self->tree->get_root_node;
296 unless ($root) {
297 $self->tree(Bio::Tree::Tree->new(-node => $species_taxon));
298 $root = $self->tree->get_root_node;
301 my @spflds = split(' ', $species);
302 if (@spflds > 1 && $root->node_name ne 'Viruses') {
303 $species = undef;
305 # does the next term start with uppercase?
306 # yes: valid genus; no then unconventional
307 # e.g. leaf litter basidiomycete sp. Collb2-39
308 my $genus;
309 if ($spflds[0] =~ m/^[A-Z]/) {
310 $genus = shift(@spflds);
312 else {
313 undef $genus;
316 my $sub_species;
317 if (@spflds) {
318 while (my $fld = shift @spflds) {
319 $species .= "$fld ";
320 # does it have subspecies or varieties?
321 last if ($fld =~ m/(sp\.|var\.)/);
323 chop $species; # last space
324 $sub_species = join ' ',@spflds if(@spflds);
326 else {
327 $species = 'sp.';
330 # does ORGANISM start with any words which make its genus undefined?
331 # these are in @unkn_genus
332 # this in case species starts with uppercase so isn't caught above.
333 # alter common name if required
334 my $unconv = 0; # is it unconventional species name?
335 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified');
336 foreach (@unkn_genus) {
337 if ($genus && $genus =~ m/$_/i) {
338 $species = $genus . " " . $species;
339 undef $genus;
340 $unconv = 1;
341 last;
343 elsif ($species =~ m/$_/i) {
344 $unconv = 1;
345 last;
348 if (!$unconv && !$sub_species && $species =~ s/^(\w+)\s(\w+)$/$1/) {
349 # need to extract subspecies from conventional ORGANISM format.
350 # Will the 'word' in a two element species name
351 # e.g. $species = 'thummi thummi' => $species='thummi' &
352 # $sub_species='thummi'
353 $sub_species = $2;
356 $self->genus($genus) if $genus;
357 $self->sub_species($sub_species) if $sub_species;
360 $self->{_species} = $species;
362 return $self->{_species};
365 =head2 genus
367 Title : genus
368 Usage : $self->genus( $genus );
369 $genus = $self->genus();
370 Function: Get or set the scientific genus name.
371 Example : $self->genus('Homo');
372 Returns : Scientific genus name as string
373 Args : Scientific genus name as string
375 =cut
377 sub genus {
378 my ($self, $genus) = @_;
380 # TODO: instead of caching the raw name, cache the actual node instance.
381 if ($genus) {
382 $self->{_genus} = $genus;
384 unless (defined $self->{_genus}) {
385 my $genus_taxon = $self->tree->find_node(-rank => 'genus');
386 unless ($genus_taxon) {
387 # just assume our ancestor is rank genus
388 $genus_taxon = $self->taxon->ancestor;
391 $self->{_genus} = $genus_taxon->scientific_name if $genus_taxon;
394 return $self->{_genus};
397 =head2 sub_species
399 Title : sub_species
400 Usage : $obj->sub_species($newval)
401 Function: Get or set the scientific subspecies name.
402 Returns : value of sub_species
403 Args : newvalue (optional)
405 =cut
407 sub sub_species {
408 my ($self, $sub) = @_;
410 # TODO: instead of caching the raw name, cache the actual node instance.
411 if (!defined $self->{'_sub_species'}) {
412 my $ss_taxon = $self->tree->find_node(-rank => 'subspecies');
413 if ($ss_taxon) {
414 if ($sub) {
415 $ss_taxon->scientific_name($sub);
417 # *** weakening ref to our root node in species() to solve a
418 # memory leak means that we have a subspecies taxon to set
419 # during the first call to species(), but it has vanished by
420 # the time a user subsequently calls sub_species() to get the
421 # value. So we 'cheat' and just store the subspecies name in
422 # our self hash, instead of the tree. Is this a problem for
423 # a Species object? Can't decide --sendu
425 # This can now be changed to deal with this information on the
426 # fly. For now, the caching remains, but maybe we should just
427 # let these things deal with mutable data as needed? -- cjfields
429 $self->{'_sub_species'} = $sub;
431 return $ss_taxon->scientific_name;
433 else {
434 # should we create a node here to be added to the tree?
438 # fall back to direct storage on self
439 $self->{'_sub_species'} = $sub if $sub;
440 return $self->{'_sub_species'};
443 =head2 variant
445 Title : variant
446 Usage : $obj->variant($newval)
447 Function: Get/set variant information for this species object (strain,
448 isolate, etc).
449 Example :
450 Returns : value of variant (a scalar)
451 Args : new value (a scalar or undef, optional)
453 =cut
455 sub variant{
456 my ($self, $var) = @_;
458 # TODO: instead of caching the raw name, cache the actual node instance.
459 if (!defined $self->{'_variant'}) {
460 my $var_taxon = $self->tree->find_node(-rank => 'variant');
461 if ($var_taxon) {
462 if ($var) {
463 $var_taxon->scientific_name($var);
465 return $var_taxon->scientific_name;
467 else {
468 # should we create a node here to be added to the tree?
472 # fall back to direct storage on self
473 $self->{'_variant'} = $var if $var;
474 return $self->{'_variant'};
477 =head2 binomial
479 Title : binomial
480 Usage : $binomial = $self->binomial();
481 $binomial = $self->binomial('FULL');
482 Function: Returns a string "Genus species", or "Genus species subspecies",
483 if the first argument is 'FULL' (and the species has a subspecies).
484 Args : Optionally the string 'FULL' to get the full name including
485 the subspecies.
486 Note : This is just munged from the taxon() name
488 =cut
490 sub binomial {
491 my ($self, $full) = @_;
492 my $rank = $self->taxon->rank || 'no rank';
494 my ($species, $genus) = ($self->species, $self->genus);
495 unless (defined $species) {
496 $species = 'sp.';
497 $self->warn("requested binomial but classification was not set");
499 $genus = '' unless( defined $genus);
501 $species =~ s/$genus\s+//;
503 my $bi = "$genus $species";
504 if (defined($full) && $full =~ /full/i) {
505 my $ssp = $self->sub_species;
506 if ($ssp) {
507 $ssp =~ s/$bi\s+//;
508 $ssp =~ s/$species\s+//;
509 $bi .= " $ssp";
512 return $bi;
515 =head2 validate_species_name
517 Title : validate_species_name
518 Usage : $result = $self->validate_species_name($string);
519 Function: Validate the species portion of the binomial
520 Args : string
521 Notes : The string following the "genus name" in the NCBI binomial is so
522 variable that it's not clear that this is a useful function. Consider
523 the binomials "Simian 11 rotavirus (serotype 3 / strain
524 SA11-Patton)", or "St. Thomas 3 rotavirus", straight from GenBank.
525 This is particularly problematic in microbes and viruses. As such,
526 this isn't actually used automatically by any Bio::Species method.
528 =cut
530 sub validate_species_name {
531 my( $self, $string ) = @_;
533 return 1 if $string eq "sp.";
534 return 1 if $string =~ /strain/;
535 return 1 if $string =~ /^[a-z][\w\s-]+$/i;
536 $self->throw("Invalid species name '$string'");
539 sub validate_name {
540 return 1;
543 =head2 organelle
545 Title : organelle
546 Usage : $self->organelle( $organelle );
547 $organelle = $self->organelle();
548 Function: Get or set the organelle name
549 Example : $self->organelle('Chloroplast')
550 Returns : The organelle name in a string
551 Args : String, which is the organelle name
552 Note : TODO: We currently do not know where the organelle definition will
553 eventually go. This is stored in the source seqfeature, though,
554 so the information isn't lost.
556 =cut
558 sub organelle {
559 my($self) = shift;
560 return $self->{'_organelle'} = shift if @_;
561 return $self->{'_organelle'};
564 =head2 Delegation
566 The following methods delegate to the internal Bio::Taxon instance. This is
567 mainly to allow code continue using older methods, with the mind to migrate to
568 using Bio::Taxon and related methods when this class is deprecated.
570 =cut
572 sub node_name {shift->taxon->node_name(@_)}
573 sub scientific_name {shift->taxon->node_name(@_)}
575 sub id {shift->taxon->id(@_)}
576 sub object_id {shift->taxon->id(@_)}
577 sub ncbi_taxid {shift->taxon->ncbi_taxid(@_)}
578 sub rank {shift->taxon->rank(@_)}
579 sub division {shift->taxon->division(@_)}
581 sub common_names {shift->taxon->common_names(@_)}
582 sub common_name {shift->taxon->common_names(@_)}
584 sub genetic_code {shift->taxon->genetic_code(@_)}
585 sub mitochondrial_genetic_code {shift->taxon->mitochondrial_genetic_code(@_)}
587 sub create_date { shift->taxon->create_date(@_)}
588 sub pub_date { shift->taxon->pub_date(@_)}
589 sub update_date { shift->taxon->update_date(@_)}
591 sub db_handle { shift->taxon->db_handle(@_)}
593 sub parent_id { shift->taxon->parent_id(@_)}
594 sub parent_taxon_id { shift->taxon->parent_id(@_)}
596 sub version { shift->taxon->version(@_)}
597 sub authority { shift->taxon->authority(@_)}
598 sub namespace { shift->taxon->namespace(@_)}
600 sub ancestor { shift->taxon->ancestor(@_)}
601 sub get_Parent_Node { shift->taxon->get_Parent_Node(@_)}
602 sub each_Descendent { shift->taxon->each_Descendent(@_)}
603 sub get_Children_Nodes { shift->taxon->get_Children_Nodes(@_)}
604 sub remove_Descendant { shift->taxon->remove_Descendant(@_)}
606 sub name { shift->taxon->name(@_)}
608 =head2 taxon
610 Title : taxon
611 Usage : $obj->taxon
612 Function : retrieve the internal Bio::Taxon instance
613 Returns : A Bio::Taxon. If one is not previously set,
614 an instance is created lazily
615 Args : Bio::Taxon (optional)
617 =cut
619 sub taxon {
620 my ($self, $taxon) = @_;
621 if (!$self->{taxon} || $taxon) {
622 $taxon ||= Bio::Taxon->new();
623 $self->{taxon} = $taxon;
625 $self->{taxon};
628 =head2 tree
630 Title : tree
631 Usage : $obj->tree
632 Function : Returns a Bio::Tree::Tree object
633 Returns : A Bio::Tree::Tree. If one is not previously set,
634 an instance is created lazily
635 Args : Bio::Tree::Tree (optional)
637 =cut
639 sub tree {
640 my ($self, $tree) = @_;
641 if (!$self->{tree} || $tree) {
642 $tree ||= Bio::Tree::Tree->new();
643 delete $tree->{_root_cleanup_methods};
644 $self->{tree} = $tree;
646 $self->{tree};
649 sub DESTROY {
650 my $self = shift;
651 $self->tree->cleanup_tree;
652 delete $self->{tree};
653 $self->taxon->node_cleanup;