bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / Species.pm
blobe7512153976a75a85139b12f0ac2e5c8f625f2ef
1 # $Id$
3 # BioPerl module for Bio::Species
5 # Cared for by James Gilbert <jgrg@sanger.ac.uk>
6 # Reimplemented by Sendu Bala <bix@sendu.me.uk>
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::Species - Generic species object
16 =head1 SYNOPSIS
18 $species = Bio::Species->new(-classification => [@classification]);
19 # Can also pass classification
20 # array to new as below
22 $species->classification(qw( sapiens Homo Hominidae
23 Catarrhini Primates Eutheria
24 Mammalia Vertebrata Chordata
25 Metazoa Eukaryota ));
27 $genus = $species->genus();
29 $bi = $species->binomial(); # $bi is now "Homo sapiens"
31 # For storing common name
32 $species->common_name("human");
34 # For storing subspecies
35 $species->sub_species("accountant");
37 =head1 DESCRIPTION
39 Provides a very simple object for storing phylogenetic
40 information. The classification is stored in an array,
41 which is a list of nodes in a phylogenetic tree. Access to
42 getting and setting species and genus is provided, but not
43 to any of the other node types (eg: "phylum", "class",
44 "order", "family"). There's plenty of scope for making the
45 model more sophisticated, if this is ever needed.
47 A methods are also provided for storing common
48 names, and subspecies.
50 =head1 FEEDBACK
52 =head2 Mailing Lists
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to
56 the Bioperl mailing list. Your participation is much appreciated.
58 bioperl-l@bioperl.org - General discussion
59 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61 =head2 Reporting Bugs
63 Report bugs to the Bioperl bug tracking system to help us keep track
64 of the bugs and their resolution. Bug reports can be submitted via the
65 web:
67 http://bugzilla.open-bio.org/
69 =head1 AUTHOR
71 James Gilbert email B<jgrg@sanger.ac.uk>
73 =head1 CONTRIBUTORS
75 Sendu Bala, bix@sendu.me.uk
77 =head1 APPENDIX
79 The rest of the documentation details each of the object
80 methods. Internal methods are usually preceded with a _
82 =cut
84 #' Let the code begin...
86 package Bio::Species;
87 use strict;
89 use Bio::DB::Taxonomy;
90 use Bio::Tree::Tree;
91 use Scalar::Util qw(weaken isweak);
92 use base qw(Bio::Taxon);
94 =head2 new
96 Title : new
97 Usage : my $obj = Bio::Species->new(-classification => \@class)
98 Function: Build a new Species object
99 Returns : Bio::Species object
100 Args : -ncbi_taxid => NCBI taxonomic ID (optional)
101 -classification => arrayref of classification
103 =cut
105 sub new {
106 my($class, @args) = @_;
108 my $self = $class->SUPER::new(@args);
110 my ($org, $sp, $var, $classification) =
111 $self->_rearrange([qw(ORGANELLE
112 SUB_SPECIES
113 VARIANT
114 CLASSIFICATION)], @args);
116 if (defined $classification && ref($classification) eq "ARRAY" && @{$classification}) {
117 $self->classification(@$classification);
119 else {
120 # store a tree on ourselves so we can use Tree methods
121 $self->{tree} = Bio::Tree::Tree->new();
123 # some things want to freeze/thaw Bio::Species objects, but
124 # _root_cleanup_methods contains a CODE ref, delete it.
125 # delete $self->{tree}->{_root_cleanup_methods};
128 defined $org && $self->organelle($org);
129 defined $sp && $self->sub_species($sp);
130 defined $var && $self->variant($var);
132 return $self;
135 =head2 classification
137 Title : classification
138 Usage : $self->classification(@class_array);
139 @classification = $self->classification();
140 Function: Get/set the lineage of this species. The array provided must be in
141 the order ... ---> SPECIES, GENUS ---> KINGDOM ---> etc.
142 Example : $obj->classification(qw( 'Homo sapiens' Homo Hominidae
143 Catarrhini Primates Eutheria Mammalia Vertebrata
144 Chordata Metazoa Eukaryota));
145 Returns : Classification array
146 Args : Classification array
148 A reference to the classification array. In the latter case
149 if there is a second argument and it evaluates to true,
150 names will not be validated. NB: in any case, names are never
151 validated anyway.
153 =cut
155 sub classification {
156 my ($self, @vals) = @_;
158 if (@vals) {
159 if (ref($vals[0]) eq 'ARRAY') {
160 @vals = @{$vals[0]};
163 # make sure the lineage contains us as first or second element
164 # (lineage may have subspecies, species, genus ...)
165 my $name = $self->node_name;
166 my ($genus, $species) = (quotemeta($vals[1]), quotemeta($vals[0]));
167 if ($name && ($name !~ m{$species}i && $name !~ m{$genus}i) && $name !~ m{$vals[1] $vals[0]}i) {
168 if ($name =~ /^$vals[1] $vals[0]\s*(.+)/) {
169 # just assume the problem is someone tried to make a Bio::Species starting at subspecies
170 #*** no idea if this is appropriate! just a possible fix related to bug 2092
171 $self->sub_species($1);
172 $name = $self->node_name("$vals[1] $vals[0]");
174 else {
175 $self->warn("The supplied lineage does not start near '$name' (I was supplied '".join(" | ", @vals)."')");
179 # create a lineage for ourselves
180 my $db = Bio::DB::Taxonomy->new(-source => 'list', -names => [reverse @vals]);
181 unless ($self->scientific_name) {
182 # assume we're supposed to be the leaf of the supplied lineage
183 $self->scientific_name($vals[0]);
185 unless ($self->rank) {
186 # and that we are rank species
187 $self->rank('species');
190 $self->db_handle($db);
192 $self->{tree} = Bio::Tree::Tree->new(-node => $self);
193 # some things want to freeze/thaw Bio::Species objects, but tree's
194 # _root_cleanup_methods contains a CODE ref, delete it.
195 #*** even if we don't delete the cleanup methods, we still get memory
196 # leak-like symtoms, and the actual cleanup causes a mass of
197 # warnings... needs investigation!
198 delete $self->{tree}->{_root_cleanup_methods};
201 @vals = ();
202 foreach my $node ($self->{tree}->get_lineage_nodes($self), $self) {
203 unshift(@vals, $node->scientific_name || next);
205 weaken($self->{tree}->{'_rootnode'}) unless isweak($self->{tree}->{'_rootnode'});
206 return @vals;
209 =head2 ncbi_taxid
211 Title : ncbi_taxid
212 Usage : $obj->ncbi_taxid($newval)
213 Function: Get/set the NCBI Taxon ID
214 Returns : the NCBI Taxon ID as a string
215 Args : newvalue to set or undef to unset (optional)
217 =cut
219 =head2 common_name
221 Title : common_name
222 Usage : $self->common_name( $common_name );
223 $common_name = $self->common_name();
224 Function: Get or set the common name of the species
225 Example : $self->common_name('human')
226 Returns : The common name in a string
227 Args : String, which is the common name (optional)
229 =cut
231 =head2 division
233 Title : division
234 Usage : $obj->division($newval)
235 Function: Genbank Division for a species
236 Returns : value of division (a scalar)
237 Args : value of division (a scalar)
239 =cut
241 =head2 species
243 Title : species
244 Usage : $self->species( $species );
245 $species = $self->species();
246 Function: Get or set the scientific species name.
247 Example : $self->species('Homo sapiens');
248 Returns : Scientific species name as string
249 Args : Scientific species name as string
251 =cut
253 sub species {
254 my ($self, $species) = @_;
256 if ($species) {
257 $self->{_species} = $species;
260 unless (defined $self->{_species}) {
261 # work it out from our nodes
262 my $species_taxon = $self->{tree}->find_node(-rank => 'species');
263 unless ($species_taxon) {
264 # just assume we are rank species
265 $species_taxon = $self;
268 $species = $species_taxon->scientific_name;
271 # munge it like the Bio::SeqIO modules used to do
272 # (more or less copy/pasted from old Bio::SeqIO::genbank, hence comments
273 # referring to 'ORGANISM' etc.)
276 my $root = $self->{tree}->get_root_node;
277 unless ($root) {
278 $self->{tree} = Bio::Tree::Tree->new(-node => $species_taxon);
279 delete $self->{tree}->{_root_cleanup_methods};
280 $root = $self->{tree}->get_root_node;
283 my @spflds = split(' ', $species);
284 if (@spflds > 1 && $root->node_name ne 'Viruses') {
285 $species = undef;
287 # does the next term start with uppercase?
288 # yes: valid genus; no then unconventional
289 # e.g. leaf litter basidiomycete sp. Collb2-39
290 my $genus;
291 if ($spflds[0] =~ m/^[A-Z]/) {
292 $genus = shift(@spflds);
294 else {
295 undef $genus;
298 my $sub_species;
299 if (@spflds) {
300 while (my $fld = shift @spflds) {
301 $species .= "$fld ";
302 # does it have subspecies or varieties?
303 last if ($fld =~ m/(sp\.|var\.)/);
305 chop $species; # last space
306 $sub_species = join ' ',@spflds if(@spflds);
308 else {
309 $species = 'sp.';
312 # does ORGANISM start with any words which make its genus undefined?
313 # these are in @unkn_genus
314 # this in case species starts with uppercase so isn't caught above.
315 # alter common name if required
316 my $unconv = 0; # is it unconventional species name?
317 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified');
318 foreach (@unkn_genus) {
319 if ($genus && $genus =~ m/$_/i) {
320 $species = $genus . " " . $species;
321 undef $genus;
322 $unconv = 1;
323 last;
325 elsif ($species =~ m/$_/i) {
326 $unconv = 1;
327 last;
330 if (!$unconv && !$sub_species && $species =~ s/^(\w+)\s(\w+)$/$1/) {
331 # need to extract subspecies from conventional ORGANISM format.
332 # Will the 'word' in a two element species name
333 # e.g. $species = 'thummi thummi' => $species='thummi' &
334 # $sub_species='thummi'
335 $sub_species = $2;
338 $self->genus($genus) if $genus;
339 $self->sub_species($sub_species) if $sub_species;
342 $self->{_species} = $species;
344 return $self->{_species};
347 =head2 genus
349 Title : genus
350 Usage : $self->genus( $genus );
351 $genus = $self->genus();
352 Function: Get or set the scientific genus name.
353 Example : $self->genus('Homo');
354 Returns : Scientific genus name as string
355 Args : Scientific genus name as string
357 =cut
359 sub genus {
360 my ($self, $genus) = @_;
362 if ($genus) {
363 $self->{_genus} = $genus;
366 unless (defined $self->{_genus}) {
367 my $genus_taxon = $self->{tree}->find_node(-rank => 'genus');
368 unless ($genus_taxon) {
369 # just assume our ancestor is rank genus
370 $genus_taxon = $self->ancestor;
373 $self->{_genus} = $genus_taxon->scientific_name if $genus_taxon;
376 return $self->{_genus};
379 =head2 sub_species
381 Title : sub_species
382 Usage : $obj->sub_species($newval)
383 Function: Get or set the scientific subspecies name.
384 Returns : value of sub_species
385 Args : newvalue (optional)
387 =cut
389 sub sub_species {
390 my ($self, $sub) = @_;
392 unless (defined $self->{'_sub_species'}) {
393 my $ss_taxon = $self->{tree}->find_node(-rank => 'subspecies');
394 if ($ss_taxon) {
395 if ($sub) {
396 $ss_taxon->scientific_name($sub);
398 return $ss_taxon->scientific_name;
402 # fall back to direct storage on self
403 $self->{'_sub_species'} = $sub if $sub;
404 return $self->{'_sub_species'};
407 =head2 variant
409 Title : variant
410 Usage : $obj->variant($newval)
411 Function: Get/set variant information for this species object (strain,
412 isolate, etc).
413 Example :
414 Returns : value of variant (a scalar)
415 Args : new value (a scalar or undef, optional)
417 =cut
419 sub variant{
420 my ($self, $var) = @_;
422 unless (defined $self->{'_variant'}) {
423 my $var_taxon = $self->{tree}->find_node(-rank => 'variant');
424 if ($var_taxon) {
425 if ($var) {
426 $var_taxon->scientific_name($var);
428 return $var_taxon->scientific_name;
432 # fall back to direct storage on self
433 $self->{'_variant'} = $var if $var;
434 return $self->{'_variant'};
437 =head2 binomial
439 Title : binomial
440 Usage : $binomial = $self->binomial();
441 $binomial = $self->binomial('FULL');
442 Function: Returns a string "Genus species", or "Genus species subspecies",
443 if the first argument is 'FULL' (and the species has a subspecies).
444 Args : Optionally the string 'FULL' to get the full name including
445 the subspecies.
447 =cut
449 sub binomial {
450 my ($self, $full) = @_;
451 my $rank = $self->rank || 'no rank';
453 my ($species, $genus) = ($self->species, $self->genus);
454 unless (defined $species) {
455 $species = 'sp.';
456 $self->warn("requested binomial but classification was not set");
458 $genus = '' unless( defined $genus);
460 $species =~ s/$genus\s+//;
462 my $bi = "$genus $species";
463 if (defined($full) && $full =~ /full/i) {
464 my $ssp = $self->sub_species;
465 if ($ssp) {
466 $ssp =~ s/$bi\s+//;
467 $ssp =~ s/$species\s+//;
468 $bi .= " $ssp";
471 return $bi;
474 =head2 validate_species_name
476 Title : validate_species_name
477 Usage : $result = $self->validate_species_name($string);
478 Function: Validate the species portion of the binomial
479 Args : string
480 Notes : The string following the "genus name" in the NCBI binomial
481 is so variable that it's not clear that this is a useful
482 function. Consider the binomials
483 "Simian 11 rotavirus (serotype 3 / strain SA11-Patton)",
484 or "St. Thomas 3 rotavirus", straight from GenBank.
485 This is particularly problematic in microbes and viruses.
486 As such, this isn't actually used automatically by any Bio::Species
487 method.
488 =cut
490 sub validate_species_name {
491 my( $self, $string ) = @_;
493 return 1 if $string eq "sp.";
494 return 1 if $string =~ /strain/;
495 return 1 if $string =~ /^[a-z][\w\s-]+$/i;
496 $self->throw("Invalid species name '$string'");
499 sub validate_name {
500 return 1;
503 =head2 organelle
505 Title : organelle
506 Usage : $self->organelle( $organelle );
507 $organelle = $self->organelle();
508 Function: Get or set the organelle name
509 Example : $self->organelle('Chloroplast')
510 Returns : The organelle name in a string
511 Args : String, which is the organelle name
513 =cut
515 sub organelle {
516 my($self) = shift;
517 return $self->{'_organelle'} = shift if @_;
518 return $self->{'_organelle'};
521 sub dont_DESTROY {
522 my $self = shift;
523 $self->{tree}->cleanup_tree if $self->{tree};
524 delete $self->{tree};
525 $self->node_cleanup;