* sync with trunk
[bioperl-live.git] / Bio / Species.pm
blobf6d020d63c13cc4f8a82b512ec00a8a4b5bbcae6
1 # $Id$
3 # BioPerl module for Bio::Species
5 # Cared for by James Gilbert <jgrg@sanger.ac.uk>
6 # Reimplemented by Sendu Bala <bix@sendu.me.uk>
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::Species - Generic species object
16 =head1 SYNOPSIS
18 $species = Bio::Species->new(-classification => [@classification]);
19 # Can also pass classification
20 # array to new as below
22 $species->classification(qw( sapiens Homo Hominidae
23 Catarrhini Primates Eutheria
24 Mammalia Vertebrata Chordata
25 Metazoa Eukaryota ));
27 $genus = $species->genus();
29 $bi = $species->binomial(); # $bi is now "Homo sapiens"
31 # For storing common name
32 $species->common_name("human");
34 # For storing subspecies
35 $species->sub_species("accountant");
37 =head1 DESCRIPTION
39 Provides a very simple object for storing phylogenetic
40 information. The classification is stored in an array,
41 which is a list of nodes in a phylogenetic tree. Access to
42 getting and setting species and genus is provided, but not
43 to any of the other node types (eg: "phylum", "class",
44 "order", "family"). There's plenty of scope for making the
45 model more sophisticated, if this is ever needed.
47 A methods are also provided for storing common
48 names, and subspecies.
50 =head1 FEEDBACK
52 =head2 Mailing Lists
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to
56 the Bioperl mailing list. Your participation is much appreciated.
58 bioperl-l@bioperl.org - General discussion
59 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61 =head2 Reporting Bugs
63 Report bugs to the Bioperl bug tracking system to help us keep track
64 of the bugs and their resolution. Bug reports can be submitted via the
65 web:
67 http://bugzilla.open-bio.org/
69 =head1 AUTHOR
71 James Gilbert email B<jgrg@sanger.ac.uk>
73 =head1 CONTRIBUTORS
75 Sendu Bala, bix@sendu.me.uk
77 =head1 APPENDIX
79 The rest of the documentation details each of the object
80 methods. Internal methods are usually preceded with a _
82 =cut
84 #' Let the code begin...
86 package Bio::Species;
87 use strict;
89 use Bio::DB::Taxonomy;
90 use Bio::Tree::Tree;
91 use Scalar::Util qw(weaken isweak);
92 use base qw(Bio::Taxon);
94 =head2 new
96 Title : new
97 Usage : my $obj = Bio::Species->new(-classification => \@class)
98 Function: Build a new Species object
99 Returns : Bio::Species object
100 Args : -ncbi_taxid => NCBI taxonomic ID (optional)
101 -classification => arrayref of classification
103 =cut
105 sub new {
106 my($class, @args) = @_;
108 my $self = $class->SUPER::new(@args);
110 my ($org, $sp, $var, $classification) =
111 $self->_rearrange([qw(ORGANELLE
112 SUB_SPECIES
113 VARIANT
114 CLASSIFICATION)], @args);
116 if (defined $classification && ref($classification) eq "ARRAY" && @{$classification}) {
117 $self->classification(@$classification);
119 else {
120 # store a tree on ourselves so we can use Tree methods
121 $self->{tree} = Bio::Tree::Tree->new();
123 # some things want to freeze/thaw Bio::Species objects, but
124 # _root_cleanup_methods contains a CODE ref, delete it.
125 # delete $self->{tree}->{_root_cleanup_methods};
128 defined $org && $self->organelle($org);
129 defined $sp && $self->sub_species($sp);
130 defined $var && $self->variant($var);
132 return $self;
135 =head2 classification
137 Title : classification
138 Usage : $self->classification(@class_array);
139 @classification = $self->classification();
140 Function: Get/set the lineage of this species. The array provided must be in
141 the order ... ---> SPECIES, GENUS ---> KINGDOM ---> etc.
142 Example : $obj->classification(qw( 'Homo sapiens' Homo Hominidae
143 Catarrhini Primates Eutheria Mammalia Vertebrata
144 Chordata Metazoa Eukaryota));
145 Returns : Classification array
146 Args : Classification array
148 A reference to the classification array. In the latter case
149 if there is a second argument and it evaluates to true,
150 names will not be validated. NB: in any case, names are never
151 validated anyway.
153 =cut
155 sub classification {
156 my ($self, @vals) = @_;
158 if (@vals) {
159 if (ref($vals[0]) eq 'ARRAY') {
160 @vals = @{$vals[0]};
163 # make sure the lineage contains us as first or second element
164 # (lineage may have subspecies, species, genus ...)
165 my $name = $self->node_name;
166 my ($genus, $species) = (quotemeta($vals[1]), quotemeta($vals[0]));
167 if ($name && ($name !~ m{$species}i && $name !~ m{$genus}i) && $name !~ m{$vals[1] $vals[0]}i) {
168 if ($name =~ /^$vals[1] $vals[0]\s*(.+)/) {
169 # just assume the problem is someone tried to make a Bio::Species starting at subspecies
170 #*** no idea if this is appropriate! just a possible fix related to bug 2092
171 $self->sub_species($1);
172 $name = $self->node_name("$vals[1] $vals[0]");
174 else {
175 $self->warn("The supplied lineage does not start near '$name' (I was supplied '".join(" | ", @vals)."')");
179 # create a lineage for ourselves
180 my $db = Bio::DB::Taxonomy->new(-source => 'list', -names => [reverse @vals]);
181 unless ($self->scientific_name) {
182 # assume we're supposed to be the leaf of the supplied lineage
183 $self->scientific_name($vals[0]);
185 unless ($self->rank) {
186 # and that we are rank species
187 $self->rank('species');
190 $self->db_handle($db);
192 $self->{tree} = Bio::Tree::Tree->new(-node => $self);
193 # some things want to freeze/thaw Bio::Species objects, but tree's
194 # _root_cleanup_methods contains a CODE ref, delete it.
195 #*** even if we don't delete the cleanup methods, we still get memory
196 # leak-like symtoms, and the actual cleanup causes a mass of
197 # warnings... needs investigation!
198 delete $self->{tree}->{_root_cleanup_methods};
201 @vals = ();
202 foreach my $node ($self->{tree}->get_lineage_nodes($self), $self) {
203 unshift(@vals, $node->scientific_name || next);
205 weaken($self->{tree}->{'_rootnode'}) unless isweak($self->{tree}->{'_rootnode'});
206 return @vals;
209 =head2 ncbi_taxid
211 Title : ncbi_taxid
212 Usage : $obj->ncbi_taxid($newval)
213 Function: Get/set the NCBI Taxon ID
214 Returns : the NCBI Taxon ID as a string
215 Args : newvalue to set or undef to unset (optional)
217 =cut
219 =head2 common_name
221 Title : common_name
222 Usage : $self->common_name( $common_name );
223 $common_name = $self->common_name();
224 Function: Get or set the common name of the species
225 Example : $self->common_name('human')
226 Returns : The common name in a string
227 Args : String, which is the common name (optional)
229 =cut
231 =head2 division
233 Title : division
234 Usage : $obj->division($newval)
235 Function: Genbank Division for a species
236 Returns : value of division (a scalar)
237 Args : value of division (a scalar)
239 =cut
241 =head2 species
243 Title : species
244 Usage : $self->species( $species );
245 $species = $self->species();
246 Function: Get or set the scientific species name.
247 Example : $self->species('Homo sapiens');
248 Returns : Scientific species name as string
249 Args : Scientific species name as string
251 =cut
253 sub species {
254 my ($self, $species) = @_;
256 if ($species) {
257 $self->{_species} = $species;
260 unless (defined $self->{_species}) {
261 # work it out from our nodes
262 my $species_taxon = $self->{tree}->find_node(-rank => 'species');
263 unless ($species_taxon) {
264 # just assume we are rank species
265 $species_taxon = $self;
268 $species = $species_taxon->scientific_name;
271 # munge it like the Bio::SeqIO modules used to do
272 # (more or less copy/pasted from old Bio::SeqIO::genbank, hence comments
273 # referring to 'ORGANISM' etc.)
276 my $root = $self->{tree}->get_root_node;
277 unless ($root) {
278 $self->{tree} = Bio::Tree::Tree->new(-node => $species_taxon);
279 delete $self->{tree}->{_root_cleanup_methods};
280 $root = $self->{tree}->get_root_node;
281 weaken($self->{tree}->{'_rootnode'}) unless isweak($self->{tree}->{'_rootnode'});
284 my @spflds = split(' ', $species);
285 if (@spflds > 1 && $root->node_name ne 'Viruses') {
286 $species = undef;
288 # does the next term start with uppercase?
289 # yes: valid genus; no then unconventional
290 # e.g. leaf litter basidiomycete sp. Collb2-39
291 my $genus;
292 if ($spflds[0] =~ m/^[A-Z]/) {
293 $genus = shift(@spflds);
295 else {
296 undef $genus;
299 my $sub_species;
300 if (@spflds) {
301 while (my $fld = shift @spflds) {
302 $species .= "$fld ";
303 # does it have subspecies or varieties?
304 last if ($fld =~ m/(sp\.|var\.)/);
306 chop $species; # last space
307 $sub_species = join ' ',@spflds if(@spflds);
309 else {
310 $species = 'sp.';
313 # does ORGANISM start with any words which make its genus undefined?
314 # these are in @unkn_genus
315 # this in case species starts with uppercase so isn't caught above.
316 # alter common name if required
317 my $unconv = 0; # is it unconventional species name?
318 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified');
319 foreach (@unkn_genus) {
320 if ($genus && $genus =~ m/$_/i) {
321 $species = $genus . " " . $species;
322 undef $genus;
323 $unconv = 1;
324 last;
326 elsif ($species =~ m/$_/i) {
327 $unconv = 1;
328 last;
331 if (!$unconv && !$sub_species && $species =~ s/^(\w+)\s(\w+)$/$1/) {
332 # need to extract subspecies from conventional ORGANISM format.
333 # Will the 'word' in a two element species name
334 # e.g. $species = 'thummi thummi' => $species='thummi' &
335 # $sub_species='thummi'
336 $sub_species = $2;
339 $self->genus($genus) if $genus;
340 $self->sub_species($sub_species) if $sub_species;
343 $self->{_species} = $species;
345 return $self->{_species};
348 =head2 genus
350 Title : genus
351 Usage : $self->genus( $genus );
352 $genus = $self->genus();
353 Function: Get or set the scientific genus name.
354 Example : $self->genus('Homo');
355 Returns : Scientific genus name as string
356 Args : Scientific genus name as string
358 =cut
360 sub genus {
361 my ($self, $genus) = @_;
363 if ($genus) {
364 $self->{_genus} = $genus;
367 unless (defined $self->{_genus}) {
368 my $genus_taxon = $self->{tree}->find_node(-rank => 'genus');
369 unless ($genus_taxon) {
370 # just assume our ancestor is rank genus
371 $genus_taxon = $self->ancestor;
374 $self->{_genus} = $genus_taxon->scientific_name if $genus_taxon;
377 return $self->{_genus};
380 =head2 sub_species
382 Title : sub_species
383 Usage : $obj->sub_species($newval)
384 Function: Get or set the scientific subspecies name.
385 Returns : value of sub_species
386 Args : newvalue (optional)
388 =cut
390 sub sub_species {
391 my ($self, $sub) = @_;
393 unless (defined $self->{'_sub_species'}) {
394 my $ss_taxon = $self->{tree}->find_node(-rank => 'subspecies');
395 if ($ss_taxon) {
396 if ($sub) {
397 $ss_taxon->scientific_name($sub);
399 # *** weakening ref to our root node in species() to solve a
400 # memory leak means that we have a subspecies taxon to set
401 # during the first call to species(), but it has vanished by
402 # the time a user subsequently calls sub_species() to get the
403 # value. So we 'cheat' and just store the subspecies name in
404 # our self hash, instead of the tree. Is this a problem for
405 # a Species object? Can't decide --sendu
406 $self->{'_sub_species'} = $sub;
408 return $ss_taxon->scientific_name;
412 # fall back to direct storage on self
413 $self->{'_sub_species'} = $sub if $sub;
414 return $self->{'_sub_species'};
417 =head2 variant
419 Title : variant
420 Usage : $obj->variant($newval)
421 Function: Get/set variant information for this species object (strain,
422 isolate, etc).
423 Example :
424 Returns : value of variant (a scalar)
425 Args : new value (a scalar or undef, optional)
427 =cut
429 sub variant{
430 my ($self, $var) = @_;
432 unless (defined $self->{'_variant'}) {
433 my $var_taxon = $self->{tree}->find_node(-rank => 'variant');
434 if ($var_taxon) {
435 if ($var) {
436 $var_taxon->scientific_name($var);
438 return $var_taxon->scientific_name;
442 # fall back to direct storage on self
443 $self->{'_variant'} = $var if $var;
444 return $self->{'_variant'};
447 =head2 binomial
449 Title : binomial
450 Usage : $binomial = $self->binomial();
451 $binomial = $self->binomial('FULL');
452 Function: Returns a string "Genus species", or "Genus species subspecies",
453 if the first argument is 'FULL' (and the species has a subspecies).
454 Args : Optionally the string 'FULL' to get the full name including
455 the subspecies.
457 =cut
459 sub binomial {
460 my ($self, $full) = @_;
461 my $rank = $self->rank || 'no rank';
463 my ($species, $genus) = ($self->species, $self->genus);
464 unless (defined $species) {
465 $species = 'sp.';
466 $self->warn("requested binomial but classification was not set");
468 $genus = '' unless( defined $genus);
470 $species =~ s/$genus\s+//;
472 my $bi = "$genus $species";
473 if (defined($full) && $full =~ /full/i) {
474 my $ssp = $self->sub_species;
475 if ($ssp) {
476 $ssp =~ s/$bi\s+//;
477 $ssp =~ s/$species\s+//;
478 $bi .= " $ssp";
481 return $bi;
484 =head2 validate_species_name
486 Title : validate_species_name
487 Usage : $result = $self->validate_species_name($string);
488 Function: Validate the species portion of the binomial
489 Args : string
490 Notes : The string following the "genus name" in the NCBI binomial
491 is so variable that it's not clear that this is a useful
492 function. Consider the binomials
493 "Simian 11 rotavirus (serotype 3 / strain SA11-Patton)",
494 or "St. Thomas 3 rotavirus", straight from GenBank.
495 This is particularly problematic in microbes and viruses.
496 As such, this isn't actually used automatically by any Bio::Species
497 method.
498 =cut
500 sub validate_species_name {
501 my( $self, $string ) = @_;
503 return 1 if $string eq "sp.";
504 return 1 if $string =~ /strain/;
505 return 1 if $string =~ /^[a-z][\w\s-]+$/i;
506 $self->throw("Invalid species name '$string'");
509 sub validate_name {
510 return 1;
513 =head2 organelle
515 Title : organelle
516 Usage : $self->organelle( $organelle );
517 $organelle = $self->organelle();
518 Function: Get or set the organelle name
519 Example : $self->organelle('Chloroplast')
520 Returns : The organelle name in a string
521 Args : String, which is the organelle name
523 =cut
525 sub organelle {
526 my($self) = shift;
527 return $self->{'_organelle'} = shift if @_;
528 return $self->{'_organelle'};
531 sub dont_DESTROY {
532 my $self = shift;
533 $self->{tree}->cleanup_tree if $self->{tree};
534 delete $self->{tree};
535 $self->node_cleanup;