2 # BioPerl module for Bio::DB::Taxonomy::list
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sendu Bala <bix@sendu.me.uk>
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
17 Bio::DB::Taxonomy::list - An implementation of Bio::DB::Taxonomy
18 that accepts lists of words to build a database
22 use Bio::DB::Taxonomy;
24 my $db = Bio::DB::Taxonomy->new( -source => 'list' );
26 my @ranks = ('superkingdom', 'class', 'genus', 'species');
27 my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
28 $db->add_lineage(-names => \@names, -ranks => \@ranks);
30 @names = ('Eukaryota', 'Mammalia', 'Mus', 'Mus musculus');
31 $db->add_lineage(-names => \@names, -ranks => \@ranks);
35 This is an implementation which uses supplied lists of words to create a
36 database from which you can extract Bio::Taxon objects.
40 It is possible this module could do something like store the data it builds
41 up to disc. Would that be useful?
42 At any rate, this is why the module is called 'list' and not 'in_memory' or
49 User feedback is an integral part of the evolution of this and other
50 Bioperl modules. Send your comments and suggestions preferably to
51 the Bioperl mailing list. Your participation is much appreciated.
53 bioperl-l@bioperl.org - General discussion
54 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
58 Please direct usage questions or support issues to the mailing list:
60 I<bioperl-l@bioperl.org>
62 rather than to the module maintainer directly. Many experienced and
63 reponsive experts will be able look at the problem and quickly
64 address it. Please include a thorough description of the problem
65 with code and data examples if at all possible.
69 Report bugs to the Bioperl bug tracking system to help us keep track
70 of the bugs and their resolution. Bug reports can be submitted via
73 https://github.com/bioperl/bioperl-live/issues
75 =head1 AUTHOR - Sendu Bala
81 The rest of the documentation details each of the object methods.
82 Internal methods are usually preceded with a _
86 # Let the code begin...
89 package Bio
::DB
::Taxonomy
::list
;
94 use base
qw(Bio::DB::Taxonomy);
102 Usage : my $obj = Bio::DB::Taxonomy::list->new();
103 Function: Builds a new Bio::DB::Taxonomy::list object
104 Returns : an instance of Bio::DB::Taxonomy::list
105 Args : optional, as per the add_lineage() method.
110 my ($class, @args) = @_;
111 my $self = $class->SUPER::new
(@args);
113 delete $args{'-source'};
115 $self->add_lineage(%args) if %args;
124 Usage : my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
125 my @ranks = ('superkingdom', 'class', 'genus', 'species');
126 $db->add_lineage( -names => \@names, -ranks => \@ranks );
127 Function: Add a lineage to the database, where the lineage is described by
128 a list of scientific names in the order root->leaf. The rank of each
129 name can optionally be described by supplying an additional list
130 of rank names in the same order (eg. superkingdom->species).
131 Returns : 1 for success
132 Args : -names => [] : array ref of scientific names, REQUIRED
133 -ranks => [] : array ref of rank names, same order as above, OPTIONAL
138 my ($self, @args) = @_;
139 my ($names, $ranks) = $self->_rearrange([qw
(NAMES RANKS
)], @args);
140 $self->throw("-names must be supplied and its value must be an array reference")
141 unless $names && ref($names) eq 'ARRAY';
143 my $names_idx = scalar @
$names - 1;
146 $self->throw("-ranks must be an array reference")
147 unless ref($ranks) eq 'ARRAY';
148 $self->throw("The -names and -ranks lists must be of equal length")
149 unless $names_idx == scalar @
$ranks - 1;
152 # This is non-trivial because names are not guaranteed unique in a taxonomy,
153 # and neither are name&rank combinations. Furthermore, different name&rank
154 # combinations can actually refer to the same taxon, eg. when one time
155 # 'Homo'&'genus' is supplied, while another time 'Homo'&'no rank'.
157 # name&rank&ancestor could well be unique (or good enough 99.9999% of the
158 # time), but we have the added complication that lineages could sometimes be
159 # supplied with differing numbers of taxa. Ideally we want to realise that
160 # the first of these two lineages shares all its nodes with the second:
161 # ('Mammalia', 'Homo', 'Homo sapiens')
162 # ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
164 # Clearly with limited information we can't do a perfect job, but we can try
165 # and do a reasonable one. So, let's just do the trivial implementation now
166 # and see how bad it is! (assumes ranks are unique except for 'no rank')
168 my $ancestors = $self->{ancestors
};
169 my $node_data = $self->{node_data
};
170 my $name_to_id = $self->{name_to_id
};
171 my $children = $self->{children
};
173 my $my_ancestor_id = '';
175 for my $i (0 .. $names_idx) {
176 my $name = $names->[$i];
177 my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
179 # This is a new node with a new id if we haven't seen this name before.
180 # It's also always a new node if this is the first lineage going into
183 # We need to handle, however, situations in the future where we try to
184 # merge in a new lineage but we have non-unique names in the lineage
185 # and possible missing classes in some lineages, e.g.
186 # '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
188 # '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
189 # but still need the 'tree' to be correct
191 # Look for a node that is consistent with this lineage
193 SAME_NAMED
: for my $same_id (@
{$name_to_id->{$name}}) {
195 # Taxa are the same if it they have the same ancestor or none
196 my $this_ancestor_id = $ancestors->{$same_id} || '';
197 if ($my_ancestor_id eq $this_ancestor_id) {
203 next if $i >= $names_idx; # this taxon has no child
204 my $my_child_name = $names->[$i + 1];
205 #while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
206 for my $this_child_id (keys %{$children->{$same_id}}) {
207 if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
208 if ($my_ancestor_id) {
210 while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
211 if ($my_ancestor_id eq $this_ancestor_id) {
212 $my_ancestor_id = $ancestors->{$same_id};
213 push @node_ids, @s_ancestors, $my_ancestor_id;
217 unshift @s_ancestors, $this_ancestor_id;
220 # This new lineage (@$names) doesn't start at the
221 # same root as the existing lineages. Assuming
222 # '$name' corresponds to node $same_id");
230 if (not defined $node_id) {
231 # This is a new node. Add it to the database, using the prefix 'list'
232 # for its ID to distinguish it from the IDs from other taxonomies.
233 my $next_num = ++$self->{node_ids
};
234 $node_id = $prefix.$next_num;
235 push @
{$self->{name_to_id
}->{$name}}, $node_id;
236 $self->{node_data
}->{$node_id}->[0] = $name;
239 if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
240 # Save rank if node in database has no rank but the current node has one
241 $self->{node_data
}->{$node_id}->[1] = $rank;
244 if ($my_ancestor_id) {
245 if ($self->{ancestors
}->{$node_id} && $self->{ancestors
}->{$node_id} ne $my_ancestor_id) {
246 $self->throw("The lineage '".join(', ', @
$names)."' and a ".
247 "previously stored lineage share a node name but have ".
248 "different ancestries for that node. Can't cope!");
250 $self->{ancestors
}->{$node_id} = $my_ancestor_id;
253 $my_ancestor_id = $node_id;
254 push @node_ids, $node_id;
257 # Go through the lineage in reverse so we can remember the children
258 for (my $i = $names_idx - 1; $i >= 0; $i--) {
259 $self->{children
}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
265 =head2 Bio::DB::Taxonomy Interface implementation
270 Usage : my $num = $db->get_num_taxa();
271 Function: Get the number of taxa stored in the database.
279 return $self->{node_ids
} || 0;
286 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
287 Function: Get a Bio::Taxon object from the database.
288 Returns : Bio::Taxon object
289 Args : A single value which is the ID of the taxon to retrieve
290 OR named args, as follows:
291 -taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
292 'list' pre-fixed ids unique to the list database).
294 -name => String (to query by a taxonomy name). A given taxon name
295 can match different taxonomy objects. When that is the
296 case, a warning is displayed and the first matching taxon
297 is reported. See get_taxonids() to get all matching taxon
300 -names => Array ref of lineage names, like in add_lineage(). To
301 overcome the limitations of -name, you can use -names to
302 provide the full lineage of the taxon you want and get a
303 unique, unambiguous taxon object.
308 my ($self, @args) = @_;
311 if (scalar @args == 1) {
312 # Argument is a taxon ID
315 # Got named arguments
317 ($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
322 $name = $names->[-1];
324 my @taxonids = $self->get_taxonids($name);
325 $taxonid = $taxonids[0];
327 # Use provided lineage to find correct ID amongst several matching IDs
328 if ( (scalar @taxonids > 1) && (scalar @
$names > 1) ) {
329 for my $query_taxonid (@taxonids) {
331 my $db_ancestor = $self->get_taxon($query_taxonid);
332 for (my $i = $#$names-1; $i >= 0; $i--) {
333 my $query_ancestor_name = $names->[$i];
334 $db_ancestor = $db_ancestor->ancestor;
335 my $db_ancestor_name = '';
337 $db_ancestor_name = $db_ancestor->node_name;
339 if (not ($query_ancestor_name eq $db_ancestor_name) ) {
341 last; # done testing this taxonid
345 @taxonids = [$query_taxonid];
346 $taxonid = $query_taxonid;
347 last; # done testing all taxonids
352 # Warn if several taxon IDs matched
353 if (scalar @taxonids > 1) {
354 $self->warn("There were multiple ids (@taxonids) matching '$name',".
355 " using '$taxonid'") if scalar @taxonids > 1;
361 # Now that we have the taxon ID, retrieve the corresponding Taxon object
363 my $node = $self->{node_data
}->{$taxonid};
365 my ($sci_name, $rank) = @
$node;
366 $taxon = Bio
::Taxon
->new(
368 -object_id
=> $taxonid, # not an ncbi taxid, simply an object id
375 # we can't use -dbh or the db_handle() method ourselves or we'll go
376 # infinite on the merge attempt
377 $taxon->{'db_handle'} = $self;
379 $self->_handle_internal_id($taxon, 1);
385 *get_Taxonomy_Node
= \
&get_taxon
;
391 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
392 Function: Searches for a taxonid (generated by the list module) based on a
393 query string. Note that multiple taxonids can match to the same
395 Returns : array of integer ids in list context, one of these in scalar context
396 Args : string representing taxon's name
401 my ($self, $name) = @_;
402 return wantarray() ? @
{$self->{name_to_id
}->{$name} || []} : $self->{name_to_id
}->{$name}->[0];
405 *get_taxonid
= \
&get_taxonids
;
411 Usage : my $ancestor_taxon = $db->ancestor($taxon)
412 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
415 Args : Bio::Taxon (that was retrieved from this database)
420 my ($self, $taxon) = @_;
421 $taxon || return; # for bug 2092, or something similar to it at least: shouldn't need this!
422 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
423 $self->throw("The supplied Taxon must belong to this database")
424 unless $taxon->db_handle && $taxon->db_handle eq $self;
425 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
427 my $ancestor_id = $self->{ancestors
}->{$id} || return;
428 return $self->get_taxon($ancestor_id);
432 =head2 each_Descendent
434 Title : each_Descendent
435 Usage : my @taxa = $db->each_Descendent($taxon);
436 Function: Get all the descendents of the supplied Taxon (but not their
437 descendents, ie. not a recursive fetchall).
438 Returns : Array of Bio::Taxon objects
439 Args : Bio::Taxon (that was retrieved from this database)
443 sub each_Descendent
{
444 my ($self, $taxon) = @_;
445 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
446 $self->throw("The supplied Taxon must belong to this database")
447 unless $taxon->db_handle && $taxon->db_handle eq $self;
448 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
451 while ( my ($child_id, undef) = each %{$self->{children
}->{$id}} ) {
452 push @children, ($self->get_taxon($child_id) || next);