changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / DB / Taxonomy / list.pm
blobc83061542b467e53d98c1182538a50e267f03c65
2 # BioPerl module for Bio::DB::Taxonomy::list
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sendu Bala <bix@sendu.me.uk>
8 # Copyright Sendu Bala
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::DB::Taxonomy::list - An implementation of Bio::DB::Taxonomy
18 that accepts lists of words to build a database
20 =head1 SYNOPSIS
22 use Bio::DB::Taxonomy;
24 my $db = Bio::DB::Taxonomy->new( -source => 'list' );
26 my @ranks = ('superkingdom', 'class', 'genus', 'species');
27 my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
28 $db->add_lineage(-names => \@names, -ranks => \@ranks);
30 @names = ('Eukaryota', 'Mammalia', 'Mus', 'Mus musculus');
31 $db->add_lineage(-names => \@names, -ranks => \@ranks);
33 =head1 DESCRIPTION
35 This is an implementation which uses supplied lists of words to create a
36 database from which you can extract Bio::Taxon objects.
38 =head1 TODO
40 It is possible this module could do something like store the data it builds
41 up to disc. Would that be useful?
42 At any rate, this is why the module is called 'list' and not 'in_memory' or
43 similar.
45 =head1 FEEDBACK
47 =head2 Mailing Lists
49 User feedback is an integral part of the evolution of this and other
50 Bioperl modules. Send your comments and suggestions preferably to
51 the Bioperl mailing list. Your participation is much appreciated.
53 bioperl-l@bioperl.org - General discussion
54 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
56 =head2 Support
58 Please direct usage questions or support issues to the mailing list:
60 I<bioperl-l@bioperl.org>
62 rather than to the module maintainer directly. Many experienced and
63 reponsive experts will be able look at the problem and quickly
64 address it. Please include a thorough description of the problem
65 with code and data examples if at all possible.
67 =head2 Reporting Bugs
69 Report bugs to the Bioperl bug tracking system to help us keep track
70 of the bugs and their resolution. Bug reports can be submitted via
71 the web:
73 https://github.com/bioperl/bioperl-live/issues
75 =head1 AUTHOR - Sendu Bala
77 Email bix@sendu.me.uk
79 =head1 APPENDIX
81 The rest of the documentation details each of the object methods.
82 Internal methods are usually preceded with a _
84 =cut
86 # Let the code begin...
89 package Bio::DB::Taxonomy::list;
91 use strict;
92 use Bio::Taxon;
94 use base qw(Bio::DB::Taxonomy);
96 our $prefix = 'list';
99 =head2 new
101 Title : new
102 Usage : my $obj = Bio::DB::Taxonomy::list->new();
103 Function: Builds a new Bio::DB::Taxonomy::list object
104 Returns : an instance of Bio::DB::Taxonomy::list
105 Args : optional, as per the add_lineage() method.
107 =cut
109 sub new {
110 my ($class, @args) = @_;
111 my $self = $class->SUPER::new(@args);
112 my %args = @args;
113 delete $args{'-source'};
115 $self->add_lineage(%args) if %args;
117 return $self;
121 =head2 add_lineage
123 Title : add_lineage
124 Usage : my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
125 my @ranks = ('superkingdom', 'class', 'genus', 'species');
126 $db->add_lineage( -names => \@names, -ranks => \@ranks );
127 Function: Add a lineage to the database, where the lineage is described by
128 a list of scientific names in the order root->leaf. The rank of each
129 name can optionally be described by supplying an additional list
130 of rank names in the same order (eg. superkingdom->species).
131 Returns : 1 for success
132 Args : -names => [] : array ref of scientific names, REQUIRED
133 -ranks => [] : array ref of rank names, same order as above, OPTIONAL
135 =cut
137 sub add_lineage {
138 my ($self, @args) = @_;
139 my ($names, $ranks) = $self->_rearrange([qw (NAMES RANKS)], @args);
140 $self->throw("-names must be supplied and its value must be an array reference")
141 unless $names && ref($names) eq 'ARRAY';
143 my $names_idx = scalar @$names - 1;
145 if ($ranks) {
146 $self->throw("-ranks must be an array reference")
147 unless ref($ranks) eq 'ARRAY';
148 $self->throw("The -names and -ranks lists must be of equal length")
149 unless $names_idx == scalar @$ranks - 1;
152 # This is non-trivial because names are not guaranteed unique in a taxonomy,
153 # and neither are name&rank combinations. Furthermore, different name&rank
154 # combinations can actually refer to the same taxon, eg. when one time
155 # 'Homo'&'genus' is supplied, while another time 'Homo'&'no rank'.
157 # name&rank&ancestor could well be unique (or good enough 99.9999% of the
158 # time), but we have the added complication that lineages could sometimes be
159 # supplied with differing numbers of taxa. Ideally we want to realise that
160 # the first of these two lineages shares all its nodes with the second:
161 # ('Mammalia', 'Homo', 'Homo sapiens')
162 # ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
164 # Clearly with limited information we can't do a perfect job, but we can try
165 # and do a reasonable one. So, let's just do the trivial implementation now
166 # and see how bad it is! (assumes ranks are unique except for 'no rank')
168 my $ancestors = $self->{ancestors};
169 my $node_data = $self->{node_data};
170 my $name_to_id = $self->{name_to_id};
171 my $children = $self->{children};
173 my $my_ancestor_id = '';
174 my @node_ids;
175 for my $i (0 .. $names_idx) {
176 my $name = $names->[$i];
177 my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
179 # This is a new node with a new id if we haven't seen this name before.
180 # It's also always a new node if this is the first lineage going into
181 # the db.
183 # We need to handle, however, situations in the future where we try to
184 # merge in a new lineage but we have non-unique names in the lineage
185 # and possible missing classes in some lineages, e.g.
186 # '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
187 # merged with
188 # '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
189 # but still need the 'tree' to be correct
191 # Look for a node that is consistent with this lineage
192 my $node_id;
193 SAME_NAMED: for my $same_id (@{$name_to_id->{$name}}) {
195 # Taxa are the same if it they have the same ancestor or none
196 my $this_ancestor_id = $ancestors->{$same_id} || '';
197 if ($my_ancestor_id eq $this_ancestor_id) {
198 $node_id = $same_id;
199 last SAME_NAMED;
202 # Compare children
203 next if $i >= $names_idx; # this taxon has no child
204 my $my_child_name = $names->[$i + 1];
205 #while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
206 for my $this_child_id (keys %{$children->{$same_id}}) {
207 if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
208 if ($my_ancestor_id) {
209 my @s_ancestors;
210 while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
211 if ($my_ancestor_id eq $this_ancestor_id) {
212 $my_ancestor_id = $ancestors->{$same_id};
213 push @node_ids, @s_ancestors, $my_ancestor_id;
214 $node_id = $same_id;
215 last SAME_NAMED;
217 unshift @s_ancestors, $this_ancestor_id;
219 } else {
220 # This new lineage (@$names) doesn't start at the
221 # same root as the existing lineages. Assuming
222 # '$name' corresponds to node $same_id");
223 $node_id = $same_id;
224 last SAME_NAMED;
230 if (not defined $node_id) {
231 # This is a new node. Add it to the database, using the prefix 'list'
232 # for its ID to distinguish it from the IDs from other taxonomies.
233 my $next_num = ++$self->{node_ids};
234 $node_id = $prefix.$next_num;
235 push @{$self->{name_to_id}->{$name}}, $node_id;
236 $self->{node_data}->{$node_id}->[0] = $name;
239 if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
240 # Save rank if node in database has no rank but the current node has one
241 $self->{node_data}->{$node_id}->[1] = $rank;
244 if ($my_ancestor_id) {
245 if ($self->{ancestors}->{$node_id} && $self->{ancestors}->{$node_id} ne $my_ancestor_id) {
246 $self->throw("The lineage '".join(', ', @$names)."' and a ".
247 "previously stored lineage share a node name but have ".
248 "different ancestries for that node. Can't cope!");
250 $self->{ancestors}->{$node_id} = $my_ancestor_id;
253 $my_ancestor_id = $node_id;
254 push @node_ids, $node_id;
257 # Go through the lineage in reverse so we can remember the children
258 for (my $i = $names_idx - 1; $i >= 0; $i--) {
259 $self->{children}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
261 return 1;
265 =head2 Bio::DB::Taxonomy Interface implementation
267 =head2 get_num_taxa
269 Title : get_num_taxa
270 Usage : my $num = $db->get_num_taxa();
271 Function: Get the number of taxa stored in the database.
272 Returns : A number
273 Args : None
275 =cut
277 sub get_num_taxa {
278 my ($self) = @_;
279 return $self->{node_ids} || 0;
283 =head2 get_taxon
285 Title : get_taxon
286 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
287 Function: Get a Bio::Taxon object from the database.
288 Returns : Bio::Taxon object
289 Args : A single value which is the ID of the taxon to retrieve
290 OR named args, as follows:
291 -taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
292 'list' pre-fixed ids unique to the list database).
294 -name => String (to query by a taxonomy name). A given taxon name
295 can match different taxonomy objects. When that is the
296 case, a warning is displayed and the first matching taxon
297 is reported. See get_taxonids() to get all matching taxon
298 IDs.
300 -names => Array ref of lineage names, like in add_lineage(). To
301 overcome the limitations of -name, you can use -names to
302 provide the full lineage of the taxon you want and get a
303 unique, unambiguous taxon object.
305 =cut
307 sub get_taxon {
308 my ($self, @args) = @_;
310 my $taxonid;
311 if (scalar @args == 1) {
312 # Argument is a taxon ID
313 $taxonid = $args[0];
314 } else {
315 # Got named arguments
316 my ($name, $names);
317 ($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
318 if ($name) {
319 $names = [$name];
321 if ($names) {
322 $name = $names->[-1];
324 my @taxonids = $self->get_taxonids($name);
325 $taxonid = $taxonids[0];
327 # Use provided lineage to find correct ID amongst several matching IDs
328 if ( (scalar @taxonids > 1) && (scalar @$names > 1) ) {
329 for my $query_taxonid (@taxonids) {
330 my $matched = 1;
331 my $db_ancestor = $self->get_taxon($query_taxonid);
332 for (my $i = $#$names-1; $i >= 0; $i--) {
333 my $query_ancestor_name = $names->[$i];
334 $db_ancestor = $db_ancestor->ancestor;
335 my $db_ancestor_name = '';
336 if ($db_ancestor) {
337 $db_ancestor_name = $db_ancestor->node_name;
339 if (not ($query_ancestor_name eq $db_ancestor_name) ) {
340 $matched = 0;
341 last; # done testing this taxonid
344 if ($matched == 1) {
345 @taxonids = [$query_taxonid];
346 $taxonid = $query_taxonid;
347 last; # done testing all taxonids
352 # Warn if several taxon IDs matched
353 if (scalar @taxonids > 1) {
354 $self->warn("There were multiple ids (@taxonids) matching '$name',".
355 " using '$taxonid'") if scalar @taxonids > 1;
361 # Now that we have the taxon ID, retrieve the corresponding Taxon object
362 my $taxon;
363 my $node = $self->{node_data}->{$taxonid};
364 if ($node) {
365 my ($sci_name, $rank) = @$node;
366 $taxon = Bio::Taxon->new(
367 -name => $sci_name,
368 -object_id => $taxonid, # not an ncbi taxid, simply an object id
371 if ($rank) {
372 $taxon->rank($rank);
375 # we can't use -dbh or the db_handle() method ourselves or we'll go
376 # infinite on the merge attempt
377 $taxon->{'db_handle'} = $self;
379 $self->_handle_internal_id($taxon, 1);
382 return $taxon;
385 *get_Taxonomy_Node = \&get_taxon;
388 =head2 get_taxonids
390 Title : get_taxonids
391 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
392 Function: Searches for a taxonid (generated by the list module) based on a
393 query string. Note that multiple taxonids can match to the same
394 supplied name.
395 Returns : array of integer ids in list context, one of these in scalar context
396 Args : string representing taxon's name
398 =cut
400 sub get_taxonids {
401 my ($self, $name) = @_;
402 return wantarray() ? @{$self->{name_to_id}->{$name} || []} : $self->{name_to_id}->{$name}->[0];
405 *get_taxonid = \&get_taxonids;
408 =head2 ancestor
410 Title : ancestor
411 Usage : my $ancestor_taxon = $db->ancestor($taxon)
412 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
413 database.
414 Returns : Bio::Taxon
415 Args : Bio::Taxon (that was retrieved from this database)
417 =cut
419 sub ancestor {
420 my ($self, $taxon) = @_;
421 $taxon || return; # for bug 2092, or something similar to it at least: shouldn't need this!
422 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
423 $self->throw("The supplied Taxon must belong to this database")
424 unless $taxon->db_handle && $taxon->db_handle eq $self;
425 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
427 my $ancestor_id = $self->{ancestors}->{$id} || return;
428 return $self->get_taxon($ancestor_id);
432 =head2 each_Descendent
434 Title : each_Descendent
435 Usage : my @taxa = $db->each_Descendent($taxon);
436 Function: Get all the descendents of the supplied Taxon (but not their
437 descendents, ie. not a recursive fetchall).
438 Returns : Array of Bio::Taxon objects
439 Args : Bio::Taxon (that was retrieved from this database)
441 =cut
443 sub each_Descendent {
444 my ($self, $taxon) = @_;
445 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
446 $self->throw("The supplied Taxon must belong to this database")
447 unless $taxon->db_handle && $taxon->db_handle eq $self;
448 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
450 my @children;
451 while ( my ($child_id, undef) = each %{$self->{children}->{$id}} ) {
452 push @children, ($self->get_taxon($child_id) || next);
455 return @children;