Bio/DB/Taxonomy/list.pm

   1 #
   2 # BioPerl module for Bio::DB::Taxonomy::list
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Sendu Bala <bix@sendu.me.uk>
   7 #
   8 # Copyright Sendu Bala
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14
  15 =head1 NAME
  16
  17 Bio::DB::Taxonomy::list - An implementation of Bio::DB::Taxonomy
  18 that accepts lists of words to build a database
  19
  20 =head1 SYNOPSIS
  21
  22   use Bio::DB::Taxonomy;
  23
  24   my $db = Bio::DB::Taxonomy->new( -source => 'list' );
  25
  26   my @ranks = ('superkingdom', 'class', 'genus', 'species');
  27   my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
  28   $db->add_lineage(-names => \@names, -ranks => \@ranks);
  29
  30   @names = ('Eukaryota', 'Mammalia', 'Mus', 'Mus musculus');
  31   $db->add_lineage(-names => \@names, -ranks => \@ranks);
  32
  33 =head1 DESCRIPTION
  34
  35 This is an implementation which uses supplied lists of words to create a
  36 database from which you can extract Bio::Taxon objects.
  37
  38 =head1 TODO
  39
  40 It is possible this module could do something like store the data it builds
  41 up to disc. Would that be useful?
  42 At any rate, this is why the module is called 'list' and not 'in_memory' or
  43 similar.
  44
  45 =head1 FEEDBACK
  46
  47 =head2 Mailing Lists
  48
  49 User feedback is an integral part of the evolution of this and other
  50 Bioperl modules. Send your comments and suggestions preferably to
  51 the Bioperl mailing list.  Your participation is much appreciated.
  52
  53   bioperl-l@bioperl.org                  - General discussion
  54   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  55
  56 =head2 Support
  57
  58 Please direct usage questions or support issues to the mailing list:
  59
  60 I<bioperl-l@bioperl.org>
  61
  62 rather than to the module maintainer directly. Many experienced and
  63 reponsive experts will be able look at the problem and quickly
  64 address it. Please include a thorough description of the problem
  65 with code and data examples if at all possible.
  66
  67 =head2 Reporting Bugs
  68
  69 Report bugs to the Bioperl bug tracking system to help us keep track
  70 of the bugs and their resolution. Bug reports can be submitted via
  71 the web:
  72
  73   https://github.com/bioperl/bioperl-live/issues
  74
  75 =head1 AUTHOR - Sendu Bala
  76
  77 Email bix@sendu.me.uk
  78
  79 =head1 APPENDIX
  80
  81 The rest of the documentation details each of the object methods.
  82 Internal methods are usually preceded with a _
  83
  84 =cut
  85
  86 # Let the code begin...
  87
  88
  89 package Bio::DB::Taxonomy::list;
  90
  91 use strict;
  92 use Bio::Taxon;
  93
  94 use base qw(Bio::DB::Taxonomy);
  95
  96 our $prefix = 'list';
  97
  98
  99 =head2 new
 100
 101  Title   : new
 102  Usage   : my $obj = Bio::DB::Taxonomy::list->new();
 103  Function: Builds a new Bio::DB::Taxonomy::list object
 104  Returns : an instance of Bio::DB::Taxonomy::list
 105  Args    : optional, as per the add_lineage() method.
 106
 107 =cut
 108
 109 sub new {
 110     my ($class, @args) = @_;
 111     my $self = $class->SUPER::new(@args);
 112     my %args = @args;
 113     delete $args{'-source'};
 114
 115     $self->add_lineage(%args) if %args;
 116
 117     return $self;
 118 }
 119
 120
 121 =head2 add_lineage
 122
 123  Title   : add_lineage
 124  Usage   : my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
 125            my @ranks = ('superkingdom', 'class', 'genus', 'species');
 126            $db->add_lineage( -names => \@names, -ranks => \@ranks );
 127  Function: Add a lineage to the database, where the lineage is described by
 128            a list of scientific names in the order root->leaf. The rank of each
 129            name can optionally be described by supplying an additional list
 130            of rank names in the same order (eg. superkingdom->species).
 131  Returns : 1 for success
 132  Args    : -names => [] : array ref of scientific names, REQUIRED
 133            -ranks => [] : array ref of rank names, same order as above, OPTIONAL
 134
 135 =cut
 136
 137 sub add_lineage {
 138     my ($self, @args) = @_;
 139     my ($names, $ranks) = $self->_rearrange([qw (NAMES RANKS)], @args);
 140     $self->throw("-names must be supplied and its value must be an array reference")
 141         unless $names && ref($names) eq 'ARRAY';
 142
 143     my $names_idx = scalar @$names - 1;
 144
 145     if ($ranks) {
 146         $self->throw("-ranks must be an array reference")
 147             unless ref($ranks) eq 'ARRAY';
 148         $self->throw("The -names and -ranks lists must be of equal length")
 149             unless $names_idx == scalar @$ranks - 1;
 150     }
 151
 152     # This is non-trivial because names are not guaranteed unique in a taxonomy,
 153     # and neither are name&rank combinations. Furthermore, different name&rank
 154     # combinations can actually refer to the same taxon, eg. when one time
 155     # 'Homo'&'genus' is supplied, while another time 'Homo'&'no rank'.
 156     #
 157     # name&rank&ancestor could well be unique (or good enough 99.9999% of the
 158     # time), but we have the added complication that lineages could sometimes be
 159     # supplied with differing numbers of taxa. Ideally we want to realise that
 160     # the first of these two lineages shares all its nodes with the second:
 161     # ('Mammalia', 'Homo', 'Homo sapiens')
 162     # ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
 163     #
 164     # Clearly with limited information we can't do a perfect job, but we can try
 165     # and do a reasonable one. So, let's just do the trivial implementation now
 166     # and see how bad it is! (assumes ranks are unique except for 'no rank')
 167
 168     my $ancestors  = $self->{ancestors};
 169     my $node_data  = $self->{node_data};
 170     my $name_to_id = $self->{name_to_id};
 171     my $children   = $self->{children};
 172
 173     my $my_ancestor_id = '';
 174     my @node_ids;
 175     for my $i (0 .. $names_idx) {
 176         my $name = $names->[$i];
 177         my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
 178
 179         # This is a new node with a new id if we haven't seen this name before.
 180         # It's also always a new node if this is the first lineage going into
 181         # the db.
 182         #
 183         # We need to handle, however, situations in the future where we try to
 184         # merge in a new lineage but we have non-unique names in the lineage
 185         # and possible missing classes in some lineages, e.g.
 186         #    '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
 187         # merged with
 188         #    '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
 189         # but still need the 'tree' to be correct
 190
 191         # Look for a node that is consistent with this lineage
 192         my $node_id;
 193         SAME_NAMED: for my $same_id (@{$name_to_id->{$name}}) {
 194
 195             # Taxa are the same if it they have the same ancestor or none
 196             my $this_ancestor_id = $ancestors->{$same_id} || '';
 197             if ($my_ancestor_id eq $this_ancestor_id) {
 198                 $node_id = $same_id;
 199                 last SAME_NAMED;
 200             }
 201
 202             # Compare children
 203             next if $i >= $names_idx; # this taxon has no child
 204             my $my_child_name = $names->[$i + 1];
 205             #while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
 206             for my $this_child_id (keys %{$children->{$same_id}}) {
 207                 if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
 208                     if ($my_ancestor_id) {
 209                         my @s_ancestors;
 210                         while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
 211                             if ($my_ancestor_id eq $this_ancestor_id) {
 212                                 $my_ancestor_id = $ancestors->{$same_id};
 213                                 push @node_ids, @s_ancestors, $my_ancestor_id;
 214                                 $node_id = $same_id;
 215                                 last SAME_NAMED;
 216                             }
 217                             unshift @s_ancestors, $this_ancestor_id;
 218                         }
 219                     } else {
 220                         # This new lineage (@$names) doesn't start at the
 221                         # same root as the existing lineages. Assuming
 222                         # '$name' corresponds to node $same_id");
 223                         $node_id = $same_id;
 224                         last SAME_NAMED;
 225                     }
 226                 }
 227             }
 228         }
 229
 230         if (not defined $node_id) {
 231             # This is a new node. Add it to the database, using the prefix 'list'
 232             # for its ID to distinguish it from the IDs from other taxonomies.
 233             my $next_num = ++$self->{node_ids};
 234             $node_id = $prefix.$next_num;
 235             push @{$self->{name_to_id}->{$name}}, $node_id;
 236             $self->{node_data}->{$node_id}->[0] = $name;
 237         }
 238
 239         if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
 240             # Save rank if node in database has no rank but the current node has one
 241             $self->{node_data}->{$node_id}->[1] = $rank;
 242         }
 243
 244         if ($my_ancestor_id) {
 245             if ($self->{ancestors}->{$node_id} && $self->{ancestors}->{$node_id} ne $my_ancestor_id) {
 246                 $self->throw("The lineage '".join(', ', @$names)."' and a ".
 247                     "previously stored lineage share a node name but have ".
 248                     "different ancestries for that node. Can't cope!");
 249             }
 250             $self->{ancestors}->{$node_id} = $my_ancestor_id;
 251         }
 252
 253         $my_ancestor_id = $node_id;
 254         push @node_ids, $node_id;
 255     }
 256
 257     # Go through the lineage in reverse so we can remember the children
 258     for (my $i = $names_idx - 1; $i >= 0; $i--) {
 259         $self->{children}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
 260     }
 261     return 1;
 262 }
 263
 264
 265 =head2 Bio::DB::Taxonomy Interface implementation
 266
 267 =head2 get_num_taxa
 268
 269  Title   : get_num_taxa
 270  Usage   : my $num = $db->get_num_taxa();
 271  Function: Get the number of taxa stored in the database.
 272  Returns : A number
 273  Args    : None
 274
 275 =cut
 276
 277 sub get_num_taxa {
 278     my ($self) = @_;
 279     return $self->{node_ids} || 0;
 280 }
 281
 282
 283 =head2 get_taxon
 284
 285  Title   : get_taxon
 286  Usage   : my $taxon = $db->get_taxon(-taxonid => $taxonid)
 287  Function: Get a Bio::Taxon object from the database.
 288  Returns : Bio::Taxon object
 289  Args    : A single value which is the ID of the taxon to retrieve
 290              OR named args, as follows:
 291            -taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
 292                        'list' pre-fixed ids unique to the list database).
 293              OR
 294            -name    => String (to query by a taxonomy name). A given taxon name
 295                        can match different taxonomy objects. When that is the
 296                        case, a warning is displayed and the first matching taxon
 297                        is reported. See get_taxonids() to get all matching taxon
 298                        IDs.
 299              OR
 300            -names   => Array ref of lineage names, like in add_lineage(). To
 301                        overcome the limitations of -name, you can use -names to
 302                        provide the full lineage of the taxon you want and get a
 303                        unique, unambiguous taxon object.
 304
 305 =cut
 306
 307 sub get_taxon {
 308     my ($self, @args) = @_;
 309
 310     my $taxonid;
 311     if (scalar @args == 1) {
 312         # Argument is a taxon ID
 313         $taxonid = $args[0];
 314     } else {
 315         # Got named arguments
 316         my ($name, $names);
 317         ($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
 318         if ($name) {
 319             $names = [$name];
 320         }
 321         if ($names) {
 322             $name = $names->[-1];
 323
 324             my @taxonids = $self->get_taxonids($name);
 325             $taxonid = $taxonids[0];
 326
 327             # Use provided lineage to find correct ID amongst several matching IDs
 328             if ( (scalar @taxonids > 1) && (scalar @$names > 1) ) {
 329                 for my $query_taxonid (@taxonids) {
 330                     my $matched = 1;
 331                     my $db_ancestor = $self->get_taxon($query_taxonid);
 332                     for (my $i = $#$names-1; $i >= 0; $i--) {
 333                         my $query_ancestor_name = $names->[$i];
 334                         $db_ancestor = $db_ancestor->ancestor;
 335                         my $db_ancestor_name = '';
 336                         if ($db_ancestor) {
 337                             $db_ancestor_name = $db_ancestor->node_name;
 338                         }
 339                         if (not ($query_ancestor_name eq $db_ancestor_name) ) {
 340                             $matched = 0;
 341                             last; # done testing this taxonid
 342                         }
 343                     }
 344                     if ($matched == 1) {
 345                        @taxonids = [$query_taxonid];
 346                        $taxonid  =  $query_taxonid;
 347                        last; # done testing all taxonids
 348                     }
 349                 }
 350             }
 351
 352             # Warn if several taxon IDs matched
 353             if (scalar @taxonids > 1) {
 354                 $self->warn("There were multiple ids (@taxonids) matching '$name',".
 355                     " using '$taxonid'") if scalar @taxonids > 1;
 356             }
 357
 358         }
 359     }
 360
 361     # Now that we have the taxon ID, retrieve the corresponding Taxon object
 362     my $taxon;
 363     my $node = $self->{node_data}->{$taxonid};
 364     if ($node) {
 365         my ($sci_name, $rank) = @$node;
 366         $taxon = Bio::Taxon->new(
 367             -name      => $sci_name,
 368             -object_id => $taxonid, # not an ncbi taxid, simply an object id
 369         );
 370
 371         if ($rank) {
 372             $taxon->rank($rank);
 373         }
 374
 375         # we can't use -dbh or the db_handle() method ourselves or we'll go
 376         # infinite on the merge attempt
 377         $taxon->{'db_handle'} = $self;
 378
 379         $self->_handle_internal_id($taxon, 1);
 380     }
 381
 382     return $taxon;
 383 }
 384
 385 *get_Taxonomy_Node = \&get_taxon;
 386
 387
 388 =head2 get_taxonids
 389
 390  Title   : get_taxonids
 391  Usage   : my @taxonids = $db->get_taxonids('Homo sapiens');
 392  Function: Searches for a taxonid (generated by the list module) based on a
 393            query string. Note that multiple taxonids can match to the same
 394            supplied name.
 395  Returns : array of integer ids in list context, one of these in scalar context
 396  Args    : string representing taxon's name
 397
 398 =cut
 399
 400 sub get_taxonids {
 401     my ($self, $name) = @_;
 402     return wantarray() ? @{$self->{name_to_id}->{$name} || []} : $self->{name_to_id}->{$name}->[0];
 403 }
 404
 405 *get_taxonid = \&get_taxonids;
 406
 407
 408 =head2 ancestor
 409
 410  Title   : ancestor
 411  Usage   : my $ancestor_taxon = $db->ancestor($taxon)
 412  Function: Retrieve the full ancestor taxon of a supplied Taxon from the
 413            database.
 414  Returns : Bio::Taxon
 415  Args    : Bio::Taxon (that was retrieved from this database)
 416
 417 =cut
 418
 419 sub ancestor {
 420     my ($self, $taxon) = @_;
 421     $taxon || return; # for bug 2092, or something similar to it at least: shouldn't need this!
 422     $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
 423     $self->throw("The supplied Taxon must belong to this database")
 424         unless $taxon->db_handle && $taxon->db_handle eq $self;
 425     my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 426
 427     my $ancestor_id = $self->{ancestors}->{$id} || return;
 428     return $self->get_taxon($ancestor_id);
 429 }
 430
 431
 432 =head2 each_Descendent
 433
 434  Title   : each_Descendent
 435  Usage   : my @taxa = $db->each_Descendent($taxon);
 436  Function: Get all the descendents of the supplied Taxon (but not their
 437            descendents, ie. not a recursive fetchall).
 438  Returns : Array of Bio::Taxon objects
 439  Args    : Bio::Taxon (that was retrieved from this database)
 440
 441 =cut
 442
 443 sub each_Descendent {
 444     my ($self, $taxon) = @_;
 445     $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
 446     $self->throw("The supplied Taxon must belong to this database")
 447         unless $taxon->db_handle && $taxon->db_handle eq $self;
 448     my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 449
 450     my @children;
 451     while ( my ($child_id, undef) = each %{$self->{children}->{$id}} ) {
 452         push @children, ($self->get_taxon($child_id) || next);
 453     }
 454
 455     return @children;
 456 }
 457
 458
 459 1;