From 8a8b2ce9f0b0b09049ddc7a9e84115417cc8e748 Mon Sep 17 00:00:00 2001 From: Chris Fields Date: Sun, 1 Mar 2015 00:20:12 -0600 Subject: [PATCH] trust your DB implementation, particularly if ancestor data are already available --- Bio/DB/Taxonomy/sqlite.pm | 53 +++++++++++++++++++++++++++++++++------------ Bio/Taxon.pm | 28 ++++++++++++++++++++++-- t/LocalDB/Taxonomy/sqlite.t | 50 +++++++++++++++++++++--------------------- 3 files changed, 90 insertions(+), 41 deletions(-) diff --git a/Bio/DB/Taxonomy/sqlite.pm b/Bio/DB/Taxonomy/sqlite.pm index 417b9202c..57c9eb4a4 100644 --- a/Bio/DB/Taxonomy/sqlite.pm +++ b/Bio/DB/Taxonomy/sqlite.pm @@ -233,7 +233,15 @@ SQL my $taxon = Bio::Taxon->new( -name => $sci_name, -common_names => [@common_names], - -ncbi_taxid => $taxonid, + -ncbi_taxid => $taxonid, + + # TODO: + # Okay, this is a pretty goofy thing; we have the parent_id in hand + # but can't assign it b/c of semantics (one apparently must call + # ancestor() to get this, which seems roundabout if the information is + # already at hand) + + -parent_id => $parent_id, -rank => $rank, -division => $DIVISIONS[$divid]->[1], -genetic_code => $gen_code, @@ -290,7 +298,7 @@ SQL sub get_Children_Taxids { my ( $self, $node ) = @_; - $self->deprecated(); + $self->deprecated(); # ? #$self->warn( # "get_Children_Taxids is deprecated, use each_Descendent instead"); #my $id; @@ -333,15 +341,19 @@ sub ancestor { unless $taxon->db_handle && $taxon->db_handle eq $self; my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!"); - - my $node = $self->{'_nodes'}->[$id]; - if ( length($node) ) { - my ( undef, $parent_id ) = split( SEPARATOR, $node ); - $parent_id || return; - $parent_id eq $id && return; # one of the roots - return $self->get_taxon($parent_id); + + # TODO: + # Note here we explicitly set the parent ID, but use a separate method to + # check whether it is defined. Mixing back-end databases, even if from the + # same source, should still work (since a different backend wouldn't + # explicitly set the parent_id) + + if (defined $taxon->trusted_parent_id) { + return $self->get_taxon($taxon->parent_id); + } else { + # TODO: would there be any other option? + return; } - return; } =head2 each_Descendent @@ -360,13 +372,26 @@ sub each_Descendent { $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon'); $self->throw("The supplied Taxon must belong to this database") - unless $taxon->db_handle && $taxon->db_handle eq $self; + unless $taxon->db_handle && $taxon->db_handle eq $self; # yikes + my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!"); - - my @desc_ids = $self->{'_parentbtree'}->get_dup($id); + + #my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class ); + # single join or two calls? + + # probably not optimal, maybe set up as a cached statement with bindings? + my $desc_ids = $self->{dbh}->selectcol_arrayref(<throw($self->{dbh}->errstr); + SELECT tax.taxon_id + FROM taxon as tax + WHERE + tax.parent_id = $id +SQL + + return unless ref $desc_ids eq 'ARRAY'; + my @descs; - foreach my $desc_id (@desc_ids) { + foreach my $desc_id (@$desc_ids) { push( @descs, $self->get_taxon($desc_id) || next ); } return @descs; diff --git a/Bio/Taxon.pm b/Bio/Taxon.pm index 245ebaa20..a7e7ed6b7 100644 --- a/Bio/Taxon.pm +++ b/Bio/Taxon.pm @@ -226,6 +226,11 @@ sub new { defined $div && $self->division($div); defined $dbh && $self->db_handle($dbh); + # Making an administrative decision to override this behavior, particularly + # for optimization reasons (if it works to cache it up front, why not? + # Please trust your implementations to get it right) + + # Original note: # deprecated and will issue a warning when method called, # eventually to be removed completely as option defined $parent_id && $self->parent_id($parent_id); @@ -396,14 +401,16 @@ sub ncbi_taxid { parent_taxon_id() is a synonym of this method. Returns : value of parent_id (a scalar) Args : none - Status : deprecated =cut sub parent_id { my $self = shift; if (@_) { - $self->warn("You can no longer set the parent_id - use ancestor() instead"); + $self->{parent_id} = shift; + } + if (defined $self->{parent_id}) { + return $self->{parent_id} } my $ancestor = $self->ancestor() || return; return $ancestor->id; @@ -411,6 +418,19 @@ sub parent_id { *parent_taxon_id = \&parent_id; +=head2 trusted_parent_id + + Title : trusted_parent_id + Usage : $taxon->trusted_parent_id() + Function: If the parent_id is explicitly set, trust it + Returns : simple boolean value (whether or not it has been set) + Args : none + +=cut + +sub trusted_parent_id { + return defined $_[0]->{parent_id}; +} =head2 genetic_code @@ -526,6 +546,10 @@ sub ancestor { my $dbh = $self->db_handle; #*** could avoid the db lookup if we knew our current id was definitely # information from the db... + + # TODO: you must trust your implementation to get it right. + # If there is a parent_id set, trust it. If not, fall back to calling this + # method my $definitely_from_dbh = $self->_get_similar_taxon_from_db($self); return $dbh->ancestor($definitely_from_dbh); } diff --git a/t/LocalDB/Taxonomy/sqlite.t b/t/LocalDB/Taxonomy/sqlite.t index 974683f12..741698a0c 100644 --- a/t/LocalDB/Taxonomy/sqlite.t +++ b/t/LocalDB/Taxonomy/sqlite.t @@ -94,39 +94,39 @@ is $n->node_name, 'Homo sapiens'; is $n->scientific_name, $n->node_name; is ${$n->name('scientific')}[0], $n->node_name; -#my %common_names = map { $_ => 1 } $n->common_names; -#is keys %common_names, 3, ref($db).": common names"; -#ok exists $common_names{human}; -#ok exists $common_names{man}; -# -#is $n->division, 'Primates'; -#is $n->genetic_code, 1; -#is $n->mitochondrial_genetic_code, 2; -## these are entrez-only, data not available in dmp files +my %common_names = map { $_ => 1 } $n->common_names; +is keys %common_names, 3, ref($db).": common names"; +ok exists $common_names{human}; +ok exists $common_names{man}; + +is $n->division, 'Primates'; +is $n->genetic_code, 1; +is $n->mitochondrial_genetic_code, 2; +# these are entrez-only, data not available in dmp files #if ($db eq $db_entrez) { # ok defined $n->pub_date; # ok defined $n->create_date; # ok defined $n->update_date; #} -# -## briefly test some Bio::Tree::NodeI methods -#ok my $ancestor = $n->ancestor; -#is $ancestor->scientific_name, 'Homo'; -## unless set explicitly, Bio::Taxon doesn't return anything for -## each_Descendent; must ask the database directly -#ok my @children = $ancestor->db_handle->each_Descendent($ancestor); -#cmp_ok @children, '>', 0; -# + +# briefly test some Bio::Tree::NodeI methods +ok my $ancestor = $n->ancestor; +is $ancestor->scientific_name, 'Homo'; +# unless set explicitly, Bio::Taxon doesn't return anything for +# each_Descendent; must ask the database directly +ok my @children = $ancestor->db_handle->each_Descendent($ancestor); +cmp_ok @children, '>', 0; + #sleep(3) if $db eq $db_entrez; # ## do some trickier things... -#ok my $n2 = $db->get_Taxonomy_Node('89593'); -#is $n2->scientific_name, 'Craniata'; -# -## briefly check we can use some Tree methods -#my $tree = Bio::Tree::Tree->new(); +ok my $n2 = $db->get_Taxonomy_Node('89593'); +is $n2->scientific_name, 'Craniata'; + +# briefly check we can use some Tree methods +my $tree = Bio::Tree::Tree->new(); #is $tree->get_lca($n, $n2)->scientific_name, 'Craniata'; -# + ## get lineage_nodes #my @nodes = $tree->get_nodes; #is scalar(@nodes), 0; @@ -140,7 +140,7 @@ is ${$n->name('scientific')}[0], $n->node_name; #like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/); #like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/); #like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/); -# + ## can we actually form a Tree and use other Tree methods? #ok $tree = Bio::Tree::Tree->new(-node => $n); #cmp_ok($tree->number_nodes, '>', 20); -- 2.11.4.GIT