From e51a676b81e370776fcf2764a58be0b479ae9e63 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Carn=C3=AB=20Draug?= Date: Tue, 25 Sep 2018 18:39:17 +0100 Subject: [PATCH] Bio::DB::Taxonomy::sqlite: move into its own distribution This reduces dependency on the DBI module. Now we're only dependent on it because of 'bin/bp_classify_hits_kingdom' --- Changes | 1 + lib/Bio/DB/Taxonomy/sqlite.pm | 695 ------------------------------------------ t/LocalDB/Taxonomy/sqlite.t | 218 ------------- 3 files changed, 1 insertion(+), 913 deletions(-) delete mode 100644 lib/Bio/DB/Taxonomy/sqlite.pm delete mode 100644 t/LocalDB/Taxonomy/sqlite.t diff --git a/Changes b/Changes index ece460bc7..7acd206b0 100644 --- a/Changes +++ b/Changes @@ -65,6 +65,7 @@ be removed. Bio::DB::GFF::Typename Bio::DB::SeqFeature Bio::DB::SeqFeature::* + Bio::DB::Taxonomy::sqlite Bio::Index::Stockholm Bio::LiveSeq::* Bio::Phenotype::* diff --git a/lib/Bio/DB/Taxonomy/sqlite.pm b/lib/Bio/DB/Taxonomy/sqlite.pm deleted file mode 100644 index 9bd78b416..000000000 --- a/lib/Bio/DB/Taxonomy/sqlite.pm +++ /dev/null @@ -1,695 +0,0 @@ -# -# BioPerl module for Bio::DB::Taxonomy::flatfile -# -# Please direct questions and support issues to -# -# Cared for by Chris Fields -# -# Copyright Chris Fields -# -# You may distribute this module under the same terms as perl itself - -# POD documentation - main docs before the code - -=head1 NAME - -Bio::DB::Taxonomy::sqlite - SQLite-based implementation of Bio::DB::Taxonomy::flatfile - -=head1 SYNOPSIS - - use Bio::DB::Taxonomy; - - my $db = Bio::DB::Taxonomy->new(-source => 'sqlite', - -db => 'mytax.db' # default 'taxonomy.sqlite' - -nodesfile => 'nodes.dmp', - -namesfile => 'names.dmp'); - -=head1 DESCRIPTION - -This is an implementation of Bio::DB::Taxonomy which stores and accesses the -NCBI taxonomy using a simple SQLite3 database stored locally on disk. - -With this implementation, one can do the same basic searches as with the 'flatfile' -database. A test lookup of 1000 NCBI TaxIDs with full lineage information took -about 2 seconds on my older MacBook Pro laptop with an on-disk implementation. - -A few key differences: - -=over 4 - -=item * You can use typical SQL syntax to run a query search; for instance, if you want you can run: - - @ids = sort $db->get_taxonids('Chloroflexi%'); - -=item * In-memory database is allowed - - my $db = Bio::DB::Taxonomy->new(-source => 'sqlite', - -db => ':memory:', - -nodesfile => 'nodes.dmp', - -namesfile => 'names.dmp'); - -=back - -The required database files, nodes.dmp and names.dmp can be obtained from -ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz - -=head1 TODO - -=over 4 - -=item * Small optimizations, such as optimizing name lookups - -=item * Possibly use L to do lineage lookups - -=item * Clean up SQL (still kind of a mess right now) - -=item * Check compat. with other NCBI-specific L implementations - -=item * Plan out feasibility of allowing other backends (Neo4J, other DBI, etc) - -=item * Optionally calculate left/right ID values for TaxID nodes - -=back - -Beyond completing the implementation and optimization, this will -likely be rolled into a more flexible backend at some future point. - -=head1 FEEDBACK - -=head2 Mailing Lists - -User feedback is an integral part of the evolution of this and other -Bioperl modules. Send your comments and suggestions preferably to -the Bioperl mailing list. Your participation is much appreciated. - - bioperl-l@bioperl.org - General discussion - http://bioperl.org/wiki/Mailing_lists - About the mailing lists - -=head2 Support - -Please direct usage questions or support issues to the mailing list: - -I - -rather than to the module maintainer directly. Many experienced and -reponsive experts will be able look at the problem and quickly -address it. Please include a thorough description of the problem -with code and data examples if at all possible. - -=head2 Reporting Bugs - -Report bugs to the Bioperl bug tracking system to help us keep track -of the bugs and their resolution. Bug reports can be submitted via -the web: - - https://github.com/bioperl/bioperl-live/issues - -=head1 AUTHOR - Chris Fields - -Email cjfields-at-cpan-dot-org - -=head1 APPENDIX - -The rest of the documentation details each of the object methods. -Internal methods are usually preceded with a _ - -=cut - -# Let the code begin... - -package Bio::DB::Taxonomy::sqlite; - -use 5.010; -use strict; -use DB_File; -use Bio::Taxon; -use File::Spec::Functions; -use Data::Dumper; -use DBI; - -use constant SEPARATOR => ':'; - -our $DEFAULT_INDEX_DIR = $Bio::Root::IO::TEMPDIR; # /tmp -our $DEFAULT_CACHE_SIZE = 0; # /tmp -our $DEFAULT_DB_NAME = 'taxonomy.sqlite'; - -our @DIVISIONS = ( - [qw(BCT Bacteria)], - [qw(INV Invertebrates)], - [qw(MAM Mammals)], - [qw(PHG Phages)], - [qw(PLN Plants)], # (and fungi) - [qw(PRI Primates)], - [qw(ROD Rodents)], - [qw(SYN Synthetic)], - [qw(UNA Unassigned)], - [qw(VRL Viruses)], - [qw(VRT Vertebrates)], - [qw(ENV 'Environmental samples')] -); - -use base qw(Bio::DB::Taxonomy); - -=head2 new - - Title : new - Usage : my $obj = Bio::DB::Taxonomy::flatfile->new(); - Function: Builds a new Bio::DB::Taxonomy::flatfile object - Returns : an instance of Bio::DB::Taxonomy::flatfile - Args : -directory => name of directory where index files should be created - -nodesfile => name of file containing nodes (nodes.dmp from NCBI) - -namesfile => name of the file containing names(names.dmp from NCBI) - -force => 1 to replace current indexes even if they exist - -=cut - -# TODO: get rid of globals! -sub new { - my ( $class, @args ) = @_; - - my $self = $class->SUPER::new(@args); - - my ( $dir, $nodesfile, $namesfile, $db, $force, $cs ) = - $self->_rearrange( [qw(DIRECTORY NODESFILE NAMESFILE DB FORCE CACHE_SIZE)], @args ); - - $self->index_directory( $dir || $DEFAULT_INDEX_DIR ); - - $self->db_name( $db || $DEFAULT_DB_NAME ); - - $self->cache_size($cs // $DEFAULT_CACHE_SIZE); - - if ($nodesfile) { - $self->_build_index( $nodesfile, $namesfile, $force ); - } - - $self->_db_connect; - return $self; -} - -=head2 Bio::DB::Taxonomy interface implementation - -=head2 get_num_taxa - - Title : get_num_taxa - Usage : my $num = $db->get_num_taxa(); - Function: Get the number of taxa stored in the database. - Returns : A number - Args : None - -=cut - -sub get_num_taxa { - my ($self) = @_; - - my $ct = $self->_dbh_fetch(<get_taxon(-taxonid => $taxonid) - Function: Get a Bio::Taxon object from the database. - Returns : Bio::Taxon object - Args : just a single value which is the database id, OR named args: - -taxonid => taxonomy id (to query by taxonid) - OR - -name => string (to query by a taxonomy name: common name, - scientific name, etc) - -=cut - -sub get_taxon { - my ($self) = shift; - my ( $taxonid, $name ); - - if ( @_ > 1 ) { - ( $taxonid, $name ) = $self->_rearrange( [qw(TAXONID NAME)], @_ ); - if ($name) { - ( $taxonid, my @others ) = $self->get_taxonids($name); - $self->warn( -"There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'" - ) if @others > 0; - } - } - else { - $taxonid = shift; - } - - return unless $taxonid; - - $taxonid =~ /^\d+$/ || $self->throw("TaxID must be integer, got [$taxonid]"); - - my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class ); - # single join or two calls? - my $sth = $self->_prepare_cached(<bind_columns(\$parent_id, \$rank, \$code, \$divid, \$gen_code, \$mito, \$nm, \$uniq, \$class); - - $sth->execute($taxonid) or $self->throw($sth->errstr); - - my ($sci_name, @common_names); - - while ($sth->fetch) { - if ($class eq 'scientific name') { - $sci_name = $nm; - } else { - push @common_names, $nm; - } - } - - my $taxon = Bio::Taxon->new( - -name => $sci_name, - -common_names => [@common_names], - -ncbi_taxid => $taxonid, - -parent_id => $parent_id, - -rank => $rank, - -division => $DIVISIONS[$divid]->[1], - -genetic_code => $gen_code, - -mito_genetic_code => $mito - ); - - # we can't use -dbh or the db_handle() method ourselves or we'll go - # infinite on the merge attempt - $taxon->{'db_handle'} = $self; - - $self->_handle_internal_id($taxon); - - return $taxon; -} - -*get_Taxonomy_Node = \&get_taxon; - -=head2 get_taxonids - - Title : get_taxonids - Usage : my @taxonids = $db->get_taxonids('Homo sapiens'); - Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query - string. Note that multiple taxonids can match to the same supplied - name. - Returns : array of integer ids in list context, one of these in scalar context - Args : string representing taxon's name - -=cut - -sub get_taxonids { - my ( $self, $query ) = @_; - - # TODO: note we're not cleaning the query here, so you could technically - # have a fuzzy match (or Bobby Tables someone) - - # TODO: OR'd match seems poor optimally - my $taxids = $self->{dbh}->selectcol_arrayref(<get_Children_Taxids - Function: Get the ids of the children of a node in the taxonomy - Returns : Array of Ids - Args : Bio::Taxon or a taxon_id - Status : deprecated (use each_Descendent()) - -=cut - -sub get_Children_Taxids { - my ( $self, $node ) = @_; - $self->deprecated(); # ? - #$self->warn( - # "get_Children_Taxids is deprecated, use each_Descendent instead"); - #my $id; - #if ( ref($node) ) { - # if ( $node->can('object_id') ) { - # $id = $node->object_id; - # } - # elsif ( $node->can('ncbi_taxid') ) { - # $id = $node->ncbi_taxid; - # } - # else { - # $self->warn( - # "Don't know how to extract a taxon id from the object of type " - # . ref($node) - # . "\n" ); - # return; - # } - #} - #else { $id = $node } - #my @vals = $self->{'_parentbtree'}->get_dup($id); - #return @vals; -} - -=head2 ancestor - - Title : ancestor - Usage : my $ancestor_taxon = $db->ancestor($taxon) - Function: Retrieve the full ancestor taxon of a supplied Taxon from the - database. - Returns : Bio::Taxon - Args : Bio::Taxon (that was retrieved from this database) - -=cut - -sub ancestor { - my ( $self, $taxon ) = @_; - $self->throw("Must supply a Bio::Taxon") - unless ref($taxon) && $taxon->isa('Bio::Taxon'); - $self->throw("The supplied Taxon must belong to this database") - unless $taxon->db_handle && $taxon->db_handle eq $self; - my $id = - $taxon->id || $self->throw("The supplied Taxon is missing its id!"); - - # TODO: - # Note here we explicitly set the parent ID, but use a separate method to - # check whether it is defined. Mixing back-end databases, even if from the - # same source, should still work (since a different backend wouldn't - # explicitly set the parent_id) - - if ($taxon->trusted_parent_id) { - # this is the failsafe when we hit the root node - if ($taxon->parent_id eq $id) { - return; - } - return $self->get_taxon(-taxonid => $taxon->parent_id); - } else { - # TODO: would there be any other option? - return; - } -} - -# TODO: this may act as a drop-in for a recursive CTE lookup - -#=head2 ancestors -# -# Title : ancestors -# Usage : my @ancestor_taxa = $db->ancestors($taxon) -# Function: Retrieve the full ancestor taxon of a supplied Taxon from the -# database. -# Returns : List of Bio::Taxon -# Args : Bio::Taxon (that was retrieved from this database) -# -#=cut - -#sub ancestors { ... } - -=head2 each_Descendent - - Title : each_Descendent - Usage : my @taxa = $db->each_Descendent($taxon); - Function: Get all the descendents of the supplied Taxon (but not their - descendents, ie. not a recursive fetchall). - Returns : Array of Bio::Taxon objects - Args : Bio::Taxon (that was retrieved from this database) - -=cut - -sub each_Descendent { - my ( $self, $taxon ) = @_; - $self->throw("Must supply a Bio::Taxon") - unless ref($taxon) && $taxon->isa('Bio::Taxon'); - $self->throw("The supplied Taxon must belong to this database") - unless $taxon->db_handle && $taxon->db_handle eq $self; # yikes - - my $id = - $taxon->id || $self->throw("The supplied Taxon is missing its id!"); - - #my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class ); - # single join or two calls? - - # probably not optimal, maybe set up as a cached statement with bindings? - my $desc_ids = $self->{dbh}->selectcol_arrayref(<throw($self->{dbh}->errstr); - SELECT tax.taxon_id - FROM taxon as tax - WHERE - tax.parent_id = $id -SQL - - return unless ref $desc_ids eq 'ARRAY'; - - my @descs; - foreach my $desc_id (@$desc_ids) { - push( @descs, $self->get_taxon($desc_id) || next ); - } - return @descs; -} - -=head2 Helper methods - -=cut - -=head2 index_directory - - Title : index_directory - Function : Get/set the location that index files are stored. (this module - will index the supplied database) - Usage : $obj->index_directory($newval) - Returns : value of index_directory (a scalar) - Args : on set, new value (a scalar or undef, optional) - Note : kept for backwards compatibility with older DB_File implementation - -=cut - -sub index_directory { - my $self = shift; - return $self->{'index_directory'} = shift if @_; - return $self->{'index_directory'}; -} - -=head2 db_name - - Title : db_name - Function : Get/set the name of the SQLite3 database where data is stored - Usage : $obj->db_name($newval) - Returns : value of db_name (a scalar) - Args : on set, new value (a scalar or undef, optional) - -=cut - -# TODO: this may need some disambiguation w/ index_directory above; for now we -# assume this doesn't have a full path name (though I see no reason why this -# shouldn't allow that) - -sub db_name { - my $self = shift; - return $self->{'db_name'} = shift if @_; - return $self->{'db_name'}; -} - -=head2 cache_size - - Title : cache_size - Function : Get/set the cachesize used for loading the SQLite3 database - Usage : $obj->cache_size($newval) - Returns : value of cache_size (a scalar) - Args : on set, new value (a scalar or undef, optional) - Note : we do no checking on whether this value is an integer (SQLite does this for use) - -=cut - -sub cache_size { - my $self = shift; - return $self->{'cache_size'} = shift if defined($_[0]); - return $self->{'cache_size'}; -} - -# internal method which does the indexing -sub _build_index { - my ( $self, $nodesfile, $namesfile, $force ) = @_; - - # TODO: need to disambiguate using index_directory here since we only have - # one file. Mayeb ignore it in favor of having full path for db_name? - my ($dir, $db_name) = ($self->index_directory, $self->db_name); - if (! -e $db_name || $force) { - - # TODO: we're ignoring index_directory for now, may add support for this - # down the way - my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!; - - $self->debug("Running SQLite version:".$dbh->{sqlite_version}."\n"); - - #$dbh->do('PRAGMA synchronous = 0'); # Non transaction safe!!! - - if ($self->cache_size) { - my $cs = $self->cache_size; - $self->debug("Setting cache size $cs\n"); - $dbh->do("PRAGMA cache_size = $cs") - } - - $self->debug("Loading taxon table data\n"); - $self->_init_db($dbh); - open my $NODES, '<', $nodesfile - or $self->throw("Could not read node file '$nodesfile': $!"); - - # TODO: this has the really unnecessary 'OR IGNORE' option added, - # apparently b.c the test data expects to handle cases where the TaxID - # is repeated in this table (which should never happen in this table). I - # will likely change this to throw under those circumstances - - my $sth = $dbh->prepare_cached(<do("BEGIN"); - while (<$NODES>) { - next if /^\s*$/; - chomp; - my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_); - next if $taxid == 1; - if ($parent == 1) { - $parent = undef; - } - - $sth->execute($taxid, $parent, $rank, $code, $divid, $gen_code, $mito) or die $sth->errstr.": TaxID $taxid"; - } - $dbh->do("COMMIT") or $self->throw($dbh->errstr); - - close $NODES; - - $self->debug("Loading name table data\n"); - open my $NAMES, '<', $namesfile - or $self->throw("Could not read names file '$namesfile': $!"); - - $sth = $dbh->prepare_cached(<throw($dbh->errstr); - INSERT INTO names (taxon_id, name, uniq_name, class) VALUES (?,?,?,?) -SQL - $dbh->do("BEGIN"); - while (<$NAMES>) { - next if /^$/; - chomp; - my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_); - - # don't include the fake root node 'root' or 'all' with id 1 - next if $taxid == 1; - - $class =~ s/\s+\|\s*$//; - - #if ($name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation - # $name =~ s/\s+\(class\)$//; - #} - - $sth->execute($taxid, $name, $unique_name, $class) or $self->throw($sth->errstr); - } - close $NAMES; - - $dbh->do("COMMIT"); - - $self->debug("Creating taxon index\n"); - $dbh->do("CREATE INDEX parent_idx ON taxon (parent_id)") or $self->throw($dbh->errstr); - $self->debug("Creating name index\n"); - $dbh->do("CREATE INDEX name_idx ON names (name)") or $self->throw($dbh->errstr); - $self->debug("Creating taxon name table index\n"); - $dbh->do("CREATE INDEX taxon_name_idx ON names (taxon_id)") or $self->throw($dbh->errstr); - - $dbh->do("PRAGMA foreign_keys = ON"); - - #$dbh->do('PRAGMA synchronous = 1'); - $self->{dbh} = $dbh; - $self->{'_initialized'} = 1; - } - 1; -} - -# connect the internal db handle -sub _db_connect { - my $self = shift; - return if $self->{'_initialized'}; - - my ($dir, $db_name) = ($self->index_directory, $self->db_name); - - # TODO: we're ignoring index_directory for now, may add support for this - # down the way - my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!; - $dbh->do("PRAGMA foreign_keys = ON"); - if ($self->cache_size) { - my $cs = $self->cache_size; - $self->debug("Setting cache size $cs\n"); - $dbh->do("PRAGMA cache_size = $cs") - } - $self->{dbh} = $dbh; - - $self->{'_initialized'} = 1; -} - -sub _init_db { - my ($self, $dbh) = @_; - my $schema = $self->taxon_schema(); - # TODO: set up handler parameters here - for my $table (sort keys %$schema) { - $dbh->do("DROP TABLE IF EXISTS $table") or $self->throw($dbh->errstr); - $dbh->do("CREATE TABLE $table ".$schema->{$table}) or $self->throw($dbh->errstr); - } - 1; -} - -sub _dbh_fetch { - my ($self, $sql) = @_; - # TODO: more sanity checks - my $rows = $self->{dbh}->selectrow_arrayref($sql) or $self->throw( $self->{dbh}->errstr ); - return $rows; -} - -sub _prepare_cached { - my ($self, $sql) = @_; - # TODO: more sanity checks - my $sth = $self->{dbh}->prepare_cached($sql) or $self->throw( $self->{dbh}->errstr ); - $sth; -} - - -# TODO: check data size, this is a ballpark estimate (could be reduced) -sub taxon_schema { - my $self = shift; - return { - taxon => < <{dbh}; -} - -1; diff --git a/t/LocalDB/Taxonomy/sqlite.t b/t/LocalDB/Taxonomy/sqlite.t deleted file mode 100644 index 49a1884d4..000000000 --- a/t/LocalDB/Taxonomy/sqlite.t +++ /dev/null @@ -1,218 +0,0 @@ -# -*-Perl-*- Test Harness script for Bioperl -# $Id$ - -use strict; - -BEGIN { - use lib '.'; - use Bio::Root::Test; - - test_begin( - - -requires_modules => [qw( 5.010 DB_File DBI DBD::SQLite )] - ); - - use_ok('Bio::DB::Taxonomy'); - use_ok('Bio::Tree::Tree'); -} - -my $temp_dir = test_output_dir(); - -# TODO: run basic tests making sure that a database is not regenerated if -# present or unless forced - -ok my $db_flatfile = Bio::DB::Taxonomy->new( - -source => 'sqlite', - -nodesfile => test_input_file('taxdump', 'nodes.dmp'), - -namesfile => test_input_file('taxdump', 'names.dmp'), -); -isa_ok $db_flatfile, 'Bio::DB::Taxonomy::sqlite'; -isa_ok $db_flatfile, 'Bio::DB::Taxonomy'; - -ok my $db = Bio::DB::Taxonomy->new( - -source => 'sqlite', - -directory => $temp_dir, - -nodesfile => test_input_file('taxdump', 'nodes.dmp'), - -namesfile => test_input_file('taxdump', 'names.dmp'), - -force => 1, -); - -my $id; - -# taxid data in the nodes.dmp file should be unique, we ignore repeated values -# if seen - -is $db->get_num_taxa, 188; - -lives_ok {$id = $db->get_taxonid('Homo sapiens')}; - -is $id, 9606; - -## easy test on human, try out the main Taxon methods -my $n; -ok $n = $db->get_taxon(9606); -is $n->id, 9606; -is $n->object_id, $n->id; -is $n->ncbi_taxid, $n->id; -is $n->parent_id, 9605; -is $n->rank, 'species'; - -is $n->node_name, 'Homo sapiens'; -is $n->scientific_name, $n->node_name; -is ${$n->name('scientific')}[0], $n->node_name; - -my %common_names = map { $_ => 1 } $n->common_names; -is keys %common_names, 3, ref($db).": common names"; -ok exists $common_names{human}; -ok exists $common_names{man}; - -is $n->division, 'Primates'; -is $n->genetic_code, 1; -is $n->mitochondrial_genetic_code, 2; - -# these are entrez-only, data not available in dmp files -#if ($db eq $db_entrez) { -# ok defined $n->pub_date; -# ok defined $n->create_date; -# ok defined $n->update_date; -#} - -# briefly test some Bio::Tree::NodeI methods -ok my $ancestor = $n->ancestor; -is $ancestor->scientific_name, 'Homo'; -# unless set explicitly, Bio::Taxon doesn't return anything for -# each_Descendent; must ask the database directly -ok my @children = $ancestor->db_handle->each_Descendent($ancestor); -is @children, 1; - -#sleep(3) if $db eq $db_entrez; -# -## do some trickier things... -ok my $n2 = $db->get_Taxonomy_Node('89593'); -is $n2->scientific_name, 'Craniata'; - -# briefly check we can use some Tree methods -my $tree = Bio::Tree::Tree->new(); -is $tree->get_lca($n, $n2)->scientific_name, 'Craniata'; - -# get lineage_nodes -my @nodes = $tree->get_nodes; -is scalar(@nodes), 0; -my @lineage_nodes; -@lineage_nodes = $tree->get_lineage_nodes($n->id); # read ID, only works if nodes have been added to tree -is scalar @lineage_nodes, 0; -@lineage_nodes = $tree->get_lineage_nodes($n); # node object always works -cmp_ok(scalar @lineage_nodes, '>', 20); - -# get lineage string -like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/); -like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/); -like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/); - -# can we actually form a Tree and use other Tree methods? -ok $tree = Bio::Tree::Tree->new(-node => $n); -cmp_ok($tree->number_nodes, '>', 20); -cmp_ok(scalar($tree->get_nodes), '>', 20); -is $tree->find_node(-rank => 'genus')->scientific_name, 'Homo'; - -# check that getting the ancestor still works now we have explitly set the -# ancestor by making a Tree -is $n->ancestor->scientific_name, 'Homo'; - -ok $n = $db->get_Taxonomy_Node('1760'); -is $n->scientific_name, 'Actinobacteria (class)'; - -# entrez isn't as good at searching as flatfile, so we have to special-case -my @ids = sort $db->get_taxonids('Chloroflexi'); -is scalar @ids, 1; -is_deeply \@ids, [200795]; - -# lowercase -@ids = sort $db->get_taxonids('chloroflexi'); -is scalar @ids, 1; -is_deeply \@ids, [200795]; - -# fuzzy match using SQL syntax to match any 'Chloroflexi' -@ids = sort $db->get_taxonids('Chloroflexi%'); -is scalar @ids, 2; -is_deeply \@ids, [200795, 32061]; - -$id = $db->get_taxonids('Chloroflexi (class)'); -is($id, 32061); - -@ids = $db->get_taxonids('Rhodotorula'); -is @ids, 8; -@ids = $db->get_taxonids('Rhodotorula '); -is @ids, 1; -is $ids[0], 231509; - -# get_lca should work on nodes from different databases -SKIP: { - test_skip(-tests => 9, -requires_networking => 1); - - # check that the result is the same as if we are retrieving from the same DB - # flatfile - my $h_flat = $db_flatfile->get_taxon(-name => 'Homo'); - my $h_flat2 = $db_flatfile->get_taxon(-name => 'Homo sapiens'); - ok my $tree_functions = Bio::Tree::Tree->new(); - is $tree_functions->get_lca($h_flat, $h_flat2)->scientific_name, 'Homo', 'get_lca() within flatfile db'; - - # entrez - #my $h_entrez; - #eval { $h_entrez = $db_entrez->get_taxon(-name => 'Homo sapiens');}; - #skip "Unable to connect to entrez database; no network or server busy?", 7 if $@; - #my $h_entrez2; - #eval { $h_entrez2 = $db_entrez->get_taxon(-name => 'Homo');}; - #skip "Unable to connect to entrez database; no network or server busy?", 7 if $@; - #ok $tree_functions = Bio::Tree::Tree->new(); - #is $tree_functions->get_lca($h_entrez, $h_entrez2)->scientific_name, 'Homo', 'get_lca() within entrez db'; - - #ok $tree_functions = Bio::Tree::Tree->new(); - # mixing entrez and flatfile - #TODO:{ - # local $TODO = 'Mixing databases for get_lca() not working, see bug #3416'; - # is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo', 'get_lca() mixing flatfile and remote db'; - #} - # even though the species taxa for Homo sapiens from list and flat databases - # have the same internal id, get_lca won't work because they have different - # roots and descendents - #$h_list = $db_list->get_taxon(-name => 'Homo sapiens'); - #is $h_list->ancestor->internal_id, $h_flat->internal_id; - #ok ! $tree_functions->get_lca($h_flat, $h_list); - - # but we can form a tree with the flat node then remove all the ranks we're - # not interested in and try again - #$tree = Bio::Tree::Tree->new(-node => $h_flat); - #$tree->splice(-keep_rank => \@ranks); - #is $tree->get_lca($h_flat, $h_list)->scientific_name, 'Homo'; -} - -# Some tests carried over from flatfile and others that would be nice to pass - -# ideas from taxonomy2tree.PLS that let us make nice tree, using -# Bio::Tree::TreeFunctionsI methods; this is a weird and trivial example just -# because our test flatfile database only has the full lineage of one species -undef $tree; -for my $name ('Human', 'Hominidae') { - my $ncbi_id = $db_flatfile->get_taxonid($name); - if ($ncbi_id) { - my $node = $db_flatfile->get_taxon(-taxonid => $ncbi_id); - - if ($tree) { - ok $tree->merge_lineage($node); - } - else { - ok $tree = Bio::Tree::Tree->new(-node => $node); - } - } -} -is $tree->get_nodes, 30; -$tree->contract_linear_paths; -my $ids = join(",", map { $_->id } $tree->get_nodes); -is $ids, '131567,9606'; - -END { - unlink 'taxonomy.sqlite' if (-e 'taxonomy.sqlite'); -} - -done_testing(); -- 2.11.4.GIT