a bit more doc; getting about 1000 full lineage lookups in 2 sec
[bioperl-live.git] / Bio / DB / Taxonomy / sqlite.pm
blobd79419422cfef3260573e0ab802afec8f872fbc1
2 # BioPerl module for Bio::DB::Taxonomy::flatfile
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields-at-cpan-dot-org>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::DB::Taxonomy::sqlite - SQLite-based implementation of Bio::DB::Taxonomy::flatfile
18 =head1 SYNOPSIS
20 use Bio::DB::Taxonomy;
22 my $db = Bio::DB::Taxonomy->new(-source => 'sqlite',
23 -db => 'mytax.db' # default 'taxonomy.sqlite'
24 -nodesfile => 'nodes.dmp',
25 -namesfile => 'names.dmp');
27 =head1 DESCRIPTION
29 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
30 NCBI taxonomy using a simple SQLite3 database stored locally on disk.
32 With this implementation, one can do the same basic searches as with the 'flatfile'
33 database. A test lookup of 1000 NCBI TaxIDs with full lineage information took
34 about 2 seconds on my older MacBook Pro laptop with an on-disk implementation.
36 A few key differences:
38 =over 4
40 =item * You can use typical SQL syntax to run a query search; for instance, if you want you can run:
42 @ids = sort $db->get_taxonids('Chloroflexi%');
44 =item * In-memory database is allowed
46 my $db = Bio::DB::Taxonomy->new(-source => 'sqlite',
47 -db => ':memory:',
48 -nodesfile => 'nodes.dmp',
49 -namesfile => 'names.dmp');
51 =back
53 The required database files, nodes.dmp and names.dmp can be obtained from
54 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
56 =head1 TODO
58 =over 4
60 =item * Small optimizations, such as optimizing name lookups
62 =item * Possibly use L<recursive CTE|http://www.sqlite.org/lang_with.html> to do lineage lookups
64 =item * Clean up SQL (still kind of a mess right now)
66 =item * Check compat. with other NCBI-specific L<Bio::DB::Taxonomy> implementations
68 =item * Plan out feasibility of allowing other backends (Neo4J, other DBI, etc)
70 =item * Optionally calculate left/right ID values for TaxID nodes
72 =back
74 Beyond completing the implementation and optimization, this will
75 likely be rolled into a more flexible backend at some future point.
77 =head1 FEEDBACK
79 =head2 Mailing Lists
81 User feedback is an integral part of the evolution of this and other
82 Bioperl modules. Send your comments and suggestions preferably to
83 the Bioperl mailing list. Your participation is much appreciated.
85 bioperl-l@bioperl.org - General discussion
86 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
88 =head2 Support
90 Please direct usage questions or support issues to the mailing list:
92 I<bioperl-l@bioperl.org>
94 rather than to the module maintainer directly. Many experienced and
95 reponsive experts will be able look at the problem and quickly
96 address it. Please include a thorough description of the problem
97 with code and data examples if at all possible.
99 =head2 Reporting Bugs
101 Report bugs to the Bioperl bug tracking system to help us keep track
102 of the bugs and their resolution. Bug reports can be submitted via
103 the web:
105 https://github.com/bioperl/bioperl-live/issues
107 =head1 AUTHOR - Chris Fields
109 Email cjfields-at-cpan-dot-org
111 =head1 APPENDIX
113 The rest of the documentation details each of the object methods.
114 Internal methods are usually preceded with a _
116 =cut
118 # Let the code begin...
120 package Bio::DB::Taxonomy::sqlite;
122 use 5.010;
123 use strict;
124 use DB_File;
125 use Bio::Taxon;
126 use File::Spec::Functions;
127 use Data::Dumper;
128 use DBI;
130 use constant SEPARATOR => ':';
132 our $DEFAULT_INDEX_DIR = $Bio::Root::IO::TEMPDIR; # /tmp
133 our $DEFAULT_CACHE_SIZE = 0; # /tmp
134 our $DEFAULT_DB_NAME = 'taxonomy.sqlite';
136 our @DIVISIONS = (
137 [qw(BCT Bacteria)],
138 [qw(INV Invertebrates)],
139 [qw(MAM Mammals)],
140 [qw(PHG Phages)],
141 [qw(PLN Plants)], # (and fungi)
142 [qw(PRI Primates)],
143 [qw(ROD Rodents)],
144 [qw(SYN Synthetic)],
145 [qw(UNA Unassigned)],
146 [qw(VRL Viruses)],
147 [qw(VRT Vertebrates)],
148 [qw(ENV 'Environmental samples')]
151 use base qw(Bio::DB::Taxonomy);
153 =head2 new
155 Title : new
156 Usage : my $obj = Bio::DB::Taxonomy::flatfile->new();
157 Function: Builds a new Bio::DB::Taxonomy::flatfile object
158 Returns : an instance of Bio::DB::Taxonomy::flatfile
159 Args : -directory => name of directory where index files should be created
160 -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
161 -namesfile => name of the file containing names(names.dmp from NCBI)
162 -force => 1 to replace current indexes even if they exist
164 =cut
166 # TODO: get rid of globals!
167 sub new {
168 my ( $class, @args ) = @_;
170 my $self = $class->SUPER::new(@args);
172 my ( $dir, $nodesfile, $namesfile, $db, $force, $cs ) =
173 $self->_rearrange( [qw(DIRECTORY NODESFILE NAMESFILE DB FORCE CACHE_SIZE)], @args );
175 $self->index_directory( $dir || $DEFAULT_INDEX_DIR );
177 $self->db_name( $db || $DEFAULT_DB_NAME );
179 $self->cache_size($cs // $DEFAULT_CACHE_SIZE);
181 if ($nodesfile) {
182 $self->_build_index( $nodesfile, $namesfile, $force );
185 $self->_db_connect;
186 return $self;
189 =head2 Bio::DB::Taxonomy interface implementation
191 =head2 get_num_taxa
193 Title : get_num_taxa
194 Usage : my $num = $db->get_num_taxa();
195 Function: Get the number of taxa stored in the database.
196 Returns : A number
197 Args : None
199 =cut
201 sub get_num_taxa {
202 my ($self) = @_;
204 my $ct = $self->_dbh_fetch(<<SQL);
205 SELECT COUNT(*) FROM taxon
208 return @{$ct}[0];
211 =head2 get_taxon
213 Title : get_taxon
214 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
215 Function: Get a Bio::Taxon object from the database.
216 Returns : Bio::Taxon object
217 Args : just a single value which is the database id, OR named args:
218 -taxonid => taxonomy id (to query by taxonid)
220 -name => string (to query by a taxonomy name: common name,
221 scientific name, etc)
223 =cut
225 sub get_taxon {
226 my ($self) = shift;
227 my ( $taxonid, $name );
229 if ( @_ > 1 ) {
230 ( $taxonid, $name ) = $self->_rearrange( [qw(TAXONID NAME)], @_ );
231 if ($name) {
232 ( $taxonid, my @others ) = $self->get_taxonids($name);
233 $self->warn(
234 "There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'"
235 ) if @others > 0;
238 else {
239 $taxonid = shift;
242 return unless $taxonid;
244 $taxonid =~ /^\d+$/ || $self->throw("TaxID must be integer, got [$taxonid]");
246 my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class );
247 # single join or two calls?
248 my $sth = $self->_prepare_cached(<<SQL);
249 SELECT tax.parent_id, tax.rank, tax.code, tax.division_id, tax.gencode_id, tax.mito_id, names.name, names.uniq_name, names.class
250 FROM taxon as tax, names
251 WHERE
252 tax.taxon_id = ?
254 names.taxon_id = tax.taxon_id
257 $sth->bind_columns(\$parent_id, \$rank, \$code, \$divid, \$gen_code, \$mito, \$nm, \$uniq, \$class);
259 $sth->execute($taxonid) or $self->throw($sth->errstr);
261 my ($sci_name, @common_names);
263 while ($sth->fetch) {
264 if ($class eq 'scientific name') {
265 $sci_name = $nm;
266 } else {
267 push @common_names, $nm;
271 my $taxon = Bio::Taxon->new(
272 -name => $sci_name,
273 -common_names => [@common_names],
274 -ncbi_taxid => $taxonid,
275 -parent_id => $parent_id,
276 -rank => $rank,
277 -division => $DIVISIONS[$divid]->[1],
278 -genetic_code => $gen_code,
279 -mito_genetic_code => $mito
282 # we can't use -dbh or the db_handle() method ourselves or we'll go
283 # infinite on the merge attempt
284 $taxon->{'db_handle'} = $self;
286 $self->_handle_internal_id($taxon);
288 return $taxon;
291 *get_Taxonomy_Node = \&get_taxon;
293 =head2 get_taxonids
295 Title : get_taxonids
296 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
297 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
298 string. Note that multiple taxonids can match to the same supplied
299 name.
300 Returns : array of integer ids in list context, one of these in scalar context
301 Args : string representing taxon's name
303 =cut
305 sub get_taxonids {
306 my ( $self, $query ) = @_;
308 # TODO: note we're not cleaning the query here, so you could technically
309 # have a fuzzy match (or Bobby Tables someone)
311 # TODO: OR'd match seems poor optimally
312 my $taxids = $self->{dbh}->selectcol_arrayref(<<SQL);
313 SELECT DISTINCT taxon_id FROM names
314 WHERE
315 name LIKE "$query"
317 uniq_name LIKE "$query"
320 return wantarray() ? @{$taxids} : @{$taxids}[0];
323 *get_taxonid = \&get_taxonids;
325 =head2 get_Children_Taxids
327 Title : get_Children_Taxids
328 Usage : my @childrenids = $db->get_Children_Taxids
329 Function: Get the ids of the children of a node in the taxonomy
330 Returns : Array of Ids
331 Args : Bio::Taxon or a taxon_id
332 Status : deprecated (use each_Descendent())
334 =cut
336 sub get_Children_Taxids {
337 my ( $self, $node ) = @_;
338 $self->deprecated(); # ?
339 #$self->warn(
340 # "get_Children_Taxids is deprecated, use each_Descendent instead");
341 #my $id;
342 #if ( ref($node) ) {
343 # if ( $node->can('object_id') ) {
344 # $id = $node->object_id;
346 # elsif ( $node->can('ncbi_taxid') ) {
347 # $id = $node->ncbi_taxid;
349 # else {
350 # $self->warn(
351 # "Don't know how to extract a taxon id from the object of type "
352 # . ref($node)
353 # . "\n" );
354 # return;
357 #else { $id = $node }
358 #my @vals = $self->{'_parentbtree'}->get_dup($id);
359 #return @vals;
362 =head2 ancestor
364 Title : ancestor
365 Usage : my $ancestor_taxon = $db->ancestor($taxon)
366 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
367 database.
368 Returns : Bio::Taxon
369 Args : Bio::Taxon (that was retrieved from this database)
371 =cut
373 sub ancestor {
374 my ( $self, $taxon ) = @_;
375 $self->throw("Must supply a Bio::Taxon")
376 unless ref($taxon) && $taxon->isa('Bio::Taxon');
377 $self->throw("The supplied Taxon must belong to this database")
378 unless $taxon->db_handle && $taxon->db_handle eq $self;
379 my $id =
380 $taxon->id || $self->throw("The supplied Taxon is missing its id!");
382 # TODO:
383 # Note here we explicitly set the parent ID, but use a separate method to
384 # check whether it is defined. Mixing back-end databases, even if from the
385 # same source, should still work (since a different backend wouldn't
386 # explicitly set the parent_id)
388 if ($taxon->trusted_parent_id) {
389 # this is the failsafe when we hit the root node
390 if ($taxon->parent_id eq $id) {
391 return;
393 return $self->get_taxon(-taxonid => $taxon->parent_id);
394 } else {
395 # TODO: would there be any other option?
396 return;
400 # TODO: this may act as a drop-in for a recursive CTE lookup
402 #=head2 ancestors
404 # Title : ancestors
405 # Usage : my @ancestor_taxa = $db->ancestors($taxon)
406 # Function: Retrieve the full ancestor taxon of a supplied Taxon from the
407 # database.
408 # Returns : List of Bio::Taxon
409 # Args : Bio::Taxon (that was retrieved from this database)
411 #=cut
413 #sub ancestors { ... }
415 =head2 each_Descendent
417 Title : each_Descendent
418 Usage : my @taxa = $db->each_Descendent($taxon);
419 Function: Get all the descendents of the supplied Taxon (but not their
420 descendents, ie. not a recursive fetchall).
421 Returns : Array of Bio::Taxon objects
422 Args : Bio::Taxon (that was retrieved from this database)
424 =cut
426 sub each_Descendent {
427 my ( $self, $taxon ) = @_;
428 $self->throw("Must supply a Bio::Taxon")
429 unless ref($taxon) && $taxon->isa('Bio::Taxon');
430 $self->throw("The supplied Taxon must belong to this database")
431 unless $taxon->db_handle && $taxon->db_handle eq $self; # yikes
433 my $id =
434 $taxon->id || $self->throw("The supplied Taxon is missing its id!");
436 #my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class );
437 # single join or two calls?
439 # probably not optimal, maybe set up as a cached statement with bindings?
440 my $desc_ids = $self->{dbh}->selectcol_arrayref(<<SQL) or $self->throw($self->{dbh}->errstr);
441 SELECT tax.taxon_id
442 FROM taxon as tax
443 WHERE
444 tax.parent_id = $id
447 return unless ref $desc_ids eq 'ARRAY';
449 my @descs;
450 foreach my $desc_id (@$desc_ids) {
451 push( @descs, $self->get_taxon($desc_id) || next );
453 return @descs;
456 =head2 Helper methods
458 =cut
460 =head2 index_directory
462 Title : index_directory
463 Funtion : Get/set the location that index files are stored. (this module
464 will index the supplied database)
465 Usage : $obj->index_directory($newval)
466 Returns : value of index_directory (a scalar)
467 Args : on set, new value (a scalar or undef, optional)
468 Note : kept for backwards compatibility with older DB_File implementation
470 =cut
472 sub index_directory {
473 my $self = shift;
474 return $self->{'index_directory'} = shift if @_;
475 return $self->{'index_directory'};
478 =head2 db_name
480 Title : db_name
481 Funtion : Get/set the name of the SQLite3 database where data is stored
482 Usage : $obj->db_name($newval)
483 Returns : value of db_name (a scalar)
484 Args : on set, new value (a scalar or undef, optional)
486 =cut
488 # TODO: this may need some disambiguation w/ index_directory above; for now we
489 # assume this doesn't have a full path name (though I see no reason why this
490 # shouldn't allow that)
492 sub db_name {
493 my $self = shift;
494 return $self->{'db_name'} = shift if @_;
495 return $self->{'db_name'};
498 =head2 cache_size
500 Title : cache_size
501 Funtion : Get/set the cachesize used for loading the SQLite3 database
502 Usage : $obj->cache_size($newval)
503 Returns : value of cache_size (a scalar)
504 Args : on set, new value (a scalar or undef, optional)
505 Note : we do no checking on whether this value is an integer (SQLite does this for use)
507 =cut
509 sub cache_size {
510 my $self = shift;
511 return $self->{'cache_size'} = shift if defined($_[0]);
512 return $self->{'cache_size'};
515 # internal method which does the indexing
516 sub _build_index {
517 my ( $self, $nodesfile, $namesfile, $force ) = @_;
519 # TODO: need to disambiguate using index_directory here since we only have
520 # one file. Mayeb ignore it in favor of having full path for db_name?
521 my ($dir, $db_name) = ($self->index_directory, $self->db_name);
522 if (! -e $db_name || $force) {
524 # TODO: we're ignoring index_directory for now, may add support for this
525 # down the way
526 my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
528 $self->debug("Running SQLite version:".$dbh->{sqlite_version}."\n");
530 #$dbh->do('PRAGMA synchronous = 0'); # Non transaction safe!!!
532 if ($self->cache_size) {
533 my $cs = $self->cache_size;
534 $self->debug("Setting cache size $cs\n");
535 $dbh->do("PRAGMA cache_size = $cs")
538 $self->debug("Loading taxon table data\n");
539 $self->_init_db($dbh);
540 open my $NODES, '<', $nodesfile
541 or $self->throw("Could not read node file '$nodesfile': $!");
543 # TODO: this has the really unnecessary 'OR IGNORE' option added,
544 # apparently b.c the test data expects to handle cases where the TaxID
545 # is repeated in this table (which should never happen in this table). I
546 # will likely change this to throw under those circumstances
548 my $sth = $dbh->prepare_cached(<<SQL);
549 INSERT OR IGNORE INTO taxon (taxon_id, parent_id, rank, code, division_id, gencode_id, mito_id) VALUES (?,?,?,?,?,?,?)
551 $dbh->do("BEGIN");
552 while (<$NODES>) {
553 next if /^\s*$/;
554 chomp;
555 my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
556 next if $taxid == 1;
557 if ($parent == 1) {
558 $parent = undef;
561 $sth->execute($taxid, $parent, $rank, $code, $divid, $gen_code, $mito) or die $sth->errstr.": TaxID $taxid";
563 $dbh->do("COMMIT") or $self->throw($dbh->errstr);
565 close $NODES;
567 $self->debug("Loading name table data\n");
568 open my $NAMES, '<', $namesfile
569 or $self->throw("Could not read names file '$namesfile': $!");
571 my $sth = $dbh->prepare_cached(<<SQL) or $self->throw($dbh->errstr);
572 INSERT INTO names (taxon_id, name, uniq_name, class) VALUES (?,?,?,?)
574 $dbh->do("BEGIN");
575 while (<$NAMES>) {
576 next if /^$/;
577 chomp;
578 my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
580 # don't include the fake root node 'root' or 'all' with id 1
581 next if $taxid == 1;
583 $class =~ s/\s+\|\s*$//;
585 #if ($name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
586 # $name =~ s/\s+\(class\)$//;
589 $sth->execute($taxid, $name, $unique_name, $class) or $self->throw($sth->errstr);
591 close $NAMES;
593 $dbh->do("COMMIT");
595 $self->debug("Creating taxon index\n");
596 $dbh->do("CREATE INDEX parent_idx ON taxon (parent_id)") or $self->throw($dbh->errstr);
597 $self->debug("Creating name index\n");
598 $dbh->do("CREATE INDEX name_idx ON names (name)") or $self->throw($dbh->errstr);
599 $self->debug("Creating taxon name table index\n");
600 $dbh->do("CREATE INDEX taxon_name_idx ON names (taxon_id)") or $self->throw($dbh->errstr);
602 $dbh->do("PRAGMA foreign_keys = ON");
604 #$dbh->do('PRAGMA synchronous = 1');
605 $self->{dbh} = $dbh;
606 $self->{'_initialized'} = 1;
611 # connect the internal db handle
612 sub _db_connect {
613 my $self = shift;
614 return if $self->{'_initialized'};
616 my ($dir, $db_name) = ($self->index_directory, $self->db_name);
618 # TODO: we're ignoring index_directory for now, may add support for this
619 # down the way
620 my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
621 $dbh->do("PRAGMA foreign_keys = ON");
622 if ($self->cache_size) {
623 my $cs = $self->cache_size;
624 $self->debug("Setting cache size $cs\n");
625 $dbh->do("PRAGMA cache_size = $cs")
627 $self->{dbh} = $dbh;
629 $self->{'_initialized'} = 1;
632 sub _init_db {
633 my ($self, $dbh) = @_;
634 my $schema = $self->taxon_schema();
635 # TODO: set up handler parameters here
636 for my $table (sort keys %$schema) {
637 $dbh->do("DROP TABLE IF EXISTS $table") or $self->throw($dbh->errstr);
638 $dbh->do("CREATE TABLE $table ".$schema->{$table}) or $self->throw($dbh->errstr);
643 sub _dbh_fetch {
644 my ($self, $sql) = @_;
645 # TODO: more sanity checks
646 my $rows = $self->{dbh}->selectrow_arrayref($sql) or $self->throw( $self->{dbh}->errstr );
647 return $rows;
650 sub _prepare_cached {
651 my ($self, $sql) = @_;
652 # TODO: more sanity checks
653 my $sth = $self->{dbh}->prepare_cached($sql) or $self->throw( $self->{dbh}->errstr );
654 $sth;
658 # TODO: check data size, this is a ballpark estimate (could be reduced)
659 sub taxon_schema {
660 my $self = shift;
661 return {
662 taxon => <<SCHEMA,
664 taxon_id INTEGER UNIQUE PRIMARY KEY NOT NULL,
665 parent_id INTEGER,
666 left_id INTEGER,
667 right_id INTEGER,
668 rank VARCHAR(25),
669 code VARCHAR(5),
670 division_id INTEGER,
671 gencode_id INTEGER,
672 mito_id INTEGER,
673 FOREIGN KEY(parent_id) REFERENCES taxon(taxon_id)
675 SCHEMA
677 names => <<SCHEMA,
679 name_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
680 taxon_id INTEGER,
681 name VARCHAR(50),
682 uniq_name VARCHAR(50),
683 class VARCHAR(25),
684 FOREIGN KEY(taxon_id) REFERENCES taxon(taxon_id)
686 SCHEMA
690 sub DESTROY {
691 my $self = shift;
692 undef $self->{dbh};