Bio/DB/Taxonomy/sqlite.pm

   1 #
   2 # BioPerl module for Bio::DB::Taxonomy::flatfile
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Chris Fields <cjfields-at-cpan-dot-org>
   7 #
   8 # Copyright Chris Fields
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::DB::Taxonomy::sqlite - SQLite-based implementation of Bio::DB::Taxonomy::flatfile
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::DB::Taxonomy;
  21
  22   my $db = Bio::DB::Taxonomy->new(-source    => 'sqlite' ,
  23                                   -nodesfile => 'nodes.dmp',
  24                                   -namesfile => 'names.dmp');
  25
  26 =head1 DESCRIPTION
  27
  28 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
  29 NCBI taxonomy using a simple SQLite3 database stored locally on disk.
  30
  31 The required database files, nodes.dmp and names.dmp can be obtained from
  32 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
  33
  34 =head1 TODO
  35
  36 =over 4
  37
  38 =item Optimize codebase
  39
  40 =item Clean up SQL (kinda a mess right now)
  41
  42 =item Check compat. with other NCBI-specific L<Bio::DB::Taxonomy> implementations
  43
  44 =item Plan out feasibility of allowing other backends (Neo4J, other DBI, etc)
  45
  46 =back
  47
  48 Beyond completing the implementation and optimization, this will
  49 likely be rolled into a more flexible backend at some future point.
  50
  51 =head1 FEEDBACK
  52
  53 =head2 Mailing Lists
  54
  55 User feedback is an integral part of the evolution of this and other
  56 Bioperl modules. Send your comments and suggestions preferably to
  57 the Bioperl mailing list.  Your participation is much appreciated.
  58
  59   bioperl-l@bioperl.org                  - General discussion
  60   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  61
  62 =head2 Support
  63
  64 Please direct usage questions or support issues to the mailing list:
  65
  66 I<bioperl-l@bioperl.org>
  67
  68 rather than to the module maintainer directly. Many experienced and
  69 reponsive experts will be able look at the problem and quickly
  70 address it. Please include a thorough description of the problem
  71 with code and data examples if at all possible.
  72
  73 =head2 Reporting Bugs
  74
  75 Report bugs to the Bioperl bug tracking system to help us keep track
  76 of the bugs and their resolution. Bug reports can be submitted via
  77 the web:
  78
  79   https://github.com/bioperl/bioperl-live/issues
  80
  81 =head1 AUTHOR - Chris Fields
  82
  83 Email cjfields-at-cpan-dot-org
  84
  85 =head1 APPENDIX
  86
  87 The rest of the documentation details each of the object methods.
  88 Internal methods are usually preceded with a _
  89
  90 =cut
  91
  92 # Let the code begin...
  93
  94 package Bio::DB::Taxonomy::sqlite;
  95
  96 use 5.010;
  97 use strict;
  98 use DB_File;
  99 use Bio::Taxon;
 100 use File::Spec::Functions;
 101 use Data::Dumper;
 102 use DBI;
 103
 104 use constant SEPARATOR => ':';
 105
 106 our $DEFAULT_INDEX_DIR     = $Bio::Root::IO::TEMPDIR;    # /tmp
 107 our $DEFAULT_CACHE_SIZE    = 0;    # /tmp
 108 our $DEFAULT_DB_NAME       = 'taxonomy.sqlite';
 109
 110 our @DIVISIONS = (
 111     [qw(BCT Bacteria)],
 112     [qw(INV Invertebrates)],
 113     [qw(MAM Mammals)],
 114     [qw(PHG Phages)],
 115     [qw(PLN Plants)],                                    # (and fungi)
 116     [qw(PRI Primates)],
 117     [qw(ROD Rodents)],
 118     [qw(SYN Synthetic)],
 119     [qw(UNA Unassigned)],
 120     [qw(VRL Viruses)],
 121     [qw(VRT Vertebrates)],
 122     [qw(ENV 'Environmental samples')]
 123 );
 124
 125 use base qw(Bio::DB::Taxonomy);
 126
 127 =head2 new
 128
 129  Title   : new
 130  Usage   : my $obj = Bio::DB::Taxonomy::flatfile->new();
 131  Function: Builds a new Bio::DB::Taxonomy::flatfile object
 132  Returns : an instance of Bio::DB::Taxonomy::flatfile
 133  Args    : -directory => name of directory where index files should be created
 134            -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
 135            -namesfile => name of the file containing names(names.dmp from NCBI)
 136            -force     => 1 to replace current indexes even if they exist
 137
 138 =cut
 139
 140 # TODO: get rid of globals!
 141 sub new {
 142     my ( $class, @args ) = @_;
 143
 144     my $self = $class->SUPER::new(@args);
 145
 146     my ( $dir, $nodesfile, $namesfile, $db, $force, $cs ) =
 147       $self->_rearrange( [qw(DIRECTORY NODESFILE NAMESFILE DB FORCE CACHE_SIZE)], @args );
 148
 149     $self->index_directory( $dir || $DEFAULT_INDEX_DIR );
 150
 151     $self->db_name( $db || $DEFAULT_DB_NAME );
 152
 153     $self->cache_size($cs // $DEFAULT_CACHE_SIZE);
 154
 155     if ($nodesfile) {
 156         $self->_build_index( $nodesfile, $namesfile, $force );
 157     }
 158
 159     $self->_db_connect;
 160     return $self;
 161 }
 162
 163 =head2 Bio::DB::Taxonomy interface implementation
 164
 165 =head2 get_num_taxa
 166
 167  Title   : get_num_taxa
 168  Usage   : my $num = $db->get_num_taxa();
 169  Function: Get the number of taxa stored in the database.
 170  Returns : A number
 171  Args    : None
 172
 173 =cut
 174
 175 sub get_num_taxa {
 176     my ($self) = @_;
 177
 178     my $ct = $self->_dbh_fetch(<<SQL);
 179     SELECT COUNT(*) FROM taxon
 180 SQL
 181
 182     return @{$ct}[0];
 183 }
 184
 185 =head2 get_taxon
 186
 187  Title   : get_taxon
 188  Usage   : my $taxon = $db->get_taxon(-taxonid => $taxonid)
 189  Function: Get a Bio::Taxon object from the database.
 190  Returns : Bio::Taxon object
 191  Args    : just a single value which is the database id, OR named args:
 192            -taxonid => taxonomy id (to query by taxonid)
 193             OR
 194            -name    => string (to query by a taxonomy name: common name,
 195                                scientific name, etc)
 196
 197 =cut
 198
 199 sub get_taxon {
 200     my ($self) = shift;
 201     my ( $taxonid, $name );
 202
 203     if ( @_ > 1 ) {
 204         ( $taxonid, $name ) = $self->_rearrange( [qw(TAXONID NAME)], @_ );
 205         if ($name) {
 206             ( $taxonid, my @others ) = $self->get_taxonids($name);
 207             $self->warn(
 208 "There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'"
 209             ) if @others > 0;
 210         }
 211     }
 212     else {
 213         $taxonid = shift;
 214     }
 215
 216     return unless $taxonid;
 217
 218     $taxonid =~ /^\d+$/ || $self->throw("TaxID must be integer, got [$taxonid]");
 219
 220     my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class );
 221     # single join or two calls?
 222     my $sth = $self->_prepare_cached(<<SQL);
 223     SELECT tax.parent_id, tax.rank, tax.code, tax.division_id, tax.gencode_id, tax.mito_id, names.name, names.uniq_name, names.class
 224     FROM taxon as tax, names
 225     WHERE
 226         tax.taxon_id = ?
 227     AND
 228         names.taxon_id = tax.taxon_id
 229 SQL
 230
 231     $sth->bind_columns(\$parent_id, \$rank, \$code, \$divid, \$gen_code, \$mito, \$nm, \$uniq, \$class);
 232
 233     $sth->execute($taxonid) or $self->throw($sth->errstr);
 234
 235     my ($sci_name, @common_names);
 236
 237     while ($sth->fetch) {
 238         if ($class eq 'scientific name') {
 239             $sci_name = $nm;
 240         } else {
 241             push @common_names, $nm;
 242         }
 243     }
 244
 245     my $taxon = Bio::Taxon->new(
 246         -name         => $sci_name,
 247         -common_names => [@common_names],
 248         -ncbi_taxid   => $taxonid,
 249
 250         # TODO:
 251         # Okay, this is a pretty goofy thing; we have the parent_id in hand
 252         # but can't assign it b/c of semantics (one apparently must call
 253         # ancestor() to get this, which seems roundabout if the information is
 254         # already at hand)
 255
 256         -parent_id    => $parent_id,
 257         -rank              => $rank,
 258         -division          => $DIVISIONS[$divid]->[1],
 259         -genetic_code      => $gen_code,
 260         -mito_genetic_code => $mito
 261     );
 262
 263     # we can't use -dbh or the db_handle() method ourselves or we'll go
 264     # infinite on the merge attempt
 265     $taxon->{'db_handle'} = $self;
 266
 267     $self->_handle_internal_id($taxon);
 268
 269     return $taxon;
 270 }
 271
 272 *get_Taxonomy_Node = \&get_taxon;
 273
 274 =head2 get_taxonids
 275
 276  Title   : get_taxonids
 277  Usage   : my @taxonids = $db->get_taxonids('Homo sapiens');
 278  Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
 279            string. Note that multiple taxonids can match to the same supplied
 280            name.
 281  Returns : array of integer ids in list context, one of these in scalar context
 282  Args    : string representing taxon's name
 283
 284 =cut
 285
 286 sub get_taxonids {
 287     my ( $self, $query ) = @_;
 288
 289     # TODO: note we're not cleaning the query here, so you could technically
 290     # have a fuzzy match (or Bobby Tables someone)
 291
 292     # TODO: OR'd match seems poor optimally
 293     my $taxids = $self->{dbh}->selectcol_arrayref(<<SQL);
 294     SELECT DISTINCT taxon_id FROM names
 295     WHERE
 296         name LIKE "$query"
 297     OR
 298         uniq_name LIKE "$query"
 299 SQL
 300
 301     return wantarray() ? @{$taxids} : @{$taxids}[0];
 302 }
 303
 304 *get_taxonid = \&get_taxonids;
 305
 306 =head2 get_Children_Taxids
 307
 308  Title   : get_Children_Taxids
 309  Usage   : my @childrenids = $db->get_Children_Taxids
 310  Function: Get the ids of the children of a node in the taxonomy
 311  Returns : Array of Ids
 312  Args    : Bio::Taxon or a taxon_id
 313  Status  : deprecated (use each_Descendent())
 314
 315 =cut
 316
 317 sub get_Children_Taxids {
 318     my ( $self, $node ) = @_;
 319     $self->deprecated(); # ?
 320     #$self->warn(
 321     #    "get_Children_Taxids is deprecated, use each_Descendent instead");
 322     #my $id;
 323     #if ( ref($node) ) {
 324     #    if ( $node->can('object_id') ) {
 325     #        $id = $node->object_id;
 326     #    }
 327     #    elsif ( $node->can('ncbi_taxid') ) {
 328     #        $id = $node->ncbi_taxid;
 329     #    }
 330     #    else {
 331     #        $self->warn(
 332     #            "Don't know how to extract a taxon id from the object of type "
 333     #              . ref($node)
 334     #              . "\n" );
 335     #        return;
 336     #    }
 337     #}
 338     #else { $id = $node }
 339     #my @vals = $self->{'_parentbtree'}->get_dup($id);
 340     #return @vals;
 341 }
 342
 343 =head2 ancestor
 344
 345  Title   : ancestor
 346  Usage   : my $ancestor_taxon = $db->ancestor($taxon)
 347  Function: Retrieve the full ancestor taxon of a supplied Taxon from the
 348            database.
 349  Returns : Bio::Taxon
 350  Args    : Bio::Taxon (that was retrieved from this database)
 351
 352 =cut
 353
 354 sub ancestor {
 355     my ( $self, $taxon ) = @_;
 356     $self->throw("Must supply a Bio::Taxon")
 357       unless ref($taxon) && $taxon->isa('Bio::Taxon');
 358     $self->throw("The supplied Taxon must belong to this database")
 359       unless $taxon->db_handle && $taxon->db_handle eq $self;
 360     my $id =
 361       $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 362
 363     # TODO:
 364     # Note here we explicitly set the parent ID, but use a separate method to
 365     # check whether it is defined. Mixing back-end databases, even if from the
 366     # same source, should still work (since a different backend wouldn't
 367     # explicitly set the parent_id)
 368
 369     if ($taxon->trusted_parent_id) {
 370         # this is the failsafe when we hit the root node
 371         if ($taxon->parent_id eq $id) {
 372             return;
 373         }
 374         my $anc = $self->get_taxon(-taxonid => $taxon->parent_id);
 375         $taxon->ancestor($anc);
 376         return $anc;
 377
 378     } else {
 379         # TODO: would there be any other option?
 380         return;
 381     }
 382 }
 383
 384 =head2 ancestors
 385
 386  Title   : ancestors
 387  Usage   : my @ancestor_taxa = $db->ancestors($taxon)
 388  Function: Retrieve the full ancestor taxon of a supplied Taxon from the
 389            database.
 390  Returns : List of Bio::Taxon
 391  Args    : Bio::Taxon (that was retrieved from this database)
 392
 393 =cut
 394
 395 #sub ancestors {
 396 #    my ( $self, $taxon ) = @_;
 397 #    $self->throw("Must supply a Bio::Taxon")
 398 #      unless ref($taxon) && $taxon->isa('Bio::Taxon');
 399 #    $self->throw("The supplied Taxon must belong to this database")
 400 #      unless $taxon->db_handle && $taxon->db_handle eq $self;
 401 #    my $id =
 402 #      $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 403 #
 404 #    my $sth = $self->{dbh}->prepare_cached(<<SQL);
 405 #    WITH RECURSIVE
 406 #        ancestor(id) AS (
 407 #          SELECT taxon_id FROM taxon WHERE id=@BASELINE
 408 #          UNION
 409 #          SELECT derivedfrom.xfrom, checkin.mtime
 410 #            FROM ancestor, derivedfrom, checkin
 411 #           WHERE ancestor.id=derivedfrom.xto
 412 #             AND checkin.id=derivedfrom.xfrom
 413 #           ORDER BY checkin.mtime DESC
 414 #           LIMIT 20
 415 #        )
 416 #    SELECT * FROM checkin JOIN ancestor USING(id);
 417 #SQL
 418 #
 419 #    if ($taxon->trusted_parent_id) {
 420 #        # this is the failsafe when we hit the root node
 421 #        if ($taxon->parent_id eq $id) {
 422 #            return;
 423 #        }
 424 #        my $anc = $self->get_taxon(-taxonid => $taxon->parent_id);
 425 #
 426 #    } else {
 427 #        # TODO: would there be any other option?
 428 #        return;
 429 #    }
 430 #}
 431
 432 =head2 each_Descendent
 433
 434  Title   : each_Descendent
 435  Usage   : my @taxa = $db->each_Descendent($taxon);
 436  Function: Get all the descendents of the supplied Taxon (but not their
 437            descendents, ie. not a recursive fetchall).
 438  Returns : Array of Bio::Taxon objects
 439  Args    : Bio::Taxon (that was retrieved from this database)
 440
 441 =cut
 442
 443 sub each_Descendent {
 444     my ( $self, $taxon ) = @_;
 445     $self->throw("Must supply a Bio::Taxon")
 446       unless ref($taxon) && $taxon->isa('Bio::Taxon');
 447     $self->throw("The supplied Taxon must belong to this database")
 448       unless $taxon->db_handle && $taxon->db_handle eq $self;  # yikes
 449
 450     my $id =
 451       $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 452
 453     #my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class );
 454     # single join or two calls?
 455
 456     # probably not optimal, maybe set up as a cached statement with bindings?
 457     my $desc_ids = $self->{dbh}->selectcol_arrayref(<<SQL) or $self->throw($self->{dbh}->errstr);
 458     SELECT tax.taxon_id
 459     FROM taxon as tax
 460     WHERE
 461         tax.parent_id = $id
 462 SQL
 463
 464     return unless ref $desc_ids eq 'ARRAY';
 465
 466     my @descs;
 467     foreach my $desc_id (@$desc_ids) {
 468         push( @descs, $self->get_taxon($desc_id) || next );
 469     }
 470     return @descs;
 471 }
 472
 473 =head2 Helper methods
 474
 475 =cut
 476
 477 =head2 index_directory
 478
 479  Title   : index_directory
 480  Funtion : Get/set the location that index files are stored. (this module
 481            will index the supplied database)
 482  Usage   : $obj->index_directory($newval)
 483  Returns : value of index_directory (a scalar)
 484  Args    : on set, new value (a scalar or undef, optional)
 485  Note    : kept for backwards compatibility with older DB_File implementation
 486
 487 =cut
 488
 489 sub index_directory {
 490     my $self = shift;
 491     return $self->{'index_directory'} = shift if @_;
 492     return $self->{'index_directory'};
 493 }
 494
 495 =head2 db_name
 496
 497  Title   : db_name
 498  Funtion : Get/set the name of the SQLite3 database where data is stored
 499  Usage   : $obj->db_name($newval)
 500  Returns : value of db_name (a scalar)
 501  Args    : on set, new value (a scalar or undef, optional)
 502
 503 =cut
 504
 505 # TODO: this may need some disambiguation w/ index_directory above; for now we
 506 # assume this doesn't have a full path name (though I see no reason why this
 507 # shouldn't allow that)
 508
 509 sub db_name {
 510     my $self = shift;
 511     return $self->{'db_name'} = shift if @_;
 512     return $self->{'db_name'};
 513 }
 514
 515 =head2 cache_size
 516
 517  Title   : cache_size
 518  Funtion : Get/set the cachesize used for loading the SQLite3 database
 519  Usage   : $obj->cache_size($newval)
 520  Returns : value of cache_size (a scalar)
 521  Args    : on set, new value (a scalar or undef, optional)
 522  Note    : we do no checking on whether this value is an integer (SQLite does this for use)
 523
 524 =cut
 525
 526 sub cache_size {
 527     my $self = shift;
 528     return $self->{'cache_size'} = shift if defined($_[0]);
 529     return $self->{'cache_size'};
 530 }
 531
 532 # internal method which does the indexing
 533 sub _build_index {
 534     my ( $self, $nodesfile, $namesfile, $force ) = @_;
 535
 536     # TODO: need to disambiguate using index_directory here since we only have
 537     # one file.  Mayeb ignore it in favor of having full path for db_name?
 538     my ($dir, $db_name) = ($self->index_directory, $self->db_name);
 539     if (! -e $db_name || $force) {
 540
 541         # TODO: we're ignoring index_directory for now, may add support for this
 542         # down the way
 543         my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
 544
 545         $self->debug("Running SQLite version:".$dbh->{sqlite_version}."\n");
 546
 547         #$dbh->do('PRAGMA synchronous = 0');      # Non transaction safe!!!
 548
 549         if ($self->cache_size) {
 550             my $cs = $self->cache_size;
 551             $self->debug("Setting cache size $cs\n");
 552             $dbh->do("PRAGMA cache_size = $cs")
 553         }
 554
 555         $self->debug("Loading taxon table data\n");
 556         $self->_init_db($dbh);
 557         open my $NODES, '<', $nodesfile
 558             or $self->throw("Could not read node file '$nodesfile': $!");
 559
 560         # TODO: this has the really unnecessary 'OR IGNORE' option added,
 561         # apparently b.c the test data expects to handle cases where the TaxID
 562         # is repeated in this table (which should never happen in this table). I
 563         # will likely change this to throw under those circumstances
 564
 565         my $sth = $dbh->prepare_cached(<<SQL);
 566     INSERT OR IGNORE INTO taxon (taxon_id, parent_id, rank, code, division_id, gencode_id, mito_id) VALUES (?,?,?,?,?,?,?)
 567 SQL
 568         $dbh->do("BEGIN");
 569         while (<$NODES>) {
 570             next if /^\s*$/;
 571             chomp;
 572             my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
 573             next if $taxid == 1;
 574             if ($parent == 1) {
 575                 $parent = undef;
 576             }
 577
 578             $sth->execute($taxid, $parent, $rank, $code, $divid, $gen_code, $mito) or die $sth->errstr.": TaxID $taxid";
 579         }
 580         $dbh->do("COMMIT") or $self->throw($dbh->errstr);
 581
 582         close $NODES;
 583
 584         $self->debug("Loading name table data\n");
 585         open my $NAMES, '<', $namesfile
 586             or $self->throw("Could not read names file '$namesfile': $!");
 587
 588         my $sth = $dbh->prepare_cached(<<SQL) or $self->throw($dbh->errstr);
 589     INSERT INTO names (taxon_id, name, uniq_name, class) VALUES (?,?,?,?)
 590 SQL
 591         $dbh->do("BEGIN");
 592         while (<$NAMES>) {
 593             next if /^$/;
 594             chomp;
 595             my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
 596
 597             # don't include the fake root node 'root' or 'all' with id 1
 598             next if $taxid == 1;
 599
 600             $class =~ s/\s+\|\s*$//;
 601
 602             #if ($name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
 603             #    $name =~ s/\s+\(class\)$//;
 604             #}
 605
 606             $sth->execute($taxid, $name, $unique_name, $class) or $self->throw($sth->errstr);
 607         }
 608         $dbh->do("COMMIT");
 609
 610         $self->debug("Creating taxon/parent index\n");
 611         $dbh->do("CREATE INDEX parent_idx ON taxon (parent_id)") or $self->throw($dbh->errstr);
 612         $self->debug("Creating name/taxon index\n");
 613         $dbh->do("CREATE INDEX name_taxon_idx ON names (taxon_id,name)") or $self->throw($dbh->errstr);
 614         $dbh->do("PRAGMA foreign_keys = ON");
 615
 616         close $NAMES;
 617
 618         #$dbh->do('PRAGMA synchronous = 1');
 619         $self->{dbh} = $dbh;
 620         $self->{'_initialized'} = 1;
 621     }
 622     1;
 623 }
 624
 625 # connect the internal db handle
 626 sub _db_connect {
 627     my $self = shift;
 628     return if $self->{'_initialized'};
 629
 630     my ($dir, $db_name) = ($self->index_directory, $self->db_name);
 631
 632     # TODO: we're ignoring index_directory for now, may add support for this
 633     # down the way
 634     my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
 635     $dbh->do("PRAGMA foreign_keys = ON");
 636     if ($self->cache_size) {
 637         my $cs = $self->cache_size;
 638         $self->debug("Setting cache size $cs\n");
 639         $dbh->do("PRAGMA cache_size = $cs")
 640     }
 641     $self->{dbh} = $dbh;
 642
 643     $self->{'_initialized'} = 1;
 644 }
 645
 646 sub _init_db {
 647     my ($self, $dbh) = @_;
 648     my $schema = $self->taxon_schema();
 649     # TODO: set up handler parameters here
 650     for my $table (sort keys %$schema) {
 651         $dbh->do("DROP TABLE IF EXISTS $table") or $self->throw($dbh->errstr);
 652         $dbh->do("CREATE TABLE $table ".$schema->{$table}) or $self->throw($dbh->errstr);
 653     }
 654     1;
 655 }
 656
 657 sub _dbh_fetch {
 658     my ($self, $sql) = @_;
 659     # TODO: more sanity checks
 660     my $rows = $self->{dbh}->selectrow_arrayref($sql) or $self->throw( $self->{dbh}->errstr );
 661     return $rows;
 662 }
 663
 664 sub _prepare_cached {
 665     my ($self, $sql) = @_;
 666     # TODO: more sanity checks
 667     my $sth = $self->{dbh}->prepare_cached($sql) or $self->throw( $self->{dbh}->errstr );
 668     $sth;
 669 }
 670
 671
 672 # TODO: check data size, this is a ballpark estimate (could be reduced)
 673 sub taxon_schema {
 674     my $self   = shift;
 675     return {
 676     taxon   => <<SCHEMA,
 677     (
 678         taxon_id        INTEGER UNIQUE PRIMARY KEY NOT NULL,
 679         parent_id       INTEGER,
 680         left_id         INTEGER,
 681         right_id        INTEGER,
 682         rank            VARCHAR(25),
 683         code            VARCHAR(5),
 684         division_id     INTEGER,
 685         gencode_id      INTEGER,
 686         mito_id         INTEGER,
 687         FOREIGN KEY(parent_id) REFERENCES taxon(taxon_id)
 688     )
 689 SCHEMA
 690
 691     names   => <<SCHEMA,
 692     (
 693         name_id         INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
 694         taxon_id        INTEGER,
 695         name            VARCHAR(50),
 696         uniq_name       VARCHAR(50),
 697         class           VARCHAR(25),
 698         FOREIGN KEY(taxon_id) REFERENCES taxon(taxon_id)
 699     )
 700 SCHEMA
 701     };
 702 }
 703
 704 sub DESTROY {
 705     my $self = shift;
 706     undef $self->{dbh};
 707 }
 708
 709 1;
 710