Bio/DB/Taxonomy/sqlite.pm

   1 #
   2 # BioPerl module for Bio::DB::Taxonomy::flatfile
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Chris Fields <cjfields-at-cpan-dot-org>
   7 #
   8 # Copyright Chris Fields
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::DB::Taxonomy::sqlite - SQLite-based implementation of Bio::DB::Taxonomy::flatfile
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::DB::Taxonomy;
  21
  22   my $db = Bio::DB::Taxonomy->new(-source    => 'sqlite' ,
  23                                   -nodesfile => 'nodes.dmp',
  24                                   -namesfile => 'names.dmp');
  25
  26 =head1 DESCRIPTION
  27
  28 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
  29 NCBI taxonomy using a simple SQLite3 database stored locally on disk.
  30
  31 The required database files, nodes.dmp and names.dmp can be obtained from
  32 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
  33
  34 =head1 TODO
  35
  36 Beyond completing the implementation and optimization, this will
  37 likely be rolled into a more flexible backend at some future point.
  38
  39 =head1 FEEDBACK
  40
  41 =head2 Mailing Lists
  42
  43 User feedback is an integral part of the evolution of this and other
  44 Bioperl modules. Send your comments and suggestions preferably to
  45 the Bioperl mailing list.  Your participation is much appreciated.
  46
  47   bioperl-l@bioperl.org                  - General discussion
  48   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  49
  50 =head2 Support
  51
  52 Please direct usage questions or support issues to the mailing list:
  53
  54 I<bioperl-l@bioperl.org>
  55
  56 rather than to the module maintainer directly. Many experienced and
  57 reponsive experts will be able look at the problem and quickly
  58 address it. Please include a thorough description of the problem
  59 with code and data examples if at all possible.
  60
  61 =head2 Reporting Bugs
  62
  63 Report bugs to the Bioperl bug tracking system to help us keep track
  64 of the bugs and their resolution. Bug reports can be submitted via
  65 the web:
  66
  67   https://github.com/bioperl/bioperl-live/issues
  68
  69 =head1 AUTHOR - Chris Fields
  70
  71 Email cjfields-at-cpan-dot-org
  72
  73 =head1 APPENDIX
  74
  75 The rest of the documentation details each of the object methods.
  76 Internal methods are usually preceded with a _
  77
  78 =cut
  79
  80 # Let the code begin...
  81
  82 package Bio::DB::Taxonomy::sqlite;
  83
  84 use 5.010;
  85 use strict;
  86 use DB_File;
  87 use Bio::Taxon;
  88 use File::Spec::Functions;
  89 use Data::Dumper;
  90 use DBI;
  91
  92 use constant SEPARATOR => ':';
  93
  94 our $DEFAULT_INDEX_DIR     = $Bio::Root::IO::TEMPDIR;    # /tmp
  95 our $DEFAULT_CACHE_SIZE    = 0;    # /tmp
  96 our $DEFAULT_DB_NAME       = 'taxonomy.sqlite';
  97
  98 our @DIVISIONS = (
  99     [qw(BCT Bacteria)],
 100     [qw(INV Invertebrates)],
 101     [qw(MAM Mammals)],
 102     [qw(PHG Phages)],
 103     [qw(PLN Plants)],                                    # (and fungi)
 104     [qw(PRI Primates)],
 105     [qw(ROD Rodents)],
 106     [qw(SYN Synthetic)],
 107     [qw(UNA Unassigned)],
 108     [qw(VRL Viruses)],
 109     [qw(VRT Vertebrates)],
 110     [qw(ENV 'Environmental samples')]
 111 );
 112
 113 use base qw(Bio::DB::Taxonomy);
 114
 115 =head2 new
 116
 117  Title   : new
 118  Usage   : my $obj = Bio::DB::Taxonomy::flatfile->new();
 119  Function: Builds a new Bio::DB::Taxonomy::flatfile object
 120  Returns : an instance of Bio::DB::Taxonomy::flatfile
 121  Args    : -directory => name of directory where index files should be created
 122            -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
 123            -namesfile => name of the file containing names(names.dmp from NCBI)
 124            -force     => 1 to replace current indexes even if they exist
 125
 126 =cut
 127
 128 # TODO: get rid of globals!
 129 sub new {
 130     my ( $class, @args ) = @_;
 131
 132     my $self = $class->SUPER::new(@args);
 133
 134     my ( $dir, $nodesfile, $namesfile, $db, $force, $cs ) =
 135       $self->_rearrange( [qw(DIRECTORY NODESFILE NAMESFILE DB FORCE CACHE)], @args );
 136
 137     $self->index_directory( $dir || $DEFAULT_INDEX_DIR );
 138
 139     $self->db_name( $db || $DEFAULT_DB_NAME );
 140
 141     $self->cache_size($cs // $DEFAULT_CACHE_SIZE);
 142
 143     if ($nodesfile) {
 144         $self->_build_index( $nodesfile, $namesfile, $force );
 145     }
 146
 147     $self->_db_connect;
 148     return $self;
 149 }
 150
 151 =head2 Bio::DB::Taxonomy interface implementation
 152
 153 =head2 get_num_taxa
 154
 155  Title   : get_num_taxa
 156  Usage   : my $num = $db->get_num_taxa();
 157  Function: Get the number of taxa stored in the database.
 158  Returns : A number
 159  Args    : None
 160
 161 =cut
 162
 163 sub get_num_taxa {
 164     my ($self) = @_;
 165
 166     my $ct = $self->_dbh_fetch(<<SQL);
 167     SELECT COUNT(*) FROM taxon
 168 SQL
 169
 170     return @{$ct}[0];
 171 }
 172
 173 =head2 get_taxon
 174
 175  Title   : get_taxon
 176  Usage   : my $taxon = $db->get_taxon(-taxonid => $taxonid)
 177  Function: Get a Bio::Taxon object from the database.
 178  Returns : Bio::Taxon object
 179  Args    : just a single value which is the database id, OR named args:
 180            -taxonid => taxonomy id (to query by taxonid)
 181             OR
 182            -name    => string (to query by a taxonomy name: common name,
 183                                scientific name, etc)
 184
 185 =cut
 186
 187 sub get_taxon {
 188     my ($self) = shift;
 189     my ( $taxonid, $name );
 190
 191     if ( @_ > 1 ) {
 192         ( $taxonid, $name ) = $self->_rearrange( [qw(TAXONID NAME)], @_ );
 193         if ($name) {
 194             ( $taxonid, my @others ) = $self->get_taxonids($name);
 195             $self->warn(
 196 "There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'"
 197             ) if @others > 0;
 198         }
 199     }
 200     else {
 201         $taxonid = shift;
 202     }
 203
 204     return unless $taxonid;
 205
 206     $taxonid =~ /^\d+$/ || $self->throw("TaxID must be integer, got [$taxonid]");
 207
 208     my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class );
 209     # single join or two calls?
 210     my $sth = $self->_prepare_cached(<<SQL);
 211     SELECT tax.parent_id, tax.rank, tax.code, tax.division_id, tax.gencode_id, tax.mito_id, names.name, names.uniq_name, names.class
 212     FROM taxon as tax, names
 213     WHERE
 214         tax.taxon_id = ?
 215     AND
 216         names.taxon_id = tax.taxon_id
 217 SQL
 218
 219     $sth->bind_columns(\$parent_id, \$rank, \$code, \$divid, \$gen_code, \$mito, \$nm, \$uniq, \$class);
 220
 221     $sth->execute($taxonid) or $self->throw($sth->errstr);
 222
 223     my ($sci_name, @common_names);
 224
 225     while ($sth->fetch) {
 226         if ($class eq 'scientific name') {
 227             $sci_name = $nm;
 228         } else {
 229             push @common_names, $nm;
 230         }
 231     }
 232
 233     my $taxon = Bio::Taxon->new(
 234         -name         => $sci_name,
 235         -common_names => [@common_names],
 236         -ncbi_taxid   => $taxonid,
 237
 238         # TODO:
 239         # Okay, this is a pretty goofy thing; we have the parent_id in hand
 240         # but can't assign it b/c of semantics (one apparently must call
 241         # ancestor() to get this, which seems roundabout if the information is
 242         # already at hand)
 243
 244         -parent_id    => $parent_id,
 245         -rank              => $rank,
 246         -division          => $DIVISIONS[$divid]->[1],
 247         -genetic_code      => $gen_code,
 248         -mito_genetic_code => $mito
 249     );
 250
 251     # we can't use -dbh or the db_handle() method ourselves or we'll go
 252     # infinite on the merge attempt
 253     $taxon->{'db_handle'} = $self;
 254
 255     $self->_handle_internal_id($taxon);
 256
 257     return $taxon;
 258 }
 259
 260 *get_Taxonomy_Node = \&get_taxon;
 261
 262 =head2 get_taxonids
 263
 264  Title   : get_taxonids
 265  Usage   : my @taxonids = $db->get_taxonids('Homo sapiens');
 266  Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
 267            string. Note that multiple taxonids can match to the same supplied
 268            name.
 269  Returns : array of integer ids in list context, one of these in scalar context
 270  Args    : string representing taxon's name
 271
 272 =cut
 273
 274 sub get_taxonids {
 275     my ( $self, $query ) = @_;
 276
 277     my $taxids = $self->{dbh}->selectcol_arrayref(<<SQL);
 278     SELECT taxon_id FROM names
 279     WHERE
 280         name LIKE "$query"
 281 SQL
 282
 283     return wantarray() ? @{$taxids} : @{$taxids}[0];
 284 }
 285
 286 *get_taxonid = \&get_taxonids;
 287
 288 =head2 get_Children_Taxids
 289
 290  Title   : get_Children_Taxids
 291  Usage   : my @childrenids = $db->get_Children_Taxids
 292  Function: Get the ids of the children of a node in the taxonomy
 293  Returns : Array of Ids
 294  Args    : Bio::Taxon or a taxon_id
 295  Status  : deprecated (use each_Descendent())
 296
 297 =cut
 298
 299 sub get_Children_Taxids {
 300     my ( $self, $node ) = @_;
 301     $self->deprecated(); # ?
 302     #$self->warn(
 303     #    "get_Children_Taxids is deprecated, use each_Descendent instead");
 304     #my $id;
 305     #if ( ref($node) ) {
 306     #    if ( $node->can('object_id') ) {
 307     #        $id = $node->object_id;
 308     #    }
 309     #    elsif ( $node->can('ncbi_taxid') ) {
 310     #        $id = $node->ncbi_taxid;
 311     #    }
 312     #    else {
 313     #        $self->warn(
 314     #            "Don't know how to extract a taxon id from the object of type "
 315     #              . ref($node)
 316     #              . "\n" );
 317     #        return;
 318     #    }
 319     #}
 320     #else { $id = $node }
 321     #my @vals = $self->{'_parentbtree'}->get_dup($id);
 322     #return @vals;
 323 }
 324
 325 =head2 ancestor
 326
 327  Title   : ancestor
 328  Usage   : my $ancestor_taxon = $db->ancestor($taxon)
 329  Function: Retrieve the full ancestor taxon of a supplied Taxon from the
 330            database.
 331  Returns : Bio::Taxon
 332  Args    : Bio::Taxon (that was retrieved from this database)
 333
 334 =cut
 335
 336 sub ancestor {
 337     my ( $self, $taxon ) = @_;
 338     $self->throw("Must supply a Bio::Taxon")
 339       unless ref($taxon) && $taxon->isa('Bio::Taxon');
 340     $self->throw("The supplied Taxon must belong to this database")
 341       unless $taxon->db_handle && $taxon->db_handle eq $self;
 342     my $id =
 343       $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 344
 345     # TODO:
 346     # Note here we explicitly set the parent ID, but use a separate method to
 347     # check whether it is defined. Mixing back-end databases, even if from the
 348     # same source, should still work (since a different backend wouldn't
 349     # explicitly set the parent_id)
 350
 351     if (defined $taxon->trusted_parent_id) {
 352         return $self->get_taxon($taxon->parent_id);
 353     } else {
 354         # TODO: would there be any other option?
 355         return;
 356     }
 357 }
 358
 359 =head2 each_Descendent
 360
 361  Title   : each_Descendent
 362  Usage   : my @taxa = $db->each_Descendent($taxon);
 363  Function: Get all the descendents of the supplied Taxon (but not their
 364            descendents, ie. not a recursive fetchall).
 365  Returns : Array of Bio::Taxon objects
 366  Args    : Bio::Taxon (that was retrieved from this database)
 367
 368 =cut
 369
 370 sub each_Descendent {
 371     my ( $self, $taxon ) = @_;
 372     $self->throw("Must supply a Bio::Taxon")
 373       unless ref($taxon) && $taxon->isa('Bio::Taxon');
 374     $self->throw("The supplied Taxon must belong to this database")
 375       unless $taxon->db_handle && $taxon->db_handle eq $self;  # yikes
 376
 377     my $id =
 378       $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 379
 380     #my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class );
 381     # single join or two calls?
 382
 383     # probably not optimal, maybe set up as a cached statement with bindings?
 384     my $desc_ids = $self->{dbh}->selectcol_arrayref(<<SQL) or $self->throw($self->{dbh}->errstr);
 385     SELECT tax.taxon_id
 386     FROM taxon as tax
 387     WHERE
 388         tax.parent_id = $id
 389 SQL
 390
 391     return unless ref $desc_ids eq 'ARRAY';
 392
 393     my @descs;
 394     foreach my $desc_id (@$desc_ids) {
 395         push( @descs, $self->get_taxon($desc_id) || next );
 396     }
 397     return @descs;
 398 }
 399
 400 =head2 Helper methods
 401
 402 =cut
 403
 404 =head2 index_directory
 405
 406  Title   : index_directory
 407  Funtion : Get/set the location that index files are stored. (this module
 408            will index the supplied database)
 409  Usage   : $obj->index_directory($newval)
 410  Returns : value of index_directory (a scalar)
 411  Args    : on set, new value (a scalar or undef, optional)
 412  Note    : kept for backwards compatibility with older DB_File implementation
 413
 414 =cut
 415
 416 sub index_directory {
 417     my $self = shift;
 418     return $self->{'index_directory'} = shift if @_;
 419     return $self->{'index_directory'};
 420 }
 421
 422 =head2 db_name
 423
 424  Title   : db_name
 425  Funtion : Get/set the name of the SQLite3 database where data is stored
 426  Usage   : $obj->db_name($newval)
 427  Returns : value of db_name (a scalar)
 428  Args    : on set, new value (a scalar or undef, optional)
 429
 430 =cut
 431
 432 # TODO: this may need some disambiguation w/ index_directory above; for now we
 433 # assume this doesn't have a full path name (though I see no reason why this
 434 # shouldn't allow that)
 435
 436 sub db_name {
 437     my $self = shift;
 438     return $self->{'db_name'} = shift if @_;
 439     return $self->{'db_name'};
 440 }
 441
 442 =head2 cache_size
 443
 444  Title   : cache_size
 445  Funtion : Get/set the cachesize used for loading the SQLite3 database
 446  Usage   : $obj->cache_size($newval)
 447  Returns : value of cache_size (a scalar)
 448  Args    : on set, new value (a scalar or undef, optional)
 449  Note    : we do no checking on whether this value is an integer (SQLite does this for use)
 450
 451 =cut
 452
 453 sub cache_size {
 454     my $self = shift;
 455     return $self->{'cache_size'} = shift if defined($_[0]);
 456     return $self->{'cache_size'};
 457 }
 458
 459 # internal method which does the indexing
 460 sub _build_index {
 461     my ( $self, $nodesfile, $namesfile, $force ) = @_;
 462
 463     # TODO: need to disambiguate using index_directory here since we only have
 464     # one file.  Mayeb ignore it in favor of having full path for db_name?
 465     my ($dir, $db_name) = ($self->index_directory, $self->db_name);
 466
 467     # TODO: we're ignoring index_directory for now, may add support for this
 468     # down the way
 469     my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
 470
 471     $dbh->do('PRAGMA synchronous = 0');      # Non transaction safe!!!
 472
 473     if ($self->cache_size) {
 474         $dbh->do('PRAGMA cache_size = '.$self->cache_size)
 475     }
 476
 477     if (! -e $db_name || $force) {
 478
 479         $self->debug("Loading taxon table data\n");
 480         $self->_init_db($dbh);
 481         open my $NODES, '<', $nodesfile
 482             or $self->throw("Could not read node file '$nodesfile': $!");
 483
 484         # TODO: this has the really unnecessary 'OR IGNORE' option added,
 485         # apparently b.c the test data expects to handle cases where the TaxID
 486         # is repeated in this table (which should never happen in this table). I
 487         # will likely change this to throw under those circumstances
 488
 489         my $sth = $dbh->prepare_cached(<<SQL);
 490     INSERT OR IGNORE INTO taxon (taxon_id, parent_id, rank, code, division_id, gencode_id, mito_id) VALUES (?,?,?,?,?,?,?)
 491 SQL
 492         $dbh->do("BEGIN");
 493         while (<$NODES>) {
 494             next if /^\s*$/;
 495             chomp;
 496             my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
 497             next if $taxid == 1;
 498             if ($parent == 1) {
 499                 $parent = $taxid;
 500             }
 501
 502             $sth->execute($taxid, $parent, $rank, $code, $divid, $gen_code, $mito) or die $sth->errstr.": TaxID $taxid";
 503         }
 504         $dbh->do("COMMIT");
 505
 506         # TODO:index parent_id?
 507         close $NODES;
 508
 509         $self->debug("Loading name table data\n");
 510         open my $NAMES, '<', $namesfile
 511             or $self->throw("Could not read names file '$namesfile': $!");
 512
 513         my $sth = $dbh->prepare_cached(<<SQL) or $self->throw($dbh->errstr);
 514     INSERT INTO names (taxon_id, name, uniq_name, class) VALUES (?,?,?,?)
 515 SQL
 516         $dbh->do("BEGIN");
 517         while (<$NAMES>) {
 518             next if /^$/;
 519             chomp;
 520             my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
 521             # don't include the fake root node 'root' or 'all' with id 1
 522             next if $taxid == 1;
 523
 524             $class =~ s/\s+\|\s*$//;
 525             $sth->execute($taxid, $name, $unique_name, $class) or $self->throw($sth->errstr);
 526         }
 527         $dbh->do("COMMIT");
 528         $dbh->do("PRAGMA foreign_keys = ON");
 529         close $NAMES;
 530         $self->{dbh} = $dbh;
 531         $self->{'_initialized'} = 1;
 532     }
 533     1;
 534 }
 535
 536 # connect the internal db handle
 537 sub _db_connect {
 538     my $self = shift;
 539     return if $self->{'_initialized'};
 540
 541     my ($dir, $db_name) = ($self->index_directory, $self->db_name);
 542
 543     # TODO: we're ignoring index_directory for now, may add support for this
 544     # down the way
 545     $self->{dbh} = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
 546     $self->{'_initialized'} = 1;
 547 }
 548
 549 sub _init_db {
 550     my ($self, $dbh) = @_;
 551     my $schema = $self->taxon_schema();
 552     # TODO: set up handler parameters here
 553     for my $table (sort keys %$schema) {
 554         $dbh->do("DROP TABLE IF EXISTS $table") or $self->throw($dbh->errstr);
 555         $dbh->do("CREATE TABLE $table ".$schema->{$table}) or $self->throw($dbh->errstr);
 556     }
 557     1;
 558 }
 559
 560 sub _dbh_fetch {
 561     my ($self, $sql) = @_;
 562     # TODO: more sanity checks
 563     my $rows = $self->{dbh}->selectrow_arrayref($sql) or $self->throw( $self->{dbh}->errstr );
 564     return $rows;
 565 }
 566
 567 sub _prepare_cached {
 568     my ($self, $sql) = @_;
 569     # TODO: more sanity checks
 570     my $sth = $self->{dbh}->prepare_cached($sql) or $self->throw( $self->{dbh}->errstr );
 571     $sth;
 572 }
 573
 574
 575 # TODO: check data size, this is a ballpark estimate (could be reduced)
 576 sub taxon_schema {
 577     my $self   = shift;
 578     return {
 579     taxon   => <<SCHEMA,
 580     (
 581         taxon_id        INTEGER PRIMARY KEY NOT NULL,
 582         parent_id       INTEGER,
 583         left_id         INTEGER,
 584         right_id        INTEGER,
 585         rank            VARCHAR(25),
 586         code            VARCHAR(5),
 587         division_id     INTEGER,
 588         gencode_id      INTEGER,
 589         mito_id         INTEGER,
 590         FOREIGN KEY(parent_id) REFERENCES taxon(taxon_id)
 591     )
 592 SCHEMA
 593
 594     names   => <<SCHEMA,
 595     (
 596         name_id         INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
 597         taxon_id        INTEGER,
 598         name            VARCHAR(50),
 599         uniq_name       VARCHAR(50),
 600         class           VARCHAR(25),
 601         FOREIGN KEY(taxon_id) REFERENCES taxon(taxon_id)
 602     )
 603 SCHEMA
 604     };
 605 }
 606
 607 sub DESTROY {
 608     my $self = shift;
 609     undef $self->{dbh};
 610 }
 611
 612 1;
 613