Bio/DB/Taxonomy/flatfile.pm

   1 #
   2 # BioPerl module for Bio::DB::Taxonomy::flatfile
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
   7 #
   8 # Copyright Jason Stajich
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::DB::Taxonomy::flatfile - Use the NCBI taxonomy from local indexed flat files
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::DB::Taxonomy;
  21
  22   my $db = Bio::DB::Taxonomy->new(-source    => 'flatfile' ,
  23                                   -nodesfile => 'nodes.dmp',
  24                                   -namesfile => 'names.dmp');
  25
  26 =head1 DESCRIPTION
  27
  28 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
  29 NCBI taxonomy using flat files stored locally on disk and indexed using the
  30 DB_File module RECNO data structure for fast retrieval.
  31
  32 The required database files, nodes.dmp and names.dmp can be obtained from
  33 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
  34
  35 =head1 FEEDBACK
  36
  37 =head2 Mailing Lists
  38
  39 User feedback is an integral part of the evolution of this and other
  40 Bioperl modules. Send your comments and suggestions preferably to
  41 the Bioperl mailing list.  Your participation is much appreciated.
  42
  43   bioperl-l@bioperl.org                  - General discussion
  44   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  45
  46 =head2 Support
  47
  48 Please direct usage questions or support issues to the mailing list:
  49
  50 I<bioperl-l@bioperl.org>
  51
  52 rather than to the module maintainer directly. Many experienced and
  53 reponsive experts will be able look at the problem and quickly
  54 address it. Please include a thorough description of the problem
  55 with code and data examples if at all possible.
  56
  57 =head2 Reporting Bugs
  58
  59 Report bugs to the Bioperl bug tracking system to help us keep track
  60 of the bugs and their resolution. Bug reports can be submitted via
  61 the web:
  62
  63   https://redmine.open-bio.org/projects/bioperl/
  64
  65 =head1 AUTHOR - Jason Stajich
  66
  67 Email jason-at-bioperl-dot-org
  68
  69 =head1 CONTRIBUTORS
  70
  71 Sendu Bala: bix@sendu.me.uk
  72
  73 =head1 APPENDIX
  74
  75 The rest of the documentation details each of the object methods.
  76 Internal methods are usually preceded with a _
  77
  78 =cut
  79
  80 # Let the code begin...
  81
  82 package Bio::DB::Taxonomy::flatfile;
  83
  84 use vars qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX
  85             $DEFAULT_ID2NAME_INDEX $DEFAULT_PARENT_INDEX @DIVISIONS);
  86
  87 use strict;
  88 use DB_File;
  89 use Bio::Taxon;
  90 use File::Spec::Functions;
  91
  92 use constant SEPARATOR => ':';
  93
  94 $DEFAULT_INDEX_DIR     = $Bio::Root::IO::TEMPDIR; # /tmp
  95 $DEFAULT_NODE_INDEX    = 'nodes';
  96 $DEFAULT_NAME2ID_INDEX = 'names2id';
  97 $DEFAULT_ID2NAME_INDEX = 'id2names';
  98 $DEFAULT_PARENT_INDEX  = 'parents';
  99
 100 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
 101
 102 @DIVISIONS =   ([qw(BCT Bacteria)],
 103                 [qw(INV Invertebrates)],
 104                 [qw(MAM Mammals)],
 105                 [qw(PHG Phages)],
 106                 [qw(PLN Plants)], # (and fungi)
 107                 [qw(PRI Primates)],
 108                 [qw(ROD Rodents)],
 109                 [qw(SYN Synthetic)],
 110                 [qw(UNA Unassigned)],
 111                 [qw(VRL Viruses)],
 112                 [qw(VRT Vertebrates)],
 113                 [qw(ENV 'Environmental samples')]);
 114
 115 use base qw(Bio::DB::Taxonomy);
 116
 117 =head2 new
 118
 119  Title   : new
 120  Usage   : my $obj = Bio::DB::Taxonomy::flatfile->new();
 121  Function: Builds a new Bio::DB::Taxonomy::flatfile object
 122  Returns : an instance of Bio::DB::Taxonomy::flatfile
 123  Args    : -directory => name of directory where index files should be created
 124            -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
 125            -namesfile => name of the file containing names(names.dmp from NCBI)
 126            -force     => 1 to replace current indexes even if they exist
 127
 128 =cut
 129
 130 sub new {
 131   my($class, @args) = @_;
 132
 133   my $self = $class->SUPER::new(@args);
 134   my ($dir,$nodesfile,$namesfile,$force) =
 135       $self->_rearrange([qw(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
 136
 137   $self->index_directory($dir || $DEFAULT_INDEX_DIR);
 138   if ( $nodesfile ) {
 139           $self->_build_index($nodesfile,$namesfile,$force);
 140   }
 141
 142   $self->_db_connect;
 143   return $self;
 144 }
 145
 146
 147 =head2 Bio::DB::Taxonomy interface implementation
 148
 149 =head2 get_num_taxa
 150
 151  Title   : get_num_taxa
 152  Usage   : my $num = $db->get_num_taxa();
 153  Function: Get the number of taxa stored in the database.
 154  Returns : A number
 155  Args    : None
 156
 157 =cut
 158
 159 sub get_num_taxa {
 160     my ($self) = @_;
 161     if (not exists $self->{_num_taxa}) {
 162         my $num = 0;
 163         while ( my ($parent, undef) = each %{$self->{_parent2children}} ) {
 164            $num++;
 165         }
 166         $self->{_num_taxa} = $num;
 167     }
 168     return $self->{_num_taxa};
 169 }
 170
 171
 172 =head2 get_taxon
 173
 174  Title   : get_taxon
 175  Usage   : my $taxon = $db->get_taxon(-taxonid => $taxonid)
 176  Function: Get a Bio::Taxon object from the database.
 177  Returns : Bio::Taxon object
 178  Args    : just a single value which is the database id, OR named args:
 179            -taxonid => taxonomy id (to query by taxonid)
 180             OR
 181            -name    => string (to query by a taxonomy name: common name,
 182                                scientific name, etc)
 183
 184 =cut
 185
 186 sub get_taxon {
 187     my ($self) = shift;
 188     my ($taxonid, $name);
 189
 190     if (@_ > 1) {
 191         ($taxonid, $name) = $self->_rearrange([qw(TAXONID NAME)],@_);
 192         if ($name) {
 193             ($taxonid, my @others) = $self->get_taxonids($name);
 194             $self->warn("There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'") if @others > 0;
 195         }
 196     }
 197     else {
 198         $taxonid = shift;
 199     }
 200
 201     return unless $taxonid;
 202
 203     $taxonid =~ /^\d+$/ || return;
 204     my $node = $self->{'_nodes'}->[$taxonid] || return;
 205     length($node) || return;
 206     my ($taxid, undef, $rank, $code, $divid, $gen_code, $mito) = split(SEPARATOR,$node);
 207     last unless defined $taxid;
 208     my ($taxon_names) = $self->{'_id2name'}->[$taxid];
 209     my ($sci_name, @common_names) = split(SEPARATOR, $taxon_names);
 210
 211     my $taxon = Bio::Taxon->new(
 212                         -name         => $sci_name,
 213                         -common_names => [@common_names],
 214                         -ncbi_taxid   => $taxid, # since this is a real ncbi taxid, explicitly set it as one
 215                         -rank         => $rank,
 216                         -division     => $DIVISIONS[$divid]->[1],
 217                         -genetic_code => $gen_code,
 218                         -mito_genetic_code => $mito );
 219     # we can't use -dbh or the db_handle() method ourselves or we'll go
 220     # infinite on the merge attempt
 221     $taxon->{'db_handle'} = $self;
 222
 223     $self->_handle_internal_id($taxon);
 224
 225     return $taxon;
 226 }
 227
 228 *get_Taxonomy_Node = \&get_taxon;
 229
 230
 231 =head2 get_taxonids
 232
 233  Title   : get_taxonids
 234  Usage   : my @taxonids = $db->get_taxonids('Homo sapiens');
 235  Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
 236            string. Note that multiple taxonids can match to the same supplied
 237            name.
 238  Returns : array of integer ids in list context, one of these in scalar context
 239  Args    : string representing taxon's name
 240
 241 =cut
 242
 243 sub get_taxonids {
 244     my ($self, $query) = @_;
 245     my $ids = $self->{'_name2id'}->{lc($query)};
 246     unless ($ids) {
 247         if ($query =~ /_/) {
 248             # try again converting underscores to spaces
 249             $query =~ s/_/ /g;
 250             $ids = $self->{'_name2id'}->{lc($query)};
 251         }
 252         $ids || return;
 253     }
 254     my @ids = split(SEPARATOR, $ids);
 255     return wantarray() ? @ids : shift @ids;
 256 }
 257
 258 *get_taxonid = \&get_taxonids;
 259
 260
 261 =head2 get_Children_Taxids
 262
 263  Title   : get_Children_Taxids
 264  Usage   : my @childrenids = $db->get_Children_Taxids
 265  Function: Get the ids of the children of a node in the taxonomy
 266  Returns : Array of Ids
 267  Args    : Bio::Taxon or a taxon_id
 268  Status  : deprecated (use each_Descendent())
 269
 270 =cut
 271
 272 sub get_Children_Taxids {
 273    my ($self, $node) = @_;
 274    $self->warn("get_Children_Taxids is deprecated, use each_Descendent instead");
 275    my $id;
 276    if( ref($node) ) {
 277        if( $node->can('object_id') ) {
 278            $id = $node->object_id;
 279        } elsif( $node->can('ncbi_taxid') ) {
 280            $id = $node->ncbi_taxid;
 281        } else {
 282            $self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
 283            return;
 284        }
 285    } else { $id = $node }
 286    my @vals = $self->{'_parentbtree'}->get_dup($id);
 287    return @vals;
 288 }
 289
 290
 291 =head2 ancestor
 292
 293  Title   : ancestor
 294  Usage   : my $ancestor_taxon = $db->ancestor($taxon)
 295  Function: Retrieve the full ancestor taxon of a supplied Taxon from the
 296            database.
 297  Returns : Bio::Taxon
 298  Args    : Bio::Taxon (that was retrieved from this database)
 299
 300 =cut
 301
 302 sub ancestor {
 303     my ($self, $taxon) = @_;
 304     $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
 305     $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
 306     my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 307
 308     my $node = $self->{'_nodes'}->[$id];
 309     if (length($node)) {
 310         my (undef, $parent_id) = split(SEPARATOR,$node);
 311         $parent_id || return;
 312         $parent_id eq $id && return; # one of the roots
 313         return $self->get_taxon($parent_id);
 314     }
 315     return;
 316 }
 317
 318
 319 =head2 each_Descendent
 320
 321  Title   : each_Descendent
 322  Usage   : my @taxa = $db->each_Descendent($taxon);
 323  Function: Get all the descendents of the supplied Taxon (but not their
 324            descendents, ie. not a recursive fetchall).
 325  Returns : Array of Bio::Taxon objects
 326  Args    : Bio::Taxon (that was retrieved from this database)
 327
 328 =cut
 329
 330 sub each_Descendent {
 331     my ($self, $taxon) = @_;
 332     $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
 333     $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
 334     my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 335
 336     my @desc_ids = $self->{'_parentbtree'}->get_dup($id);
 337     my @descs;
 338     foreach my $desc_id (@desc_ids) {
 339         push(@descs, $self->get_taxon($desc_id) || next);
 340     }
 341     return @descs;
 342 }
 343
 344
 345 =head2 Helper methods
 346
 347 =cut
 348
 349 # internal method which does the indexing
 350 sub _build_index {
 351     my ($self, $nodesfile, $namesfile, $force) = @_;
 352
 353     my $dir = $self->index_directory;
 354     my $nodeindex         = catfile($dir, $DEFAULT_NODE_INDEX);
 355     my $name2idindex      = catfile($dir, $DEFAULT_NAME2ID_INDEX);
 356     my $id2nameindex      = catfile($dir, $DEFAULT_ID2NAME_INDEX);
 357     my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
 358     $self->{'_nodes'}           = [];
 359     $self->{'_id2name'}         = [];
 360     $self->{'_name2id'}         = {};
 361     $self->{'_parent2children'} = {};
 362
 363     if (! -e $nodeindex || $force) {
 364         my (%parent2children,@nodes);
 365         open my $NODES, '<', $nodesfile
 366             or $self->throw("Could not read node file '$nodesfile': $!");
 367
 368         unlink $nodeindex;
 369         unlink $parent2childindex;
 370         my $nh = tie ( @nodes, 'DB_File', $nodeindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
 371             $self->throw("Cannot open file '$nodeindex': $!");
 372         my $btree = tie( %parent2children, 'DB_File', $parent2childindex, O_RDWR|O_CREAT, 0644, $DB_BTREE) ||
 373             $self->throw("Cannot tie to file '$parent2childindex': $!");
 374
 375         while (<$NODES>) {
 376             next if /^$/;
 377             chomp;
 378             my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
 379             # don't include the fake root node 'root' with id 1; we essentially have multiple roots here
 380             next if $taxid == 1;
 381             if ($parent == 1) {
 382                 $parent = $taxid;
 383             }
 384
 385             # keep this stringified
 386             $nodes[$taxid] = join(SEPARATOR, ($taxid,$parent,$rank,$code,$divid,$gen_code,$mito));
 387             $btree->put($parent,$taxid);
 388         }
 389         close $NODES;
 390
 391         $nh = $btree = undef;
 392         untie @nodes ;
 393         untie %parent2children;
 394     }
 395
 396     if ((! -e $name2idindex || -z $name2idindex) || (! -e $id2nameindex || -z $id2nameindex) || $force) {
 397         open my $NAMES, '<', $namesfile
 398             or $self->throw("Could not read names file '$namesfile': $!");
 399
 400         unlink $name2idindex;
 401         unlink $id2nameindex;
 402         my (@id2name,%name2id);
 403         my $idh = tie (@id2name, 'DB_File', $id2nameindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
 404             $self->throw("Cannot tie to file '$id2nameindex': $!");
 405         my $nameh = tie ( %name2id, 'DB_File', $name2idindex, O_RDWR|O_CREAT, 0644, $DB_HASH) ||
 406             $self->throw("Cannot tie to file '$name2idindex': $!");
 407
 408         while (<$NAMES>) {
 409             next if /^$/;
 410             chomp;
 411             my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
 412             # don't include the fake root node 'root' or 'all' with id 1
 413             next if $taxid == 1;
 414
 415             $class =~ s/\s+\|\s*$//;
 416             my $lc_name = lc($name);
 417             my $orig_name = $name;
 418
 419             # unique names aren't always in the correct column, sometimes they
 420             # are uniqued by adding bracketed rank names to the normal name;
 421             # store the uniqued version then fix the name for normal use
 422             if ($lc_name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
 423                 $name2id{$lc_name} = $taxid;
 424                 $name =~ s/\s+\(class\)$//;
 425                 $lc_name = lc($name);
 426             }
 427
 428             # handle normal names which aren't necessarily unique
 429             my $taxids = $name2id{$lc_name} || '';
 430             my %taxids = map { $_ => 1 } split(SEPARATOR, $taxids);
 431             unless (exists $taxids{$taxid}) {
 432                 $taxids{$taxid} = 1;
 433                 $name2id{$lc_name} = join(SEPARATOR, keys %taxids);
 434             }
 435
 436             # store unique names in name2id
 437             if ($unique_name) {
 438                 $name2id{lc($unique_name)} = $taxid;
 439             }
 440
 441             # store all names in id2name array
 442             my $names = $id2name[$taxid] || '';
 443             my @names = split(SEPARATOR, $names);
 444             if ($class && $class eq 'scientific name') {
 445                 # the scientific name should be the first name stored
 446                 unshift(@names, $name);
 447                 push(@names, $orig_name) if ($orig_name ne $name);
 448                 push(@names, $unique_name) if $unique_name;
 449             }
 450             else {
 451                 # all other ('common' in this simplification) names get added after
 452                 push(@names, $name);
 453                 push(@names, $orig_name) if ($orig_name ne $name);
 454                 push(@names, $unique_name) if $unique_name;
 455             }
 456             $id2name[$taxid] = join(SEPARATOR, @names);
 457         }
 458         close $NAMES;
 459
 460         $idh = $nameh = undef;
 461         untie( %name2id);
 462         untie( @id2name);
 463     }
 464 }
 465
 466
 467 # connect the internal db handle
 468 sub _db_connect {
 469     my $self = shift;
 470     return if $self->{'_initialized'};
 471
 472     my $dir = $self->index_directory;
 473     my $nodeindex         = catfile($dir, $DEFAULT_NODE_INDEX);
 474     my $name2idindex      = catfile($dir, $DEFAULT_NAME2ID_INDEX);
 475     my $id2nameindex      = catfile($dir, $DEFAULT_ID2NAME_INDEX);
 476     my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
 477     $self->{'_nodes'}           = [];
 478     $self->{'_id2name'}         = [];
 479     $self->{'_name2id'}         = {};
 480     $self->{'_parent2children'} = {};
 481
 482     if( ! -e $nodeindex ||
 483         ! -e $name2idindex ||
 484         ! -e $id2nameindex ) {
 485         $self->warn("Index files have not been created");
 486         return 0;
 487     }
 488     tie ( @{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDWR,undef, $DB_RECNO)
 489         || $self->throw("$! $nodeindex");
 490     tie (@{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDWR, undef,
 491         $DB_RECNO) || $self->throw("$! $id2nameindex");
 492
 493     tie ( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDWR,undef,
 494         $DB_HASH) || $self->throw("$! $name2idindex");
 495     $self->{'_parentbtree'} = tie( %{$self->{'_parent2children'}},
 496                                    'DB_File', $parent2childindex,
 497                                    O_RDWR, 0644, $DB_BTREE);
 498
 499     $self->{'_initialized'} = 1;
 500 }
 501
 502
 503 =head2 index_directory
 504
 505  Title   : index_directory
 506  Funtion : Get/set the location that index files are stored. (this module
 507            will index the supplied database)
 508  Usage   : $obj->index_directory($newval)
 509  Returns : value of index_directory (a scalar)
 510  Args    : on set, new value (a scalar or undef, optional)
 511
 512 =cut
 513
 514
 515 sub index_directory {
 516     my $self = shift;
 517     return $self->{'index_directory'} = shift if @_;
 518     return $self->{'index_directory'};
 519 }
 520
 521
 522 sub DESTROY {
 523     my $self = shift;
 524     # Destroy all filehandle references
 525     # to be able to remove temporary files
 526     undef $self->{_id2name};
 527     undef $self->{_name2id};
 528     undef $self->{_nodes};
 529     undef $self->{_parent2children};
 530     undef $self->{_parentbtree};
 531     unlink catfile($self->{index_directory},'id2names');
 532     unlink catfile($self->{index_directory},'names2id');
 533     unlink catfile($self->{index_directory},'nodes');
 534     unlink catfile($self->{index_directory},'parents');
 535 }
 536
 537 1;