2 # BioPerl module for Bio::DB::Taxonomy::flatfile
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::DB::Taxonomy::flatfile - Use the NCBI taxonomy from local indexed flat files
20 use Bio::DB::Taxonomy;
22 my $db = Bio::DB::Taxonomy->new(-source => 'flatfile' ,
23 -nodesfile => 'nodes.dmp',
24 -namesfile => 'names.dmp');
28 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
29 NCBI taxonomy using flat files stored locally on disk and indexed using the
30 DB_File module RECNO data structure for fast retrieval.
32 The required database files, nodes.dmp and names.dmp can be obtained from
33 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
63 https://redmine.open-bio.org/projects/bioperl/
65 =head1 AUTHOR - Jason Stajich
67 Email jason-at-bioperl-dot-org
71 Sendu Bala: bix@sendu.me.uk
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
80 # Let the code begin...
82 package Bio
::DB
::Taxonomy
::flatfile
;
84 use vars
qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX
85 $DEFAULT_ID2NAME_INDEX $DEFAULT_PARENT_INDEX @DIVISIONS);
90 use File::Spec::Functions;
92 use constant SEPARATOR => ':';
94 $DEFAULT_INDEX_DIR = $Bio::Root::IO::TEMPDIR; # /tmp
95 $DEFAULT_NODE_INDEX = 'nodes';
96 $DEFAULT_NAME2ID_INDEX = 'names2id';
97 $DEFAULT_ID2NAME_INDEX = 'id2names';
98 $DEFAULT_PARENT_INDEX = 'parents';
100 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
102 @DIVISIONS = ([qw(BCT Bacteria)],
103 [qw(INV Invertebrates)],
106 [qw(PLN Plants)], # (and fungi)
110 [qw(UNA Unassigned)],
112 [qw(VRT Vertebrates)],
113 [qw(ENV 'Environmental samples')]);
115 use base
qw(Bio::DB::Taxonomy);
120 Usage : my $obj = Bio::DB::Taxonomy::flatfile->new();
121 Function: Builds a new Bio::DB::Taxonomy::flatfile object
122 Returns : an instance of Bio::DB::Taxonomy::flatfile
123 Args : -directory => name of directory where index files should be created
124 -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
125 -namesfile => name of the file containing names(names.dmp from NCBI)
126 -force => 1 to replace current indexes even if they exist
131 my($class, @args) = @_;
133 my $self = $class->SUPER::new
(@args);
134 my ($dir,$nodesfile,$namesfile,$force) =
135 $self->_rearrange([qw(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
137 $self->index_directory($dir || $DEFAULT_INDEX_DIR);
139 $self->_build_index($nodesfile,$namesfile,$force);
147 =head2 Bio::DB::Taxonomy interface implementation
152 Usage : my $num = $db->get_num_taxa();
153 Function: Get the number of taxa stored in the database.
161 if (not exists $self->{_num_taxa
}) {
163 while ( my ($parent, undef) = each %{$self->{_parent2children
}} ) {
166 $self->{_num_taxa
} = $num;
168 return $self->{_num_taxa
};
175 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
176 Function: Get a Bio::Taxon object from the database.
177 Returns : Bio::Taxon object
178 Args : just a single value which is the database id, OR named args:
179 -taxonid => taxonomy id (to query by taxonid)
181 -name => string (to query by a taxonomy name: common name,
182 scientific name, etc)
188 my ($taxonid, $name);
191 ($taxonid, $name) = $self->_rearrange([qw(TAXONID NAME)],@_);
193 ($taxonid, my @others) = $self->get_taxonids($name);
194 $self->warn("There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'") if @others > 0;
201 return unless $taxonid;
203 $taxonid =~ /^\d+$/ || return;
204 my $node = $self->{'_nodes'}->[$taxonid] || return;
205 length($node) || return;
206 my ($taxid, undef, $rank, $code, $divid, $gen_code, $mito) = split(SEPARATOR
,$node);
207 last unless defined $taxid;
208 my ($taxon_names) = $self->{'_id2name'}->[$taxid];
209 my ($sci_name, @common_names) = split(SEPARATOR
, $taxon_names);
211 my $taxon = Bio
::Taxon
->new(
213 -common_names
=> [@common_names],
214 -ncbi_taxid
=> $taxid, # since this is a real ncbi taxid, explicitly set it as one
216 -division
=> $DIVISIONS[$divid]->[1],
217 -genetic_code
=> $gen_code,
218 -mito_genetic_code
=> $mito );
219 # we can't use -dbh or the db_handle() method ourselves or we'll go
220 # infinite on the merge attempt
221 $taxon->{'db_handle'} = $self;
223 $self->_handle_internal_id($taxon);
228 *get_Taxonomy_Node
= \
&get_taxon
;
234 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
235 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
236 string. Note that multiple taxonids can match to the same supplied
238 Returns : array of integer ids in list context, one of these in scalar context
239 Args : string representing taxon's name
244 my ($self, $query) = @_;
245 my $ids = $self->{'_name2id'}->{lc($query)};
248 # try again converting underscores to spaces
250 $ids = $self->{'_name2id'}->{lc($query)};
254 my @ids = split(SEPARATOR
, $ids);
255 return wantarray() ?
@ids : shift @ids;
258 *get_taxonid
= \
&get_taxonids
;
261 =head2 get_Children_Taxids
263 Title : get_Children_Taxids
264 Usage : my @childrenids = $db->get_Children_Taxids
265 Function: Get the ids of the children of a node in the taxonomy
266 Returns : Array of Ids
267 Args : Bio::Taxon or a taxon_id
268 Status : deprecated (use each_Descendent())
272 sub get_Children_Taxids
{
273 my ($self, $node) = @_;
274 $self->warn("get_Children_Taxids is deprecated, use each_Descendent instead");
277 if( $node->can('object_id') ) {
278 $id = $node->object_id;
279 } elsif( $node->can('ncbi_taxid') ) {
280 $id = $node->ncbi_taxid;
282 $self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
285 } else { $id = $node }
286 my @vals = $self->{'_parentbtree'}->get_dup($id);
294 Usage : my $ancestor_taxon = $db->ancestor($taxon)
295 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
298 Args : Bio::Taxon (that was retrieved from this database)
303 my ($self, $taxon) = @_;
304 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
305 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
306 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
308 my $node = $self->{'_nodes'}->[$id];
310 my (undef, $parent_id) = split(SEPARATOR
,$node);
311 $parent_id || return;
312 $parent_id eq $id && return; # one of the roots
313 return $self->get_taxon($parent_id);
319 =head2 each_Descendent
321 Title : each_Descendent
322 Usage : my @taxa = $db->each_Descendent($taxon);
323 Function: Get all the descendents of the supplied Taxon (but not their
324 descendents, ie. not a recursive fetchall).
325 Returns : Array of Bio::Taxon objects
326 Args : Bio::Taxon (that was retrieved from this database)
330 sub each_Descendent
{
331 my ($self, $taxon) = @_;
332 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
333 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
334 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
336 my @desc_ids = $self->{'_parentbtree'}->get_dup($id);
338 foreach my $desc_id (@desc_ids) {
339 push(@descs, $self->get_taxon($desc_id) || next);
345 =head2 Helper methods
349 # internal method which does the indexing
351 my ($self, $nodesfile, $namesfile, $force) = @_;
353 my $dir = $self->index_directory;
354 my $nodeindex = catfile
($dir, $DEFAULT_NODE_INDEX);
355 my $name2idindex = catfile
($dir, $DEFAULT_NAME2ID_INDEX);
356 my $id2nameindex = catfile
($dir, $DEFAULT_ID2NAME_INDEX);
357 my $parent2childindex = catfile
($dir, $DEFAULT_PARENT_INDEX);
358 $self->{'_nodes'} = [];
359 $self->{'_id2name'} = [];
360 $self->{'_name2id'} = {};
361 $self->{'_parent2children'} = {};
363 if (! -e
$nodeindex || $force) {
364 my (%parent2children,@nodes);
365 open my $NODES, '<', $nodesfile
366 or $self->throw("Could not read node file '$nodesfile': $!");
369 unlink $parent2childindex;
370 my $nh = tie
( @nodes, 'DB_File', $nodeindex, O_RDWR
|O_CREAT
, 0644, $DB_RECNO) ||
371 $self->throw("Cannot open file '$nodeindex': $!");
372 my $btree = tie
( %parent2children, 'DB_File', $parent2childindex, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) ||
373 $self->throw("Cannot tie to file '$parent2childindex': $!");
378 my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
379 # don't include the fake root node 'root' with id 1; we essentially have multiple roots here
385 # keep this stringified
386 $nodes[$taxid] = join(SEPARATOR
, ($taxid,$parent,$rank,$code,$divid,$gen_code,$mito));
387 $btree->put($parent,$taxid);
391 $nh = $btree = undef;
393 untie %parent2children;
396 if ((! -e
$name2idindex || -z
$name2idindex) || (! -e
$id2nameindex || -z
$id2nameindex) || $force) {
397 open my $NAMES, '<', $namesfile
398 or $self->throw("Could not read names file '$namesfile': $!");
400 unlink $name2idindex;
401 unlink $id2nameindex;
402 my (@id2name,%name2id);
403 my $idh = tie
(@id2name, 'DB_File', $id2nameindex, O_RDWR
|O_CREAT
, 0644, $DB_RECNO) ||
404 $self->throw("Cannot tie to file '$id2nameindex': $!");
405 my $nameh = tie
( %name2id, 'DB_File', $name2idindex, O_RDWR
|O_CREAT
, 0644, $DB_HASH) ||
406 $self->throw("Cannot tie to file '$name2idindex': $!");
411 my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
412 # don't include the fake root node 'root' or 'all' with id 1
415 $class =~ s/\s+\|\s*$//;
416 my $lc_name = lc($name);
417 my $orig_name = $name;
419 # unique names aren't always in the correct column, sometimes they
420 # are uniqued by adding bracketed rank names to the normal name;
421 # store the uniqued version then fix the name for normal use
422 if ($lc_name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
423 $name2id{$lc_name} = $taxid;
424 $name =~ s/\s+\(class\)$//;
425 $lc_name = lc($name);
428 # handle normal names which aren't necessarily unique
429 my $taxids = $name2id{$lc_name} || '';
430 my %taxids = map { $_ => 1 } split(SEPARATOR
, $taxids);
431 unless (exists $taxids{$taxid}) {
433 $name2id{$lc_name} = join(SEPARATOR
, keys %taxids);
436 # store unique names in name2id
438 $name2id{lc($unique_name)} = $taxid;
441 # store all names in id2name array
442 my $names = $id2name[$taxid] || '';
443 my @names = split(SEPARATOR
, $names);
444 if ($class && $class eq 'scientific name') {
445 # the scientific name should be the first name stored
446 unshift(@names, $name);
447 push(@names, $orig_name) if ($orig_name ne $name);
448 push(@names, $unique_name) if $unique_name;
451 # all other ('common' in this simplification) names get added after
453 push(@names, $orig_name) if ($orig_name ne $name);
454 push(@names, $unique_name) if $unique_name;
456 $id2name[$taxid] = join(SEPARATOR
, @names);
460 $idh = $nameh = undef;
467 # connect the internal db handle
470 return if $self->{'_initialized'};
472 my $dir = $self->index_directory;
473 my $nodeindex = catfile
($dir, $DEFAULT_NODE_INDEX);
474 my $name2idindex = catfile
($dir, $DEFAULT_NAME2ID_INDEX);
475 my $id2nameindex = catfile
($dir, $DEFAULT_ID2NAME_INDEX);
476 my $parent2childindex = catfile
($dir, $DEFAULT_PARENT_INDEX);
477 $self->{'_nodes'} = [];
478 $self->{'_id2name'} = [];
479 $self->{'_name2id'} = {};
480 $self->{'_parent2children'} = {};
482 if( ! -e
$nodeindex ||
483 ! -e
$name2idindex ||
484 ! -e
$id2nameindex ) {
485 $self->warn("Index files have not been created");
488 tie
( @
{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDWR
,undef, $DB_RECNO)
489 || $self->throw("$! $nodeindex");
490 tie
(@
{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDWR
, undef,
491 $DB_RECNO) || $self->throw("$! $id2nameindex");
493 tie
( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDWR
,undef,
494 $DB_HASH) || $self->throw("$! $name2idindex");
495 $self->{'_parentbtree'} = tie
( %{$self->{'_parent2children'}},
496 'DB_File', $parent2childindex,
497 O_RDWR
, 0644, $DB_BTREE);
499 $self->{'_initialized'} = 1;
503 =head2 index_directory
505 Title : index_directory
506 Funtion : Get/set the location that index files are stored. (this module
507 will index the supplied database)
508 Usage : $obj->index_directory($newval)
509 Returns : value of index_directory (a scalar)
510 Args : on set, new value (a scalar or undef, optional)
515 sub index_directory
{
517 return $self->{'index_directory'} = shift if @_;
518 return $self->{'index_directory'};
524 # Destroy all filehandle references
525 # to be able to remove temporary files
526 undef $self->{_id2name
};
527 undef $self->{_name2id
};
528 undef $self->{_nodes
};
529 undef $self->{_parent2children
};
530 undef $self->{_parentbtree
};
531 unlink catfile
($self->{index_directory
},'id2names');
532 unlink catfile
($self->{index_directory
},'names2id');
533 unlink catfile
($self->{index_directory
},'nodes');
534 unlink catfile
($self->{index_directory
},'parents');