Massive check of file open lines. Changed bareword filehandles
[bioperl-live.git] / Bio / DB / Taxonomy / flatfile.pm
blob6ba979c51e6b8a14da3055f953e0d89d8ecc82b2
2 # BioPerl module for Bio::DB::Taxonomy::flatfile
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::DB::Taxonomy::flatfile - Use the NCBI taxonomy from local indexed flat files
18 =head1 SYNOPSIS
20 use Bio::DB::Taxonomy;
22 my $db = Bio::DB::Taxonomy->new(-source => 'flatfile' ,
23 -nodesfile => 'nodes.dmp',
24 -namesfile => 'names.dmp');
26 =head1 DESCRIPTION
28 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
29 NCBI taxonomy using flat files stored locally on disk and indexed using the
30 DB_File module RECNO data structure for fast retrieval.
32 The required database files, nodes.dmp and names.dmp can be obtained from
33 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
35 =head1 FEEDBACK
37 =head2 Mailing Lists
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
46 =head2 Support
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
57 =head2 Reporting Bugs
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
61 the web:
63 https://redmine.open-bio.org/projects/bioperl/
65 =head1 AUTHOR - Jason Stajich
67 Email jason-at-bioperl-dot-org
69 =head1 CONTRIBUTORS
71 Sendu Bala: bix@sendu.me.uk
73 =head1 APPENDIX
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
78 =cut
80 # Let the code begin...
82 package Bio::DB::Taxonomy::flatfile;
84 use vars qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX
85 $DEFAULT_ID2NAME_INDEX $DEFAULT_PARENT_INDEX @DIVISIONS);
87 use strict;
88 use DB_File;
89 use Bio::Taxon;
90 use File::Spec::Functions;
92 use constant SEPARATOR => ':';
94 $DEFAULT_INDEX_DIR = $Bio::Root::IO::TEMPDIR; # /tmp
95 $DEFAULT_NODE_INDEX = 'nodes';
96 $DEFAULT_NAME2ID_INDEX = 'names2id';
97 $DEFAULT_ID2NAME_INDEX = 'id2names';
98 $DEFAULT_PARENT_INDEX = 'parents';
100 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
102 @DIVISIONS = ([qw(BCT Bacteria)],
103 [qw(INV Invertebrates)],
104 [qw(MAM Mammals)],
105 [qw(PHG Phages)],
106 [qw(PLN Plants)], # (and fungi)
107 [qw(PRI Primates)],
108 [qw(ROD Rodents)],
109 [qw(SYN Synthetic)],
110 [qw(UNA Unassigned)],
111 [qw(VRL Viruses)],
112 [qw(VRT Vertebrates)],
113 [qw(ENV 'Environmental samples')]);
115 use base qw(Bio::DB::Taxonomy);
117 =head2 new
119 Title : new
120 Usage : my $obj = Bio::DB::Taxonomy::flatfile->new();
121 Function: Builds a new Bio::DB::Taxonomy::flatfile object
122 Returns : an instance of Bio::DB::Taxonomy::flatfile
123 Args : -directory => name of directory where index files should be created
124 -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
125 -namesfile => name of the file containing names(names.dmp from NCBI)
126 -force => 1 to replace current indexes even if they exist
128 =cut
130 sub new {
131 my($class, @args) = @_;
133 my $self = $class->SUPER::new(@args);
134 my ($dir,$nodesfile,$namesfile,$force) =
135 $self->_rearrange([qw(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
137 $self->index_directory($dir || $DEFAULT_INDEX_DIR);
138 if ( $nodesfile ) {
139 $self->_build_index($nodesfile,$namesfile,$force);
142 $self->_db_connect;
143 return $self;
147 =head2 Bio::DB::Taxonomy interface implementation
149 =head2 get_num_taxa
151 Title : get_num_taxa
152 Usage : my $num = $db->get_num_taxa();
153 Function: Get the number of taxa stored in the database.
154 Returns : A number
155 Args : None
157 =cut
159 sub get_num_taxa {
160 my ($self) = @_;
161 if (not exists $self->{_num_taxa}) {
162 my $num = 0;
163 while ( my ($parent, undef) = each %{$self->{_parent2children}} ) {
164 $num++;
166 $self->{_num_taxa} = $num;
168 return $self->{_num_taxa};
172 =head2 get_taxon
174 Title : get_taxon
175 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
176 Function: Get a Bio::Taxon object from the database.
177 Returns : Bio::Taxon object
178 Args : just a single value which is the database id, OR named args:
179 -taxonid => taxonomy id (to query by taxonid)
181 -name => string (to query by a taxonomy name: common name,
182 scientific name, etc)
184 =cut
186 sub get_taxon {
187 my ($self) = shift;
188 my ($taxonid, $name);
190 if (@_ > 1) {
191 ($taxonid, $name) = $self->_rearrange([qw(TAXONID NAME)],@_);
192 if ($name) {
193 ($taxonid, my @others) = $self->get_taxonids($name);
194 $self->warn("There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'") if @others > 0;
197 else {
198 $taxonid = shift;
201 return unless $taxonid;
203 $taxonid =~ /^\d+$/ || return;
204 my $node = $self->{'_nodes'}->[$taxonid] || return;
205 length($node) || return;
206 my ($taxid, undef, $rank, $code, $divid, $gen_code, $mito) = split(SEPARATOR,$node);
207 last unless defined $taxid;
208 my ($taxon_names) = $self->{'_id2name'}->[$taxid];
209 my ($sci_name, @common_names) = split(SEPARATOR, $taxon_names);
211 my $taxon = Bio::Taxon->new(
212 -name => $sci_name,
213 -common_names => [@common_names],
214 -ncbi_taxid => $taxid, # since this is a real ncbi taxid, explicitly set it as one
215 -rank => $rank,
216 -division => $DIVISIONS[$divid]->[1],
217 -genetic_code => $gen_code,
218 -mito_genetic_code => $mito );
219 # we can't use -dbh or the db_handle() method ourselves or we'll go
220 # infinite on the merge attempt
221 $taxon->{'db_handle'} = $self;
223 $self->_handle_internal_id($taxon);
225 return $taxon;
228 *get_Taxonomy_Node = \&get_taxon;
231 =head2 get_taxonids
233 Title : get_taxonids
234 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
235 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
236 string. Note that multiple taxonids can match to the same supplied
237 name.
238 Returns : array of integer ids in list context, one of these in scalar context
239 Args : string representing taxon's name
241 =cut
243 sub get_taxonids {
244 my ($self, $query) = @_;
245 my $ids = $self->{'_name2id'}->{lc($query)};
246 unless ($ids) {
247 if ($query =~ /_/) {
248 # try again converting underscores to spaces
249 $query =~ s/_/ /g;
250 $ids = $self->{'_name2id'}->{lc($query)};
252 $ids || return;
254 my @ids = split(SEPARATOR, $ids);
255 return wantarray() ? @ids : shift @ids;
258 *get_taxonid = \&get_taxonids;
261 =head2 get_Children_Taxids
263 Title : get_Children_Taxids
264 Usage : my @childrenids = $db->get_Children_Taxids
265 Function: Get the ids of the children of a node in the taxonomy
266 Returns : Array of Ids
267 Args : Bio::Taxon or a taxon_id
268 Status : deprecated (use each_Descendent())
270 =cut
272 sub get_Children_Taxids {
273 my ($self, $node) = @_;
274 $self->warn("get_Children_Taxids is deprecated, use each_Descendent instead");
275 my $id;
276 if( ref($node) ) {
277 if( $node->can('object_id') ) {
278 $id = $node->object_id;
279 } elsif( $node->can('ncbi_taxid') ) {
280 $id = $node->ncbi_taxid;
281 } else {
282 $self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
283 return;
285 } else { $id = $node }
286 my @vals = $self->{'_parentbtree'}->get_dup($id);
287 return @vals;
291 =head2 ancestor
293 Title : ancestor
294 Usage : my $ancestor_taxon = $db->ancestor($taxon)
295 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
296 database.
297 Returns : Bio::Taxon
298 Args : Bio::Taxon (that was retrieved from this database)
300 =cut
302 sub ancestor {
303 my ($self, $taxon) = @_;
304 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
305 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
306 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
308 my $node = $self->{'_nodes'}->[$id];
309 if (length($node)) {
310 my (undef, $parent_id) = split(SEPARATOR,$node);
311 $parent_id || return;
312 $parent_id eq $id && return; # one of the roots
313 return $self->get_taxon($parent_id);
315 return;
319 =head2 each_Descendent
321 Title : each_Descendent
322 Usage : my @taxa = $db->each_Descendent($taxon);
323 Function: Get all the descendents of the supplied Taxon (but not their
324 descendents, ie. not a recursive fetchall).
325 Returns : Array of Bio::Taxon objects
326 Args : Bio::Taxon (that was retrieved from this database)
328 =cut
330 sub each_Descendent {
331 my ($self, $taxon) = @_;
332 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
333 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
334 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
336 my @desc_ids = $self->{'_parentbtree'}->get_dup($id);
337 my @descs;
338 foreach my $desc_id (@desc_ids) {
339 push(@descs, $self->get_taxon($desc_id) || next);
341 return @descs;
345 =head2 Helper methods
347 =cut
349 # internal method which does the indexing
350 sub _build_index {
351 my ($self, $nodesfile, $namesfile, $force) = @_;
353 my $dir = $self->index_directory;
354 my $nodeindex = catfile($dir, $DEFAULT_NODE_INDEX);
355 my $name2idindex = catfile($dir, $DEFAULT_NAME2ID_INDEX);
356 my $id2nameindex = catfile($dir, $DEFAULT_ID2NAME_INDEX);
357 my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
358 $self->{'_nodes'} = [];
359 $self->{'_id2name'} = [];
360 $self->{'_name2id'} = {};
361 $self->{'_parent2children'} = {};
363 if (! -e $nodeindex || $force) {
364 my (%parent2children,@nodes);
365 open my $NODES, '<', $nodesfile
366 or $self->throw("Could not read node file '$nodesfile': $!");
368 unlink $nodeindex;
369 unlink $parent2childindex;
370 my $nh = tie ( @nodes, 'DB_File', $nodeindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
371 $self->throw("Cannot open file '$nodeindex': $!");
372 my $btree = tie( %parent2children, 'DB_File', $parent2childindex, O_RDWR|O_CREAT, 0644, $DB_BTREE) ||
373 $self->throw("Cannot tie to file '$parent2childindex': $!");
375 while (<$NODES>) {
376 next if /^$/;
377 chomp;
378 my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
379 # don't include the fake root node 'root' with id 1; we essentially have multiple roots here
380 next if $taxid == 1;
381 if ($parent == 1) {
382 $parent = $taxid;
385 # keep this stringified
386 $nodes[$taxid] = join(SEPARATOR, ($taxid,$parent,$rank,$code,$divid,$gen_code,$mito));
387 $btree->put($parent,$taxid);
389 close $NODES;
391 $nh = $btree = undef;
392 untie @nodes ;
393 untie %parent2children;
396 if ((! -e $name2idindex || -z $name2idindex) || (! -e $id2nameindex || -z $id2nameindex) || $force) {
397 open my $NAMES, '<', $namesfile
398 or $self->throw("Could not read names file '$namesfile': $!");
400 unlink $name2idindex;
401 unlink $id2nameindex;
402 my (@id2name,%name2id);
403 my $idh = tie (@id2name, 'DB_File', $id2nameindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
404 $self->throw("Cannot tie to file '$id2nameindex': $!");
405 my $nameh = tie ( %name2id, 'DB_File', $name2idindex, O_RDWR|O_CREAT, 0644, $DB_HASH) ||
406 $self->throw("Cannot tie to file '$name2idindex': $!");
408 while (<$NAMES>) {
409 next if /^$/;
410 chomp;
411 my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
412 # don't include the fake root node 'root' or 'all' with id 1
413 next if $taxid == 1;
415 $class =~ s/\s+\|\s*$//;
416 my $lc_name = lc($name);
417 my $orig_name = $name;
419 # unique names aren't always in the correct column, sometimes they
420 # are uniqued by adding bracketed rank names to the normal name;
421 # store the uniqued version then fix the name for normal use
422 if ($lc_name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
423 $name2id{$lc_name} = $taxid;
424 $name =~ s/\s+\(class\)$//;
425 $lc_name = lc($name);
428 # handle normal names which aren't necessarily unique
429 my $taxids = $name2id{$lc_name} || '';
430 my %taxids = map { $_ => 1 } split(SEPARATOR, $taxids);
431 unless (exists $taxids{$taxid}) {
432 $taxids{$taxid} = 1;
433 $name2id{$lc_name} = join(SEPARATOR, keys %taxids);
436 # store unique names in name2id
437 if ($unique_name) {
438 $name2id{lc($unique_name)} = $taxid;
441 # store all names in id2name array
442 my $names = $id2name[$taxid] || '';
443 my @names = split(SEPARATOR, $names);
444 if ($class && $class eq 'scientific name') {
445 # the scientific name should be the first name stored
446 unshift(@names, $name);
447 push(@names, $orig_name) if ($orig_name ne $name);
448 push(@names, $unique_name) if $unique_name;
450 else {
451 # all other ('common' in this simplification) names get added after
452 push(@names, $name);
453 push(@names, $orig_name) if ($orig_name ne $name);
454 push(@names, $unique_name) if $unique_name;
456 $id2name[$taxid] = join(SEPARATOR, @names);
458 close $NAMES;
460 $idh = $nameh = undef;
461 untie( %name2id);
462 untie( @id2name);
467 # connect the internal db handle
468 sub _db_connect {
469 my $self = shift;
470 return if $self->{'_initialized'};
472 my $dir = $self->index_directory;
473 my $nodeindex = catfile($dir, $DEFAULT_NODE_INDEX);
474 my $name2idindex = catfile($dir, $DEFAULT_NAME2ID_INDEX);
475 my $id2nameindex = catfile($dir, $DEFAULT_ID2NAME_INDEX);
476 my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
477 $self->{'_nodes'} = [];
478 $self->{'_id2name'} = [];
479 $self->{'_name2id'} = {};
480 $self->{'_parent2children'} = {};
482 if( ! -e $nodeindex ||
483 ! -e $name2idindex ||
484 ! -e $id2nameindex ) {
485 $self->warn("Index files have not been created");
486 return 0;
488 tie ( @{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDWR,undef, $DB_RECNO)
489 || $self->throw("$! $nodeindex");
490 tie (@{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDWR, undef,
491 $DB_RECNO) || $self->throw("$! $id2nameindex");
493 tie ( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDWR,undef,
494 $DB_HASH) || $self->throw("$! $name2idindex");
495 $self->{'_parentbtree'} = tie( %{$self->{'_parent2children'}},
496 'DB_File', $parent2childindex,
497 O_RDWR, 0644, $DB_BTREE);
499 $self->{'_initialized'} = 1;
503 =head2 index_directory
505 Title : index_directory
506 Funtion : Get/set the location that index files are stored. (this module
507 will index the supplied database)
508 Usage : $obj->index_directory($newval)
509 Returns : value of index_directory (a scalar)
510 Args : on set, new value (a scalar or undef, optional)
512 =cut
515 sub index_directory {
516 my $self = shift;
517 return $self->{'index_directory'} = shift if @_;
518 return $self->{'index_directory'};
522 sub DESTROY {
523 my $self = shift;
524 # Destroy all filehandle references
525 # to be able to remove temporary files
526 undef $self->{_id2name};
527 undef $self->{_name2id};
528 undef $self->{_nodes};
529 undef $self->{_parent2children};
530 undef $self->{_parentbtree};
531 unlink catfile($self->{index_directory},'id2names');
532 unlink catfile($self->{index_directory},'names2id');
533 unlink catfile($self->{index_directory},'nodes');
534 unlink catfile($self->{index_directory},'parents');