changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / DB / Flat / BDB.pm
blobd7f29a24f7c9d29e6a775e8747ab2954ba18d19a
3 # BioPerl module for Bio::DB::Flat::BDB
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Lincoln Stein <lstein@cshl.org>
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::DB::Flat::BDB - Interface for BioHackathon standard BDB-indexed flat file
17 =head1 SYNOPSIS
19 #You should not be using this module directly.
21 See L<Bio::DB::Flat>.
23 =head1 DESCRIPTION
25 This object provides the basic mechanism to associate positions in
26 files with primary and secondary name spaces. Unlike
27 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
28 to work with the BerkeleyDB-indexed "common" flat file format worked
29 out at the 2002 BioHackathon.
31 This object is the guts to the mechanism, which will be used by the
32 specific objects inheriting from it.
34 =head1 FEEDBACK
36 =head2 Mailing Lists
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to one
40 of the Bioperl mailing lists. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 =head2 Support
47 Please direct usage questions or support issues to the mailing list:
49 I<bioperl-l@bioperl.org>
51 rather than to the module maintainer directly. Many experienced and
52 reponsive experts will be able look at the problem and quickly
53 address it. Please include a thorough description of the problem
54 with code and data examples if at all possible.
56 =head2 Reporting Bugs
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 the bugs and their resolution. Bug reports can be submitted via
60 email or the web:
62 https://github.com/bioperl/bioperl-live/issues
64 =head1 AUTHOR - Lincoln Stein
66 Email - lstein@cshl.org
68 =head1 SEE ALSO
70 L<Bio::DB::Flat>,
72 =head1 APPENDIX
74 The rest of the documentation details each of the object methods. Internal
75 methods are usually preceded with an "_" (underscore).
77 =cut
80 # Let the code begin...
82 package Bio::DB::Flat::BDB;
84 use strict;
85 use DB_File;
86 use IO::File;
87 use Fcntl qw(O_CREAT O_RDWR O_RDONLY);
88 use File::Spec;
89 use Bio::SeqIO;
90 use Bio::DB::RandomAccessI;
91 use Bio::Root::Root;
92 use Bio::Root::IO;
94 use base qw(Bio::DB::Flat);
96 sub _initialize {
97 my $self = shift;
98 my ($max_open) = $self->_rearrange(['MAXOPEN'],@_);
99 $self->{bdb_maxopen} = $max_open || 32;
102 # return a filehandle seeked to the appropriate place
103 # this only works with the primary namespace
104 sub _get_stream {
105 my ($self,$id) = @_;
106 my ($filepath,$offset,$length) = $self->_lookup_primary($id)
107 or $self->throw("Unable to find a record for $id in the flat file index");
108 my $fh = $self->_fhcache($filepath)
109 or $self->throw("couldn't open $filepath: $!");
110 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
111 $fh;
114 # return records corresponding to the indicated index
115 # if there are multiple hits will return a list in list context,
116 # otherwise will throw an exception
117 sub fetch_raw {
118 my ($self,$id,$namespace) = @_;
120 # secondary lookup
121 if (defined $namespace && $namespace ne $self->primary_namespace) {
122 my @hits = $self->_lookup_secondary($namespace,$id);
123 $self->throw("Multiple records correspond to $namespace=>$id but function called in a scalar context")
124 unless wantarray;
125 return map {$self->_read_record(@$_)} @hits;
128 # primary lookup
129 my @args = $self->_lookup_primary($id)
130 or $self->throw("Unable to find a record for $id in the flat file index");
131 return $self->_read_record(@args);
134 # create real live Bio::Seq object
135 sub get_Seq_by_id {
136 my $self = shift;
137 my $id = shift;
138 my $fh = eval {$self->_get_stream($id)} or return;
139 my $seqio =
140 $self->{bdb_cached_parsers}{fileno $fh} ||= Bio::SeqIO->new( -Format => $self->file_format,
141 -fh => $fh);
142 return $seqio->next_seq;
145 # fetch array of Bio::Seq objects
146 sub get_Seq_by_acc {
147 my $self = shift;
148 unshift @_,'ACC' if @_==1;
149 my ($ns,$key) = @_;
150 my @primary_ids = $self->expand_ids($ns => $key);
151 $self->throw("more than one sequences correspond to this accession")
152 if @primary_ids > 1 && ! wantarray;
153 my @rc = map {$self->get_Seq_by_id($_)} @primary_ids;
154 return wantarray ? @rc : $rc[0];
157 # fetch array of Bio::Seq objects
158 sub get_Seq_by_version {
159 my $self = shift;
160 unshift @_,'VERSION' if @_==1;
161 my ($ns,$key) = @_;
162 my @primary_ids = $self->expand_ids($ns => $key);
163 $self->throw("more than one sequences correspond to this accession")
164 if @primary_ids > 1 && !wantarray;
165 my @rc = map {$self->get_Seq_by_id($_)} @primary_ids;
166 return wantarray ? @rc : $rc[0];
169 =head2 get_PrimarySeq_stream
171 Title : get_PrimarySeq_stream
172 Usage : $stream = get_PrimarySeq_stream
173 Function: Makes a Bio::DB::SeqStreamI compliant object
174 which provides a single method, next_primary_seq
175 Returns : Bio::DB::SeqStreamI
176 Args : none
179 =cut
181 sub get_PrimarySeq_stream {
182 my $self = shift;
183 my @files = $self->files || 0;
184 my $out = Bio::SeqIO::MultiFile->new( -format => $self->file_format ,
185 -files => \@files);
186 return $out;
189 sub get_all_primary_ids {
190 my $self = shift;
191 my $db = $self->primary_db;
192 return keys %$db;
195 =head2 get_all_primary_ids
197 Title : get_all_primary_ids
198 Usage : @ids = $seqdb->get_all_primary_ids()
199 Function: gives an array of all the primary_ids of the
200 sequence objects in the database.
201 Example :
202 Returns : an array of strings
203 Args : none
205 =cut
207 # this will perform an ID lookup on a (possibly secondary)
208 # id, returning all the corresponding ids
209 sub expand_ids {
210 my $self = shift;
211 my ($ns,$key) = @_;
212 return $key unless defined $ns;
213 return $key if $ns eq $self->primary_namespace;
214 my $db = $self->secondary_db($ns)
215 or $self->throw("invalid secondary namespace $ns");
216 my $record = $db->{$key} or return; # nothing there
217 return $self->unpack_secondary($record);
220 # build index from files listed
221 sub build_index {
222 my $self = shift;
223 my @files = @_;
224 my $count = 0;
225 for my $file (@files) {
226 $file = File::Spec->rel2abs($file)
227 unless File::Spec->file_name_is_absolute($file);
228 $count += $self->_index_file($file);
230 $self->write_config;
231 $count;
234 sub _index_file {
235 my $self = shift;
236 my $file = shift;
238 my $fileno = $self->_path2fileno($file);
239 defined $fileno or $self->throw("could not create a file number for $file");
241 my $fh = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!");
242 my $offset = 0;
243 my $count = 0;
245 while (!eof($fh)) {
246 my ($ids,$adjustment) = $self->parse_one_record($fh) or next;
247 $adjustment ||= 0; # prevent uninit variable warning
248 my $pos = tell($fh) + $adjustment;
249 $self->_store_index($ids,$file,$offset,$pos-$offset);
250 $offset = $pos;
251 $count++;
253 $count;
256 =head2 To Be Implemented in Subclasses
258 The following methods MUST be implemented by subclasses.
260 =cut
262 =head2 May Be Overridden in Subclasses
264 The following methods MAY be overridden by subclasses.
266 =cut
268 sub default_primary_namespace {
269 return "ACC";
272 sub default_secondary_namespaces {
273 return;
276 sub _read_record {
277 my $self = shift;
278 my ($filepath,$offset,$length) = @_;
279 my $fh = $self->_fhcache($filepath)
280 or $self->throw("couldn't open $filepath: $!");
281 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
282 my $record;
283 read($fh,$record,$length) or $self->throw("can't read $filepath: $!");
284 $record
287 # return a list in the form ($filepath,$offset,$length)
288 sub _lookup_primary {
289 my $self = shift;
290 my $primary = shift;
291 my $db = $self->primary_db
292 or $self->throw("no primary namespace database is open");
294 my $record = $db->{$primary} or return; # nothing here
296 my($fileid,$offset,$length) = $self->unpack_primary($record);
297 my $filepath = $self->_fileno2path($fileid)
298 or $self->throw("no file path entry for fileid $fileid");
299 return ($filepath,$offset,$length);
302 # return a list of array refs in the form [$filepath,$offset,$length]
303 sub _lookup_secondary {
304 my $self = shift;
305 my ($namespace,$secondary) = @_;
306 my @primary = $self->expand_ids($namespace=>$secondary);
307 return map {[$self->_lookup_primary($_)]} @primary;
310 # store indexing information into a primary & secondary record
311 # $namespaces is one of:
312 # 1. a scalar corresponding to the primary name
313 # 2. a hashref corresponding to namespace=>id identifiers
314 # it is valid for secondary id to be an arrayref
315 sub _store_index {
316 my $self = shift;
317 my ($keys,$filepath,$offset,$length) = @_;
318 my ($primary,%secondary);
320 if (ref $keys eq 'HASH') {
321 my %valid_secondary = map {$_=>1} $self->secondary_namespaces;
322 while (my($ns,$value) = each %$keys) {
323 if ($ns eq $self->primary_namespace) {
324 $primary = $value;
325 } else {
326 $valid_secondary{$ns} or $self->throw("invalid secondary namespace $ns");
327 push @{$secondary{$ns}},$value;
330 $primary or $self->throw("no primary namespace ID provided");
331 } else {
332 $primary = $keys;
335 $self->throw("invalid primary ID; must be a scalar")
336 if ref($primary) =~ /^(ARRAY|HASH)$/; # but allow stringified objects
338 $self->_store_primary($primary,$filepath,$offset,$length);
339 for my $ns (keys %secondary) {
340 my @ids = ref $secondary{$ns} ? @{$secondary{$ns}} : $secondary{$ns};
341 $self->_store_secondary($ns,$_,$primary) foreach @ids;
347 # store primary index
348 sub _store_primary {
349 my $self = shift;
350 my ($id,$filepath,$offset,$length) = @_;
352 my $db = $self->primary_db
353 or $self->throw("no primary namespace database is open");
354 my $fileno = $self->_path2fileno($filepath);
355 defined $fileno or $self->throw("could not create a file number for $filepath");
357 my $record = $self->pack_primary($fileno,$offset,$length);
358 $db->{$id} = $record or return; # nothing here
362 # store a primary index name under a secondary index
363 sub _store_secondary {
364 my $self = shift;
365 my ($secondary_ns,$secondary_id,$primary_id) = @_;
367 my $db = $self->secondary_db($secondary_ns)
368 or $self->throw("invalid secondary namespace $secondary_ns");
370 # first get whatever secondary ids are already stored there
371 my @primary = $self->unpack_secondary($db->{$secondary_id});
372 # uniqueify
373 my %unique = map {$_=>undef} @primary,$primary_id;
375 my $record = $self->pack_secondary(keys %unique);
376 $db->{$secondary_id} = $record;
379 # get output file handle
380 sub _outfh {
381 my $self = shift;
382 #### XXXXX FINISH #####
383 # my $
386 # unpack a primary record into fileid,offset,length
387 sub unpack_primary {
388 my $self = shift;
389 my $index_record = shift;
390 return split "\t",$index_record;
393 # unpack a secondary record into a list of primary ids
394 sub unpack_secondary {
395 my $self = shift;
396 my $index_record = shift or return;
397 return split "\t",$index_record;
400 # pack a list of fileid,offset,length into a primary id record
401 sub pack_primary {
402 my $self = shift;
403 my ($fileid,$offset,$length) = @_;
404 return join "\t",($fileid,$offset,$length);
407 # pack a list of primary ids into a secondary id record
408 sub pack_secondary {
409 my $self = shift;
410 my @secondaries = @_;
411 return join "\t",@secondaries;
414 sub primary_db {
415 my $self = shift;
416 # lazy opening
417 $self->_open_bdb unless exists $self->{bdb_primary_db};
418 return $self->{bdb_primary_db};
421 sub secondary_db {
422 my $self = shift;
423 my $secondary_namespace = shift
424 or $self->throw("usage: secondary_db(\$secondary_namespace)");
425 $self->_open_bdb unless exists $self->{bdb_primary_db};
426 return $self->{bdb_secondary_db}{$secondary_namespace};
429 sub _open_bdb {
430 my $self = shift;
432 my $flags = $self->write_flag ? O_CREAT|O_RDWR : O_RDONLY;
434 my $primary_db = {};
435 tie(%$primary_db,'DB_File',$self->_catfile($self->_primary_db_name),$flags,0666,$DB_BTREE)
436 or $self->throw("Could not open primary index file: $! (did you remember to use -write_flag=>1?)");
437 $self->{bdb_primary_db} = $primary_db;
439 for my $secondary ($self->secondary_namespaces) {
440 my $secondary_db = {};
441 tie(%$secondary_db,'DB_File',$self->_catfile($self->_secondary_db_name($secondary)),$flags,0666,$DB_BTREE)
442 or $self->throw("Could not open primary index file");
443 $self->{bdb_secondary_db}{$secondary} = $secondary_db;
449 sub _primary_db_name {
450 my $self = shift;
451 my $pns = $self->primary_namespace or $self->throw('no primary namespace defined');
452 return "key_$pns";
455 sub _secondary_db_name {
456 my $self = shift;
457 my $sns = shift;
458 return "id_$sns";
461 sub _fhcache {
462 my $self = shift;
463 my $path = shift;
464 my $write = shift;
466 if (!$self->{bdb_fhcache}{$path}) {
467 $self->{bdb_curopen} ||= 0;
468 if ($self->{bdb_curopen} >= $self->{bdb_maxopen}) {
469 my @lru = sort {$self->{bdb_cacheseq}{$a} <=> $self->{bdb_cacheseq}{$b};} keys %{$self->{bdb_fhcache}};
470 splice(@lru, $self->{bdb_maxopen} / 3);
471 $self->{bdb_curopen} -= @lru;
472 for (@lru) { delete $self->{bdb_fhcache}{$_} }
474 if ($write) {
475 my $modifier = $self->{bdb_fhcache_seenit}{$path}++ ? '>' : '>>';
476 $self->{bdb_fhcache}{$path} = IO::File->new("${modifier}${path}") or return;
477 } else {
478 $self->{bdb_fhcache}{$path} = IO::File->new($path) or return;
480 $self->{bdb_curopen}++;
482 $self->{bdb_cacheseq}{$path}++;
483 $self->{bdb_fhcache}{$path}