3 # BioPerl module for Bio::DB::Flat::BDB
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Lincoln Stein <lstein@cshl.org>
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::DB::Flat::BDB - Interface for BioHackathon standard BDB-indexed flat file
19 #You should not be using this module directly.
25 This object provides the basic mechanism to associate positions in
26 files with primary and secondary name spaces. Unlike
27 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
28 to work with the BerkeleyDB-indexed "common" flat file format worked
29 out at the 2002 BioHackathon.
31 This object is the guts to the mechanism, which will be used by the
32 specific objects inheriting from it.
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to one
40 of the Bioperl mailing lists. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47 Please direct usage questions or support issues to the mailing list:
49 I<bioperl-l@bioperl.org>
51 rather than to the module maintainer directly. Many experienced and
52 reponsive experts will be able look at the problem and quickly
53 address it. Please include a thorough description of the problem
54 with code and data examples if at all possible.
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 the bugs and their resolution. Bug reports can be submitted via
62 https://github.com/bioperl/bioperl-live/issues
64 =head1 AUTHOR - Lincoln Stein
66 Email - lstein@cshl.org
74 The rest of the documentation details each of the object methods. Internal
75 methods are usually preceded with an "_" (underscore).
80 # Let the code begin...
82 package Bio
::DB
::Flat
::BDB
;
87 use Fcntl
qw(O_CREAT O_RDWR O_RDONLY);
90 use Bio
::DB
::RandomAccessI
;
94 use base
qw(Bio::DB::Flat);
98 my ($max_open) = $self->_rearrange(['MAXOPEN'],@_);
99 $self->{bdb_maxopen
} = $max_open || 32;
102 # return a filehandle seeked to the appropriate place
103 # this only works with the primary namespace
106 my ($filepath,$offset,$length) = $self->_lookup_primary($id)
107 or $self->throw("Unable to find a record for $id in the flat file index");
108 my $fh = $self->_fhcache($filepath)
109 or $self->throw("couldn't open $filepath: $!");
110 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
114 # return records corresponding to the indicated index
115 # if there are multiple hits will return a list in list context,
116 # otherwise will throw an exception
118 my ($self,$id,$namespace) = @_;
121 if (defined $namespace && $namespace ne $self->primary_namespace) {
122 my @hits = $self->_lookup_secondary($namespace,$id);
123 $self->throw("Multiple records correspond to $namespace=>$id but function called in a scalar context")
125 return map {$self->_read_record(@
$_)} @hits;
129 my @args = $self->_lookup_primary($id)
130 or $self->throw("Unable to find a record for $id in the flat file index");
131 return $self->_read_record(@args);
134 # create real live Bio::Seq object
138 my $fh = eval {$self->_get_stream($id)} or return;
140 $self->{bdb_cached_parsers
}{fileno $fh} ||= Bio
::SeqIO
->new( -Format
=> $self->file_format,
142 return $seqio->next_seq;
145 # fetch array of Bio::Seq objects
148 unshift @_,'ACC' if @_==1;
150 my @primary_ids = $self->expand_ids($ns => $key);
151 $self->throw("more than one sequences correspond to this accession")
152 if @primary_ids > 1 && ! wantarray;
153 my @rc = map {$self->get_Seq_by_id($_)} @primary_ids;
154 return wantarray ?
@rc : $rc[0];
157 # fetch array of Bio::Seq objects
158 sub get_Seq_by_version
{
160 unshift @_,'VERSION' if @_==1;
162 my @primary_ids = $self->expand_ids($ns => $key);
163 $self->throw("more than one sequences correspond to this accession")
164 if @primary_ids > 1 && !wantarray;
165 my @rc = map {$self->get_Seq_by_id($_)} @primary_ids;
166 return wantarray ?
@rc : $rc[0];
169 =head2 get_PrimarySeq_stream
171 Title : get_PrimarySeq_stream
172 Usage : $stream = get_PrimarySeq_stream
173 Function: Makes a Bio::DB::SeqStreamI compliant object
174 which provides a single method, next_primary_seq
175 Returns : Bio::DB::SeqStreamI
181 sub get_PrimarySeq_stream
{
183 my @files = $self->files || 0;
184 my $out = Bio
::SeqIO
::MultiFile
->new( -format
=> $self->file_format ,
189 sub get_all_primary_ids
{
191 my $db = $self->primary_db;
195 =head2 get_all_primary_ids
197 Title : get_all_primary_ids
198 Usage : @ids = $seqdb->get_all_primary_ids()
199 Function: gives an array of all the primary_ids of the
200 sequence objects in the database.
202 Returns : an array of strings
207 # this will perform an ID lookup on a (possibly secondary)
208 # id, returning all the corresponding ids
212 return $key unless defined $ns;
213 return $key if $ns eq $self->primary_namespace;
214 my $db = $self->secondary_db($ns)
215 or $self->throw("invalid secondary namespace $ns");
216 my $record = $db->{$key} or return; # nothing there
217 return $self->unpack_secondary($record);
220 # build index from files listed
225 for my $file (@files) {
226 $file = File
::Spec
->rel2abs($file)
227 unless File
::Spec
->file_name_is_absolute($file);
228 $count += $self->_index_file($file);
238 my $fileno = $self->_path2fileno($file);
239 defined $fileno or $self->throw("could not create a file number for $file");
241 my $fh = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!");
246 my ($ids,$adjustment) = $self->parse_one_record($fh) or next;
247 $adjustment ||= 0; # prevent uninit variable warning
248 my $pos = tell($fh) + $adjustment;
249 $self->_store_index($ids,$file,$offset,$pos-$offset);
256 =head2 To Be Implemented in Subclasses
258 The following methods MUST be implemented by subclasses.
262 =head2 May Be Overridden in Subclasses
264 The following methods MAY be overridden by subclasses.
268 sub default_primary_namespace
{
272 sub default_secondary_namespaces
{
278 my ($filepath,$offset,$length) = @_;
279 my $fh = $self->_fhcache($filepath)
280 or $self->throw("couldn't open $filepath: $!");
281 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
283 read($fh,$record,$length) or $self->throw("can't read $filepath: $!");
287 # return a list in the form ($filepath,$offset,$length)
288 sub _lookup_primary
{
291 my $db = $self->primary_db
292 or $self->throw("no primary namespace database is open");
294 my $record = $db->{$primary} or return; # nothing here
296 my($fileid,$offset,$length) = $self->unpack_primary($record);
297 my $filepath = $self->_fileno2path($fileid)
298 or $self->throw("no file path entry for fileid $fileid");
299 return ($filepath,$offset,$length);
302 # return a list of array refs in the form [$filepath,$offset,$length]
303 sub _lookup_secondary
{
305 my ($namespace,$secondary) = @_;
306 my @primary = $self->expand_ids($namespace=>$secondary);
307 return map {[$self->_lookup_primary($_)]} @primary;
310 # store indexing information into a primary & secondary record
311 # $namespaces is one of:
312 # 1. a scalar corresponding to the primary name
313 # 2. a hashref corresponding to namespace=>id identifiers
314 # it is valid for secondary id to be an arrayref
317 my ($keys,$filepath,$offset,$length) = @_;
318 my ($primary,%secondary);
320 if (ref $keys eq 'HASH') {
321 my %valid_secondary = map {$_=>1} $self->secondary_namespaces;
322 while (my($ns,$value) = each %$keys) {
323 if ($ns eq $self->primary_namespace) {
326 $valid_secondary{$ns} or $self->throw("invalid secondary namespace $ns");
327 push @
{$secondary{$ns}},$value;
330 $primary or $self->throw("no primary namespace ID provided");
335 $self->throw("invalid primary ID; must be a scalar")
336 if ref($primary) =~ /^(ARRAY|HASH)$/; # but allow stringified objects
338 $self->_store_primary($primary,$filepath,$offset,$length);
339 for my $ns (keys %secondary) {
340 my @ids = ref $secondary{$ns} ? @
{$secondary{$ns}} : $secondary{$ns};
341 $self->_store_secondary($ns,$_,$primary) foreach @ids;
347 # store primary index
350 my ($id,$filepath,$offset,$length) = @_;
352 my $db = $self->primary_db
353 or $self->throw("no primary namespace database is open");
354 my $fileno = $self->_path2fileno($filepath);
355 defined $fileno or $self->throw("could not create a file number for $filepath");
357 my $record = $self->pack_primary($fileno,$offset,$length);
358 $db->{$id} = $record or return; # nothing here
362 # store a primary index name under a secondary index
363 sub _store_secondary
{
365 my ($secondary_ns,$secondary_id,$primary_id) = @_;
367 my $db = $self->secondary_db($secondary_ns)
368 or $self->throw("invalid secondary namespace $secondary_ns");
370 # first get whatever secondary ids are already stored there
371 my @primary = $self->unpack_secondary($db->{$secondary_id});
373 my %unique = map {$_=>undef} @primary,$primary_id;
375 my $record = $self->pack_secondary(keys %unique);
376 $db->{$secondary_id} = $record;
379 # get output file handle
382 #### XXXXX FINISH #####
386 # unpack a primary record into fileid,offset,length
389 my $index_record = shift;
390 return split "\t",$index_record;
393 # unpack a secondary record into a list of primary ids
394 sub unpack_secondary
{
396 my $index_record = shift or return;
397 return split "\t",$index_record;
400 # pack a list of fileid,offset,length into a primary id record
403 my ($fileid,$offset,$length) = @_;
404 return join "\t",($fileid,$offset,$length);
407 # pack a list of primary ids into a secondary id record
410 my @secondaries = @_;
411 return join "\t",@secondaries;
417 $self->_open_bdb unless exists $self->{bdb_primary_db
};
418 return $self->{bdb_primary_db
};
423 my $secondary_namespace = shift
424 or $self->throw("usage: secondary_db(\$secondary_namespace)");
425 $self->_open_bdb unless exists $self->{bdb_primary_db
};
426 return $self->{bdb_secondary_db
}{$secondary_namespace};
432 my $flags = $self->write_flag ? O_CREAT
|O_RDWR
: O_RDONLY
;
435 tie
(%$primary_db,'DB_File',$self->_catfile($self->_primary_db_name),$flags,0666,$DB_BTREE)
436 or $self->throw("Could not open primary index file: $! (did you remember to use -write_flag=>1?)");
437 $self->{bdb_primary_db
} = $primary_db;
439 for my $secondary ($self->secondary_namespaces) {
440 my $secondary_db = {};
441 tie
(%$secondary_db,'DB_File',$self->_catfile($self->_secondary_db_name($secondary)),$flags,0666,$DB_BTREE)
442 or $self->throw("Could not open primary index file");
443 $self->{bdb_secondary_db
}{$secondary} = $secondary_db;
449 sub _primary_db_name
{
451 my $pns = $self->primary_namespace or $self->throw('no primary namespace defined');
455 sub _secondary_db_name
{
466 if (!$self->{bdb_fhcache
}{$path}) {
467 $self->{bdb_curopen
} ||= 0;
468 if ($self->{bdb_curopen
} >= $self->{bdb_maxopen
}) {
469 my @lru = sort {$self->{bdb_cacheseq
}{$a} <=> $self->{bdb_cacheseq
}{$b};} keys %{$self->{bdb_fhcache
}};
470 splice(@lru, $self->{bdb_maxopen
} / 3);
471 $self->{bdb_curopen
} -= @lru;
472 for (@lru) { delete $self->{bdb_fhcache
}{$_} }
475 my $modifier = $self->{bdb_fhcache_seenit
}{$path}++ ?
'>' : '>>';
476 $self->{bdb_fhcache
}{$path} = IO
::File
->new("${modifier}${path}") or return;
478 $self->{bdb_fhcache
}{$path} = IO
::File
->new($path) or return;
480 $self->{bdb_curopen
}++;
482 $self->{bdb_cacheseq
}{$path}++;
483 $self->{bdb_fhcache
}{$path}