Bio/DB/Flat/BDB.pm

   1 #
   2 #
   3 # BioPerl module for Bio::DB::Flat::BDB
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Lincoln Stein <lstein@cshl.org>
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::DB::Flat::BDB - Interface for BioHackathon standard BDB-indexed flat file
  16
  17 =head1 SYNOPSIS
  18
  19   #You should not be using this module directly.
  20
  21 See L<Bio::DB::Flat>.
  22
  23 =head1 DESCRIPTION
  24
  25 This object provides the basic mechanism to associate positions in
  26 files with primary and secondary name spaces. Unlike
  27 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
  28 to work with the BerkeleyDB-indexed "common" flat file format worked
  29 out at the 2002 BioHackathon.
  30
  31 This object is the guts to the mechanism, which will be used by the
  32 specific objects inheriting from it.
  33
  34 =head1 FEEDBACK
  35
  36 =head2 Mailing Lists
  37
  38 User feedback is an integral part of the evolution of this and other
  39 Bioperl modules. Send your comments and suggestions preferably to one
  40 of the Bioperl mailing lists.  Your participation is much appreciated.
  41
  42   bioperl-l@bioperl.org                  - General discussion
  43   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  44
  45 =head2 Support
  46
  47 Please direct usage questions or support issues to the mailing list:
  48
  49 I<bioperl-l@bioperl.org>
  50
  51 rather than to the module maintainer directly. Many experienced and
  52 reponsive experts will be able look at the problem and quickly
  53 address it. Please include a thorough description of the problem
  54 with code and data examples if at all possible.
  55
  56 =head2 Reporting Bugs
  57
  58 Report bugs to the Bioperl bug tracking system to help us keep track
  59 the bugs and their resolution.  Bug reports can be submitted via
  60 email or the web:
  61
  62   https://github.com/bioperl/bioperl-live/issues
  63
  64 =head1 AUTHOR - Lincoln Stein
  65
  66 Email - lstein@cshl.org
  67
  68 =head1 SEE ALSO
  69
  70 L<Bio::DB::Flat>,
  71
  72 =head1 APPENDIX
  73
  74 The rest of the documentation details each of the object methods. Internal
  75 methods are usually preceded with an "_" (underscore).
  76
  77 =cut
  78
  79
  80 # Let the code begin...
  81
  82 package Bio::DB::Flat::BDB;
  83
  84 use strict;
  85 use DB_File;
  86 use IO::File;
  87 use Fcntl qw(O_CREAT O_RDWR O_RDONLY);
  88 use File::Spec;
  89 use Bio::SeqIO;
  90 use Bio::DB::RandomAccessI;
  91 use Bio::Root::Root;
  92 use Bio::Root::IO;
  93
  94 use base qw(Bio::DB::Flat);
  95
  96 sub _initialize {
  97   my $self = shift;
  98   my ($max_open) = $self->_rearrange(['MAXOPEN'],@_);
  99   $self->{bdb_maxopen} = $max_open || 32;
 100 }
 101
 102 # return a filehandle seeked to the appropriate place
 103 # this only works with the primary namespace
 104 sub _get_stream {
 105   my ($self,$id) = @_;
 106   my ($filepath,$offset,$length) = $self->_lookup_primary($id)
 107     or $self->throw("Unable to find a record for $id in the flat file index");
 108   my $fh = $self->_fhcache($filepath)
 109     or $self->throw("couldn't open $filepath: $!");
 110   seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
 111   $fh;
 112 }
 113
 114 # return records corresponding to the indicated index
 115 # if there are multiple hits will return a list in list context,
 116 # otherwise will throw an exception
 117 sub fetch_raw {
 118   my ($self,$id,$namespace) = @_;
 119
 120   # secondary lookup
 121   if (defined $namespace && $namespace ne $self->primary_namespace) {
 122     my @hits = $self->_lookup_secondary($namespace,$id);
 123     $self->throw("Multiple records correspond to $namespace=>$id but function called in a scalar context")
 124       unless wantarray;
 125     return map {$self->_read_record(@$_)} @hits;
 126   }
 127
 128   # primary lookup
 129   my @args = $self->_lookup_primary($id)
 130     or $self->throw("Unable to find a record for $id in the flat file index");
 131   return $self->_read_record(@args);
 132 }
 133
 134 # create real live Bio::Seq object
 135 sub get_Seq_by_id {
 136   my $self = shift;
 137   my $id   = shift;
 138   my $fh   = eval {$self->_get_stream($id)} or return;
 139   my $seqio =
 140     $self->{bdb_cached_parsers}{fileno $fh} ||= Bio::SeqIO->new( -Format => $self->file_format,
 141                                                                  -fh     => $fh);
 142   return $seqio->next_seq;
 143 }
 144
 145 # fetch array of Bio::Seq objects
 146 sub get_Seq_by_acc {
 147   my $self = shift;
 148   unshift @_,'ACC' if @_==1;
 149   my ($ns,$key) = @_;
 150   my @primary_ids = $self->expand_ids($ns => $key);
 151   $self->throw("more than one sequences correspond to this accession")
 152       if @primary_ids > 1 && ! wantarray;
 153   my @rc = map {$self->get_Seq_by_id($_)} @primary_ids;
 154   return wantarray ? @rc : $rc[0];
 155 }
 156
 157 # fetch array of Bio::Seq objects
 158 sub get_Seq_by_version {
 159   my $self = shift;
 160   unshift @_,'VERSION' if @_==1;
 161   my ($ns,$key) = @_;
 162   my @primary_ids = $self->expand_ids($ns => $key);
 163   $self->throw("more than one sequences correspond to this accession")
 164     if @primary_ids > 1 && !wantarray;
 165   my @rc = map {$self->get_Seq_by_id($_)} @primary_ids;
 166   return wantarray ? @rc : $rc[0];
 167 }
 168
 169 =head2 get_PrimarySeq_stream
 170
 171  Title   : get_PrimarySeq_stream
 172  Usage   : $stream = get_PrimarySeq_stream
 173  Function: Makes a Bio::DB::SeqStreamI compliant object
 174            which provides a single method, next_primary_seq
 175  Returns : Bio::DB::SeqStreamI
 176  Args    : none
 177
 178
 179 =cut
 180
 181 sub get_PrimarySeq_stream {
 182   my $self = shift;
 183   my @files  = $self->files || 0;
 184   my $out = Bio::SeqIO::MultiFile->new( -format => $self->file_format ,
 185                                         -files  => \@files);
 186   return $out;
 187 }
 188
 189 sub get_all_primary_ids {
 190   my $self = shift;
 191   my $db   = $self->primary_db;
 192   return keys %$db;
 193 }
 194
 195 =head2 get_all_primary_ids
 196
 197  Title   : get_all_primary_ids
 198  Usage   : @ids = $seqdb->get_all_primary_ids()
 199  Function: gives an array of all the primary_ids of the
 200            sequence objects in the database.
 201  Example :
 202  Returns : an array of strings
 203  Args    : none
 204
 205 =cut
 206
 207 # this will perform an ID lookup on a (possibly secondary)
 208 # id, returning all the corresponding ids
 209 sub expand_ids {
 210   my $self = shift;
 211   my ($ns,$key) = @_;
 212   return $key unless defined $ns;
 213   return $key if $ns eq $self->primary_namespace;
 214   my $db   = $self->secondary_db($ns)
 215     or $self->throw("invalid secondary namespace $ns");
 216   my $record = $db->{$key} or return;  # nothing there
 217   return $self->unpack_secondary($record);
 218 }
 219
 220 # build index from files listed
 221 sub build_index {
 222   my $self  = shift;
 223   my @files = @_;
 224   my $count = 0;
 225   for my $file (@files) {
 226     $file = File::Spec->rel2abs($file)
 227       unless File::Spec->file_name_is_absolute($file);
 228     $count += $self->_index_file($file);
 229   }
 230   $self->write_config;
 231   $count;
 232 }
 233
 234 sub _index_file {
 235   my $self = shift;
 236   my $file = shift;
 237
 238   my $fileno = $self->_path2fileno($file);
 239   defined $fileno or $self->throw("could not create a file number for $file");
 240
 241   my $fh     = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!");
 242   my $offset = 0;
 243   my $count  = 0;
 244
 245   while (!eof($fh)) {
 246     my ($ids,$adjustment)  = $self->parse_one_record($fh) or next;
 247     $adjustment ||= 0;  # prevent uninit variable warning
 248     my $pos = tell($fh) + $adjustment;
 249     $self->_store_index($ids,$file,$offset,$pos-$offset);
 250     $offset = $pos;
 251     $count++;
 252   }
 253   $count;
 254 }
 255
 256 =head2 To Be Implemented in Subclasses
 257
 258 The following methods MUST be implemented by subclasses.
 259
 260 =cut
 261
 262 =head2 May Be Overridden in Subclasses
 263
 264 The following methods MAY be overridden by subclasses.
 265
 266 =cut
 267
 268 sub default_primary_namespace {
 269   return "ACC";
 270 }
 271
 272 sub default_secondary_namespaces {
 273   return;
 274 }
 275
 276 sub _read_record {
 277   my $self = shift;
 278   my ($filepath,$offset,$length) = @_;
 279   my $fh = $self->_fhcache($filepath)
 280     or $self->throw("couldn't open $filepath: $!");
 281   seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
 282   my $record;
 283   read($fh,$record,$length) or $self->throw("can't read $filepath: $!");
 284   $record
 285 }
 286
 287 # return a list in the form ($filepath,$offset,$length)
 288 sub _lookup_primary {
 289   my $self    = shift;
 290   my $primary = shift;
 291   my $db     = $self->primary_db
 292     or $self->throw("no primary namespace database is open");
 293
 294   my $record = $db->{$primary} or return;  # nothing here
 295
 296   my($fileid,$offset,$length) = $self->unpack_primary($record);
 297   my $filepath = $self->_fileno2path($fileid)
 298     or $self->throw("no file path entry for fileid $fileid");
 299   return ($filepath,$offset,$length);
 300 }
 301
 302 # return a list of array refs in the form [$filepath,$offset,$length]
 303 sub _lookup_secondary {
 304   my $self = shift;
 305   my ($namespace,$secondary) = @_;
 306   my @primary = $self->expand_ids($namespace=>$secondary);
 307   return map {[$self->_lookup_primary($_)]} @primary;
 308 }
 309
 310 # store indexing information into a primary & secondary record
 311 # $namespaces is one of:
 312 #     1. a scalar corresponding to the primary name
 313 #     2. a hashref corresponding to namespace=>id identifiers
 314 #              it is valid for secondary id to be an arrayref
 315 sub _store_index {
 316   my $self = shift;
 317   my ($keys,$filepath,$offset,$length) = @_;
 318   my ($primary,%secondary);
 319
 320   if (ref $keys eq 'HASH') {
 321     my %valid_secondary = map {$_=>1} $self->secondary_namespaces;
 322     while (my($ns,$value) = each %$keys) {
 323       if ($ns eq $self->primary_namespace) {
 324         $primary = $value;
 325       } else {
 326         $valid_secondary{$ns} or $self->throw("invalid secondary namespace $ns");
 327         push @{$secondary{$ns}},$value;
 328       }
 329     }
 330     $primary or $self->throw("no primary namespace ID provided");
 331   } else {
 332     $primary = $keys;
 333   }
 334
 335   $self->throw("invalid primary ID; must be a scalar")
 336     if ref($primary) =~ /^(ARRAY|HASH)$/;  # but allow stringified objects
 337
 338   $self->_store_primary($primary,$filepath,$offset,$length);
 339   for my $ns (keys %secondary) {
 340     my @ids = ref $secondary{$ns} ? @{$secondary{$ns}} : $secondary{$ns};
 341     $self->_store_secondary($ns,$_,$primary) foreach @ids;
 342   }
 343
 344   1;
 345 }
 346
 347 # store primary index
 348 sub _store_primary {
 349   my $self = shift;
 350   my ($id,$filepath,$offset,$length) = @_;
 351
 352   my $db = $self->primary_db
 353     or $self->throw("no primary namespace database is open");
 354   my $fileno = $self->_path2fileno($filepath);
 355   defined $fileno or $self->throw("could not create a file number for $filepath");
 356
 357   my $record = $self->pack_primary($fileno,$offset,$length);
 358   $db->{$id} = $record or return;  # nothing here
 359   1;
 360 }
 361
 362 # store a primary index name under a secondary index
 363 sub _store_secondary {
 364   my $self = shift;
 365   my ($secondary_ns,$secondary_id,$primary_id) = @_;
 366
 367   my $db   = $self->secondary_db($secondary_ns)
 368     or $self->throw("invalid secondary namespace $secondary_ns");
 369
 370   # first get whatever secondary ids are already stored there
 371   my @primary = $self->unpack_secondary($db->{$secondary_id});
 372   # uniqueify
 373   my %unique  = map {$_=>undef} @primary,$primary_id;
 374
 375   my $record = $self->pack_secondary(keys %unique);
 376   $db->{$secondary_id} = $record;
 377 }
 378
 379 # get output file handle
 380 sub _outfh {
 381   my $self = shift;
 382 #### XXXXX FINISH #####
 383 #  my $
 384 }
 385
 386 # unpack a primary record into fileid,offset,length
 387 sub unpack_primary {
 388   my $self = shift;
 389   my $index_record = shift;
 390   return split "\t",$index_record;
 391 }
 392
 393 # unpack a secondary record into a list of primary ids
 394 sub unpack_secondary {
 395   my $self = shift;
 396   my $index_record = shift or return;
 397   return split "\t",$index_record;
 398 }
 399
 400 # pack a list of fileid,offset,length into a primary id record
 401 sub pack_primary {
 402   my $self = shift;
 403   my ($fileid,$offset,$length) = @_;
 404   return join "\t",($fileid,$offset,$length);
 405 }
 406
 407 # pack a list of primary ids into a secondary id record
 408 sub pack_secondary {
 409   my $self = shift;
 410   my @secondaries = @_;
 411   return join "\t",@secondaries;
 412 }
 413
 414 sub primary_db {
 415   my $self = shift;
 416   # lazy opening
 417   $self->_open_bdb unless exists $self->{bdb_primary_db};
 418   return $self->{bdb_primary_db};
 419 }
 420
 421 sub secondary_db {
 422   my $self = shift;
 423   my $secondary_namespace = shift
 424     or $self->throw("usage: secondary_db(\$secondary_namespace)");
 425   $self->_open_bdb unless exists $self->{bdb_primary_db};
 426   return $self->{bdb_secondary_db}{$secondary_namespace};
 427 }
 428
 429 sub _open_bdb {
 430   my $self = shift;
 431
 432   my $flags = $self->write_flag ? O_CREAT|O_RDWR : O_RDONLY;
 433
 434   my $primary_db = {};
 435   tie(%$primary_db,'DB_File',$self->_catfile($self->_primary_db_name),$flags,0666,$DB_BTREE)
 436     or $self->throw("Could not open primary index file: $! (did you remember to use -write_flag=>1?)");
 437   $self->{bdb_primary_db} = $primary_db;
 438
 439   for my $secondary ($self->secondary_namespaces) {
 440     my $secondary_db = {};
 441     tie(%$secondary_db,'DB_File',$self->_catfile($self->_secondary_db_name($secondary)),$flags,0666,$DB_BTREE)
 442       or $self->throw("Could not open primary index file");
 443     $self->{bdb_secondary_db}{$secondary} = $secondary_db;
 444   }
 445
 446   1;
 447 }
 448
 449 sub _primary_db_name {
 450   my $self = shift;
 451   my $pns  = $self->primary_namespace or $self->throw('no primary namespace defined');
 452   return "key_$pns";
 453 }
 454
 455 sub _secondary_db_name {
 456   my $self  = shift;
 457   my $sns   = shift;
 458   return "id_$sns";
 459 }
 460
 461 sub _fhcache {
 462   my $self  = shift;
 463   my $path  = shift;
 464   my $write = shift;
 465
 466   if (!$self->{bdb_fhcache}{$path}) {
 467     $self->{bdb_curopen} ||= 0;
 468     if ($self->{bdb_curopen} >= $self->{bdb_maxopen}) {
 469       my @lru = sort {$self->{bdb_cacheseq}{$a} <=> $self->{bdb_cacheseq}{$b};} keys %{$self->{bdb_fhcache}};
 470       splice(@lru, $self->{bdb_maxopen} / 3);
 471       $self->{bdb_curopen} -= @lru;
 472       for (@lru) { delete $self->{bdb_fhcache}{$_} }
 473     }
 474     if ($write) {
 475       my $modifier = $self->{bdb_fhcache_seenit}{$path}++ ? '>' : '>>';
 476       $self->{bdb_fhcache}{$path} = IO::File->new("${modifier}${path}") or return;
 477     } else {
 478       $self->{bdb_fhcache}{$path} = IO::File->new($path) or return;
 479     }
 480     $self->{bdb_curopen}++;
 481   }
 482   $self->{bdb_cacheseq}{$path}++;
 483   $self->{bdb_fhcache}{$path}
 484 }
 485
 486 1;