package Bio::DB::SeqFeature::Store::berkeleydb;

# $Id$

use strict;
use base 'Bio::DB::SeqFeature::Store';
use Bio::DB::GFF::Util::Rearrange 'rearrange';
use DB_File;
use Fcntl qw(O_RDONLY O_RDWR O_CREAT :flock);
use IO::File;
use File::Temp 'tempdir';
use File::Path 'rmtree','mkpath';
use File::Basename;
use File::Spec;
use Carp 'carp','croak';

use constant BINSIZE => 10_000;
use constant MININT  => -999_999_999_999;
use constant MAXINT  => 999_999_999_999;

our $VERSION = '2.00';
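# The constants above: BINSIZE sets the granularity of the location index --
# each feature is filed under one key per 10 kb bin that it overlaps (see
# _update_location_index). MININT/MAXINT stand in for "unbounded" coordinates
# in range queries that omit -start or -end (see filter_by_location).
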
=head1 NAME

Bio::DB::SeqFeature::Store::berkeleydb -- Storage and retrieval of sequence annotation data in Berkeleydb files

=head1 SYNOPSIS

  use Bio::DB::SeqFeature::Store;

  # Create a database from the feature files located in /home/fly4.3 and store
  # the database index in the same directory:
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/home/fly4.3');

  # Create a database that will monitor the files in /home/fly4.3, but store
  # the indexes in /var/databases/fly4.3
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -dir     => '/home/fly4.3');

  # Create a feature database from scratch
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

  # the feature's primary_id() is now set to the primary ID it was
  # assigned in the database...
  my $id = $feature->primary_id;

  # get the feature back out
  my $f = $db->fetch($id);

  # change the feature and update it
  $f->start(100);
  $db->update($f) or die "Couldn't update!";

  # use the GFF3 loader to do a bulk write:
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store   => $db,
                                                           -verbose => 0,
                                                           -fast    => 1);
  $loader->load('/home/fly4.3/dmel-all.gff');

  # searching...
  # ...by id
  my @features = $db->fetch_many(@list_of_ids);

  # ...by name
  @features = $db->get_features_by_name('ZK909');

  # ...by alias
  @features = $db->get_features_by_alias('sma-3');

  # ...by type
  @features = $db->get_features_by_type('gene');

  # ...by location
  @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);

  # ...by attribute
  @features = $db->get_features_by_attribute({description => 'protein kinase'});

  # ...by the GFF "Note" field
  @result_list = $db->search_notes('kinase');

  # ...by arbitrary combinations of selectors
  @features = $db->features(-name       => $name,
                            -type       => $types,
                            -seq_id     => $seqid,
                            -start      => $start,
                            -end        => $end,
                            -attributes => $attributes);

  # ...using an iterator
  my $iterator = $db->get_seq_stream(-name       => $name,
                                     -type       => $types,
                                     -seq_id     => $seqid,
                                     -start      => $start,
                                     -end        => $end,
                                     -attributes => $attributes);

  while (my $feature = $iterator->next_seq) {
    # do something with the feature
  }

  # ...limiting the search to a particular region
  my $segment  = $db->segment('Chr1',5000=>6000);
  my @features = $segment->features(-type=>['mRNA','match']);

  # what feature types are defined in the database?
  my @types    = $db->types;

  # getting & storing sequence information
  # Warning: this returns a string, and not a PrimarySeq object
  $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  my $sequence = $db->fetch_sequence('Chr1',5000=>6000);

  # create a new feature in the database
  my $feature = $db->new_feature(-primary_tag => 'mRNA',
                                 -seq_id      => 'chr3',
                                 -start       => 10000,
                                 -end         => 11000);

=head1 DESCRIPTION

Bio::DB::SeqFeature::Store::berkeleydb is the Berkeleydb adaptor for
Bio::DB::SeqFeature::Store. You will not create it directly, but
instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.

See L<Bio::DB::SeqFeature::Store> for complete usage instructions.

=head2 Using the berkeleydb adaptor

The Berkeley database consists of a series of Berkeleydb index files,
and a couple of special purpose indexes. You can create the index
files from scratch by creating a new database and calling
new_feature() repeatedly, you can create the database and then bulk
populate it using the GFF3 loader, or you can monitor a directory of
preexisting GFF3 and FASTA files and rebuild the indexes whenever one
or more of the source files changes. The last mode is probably the
most convenient. Note that the indexer will only pay attention to
files that end with .gff3, .wig and .fa.

=over 4

=item The new() constructor

The new() constructor method accepts all the arguments recognized by
Bio::DB::SeqFeature::Store, and a few additional ones.

Standard arguments:

 Name               Value
 ----               -----

 -adaptor           The name of the Adaptor class (default DBI::mysql)

 -serializer        The name of the serializer class (default Storable)

 -index_subfeatures Whether or not to make subfeatures searchable
                    (default true)

 -cache             Activate LRU caching feature -- size of cache

 -compress          Compresses features before storing them in database
                    using Compress::Zlib

Adaptor-specific arguments:

 Name               Value
 ----               -----

 -dsn               Where the index files are stored

 -dir               Where the source (GFF3, FASTA) files are stored

 -autoindex         An alias for -dir.

 -write             Pass true to open the index files for writing.

 -create            Pass true to create the index files if they don't exist
                    (implies -write=>1)

 -locking           Use advisory locking to avoid one process trying to read
                    from the database while another is updating it (may not
                    work properly over NFS).

 -temp              Pass true to create temporary index files that will
                    be deleted once the script exits.

 -verbose           Pass true to report autoindexing operations on STDERR
                    (default is true).

Examples:

To create an empty database which will be populated using calls to
store() or new_feature(), or which will be bulk-loaded using the GFF3
loader:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

To open a preexisting database in read-only mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3');

To open a preexisting database in read/write (update) mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -write   => 1);

To monitor a set of GFF3 and FASTA files located in a directory and
create/update the database indexes as needed. The indexes will be
stored in a new subdirectory named "indexes":

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/var/databases/fly4.3');

As above, but store the source files and index files in separate directories:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -dir     => '/home/gff3_files/fly4.3');

To be indexed, files must end with one of .gff3 (GFF3 format), .fa
(FASTA format) or .wig (WIG format).

B<-autoindex> is an alias for B<-dir>.

You should specify B<-locking> in a multiuser environment, including
the case in which the database is being used by a web server at the
same time another user might be updating it (see the example below).

=back
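
For example, a script that updates a database which may simultaneously be
read by a web server could open it for writing with locking enabled (a
sketch only, reusing the illustrative paths from above):

  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dsn     => '/var/databases/fly4.3',
                                            -write   => 1,
                                            -locking => 1);
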
See L<Bio::DB::SeqFeature::Store> for all the access methods supported
by this adaptor. The various methods for storing and updating features
and sequences into the database are supported, but unless you enable
B<-locking> there is no protection against concurrent writes. If two
processes try to update the same database simultaneously, the database
will likely become corrupted.

=cut
# object initialization
sub init {
  my $self = shift;
  my ($directory,
      $autoindex,
      $is_temporary,
      $write,
      $create,
      $verbose,
      $locking,
      ) = rearrange([['DSN','DB'],
                     [qw(DIR AUTOINDEX)],
                     ['TMP','TEMP','TEMPORARY'],
                     [qw(WRITE WRITABLE)],
                     'CREATE',
                     'VERBOSE',
                     [qw(LOCK LOCKING)],
                    ],@_);

  $verbose = 1 unless defined $verbose;

  if ($autoindex) {
    -d $autoindex or $self->throw("Invalid directory $autoindex");
    $directory ||= "$autoindex/indexes";
  }
  $directory ||= $is_temporary ? File::Spec->tmpdir : '.';

  my $pacname = __PACKAGE__;
  if ($^O =~ /mswin/i) {
    $pacname =~ s/:+/_/g;
  }
  $directory = tempdir($pacname.'_XXXXXX',
                       TMPDIR  => 1,
                       CLEANUP => 1,
                       DIR     => $directory) if $is_temporary;
  mkpath($directory);
  -d $directory or $self->throw("Invalid directory $directory");

  $create++ if $is_temporary;
  $write  ||= $create;
  $self->throw("Can't write into the directory $directory")
    if $write && !-w $directory;

  $self->default_settings;
  $self->directory($directory);
  $self->temporary($is_temporary);
  $self->verbose($verbose);
  $self->locking($locking);
  $self->_delete_databases() if $create;
  if ($autoindex && -d $autoindex) {
    $self->auto_reindex($autoindex);
  }
  $self->lock('shared');
  $self->_open_databases($write,$create,$autoindex);
  $self->_permissions($write,$create);
  return $self;
}
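
# can_store_parentage() returns true because parent/child links are kept in a
# dedicated "parentage" BTREE (opened with R_DUP so one parent ID can map to
# many child IDs); see _add_SeqFeature() and _fetch_SeqFeatures() below.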
sub can_store_parentage { 1 }
sub auto_reindex {
  my $self    = shift;
  my $autodir = shift;
  my $result  = $self->needs_auto_reindexing($autodir);

  if ($result && %$result) {
    $self->flag_autoindexing(1);
    $self->lock('exclusive');
    $self->reindex_wigfiles($result->{wig},$autodir) if $result->{wig};
    $self->reindex_ffffiles($result->{fff},$autodir) if $result->{fff};
    $self->reindex_gfffiles($result->{gff},$autodir) if $result->{gff};
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
    $self->unlock;
    $self->flag_autoindexing(0);
  }

  else {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
  }
}

sub autoindex_flagfile {
  return File::Spec->catfile(shift->directory,'autoindex.pid');
}
sub auto_index_in_process {
  my $self = shift;
  my $flag_file = $self->autoindex_flagfile;
  return unless -e $flag_file;

  # if flagfile exists, then check that PID still exists
  open my $fh,'<',$flag_file or die "Couldn't open $flag_file: $!";
  my $pid = <$fh>;
  close $fh;
  return 1 if kill 0=>$pid;
  warn "Autoindexing seems to be running in another process, but the process has gone away. Trying to override...";
  if (unlink $flag_file) {
    warn "Successfully removed stale PID file." if $self->verbose;
    warn "Assuming partial reindexing process. Rebuilding indexes from scratch..." if $self->verbose;
    my $glob = File::Spec->catfile($self->directory,'*');
    unlink glob($glob);
    return;
  } else {
    croak ("Cannot recover from apparent aborted autoindex process. Please remove files in ",
           $self->directory,
           " and allow the adaptor to reindex");
    return 1;
  }
}
sub flag_autoindexing {
  my $self = shift;
  my $doit = shift;
  my $flag_file = $self->autoindex_flagfile;
  if ($doit) {
    open my $fh,'>',$flag_file or die "Couldn't open $flag_file: $!";
    print $fh $$;
    close $fh;
  } else {
    unlink $flag_file;
  }
}

sub reindex_gfffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing GFF files...\n" if $self->verbose;
  my $exists = -e $self->_features_path;

  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,!$exists);
  require Bio::DB::SeqFeature::Store::GFF3Loader
    unless Bio::DB::SeqFeature::Store::GFF3Loader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store    => $self,
                                                           -sf_class => $self->seqfeature_class,
                                                           -verbose  => $self->verbose,
                                                          )
    or $self->throw("Couldn't create GFF3Loader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}

sub reindex_ffffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing FFF files...\n" if $self->verbose;
  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,1);
  require Bio::DB::SeqFeature::Store::FeatureFileLoader
    unless Bio::DB::SeqFeature::Store::FeatureFileLoader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::FeatureFileLoader->new(-store    => $self,
                                                                  -sf_class => $self->seqfeature_class,
                                                                  -verbose  => $self->verbose,
                                                                 )
    or $self->throw("Couldn't create FeatureFileLoader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}
sub reindex_wigfiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing wig files...\n" if $self->verbose;

  unless (Bio::Graphics::Wiggle::Loader->can('new')) {
    eval "require Bio::Graphics::Wiggle::Loader; 1"
      or return;
  }

  for my $wig (@$files) {
    warn "Reindexing $wig...\n" if $self->verbose;
    my ($wib_name) = fileparse($wig,qr/\.[^.]*/);
    my $gff3_name  = "$wib_name.gff3";

    # unlink all wib files that share the basename
    my $wib_glob = File::Spec->catfile($self->directory,"$wib_name*.wib");
    unlink glob($wib_glob);

    my $loader = Bio::Graphics::Wiggle::Loader->new($self->directory,$wib_name);
    my $fh     = IO::File->new($wig) or die "Can't open $wig: $!";
    $loader->load($fh);  # will create one or more .wib files
    $fh->close;
    my $gff3_data = $loader->featurefile('gff3','microarray_oligo',$wib_name);
    my $gff3_path = File::Spec->catfile($autodir,$gff3_name);
    $fh = IO::File->new($gff3_path,'>')
      or die "Can't open $gff3_path for writing: $!";
    $fh->print($gff3_data);
    $fh->close;
    my $conf_path = File::Spec->catfile($autodir,"$wib_name.conf");
    $fh = IO::File->new($conf_path,'>')
      or die "Can't open $conf_path for writing: $!";
    $fh->print($loader->conf_stanzas('microarray_oligo',$wib_name));
    $fh->close;
  }
}
# returns the following hashref
# empty hash if nothing needs reindexing
# {fasta => 1}                    if the DNA (FASTA) database needs reindexing
# {gff   => [list,of,gff,paths]}  if gff3 files need reindexing
# {fff   => [list,of,fff,paths]}  if feature (.fff) files need reindexing
# {wig   => [list,of,wig,paths]}  if wig files need reindexing
sub needs_auto_reindexing {
  my $self    = shift;
  my $autodir = shift;
  my $result  = {};

  # don't allow two processes to reindex simultaneously
  $self->auto_index_in_process and croak "Autoindexing in process. Try again later";

  # first interrogate the GFF3 files, using the timestamp file
  # as modification comparison
  my (@gff3,@fff,@wig,$fasta,$fasta_index_time);
  opendir (my $D,$autodir)
    or $self->throw("Couldn't open directory $autodir for reading: $!");

  my $maxtime        = 0;
  my $timestamp_time = _mtime($self->_mtime_path) || 0;
  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = File::Spec->catfile($autodir,$node);
    next unless -f $path;

    if ($path =~ /\.gff\d?$/i) {
      my $mtime = _mtime(\*_);  # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @gff3,$path;
    }

    elsif ($path =~ /\.fff?$/i) {
      my $mtime = _mtime(\*_);  # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @fff,$path;
    }

    elsif ($path =~ /\.wig$/i) {
      my $wig = $path;
      (my $gff_file = $wig) =~ s/\.wig$/\.gff3/i;
      next if -e $gff_file && _mtime($gff_file) > _mtime($path);
      push @wig,$wig;
      push @gff3,$gff_file;
      $maxtime = time();
    }

    elsif ($path =~ /\.(fa|fasta|dna)$/i) {
      $fasta_index_time =
        _mtime(File::Spec->catfile($self->directory,'fasta.index'))||0
          unless defined $fasta_index_time;
      $fasta++ if _mtime($path) > $fasta_index_time;
    }
  }
  closedir $D;

  $result->{gff}   = \@gff3 if $maxtime > $timestamp_time;
  $result->{wig}   = \@wig  if @wig;
  $result->{fff}   = \@fff  if @fff;
  $result->{fasta}++        if $fasta;

  return $result;
}

sub verbose {
  my $self = shift;
  my $d    = $self->{verbose};
  $self->{verbose} = shift if @_;
  return $d;
}

sub locking {
  my $self = shift;
  my $d    = $self->{locking};
  $self->{locking} = shift if @_;
  return $d;
}

sub lockfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'lock');
}
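
# Advisory locking (enabled with -locking): readers take a shared flock() on
# the "lock" file inside the index directory, and the autoindexer upgrades to
# an exclusive lock while it rebuilds the indexes.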
sub lock {
  my $self = shift;
  my $mode = shift;
  return unless $self->locking;

  my $flag     = $mode eq 'exclusive' ? LOCK_EX : LOCK_SH;
  my $lockfile = $self->lockfile;
  my $fh       = $self->_flock_fh;
  unless ($fh) {
    my $open = -e $lockfile ? '<' : '>';
    $fh      = IO::File->new($lockfile,$open) or die "Cannot open $lockfile: $!";
  }
  flock($fh,$flag);
  $self->_flock_fh($fh);
}

sub unlock {
  my $self = shift;
  return unless $self->locking;

  my $fh = $self->_flock_fh or return;
  flock($fh,LOCK_UN);
  undef $self->{flock_fh};
}

sub _flock_fh {
  my $self = shift;
  my $d    = $self->{flock_fh};
  $self->{flock_fh} = shift if @_;
  $d;
}
sub _open_databases {
  my $self = shift;
  my ($write,$create,$ignore_errors) = @_;
  return if $self->db;  # already open - don't reopen

  my $directory = $self->directory;
  unless (-d $directory) {  # directory does not exist
    $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
    mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
  }

  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  # Create the main database; this is a DB_HASH implementation
  my %h;
  my $result = tie (%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH);

  unless ($result) {
    return if $ignore_errors;  # autoindex set, so defer this
    $self->throw("Couldn't tie: ".$self->_features_path . " $!");
  }
  if ($create) {
    %h = ();
    $h{'.next_id'} = 1;
    $h{'.version'} = $VERSION;
  }
  $self->db(\%h);

  # Create the index databases; these are DB_BTREE implementations with duplicates allowed.
  local $DB_BTREE->{flags} = R_DUP;
  $DB_BTREE->{compare} = sub { lc($_[0]) cmp lc($_[1]) };
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    my %db;
    tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE)
      or $self->throw("Couldn't tie $path: $!");
    %db = () if $create;
    $self->index_db($idx=>\%db);
  }

  # Create the parentage database
  my %p;
  tie (%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
  %p = () if $create;
  $self->parentage_db(\%p);

  if (-e $self->_fasta_path) {
    my $dna_db = Bio::DB::Fasta::Subdir->new($self->_fasta_path)
      or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }

  my $mode = $write  ? "+>>"
           : $create ? "+>"
           : "<";

  open (my $F,$mode,$self->_notes_path) or $self->throw($self->_notes_path.": $!");
  $self->notes_db($F);
}
sub commit { # reindex fasta files
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->{fasta_file}));
  } elsif (-d $self->directory) {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->directory));
  }
}

sub _close_databases {
  my $self = shift;
  $self->db(undef);
  $self->dna_db(undef);
  $self->notes_db(undef);
  $self->index_db($_=>undef) foreach $self->_index_files;
}

# do nothing -- new() with -create=>1 will do the trick
sub _init_database { }

sub _delete_databases {
  my $self = shift;
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    unlink $path;
  }
  unlink $self->_parentage_path;
  unlink $self->_fasta_path;
  unlink $self->_features_path;
  unlink $self->_mtime_path;
}
sub _touch_timestamp {
  my $self = shift;
  my $tsf  = $self->_mtime_path;
  open (my $F,'>',$tsf) or $self->throw("Couldn't open $tsf: $!");
  print $F scalar(localtime);
  close $F;
}
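
# _store() serializes each object with the freeze() method inherited from
# Bio::DB::SeqFeature::Store and files it in the main DB_HASH under its
# primary ID; new IDs are handed out from the special '.next_id' key.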
sub _store {
  my $self    = shift;
  my $indexed = shift;
  my $db      = $self->db;
  my $count   = 0;
  for my $obj (@_) {
    my $primary_id = $obj->primary_id;
    $self->_delete_indexes($obj,$primary_id) if $indexed && $primary_id;
    $primary_id = $db->{'.next_id'}++ unless defined $primary_id;
    $db->{$primary_id} = $self->freeze($obj);
    $obj->primary_id($primary_id);
    $self->_update_indexes($obj) if $indexed;
    $count++;
  }
  $count;
}

sub _delete_indexes {
  my $self = shift;
  my ($obj,$id) = @_;

  # the additional "1" causes the index to be deleted
  $self->_update_name_index($obj,$id,1);
  $self->_update_type_index($obj,$id,1);
  $self->_update_location_index($obj,$id,1);
  $self->_update_attribute_index($obj,$id,1);
  $self->_update_note_index($obj,$id,1);
}

sub _fetch {
  my $self = shift;
  my $id   = shift;
  my $db   = $self->db;
  my $obj  = $self->thaw($db->{$id},$id);
  $obj;
}
sub _add_SeqFeature {
  my $self     = shift;
  my $parent   = shift;
  my @children = @_;
  my $parent_id = (ref $parent ? $parent->primary_id : $parent)
    or $self->throw("$parent should have a primary_id");
  my $p = $self->parentage_db;
  for my $child (@children) {
    my $child_id = ref $child ? $child->primary_id : $child;
    defined $child_id or $self->throw("no primary ID known for $child");
    $p->{$parent_id} = $child_id;
  }
}

sub _fetch_SeqFeatures {
  my $self   = shift;
  my $parent = shift;
  my @types  = @_;
  my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
  my $index = $self->parentage_db;
  my $db    = tied %$index;

  my @children_ids = $db->get_dup($parent_id);
  my @children     = map {$self->fetch($_)} @children_ids;

  if (@types) {
    my $regexp = join '|',map {quotemeta($_)} $self->find_types(@types);
    return grep {($_->primary_tag.':'.$_->source_tag) =~ /^$regexp$/i} @children;
  } else {
    return @children;
  }
}
sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id = $obj->primary_id) or return;
  $self->_update_name_index($obj,$id);
  $self->_update_type_index($obj,$id);
  $self->_update_location_index($obj,$id);
  $self->_update_attribute_index($obj,$id);
  $self->_update_note_index($obj,$id);
}

sub _update_name_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
  my ($names,$aliases) = $self->feature_names($obj);

  # little stinky - needs minor refactoring
  foreach (@$names) {
    my $key = lc $_;
    $self->update_or_delete($delete,$db,$key,$id);
  }

  foreach (@$aliases) {
    my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
    $self->update_or_delete($delete,$db,$key,$id);
  }
}
sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");
  my $primary_tag = $obj->primary_tag;
  my $source_tag  = $obj->source_tag || '';
  return unless defined $primary_tag;

  $primary_tag .= ":$source_tag";
  my $key       = lc $primary_tag;
  $self->update_or_delete($delete,$db,$key,$id);
}

# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
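# For illustration, with the default BINSIZE of 10,000 a feature on Chr1
# spanning 123,456..130,000 overlaps bins 12 and 13, so it is filed under the
# keys "chr1.000012" and "chr1.000013", each holding the packed
# (id,start,end,strand) value.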
sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id  = $obj->seq_id || '';
  my $start   = $obj->start  || '';
  my $end     = $obj->end    || '';
  my $strand  = $obj->strand;
  my $bin_min = int $start/BINSIZE;
  my $bin_max = int $end/BINSIZE;

  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = sprintf("%s.%06d",lc($seq_id),$bin);
    $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
  }
}

sub _update_attribute_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('attributes')
    or $self->throw("Couldn't find 'attributes' index file");

  for my $tag ($obj->get_all_tags) {
    for my $value ($obj->get_tag_values($tag)) {
      my $key = "${tag}:${value}";
      $self->update_or_delete($delete,$db,$key,$id);
    }
  }
}
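
# The notes "index" is not a Berkeley DB at all but a flat text file: each
# line holds the note text, a tab, and the feature's primary ID encoded with
# pack("u*",...). search_notes() scans this file sequentially.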
sub _update_note_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  return if $delete; # we don't know how to do this

  my $fh = $self->notes_db;
  my @notes = $obj->has_tag('Note') ? $obj->get_tag_values('Note') : ();

  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
    foreach @notes;
}
sub update_or_delete {
  my $self = shift;
  my ($delete,$db,$key,$id) = @_;
  if ($delete) {
    tied(%$db)->del_dup($key,$id);
  } else {
    $db->{$key} = $id;
  }
}

# these methods return pointers to....
# the database that stores serialized objects
sub db {
  my $self = shift;
  my $d = $self->setting('db');
  $self->setting(db=>shift) if @_;
  $d;
}

sub parentage_db {
  my $self = shift;
  my $d = $self->setting('parentage_db');
  $self->setting(parentage_db=>shift) if @_;
  $d;
}

# the Bio::DB::Fasta object
sub dna_db {
  my $self = shift;
  my $d = $self->setting('dna_db');
  $self->setting(dna_db=>shift) if @_;
  $d;
}

# the specialized notes database
sub notes_db {
  my $self = shift;
  my $d = $self->setting('notes_db');
  $self->setting(notes_db=>shift) if @_;
  $d;
}

# The indicated index berkeley db
sub index_db {
  my $self = shift;
  my $index_name = shift;
  my $d = $self->setting($index_name);
  $self->setting($index_name=>shift) if @_;
  $d;
}

sub _mtime {
  my $file = shift;
  my @stat = stat($file);
  return $stat[9];
}

# return names of all the indexes
sub _index_files {
  return qw(names types locations attributes);
}

# the directory in which we store our indexes
sub directory {
  my $self = shift;
  my $d = $self->setting('directory');
  $self->setting(directory=>shift) if @_;
  $d;
}

# flag indicating that we are a temporary database
sub temporary {
  my $self = shift;
  my $d = $self->setting('temporary');
  $self->setting(temporary=>shift) if @_;
  $d;
}

sub _permissions {
  my $self = shift;
  my $d = $self->setting('permissions') or return;
  if (@_) {
    my ($write,$create) = @_;
    $self->setting(permissions=>[$write,$create]);
  }
  @$d;
}

# file name utilities...
sub _qualify {
  my $self = shift;
  my $file = shift;
  return $self->directory .'/' . $file;
}

sub _features_path {
  shift->_qualify('features.bdb');
}

sub _parentage_path {
  shift->_qualify('parentage.bdb');
}

sub _type_path {
  shift->_qualify('types.idx');
}

sub _location_path {
  shift->_qualify('locations.idx');
}

sub _attribute_path {
  shift->_qualify('attributes.idx');
}

sub _notes_path {
  shift->_qualify('notes.idx');
}

sub _fasta_path {
  shift->_qualify('sequence.fa');
}

sub _mtime_path {
  shift->_qualify('mtime.stamp');
}

###########################################
# searching
###########################################
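
# Generic query entry point. Each defined selector (name, location, type,
# attributes) intersects its hits into the shared %found set of primary IDs;
# the surviving IDs are then fetched, or wrapped in an iterator if -iterator
# was requested.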
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                    'NAME','CLASS','ALIASES',
                    ['TYPES','TYPE','PRIMARY_TAG'],
                    ['ATTRIBUTES','ATTRIBUTE'],
                    'RANGE_TYPE',
                    'ITERATOR',
                   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my @result;
  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
    @result = grep {!/^\./} keys %{$self->db};
  }

  my %found  = ();
  my $result = 1;

  if (defined($name)) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) {
    $result &&= $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
  }

  if (defined $types) {
    $result &&= $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $result &&= $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found if $result;
  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}
sub filter_by_name {
  my $self = shift;
  my ($name,$allow_aliases,$filter) = @_;

  my $index = $self->index_db('names');
  my $db    = tied(%$index);

  my ($stem,$regexp) = $self->glob_match($name);
  $stem   ||= $name;
  $regexp ||= $name;
  $regexp  .= "(?:_2)?" if $allow_aliases;

  my $key = $stem;
  my $value;
  my @results;
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0 and $key =~ /^$regexp$/i;
       $status = $db->seq($key,$value,R_NEXT)) {
    next if %$filter && !$filter->{$value};  # don't bother
    push @results,$value;
  }
  $self->update_filter($filter,\@results);
}

sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    $source_tag ||= '';
    my $key = lc "$primary_tag:$source_tag";
    my $value;

    # If filter is already provided, then it is usually faster to
    # fetch each object.
    if (%$filter) {
      for my $id (keys %$filter) {
        my $obj = $self->_fetch($id) or next;
        push @results,$id if $obj->type =~ /$match/i;
      }
    }

    else {
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /$match/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value};  # don't even bother
        push @results,$value;
      }
    }
  }
  $self->update_filter($filter,\@results);
}
sub filter_by_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;
  $strand ||= 0;

  my $index = $self->index_db('locations');
  my $db    = tied(%$index);

  my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
  my $binend   = defined $end   ? sprintf("%06d",int $end/BINSIZE)   : 'z';  # beyond a number

  my %seenit;
  my @results;

  $start = MININT if !defined $start;
  $end   = MAXINT if !defined $end;
  my $version_2 = $self->version > 1;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $key     = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $keystop = $version_2 ? "\L$seq_id\E.$binend"   : "\L$seq_id\E$binend";
    my $value;

    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      if ($range_type eq 'overlaps') {
        next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
        next unless $fstart >= $start && $fend <= $end;
      }
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $key     = $version_2 ? "\L$seq_id." : "\L$seq_id";
    my $keystop = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }

    # do the right part of the range
    $key = "\L$seq_id\E.$binend";
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }
  }

  $self->update_filter($filter,\@results);
}
sub attributes {
  my $self  = shift;
  my $index = $self->index_db('attributes');
  my %a     = map {s/:.+$//; $_=> 1} keys %$index;
  return keys %a;
}

sub filter_by_attribute {
  my $self = shift;
  my ($attributes,$filter) = @_;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);
  my $result;

  for my $att_name (keys %$attributes) {
    my @result;
    my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
                         ? @{$attributes->{$att_name}} : $attributes->{$att_name};

    for my $v (@search_terms) {
      my ($stem,$regexp) = $self->glob_match($v);
      $stem   ||= $v;
      $regexp ||= $v;
      my $key = "\L${att_name}:${stem}\E";
      my $value;
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /^$att_name:$regexp$/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value};  # don't bother
        push @result,$value;
      }
    }
    $result ||= $self->update_filter($filter,\@result);
  }

  $result;
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_array,$limit) = @_;
  $search_string =~ tr/*?//d;
  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

  my (%results,%notes);

  for my $tag (@$attribute_array) {
    my $id;
    my $key = "\L$tag:\E";
    for (my $status = $db->seq($key,$id,R_CURSOR);
         $status == 0 and $key =~ /^$tag:(.*)/i;
         $status = $db->seq($key,$id,R_NEXT)) {
      my $text = $1;
      next unless $text =~ /$search/;
      for my $w (@words) {
        my @hits = $text =~ /($w)/ig or next;
        $results{$id} += @hits;
      }
      $notes{$id} .= "$text ";
    }
  }

  my @results;
  for my $id (keys %results) {
    my $hits = $results{$id};
    my $note = $notes{$id};
    $note =~ s/\s+$//;
    my $relevance = 10 * $hits;
    my $feature   = $self->fetch($id) or next;
    my $name      = $feature->display_name or next;
    my $type      = $feature->type;
    push @results,[$name,$note,$relevance,$type,$id];
  }

  return @results;
}
sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;

  my @results;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my (%found,$found);
  my $note_index = $self->notes_db;
  seek($note_index,0,0);  # back to start
  while (<$note_index>) {
    next unless /$search/;
    chomp;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->fetch($idx) or next;
    my @values  = $feature->has_tag('Note') ? $feature->get_tag_values('Note') : ();
    my $value   = "@values";

    my $hits;
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  for (my $i=0; $i<@matches; $i++) {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $note;
    $note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
    push @results,[$feature->display_name,$note,$relevance];
  }

  return @results;
}
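
# glob_match() below turns a user-supplied glob pattern into a (stem, regexp)
# pair: the literal stem is used to position the BTREE cursor and the regexp
# is used to test candidate keys. For example, glob_match('kin*se') returns
# ('kin', 'kin.*se'); terms without wildcards return an empty list.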
sub glob_match {
  my $self = shift;
  my $term = shift;
  return unless $term =~ /([^*?]*)(?:^|[^\\])?[*?]/;
  my $stem = $1;
  $term =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
  $term =~ s/(^|[^\\])\*/$1.*/g;
  $term =~ s/(^|[^\\])\?/$1./g;
  return ($stem,$term);
}
sub update_filter {
  my $self = shift;
  my ($filter,$results) = @_;
  return unless @$results;

  if (%$filter) {
    my @filtered = grep {$filter->{$_}} @$results;
    %$filter     = map {$_=>1} @filtered;
  } else {
    %$filter     = map {$_=>1} @$results;
  }
}

sub types {
  my $self = shift;
  eval "require Bio::DB::GFF::Typename"
    unless Bio::DB::GFF::Typename->can('new');

  my $index = $self->index_db('types');
  my $db    = tied(%$index);
  return map {Bio::DB::GFF::Typename->new($_)} keys %$index;
}
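
# Raw DNA handling: insert_sequence() appends FASTA records to a private
# sequence.fa file in the index directory, and commit() (or a bulk load)
# reindexes it with Bio::DB::Fasta::Subdir so that fetch_sequence() can
# extract subsequences.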
# this is ugly
sub _insert_sequence {
  my $self = shift;
  my ($seqid,$seq,$offset) = @_;
  my $dna_fh = $self->private_fasta_file or return;
  if ($offset == 0) { # start of the sequence
    print $dna_fh ">$seqid\n";
  }
  print $dna_fh $seq,"\n";
}

sub _fetch_sequence {
  my $self = shift;
  my ($seqid,$start,$end) = @_;
  my $db = $self->dna_db or return;
  return $db->seq($seqid,$start,$end);
}

sub private_fasta_file {
  my $self = shift;
  return $self->{fasta_fh} if exists $self->{fasta_fh};
  $self->{fasta_file} = $self->_qualify("sequence.fa");
  return $self->{fasta_fh} = IO::File->new($self->{fasta_file},">");
}

sub finish_bulk_update {
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->{fasta_db} = Bio::DB::Fasta::Subdir->new($self->{fasta_file});
  }
}

sub version {
  my $self = shift;
  my $db   = $self->db;
  return $db->{'.version'} || 1.00;
}
sub DESTROY {
  my $self = shift;
  $self->_close_databases();
  rmtree($self->directory,0,1) if $self->temporary;
}

# TIE interface -- a little annoying because we are storing magic ".variable"
# meta-variables in the same data structure as the IDs, so these variables
# must be skipped.
sub _firstid {
  my $self = shift;
  my $db   = $self->db;
  my ($key,$value);
  while ( ($key,$value) = each %{$db}) {
    last unless $key =~ /^\./;
  }
  $key;
}

sub _nextid {
  my $self = shift;
  my $id   = shift;
  my $db   = $self->db;
  my ($key,$value);
  while ( ($key,$value) = each %$db) {
    last unless $key =~ /^\./;
  }
  $key;
}

sub _existsid {
  my $self = shift;
  my $id   = shift;
  return exists $self->db->{$id};
}

sub _deleteid {
  my $self = shift;
  my $id   = shift;
  my $obj  = $self->fetch($id) or return;
  $self->_delete_indexes($obj,$id);
  delete $self->db->{$id};
}

sub _clearall {
  my $self = shift;
  $self->_close_databases();
  $self->_delete_databases();
  my ($write,$create) = $self->_permissions;
  $self->_open_databases($write,$create);
}

sub _featurecount {
  my $self = shift;
  return scalar %{$self->db};
}
package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;

sub new {
  my $class = shift;
  my $store = shift;
  my $ids   = shift;
  return bless {store => $store,
                ids   => $ids},ref($class) || $class;
}

sub next_seq {
  my $self  = shift;
  my $store = $self->{store} or return;
  my $id    = shift @{$self->{ids}};
  defined $id or return;
  return $store->fetch($id);
}
package Bio::DB::Fasta::Subdir;

use base 'Bio::DB::Fasta';

# alter calling arguments so that the fasta file is placed in a subdirectory
# named "indexes"

sub index_name {
  my $self = shift;
  my ($path,$isdir) = @_;
  return $self->SUPER::index_name($path,$isdir)
    unless $isdir;
  return File::Spec->catfile($path,'indexes','fasta.index');
}

1;

__END__
=head1 BUGS

This is an early version, so there are certainly some bugs. Please
use the BioPerl bug tracking system to report bugs.

=head1 SEE ALSO

L<bioperl>,
L<Bio::DB::SeqFeature>,
L<Bio::DB::SeqFeature::Store>,
L<Bio::DB::SeqFeature::Store::GFF3Loader>,
L<Bio::DB::SeqFeature::Segment>,
L<Bio::DB::SeqFeature::Store::memory>,
L<Bio::DB::SeqFeature::Store::DBI::mysql>

=head1 AUTHOR

Lincoln Stein E<lt>lstein@cshl.orgE<gt>.

Copyright (c) 2006 Cold Spring Harbor Laboratory.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut