tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / SeqFeature / Store / berkeleydb.pm
blob7014532a73e3d431308904af026bc161307ae838
1 package Bio::DB::SeqFeature::Store::berkeleydb;
3 # $Id$
5 use strict;
6 use base 'Bio::DB::SeqFeature::Store';
7 use Bio::DB::GFF::Util::Rearrange 'rearrange';
8 use DB_File;
9 use Fcntl qw(O_RDWR O_CREAT :flock);
10 use IO::File;
11 use File::Temp 'tempdir';
12 use File::Path 'rmtree','mkpath';
13 use File::Basename;
14 use File::Spec;
15 use Carp 'carp','croak';
17 use constant BINSIZE => 10_000;
18 use constant MININT => -999_999_999_999;
19 use constant MAXINT => 999_999_999_999;
21 =head1 NAME
23 Bio::DB::SeqFeature::Store::berkeleydb -- Storage and retrieval of sequence annotation data in Berkeleydb files
25 =head1 SYNOPSIS
27 use Bio::DB::SeqFeature::Store;
29 # Create a database from the feature files located in /home/fly4.3 and store
30 # the database index in the same directory:
31 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
32 -dir => '/home/fly4.3');
34 # Create a database that will monitor the files in /home/fly4.3, but store
35 # the indexes in /var/databases/fly4.3
36 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
37 -dsn => '/var/databases/fly4.3',
38 -dir => '/home/fly4.3');
40 # Create a feature database from scratch
41 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
42 -dsn => '/var/databases/fly4.3',
43 -create => 1);
45 # get a feature from somewhere
46 my $feature = Bio::SeqFeature::Generic->new(...);
48 # store it
49 $db->store($feature) or die "Couldn't store!";
51 # primary ID of the feature is changed to indicate its primary ID
52 # in the database...
53 my $id = $feature->primary_id;
55 # get the feature back out
56 my $f = $db->fetch($id);
58 # change the feature and update it
59 $f->start(100);
60 $db->update($f) or $self->throw("Couldn't update!");
62 # use the GFF3 loader to do a bulk write:
63 my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store => $db,
64 -verbose => 0,
65 -fast => 1);
66 $loader->load('/home/fly4.3/dmel-all.gff');
69 # searching...
70 # ...by id
71 my @features = $db->fetch_many(@list_of_ids);
73 # ...by name
74 @features = $db->get_features_by_name('ZK909');
76 # ...by alias
77 @features = $db->get_features_by_alias('sma-3');
79 # ...by type
80 @features = $db->get_features_by_type('gene');
82 # ...by location
83 @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);
85 # ...by attribute
86 @features = $db->get_features_by_attribute({description => 'protein kinase'})
88 # ...by the GFF "Note" field
89 @result_list = $db->search_notes('kinase');
91 # ...by arbitrary combinations of selectors
92 @features = $db->features(-name => $name,
93 -type => $types,
94 -seq_id => $seqid,
95 -start => $start,
96 -end => $end,
97 -attributes => $attributes);
99 # ...using an iterator
100 my $iterator = $db->get_seq_stream(-name => $name,
101 -type => $types,
102 -seq_id => $seqid,
103 -start => $start,
104 -end => $end,
105 -attributes => $attributes);
107 while (my $feature = $iterator->next_seq) {
108 # do something with the feature
109 }
111 # ...limiting the search to a particular region
112 my $segment = $db->segment('Chr1',5000=>6000);
113 my @features = $segment->features(-type=>['mRNA','match']);
115 # what feature types are defined in the database?
116 my @types = $db->types;
118 # getting & storing sequence information
119 # Warning: this returns a string, and not a PrimarySeq object
120 $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
121 my $sequence = $db->fetch_sequence('Chr1',5000=>6000);
123 # create a new feature in the database
124 my $feature = $db->new_feature(-primary_tag => 'mRNA',
125 -seq_id => 'chr3',
126 -start => 10000,
127 -end => 11000);
129 =head1 DESCRIPTION
131 Bio::DB::SeqFeature::Store::berkeleydb is the Berkeleydb adaptor for
132 Bio::DB::SeqFeature::Store. You will not create it directly, but
133 instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.
135 See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
137 =head2 Using the berkeleydb adaptor
139 The Berkeley database consists of a series of Berkeleydb index files,
140 and a couple of special purpose indexes. You can create the index
141 files from scratch by creating a new database and calling
142 new_feature() repeatedly, you can create the database and then bulk
143 populate it using the GFF3 loader, or you can monitor a directory of
144 preexisting GFF3 and FASTA files and rebuild the indexes whenever one
145 or more of the fields changes. The last mode is probably the most
146 convenient. Note that the indexer will only pay attention to files
147 that end with .gff3, .wig and .fa.
149 =over 4
151 =item The new() constructor
153 The new() constructor method accepts all the arguments recognized by
154 Bio::DB::SeqFeature::Store, and a few additional ones.
156 Standard arguments:
158 Name Value
159 ---- -----
161 -adaptor The name of the Adaptor class (default DBI::mysql)
163 -serializer The name of the serializer class (default Storable)
165 -index_subfeatures Whether or not to make subfeatures searchable
166 (default true)
168 -cache Activate LRU caching feature -- size of cache
170 -compress Compresses features before storing them in database
171 using Compress::Zlib
173 Adaptor-specific arguments
175 Name Value
176 ---- -----
178 -dsn Where the index files are stored
180 -dir Where the source (GFF3, FASTA) files are stored
182 -autoindex An alias for -dir.
184 -write Pass true to open the index files for writing.
186 -create Pass true to create the index files if they don't exist
187 (implies -write=>1)
189 -locking Use advisory locking to avoid one process trying to read
190 from the database while another is updating it (may not
191 work properly over NFS).
193 -temp Pass true to create temporary index files that will
194 be deleted once the script exits.
196 -verbose Pass true to report autoindexing operations on STDERR.
197 (default is true).
199 Examples:
201 To create an empty database which will be populated using calls to
202 store() or new_feature(), or which will be bulk-loaded using the GFF3
203 loader:
205 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
206 -dsn => '/var/databases/fly4.3',
207 -create => 1);
209 To open a preexisting database in read-only mode:
211 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
212 -dsn => '/var/databases/fly4.3');
214 To open a preexisting database in read/write (update) mode:
216 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
217 -dsn => '/var/databases/fly4.3',
218 -write => 1);
220 To monitor a set of GFF3 and FASTA files located in a directory and
221 create/update the database indexes as needed. The indexes will be
222 stored in a new subdirectory named "indexes":
224 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
225 -dir => '/var/databases/fly4.3');
227 As above, but store the source files and index files in separate directories:
229 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
230 -dsn => '/var/databases/fly4.3',
231 -dir => '/home/gff3_files/fly4.3');
233 To be indexed, files must end with one of .gff3 (GFF3 format), .fa
234 (FASTA format) or .wig (WIG format).
236 B<-autoindex> is an alias for B<-dir>.
238 You should specify B<-locking> in a multiuser environment, including
239 the case in which the database is being used by a web server at the
240 same time another user might be updating it.
242 =back
244 See L<Bio::DB::SeqFeature::Store> for all the access methods supported
245 by this adaptor. The various methods for storing and updating features
246 and sequences into the database are supported, but there is no
247 locking unless the B<-locking> option is enabled. If two processes try
248 to update the same database simultaneously without locking, the
249 database will likely become corrupted.
250 =cut
# Object initialization: parse the adaptor-specific arguments, choose
# and (if necessary) create the index directory — possibly a temporary
# one — acquire a shared lock, and open the component databases.
sub init {
  my $self = shift;
  my ($directory,
      $autoindex,
      $is_temporary,
      $write,
      $create,
      $verbose,
      $locking,
      ) = rearrange([['DSN','DB'],
                     [qw(DIR AUTOINDEX)],
                     ['TMP','TEMP','TEMPORARY'],
                     [qw(WRITE WRITABLE)],
                     'CREATE',
                     'VERBOSE',
                     [qw(LOCK LOCKING)],
                    ],@_);
  $verbose = 1 unless defined $verbose;   # autoindexing reports by default

  if ($autoindex) {
    -d $autoindex or $self->throw("Invalid directory $autoindex");
    # by default the indexes live in a subdirectory of the monitored dir
    $directory ||= "$autoindex/indexes";
  }
  $directory ||= $is_temporary ? File::Spec->tmpdir : '.';

  # package name is used in the temp directory template; colons are not
  # legal in Windows paths
  my $pacname = __PACKAGE__;
  if ($^O =~ /mswin/i) {
    $pacname =~ s/:+/_/g;
  }
  $directory = tempdir($pacname.'_XXXXXX',
                       TMPDIR  => 1,
                       CLEANUP => 1,
                       DIR     => $directory) if $is_temporary;
  mkpath($directory);
  -d $directory or $self->throw("Invalid directory $directory");

  $create++ if $is_temporary;   # a temporary database is always created fresh
  $write ||= $create;           # -create implies -write
  $self->throw("Can't write into the directory $directory")
    if $write && !-w $directory;

  $self->default_settings;
  $self->directory($directory);
  $self->temporary($is_temporary);
  $self->verbose($verbose);
  $self->locking($locking);
  $self->_delete_databases() if $create;
  if ($autoindex && -d $autoindex) {
    $self->auto_reindex($autoindex);
  }
  $self->lock('shared');

  # this step may rebless $self into a subclass
  # to preserve backward compatibility with older
  # databases while providing better performance for
  # new databases.
  $self->possibly_rebless($create);

  $self->_open_databases($write,$create,$autoindex);
  $self->_permissions($write,$create);
  return $self;
}
320 sub version { return 2.0 };
# Decide whether to rebless $self into the berkeleydb3 subclass.
# A freshly created database always uses the newest format; an existing
# database is probed for the '.version' key stored in its main hash.
sub possibly_rebless {
  my $self   = shift;
  my $create = shift;
  my $do_rebless;

  if ($create) {
    $do_rebless++;
  } else { # probe database
    my %h;
    tie (%h,'DB_File',$self->_features_path,O_RDONLY,0666,$DB_HASH) or return;
    $do_rebless = $h{'.version'} >= 3.0;
  }

  if ($do_rebless) {
    eval "require Bio::DB::SeqFeature::Store::berkeleydb3";
    bless $self,'Bio::DB::SeqFeature::Store::berkeleydb3';
  }
}
342 sub can_store_parentage { 1 }
# Check whether any monitored source files changed and, if so, rebuild
# the affected indexes while holding an exclusive lock. In either case
# the DNA database is (re)attached to the monitored directory.
sub auto_reindex {
  my $self    = shift;
  my $autodir = shift;
  my $result  = $self->needs_auto_reindexing($autodir);

  if ($result && %$result) {
    $self->flag_autoindexing(1);
    $self->lock('exclusive');
    $self->reindex_wigfiles($result->{wig},$autodir) if $result->{wig};
    $self->reindex_ffffiles($result->{fff},$autodir) if $result->{fff};
    $self->reindex_gfffiles($result->{gff},$autodir) if $result->{gff};
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
    $self->unlock;
    $self->flag_autoindexing(0);
  }
  else {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
  }
}
# Path of the PID file that marks an autoindexing run in progress.
sub autoindex_flagfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'autoindex.pid');
}
# Return true if another live process appears to be autoindexing this
# database. If the recorded process has died, remove the stale PID file
# and wipe the index directory so indexes get rebuilt from scratch;
# croak if the stale state cannot be repaired.
sub auto_index_in_process {
  my $self = shift;
  my $flag_file = $self->autoindex_flagfile;
  return unless -e $flag_file;

  # if flagfile exists, then check that PID still exists
  # BUG FIX: use three-argument open with an explicit mode; the original
  # two-argument form would honor mode characters embedded in the path.
  open my $fh, '<', $flag_file or die "Couldn't open $flag_file: $!";
  my $pid = <$fh>;
  close $fh;
  chomp $pid if defined $pid;           # tolerate a trailing newline
  return 1 if $pid && kill 0 => $pid;   # guard against an empty/garbled file

  warn "Autoindexing seems to be running in another process, but the process has gone away. Trying to override...";
  if (unlink $flag_file) {
    warn "Successfully removed stale PID file." if $self->verbose;
    warn "Assuming partial reindexing process. Rebuilding indexes from scratch..." if $self->verbose;
    my $glob = File::Spec->catfile($self->directory,'*');
    unlink glob($glob);
    return;
  } else {
    # croak() never returns, so no trailing return is needed here
    croak ("Cannot recover from apparent aborted autoindex process. Please remove files in ",
           $self->directory,
           " and allow the adaptor to reindex");
  }
}
# Raise (true argument) or clear (false argument) the autoindexing flag
# by writing our PID into, or unlinking, the flag file.
sub flag_autoindexing {
  my ($self,$raise) = @_;
  my $flag_file = $self->autoindex_flagfile;
  unless ($raise) {
    unlink $flag_file;
    return;
  }
  open my $fh,'>',$flag_file or die "Couldn't open $flag_file: $!";
  print $fh $$;
  close $fh;
}
# Rebuild the feature indexes from a list of GFF3 files using the bulk
# GFF3 loader, then refresh the timestamp file.
sub reindex_gfffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing GFF files...\n" if $self->verbose;

  # if the feature database already exists, reopen it rather than
  # recreating it from scratch
  my $exists = -e $self->_features_path;

  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,!$exists);
  require Bio::DB::SeqFeature::Store::GFF3Loader
    unless Bio::DB::SeqFeature::Store::GFF3Loader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store    => $self,
                                                           -sf_class => $self->seqfeature_class,
                                                           -verbose  => $self->verbose,
                                                          )
    or $self->throw("Couldn't create GFF3Loader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);  # load each file only once
  $self->_touch_timestamp;
}
# Rebuild the feature indexes from a list of FFF (feature file format)
# files, then refresh the timestamp file.
sub reindex_ffffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing FFF files...\n" if $self->verbose;
  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,1);
  require Bio::DB::SeqFeature::Store::FeatureFileLoader
    unless Bio::DB::SeqFeature::Store::FeatureFileLoader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::FeatureFileLoader->new(-store    => $self,
                                                                  -sf_class => $self->seqfeature_class,
                                                                  -verbose  => $self->verbose,
                                                                 )
    or $self->throw("Couldn't create FeatureFileLoader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);  # load each file only once
  $self->_touch_timestamp;
}
# Reindex wiggle (.wig) files: each is converted to one or more binary
# .wib files in the index directory, plus a GFF3 file and a
# configuration stanza written back into $autodir.
sub reindex_wigfiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing wig files...\n" if $self->verbose;

  unless (Bio::Graphics::Wiggle::Loader->can('new')) {
    eval "require Bio::Graphics::Wiggle::Loader; 1"
      or return;  # optional dependency: silently skip wig indexing
  }

  for my $wig (@$files) {
    warn "Reindexing $wig...\n" if $self->verbose;
    my ($wib_name) = fileparse($wig,qr/\.[^.]*/);
    my $gff3_name  = "$wib_name.gff3";

    # unlink all wib files that share the basename
    my $wib_glob = File::Spec->catfile($self->directory,"$wib_name*.wib");
    unlink glob($wib_glob);

    my $loader = Bio::Graphics::Wiggle::Loader->new($self->directory,$wib_name);
    my $fh     = IO::File->new($wig) or die "Can't open $wig: $!";
    $loader->load($fh);  # will create one or more .wib files
    $fh->close;

    my $gff3_data = $loader->featurefile('gff3','microarray_oligo',$wib_name);
    my $gff3_path = File::Spec->catfile($autodir,$gff3_name);
    $fh = IO::File->new($gff3_path,'>')
      or die "Can't open $gff3_path for writing: $!";
    $fh->print($gff3_data);
    $fh->close;

    my $conf_path = File::Spec->catfile($autodir,"$wib_name.conf");
    # BUG FIX: this open was unchecked in the original; a failure would
    # cause a crash on the ->print call below instead of a clear error.
    $fh = IO::File->new($conf_path,'>')
      or die "Can't open $conf_path for writing: $!";
    $fh->print($loader->conf_stanzas('microarray_oligo',$wib_name));
    $fh->close;
  }
}
# returns the following hashref
#   empty hash if nothing needs reindexing
#   {fasta => 1}                  if DNA database needs reindexing
#   {gff => [list,of,gff,paths]}  if gff3 files need reindexing
#   {fff => [list,of,fff,paths]}  if fff files need reindexing
#   {wig => [list,of,wig,paths]}  if wig files need reindexing
sub needs_auto_reindexing {
  my $self    = shift;
  my $autodir = shift;
  my $result  = {};

  # don't allow two processes to reindex simultaneously
  $self->auto_index_in_process and croak "Autoindexing in process. Try again later";

  # first interrogate the GFF3 files, using the timestamp file
  # as modification comparison
  my (@gff3,@fff,@wig,$fasta,$fasta_index_time);
  opendir (my $D,$autodir)
    or $self->throw("Couldn't open directory $autodir for reading: $!");

  my $maxtime        = 0;
  my $timestamp_time = _mtime($self->_mtime_path) || 0;
  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = File::Spec->catfile($autodir,$node);
    next unless -f $path;   # this -f also primes the "_" stat cache used below

    if ($path =~ /\.gff\d?$/i) {
      my $mtime = _mtime(\*_); # not a typo: reuses the stat from -f above
      $maxtime  = $mtime if $mtime > $maxtime;
      push @gff3,$path;
    }

    elsif ($path =~ /\.fff?$/i) {
      my $mtime = _mtime(\*_); # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @fff,$path;
    }

    elsif ($path =~ /\.wig$/i) {
      my $wig = $path;
      (my $gff_file = $wig) =~ s/\.wig$/\.gff3/i;
      # skip wig files that already have an up-to-date gff3 conversion
      next if -e $gff_file && _mtime($gff_file) > _mtime($path);
      push @wig,$wig;
      push @gff3,$gff_file;
      $maxtime = time();   # force the gff list to be considered stale
    }

    elsif ($path =~ /\.(fa|fasta|dna)$/i) {
      $fasta_index_time =
        _mtime(File::Spec->catfile($self->directory,'fasta.index'))||0
          unless defined $fasta_index_time;
      $fasta++ if _mtime($path) > $fasta_index_time;
    }
  }
  closedir $D;

  $result->{gff} = \@gff3 if $maxtime > $timestamp_time;
  $result->{wig} = \@wig  if @wig;
  $result->{fff} = \@fff  if @fff;
  $result->{fasta}++ if $fasta;
  return $result;
}
# Accessor for the verbosity flag. When called with an argument, sets
# the flag and returns the previous value.
sub verbose {
  my $self = shift;
  my $previous = $self->{verbose};
  $self->{verbose} = shift if @_;
  return $previous;
}
# Accessor for the advisory-locking flag. When called with an argument,
# sets the flag and returns the previous value.
sub locking {
  my $self = shift;
  my $previous = $self->{locking};
  $self->{locking} = shift if @_;
  return $previous;
}
# Full path of the advisory lock file inside the index directory.
sub lockfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'lock');
}
# Acquire a 'shared' or 'exclusive' advisory flock on the lock file.
# No-op unless the -locking option was enabled. The filehandle is
# cached so a later lock() call upgrades/downgrades the same lock.
sub lock {
  my $self = shift;
  my $mode = shift;
  return unless $self->locking;

  my $flag     = $mode eq 'exclusive' ? LOCK_EX : LOCK_SH;
  my $lockfile = $self->lockfile;
  my $fh       = $self->_flock_fh;
  unless ($fh) {
    # open for read if the lock file already exists, otherwise create it
    my $open = -e $lockfile ? '<' : '>';
    $fh = IO::File->new($lockfile,$open) or die "Cannot open $lockfile: $!";
  }
  flock($fh,$flag);
  $self->_flock_fh($fh);
}
# Release the advisory lock, if one is held. No-op unless -locking.
sub unlock {
  my $self = shift;
  return unless $self->locking;

  my $fh = $self->_flock_fh or return;
  flock($fh,LOCK_UN);
  undef $self->{flock_fh};  # drop the cached handle
}
# Accessor for the cached flock() filehandle; returns the previous
# value when setting.
sub _flock_fh {
  my $self = shift;
  my $previous = $self->{flock_fh};
  $self->{flock_fh} = shift if @_;
  return $previous;
}
# Open (and optionally create) all of the component databases: the main
# feature hash, the secondary indexes, the parentage database, the
# notes flat file, and the DNA database if a sequence file exists.
# $ignore_errors is set during autoindexing, when a tie failure on the
# main database is deferred rather than fatal.
sub _open_databases {
  my $self = shift;
  my ($write,$create,$ignore_errors) = @_;
  return if $self->db;  # already open - don't reopen

  my $directory = $self->directory;
  unless (-d $directory) {  # directory does not exist
    $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
    mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
  }

  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  # Create the main database; this is a DB_HASH implementation
  my %h;
  my $result = tie (%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH);

  unless ($result) {
    return if $ignore_errors;  # autoindex set, so defer this
    $self->throw("Couldn't tie: ".$self->_features_path . " $!");
  }
  if ($create) {
    %h = ();
    $h{'.next_id'} = 1;               # primary-ID counter
    $h{'.version'} = $self->version;  # schema version for possibly_rebless()
  }
  $self->db(\%h);

  $self->open_index_dbs($flags,$create);
  $self->open_parentage_db($flags,$create);
  $self->open_notes_db($write,$create);
  $self->open_seq_db($flags,$create) if -e $self->_fasta_path;
}
# Open the secondary index databases (names, types, locations,
# attributes). These are DB_BTREE implementations with duplicate keys
# allowed and case-insensitive key comparison.
sub open_index_dbs {
  my $self = shift;
  my ($flags,$create) = @_;

  $DB_BTREE->{flags}   = R_DUP;
  $DB_BTREE->{compare} = sub { lc($_[0]) cmp lc($_[1]) };
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    my %db;
    tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE)
      or $self->throw("Couldn't tie $path: $!");
    %db = () if $create;  # start from an empty index
    $self->index_db($idx=>\%db);
  }
}
# Open the parent-to-child relationship database (a DB_BTREE with
# duplicates, keyed by parent primary ID).
sub open_parentage_db {
  my $self = shift;
  my ($flags,$create) = @_;

  my %p;
  tie (%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
  %p = () if $create;
  $self->parentage_db(\%p);
}
# Open the flat-file notes index: append mode for updates, truncate on
# creation, plain read otherwise.
sub open_notes_db {
  my $self = shift;
  my ($write,$create) = @_;

  my $mode = $write  ? "+>>"
           : $create ? "+>"
           : "<";

  open (my $F,$mode,$self->_notes_path) or $self->throw($self->_notes_path.": $!");
  $self->notes_db($F);
}
# Attach a Bio::DB::Fasta::Subdir object for the sequence file, if it
# exists.
sub open_seq_db {
  my $self = shift;
  if (-e $self->_fasta_path) {
    my $dna_db = Bio::DB::Fasta::Subdir->new($self->_fasta_path)
      or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }
}
# Commit after a bulk load: close any pending fasta filehandle and
# (re)index the fasta data so it is available through dna_db().
sub commit { # reindex fasta files
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->{fasta_file}));
  } elsif (-d $self->directory) {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->directory));
  }
}
# Drop every database handle (main, DNA, notes and all secondary
# indexes) so the underlying files are untied and closed.
sub _close_databases {
  my $self = shift;
  $self->db(undef);
  $self->dna_db(undef);
  $self->notes_db(undef);
  $self->index_db($_ => undef) for $self->_index_files;
}
# Intentionally a no-op: calling new() with -create=>1 performs all of
# the initialization this hook would otherwise do.
sub _init_database { }
# Remove every index and database file, in preparation for -create.
sub _delete_databases {
  my $self = shift;
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    unlink $path;
  }
  unlink $self->_parentage_path;
  unlink $self->_fasta_path;
  unlink $self->_features_path;
  unlink $self->_mtime_path;
}
# Refresh the timestamp file used by needs_auto_reindexing() to detect
# stale indexes.
# BUG FIX: the original used a bareword filehandle with a two-argument
# open and never checked close(); use a lexical handle, three-argument
# open, and report buffered write failures at close time.
sub _touch_timestamp {
  my $self = shift;
  my $tsf  = $self->_mtime_path;
  open (my $fh,'>',$tsf) or $self->throw("Couldn't open $tsf: $!");
  print $fh scalar(localtime);
  close $fh or $self->throw("Couldn't write $tsf: $!");
}
# Serialize and store a list of objects, allocating primary IDs from
# the '.next_id' counter as needed. The first argument says whether to
# maintain the search indexes. Returns the number of objects stored.
sub _store {
  my $self    = shift;
  my $indexed = shift;
  my $db      = $self->db;
  my $count   = 0;
  for my $obj (@_) {
    my $primary_id = $obj->primary_id;
    # an already-stored object must have its old index entries removed
    $self->_delete_indexes($obj,$primary_id) if $indexed && $primary_id;
    $primary_id = $db->{'.next_id'}++ unless defined $primary_id;
    $db->{$primary_id} = $self->freeze($obj);
    $obj->primary_id($primary_id);   # write the assigned ID back into the object
    $self->_update_indexes($obj) if $indexed;
    $count++;
  }
  $count;
}
# Remove an object's entries from every searchable index. The trailing
# true argument tells each updater to delete rather than add.
sub _delete_indexes {
  my ($self,$obj,$id) = @_;
  $self->$_($obj,$id,1)
    for qw(_update_name_index _update_type_index _update_location_index
           _update_attribute_index _update_note_index);
}
# Retrieve and deserialize the object stored under the given primary ID.
sub _fetch {
  my ($self,$id) = @_;
  return $self->thaw($self->db->{$id},$id);
}
# Record parent/child relationships in the parentage database. Accepts
# a parent (object or primary ID) and a list of children (objects or
# primary IDs).
sub _add_SeqFeature {
  my $self     = shift;
  my $parent   = shift;
  my @children = @_;
  my $parent_id = (ref $parent ? $parent->primary_id : $parent)
    or $self->throw("$parent should have a primary_id");
  my $p = $self->parentage_db;
  for my $child (@children) {
    my $child_id = ref $child ? $child->primary_id : $child;
    defined $child_id or $self->throw("no primary ID known for $child");
    # the parentage btree allows duplicates, so each assignment adds
    # another child entry for this parent rather than replacing it
    $p->{$parent_id} = $child_id;
  }
}
# Fetch the children of $parent from the parentage database, optionally
# restricted to a list of feature types.
sub _fetch_SeqFeatures {
  my $self   = shift;
  my $parent = shift;
  my @types  = @_;
  my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
  my $index = $self->parentage_db;
  my $db    = tied %$index;  # need the DB_File object for get_dup()

  my @children_ids = $db->get_dup($parent_id);
  my @children     = map {$self->fetch($_)} @children_ids;

  if (@types) {
    # match "primary_tag:source_tag" against the expanded type list
    my $regexp = join '|',map {quotemeta($_)} $self->find_types(@types);
    return grep {($_->primary_tag.':'.$_->source_tag) =~ /^$regexp$/i} @children;
  } else {
    return @children;
  }
}
# Add an object to all searchable indexes. A no-op for objects that
# have not yet been assigned a primary ID.
sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id = $obj->primary_id) or return;
  $self->_update_name_index($obj,$id);
  $self->_update_type_index($obj,$id);
  $self->_update_location_index($obj,$id);
  $self->_update_attribute_index($obj,$id);
  $self->_update_note_index($obj,$id);
}
# Index an object under each of its names and aliases (keys are
# lowercased; alias keys carry a "_2" suffix marking them secondary).
sub _update_name_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
  my ($names,$aliases) = $self->feature_names($obj);

  # little stinky - needs minor refactoring
  foreach (@$names) {
    my $key = lc $_;
    $self->update_or_delete($delete,$db,$key,$id);
  }

  foreach (@$aliases) {
    my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
    $self->update_or_delete($delete,$db,$key,$id);
  }
}
# Index an object under the lowercased "primary_tag:source_tag" key.
sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");
  my $primary_tag = $obj->primary_tag;
  my $source_tag  = $obj->source_tag || '';
  return unless defined $primary_tag;

  $primary_tag .= ":$source_tag";
  my $key = lc $primary_tag;
  $self->update_or_delete($delete,$db,$key,$id);
}
# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id  = $obj->seq_id || '';
  my $start   = $obj->start  || '';
  my $end     = $obj->end    || '';
  my $strand  = $obj->strand;
  my $bin_min = int $start/BINSIZE;
  my $bin_max = int $end/BINSIZE;

  # one entry per overlapping bin, keyed "seqid.bin"; the value packs
  # (id,start,end,strand) as four signed ints
  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = sprintf("%s.%06d",lc($seq_id),$bin);
    $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
  }
}
# Index an object under each of its "tag:value" attribute pairs.
sub _update_attribute_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('attributes')
    or $self->throw("Couldn't find 'attributes' index file");

  for my $tag ($obj->get_all_tags) {
    for my $value ($obj->get_tag_values($tag)) {
      my $key = "${tag}:${value}";
      $self->update_or_delete($delete,$db,$key,$id);
    }
  }
}
# Append an object's Note values to the flat-file notes index, each
# line holding the note text, a tab, and the uuencoded primary ID.
# Deletion is not supported for this index.
sub _update_note_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  return if $delete; # we don't know how to do this

  my $fh = $self->notes_db;
  # BUG FIX: "my @notes = ... if $cond;" is a conditional my-declaration,
  # whose behavior is explicitly undefined in Perl; declare the array
  # unconditionally and assign via the ternary instead.
  my @notes = $obj->has_tag('Note') ? $obj->get_tag_values('Note') : ();

  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
    foreach @notes;
}
# Either delete the (key,id) duplicate entry from a tied btree index or
# add it. With R_DUP set, assignment adds an entry rather than
# replacing existing ones.
sub update_or_delete {
  my $self = shift;
  my ($delete,$db,$key,$id) = @_;
  if ($delete) {
    tied(%$db)->del_dup($key,$id);
  } else {
    $db->{$key} = $id;
  }
}
# these methods return pointers to....
# the database that stores serialized objects; returns the previous
# value when setting
sub db {
  my $self = shift;
  my $current = $self->setting('db');
  $self->setting(db => shift) if @_;
  return $current;
}
# Accessor for the parent/child relationship database handle.
sub parentage_db {
  my $self = shift;
  my $current = $self->setting('parentage_db');
  $self->setting(parentage_db => shift) if @_;
  return $current;
}
# Accessor for the Bio::DB::Fasta object holding sequence data.
sub dna_db {
  my $self = shift;
  my $current = $self->setting('dna_db');
  $self->setting(dna_db => shift) if @_;
  return $current;
}
# Accessor for the specialized flat-file notes database handle.
sub notes_db {
  my $self = shift;
  my $current = $self->setting('notes_db');
  $self->setting(notes_db => shift) if @_;
  return $current;
}
# Accessor for the named secondary index berkeley db
# ('names', 'types', 'locations' or 'attributes').
sub index_db {
  my $self       = shift;
  my $index_name = shift;
  my $current    = $self->setting($index_name);
  $self->setting($index_name => shift) if @_;
  return $current;
}
# Modification time of a file, given either a path or a filehandle/glob
# (the \*_ stat-cache trick relies on the latter).
sub _mtime {
  my $file = shift;
  return (stat($file))[9];
}
# return names of all the secondary indexes maintained by this adaptor
sub _index_files {
  return qw(names types locations attributes);
}
# the directory in which we store our indexes
sub directory {
  my $self = shift;
  my $current = $self->setting('directory');
  $self->setting(directory => shift) if @_;
  return $current;
}
# flag indicating that we are a temporary database
sub temporary {
  my $self = shift;
  my $current = $self->setting('temporary');
  $self->setting(temporary => shift) if @_;
  return $current;
}
# Get or set the cached (write,create) permission flags. Returns the
# previously stored pair as a two-element list, or nothing if the
# flags were never set.
sub _permissions {
  my $self = shift;
  my $d = $self->setting('permissions') or return;
  if (@_) {
    my ($write,$create) = @_;
    $self->setting(permissions=>[$write,$create]);
  }
  @$d;
}
# file name utilities...
# Turn a bare filename into a full path inside the index directory.
sub _qualify {
  my ($self,$file) = @_;
  return $self->directory . '/' . $file;
}
# Path to the main serialized-feature database.
sub _features_path {
  return shift->_qualify('features.bdb');
}
# Path to the parent/child relationship database.
sub _parentage_path {
  return shift->_qualify('parentage.bdb');
}
# Path to the type index.
sub _type_path {
  return shift->_qualify('types.idx');
}
# Path to the location index.
sub _location_path {
  return shift->_qualify('locations.idx');
}
# Path to the attribute index.
sub _attribute_path {
  return shift->_qualify('attributes.idx');
}
# Path to the flat-file notes index.
sub _notes_path {
  return shift->_qualify('notes.idx');
}
# Path to the sequence (FASTA) file.
sub _fasta_path {
  return shift->_qualify('sequence.fa');
}
# Path to the timestamp file used for staleness checks.
sub _mtime_path {
  return shift->_qualify('mtime.stamp');
}
1023 ###########################################
1024 # searching
1025 ###########################################
# Core query engine: combine name, location, type and attribute filters
# (each narrowing the shared %found set) and return matching features,
# either as a list or wrapped in an iterator.
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                    'NAME','CLASS','ALIASES',
                    ['TYPES','TYPE','PRIMARY_TAG'],
                    ['ATTRIBUTES','ATTRIBUTE'],
                    'RANGE_TYPE',
                    'ITERATOR',
                   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my @result;
  # with no selectors at all, return everything (skip internal ".x" keys)
  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
    @result = grep {!/^\./} keys %{$self->db};
  }

  my %found  = ();
  my $result = 1;   # goes false as soon as any filter yields no matches

  if (defined($name)) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) {
    $result &&= $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
  }

  if (defined $types) {
    $result &&= $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $result &&= $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found if $result;
  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}
# Narrow %$filter to features whose name (or alias, if allowed) matches
# $name; shell-style globs are supported via glob_match().
sub filter_by_name {
  my $self = shift;
  my ($name,$allow_aliases,$filter) = @_;

  my $index = $self->index_db('names');
  my $db    = tied(%$index);

  my ($stem,$regexp) = $self->glob_match($name);
  $stem   ||= $name;   # no glob: use the literal name for both
  $regexp ||= $name;
  $regexp .= "(?:_2)?" if $allow_aliases;  # also accept alias ("_2") keys

  my $key = $stem;
  my $value;
  my @results;
  # position the btree cursor at the stem, then scan while keys match
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0 and $key =~ /^$regexp$/i;
       $status = $db->seq($key,$value,R_NEXT)) {
    next if %$filter && !$filter->{$value};  # don't bother
    push @results,$value;
  }
  $self->update_filter($filter,\@results);
}
# Narrow %$filter to features matching one of the given types, each a
# "primary_tag" or "primary_tag:source_tag" string or a
# Bio::DB::GFF::Typename object.
sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    # with no source tag, match any source for this primary tag
    my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    $source_tag ||= '';
    my $key = lc "$primary_tag:$source_tag";
    my $value;

    # If filter is already provided, then it is usually faster to
    # fetch each object.
    if (%$filter) {
      for my $id (keys %$filter) {
        my $obj = $self->_fetch($id) or next;
        push @results,$id if $obj->type =~ /$match/i;
      }
    }

    else {
      # otherwise scan the type btree starting at the key
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /$match/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value}; # don't even bother
        push @results,$value;
      }
    }
  }
  $self->update_filter($filter,\@results);
}
sub filter_by_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;
  $strand ||= 0;

  my $index = $self->index_db('locations');
  my $db    = tied(%$index);   # raw DB_File handle for cursor (seq) calls

  # Features are binned by BINSIZE. Index keys sort as "<seqid><bin>"
  # (db version 1) or "<seqid>.<bin>" (version 2), so a lexical range
  # scan over bin keys covers the requested coordinate range.
  my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
  my $binend   = defined $end   ? sprintf("%06d",int $end/BINSIZE)   : 'z'; # beyond a number

  my %seenit;   # a feature can appear in several bins; report each id once
  my @results;

  $start = MININT if !defined $start;
  $end   = MAXINT if !defined $end;
  my $version_2 = $self->db_version > 1;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $key     = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $keystop = $version_2 ? "\L$seq_id\E.$binend"   : "\L$seq_id\E$binend";
    my $value;

    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      # Each value packs (id, start, end, strand) as four native ints.
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      if ($range_type eq 'overlaps') {
        next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
        next unless $fstart >= $start && $fend <= $end;
      }
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $key     = $version_2 ? "\L$seq_id." : "\L$seq_id";
    my $keystop = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }

    # do the right part of the range
    # NOTE(review): this loop's only stop condition is end-of-database
    # ($status == 0), so it appears to walk past this seq_id's keys into
    # other sequences' bins -- confirm whether the containment test alone
    # is meant to reject those.
    $key = "\L$seq_id\E.$binend";
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }
  }

  $self->update_filter($filter,\@results);
}
# Return the distinct attribute names present in the store.  Index keys
# have the form "name:value"; everything after the first colon is stripped
# and duplicates are collapsed.
sub attributes {
  my $self = shift;
  my $attribute_index = $self->index_db('attributes');
  my %seen;
  for my $key (keys %$attribute_index) {
    (my $name = $key) =~ s/:.+$//;
    $seen{$name}++;
  }
  return keys %seen;
}
sub filter_by_attribute {
  my $self = shift;
  my ($attributes,$filter) = @_;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);   # raw DB_File handle for cursor (seq) calls

  my $result;

  for my $att_name (keys %$attributes) {
    my @result;
    # Each attribute may map to a single value or an array ref of values.
    my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
      ? @{$attributes->{$att_name}} : $attributes->{$att_name};

    for my $v (@search_terms) {
      # Glob terms ("foo*") are split into a literal stem for cursor
      # positioning and a regexp for matching; plain terms use themselves.
      my ($stem,$regexp) = $self->glob_match($v);
      $stem   ||= $v;
      $regexp ||= $v;
      my $key = "\L${att_name}:${stem}\E";   # index keys are lowercased "name:value"
      my $value;
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /^$att_name:$regexp$/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value}; # don't bother
        push @result,$value;
      }
    }
    # ||= keeps the first true return value across attribute names;
    # update_filter() itself intersects @result into %$filter.
    $result ||= $self->update_filter($filter,\@result);
  }

  $result;
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_array,$limit) = @_;
  # NOTE(review): $limit is accepted but never applied in this sub --
  # confirm whether truncation is expected to happen in the caller.

  # Strip glob wildcards, then build an alternation of the search words.
  $search_string =~ tr/*?//d;
  my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);   # raw DB_File handle for cursor (seq) calls

  my (%results,%notes);   # per-feature-id hit counts and concatenated text

  for my $tag (@$attribute_array) {
    my $id;
    my $key = "\L$tag:\E";   # position cursor at the first "tag:" key
    for (my $status = $db->seq($key,$id,R_CURSOR);
         $status == 0 and $key =~ /^$tag:(.*)/i;
         $status = $db->seq($key,$id,R_NEXT)) {
      my $text = $1;   # the attribute value portion of the key
      next unless $text =~ /$search/;
      # Score: one point per occurrence of each search word.
      for my $w (@words) {
        my @hits = $text =~ /($w)/ig or next;
        $results{$id} += @hits;
      }
      $notes{$id} .= "$text ";
    }
  }

  # Convert the raw scores into [name, note, relevance, type, id] rows.
  my @results;
  for my $id (keys %results) {
    my $hits = $results{$id};
    my $note = $notes{$id};
    $note =~ s/\s+$//;
    my $relevance = 10 * $hits;
    my $feature   = $self->fetch($id) or next;
    my $name      = $feature->display_name or next;
    my $type      = $feature->type;
    push @results,[$name,$note,$relevance,$type,$id];
  }
  return @results;
}
# Full-text search over the flat notes file.  Returns a list of
# [display_name, note_text, relevance] triples; relevance is 10x the
# number of search-word hits in the feature's Note tag values.
# $limit, when set, caps the number of candidate features scanned.
sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  # Wildcards are meaningless here; the search is word-based.
  $search_string =~ tr/*?//d;

  my @results;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my (%found,$found);
  my $note_index = $self->notes_db;
  seek($note_index,0,0); # back to start
  # Each line is "note-text<TAB>uuencoded-feature-id".
  while (<$note_index>) {
    next unless /$search/;
    chomp;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->fetch($idx) or next;
    # BUGFIX: the original used "my @values = ... if $cond;" -- 'my' with a
    # statement modifier is documented undefined behavior in perlsyn.
    my @values = $feature->has_tag('Note') ? $feature->get_tag_values('Note') : ();
    my $value  = "@values";

    my $hits = 0;   # initialized to avoid an uninitialized-value warning below
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  for (my $i=0; $i<@matches; $i++) {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $note;
    $note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
    push @results,[$feature->display_name,$note,$relevance];
  }
  return @results;
}
# Translate a shell-style glob ("foo*", "a?c") into a (stem, regexp) pair.
# The stem is the literal prefix before the first unescaped wildcard (used
# to position a B-tree cursor); the regexp is the full pattern with glob
# wildcards converted to regex form.  Returns an empty list when the term
# contains no wildcard.
sub glob_match {
  my $self    = shift;
  my $pattern = shift;
  return unless $pattern =~ /([^*?]*)(?:^|[^\\])?[*?]/;
  my $literal_prefix = $1;
  # Escape regex metacharacters, then map unescaped * -> .* and ? -> .
  $pattern =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
  $pattern =~ s/(^|[^\\])\*/$1.*/g;
  $pattern =~ s/(^|[^\\])\?/$1./g;
  return ($literal_prefix,$pattern);
}
# Fold a list of candidate IDs into the running filter set.  An empty
# result list leaves the filter untouched; a non-empty filter is
# intersected with the results, and an empty filter is seeded from them.
sub update_filter {
  my $self = shift;
  my ($filter,$results) = @_;
  return unless @$results;
  my @keep = %$filter ? grep { $filter->{$_} } @$results : @$results;
  %$filter = map { $_ => 1 } @keep;
}
# Return every distinct feature type in the store as a
# Bio::DB::GFF::Typename object (keys of the 'types' index are
# "method:source" strings).
sub types {
  my $self = shift;
  # Lazily load Typename the first time it is needed.
  eval "require Bio::DB::GFF::Typename"
    unless Bio::DB::GFF::Typename->can('new');

  my $index = $self->index_db('types');
  # (removed an unused "my $db = tied(%$index)" local -- the raw DB_File
  # handle is not needed here, only the hash keys)
  return map {Bio::DB::GFF::Typename->new($_)} keys %$index;
}
# this is ugly
# Append a chunk of DNA to the private FASTA file; a zero offset marks the
# start of a new sequence, so the ">seqid" header is written first.
sub _insert_sequence {
  my ($self, $seqid, $seq, $offset) = @_;
  my $fh = $self->private_fasta_file or return;
  print $fh ">$seqid\n" if $offset == 0;   # header only at sequence start
  print $fh $seq,"\n";
}
# Delegate sequence retrieval to the attached DNA database, if any.
sub _fetch_sequence {
  my ($self, $seqid, $start, $end) = @_;
  my $dna_db = $self->dna_db or return;
  return $dna_db->seq($seqid, $start, $end);
}
# Return (creating on first use) the write handle for the store's private
# "sequence.fa" file.  The handle is cached; a failed open caches undef,
# so later calls keep returning undef rather than retrying.
sub private_fasta_file {
  my $self = shift;
  unless (exists $self->{fasta_fh}) {
    $self->{fasta_file} = $self->_qualify("sequence.fa");
    $self->{fasta_fh}   = IO::File->new($self->{fasta_file},">");
  }
  return $self->{fasta_fh};
}
# Close the private FASTA file written during a bulk load and index it
# with Bio::DB::Fasta::Subdir.  A no-op when no FASTA data was written.
sub finish_bulk_update {
  my $self = shift;
  my $fh = $self->{fasta_fh} or return;
  $fh->close;
  $self->{fasta_db} = Bio::DB::Fasta::Subdir->new($self->{fasta_file});
}
# Schema version of the main database.  Stores written before versioning
# lack the ".version" meta-key and are treated as 1.00.
sub db_version {
  my $self = shift;
  return $self->db->{'.version'} || 1.00;
}
sub DESTROY {
  my $self = shift;
  # Close the Berkeley DB handles first, then remove the whole directory
  # tree if this was a temporary (throwaway) store.
  $self->_close_databases();
  rmtree($self->directory,0,1) if $self->temporary;
}
# TIE interface -- a little annoying because we are storing magic ".variable"
# meta-variables in the same data structure as the IDs, so these variables
# must be skipped.
sub _firstid {
  my $self = shift;
  my $db   = $self->db;
  my ($key,$value);
  # Walk each() from the start of the tied hash, skipping ".name"
  # meta-keys; stops at the first real feature ID, or leaves $key undef
  # when the database holds nothing but meta-keys.
  while ( ($key,$value) = each %{$db}) {
    last unless $key =~ /^\./;
  }
  $key;
}
sub _nextid {
  my $self = shift;
  my $id   = shift;  # previous ID; unused -- iteration relies on the tied
                     # hash's persistent each() cursor left by _firstid/_nextid
  my $db   = $self->db;
  my ($key,$value);
  while ( ($key,$value) = each %$db) {
    last unless $key =~ /^\./;  # skip ".variable" meta-keys
  }
  $key;  # undef once the iterator is exhausted
}
# True when the given primary ID has an entry in the main database.
sub _existsid {
  my ($self, $id) = @_;
  return exists $self->db->{$id};
}
# Remove a feature by primary ID: fetch it (a no-op when absent), clean
# its index entries, then drop it from the main database.
sub _deleteid {
  my ($self, $id) = @_;
  my $feature = $self->fetch($id) or return;
  $self->_delete_indexes($feature, $id);
  delete $self->db->{$id};
}
sub _clearall {
  my $self = shift;
  # Wipe the store: close every handle, delete the underlying files, then
  # re-open fresh (empty) databases with the same permissions as before.
  $self->_close_databases();
  $self->_delete_databases();
  my ($write,$create) = $self->_permissions;
  $self->_open_databases($write,$create);
}
# Number of entries in the main database (includes the ".variable"
# meta-keys, as before).
# BUGFIX: "scalar %hash" is not a key count before Perl 5.26, and for a
# tied (DB_File) hash it reflects the tie's SCALAR/FIRSTKEY behavior
# rather than the number of entries; "scalar keys" counts reliably.
sub _featurecount {
  my $self = shift;
  return scalar keys %{$self->db};
}
package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;

# Pull-style iterator over a precomputed list of feature primary IDs.
# Each next_seq() call shifts the next ID and fetches its feature object
# from the store; returns undef when the list is exhausted.

sub new {
  my ($class, $store, $ids) = @_;
  my $self = {
    store => $store,
    ids   => $ids,
  };
  return bless $self, ref($class) || $class;
}

sub next_seq {
  my $self  = shift;
  my $store = $self->{store} or return;
  my $next_id = shift @{ $self->{ids} };
  return unless defined $next_id;
  return $store->fetch($next_id);
}
package Bio::DB::Fasta::Subdir;

use base 'Bio::DB::Fasta';

# alter calling arguments so that the fasta file is placed in a subdirectory
# named "indexes"

sub index_name {
  my $self = shift;
  my ($path, $is_directory) = @_;
  if ($is_directory) {
    return File::Spec->catfile($path, 'indexes', 'fasta.index');
  }
  return $self->SUPER::index_name($path, $is_directory);
}
1522 __END__
1524 =head1 BUGS
1526 This is an early version, so there are certainly some bugs. Please
1527 use the BioPerl bug tracking system to report bugs.
1529 =head1 SEE ALSO
1531 L<bioperl>,
1532 L<Bio::DB::SeqFeature>,
1533 L<Bio::DB::SeqFeature::Store>,
L<Bio::DB::SeqFeature::Store::GFF3Loader>,
1535 L<Bio::DB::SeqFeature::Segment>,
1536 L<Bio::DB::SeqFeature::Store::memory>,
1537 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
1539 =head1 AUTHOR
1541 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
1543 Copyright (c) 2006 Cold Spring Harbor Laboratory.
1545 This library is free software; you can redistribute it and/or modify
1546 it under the same terms as Perl itself.
1548 =cut