1 package Bio
::DB
::SeqFeature
::Store
::berkeleydb
;
6 use base
'Bio::DB::SeqFeature::Store';
7 use Bio
::DB
::GFF
::Util
::Rearrange
'rearrange';
9 use Fcntl
qw(O_RDWR O_CREAT :flock);
11 use File
::Temp
'tempdir';
12 use File
::Path
'rmtree','mkpath';
15 use Carp
'carp','croak';
17 use constant BINSIZE
=> 10_000
;
18 use constant MININT
=> -999_999_999_999
;
19 use constant MAXINT
=> 999_999_999_999
;
23 Bio::DB::SeqFeature::Store::berkeleydb -- Storage and retrieval of sequence annotation data in Berkeleydb files
27 use Bio::DB::SeqFeature::Store;
29 # Create a database from the feature files located in /home/fly4.3 and store
30 # the database index in the same directory:
31 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
32 -dir => '/home/fly4.3');
34 # Create a database that will monitor the files in /home/fly4.3, but store
35 # the indexes in /var/databases/fly4.3
36 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
37 -dsn => '/var/databases/fly4.3',
38 -dir => '/home/fly4.3');
40 # Create a feature database from scratch
41 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
42 -dsn => '/var/databases/fly4.3',
45 # get a feature from somewhere
46 my $feature = Bio::SeqFeature::Generic->new(...);
49 $db->store($feature) or die "Couldn't store!";
51 # primary ID of the feature is changed to indicate its primary ID
53 my $id = $feature->primary_id;
55 # get the feature back out
56 my $f = $db->fetch($id);
58 # change the feature and update it
60 $db->update($f) or $self->throw("Couldn't update!");
62 # use the GFF3 loader to do a bulk write:
63 my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store => $db,
66 $loader->load('/home/fly4.3/dmel-all.gff');
71 my @features = $db->fetch_many(@list_of_ids);
74 @features = $db->get_features_by_name('ZK909');
77 @features = $db->get_features_by_alias('sma-3');
80 @features = $db->get_features_by_type('gene');
83 @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);
86 @features = $db->get_features_by_attribute({description => 'protein kinase'})
88 # ...by the GFF "Note" field
89 @result_list = $db->search_notes('kinase');
91 # ...by arbitrary combinations of selectors
92 @features = $db->features(-name => $name,
97 -attributes => $attributes);
99 # ...using an iterator
100 my $iterator = $db->get_seq_stream(-name => $name,
105 -attributes => $attributes);
107 while (my $feature = $iterator->next_seq) {
108 # do something with the feature
111 # ...limiting the search to a particular region
112 my $segment = $db->segment('Chr1',5000=>6000);
113 my @features = $segment->features(-type=>['mRNA','match']);
115 # what feature types are defined in the database?
116 my @types = $db->types;
118 # getting & storing sequence information
119 # Warning: this returns a string, and not a PrimarySeq object
120 $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
121 my $sequence = $db->fetch_sequence('Chr1',5000=>6000);
123 # create a new feature in the database
124 my $feature = $db->new_feature(-primary_tag => 'mRNA',
131 Bio::DB::SeqFeature::Store::berkeleydb is the Berkeleydb adaptor for
132 Bio::DB::SeqFeature::Store. You will not create it directly, but
133 instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.
135 See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
137 =head2 Using the berkeleydb adaptor
139 The Berkeley database consists of a series of Berkeleydb index files,
140 and a couple of special purpose indexes. You can create the index
141 files from scratch by creating a new database and calling
142 new_feature() repeatedly, you can create the database and then bulk
143 populate it using the GFF3 loader, or you can monitor a directory of
144 preexisting GFF3 and FASTA files and rebuild the indexes whenever one
145 or more of the fields changes. The last mode is probably the most
146 convenient. Note that the indexer will only pay attention to files
147 that end with .gff3, .wig and .fa.
151 =item The new() constructor
153 The new() constructor method all the arguments recognized by
154 Bio::DB::SeqFeature::Store, and a few additional ones.
161 -adaptor The name of the Adaptor class (default DBI::mysql)
163 -serializer The name of the serializer class (default Storable)
165 -index_subfeatures Whether or not to make subfeatures searchable
168 -cache Activate LRU caching feature -- size of cache
170 -compress Compresses features before storing them in database
173 Adaptor-specific arguments
178 -dsn Where the index files are stored
180 -dir Where the source (GFF3, FASTA) files are stored
182 -autoindex An alias for -dir.
184 -write Pass true to open the index files for writing.
186 -create Pass true to create the index files if they don't exist
189 -locking Use advisory locking to avoid one process trying to read
190 from the database while another is updating it (may not
191 work properly over NFS).
193 -temp Pass true to create temporary index files that will
194 be deleted once the script exits.
196 -verbose Pass true to report autoindexing operations on STDERR.
201 To create an empty database which will be populated using calls to
202 store() or new_feature(), or which will be bulk-loaded using the GFF3
205 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
206 -dsn => '/var/databases/fly4.3',
209 To open a preexisting database in read-only mode:
211 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
212 -dsn => '/var/databases/fly4.3');
214 To open a preexisting database in read/write (update) mode:
216 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
217 -dsn => '/var/databases/fly4.3',
220 To monitor a set of GFF3 and FASTA files located in a directory and
221 create/update the database indexes as needed. The indexes will be
222 stored in a new subdirectory named "indexes":
224 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
225 -dir => '/var/databases/fly4.3');
227 As above, but store the source files and index files in separate directories:
229 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
230 -dsn => '/var/databases/fly4.3',
231 -dir => '/home/gff3_files/fly4.3');
233 To be indexed, files must end with one of .gff3 (GFF3 format), .fa
234 (FASTA format) or .wig (WIG format).
236 B<-autoindex> is an alias for B<-dir>.
238 You should specify B<-locking> in a multiuser environment, including
239 the case in which the database is being used by a web server at the
240 same time another user might be updating it.
244 See L<Bio::DB::SeqFeature::Store> for all the access methods supported
245 by this adaptor. The various methods for storing and updating features
246 and sequences into the database are supported, but there is no
247 locking. If two processes try to update the same database
248 simultaneously, the database will likely become corrupted.
253 # object initialization
264 ) = rearrange
([['DSN','DB'],
266 ['TMP','TEMP','TEMPORARY'],
267 [qw(WRITE WRITABLE)],
273 $verbose = 1 unless defined $verbose;
276 -d
$autoindex or $self->throw("Invalid directory $autoindex");
277 $directory ||= "$autoindex/indexes";
279 $directory ||= $is_temporary ? File
::Spec
->tmpdir : '.';
281 my $pacname = __PACKAGE__
;
282 if ($^O
=~ /mswin/i) {
283 $pacname =~ s/:+/_/g;
285 $directory = tempdir
($pacname.'_XXXXXX',
288 DIR
=> $directory) if $is_temporary;
290 -d
$directory or $self->throw("Invalid directory $directory");
292 $create++ if $is_temporary;
294 $self->throw("Can't write into the directory $directory")
295 if $write && !-w
$directory;
298 $self->default_settings;
299 $self->directory($directory);
300 $self->temporary($is_temporary);
301 $self->verbose($verbose);
302 $self->locking($locking);
303 $self->_delete_databases() if $create;
304 if ($autoindex && -d
$autoindex) {
305 $self->auto_reindex($autoindex);
307 $self->lock('shared');
309 # this step may rebless $self into a subclass
310 # to preserve backward compatibility with older
311 # databases while providing better performance for
313 $self->possibly_rebless($create);
315 $self->_open_databases($write,$create,$autoindex);
316 $self->_permissions($write,$create);
320 sub version
{ return 2.0 };
322 sub possibly_rebless
{
329 } else { # probe database
331 tie
(%h,'DB_File',$self->_features_path,O_RDONLY
,0666,$DB_HASH) or return;
332 $do_rebless = $h{'.version'} >= 3.0;
336 eval "require Bio::DB::SeqFeature::Store::berkeleydb3";
337 bless $self,'Bio::DB::SeqFeature::Store::berkeleydb3';
342 sub can_store_parentage
{ 1 }
347 my $result = $self->needs_auto_reindexing($autodir);
349 if ($result && %$result) {
350 $self->flag_autoindexing(1);
351 $self->lock('exclusive');
352 $self->reindex_wigfiles($result->{wig
},$autodir) if $result->{wig
};
353 $self->reindex_ffffiles($result->{fff
},$autodir) if $result->{fff
};
354 $self->reindex_gfffiles($result->{gff
},$autodir) if $result->{gff
};
355 $self->dna_db(Bio
::DB
::Fasta
::Subdir
->new($autodir));
357 $self->flag_autoindexing(0);
361 $self->dna_db(Bio
::DB
::Fasta
::Subdir
->new($autodir));
365 sub autoindex_flagfile
{
366 return File
::Spec
->catfile(shift->directory,'autoindex.pid');
368 sub auto_index_in_process
{
370 my $flag_file = $self->autoindex_flagfile;
371 return unless -e
$flag_file;
373 # if flagfile exists, then check that PID still exists
374 open my $fh,$flag_file or die "Couldn't open $flag_file: $!";
377 return 1 if kill 0=>$pid;
378 warn "Autoindexing seems to be running in another process, but the process has gone away. Trying to override...";
379 if (unlink $flag_file) {
380 warn "Successfully removed stale PID file." if $self->verbose;
381 warn "Assuming partial reindexing process. Rebuilding indexes from scratch..." if $self->verbose;
382 my $glob = File
::Spec
->catfile($self->directory,'*');
386 croak
("Cannot recover from apparent aborted autoindex process. Please remove files in ",
388 " and allow the adaptor to reindex");
393 sub flag_autoindexing
{
396 my $flag_file = $self->autoindex_flagfile;
398 open my $fh,'>',$flag_file or die "Couldn't open $flag_file: $!";
406 sub reindex_gfffiles
{
411 warn "Reindexing GFF files...\n" if $self->verbose;
412 my $exists = -e
$self->_features_path;
414 $self->_permissions(1,1);
415 $self->_close_databases();
416 $self->_open_databases(1,!$exists);
417 require Bio
::DB
::SeqFeature
::Store
::GFF3Loader
418 unless Bio
::DB
::SeqFeature
::Store
::GFF3Loader
->can('new');
419 my $loader = Bio
::DB
::SeqFeature
::Store
::GFF3Loader
->new(-store
=> $self,
420 -sf_class
=> $self->seqfeature_class,
421 -verbose
=> $self->verbose,
423 or $self->throw("Couldn't create GFF3Loader");
425 $loader->load(grep {!$seen{$_}++} @
$files);
426 $self->_touch_timestamp;
429 sub reindex_ffffiles
{
434 warn "Reindexing FFF files...\n" if $self->verbose;
435 $self->_permissions(1,1);
436 $self->_close_databases();
437 $self->_open_databases(1,1);
438 require Bio
::DB
::SeqFeature
::Store
::FeatureFileLoader
439 unless Bio
::DB
::SeqFeature
::Store
::FeatureFileLoader
->can('new');
440 my $loader = Bio
::DB
::SeqFeature
::Store
::FeatureFileLoader
->new(-store
=> $self,
441 -sf_class
=> $self->seqfeature_class,
442 -verbose
=> $self->verbose,
444 or $self->throw("Couldn't create FeatureFileLoader");
446 $loader->load(grep {!$seen{$_}++} @
$files);
447 $self->_touch_timestamp;
450 sub reindex_wigfiles
{
455 warn "Reindexing wig files...\n" if $self->verbose;
457 unless (Bio
::Graphics
::Wiggle
::Loader
->can('new')) {
458 eval "require Bio::Graphics::Wiggle::Loader; 1"
462 for my $wig (@
$files) {
463 warn "Reindexing $wig...\n" if $self->verbose;
464 my ($wib_name) = fileparse
($wig,qr/\.[^.]*/);
465 my $gff3_name = "$wib_name.gff3";
467 # unlink all wib files that share the basename
468 my $wib_glob = File
::Spec
->catfile($self->directory,"$wib_name*.wib");
469 unlink glob($wib_glob);
471 my $loader = Bio
::Graphics
::Wiggle
::Loader
->new($self->directory,$wib_name);
472 my $fh = IO
::File
->new($wig) or die "Can't open $wig: $!";
473 $loader->load($fh); # will create one or more .wib files
475 my $gff3_data = $loader->featurefile('gff3','microarray_oligo',$wib_name);
476 my $gff3_path = File
::Spec
->catfile($autodir,$gff3_name);
477 $fh = IO
::File
->new($gff3_path,'>')
478 or die "Can't open $gff3_path for writing: $!";
479 $fh->print($gff3_data);
481 my $conf_path = File
::Spec
->catfile($autodir,"$wib_name.conf");
482 $fh = IO
::File
->new($conf_path,'>');
483 $fh->print($loader->conf_stanzas('microarray_oligo',$wib_name));
488 # returns the following hashref
489 # empty hash if nothing needs reindexing
490 # {fasta => 1} if DNA database needs reindexing
491 # {gff => [list,of,gff,paths]} if gff3 files need reindexing
492 # {wig => [list,of,wig,paths]} if wig files need reindexing
493 sub needs_auto_reindexing
{
498 # don't allow two processes to reindex simultaneously
499 $self->auto_index_in_process and croak
"Autoindexing in process. Try again later";
501 # first interrogate the GFF3 files, using the timestamp file
502 # as modification comparison
503 my (@gff3,@fff,@wig,$fasta,$fasta_index_time);
504 opendir (my $D,$autodir)
505 or $self->throw("Couldn't open directory $autodir for reading: $!");
508 my $timestamp_time = _mtime
($self->_mtime_path) || 0;
509 while (defined (my $node = readdir($D))) {
510 next if $node =~ /^\./;
511 my $path = File
::Spec
->catfile($autodir,$node);
512 next unless -f
$path;
514 if ($path =~ /\.gff\d?$/i) {
515 my $mtime = _mtime
(\
*_
); # not a typo
516 $maxtime = $mtime if $mtime > $maxtime;
521 elsif ($path =~ /\.fff?$/i) {
522 my $mtime = _mtime
(\
*_
); # not a typo
523 $maxtime = $mtime if $mtime > $maxtime;
527 elsif ($path =~ /\.wig$/i) {
529 (my $gff_file = $wig) =~ s/\.wig$/\.gff3/i;
530 next if -e
$gff_file && _mtime
($gff_file) > _mtime
($path);
532 push @gff3,$gff_file;
536 elsif ($path =~ /\.(fa|fasta|dna)$/i) {
538 _mtime
(File
::Spec
->catfile($self->directory,'fasta.index'))||0
539 unless defined $fasta_index_time;
540 $fasta++ if _mtime
($path) > $fasta_index_time;
545 $result->{gff
} = \
@gff3 if $maxtime > $timestamp_time;
546 $result->{wig
} = \
@wig if @wig;
547 $result->{fff
} = \
@fff if @fff;
548 $result->{fasta
}++ if $fasta;
554 my $d = $self->{verbose
};
555 $self->{verbose
} = shift if @_;
561 my $d = $self->{locking
};
562 $self->{locking
} = shift if @_;
568 return File
::Spec
->catfile($self->directory,'lock');
574 return unless $self->locking;
576 my $flag = $mode eq 'exclusive' ? LOCK_EX
: LOCK_SH
;
577 my $lockfile = $self->lockfile;
578 my $fh = $self->_flock_fh;
580 my $open = -e
$lockfile ?
'<' : '>';
581 $fh = IO
::File
->new($lockfile,$open) or die "Cannot open $lockfile: $!";
584 $self->_flock_fh($fh);
589 return unless $self->locking;
591 my $fh = $self->_flock_fh or return;
593 undef $self->{flock_fh
};
598 my $d = $self->{flock_fh
};
599 $self->{flock_fh
} = shift if @_;
603 sub _open_databases
{
605 my ($write,$create,$ignore_errors) = @_;
606 return if $self->db; # already open - don't reopen
608 my $directory = $self->directory;
609 unless (-d
$directory) { # directory does not exist
610 $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
611 mkpath
($directory) or $self->throw("Couldn't create database directory $directory: $!");
614 my $flags = O_RDONLY
;
615 $flags |= O_RDWR
if $write;
616 $flags |= O_CREAT
if $create;
618 # Create the main database; this is a DB_HASH implementation
620 my $result = tie
(%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH);
623 return if $ignore_errors; # autoindex set, so defer this
624 $self->throw("Couldn't tie: ".$self->_features_path . " $!");
629 $h{'.version'} = $self->version;
633 $self->open_index_dbs($flags,$create);
634 $self->open_parentage_db($flags,$create);
635 $self->open_notes_db($write,$create);
636 $self->open_seq_db($flags,$create) if -e
$self->_fasta_path;
641 my ($flags,$create) = @_;
643 # Create the index databases; these are DB_BTREE implementations with duplicates allowed.
644 $DB_BTREE->{flags
} = R_DUP
;
645 $DB_BTREE->{compare
} = sub { lc($_[0]) cmp lc($_[1]) };
646 for my $idx ($self->_index_files) {
647 my $path = $self->_qualify("$idx.idx");
649 tie
(%db,'DB_File',$path,$flags,0666,$DB_BTREE)
650 or $self->throw("Couldn't tie $path: $!");
652 $self->index_db($idx=>\
%db);
656 sub open_parentage_db
{
658 my ($flags,$create) = @_;
660 # Create the parentage database
662 tie
(%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
663 or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
665 $self->parentage_db(\
%p);
670 my ($write,$create) = @_;
672 my $mode = $write ?
"+>>"
676 open (my $F,$mode,$self->_notes_path) or $self->throw($self->_notes_path.": $!");
683 if (-e
$self->_fasta_path) {
684 my $dna_db = Bio
::DB
::Fasta
::Subdir
->new($self->_fasta_path)
685 or $self->throw("Can't reindex sequence file: $@");
686 $self->dna_db($dna_db);
690 sub commit
{ # reindex fasta files
692 if (my $fh = $self->{fasta_fh
}) {
694 $self->dna_db(Bio
::DB
::Fasta
::Subdir
->new($self->{fasta_file
}));
695 } elsif (-d
$self->directory) {
696 $self->dna_db(Bio
::DB
::Fasta
::Subdir
->new($self->directory));
700 sub _close_databases
{
703 $self->dna_db(undef);
704 $self->notes_db(undef);
705 $self->index_db($_=>undef) foreach $self->_index_files;
708 # do nothing -- new() with -create=>1 will do the trick
709 sub _init_database
{ }
711 sub _delete_databases
{
713 for my $idx ($self->_index_files) {
714 my $path = $self->_qualify("$idx.idx");
717 unlink $self->_parentage_path;
718 unlink $self->_fasta_path;
719 unlink $self->_features_path;
720 unlink $self->_mtime_path;
723 sub _touch_timestamp
{
725 my $tsf = $self->_mtime_path;
726 open (F
,">$tsf") or $self->throw("Couldn't open $tsf: $!");
727 print F
scalar(localtime);
737 my $primary_id = $obj->primary_id;
738 $self->_delete_indexes($obj,$primary_id) if $indexed && $primary_id;
739 $primary_id = $db->{'.next_id'}++ unless defined $primary_id;
740 $db->{$primary_id} = $self->freeze($obj);
741 $obj->primary_id($primary_id);
742 $self->_update_indexes($obj) if $indexed;
748 sub _delete_indexes
{
752 # the additional "1" causes the index to be deleted
753 $self->_update_name_index($obj,$id,1);
754 $self->_update_type_index($obj,$id,1);
755 $self->_update_location_index($obj,$id,1);
756 $self->_update_attribute_index($obj,$id,1);
757 $self->_update_note_index($obj,$id,1);
764 my $obj = $self->thaw($db->{$id},$id);
768 sub _add_SeqFeature
{
772 my $parent_id = (ref $parent ?
$parent->primary_id : $parent)
773 or $self->throw("$parent should have a primary_id");
774 my $p = $self->parentage_db;
775 for my $child (@children) {
776 my $child_id = ref $child ?
$child->primary_id : $child;
777 defined $child_id or $self->throw("no primary ID known for $child");
778 $p->{$parent_id} = $child_id;
782 sub _fetch_SeqFeatures
{
786 my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
787 my $index = $self->parentage_db;
788 my $db = tied %$index;
790 my @children_ids = $db->get_dup($parent_id);
791 my @children = map {$self->fetch($_)} @children_ids;
794 my $regexp = join '|',map {quotemeta($_)} $self->find_types(@types);
795 return grep {($_->primary_tag.':'.$_->source_tag) =~ /^$regexp$/i} @children;
801 sub _update_indexes
{
804 defined (my $id = $obj->primary_id) or return;
805 $self->_update_name_index($obj,$id);
806 $self->_update_type_index($obj,$id);
807 $self->_update_location_index($obj,$id);
808 $self->_update_attribute_index($obj,$id);
809 $self->_update_note_index($obj,$id);
812 sub _update_name_index
{
814 my ($obj,$id,$delete) = @_;
815 my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
816 my ($names,$aliases) = $self->feature_names($obj);
818 # little stinky - needs minor refactoring
821 $self->update_or_delete($delete,$db,$key,$id);
824 foreach (@
$aliases) {
825 my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
826 $self->update_or_delete($delete,$db,$key,$id);
831 sub _update_type_index
{
833 my ($obj,$id,$delete) = @_;
834 my $db = $self->index_db('types')
835 or $self->throw("Couldn't find 'types' index file");
836 my $primary_tag = $obj->primary_tag;
837 my $source_tag = $obj->source_tag || '';
838 return unless defined $primary_tag;
840 $primary_tag .= ":$source_tag";
841 my $key = lc $primary_tag;
842 $self->update_or_delete($delete,$db,$key,$id);
845 # Note: this indexing scheme is space-inefficient because it stores the
846 # denormalized sequence ID followed by the bin in XXXXXX zero-leading
847 # format. It should be replaced with a binary numeric encoding and the
848 # BTREE {compare} attribute changed accordingly.
849 sub _update_location_index
{
851 my ($obj,$id,$delete) = @_;
852 my $db = $self->index_db('locations')
853 or $self->throw("Couldn't find 'locations' index file");
855 my $seq_id = $obj->seq_id || '';
856 my $start = $obj->start || '';
857 my $end = $obj->end || '';
858 my $strand = $obj->strand;
859 my $bin_min = int $start/BINSIZE
;
860 my $bin_max = int $end/BINSIZE
;
862 for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
863 my $key = sprintf("%s.%06d",lc($seq_id),$bin);
864 $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
868 sub _update_attribute_index
{
870 my ($obj,$id,$delete) = @_;
871 my $db = $self->index_db('attributes')
872 or $self->throw("Couldn't find 'attributes' index file");
874 for my $tag ($obj->get_all_tags) {
875 for my $value ($obj->get_tag_values($tag)) {
876 my $key = "${tag}:${value}";
877 $self->update_or_delete($delete,$db,$key,$id);
882 sub _update_note_index
{
884 my ($obj,$id,$delete) = @_;
885 return if $delete; # we don't know how to do this
887 my $fh = $self->notes_db;
888 my @notes = $obj->get_tag_values('Note') if $obj->has_tag('Note');
891 print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
895 sub update_or_delete
{
897 my ($delete,$db,$key,$id) = @_;
899 tied(%$db)->del_dup($key,$id);
905 # these methods return pointers to....
906 # the database that stores serialized objects
909 my $d = $self->setting('db');
910 $self->setting(db
=>shift) if @_;
916 my $d = $self->setting('parentage_db');
917 $self->setting(parentage_db
=>shift) if @_;
921 # the Bio::DB::Fasta object
924 my $d = $self->setting('dna_db');
925 $self->setting(dna_db
=>shift) if @_;
929 # the specialized notes database
932 my $d = $self->setting('notes_db');
933 $self->setting(notes_db
=>shift) if @_;
937 # The indicated index berkeley db
940 my $index_name = shift;
941 my $d = $self->setting($index_name);
942 $self->setting($index_name=>shift) if @_;
949 my @stat = stat($file);
953 # return names of all the indexes
955 return qw(names types locations attributes);
958 # the directory in which we store our indexes
961 my $d = $self->setting('directory');
962 $self->setting(directory
=>shift) if @_;
966 # flag indicating that we are a temporary database
969 my $d = $self->setting('temporary');
970 $self->setting(temporary
=>shift) if @_;
976 my $d = $self->setting('permissions') or return;
978 my ($write,$create) = @_;
979 $self->setting(permissions
=>[$write,$create]);
984 # file name utilities...
988 return $self->directory .'/' . $file;
992 shift->_qualify('features.bdb');
995 sub _parentage_path
{
996 shift->_qualify('parentage.bdb');
1000 shift->_qualify('types.idx');
1003 sub _location_path
{
1004 shift->_qualify('locations.idx');
1007 sub _attribute_path
{
1008 shift->_qualify('attributes.idx');
1012 shift->_qualify('notes.idx');
1016 shift->_qualify('sequence.fa');
1020 shift->_qualify('mtime.stamp');
1023 ###########################################
1025 ###########################################
1029 my ($seq_id,$start,$end,$strand,
1030 $name,$class,$allow_aliases,
1035 ) = rearrange
([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
1036 'NAME','CLASS','ALIASES',
1037 ['TYPES','TYPE','PRIMARY_TAG'],
1038 ['ATTRIBUTES','ATTRIBUTE'],
1043 my (@from,@where,@args,@group);
1044 $range_type ||= 'overlaps';
1047 unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
1048 @result = grep {!/^\./} keys %{$self->db};
1054 if (defined($name)) {
1055 # hacky backward compatibility workaround
1056 undef $class if $class && $class eq 'Sequence';
1057 $name = "$class:$name" if defined $class && length $class > 0;
1058 $result &&= $self->filter_by_name($name,$allow_aliases,\
%found);
1061 if (defined $seq_id) {
1062 $result &&= $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\
%found);
1065 if (defined $types) {
1066 $result &&= $self->filter_by_type($types,\
%found);
1069 if (defined $attributes) {
1070 $result &&= $self->filter_by_attribute($attributes,\
%found);
1073 push @result,keys %found if $result;
1074 return $iterator ? Bio
::DB
::SeqFeature
::Store
::berkeleydb
::Iterator
->new($self,\
@result)
1075 : map {$self->fetch($_)} @result;
1078 sub filter_by_name
{
1080 my ($name,$allow_aliases,$filter) = @_;
1082 my $index = $self->index_db('names');
1083 my $db = tied(%$index);
1085 my ($stem,$regexp) = $self->glob_match($name);
1088 $regexp .= "(?:_2)?" if $allow_aliases;
1093 for (my $status = $db->seq($key,$value,R_CURSOR
);
1094 $status == 0 and $key =~ /^$regexp$/i;
1095 $status = $db->seq($key,$value,R_NEXT
)) {
1096 next if %$filter && !$filter->{$value}; # don't bother
1097 push @results,$value;
1099 $self->update_filter($filter,\
@results);
1102 sub filter_by_type
{
1104 my ($types,$filter) = @_;
1105 my @types = ref $types eq 'ARRAY' ? @
$types : $types;
1107 my $index = $self->index_db('types');
1108 my $db = tied(%$index);
1112 for my $type (@types) {
1113 my ($primary_tag,$source_tag);
1114 if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
1115 $primary_tag = $type->method;
1116 $source_tag = $type->source;
1118 ($primary_tag,$source_tag) = split ':',$type,2;
1120 my $match = defined $source_tag ?
"^$primary_tag:$source_tag\$" : "^$primary_tag:";
1122 my $key = lc "$primary_tag:$source_tag";
1125 # If filter is already provided, then it is usually faster to
1126 # fetch each object.
1128 for my $id (keys %$filter) {
1129 my $obj = $self->_fetch($id) or next;
1130 push @results,$id if $obj->type =~ /$match/i;
1136 for (my $status = $db->seq($key,$value,R_CURSOR
);
1137 $status == 0 && $key =~ /$match/i;
1138 $status = $db->seq($key,$value,R_NEXT
)) {
1139 next if %$filter && !$filter->{$value}; # don't even bother
1140 push @results,$value;
1144 $self->update_filter($filter,\
@results);
1147 sub filter_by_location
{
1149 my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;
1152 my $index = $self->index_db('locations');
1153 my $db = tied(%$index);
1155 my $binstart = defined $start ?
sprintf("%06d",int $start/BINSIZE
) : '';
1156 my $binend = defined $end ?
sprintf("%06d",int $end/BINSIZE
) : 'z'; # beyond a number
1161 $start = MININT
if !defined $start;
1162 $end = MAXINT
if !defined $end;
1163 my $version_2 = $self->db_version > 1;
1165 if ($range_type eq 'overlaps' or $range_type eq 'contains') {
1166 my $key = $version_2 ?
"\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
1167 my $keystop = $version_2 ?
"\L$seq_id\E.$binend" : "\L$seq_id\E$binend";
1170 for (my $status = $db->seq($key,$value,R_CURSOR
);
1171 $status == 0 && $key le $keystop;
1172 $status = $db->seq($key,$value,R_NEXT
)) {
1173 my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
1174 next if $seenit{$id}++;
1175 next if $strand && $fstrand != $strand;
1176 if ($range_type eq 'overlaps') {
1177 next unless $fend >= $start && $fstart <= $end;
1179 elsif ($range_type eq 'contains') {
1180 next unless $fstart >= $start && $fend <= $end;
1182 next if %$filter && !$filter->{$id}; # don't bother
1187 # for contained in, we look for features originating and terminating outside the specified range
1188 # this is incredibly inefficient, but fortunately the query is rare (?)
1189 elsif ($range_type eq 'contained_in') {
1190 my $key = $version_2 ?
"\L$seq_id." : "\L$seq_id";
1191 my $keystop = $version_2 ?
"\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
1194 # do the left part of the range
1195 for (my $status = $db->seq($key,$value,R_CURSOR
);
1196 $status == 0 && $key le $keystop;
1197 $status = $db->seq($key,$value,R_NEXT
)) {
1198 my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
1199 next if $seenit{$id}++;
1200 next if $strand && $fstrand != $strand;
1201 next unless $fstart <= $start && $fend >= $end;
1202 next if %$filter && !$filter->{$id}; # don't bother
1206 # do the right part of the range
1207 $key = "\L$seq_id\E.$binend";
1208 for (my $status = $db->seq($key,$value,R_CURSOR
);
1210 $status = $db->seq($key,$value,R_NEXT
)) {
1211 my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
1212 next if $seenit{$id}++;
1213 next if $strand && $fstrand != $strand;
1214 next unless $fstart <= $start && $fend >= $end;
1215 next if %$filter && !$filter->{$id}; # don't bother
1221 $self->update_filter($filter,\
@results);
1226 my $index = $self->index_db('attributes');
1227 my %a = map {s/:.+$//; $_=> 1} keys %$index;
1231 sub filter_by_attribute
{
1233 my ($attributes,$filter) = @_;
1235 my $index = $self->index_db('attributes');
1236 my $db = tied(%$index);
1239 for my $att_name (keys %$attributes) {
1241 my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
1242 ? @
{$attributes->{$att_name}} : $attributes->{$att_name};
1244 for my $v (@search_terms) {
1245 my ($stem,$regexp) = $self->glob_match($v);
1248 my $key = "\L${att_name}:${stem}\E";
1250 for (my $status = $db->seq($key,$value,R_CURSOR
);
1251 $status == 0 && $key =~ /^$att_name:$regexp$/i;
1252 $status = $db->seq($key,$value,R_NEXT
)) {
1253 next if %$filter && !$filter->{$value}; # don't bother
1254 push @result,$value;
1257 $result ||= $self->update_filter($filter,\
@result);
1262 sub _search_attributes
{
1264 my ($search_string,$attribute_array,$limit) = @_;
1265 $search_string =~ tr/*?//d;
1266 my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g;
1267 my $search = join '|',@words;
1269 my $index = $self->index_db('attributes');
1270 my $db = tied(%$index);
1272 my (%results,%notes);
1274 for my $tag (@
$attribute_array) {
1276 my $key = "\L$tag:\E";
1277 for (my $status = $db->seq($key,$id,R_CURSOR
);
1278 $status == 0 and $key =~ /^$tag:(.*)/i;
1279 $status = $db->seq($key,$id,R_NEXT
)) {
1281 next unless $text =~ /$search/;
1282 for my $w (@words) {
1283 my @hits = $text =~ /($w)/ig or next;
1284 $results{$id} += @hits;
1286 $notes{$id} .= "$text ";
1291 for my $id (keys %results) {
1292 my $hits = $results{$id};
1293 my $note = $notes{$id};
1295 my $relevance = 10 * $hits;
1296 my $feature = $self->fetch($id) or next;
1297 my $name = $feature->display_name or next;
1298 my $type = $feature->type;
1299 push @results,[$name,$note,$relevance,$type,$id];
1307 my ($search_string,$limit) = @_;
1309 $search_string =~ tr/*?//d;
1313 my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g;
1314 my $search = join '|',@words;
1317 my $note_index = $self->notes_db;
1318 seek($note_index,0,0); # back to start
1319 while (<$note_index>) {
1320 next unless /$search/;
1322 my ($note,$uu) = split "\t";
1323 $found{unpack("u*",$uu)}++;
1324 last if $limit && ++$found >= $limit;
1327 my (@features, @matches);
1328 for my $idx (keys %found) {
1329 my $feature = $self->fetch($idx) or next;
1330 my @values = $feature->get_tag_values('Note') if $feature->has_tag('Note');
1331 my $value = "@values";
1334 $hits++ while $value =~ /($search)/ig; # count the number of times we were hit
1335 push @matches,$hits;
1336 push @features,$feature;
1339 for (my $i=0; $i<@matches; $i++) {
1340 my $feature = $features[$i];
1341 my $matches = $matches[$i];
1343 my $relevance = 10 * $matches;
1345 $note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
1346 push @results,[$feature->display_name,$note,$relevance];
1355 return unless $term =~ /([^*?]*)(?:^|[^\\])?[*?]/;
1357 $term =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
1358 $term =~ s/(^|[^\\])\*/$1.*/g;
1359 $term =~ s/(^|[^\\])\?/$1./g;
1360 return ($stem,$term);
1366 my ($filter,$results) = @_;
1367 return unless @
$results;
1370 my @filtered = grep {$filter->{$_}} @
$results;
1371 %$filter = map {$_=>1} @filtered;
1373 %$filter = map {$_=>1} @
$results;
1380 eval "require Bio::DB::GFF::Typename"
1381 unless Bio
::DB
::GFF
::Typename
->can('new');
1383 my $index = $self->index_db('types');
1384 my $db = tied(%$index);
1385 return map {Bio
::DB
::GFF
::Typename
->new($_)} keys %$index;
1389 sub _insert_sequence
{
1391 my ($seqid,$seq,$offset) = @_;
1392 my $dna_fh = $self->private_fasta_file or return;
1393 if ($offset == 0) { # start of the sequence
1394 print $dna_fh ">$seqid\n";
1396 print $dna_fh $seq,"\n";
1399 sub _fetch_sequence
{
1401 my ($seqid,$start,$end) = @_;
1402 my $db = $self->dna_db or return;
1403 return $db->seq($seqid,$start,$end);
1406 sub private_fasta_file
{
1408 return $self->{fasta_fh
} if exists $self->{fasta_fh
};
1409 $self->{fasta_file
} = $self->_qualify("sequence.fa");
1410 return $self->{fasta_fh
} = IO
::File
->new($self->{fasta_file
},">");
1413 sub finish_bulk_update
{
1415 if (my $fh = $self->{fasta_fh
}) {
1417 $self->{fasta_db
} = Bio
::DB
::Fasta
::Subdir
->new($self->{fasta_file
});
1424 return $db->{'.version'} || 1.00;
1430 $self->_close_databases();
1431 rmtree
($self->directory,0,1) if $self->temporary;
1434 # TIE interface -- a little annoying because we are storing magic ".variable"
1435 # meta-variables in the same data structure as the IDs, so these variables
1441 while ( ($key,$value) = each %{$db}) {
1442 last unless $key =~ /^\./;
1452 while ( ($key,$value) = each %$db) {
1453 last unless $key =~ /^\./;
1461 return exists $self->db->{$id};
1467 my $obj = $self->fetch($id) or return;
1468 $self->_delete_indexes($obj,$id);
1469 delete $self->db->{$id};
1474 $self->_close_databases();
1475 $self->_delete_databases();
1476 my ($write,$create) = $self->_permissions;
1477 $self->_open_databases($write,$create);
1482 return scalar %{$self->db};
1486 package Bio
::DB
::SeqFeature
::Store
::berkeleydb
::Iterator
;
1492 return bless {store
=> $store,
1493 ids
=> $ids},ref($class) || $class;
1498 my $store = $self->{store
} or return;
1499 my $id = shift @
{$self->{ids
}};
1500 defined $id or return;
1501 return $store->fetch($id);
1504 package Bio
::DB
::Fasta
::Subdir
;
1506 use base
'Bio::DB::Fasta';
1508 # alter calling arguments so that the fasta file is placed in a subdirectory
1513 my ($path,$isdir) = @_;
1514 return $self->SUPER::index_name
($path,$isdir)
1516 return File
::Spec
->catfile($path,'indexes','fasta.index');
1526 This is an early version, so there are certainly some bugs. Please
1527 use the BioPerl bug tracking system to report bugs.
1532 L<Bio::DB::SeqFeature>,
1533 L<Bio::DB::SeqFeature::Store>,
1534 L<Bio::DB::SeqFeature::GFF3Loader>,
1535 L<Bio::DB::SeqFeature::Segment>,
1536 L<Bio::DB::SeqFeature::Store::memory>,
1537 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
1541 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
1543 Copyright (c) 2006 Cold Spring Harbor Laboratory.
1545 This library is free software; you can redistribute it and/or modify
1546 it under the same terms as Perl itself.