package Bio::DB::SeqFeature::Store::berkeleydb;

use strict;
use base 'Bio::DB::SeqFeature::Store';
use Bio::DB::GFF::Util::Rearrange 'rearrange';
use DB_File;
use Fcntl qw(O_RDWR O_RDONLY O_CREAT :flock);
use IO::File;
use File::Temp 'tempdir';
use File::Path 'rmtree','mkpath';
use File::Basename 'fileparse';
use File::Spec;
use Carp 'carp','croak';

use constant BINSIZE => 10_000;
use constant MININT  => -999_999_999_999;
use constant MAXINT  => 999_999_999_999;

our $VERSION = '2.00';
=head1 NAME

Bio::DB::SeqFeature::Store::berkeleydb -- Storage and retrieval of sequence annotation data in Berkeleydb files
=head1 SYNOPSIS

  use Bio::DB::SeqFeature::Store;

  # Create a database from the feature files located in /home/fly4.3 and store
  # the database index in the same directory:
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                          -dir     => '/home/fly4.3');

  # Create a database that will monitor the files in /home/fly4.3, but store
  # the indexes in /var/databases/fly4.3
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                          -dsn     => '/var/databases/fly4.3',
                                          -dir     => '/home/fly4.3');

  # Create a feature database from scratch
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                          -dsn     => '/var/databases/fly4.3',
                                          -create  => 1);

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

  # the feature's primary_id is now set to its primary ID in the database
  my $id = $feature->primary_id;

  # get the feature back out
  my $f = $db->fetch($id);

  # change the feature and update it
  $db->update($f) or die "Couldn't update!";

  # use the GFF3 loader to do a bulk write:
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store   => $db,
                                                           -verbose => 1);
  $loader->load('/home/fly4.3/dmel-all.gff');

  # searching...
  # ...by id
  my @features = $db->fetch_many(@list_of_ids);

  # ...by name
  @features = $db->get_features_by_name('ZK909');

  # ...by alias
  @features = $db->get_features_by_alias('sma-3');

  # ...by type
  @features = $db->get_features_by_type('gene');

  # ...by location
  @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);

  # ...by attribute
  @features = $db->get_features_by_attribute({description => 'protein kinase'});

  # ...by the GFF "Note" field
  @result_list = $db->search_notes('kinase');

  # ...by arbitrary combinations of selectors
  @features = $db->features(-name       => $name,
                            -attributes => $attributes);

  # ...using an iterator
  my $iterator = $db->get_seq_stream(-name       => $name,
                                     -attributes => $attributes);

  while (my $feature = $iterator->next_seq) {
    # do something with the feature
  }

  # ...limiting the search to a particular region
  my $segment  = $db->segment('Chr1',5000=>6000);
  my @features = $segment->features(-type=>['mRNA','match']);

  # what feature types are defined in the database?
  my @types = $db->types;

  # getting & storing sequence information
  # Warning: this returns a string, and not a PrimarySeq object
  $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  my $sequence = $db->fetch_sequence('Chr1',5000=>6000);

  # create a new feature in the database
  my $feature = $db->new_feature(-primary_tag => 'mRNA', ...);
=head1 DESCRIPTION

Bio::DB::SeqFeature::Store::berkeleydb is the Berkeleydb adaptor for
Bio::DB::SeqFeature::Store. You will not create it directly, but
instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.

See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
=head2 Using the berkeleydb adaptor

The Berkeley database consists of a series of Berkeleydb index files,
and a couple of special purpose indexes. You can create the index
files from scratch by creating a new database and calling
new_feature() repeatedly, you can create the database and then bulk
populate it using the GFF3 loader, or you can monitor a directory of
preexisting GFF3 and FASTA files and rebuild the indexes whenever one
or more of the files changes. The last mode is probably the most
convenient. Note that the indexer will only pay attention to files
that end with .gff3, .wig and .fa.
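For example, in the monitoring mode the source directory might look
like this (an illustrative layout; the data file names are
hypothetical):

  /home/fly4.3/dmel-all.gff3    # feature annotations, GFF3 format
  /home/fly4.3/dmel-chr.fa      # DNA sequence, FASTA format
  /home/fly4.3/expression.wig   # quantitative data, WIG format
  /home/fly4.3/indexes/         # index files built and maintained by the adaptor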
=item The new() constructor

The new() constructor method accepts all the arguments recognized by
Bio::DB::SeqFeature::Store, and a few additional ones.

Standard arguments:

 -adaptor           The name of the Adaptor class (default DBI::mysql)

 -serializer        The name of the serializer class (default Storable)

 -index_subfeatures Whether or not to make subfeatures searchable

 -cache             Activate LRU caching feature -- size of cache

 -compress          Compresses features before storing them in database

Adaptor-specific arguments:

 -dsn               Where the index files are stored

 -dir               Where the source (GFF3, FASTA) files are stored

 -autoindex         An alias for -dir.

 -write             Pass true to open the index files for writing.

 -create            Pass true to create the index files if they don't exist

 -locking           Use advisory locking to avoid one process trying to read
                    from the database while another is updating it (may not
                    work properly over NFS).

 -temp              Pass true to create temporary index files that will
                    be deleted once the script exits.

 -verbose           Pass true to report autoindexing operations on STDERR.
To create an empty database which will be populated using calls to
store() or new_feature(), or which will be bulk-loaded using the GFF3
loader:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

To open a preexisting database in read-only mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3');

To open a preexisting database in read/write (update) mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -write   => 1);

To monitor a set of GFF3 and FASTA files located in a directory and
create/update the database indexes as needed, with the indexes stored
in a new subdirectory named "indexes":

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/var/databases/fly4.3');

As above, but store the source files and index files in separate directories:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -dir     => '/home/gff3_files/fly4.3');

To be indexed, files must end with one of .gff3 (GFF3 format), .fa
(FASTA format) or .wig (WIG format).
B<-autoindex> is an alias for B<-dir>.

You should specify B<-locking> in a multiuser environment, including
the case in which the database is being used by a web server at the
same time another user might be updating it.
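For example, a web front end that only reads the database and a
separate update script can both request advisory locking (a sketch
using the options documented above; the paths are illustrative):

  # reader, e.g. inside the web application
  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dsn     => '/var/databases/fly4.3',
                                            -locking => 1);

  # updater, run separately
  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dsn     => '/var/databases/fly4.3',
                                            -write   => 1,
                                            -locking => 1);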
See L<Bio::DB::SeqFeature::Store> for all the access methods supported
by this adaptor. The various methods for storing and updating features
and sequences into the database are supported, but there is no locking
beyond the advisory B<-locking> option described above. If two
processes try to update the same database simultaneously, the database
will likely become corrupted.

=cut
# object initialization
sub init {
  my $self = shift;
  my ($directory,
      $is_temporary,
      $write,
      $create,
      $verbose,
      $locking,
      $autoindex,
      ) = rearrange([['DSN','DB'],
                     ['TMP','TEMP','TEMPORARY'],
                     [qw(WRITE WRITABLE)],
                     'CREATE',
                     'VERBOSE',
                     'LOCKING',
                     ['DIR','AUTOINDEX'],
                    ],@_);
  $verbose = 1 unless defined $verbose;

  if ($autoindex) {
    -d $autoindex or $self->throw("Invalid directory $autoindex");
    $directory ||= "$autoindex/indexes";
  }
  $directory ||= $is_temporary ? File::Spec->tmpdir : '.';

  my $pacname = __PACKAGE__;
  if ($^O =~ /mswin/i) {
    $pacname =~ s/:+/_/g;
  }
  $directory = tempdir($pacname.'_XXXXXX',
                       DIR => $directory) if $is_temporary;

  -d $directory or $self->throw("Invalid directory $directory");

  $create++ if $is_temporary;

  $self->throw("Can't write into the directory $directory")
    if $write && !-w $directory;

  $self->default_settings;
  $self->directory($directory);
  $self->temporary($is_temporary);
  $self->verbose($verbose);
  $self->locking($locking);
  $self->_delete_databases() if $create;
  if ($autoindex && -d $autoindex) {
    $self->auto_reindex($autoindex);
  }
  $self->lock('shared');
  $self->_open_databases($write,$create,$autoindex);
  $self->_permissions($write,$create);
}
sub can_store_parentage { 1 }
sub auto_reindex {
  my $self    = shift;
  my $autodir = shift;
  my $result  = $self->needs_auto_reindexing($autodir);

  if ($result && %$result) {
    $self->flag_autoindexing(1);
    $self->lock('exclusive');
    $self->reindex_wigfiles($result->{wig},$autodir) if $result->{wig};
    $self->reindex_ffffiles($result->{fff},$autodir) if $result->{fff};
    $self->reindex_gfffiles($result->{gff},$autodir) if $result->{gff};
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
    $self->unlock;
    $self->flag_autoindexing(0);
  }

  else {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
  }
}
sub autoindex_flagfile {
  return File::Spec->catfile(shift->directory,'autoindex.pid');
}
sub auto_index_in_process {
  my $self      = shift;
  my $flag_file = $self->autoindex_flagfile;
  return unless -e $flag_file;

  # if flagfile exists, then check that PID still exists
  open my $fh,'<',$flag_file or die "Couldn't open $flag_file: $!";
  my $pid = <$fh>;
  close $fh;
  return 1 if kill 0=>$pid;

  warn "Autoindexing seems to be running in another process, but the process has gone away. Trying to override...";
  if (unlink $flag_file) {
    warn "Successfully removed stale PID file." if $self->verbose;
    warn "Assuming partial reindexing process. Rebuilding indexes from scratch..." if $self->verbose;
    my $glob = File::Spec->catfile($self->directory,'*');
    unlink glob($glob);
    return;
  }
  croak("Cannot recover from apparent aborted autoindex process. Please remove files in ",
        $self->directory,
        " and allow the adaptor to reindex");
}
sub flag_autoindexing {
  my ($self,$doit) = @_;
  my $flag_file = $self->autoindex_flagfile;
  if ($doit) {
    open my $fh,'>',$flag_file or die "Couldn't open $flag_file: $!";
    print $fh $$;
    close $fh;
  }
  else {
    unlink $flag_file;
  }
}
sub reindex_gfffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing GFF files...\n" if $self->verbose;
  my $exists = -e $self->_features_path;

  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,!$exists);
  require Bio::DB::SeqFeature::Store::GFF3Loader
    unless Bio::DB::SeqFeature::Store::GFF3Loader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store    => $self,
                                                           -sf_class => $self->seqfeature_class,
                                                           -verbose  => $self->verbose,
                                                          )
    or $self->throw("Couldn't create GFF3Loader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}
sub reindex_ffffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing FFF files...\n" if $self->verbose;
  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,1);
  require Bio::DB::SeqFeature::Store::FeatureFileLoader
    unless Bio::DB::SeqFeature::Store::FeatureFileLoader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::FeatureFileLoader->new(-store    => $self,
                                                                  -sf_class => $self->seqfeature_class,
                                                                  -verbose  => $self->verbose,
                                                                 )
    or $self->throw("Couldn't create FeatureFileLoader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}
sub reindex_wigfiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing wig files...\n" if $self->verbose;

  unless (Bio::Graphics::Wiggle::Loader->can('new')) {
    eval "require Bio::Graphics::Wiggle::Loader; 1"
      or return;
  }

  for my $wig (@$files) {
    warn "Reindexing $wig...\n" if $self->verbose;
    my ($wib_name) = fileparse($wig,qr/\.[^.]*/);
    my $gff3_name  = "$wib_name.gff3";

    # unlink all wib files that share the basename
    my $wib_glob = File::Spec->catfile($self->directory,"$wib_name*.wib");
    unlink glob($wib_glob);

    my $loader = Bio::Graphics::Wiggle::Loader->new($self->directory,$wib_name);
    my $fh     = IO::File->new($wig) or die "Can't open $wig: $!";
    $loader->load($fh);  # will create one or more .wib files

    my $gff3_data = $loader->featurefile('gff3','microarray_oligo',$wib_name);
    my $gff3_path = File::Spec->catfile($autodir,$gff3_name);
    $fh = IO::File->new($gff3_path,'>')
      or die "Can't open $gff3_path for writing: $!";
    $fh->print($gff3_data);
    $fh->close;

    my $conf_path = File::Spec->catfile($autodir,"$wib_name.conf");
    $fh = IO::File->new($conf_path,'>');
    $fh->print($loader->conf_stanzas('microarray_oligo',$wib_name));
    $fh->close;
  }
}
# returns the following hashref
# empty hash if nothing needs reindexing
# {fasta => 1}                 if DNA database needs reindexing
# {gff => [list,of,gff,paths]} if gff3 files need reindexing
# {fff => [list,of,fff,paths]} if fff files need reindexing
# {wig => [list,of,wig,paths]} if wig files need reindexing
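# For example (illustrative file names): if volvox.gff3 has been edited since
# the last timestamp and a new reads.wig has appeared, the method might return
#   { gff => ['/data/volvox.gff3'], wig => ['/data/reads.wig'] }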
sub needs_auto_reindexing {
  my $self    = shift;
  my $autodir = shift;
  # don't allow two processes to reindex simultaneously
  $self->auto_index_in_process and croak "Autoindexing in process. Try again later";

  # first interrogate the GFF3 files, using the timestamp file
  # as modification comparison
  my (@gff3,@fff,@wig,$fasta,$fasta_index_time);
  opendir (my $D,$autodir)
    or $self->throw("Couldn't open directory $autodir for reading: $!");

  my $maxtime = 0;
  my $result  = {};

  my $timestamp_time = _mtime($self->_mtime_path) || 0;
  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = File::Spec->catfile($autodir,$node);
    next unless -f $path;

    if ($path =~ /\.gff\d?$/i) {
      my $mtime = _mtime(\*_);  # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @gff3,$path;
    }

    elsif ($path =~ /\.fff?$/i) {
      my $mtime = _mtime(\*_);  # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @fff,$path;
    }

    elsif ($path =~ /\.wig$/i) {
      my $wig = $path;
      (my $gff_file = $wig) =~ s/\.wig$/\.gff3/i;
      next if -e $gff_file && _mtime($gff_file) > _mtime($path);
      push @wig,$wig;
      push @gff3,$gff_file;
    }

    elsif ($path =~ /\.(fa|fasta|dna)$/i) {
      $fasta_index_time =
        _mtime(File::Spec->catfile($self->directory,'fasta.index'))||0
          unless defined $fasta_index_time;
      $fasta++ if _mtime($path) > $fasta_index_time;
    }
  }

  $result->{gff}   = \@gff3 if $maxtime > $timestamp_time;
  $result->{wig}   = \@wig  if @wig;
  $result->{fff}   = \@fff  if @fff;
  $result->{fasta}++        if $fasta;
  return $result;
}
sub verbose {
  my $self = shift;
  my $d    = $self->{verbose};
  $self->{verbose} = shift if @_;
  $d;
}

sub locking {
  my $self = shift;
  my $d    = $self->{locking};
  $self->{locking} = shift if @_;
  $d;
}
sub lockfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'lock');
}

sub lock {
  my $self = shift;
  my $mode = shift;
  return unless $self->locking;

  my $flag     = $mode eq 'exclusive' ? LOCK_EX : LOCK_SH;
  my $lockfile = $self->lockfile;
  my $fh       = $self->_flock_fh;
  unless ($fh) {
    my $open = -e $lockfile ? '<' : '>';
    $fh = IO::File->new($lockfile,$open) or die "Cannot open $lockfile: $!";
  }
  flock($fh,$flag);
  $self->_flock_fh($fh);
}

sub unlock {
  my $self = shift;
  return unless $self->locking;

  my $fh = $self->_flock_fh or return;
  flock($fh,LOCK_UN);
  undef $self->{flock_fh};
}

sub _flock_fh {
  my $self = shift;
  my $d    = $self->{flock_fh};
  $self->{flock_fh} = shift if @_;
  $d;
}
sub _open_databases {
  my $self = shift;
  my ($write,$create,$ignore_errors) = @_;
  return if $self->db;  # already open - don't reopen

  my $directory = $self->directory;
  unless (-d $directory) {  # directory does not exist
    $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
    mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
  }

  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  # Create the main database; this is a DB_HASH implementation
  my %h;
  my $result = tie(%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH);
  unless ($result) {
    return if $ignore_errors;  # autoindex set, so defer this
    $self->throw("Couldn't tie: ".$self->_features_path . " $!");
  }
  unless (%h) {
    $h{'.next_id'} = 1;
    $h{'.version'} = $VERSION;
  }
  $self->db(\%h);

  # Create the index databases; these are DB_BTREE implementations with duplicates allowed.
  local $DB_BTREE->{flags} = R_DUP;
  $DB_BTREE->{compare}     = sub { lc($_[0]) cmp lc($_[1]) };
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    my %db;
    tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE)
      or $self->throw("Couldn't tie $path: $!");
    $self->index_db($idx=>\%db);
  }

  # Create the parentage database
  my %p;
  tie(%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
  $self->parentage_db(\%p);

  if (-e $self->_fasta_path) {
    my $dna_db = Bio::DB::Fasta::Subdir->new($self->_fasta_path)
      or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }

  my $mode =  $write  ? "+>>"
            : $create ? "+>"
            : "<";
  open (my $F,$mode,$self->_notes_path) or $self->throw($self->_notes_path.": $!");
  $self->notes_db($F);
}
sub commit { # reindex fasta files
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->{fasta_file}));
  } elsif (-d $self->directory) {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->directory));
  }
}
sub _close_databases {
  my $self = shift;
  $self->db(undef);
  $self->dna_db(undef);
  $self->notes_db(undef);
  $self->index_db($_=>undef) foreach $self->_index_files;
  $self->parentage_db(undef);
}
# do nothing -- new() with -create=>1 will do the trick
sub _init_database { }

sub _delete_databases {
  my $self = shift;
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    unlink $path;
  }
  unlink $self->_parentage_path;
  unlink $self->_fasta_path;
  unlink $self->_features_path;
  unlink $self->_mtime_path;
}
sub _touch_timestamp {
  my $self = shift;
  my $tsf  = $self->_mtime_path;
  open (my $F,'>',$tsf) or $self->throw("Couldn't open $tsf: $!");
  print $F scalar(localtime);
  close $F;
}
sub _store {
  my $self    = shift;
  my $indexed = shift;
  my $db      = $self->db;
  my $count   = 0;
  for my $obj (@_) {
    my $primary_id = $obj->primary_id;
    $self->_delete_indexes($obj,$primary_id) if $indexed && $primary_id;
    $primary_id = $db->{'.next_id'}++ unless defined $primary_id;
    $db->{$primary_id} = $self->freeze($obj);
    $obj->primary_id($primary_id);
    $self->_update_indexes($obj) if $indexed;
    $count++;
  }
  $count;
}
sub _delete_indexes {
  my $self = shift;
  my ($obj,$id) = @_;

  # the additional "1" causes the index to be deleted
  $self->_update_name_index($obj,$id,1);
  $self->_update_type_index($obj,$id,1);
  $self->_update_location_index($obj,$id,1);
  $self->_update_attribute_index($obj,$id,1);
  $self->_update_note_index($obj,$id,1);
}
sub _fetch {
  my $self = shift;
  my $id   = shift;
  my $db   = $self->db;
  my $obj  = $self->thaw($db->{$id},$id);
  $obj;
}
sub _add_SeqFeature {
  my $self = shift;
  my ($parent,@children) = @_;
  my $parent_id = (ref $parent ? $parent->primary_id : $parent)
    or $self->throw("$parent should have a primary_id");
  my $p = $self->parentage_db;
  for my $child (@children) {
    my $child_id = ref $child ? $child->primary_id : $child;
    defined $child_id or $self->throw("no primary ID known for $child");
    $p->{$parent_id} = $child_id;
  }
}
sub _fetch_SeqFeatures {
  my $self   = shift;
  my $parent = shift;
  my @types  = @_;
  my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
  my $index     = $self->parentage_db;
  my $db        = tied %$index;

  my @children_ids = $db->get_dup($parent_id);
  my @children     = map {$self->fetch($_)} @children_ids;

  if (@types) {
    my $regexp = join '|',map {quotemeta($_)} $self->find_types(@types);
    return grep {($_->primary_tag.':'.$_->source_tag) =~ /^$regexp$/i} @children;
  }
  else {
    return @children;
  }
}
sub _update_indexes {
  my ($self,$obj) = @_;
  defined (my $id = $obj->primary_id) or return;
  $self->_update_name_index($obj,$id);
  $self->_update_type_index($obj,$id);
  $self->_update_location_index($obj,$id);
  $self->_update_attribute_index($obj,$id);
  $self->_update_note_index($obj,$id);
}
sub _update_name_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
  my ($names,$aliases) = $self->feature_names($obj);

  # little stinky - needs minor refactoring
  foreach (@$names) {
    my $key = lc $_;
    $self->update_or_delete($delete,$db,$key,$id);
  }

  foreach (@$aliases) {
    my $key = lc($_)."_2";  # the _2 indicates a secondary (alias) ID
    $self->update_or_delete($delete,$db,$key,$id);
  }
}
sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");
  my $primary_tag = $obj->primary_tag;
  my $source_tag  = $obj->source_tag || '';
  return unless defined $primary_tag;

  $primary_tag .= ":$source_tag";
  my $key = lc $primary_tag;
  $self->update_or_delete($delete,$db,$key,$id);
}
# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
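# For example (illustrative): with BINSIZE 10_000, a feature on Chr1 spanning
# 25_000..45_000 is registered under the keys "chr1.000002", "chr1.000003"
# and "chr1.000004", each carrying a packed (id,start,end,strand) tuple as
# its value.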
sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id  = $obj->seq_id || '';
  my $start   = $obj->start  || '';
  my $end     = $obj->end    || '';
  my $strand  = $obj->strand;
  my $bin_min = int $start/BINSIZE;
  my $bin_max = int $end/BINSIZE;

  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = sprintf("%s.%06d",lc($seq_id),$bin);
    $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
  }
}
sub _update_attribute_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('attributes')
    or $self->throw("Couldn't find 'attributes' index file");

  for my $tag ($obj->get_all_tags) {
    for my $value ($obj->get_tag_values($tag)) {
      my $key = "${tag}:${value}";
      $self->update_or_delete($delete,$db,$key,$id);
    }
  }
}
sub _update_note_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  return if $delete; # we don't know how to do this

  my $fh    = $self->notes_db;
  my @notes = $obj->has_tag('Note') ? $obj->get_tag_values('Note') : ();

  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
    foreach @notes;
}
sub update_or_delete {
  my $self = shift;
  my ($delete,$db,$key,$id) = @_;
  if ($delete) {
    tied(%$db)->del_dup($key,$id);
  }
  else {
    $db->{$key} = $id;
  }
}
# these methods return pointers to....
# the database that stores serialized objects
sub db {
  my $self = shift;
  my $d = $self->setting('db');
  $self->setting(db=>shift) if @_;
  $d;
}

sub parentage_db {
  my $self = shift;
  my $d = $self->setting('parentage_db');
  $self->setting(parentage_db=>shift) if @_;
  $d;
}

# the Bio::DB::Fasta object
sub dna_db {
  my $self = shift;
  my $d = $self->setting('dna_db');
  $self->setting(dna_db=>shift) if @_;
  $d;
}

# the specialized notes database
sub notes_db {
  my $self = shift;
  my $d = $self->setting('notes_db');
  $self->setting(notes_db=>shift) if @_;
  $d;
}

# The indicated index berkeley db
sub index_db {
  my $self = shift;
  my $index_name = shift;
  my $d = $self->setting($index_name);
  $self->setting($index_name=>shift) if @_;
  $d;
}
sub _mtime {
  my $file = shift;
  my @stat = stat($file);
  return $stat[9];
}

# return names of all the indexes
sub _index_files {
  return qw(names types locations attributes);
}
# the directory in which we store our indexes
sub directory {
  my $self = shift;
  my $d = $self->setting('directory');
  $self->setting(directory=>shift) if @_;
  $d;
}

# flag indicating that we are a temporary database
sub temporary {
  my $self = shift;
  my $d = $self->setting('temporary');
  $self->setting(temporary=>shift) if @_;
  $d;
}
sub _permissions {
  my $self = shift;
  my $d = $self->setting('permissions') or return;
  if (@_) {
    my ($write,$create) = @_;
    $self->setting(permissions=>[$write,$create]);
  }
  return @$d;
}
# file name utilities...
sub _qualify {
  my $self = shift;
  my $file = shift;
  return $self->directory .'/' . $file;
}

sub _features_path {
  shift->_qualify('features.bdb');
}

sub _parentage_path {
  shift->_qualify('parentage.bdb');
}

sub _type_path {
  shift->_qualify('types.idx');
}

sub _location_path {
  shift->_qualify('locations.idx');
}

sub _attribute_path {
  shift->_qualify('attributes.idx');
}

sub _notes_path {
  shift->_qualify('notes.idx');
}

sub _fasta_path {
  shift->_qualify('sequence.fa');
}

sub _mtime_path {
  shift->_qualify('mtime.stamp');
}
###########################################
# search functions
###########################################
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator,
      ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                     'NAME','CLASS','ALIASES',
                     ['TYPES','TYPE','PRIMARY_TAG'],
                     ['ATTRIBUTES','ATTRIBUTE'],
                     'RANGE_TYPE',
                     'ITERATOR',
                    ],@_);
  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my (@result,%found);
  my $result = 1;

  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
    @result = grep {!/^\./} keys %{$self->db};
  }

  if (defined($name)) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) {
    $result &&= $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
  }

  if (defined $types) {
    $result &&= $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $result &&= $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found if $result;
  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}
sub filter_by_name {
  my $self = shift;
  my ($name,$allow_aliases,$filter) = @_;

  my $index = $self->index_db('names');
  my $db    = tied(%$index);

  my ($stem,$regexp) = $self->glob_match($name);
  $stem   ||= $name;
  $regexp ||= quotemeta($name);
  $regexp  .= "(?:_2)?" if $allow_aliases;

  my $key = $stem;
  my $value;
  my @results;
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0 and $key =~ /^$regexp$/i;
       $status = $db->seq($key,$value,R_NEXT)) {
    next if %$filter && !$filter->{$value};  # don't bother
    push @results,$value;
  }
  $self->update_filter($filter,\@results);
}
sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    $source_tag ||= '';
    my $key = lc "$primary_tag:$source_tag";

    # If filter is already provided, then it is usually faster to
    # fetch each object.
    if (%$filter) {
      for my $id (keys %$filter) {
        my $obj = $self->_fetch($id) or next;
        push @results,$id if $obj->type =~ /$match/i;
      }
    }

    else {
      my $value;
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /$match/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value};  # don't even bother
        push @results,$value;
      }
    }
  }
  $self->update_filter($filter,\@results);
}
sub filter_by_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;
  $strand ||= 0;

  my $index = $self->index_db('locations');
  my $db    = tied(%$index);

  my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
  my $binend   = defined $end   ? sprintf("%06d",int $end/BINSIZE)   : 'z';  # beyond a number

  my %seenit;
  my @results;

  $start = MININT if !defined $start;
  $end   = MAXINT if !defined $end;
  my $version_2 = $self->version > 1;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $key     = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $keystop = $version_2 ? "\L$seq_id\E.$binend"   : "\L$seq_id\E$binend";

    my $value;
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      if ($range_type eq 'overlaps') {
        next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
        next unless $fstart >= $start && $fend <= $end;
      }
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $key     = $version_2 ? "\L$seq_id." : "\L$seq_id";
    my $keystop = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }

    # do the right part of the range
    $key = "\L$seq_id\E.$binend";
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key =~ /^\L$seq_id\E/;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }
  }

  $self->update_filter($filter,\@results);
}
sub attributes {
  my $self  = shift;
  my $index = $self->index_db('attributes');
  my %a     = map {s/:.+$//; $_=> 1} keys %$index;
  return keys %a;
}
sub filter_by_attribute {
  my $self = shift;
  my ($attributes,$filter) = @_;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

  my $result;
  my @result;

  for my $att_name (keys %$attributes) {

    my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
                           ? @{$attributes->{$att_name}} : $attributes->{$att_name};

    for my $v (@search_terms) {
      my ($stem,$regexp) = $self->glob_match($v);
      $stem   ||= $v;
      $regexp ||= quotemeta($v);
      my $key = "\L${att_name}:${stem}\E";
      my $value;
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /^$att_name:$regexp$/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value};  # don't bother
        push @result,$value;
      }
    }
    $result ||= $self->update_filter($filter,\@result);
  }

  $result;
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_array,$limit) = @_;
  $search_string =~ tr/*?//d;
  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

  my (%results,%notes);

  for my $tag (@$attribute_array) {
    my $id;
    my $key = "\L$tag:\E";
    for (my $status = $db->seq($key,$id,R_CURSOR);
         $status == 0 and $key =~ /^$tag:(.*)/i;
         $status = $db->seq($key,$id,R_NEXT)) {
      my $text = $1;
      next unless $text =~ /$search/;
      for my $w (@words) {
        my @hits = $text =~ /($w)/ig or next;
        $results{$id} += @hits;
      }
      $notes{$id} .= "$text ";
    }
  }

  my @results;
  for my $id (keys %results) {
    my $hits = $results{$id};
    my $note = $notes{$id};

    my $relevance = 10 * $hits;
    my $feature   = $self->fetch($id) or next;
    my $name      = $feature->display_name or next;
    my $type      = $feature->type;
    push @results,[$name,$note,$relevance,$type,$id];
  }
  return @results;
}
sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;

  my (%found,$found);
  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my $note_index = $self->notes_db;
  seek($note_index,0,0);  # back to start
  while (<$note_index>) {
    next unless /$search/;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->fetch($idx) or next;
    my @values  = $feature->has_tag('Note') ? $feature->get_tag_values('Note') : ();
    my $value   = "@values";

    my $hits = 0;
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  my @results;
  for (my $i=0; $i<@matches; $i++) {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $note;
    $note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
    push @results,[$feature->display_name,$note,$relevance];
  }
  return @results;
}
sub glob_match {
  my $self = shift;
  my $term = shift;
  return unless $term =~ /([^*?]*)(?:^|[^\\])?[*?]/;
  my $stem = $1;
  $term =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
  $term =~ s/(^|[^\\])\*/$1.*/g;
  $term =~ s/(^|[^\\])\?/$1./g;
  return ($stem,$term);
}
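# For example (illustrative): glob_match('kin*se') returns the stem 'kin' and
# the regexp 'kin.*se'; filter_by_name() and filter_by_attribute() use the
# stem to position the BTREE cursor and the regexp to match forward from there.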
sub update_filter {
  my $self = shift;
  my ($filter,$results) = @_;
  return unless @$results;

  if (%$filter) {
    my @filtered = grep {$filter->{$_}} @$results;
    %$filter     = map {$_=>1} @filtered;
  }
  else {
    %$filter     = map {$_=>1} @$results;
  }
  1;
}
1329 eval "require Bio::DB::GFF::Typename"
1330 unless Bio
::DB
::GFF
::Typename
->can('new');
1332 my $index = $self->index_db('types');
1333 my $db = tied(%$index);
1334 return map {Bio
::DB
::GFF
::Typename
->new($_)} keys %$index;
sub _insert_sequence {
  my $self = shift;
  my ($seqid,$seq,$offset) = @_;
  my $dna_fh = $self->private_fasta_file or return;
  if ($offset == 0) { # start of the sequence
    print $dna_fh ">$seqid\n";
  }
  print $dna_fh $seq,"\n";
}
sub _fetch_sequence {
  my $self = shift;
  my ($seqid,$start,$end) = @_;
  my $db = $self->dna_db or return;
  return $db->seq($seqid,$start,$end);
}
sub private_fasta_file {
  my $self = shift;
  return $self->{fasta_fh} if exists $self->{fasta_fh};
  $self->{fasta_file} = $self->_qualify("sequence.fa");
  return $self->{fasta_fh} = IO::File->new($self->{fasta_file},">");
}
sub finish_bulk_update {
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->{fasta_db} = Bio::DB::Fasta::Subdir->new($self->{fasta_file});
  }
}

sub version {
  my $self = shift;
  my $db   = $self->db;
  return $db->{'.version'} || 1.00;
}
sub DESTROY {
  my $self = shift;
  $self->_close_databases();
  rmtree($self->directory,0,1) if $self->temporary;
}
# TIE interface -- a little annoying because we are storing magic ".variable"
# meta-variables in the same data structure as the IDs, so these variables
# need to be skipped over during iteration.
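# For example (illustrative): alongside the numeric primary IDs, the features
# DB_HASH also holds the keys '.version' and '.next_id', which is why
# iteration and counting skip keys beginning with a dot.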
sub _firstid {
  my $self = shift;
  my $db   = $self->db;
  my ($key,$value);
  while ( ($key,$value) = each %{$db}) {
    last unless $key =~ /^\./;
  }
  $key;
}

sub _nextid {
  my $self = shift;
  my $id   = shift;
  my $db   = $self->db;
  my ($key,$value);
  while ( ($key,$value) = each %$db) {
    last unless $key =~ /^\./;
  }
  $key;
}

sub _existsid {
  my $self = shift;
  my $id   = shift;
  return exists $self->db->{$id};
}

sub _deleteid {
  my $self = shift;
  my $id   = shift;
  my $obj  = $self->fetch($id) or return;
  $self->_delete_indexes($obj,$id);
  delete $self->db->{$id};
}

sub _clearall {
  my $self = shift;
  $self->_close_databases();
  $self->_delete_databases();
  my ($write,$create) = $self->_permissions;
  $self->_open_databases($write,$create);
}

sub _featurecount {
  my $self = shift;
  return scalar %{$self->db};
}
package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;

sub new {
  my $class = shift;
  my $store = shift;
  my $ids   = shift;
  return bless {store => $store,
                ids   => $ids},ref($class) || $class;
}

sub next_seq {
  my $self  = shift;
  my $store = $self->{store} or return;
  my $id    = shift @{$self->{ids}};
  defined $id or return;
  return $store->fetch($id);
}
package Bio::DB::Fasta::Subdir;

use base 'Bio::DB::Fasta';

# alter calling arguments so that the fasta index file is placed in a
# subdirectory named "indexes"
sub index_name {
  my $self = shift;
  my ($path,$isdir) = @_;
  return $self->SUPER::index_name($path,$isdir)
    unless $isdir;
  return File::Spec->catfile($path,'indexes','fasta.index');
}
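# For example (illustrative): indexing the directory /var/databases/fly4.3
# places the Bio::DB::Fasta index at /var/databases/fly4.3/indexes/fasta.index
# rather than in the directory itself.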
1;

=head1 BUGS

This is an early version, so there are certainly some bugs. Please
use the BioPerl bug tracking system to report bugs.
=head1 SEE ALSO

L<Bio::DB::SeqFeature>,
L<Bio::DB::SeqFeature::Store>,
L<Bio::DB::SeqFeature::GFF3Loader>,
L<Bio::DB::SeqFeature::Segment>,
L<Bio::DB::SeqFeature::Store::memory>,
L<Bio::DB::SeqFeature::Store::DBI::mysql>
=head1 AUTHOR

Lincoln Stein E<lt>lstein@cshl.orgE<gt>.

Copyright (c) 2006 Cold Spring Harbor Laboratory.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut