tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / SeqFeature / Store / berkeleydb.pm
blob7014532a73e3d431308904af026bc161307ae838
1 package Bio::DB::SeqFeature::Store::berkeleydb;
3 # $Id$
5 use strict;
6 use base 'Bio::DB::SeqFeature::Store';
7 use Bio::DB::GFF::Util::Rearrange 'rearrange';
8 use DB_File;
9 use Fcntl qw(O_RDWR O_CREAT :flock);
10 use IO::File;
11 use File::Temp 'tempdir';
12 use File::Path 'rmtree','mkpath';
13 use File::Basename;
14 use File::Spec;
15 use Carp 'carp','croak';
17 use constant BINSIZE => 10_000;
18 use constant MININT => -999_999_999_999;
19 use constant MAXINT => 999_999_999_999;
21 =head1 NAME
23 Bio::DB::SeqFeature::Store::berkeleydb -- Storage and retrieval of sequence annotation data in Berkeleydb files
25 =head1 SYNOPSIS
27 use Bio::DB::SeqFeature::Store;
29 # Create a database from the feature files located in /home/fly4.3 and store
30 # the database index in the same directory:
31 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
32 -dir => '/home/fly4.3');
34 # Create a database that will monitor the files in /home/fly4.3, but store
35 # the indexes in /var/databases/fly4.3
36 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
37 -dsn => '/var/databases/fly4.3',
38 -dir => '/home/fly4.3');
40 # Create a feature database from scratch
41 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
42 -dsn => '/var/databases/fly4.3',
43 -create => 1);
45 # get a feature from somewhere
46 my $feature = Bio::SeqFeature::Generic->new(...);
48 # store it
49 $db->store($feature) or die "Couldn't store!";
51 # primary ID of the feature is changed to indicate its primary ID
52 # in the database...
53 my $id = $feature->primary_id;
55 # get the feature back out
56 my $f = $db->fetch($id);
58 # change the feature and update it
59 $f->start(100);
60 $db->update($f) or $self->throw("Couldn't update!");
62 # use the GFF3 loader to do a bulk write:
63 my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store => $db,
64 -verbose => 0,
65 -fast => 1);
66 $loader->load('/home/fly4.3/dmel-all.gff');
69 # searching...
70 # ...by id
71 my @features = $db->fetch_many(@list_of_ids);
73 # ...by name
74 @features = $db->get_features_by_name('ZK909');
76 # ...by alias
77 @features = $db->get_features_by_alias('sma-3');
79 # ...by type
80 @features = $db->get_features_by_type('gene');
82 # ...by location
83 @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);
85 # ...by attribute
86 @features = $db->get_features_by_attribute({description => 'protein kinase'})
88 # ...by the GFF "Note" field
89 @result_list = $db->search_notes('kinase');
91 # ...by arbitrary combinations of selectors
92 @features = $db->features(-name => $name,
93 -type => $types,
94 -seq_id => $seqid,
95 -start => $start,
96 -end => $end,
97 -attributes => $attributes);
99 # ...using an iterator
100 my $iterator = $db->get_seq_stream(-name => $name,
101 -type => $types,
102 -seq_id => $seqid,
103 -start => $start,
104 -end => $end,
105 -attributes => $attributes);
107 while (my $feature = $iterator->next_seq) {
108 # do something with the feature
109 }
111 # ...limiting the search to a particular region
112 my $segment = $db->segment('Chr1',5000=>6000);
113 my @features = $segment->features(-type=>['mRNA','match']);
115 # what feature types are defined in the database?
116 my @types = $db->types;
118 # getting & storing sequence information
119 # Warning: this returns a string, and not a PrimarySeq object
120 $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
121 my $sequence = $db->fetch_sequence('Chr1',5000=>6000);
123 # create a new feature in the database
124 my $feature = $db->new_feature(-primary_tag => 'mRNA',
125 -seq_id => 'chr3',
126 -start => 10000,
127 -end => 11000);
129 =head1 DESCRIPTION
131 Bio::DB::SeqFeature::Store::berkeleydb is the Berkeleydb adaptor for
132 Bio::DB::SeqFeature::Store. You will not create it directly, but
133 instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.
135 See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
137 =head2 Using the berkeleydb adaptor
139 The Berkeley database consists of a series of Berkeleydb index files,
140 and a couple of special purpose indexes. You can create the index
141 files from scratch by creating a new database and calling
142 new_feature() repeatedly, you can create the database and then bulk
143 populate it using the GFF3 loader, or you can monitor a directory of
144 preexisting GFF3 and FASTA files and rebuild the indexes whenever one
145 or more of the fields changes. The last mode is probably the most
146 convenient. Note that the indexer will only pay attention to files
147 that end with .gff3, .wig and .fa.
149 =over 4
151 =item The new() constructor
153 The new() constructor method accepts all the arguments recognized by
154 Bio::DB::SeqFeature::Store, and a few additional ones.
156 Standard arguments:
158 Name Value
159 ---- -----
161 -adaptor The name of the Adaptor class (default DBI::mysql)
163 -serializer The name of the serializer class (default Storable)
165 -index_subfeatures Whether or not to make subfeatures searchable
166 (default true)
168 -cache Activate LRU caching feature -- size of cache
170 -compress Compresses features before storing them in database
171 using Compress::Zlib
173 Adaptor-specific arguments
175 Name Value
176 ---- -----
178 -dsn Where the index files are stored
180 -dir Where the source (GFF3, FASTA) files are stored
182 -autoindex An alias for -dir.
184 -write Pass true to open the index files for writing.
186 -create Pass true to create the index files if they don't exist
187 (implies -write=>1)
189 -locking Use advisory locking to avoid one process trying to read
190 from the database while another is updating it (may not
191 work properly over NFS).
193 -temp Pass true to create temporary index files that will
194 be deleted once the script exits.
196 -verbose Pass true to report autoindexing operations on STDERR.
197 (default is true).
199 Examples:
201 To create an empty database which will be populated using calls to
202 store() or new_feature(), or which will be bulk-loaded using the GFF3
203 loader:
205 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
206 -dsn => '/var/databases/fly4.3',
207 -create => 1);
209 To open a preexisting database in read-only mode:
211 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
212 -dsn => '/var/databases/fly4.3');
214 To open a preexisting database in read/write (update) mode:
216 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
217 -dsn => '/var/databases/fly4.3',
218 -write => 1);
220 To monitor a set of GFF3 and FASTA files located in a directory and
221 create/update the database indexes as needed. The indexes will be
222 stored in a new subdirectory named "indexes":
224 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
225 -dir => '/var/databases/fly4.3');
227 As above, but store the source files and index files in separate directories:
229 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
230 -dsn => '/var/databases/fly4.3',
231 -dir => '/home/gff3_files/fly4.3');
233 To be indexed, files must end with one of .gff3 (GFF3 format), .fa
234 (FASTA format) or .wig (WIG format).
236 B<-autoindex> is an alias for B<-dir>.
238 You should specify B<-locking> in a multiuser environment, including
239 the case in which the database is being used by a web server at the
240 same time another user might be updating it.
242 =back
244 See L<Bio::DB::SeqFeature::Store> for all the access methods supported
245 by this adaptor. The various methods for storing and updating features
246 and sequences into the database are supported, but there is no
247 locking unless the B<-locking> option is enabled. If two processes try
248 to update the same database simultaneously without locking, the
249 database will likely become corrupted.
250 =cut
# Object initialization: parse the adaptor-specific arguments, choose
# and (if necessary) create the index directory — possibly a temporary
# one — acquire a shared lock, and open the component databases.
sub init {
  my $self = shift;
  my ($directory,
      $autoindex,
      $is_temporary,
      $write,
      $create,
      $verbose,
      $locking,
      ) = rearrange([['DSN','DB'],
                     [qw(DIR AUTOINDEX)],
                     ['TMP','TEMP','TEMPORARY'],
                     [qw(WRITE WRITABLE)],
                     'CREATE',
                     'VERBOSE',
                     [qw(LOCK LOCKING)],
                    ],@_);
  $verbose = 1 unless defined $verbose;   # autoindexing reports by default

  if ($autoindex) {
    -d $autoindex or $self->throw("Invalid directory $autoindex");
    # by default the indexes live in a subdirectory of the monitored dir
    $directory ||= "$autoindex/indexes";
  }
  $directory ||= $is_temporary ? File::Spec->tmpdir : '.';

  # package name is used in the temp directory template; colons are not
  # legal in Windows paths
  my $pacname = __PACKAGE__;
  if ($^O =~ /mswin/i) {
    $pacname =~ s/:+/_/g;
  }
  $directory = tempdir($pacname.'_XXXXXX',
                       TMPDIR  => 1,
                       CLEANUP => 1,
                       DIR     => $directory) if $is_temporary;
  mkpath($directory);
  -d $directory or $self->throw("Invalid directory $directory");

  $create++ if $is_temporary;   # a temporary database is always created fresh
  $write ||= $create;           # -create implies -write
  $self->throw("Can't write into the directory $directory")
    if $write && !-w $directory;

  $self->default_settings;
  $self->directory($directory);
  $self->temporary($is_temporary);
  $self->verbose($verbose);
  $self->locking($locking);
  $self->_delete_databases() if $create;
  if ($autoindex && -d $autoindex) {
    $self->auto_reindex($autoindex);
  }
  $self->lock('shared');

  # this step may rebless $self into a subclass
  # to preserve backward compatibility with older
  # databases while providing better performance for
  # new databases.
  $self->possibly_rebless($create);

  $self->_open_databases($write,$create,$autoindex);
  $self->_permissions($write,$create);
  return $self;
}
320 sub version { return 2.0 };
# Decide whether to rebless $self into the berkeleydb3 subclass.
# A freshly created database always uses the newest format; an existing
# database is probed for the '.version' key stored in its main hash.
sub possibly_rebless {
  my $self   = shift;
  my $create = shift;
  my $do_rebless;

  if ($create) {
    $do_rebless++;
  } else { # probe database
    my %h;
    tie (%h,'DB_File',$self->_features_path,O_RDONLY,0666,$DB_HASH) or return;
    $do_rebless = $h{'.version'} >= 3.0;
  }

  if ($do_rebless) {
    eval "require Bio::DB::SeqFeature::Store::berkeleydb3";
    bless $self,'Bio::DB::SeqFeature::Store::berkeleydb3';
  }
}
342 sub can_store_parentage { 1 }
# Check whether any monitored source files changed and, if so, rebuild
# the affected indexes while holding an exclusive lock. In either case
# the DNA database is (re)attached to the monitored directory.
sub auto_reindex {
  my $self    = shift;
  my $autodir = shift;
  my $result  = $self->needs_auto_reindexing($autodir);

  if ($result && %$result) {
    $self->flag_autoindexing(1);
    $self->lock('exclusive');
    $self->reindex_wigfiles($result->{wig},$autodir) if $result->{wig};
    $self->reindex_ffffiles($result->{fff},$autodir) if $result->{fff};
    $self->reindex_gfffiles($result->{gff},$autodir) if $result->{gff};
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
    $self->unlock;
    $self->flag_autoindexing(0);
  }
  else {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
  }
}
# Path of the PID file that marks an autoindexing run in progress.
sub autoindex_flagfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'autoindex.pid');
}
# Return true if another live process appears to be autoindexing this
# database. If the recorded process has died, remove the stale PID file
# and wipe the index directory so indexes get rebuilt from scratch;
# croak if the stale state cannot be repaired.
sub auto_index_in_process {
  my $self = shift;
  my $flag_file = $self->autoindex_flagfile;
  return unless -e $flag_file;

  # if flagfile exists, then check that PID still exists
  # BUG FIX: use three-argument open with an explicit mode; the original
  # two-argument form would honor mode characters embedded in the path.
  open my $fh, '<', $flag_file or die "Couldn't open $flag_file: $!";
  my $pid = <$fh>;
  close $fh;
  chomp $pid if defined $pid;           # tolerate a trailing newline
  return 1 if $pid && kill 0 => $pid;   # guard against an empty/garbled file

  warn "Autoindexing seems to be running in another process, but the process has gone away. Trying to override...";
  if (unlink $flag_file) {
    warn "Successfully removed stale PID file." if $self->verbose;
    warn "Assuming partial reindexing process. Rebuilding indexes from scratch..." if $self->verbose;
    my $glob = File::Spec->catfile($self->directory,'*');
    unlink glob($glob);
    return;
  } else {
    # croak() never returns, so no trailing return is needed here
    croak ("Cannot recover from apparent aborted autoindex process. Please remove files in ",
           $self->directory,
           " and allow the adaptor to reindex");
  }
}
# Raise (true argument) or clear (false argument) the autoindexing flag
# by writing our PID into, or unlinking, the flag file.
sub flag_autoindexing {
  my ($self,$raise) = @_;
  my $flag_file = $self->autoindex_flagfile;
  unless ($raise) {
    unlink $flag_file;
    return;
  }
  open my $fh,'>',$flag_file or die "Couldn't open $flag_file: $!";
  print $fh $$;
  close $fh;
}
# Rebuild the feature indexes from a list of GFF3 files using the bulk
# GFF3 loader, then refresh the timestamp file.
sub reindex_gfffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing GFF files...\n" if $self->verbose;

  # if the feature database already exists, reopen it rather than
  # recreating it from scratch
  my $exists = -e $self->_features_path;

  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,!$exists);
  require Bio::DB::SeqFeature::Store::GFF3Loader
    unless Bio::DB::SeqFeature::Store::GFF3Loader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store    => $self,
                                                           -sf_class => $self->seqfeature_class,
                                                           -verbose  => $self->verbose,
                                                          )
    or $self->throw("Couldn't create GFF3Loader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);  # load each file only once
  $self->_touch_timestamp;
}
# Rebuild the feature indexes from a list of FFF (feature file format)
# files, then refresh the timestamp file.
sub reindex_ffffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing FFF files...\n" if $self->verbose;
  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,1);
  require Bio::DB::SeqFeature::Store::FeatureFileLoader
    unless Bio::DB::SeqFeature::Store::FeatureFileLoader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::FeatureFileLoader->new(-store    => $self,
                                                                  -sf_class => $self->seqfeature_class,
                                                                  -verbose  => $self->verbose,
                                                                 )
    or $self->throw("Couldn't create FeatureFileLoader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);  # load each file only once
  $self->_touch_timestamp;
}
# Reindex wiggle (.wig) files: each is converted to one or more binary
# .wib files in the index directory, plus a GFF3 file and a
# configuration stanza written back into $autodir.
sub reindex_wigfiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing wig files...\n" if $self->verbose;

  unless (Bio::Graphics::Wiggle::Loader->can('new')) {
    eval "require Bio::Graphics::Wiggle::Loader; 1"
      or return;  # optional dependency: silently skip wig indexing
  }

  for my $wig (@$files) {
    warn "Reindexing $wig...\n" if $self->verbose;
    my ($wib_name) = fileparse($wig,qr/\.[^.]*/);
    my $gff3_name  = "$wib_name.gff3";

    # unlink all wib files that share the basename
    my $wib_glob = File::Spec->catfile($self->directory,"$wib_name*.wib");
    unlink glob($wib_glob);

    my $loader = Bio::Graphics::Wiggle::Loader->new($self->directory,$wib_name);
    my $fh     = IO::File->new($wig) or die "Can't open $wig: $!";
    $loader->load($fh);  # will create one or more .wib files
    $fh->close;

    my $gff3_data = $loader->featurefile('gff3','microarray_oligo',$wib_name);
    my $gff3_path = File::Spec->catfile($autodir,$gff3_name);
    $fh = IO::File->new($gff3_path,'>')
      or die "Can't open $gff3_path for writing: $!";
    $fh->print($gff3_data);
    $fh->close;

    my $conf_path = File::Spec->catfile($autodir,"$wib_name.conf");
    # BUG FIX: this open was unchecked in the original; a failure would
    # cause a crash on the ->print call below instead of a clear error.
    $fh = IO::File->new($conf_path,'>')
      or die "Can't open $conf_path for writing: $!";
    $fh->print($loader->conf_stanzas('microarray_oligo',$wib_name));
    $fh->close;
  }
}
# returns the following hashref
#   empty hash if nothing needs reindexing
#   {fasta => 1}                  if DNA database needs reindexing
#   {gff => [list,of,gff,paths]}  if gff3 files need reindexing
#   {fff => [list,of,fff,paths]}  if fff files need reindexing
#   {wig => [list,of,wig,paths]}  if wig files need reindexing
sub needs_auto_reindexing {
  my $self    = shift;
  my $autodir = shift;
  my $result  = {};

  # don't allow two processes to reindex simultaneously
  $self->auto_index_in_process and croak "Autoindexing in process. Try again later";

  # first interrogate the GFF3 files, using the timestamp file
  # as modification comparison
  my (@gff3,@fff,@wig,$fasta,$fasta_index_time);
  opendir (my $D,$autodir)
    or $self->throw("Couldn't open directory $autodir for reading: $!");

  my $maxtime        = 0;
  my $timestamp_time = _mtime($self->_mtime_path) || 0;
  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = File::Spec->catfile($autodir,$node);
    next unless -f $path;   # this -f also primes the "_" stat cache used below

    if ($path =~ /\.gff\d?$/i) {
      my $mtime = _mtime(\*_); # not a typo: reuses the stat from -f above
      $maxtime  = $mtime if $mtime > $maxtime;
      push @gff3,$path;
    }

    elsif ($path =~ /\.fff?$/i) {
      my $mtime = _mtime(\*_); # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @fff,$path;
    }

    elsif ($path =~ /\.wig$/i) {
      my $wig = $path;
      (my $gff_file = $wig) =~ s/\.wig$/\.gff3/i;
      # skip wig files that already have an up-to-date gff3 conversion
      next if -e $gff_file && _mtime($gff_file) > _mtime($path);
      push @wig,$wig;
      push @gff3,$gff_file;
      $maxtime = time();   # force the gff list to be considered stale
    }

    elsif ($path =~ /\.(fa|fasta|dna)$/i) {
      $fasta_index_time =
        _mtime(File::Spec->catfile($self->directory,'fasta.index'))||0
          unless defined $fasta_index_time;
      $fasta++ if _mtime($path) > $fasta_index_time;
    }
  }
  closedir $D;

  $result->{gff} = \@gff3 if $maxtime > $timestamp_time;
  $result->{wig} = \@wig  if @wig;
  $result->{fff} = \@fff  if @fff;
  $result->{fasta}++ if $fasta;
  return $result;
}
# Accessor for the verbosity flag. When called with an argument, sets
# the flag and returns the previous value.
sub verbose {
  my $self = shift;
  my $previous = $self->{verbose};
  $self->{verbose} = shift if @_;
  return $previous;
}
# Accessor for the advisory-locking flag. When called with an argument,
# sets the flag and returns the previous value.
sub locking {
  my $self = shift;
  my $previous = $self->{locking};
  $self->{locking} = shift if @_;
  return $previous;
}
# Full path of the advisory lock file inside the index directory.
sub lockfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'lock');
}
# Acquire a 'shared' or 'exclusive' advisory flock on the lock file.
# No-op unless the -locking option was enabled. The filehandle is
# cached so a later lock() call upgrades/downgrades the same lock.
sub lock {
  my $self = shift;
  my $mode = shift;
  return unless $self->locking;

  my $flag     = $mode eq 'exclusive' ? LOCK_EX : LOCK_SH;
  my $lockfile = $self->lockfile;
  my $fh       = $self->_flock_fh;
  unless ($fh) {
    # open for read if the lock file already exists, otherwise create it
    my $open = -e $lockfile ? '<' : '>';
    $fh = IO::File->new($lockfile,$open) or die "Cannot open $lockfile: $!";
  }
  flock($fh,$flag);
  $self->_flock_fh($fh);
}
# Release the advisory lock, if one is held. No-op unless -locking.
sub unlock {
  my $self = shift;
  return unless $self->locking;

  my $fh = $self->_flock_fh or return;
  flock($fh,LOCK_UN);
  undef $self->{flock_fh};  # drop the cached handle
}
# Accessor for the cached flock() filehandle; returns the previous
# value when setting.
sub _flock_fh {
  my $self = shift;
  my $previous = $self->{flock_fh};
  $self->{flock_fh} = shift if @_;
  return $previous;
}
# Open (and optionally create) all of the component databases: the main
# feature hash, the secondary indexes, the parentage database, the
# notes flat file, and the DNA database if a sequence file exists.
# $ignore_errors is set during autoindexing, when a tie failure on the
# main database is deferred rather than fatal.
sub _open_databases {
  my $self = shift;
  my ($write,$create,$ignore_errors) = @_;
  return if $self->db;  # already open - don't reopen

  my $directory = $self->directory;
  unless (-d $directory) {  # directory does not exist
    $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
    mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
  }

  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  # Create the main database; this is a DB_HASH implementation
  my %h;
  my $result = tie (%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH);

  unless ($result) {
    return if $ignore_errors;  # autoindex set, so defer this
    $self->throw("Couldn't tie: ".$self->_features_path . " $!");
  }
  if ($create) {
    %h = ();
    $h{'.next_id'} = 1;               # primary-ID counter
    $h{'.version'} = $self->version;  # schema version for possibly_rebless()
  }
  $self->db(\%h);

  $self->open_index_dbs($flags,$create);
  $self->open_parentage_db($flags,$create);
  $self->open_notes_db($write,$create);
  $self->open_seq_db($flags,$create) if -e $self->_fasta_path;
}
# Open the secondary index databases (names, types, locations,
# attributes). These are DB_BTREE implementations with duplicate keys
# allowed and case-insensitive key comparison.
sub open_index_dbs {
  my $self = shift;
  my ($flags,$create) = @_;

  $DB_BTREE->{flags}   = R_DUP;
  $DB_BTREE->{compare} = sub { lc($_[0]) cmp lc($_[1]) };
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    my %db;
    tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE)
      or $self->throw("Couldn't tie $path: $!");
    %db = () if $create;  # start from an empty index
    $self->index_db($idx=>\%db);
  }
}
# Open the parent-to-child relationship database (a DB_BTREE with
# duplicates, keyed by parent primary ID).
sub open_parentage_db {
  my $self = shift;
  my ($flags,$create) = @_;

  my %p;
  tie (%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
  %p = () if $create;
  $self->parentage_db(\%p);
}
# Open the flat-file notes index: append mode for updates, truncate on
# creation, plain read otherwise.
sub open_notes_db {
  my $self = shift;
  my ($write,$create) = @_;

  my $mode = $write  ? "+>>"
           : $create ? "+>"
           : "<";

  open (my $F,$mode,$self->_notes_path) or $self->throw($self->_notes_path.": $!");
  $self->notes_db($F);
}
# Attach a Bio::DB::Fasta::Subdir object for the sequence file, if it
# exists.
sub open_seq_db {
  my $self = shift;
  if (-e $self->_fasta_path) {
    my $dna_db = Bio::DB::Fasta::Subdir->new($self->_fasta_path)
      or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }
}
# Commit after a bulk load: close any pending fasta filehandle and
# (re)index the fasta data so it is available through dna_db().
sub commit { # reindex fasta files
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->{fasta_file}));
  } elsif (-d $self->directory) {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->directory));
  }
}
# Drop every database handle (main, DNA, notes and all secondary
# indexes) so the underlying files are untied and closed.
sub _close_databases {
  my $self = shift;
  $self->db(undef);
  $self->dna_db(undef);
  $self->notes_db(undef);
  $self->index_db($_ => undef) for $self->_index_files;
}
# Intentionally a no-op: calling new() with -create=>1 performs all of
# the initialization this hook would otherwise do.
sub _init_database { }
# Remove every index and database file, in preparation for -create.
sub _delete_databases {
  my $self = shift;
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    unlink $path;
  }
  unlink $self->_parentage_path;
  unlink $self->_fasta_path;
  unlink $self->_features_path;
  unlink $self->_mtime_path;
}
# Refresh the timestamp file used by needs_auto_reindexing() to detect
# stale indexes.
# BUG FIX: the original used a bareword filehandle with a two-argument
# open and never checked close(); use a lexical handle, three-argument
# open, and report buffered write failures at close time.
sub _touch_timestamp {
  my $self = shift;
  my $tsf  = $self->_mtime_path;
  open (my $fh,'>',$tsf) or $self->throw("Couldn't open $tsf: $!");
  print $fh scalar(localtime);
  close $fh or $self->throw("Couldn't write $tsf: $!");
}
# Serialize and store a list of objects, allocating primary IDs from
# the '.next_id' counter as needed. The first argument says whether to
# maintain the search indexes. Returns the number of objects stored.
sub _store {
  my $self    = shift;
  my $indexed = shift;
  my $db      = $self->db;
  my $count   = 0;
  for my $obj (@_) {
    my $primary_id = $obj->primary_id;
    # an already-stored object must have its old index entries removed
    $self->_delete_indexes($obj,$primary_id) if $indexed && $primary_id;
    $primary_id = $db->{'.next_id'}++ unless defined $primary_id;
    $db->{$primary_id} = $self->freeze($obj);
    $obj->primary_id($primary_id);   # write the assigned ID back into the object
    $self->_update_indexes($obj) if $indexed;
    $count++;
  }
  $count;
}
# Remove an object's entries from every searchable index. The trailing
# true argument tells each updater to delete rather than add.
sub _delete_indexes {
  my ($self,$obj,$id) = @_;
  $self->$_($obj,$id,1)
    for qw(_update_name_index _update_type_index _update_location_index
           _update_attribute_index _update_note_index);
}
# Retrieve and deserialize the object stored under the given primary ID.
sub _fetch {
  my ($self,$id) = @_;
  return $self->thaw($self->db->{$id},$id);
}
# Record parent/child relationships in the parentage database. Accepts
# a parent (object or primary ID) and a list of children (objects or
# primary IDs).
sub _add_SeqFeature {
  my $self     = shift;
  my $parent   = shift;
  my @children = @_;
  my $parent_id = (ref $parent ? $parent->primary_id : $parent)
    or $self->throw("$parent should have a primary_id");
  my $p = $self->parentage_db;
  for my $child (@children) {
    my $child_id = ref $child ? $child->primary_id : $child;
    defined $child_id or $self->throw("no primary ID known for $child");
    # the parentage btree allows duplicates, so each assignment adds
    # another child entry for this parent rather than replacing it
    $p->{$parent_id} = $child_id;
  }
}
# Fetch the children of $parent from the parentage database, optionally
# restricted to a list of feature types.
sub _fetch_SeqFeatures {
  my $self   = shift;
  my $parent = shift;
  my @types  = @_;
  my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
  my $index = $self->parentage_db;
  my $db    = tied %$index;  # need the DB_File object for get_dup()

  my @children_ids = $db->get_dup($parent_id);
  my @children     = map {$self->fetch($_)} @children_ids;

  if (@types) {
    # match "primary_tag:source_tag" against the expanded type list
    my $regexp = join '|',map {quotemeta($_)} $self->find_types(@types);
    return grep {($_->primary_tag.':'.$_->source_tag) =~ /^$regexp$/i} @children;
  } else {
    return @children;
  }
}
# Add an object to all searchable indexes. A no-op for objects that
# have not yet been assigned a primary ID.
sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id = $obj->primary_id) or return;
  $self->_update_name_index($obj,$id);
  $self->_update_type_index($obj,$id);
  $self->_update_location_index($obj,$id);
  $self->_update_attribute_index($obj,$id);
  $self->_update_note_index($obj,$id);
}
# Index an object under each of its names and aliases (keys are
# lowercased; alias keys carry a "_2" suffix marking them secondary).
sub _update_name_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
  my ($names,$aliases) = $self->feature_names($obj);

  # little stinky - needs minor refactoring
  foreach (@$names) {
    my $key = lc $_;
    $self->update_or_delete($delete,$db,$key,$id);
  }

  foreach (@$aliases) {
    my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
    $self->update_or_delete($delete,$db,$key,$id);
  }
}
# Index an object under the lowercased "primary_tag:source_tag" key.
sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");
  my $primary_tag = $obj->primary_tag;
  my $source_tag  = $obj->source_tag || '';
  return unless defined $primary_tag;

  $primary_tag .= ":$source_tag";
  my $key = lc $primary_tag;
  $self->update_or_delete($delete,$db,$key,$id);
}
# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id  = $obj->seq_id || '';
  my $start   = $obj->start  || '';
  my $end     = $obj->end    || '';
  my $strand  = $obj->strand;
  my $bin_min = int $start/BINSIZE;
  my $bin_max = int $end/BINSIZE;

  # one entry per overlapping bin, keyed "seqid.bin"; the value packs
  # (id,start,end,strand) as four signed ints
  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = sprintf("%s.%06d",lc($seq_id),$bin);
    $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
  }
}
# Index an object under each of its "tag:value" attribute pairs.
sub _update_attribute_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('attributes')
    or $self->throw("Couldn't find 'attributes' index file");

  for my $tag ($obj->get_all_tags) {
    for my $value ($obj->get_tag_values($tag)) {
      my $key = "${tag}:${value}";
      $self->update_or_delete($delete,$db,$key,$id);
    }
  }
}
# Append an object's Note values to the flat-file notes index, each
# line holding the note text, a tab, and the uuencoded primary ID.
# Deletion is not supported for this index.
sub _update_note_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  return if $delete; # we don't know how to do this

  my $fh = $self->notes_db;
  # BUG FIX: "my @notes = ... if $cond;" is a conditional my-declaration,
  # whose behavior is explicitly undefined in Perl; declare the array
  # unconditionally and assign via the ternary instead.
  my @notes = $obj->has_tag('Note') ? $obj->get_tag_values('Note') : ();

  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
    foreach @notes;
}
# Either delete the (key,id) duplicate entry from a tied btree index or
# add it. With R_DUP set, assignment adds an entry rather than
# replacing existing ones.
sub update_or_delete {
  my $self = shift;
  my ($delete,$db,$key,$id) = @_;
  if ($delete) {
    tied(%$db)->del_dup($key,$id);
  } else {
    $db->{$key} = $id;
  }
}
# these methods return pointers to....
# the database that stores serialized objects; returns the previous
# value when setting
sub db {
  my $self = shift;
  my $current = $self->setting('db');
  $self->setting(db => shift) if @_;
  return $current;
}
# Accessor for the parent/child relationship database handle.
sub parentage_db {
  my $self = shift;
  my $current = $self->setting('parentage_db');
  $self->setting(parentage_db => shift) if @_;
  return $current;
}
# Accessor for the Bio::DB::Fasta object holding sequence data.
sub dna_db {
  my $self = shift;
  my $current = $self->setting('dna_db');
  $self->setting(dna_db => shift) if @_;
  return $current;
}
# Accessor for the specialized flat-file notes database handle.
sub notes_db {
  my $self = shift;
  my $current = $self->setting('notes_db');
  $self->setting(notes_db => shift) if @_;
  return $current;
}
# Accessor for the named secondary index berkeley db
# ('names', 'types', 'locations' or 'attributes').
sub index_db {
  my $self       = shift;
  my $index_name = shift;
  my $current    = $self->setting($index_name);
  $self->setting($index_name => shift) if @_;
  return $current;
}
# Modification time of a file, given either a path or a filehandle/glob
# (the \*_ stat-cache trick relies on the latter).
sub _mtime {
  my $file = shift;
  return (stat($file))[9];
}
# return names of all the secondary indexes maintained by this adaptor
sub _index_files {
  return qw(names types locations attributes);
}
# the directory in which we store our indexes
sub directory {
  my $self = shift;
  my $current = $self->setting('directory');
  $self->setting(directory => shift) if @_;
  return $current;
}
# flag indicating that we are a temporary database
sub temporary {
  my $self = shift;
  my $current = $self->setting('temporary');
  $self->setting(temporary => shift) if @_;
  return $current;
}
# Get or set the cached (write,create) permission flags. Returns the
# previously stored pair as a two-element list, or nothing if the
# flags were never set.
sub _permissions {
  my $self = shift;
  my $d = $self->setting('permissions') or return;
  if (@_) {
    my ($write,$create) = @_;
    $self->setting(permissions=>[$write,$create]);
  }
  @$d;
}
# file name utilities...
# Turn a bare filename into a full path inside the index directory.
sub _qualify {
  my ($self,$file) = @_;
  return $self->directory . '/' . $file;
}
# Path to the main serialized-feature database.
sub _features_path {
  return shift->_qualify('features.bdb');
}
# Path to the parent/child relationship database.
sub _parentage_path {
  return shift->_qualify('parentage.bdb');
}
# Path to the type index.
sub _type_path {
  return shift->_qualify('types.idx');
}
# Path to the location index.
sub _location_path {
  return shift->_qualify('locations.idx');
}
# Path to the attribute index.
sub _attribute_path {
  return shift->_qualify('attributes.idx');
}
# Path to the flat-file notes index.
sub _notes_path {
  return shift->_qualify('notes.idx');
}
# Path to the sequence (FASTA) file.
sub _fasta_path {
  return shift->_qualify('sequence.fa');
}
# Path to the timestamp file used for staleness checks.
sub _mtime_path {
  return shift->_qualify('mtime.stamp');
}
1023 ###########################################
1024 # searching
1025 ###########################################
# Core query engine: combine name, location, type and attribute filters
# (each narrowing the shared %found set) and return matching features,
# either as a list or wrapped in an iterator.
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                    'NAME','CLASS','ALIASES',
                    ['TYPES','TYPE','PRIMARY_TAG'],
                    ['ATTRIBUTES','ATTRIBUTE'],
                    'RANGE_TYPE',
                    'ITERATOR',
                   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my @result;
  # with no selectors at all, return everything (skip internal ".x" keys)
  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
    @result = grep {!/^\./} keys %{$self->db};
  }

  my %found  = ();
  my $result = 1;   # goes false as soon as any filter yields no matches

  if (defined($name)) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) {
    $result &&= $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
  }

  if (defined $types) {
    $result &&= $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $result &&= $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found if $result;
  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}
# Narrow %$filter to features whose name (or alias, if allowed) matches
# $name; shell-style globs are supported via glob_match().
sub filter_by_name {
  my $self = shift;
  my ($name,$allow_aliases,$filter) = @_;

  my $index = $self->index_db('names');
  my $db    = tied(%$index);

  my ($stem,$regexp) = $self->glob_match($name);
  $stem   ||= $name;   # no glob: use the literal name for both
  $regexp ||= $name;
  $regexp .= "(?:_2)?" if $allow_aliases;  # also accept alias ("_2") keys

  my $key = $stem;
  my $value;
  my @results;
  # position the btree cursor at the stem, then scan while keys match
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0 and $key =~ /^$regexp$/i;
       $status = $db->seq($key,$value,R_NEXT)) {
    next if %$filter && !$filter->{$value};  # don't bother
    push @results,$value;
  }
  $self->update_filter($filter,\@results);
}
# Narrow %$filter to features matching one of the given types, each a
# "primary_tag" or "primary_tag:source_tag" string or a
# Bio::DB::GFF::Typename object.
sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    # with no source tag, match any source for this primary tag
    my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    $source_tag ||= '';
    my $key = lc "$primary_tag:$source_tag";
    my $value;

    # If filter is already provided, then it is usually faster to
    # fetch each object.
    if (%$filter) {
      for my $id (keys %$filter) {
        my $obj = $self->_fetch($id) or next;
        push @results,$id if $obj->type =~ /$match/i;
      }
    }

    else {
      # otherwise scan the type btree starting at the key
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /$match/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value}; # don't even bother
        push @results,$value;
      }
    }
  }
  $self->update_filter($filter,\@results);
}
sub filter_by_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;
  $strand ||= 0;

  my $index = $self->index_db('locations');
  my $db    = tied(%$index);   # raw DB_File handle for cursor (seq) calls

  # Features are binned by BINSIZE. Index keys sort as "<seqid><bin>"
  # (db version 1) or "<seqid>.<bin>" (version 2), so a lexical range
  # scan over bin keys covers the requested coordinate range.
  my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
  my $binend   = defined $end   ? sprintf("%06d",int $end/BINSIZE)   : 'z'; # beyond a number

  my %seenit;   # a feature can appear in several bins; report each id once
  my @results;

  $start = MININT if !defined $start;
  $end   = MAXINT if !defined $end;
  my $version_2 = $self->db_version > 1;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $key     = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $keystop = $version_2 ? "\L$seq_id\E.$binend"   : "\L$seq_id\E$binend";
    my $value;

    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      # Each value packs (id, start, end, strand) as four native ints.
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      if ($range_type eq 'overlaps') {
        next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
        next unless $fstart >= $start && $fend <= $end;
      }
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $key     = $version_2 ? "\L$seq_id." : "\L$seq_id";
    my $keystop = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }

    # do the right part of the range
    # NOTE(review): this loop's only stop condition is end-of-database
    # ($status == 0), so it appears to walk past this seq_id's keys into
    # other sequences' bins -- confirm whether the containment test alone
    # is meant to reject those.
    $key = "\L$seq_id\E.$binend";
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }
  }

  $self->update_filter($filter,\@results);
}
# Return the distinct attribute names present in the store.  Index keys
# have the form "name:value"; everything after the first colon is stripped
# and duplicates are collapsed.
sub attributes {
  my $self = shift;
  my $attribute_index = $self->index_db('attributes');
  my %seen;
  for my $key (keys %$attribute_index) {
    (my $name = $key) =~ s/:.+$//;
    $seen{$name}++;
  }
  return keys %seen;
}
sub filter_by_attribute {
  my $self = shift;
  my ($attributes,$filter) = @_;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);   # raw DB_File handle for cursor (seq) calls

  my $result;

  for my $att_name (keys %$attributes) {
    my @result;
    # Each attribute may map to a single value or an array ref of values.
    my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
      ? @{$attributes->{$att_name}} : $attributes->{$att_name};

    for my $v (@search_terms) {
      # Glob terms ("foo*") are split into a literal stem for cursor
      # positioning and a regexp for matching; plain terms use themselves.
      my ($stem,$regexp) = $self->glob_match($v);
      $stem   ||= $v;
      $regexp ||= $v;
      my $key = "\L${att_name}:${stem}\E";   # index keys are lowercased "name:value"
      my $value;
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /^$att_name:$regexp$/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value}; # don't bother
        push @result,$value;
      }
    }
    # ||= keeps the first true return value across attribute names;
    # update_filter() itself intersects @result into %$filter.
    $result ||= $self->update_filter($filter,\@result);
  }

  $result;
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_array,$limit) = @_;
  # NOTE(review): $limit is accepted but never applied in this sub --
  # confirm whether truncation is expected to happen in the caller.

  # Strip glob wildcards, then build an alternation of the search words.
  $search_string =~ tr/*?//d;
  my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);   # raw DB_File handle for cursor (seq) calls

  my (%results,%notes);   # per-feature-id hit counts and concatenated text

  for my $tag (@$attribute_array) {
    my $id;
    my $key = "\L$tag:\E";   # position cursor at the first "tag:" key
    for (my $status = $db->seq($key,$id,R_CURSOR);
         $status == 0 and $key =~ /^$tag:(.*)/i;
         $status = $db->seq($key,$id,R_NEXT)) {
      my $text = $1;   # the attribute value portion of the key
      next unless $text =~ /$search/;
      # Score: one point per occurrence of each search word.
      for my $w (@words) {
        my @hits = $text =~ /($w)/ig or next;
        $results{$id} += @hits;
      }
      $notes{$id} .= "$text ";
    }
  }

  # Convert the raw scores into [name, note, relevance, type, id] rows.
  my @results;
  for my $id (keys %results) {
    my $hits = $results{$id};
    my $note = $notes{$id};
    $note =~ s/\s+$//;
    my $relevance = 10 * $hits;
    my $feature   = $self->fetch($id) or next;
    my $name      = $feature->display_name or next;
    my $type      = $feature->type;
    push @results,[$name,$note,$relevance,$type,$id];
  }
  return @results;
}
# Full-text search over the flat notes file.  Returns a list of
# [display_name, note_text, relevance] triples; relevance is 10x the
# number of search-word hits in the feature's Note tag values.
# $limit, when set, caps the number of candidate features scanned.
sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  # Wildcards are meaningless here; the search is word-based.
  $search_string =~ tr/*?//d;

  my @results;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my (%found,$found);
  my $note_index = $self->notes_db;
  seek($note_index,0,0); # back to start
  # Each line is "note-text<TAB>uuencoded-feature-id".
  while (<$note_index>) {
    next unless /$search/;
    chomp;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->fetch($idx) or next;
    # BUGFIX: the original used "my @values = ... if $cond;" -- 'my' with a
    # statement modifier is documented undefined behavior in perlsyn.
    my @values = $feature->has_tag('Note') ? $feature->get_tag_values('Note') : ();
    my $value  = "@values";

    my $hits = 0;   # initialized to avoid an uninitialized-value warning below
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  for (my $i=0; $i<@matches; $i++) {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $note;
    $note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
    push @results,[$feature->display_name,$note,$relevance];
  }
  return @results;
}
# Translate a shell-style glob ("foo*", "a?c") into a (stem, regexp) pair.
# The stem is the literal prefix before the first unescaped wildcard (used
# to position a B-tree cursor); the regexp is the full pattern with glob
# wildcards converted to regex form.  Returns an empty list when the term
# contains no wildcard.
sub glob_match {
  my $self    = shift;
  my $pattern = shift;
  return unless $pattern =~ /([^*?]*)(?:^|[^\\])?[*?]/;
  my $literal_prefix = $1;
  # Escape regex metacharacters, then map unescaped * -> .* and ? -> .
  $pattern =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
  $pattern =~ s/(^|[^\\])\*/$1.*/g;
  $pattern =~ s/(^|[^\\])\?/$1./g;
  return ($literal_prefix,$pattern);
}
# Fold a list of candidate IDs into the running filter set.  An empty
# result list leaves the filter untouched; a non-empty filter is
# intersected with the results, and an empty filter is seeded from them.
sub update_filter {
  my $self = shift;
  my ($filter,$results) = @_;
  return unless @$results;
  my @keep = %$filter ? grep { $filter->{$_} } @$results : @$results;
  %$filter = map { $_ => 1 } @keep;
}
# Return every distinct feature type in the store as a
# Bio::DB::GFF::Typename object (keys of the 'types' index are
# "method:source" strings).
sub types {
  my $self = shift;
  # Lazily load Typename the first time it is needed.
  eval "require Bio::DB::GFF::Typename"
    unless Bio::DB::GFF::Typename->can('new');

  my $index = $self->index_db('types');
  # (removed an unused "my $db = tied(%$index)" local -- the raw DB_File
  # handle is not needed here, only the hash keys)
  return map {Bio::DB::GFF::Typename->new($_)} keys %$index;
}
# this is ugly
# Append a chunk of DNA to the private FASTA file; a zero offset marks the
# start of a new sequence, so the ">seqid" header is written first.
sub _insert_sequence {
  my ($self, $seqid, $seq, $offset) = @_;
  my $fh = $self->private_fasta_file or return;
  print $fh ">$seqid\n" if $offset == 0;   # header only at sequence start
  print $fh $seq,"\n";
}
# Delegate sequence retrieval to the attached DNA database, if any.
sub _fetch_sequence {
  my ($self, $seqid, $start, $end) = @_;
  my $dna_db = $self->dna_db or return;
  return $dna_db->seq($seqid, $start, $end);
}
# Return (creating on first use) the write handle for the store's private
# "sequence.fa" file.  The handle is cached; a failed open caches undef,
# so later calls keep returning undef rather than retrying.
sub private_fasta_file {
  my $self = shift;
  unless (exists $self->{fasta_fh}) {
    $self->{fasta_file} = $self->_qualify("sequence.fa");
    $self->{fasta_fh}   = IO::File->new($self->{fasta_file},">");
  }
  return $self->{fasta_fh};
}
# Close the private FASTA file written during a bulk load and index it
# with Bio::DB::Fasta::Subdir.  A no-op when no FASTA data was written.
sub finish_bulk_update {
  my $self = shift;
  my $fh = $self->{fasta_fh} or return;
  $fh->close;
  $self->{fasta_db} = Bio::DB::Fasta::Subdir->new($self->{fasta_file});
}
# Schema version of the main database.  Stores written before versioning
# lack the ".version" meta-key and are treated as 1.00.
sub db_version {
  my $self = shift;
  return $self->db->{'.version'} || 1.00;
}
sub DESTROY {
  my $self = shift;
  # Close the Berkeley DB handles first, then remove the whole directory
  # tree if this was a temporary (throwaway) store.
  $self->_close_databases();
  rmtree($self->directory,0,1) if $self->temporary;
}
# TIE interface -- a little annoying because we are storing magic ".variable"
# meta-variables in the same data structure as the IDs, so these variables
# must be skipped.
sub _firstid {
  my $self = shift;
  my $db   = $self->db;
  my ($key,$value);
  # Walk each() from the start of the tied hash, skipping ".name"
  # meta-keys; stops at the first real feature ID, or leaves $key undef
  # when the database holds nothing but meta-keys.
  while ( ($key,$value) = each %{$db}) {
    last unless $key =~ /^\./;
  }
  $key;
}
sub _nextid {
  my $self = shift;
  my $id   = shift;  # previous ID; unused -- iteration relies on the tied
                     # hash's persistent each() cursor left by _firstid/_nextid
  my $db   = $self->db;
  my ($key,$value);
  while ( ($key,$value) = each %$db) {
    last unless $key =~ /^\./;  # skip ".variable" meta-keys
  }
  $key;  # undef once the iterator is exhausted
}
# True when the given primary ID has an entry in the main database.
sub _existsid {
  my ($self, $id) = @_;
  return exists $self->db->{$id};
}
# Remove a feature by primary ID: fetch it (a no-op when absent), clean
# its index entries, then drop it from the main database.
sub _deleteid {
  my ($self, $id) = @_;
  my $feature = $self->fetch($id) or return;
  $self->_delete_indexes($feature, $id);
  delete $self->db->{$id};
}
sub _clearall {
  my $self = shift;
  # Wipe the store: close every handle, delete the underlying files, then
  # re-open fresh (empty) databases with the same permissions as before.
  $self->_close_databases();
  $self->_delete_databases();
  my ($write,$create) = $self->_permissions;
  $self->_open_databases($write,$create);
}
# Number of entries in the main database (includes the ".variable"
# meta-keys, as before).
# BUGFIX: "scalar %hash" is not a key count before Perl 5.26, and for a
# tied (DB_File) hash it reflects the tie's SCALAR/FIRSTKEY behavior
# rather than the number of entries; "scalar keys" counts reliably.
sub _featurecount {
  my $self = shift;
  return scalar keys %{$self->db};
}
package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;

# Pull-style iterator over a precomputed list of feature primary IDs.
# Each next_seq() call shifts the next ID and fetches its feature object
# from the store; returns undef when the list is exhausted.

sub new {
  my ($class, $store, $ids) = @_;
  my $self = {
    store => $store,
    ids   => $ids,
  };
  return bless $self, ref($class) || $class;
}

sub next_seq {
  my $self  = shift;
  my $store = $self->{store} or return;
  my $next_id = shift @{ $self->{ids} };
  return unless defined $next_id;
  return $store->fetch($next_id);
}
package Bio::DB::Fasta::Subdir;

use base 'Bio::DB::Fasta';

# alter calling arguments so that the fasta file is placed in a subdirectory
# named "indexes"

sub index_name {
  my $self = shift;
  my ($path, $is_directory) = @_;
  if ($is_directory) {
    return File::Spec->catfile($path, 'indexes', 'fasta.index');
  }
  return $self->SUPER::index_name($path, $is_directory);
}
1522 __END__
1524 =head1 BUGS
1526 This is an early version, so there are certainly some bugs. Please
1527 use the BioPerl bug tracking system to report bugs.
1529 =head1 SEE ALSO
1531 L<bioperl>,
1532 L<Bio::DB::SeqFeature>,
1533 L<Bio::DB::SeqFeature::Store>,
L<Bio::DB::SeqFeature::Store::GFF3Loader>,
1535 L<Bio::DB::SeqFeature::Segment>,
1536 L<Bio::DB::SeqFeature::Store::memory>,
1537 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
1539 =head1 AUTHOR
1541 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
1543 Copyright (c) 2006 Cold Spring Harbor Laboratory.
1545 This library is free software; you can redistribute it and/or modify
1546 it under the same terms as Perl itself.
1548 =cut