package Bio::DB::SeqFeature::Store::berkeleydb;

# $Id$

use strict;
use base 'Bio::DB::SeqFeature::Store';
use Bio::DB::GFF::Util::Rearrange 'rearrange';
use DB_File;
use Fcntl qw(O_RDONLY O_RDWR O_CREAT :flock);
use IO::File;
use File::Temp 'tempdir';
use File::Path 'rmtree','mkpath';
use File::Basename;
use File::Spec;
use Carp 'carp','croak';

use constant BINSIZE => 10_000;
use constant MININT  => -999_999_999_999;
use constant MAXINT  => 999_999_999_999;

our $VERSION = '2.00';
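# The constants above: BINSIZE sets the granularity of the location index --
# each feature is filed under one key per 10 kb bin that it overlaps (see
# _update_location_index). MININT/MAXINT stand in for "unbounded" coordinates
# in range queries that omit -start or -end (see filter_by_location).
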
=head1 NAME

Bio::DB::SeqFeature::Store::berkeleydb -- Storage and retrieval of sequence annotation data in Berkeleydb files

=head1 SYNOPSIS

  use Bio::DB::SeqFeature::Store;

  # Create a database from the feature files located in /home/fly4.3 and store
  # the database index in the same directory:
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/home/fly4.3');

  # Create a database that will monitor the files in /home/fly4.3, but store
  # the indexes in /var/databases/fly4.3
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -dir     => '/home/fly4.3');

  # Create a feature database from scratch
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

  # the feature's primary_id() is now set to the primary ID it was
  # assigned in the database...
  my $id = $feature->primary_id;

  # get the feature back out
  my $f = $db->fetch($id);

  # change the feature and update it
  $f->start(100);
  $db->update($f) or die "Couldn't update!";

  # use the GFF3 loader to do a bulk write:
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store   => $db,
                                                           -verbose => 0,
                                                           -fast    => 1);
  $loader->load('/home/fly4.3/dmel-all.gff');

  # searching...
  # ...by id
  my @features = $db->fetch_many(@list_of_ids);

  # ...by name
  @features = $db->get_features_by_name('ZK909');

  # ...by alias
  @features = $db->get_features_by_alias('sma-3');

  # ...by type
  @features = $db->get_features_by_type('gene');

  # ...by location
  @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);

  # ...by attribute
  @features = $db->get_features_by_attribute({description => 'protein kinase'});

  # ...by the GFF "Note" field
  @result_list = $db->search_notes('kinase');

  # ...by arbitrary combinations of selectors
  @features = $db->features(-name       => $name,
                            -type       => $types,
                            -seq_id     => $seqid,
                            -start      => $start,
                            -end        => $end,
                            -attributes => $attributes);

  # ...using an iterator
  my $iterator = $db->get_seq_stream(-name       => $name,
                                     -type       => $types,
                                     -seq_id     => $seqid,
                                     -start      => $start,
                                     -end        => $end,
                                     -attributes => $attributes);

  while (my $feature = $iterator->next_seq) {
    # do something with the feature
  }

  # ...limiting the search to a particular region
  my $segment  = $db->segment('Chr1',5000=>6000);
  my @features = $segment->features(-type=>['mRNA','match']);

  # what feature types are defined in the database?
  my @types    = $db->types;

  # getting & storing sequence information
  # Warning: this returns a string, and not a PrimarySeq object
  $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  my $sequence = $db->fetch_sequence('Chr1',5000=>6000);

  # create a new feature in the database
  my $feature = $db->new_feature(-primary_tag => 'mRNA',
                                 -seq_id      => 'chr3',
                                 -start       => 10000,
                                 -end         => 11000);

=head1 DESCRIPTION

Bio::DB::SeqFeature::Store::berkeleydb is the Berkeleydb adaptor for
Bio::DB::SeqFeature::Store. You will not create it directly, but
instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.

See L<Bio::DB::SeqFeature::Store> for complete usage instructions.

=head2 Using the berkeleydb adaptor

The Berkeley database consists of a series of Berkeleydb index files,
and a couple of special purpose indexes. You can create the index
files from scratch by creating a new database and calling
new_feature() repeatedly, you can create the database and then bulk
populate it using the GFF3 loader, or you can monitor a directory of
preexisting GFF3 and FASTA files and rebuild the indexes whenever one
or more of the source files changes. The last mode is probably the
most convenient. Note that the indexer will only pay attention to
files that end with .gff3, .wig and .fa.

=over 4

=item The new() constructor

The new() constructor method accepts all the arguments recognized by
Bio::DB::SeqFeature::Store, and a few additional ones.

Standard arguments:

 Name               Value
 ----               -----

 -adaptor           The name of the Adaptor class (default DBI::mysql)

 -serializer        The name of the serializer class (default Storable)

 -index_subfeatures Whether or not to make subfeatures searchable
                    (default true)

 -cache             Activate LRU caching feature -- size of cache

 -compress          Compresses features before storing them in database
                    using Compress::Zlib

Adaptor-specific arguments:

 Name               Value
 ----               -----

 -dsn               Where the index files are stored

 -dir               Where the source (GFF3, FASTA) files are stored

 -autoindex         An alias for -dir.

 -write             Pass true to open the index files for writing.

 -create            Pass true to create the index files if they don't exist
                    (implies -write=>1)

 -locking           Use advisory locking to avoid one process trying to read
                    from the database while another is updating it (may not
                    work properly over NFS).

 -temp              Pass true to create temporary index files that will
                    be deleted once the script exits.

 -verbose           Pass true to report autoindexing operations on STDERR
                    (default is true).

Examples:

To create an empty database which will be populated using calls to
store() or new_feature(), or which will be bulk-loaded using the GFF3
loader:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

To open a preexisting database in read-only mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3');

To open a preexisting database in read/write (update) mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -write   => 1);

To monitor a set of GFF3 and FASTA files located in a directory and
create/update the database indexes as needed. The indexes will be
stored in a new subdirectory named "indexes":

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/var/databases/fly4.3');

As above, but store the source files and index files in separate directories:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -dir     => '/home/gff3_files/fly4.3');

To be indexed, files must end with one of .gff3 (GFF3 format), .fa
(FASTA format) or .wig (WIG format).

B<-autoindex> is an alias for B<-dir>.

You should specify B<-locking> in a multiuser environment, including
the case in which the database is being used by a web server at the
same time another user might be updating it (see the example below).

=back
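
For example, a script that updates a database which may simultaneously be
read by a web server could open it for writing with locking enabled (a
sketch only, reusing the illustrative paths from above):

  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dsn     => '/var/databases/fly4.3',
                                            -write   => 1,
                                            -locking => 1);
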
See L<Bio::DB::SeqFeature::Store> for all the access methods supported
by this adaptor. The various methods for storing and updating features
and sequences into the database are supported, but unless you enable
B<-locking> there is no protection against concurrent writes. If two
processes try to update the same database simultaneously, the database
will likely become corrupted.

=cut
# object initialization
sub init {
  my $self = shift;
  my ($directory,
      $autoindex,
      $is_temporary,
      $write,
      $create,
      $verbose,
      $locking,
      ) = rearrange([['DSN','DB'],
                     [qw(DIR AUTOINDEX)],
                     ['TMP','TEMP','TEMPORARY'],
                     [qw(WRITE WRITABLE)],
                     'CREATE',
                     'VERBOSE',
                     [qw(LOCK LOCKING)],
                    ],@_);

  $verbose = 1 unless defined $verbose;

  if ($autoindex) {
    -d $autoindex or $self->throw("Invalid directory $autoindex");
    $directory ||= "$autoindex/indexes";
  }
  $directory ||= $is_temporary ? File::Spec->tmpdir : '.';

  my $pacname = __PACKAGE__;
  if ($^O =~ /mswin/i) {
    $pacname =~ s/:+/_/g;
  }
  $directory = tempdir($pacname.'_XXXXXX',
                       TMPDIR  => 1,
                       CLEANUP => 1,
                       DIR     => $directory) if $is_temporary;
  mkpath($directory);
  -d $directory or $self->throw("Invalid directory $directory");

  $create++ if $is_temporary;
  $write  ||= $create;
  $self->throw("Can't write into the directory $directory")
    if $write && !-w $directory;

  $self->default_settings;
  $self->directory($directory);
  $self->temporary($is_temporary);
  $self->verbose($verbose);
  $self->locking($locking);
  $self->_delete_databases() if $create;
  if ($autoindex && -d $autoindex) {
    $self->auto_reindex($autoindex);
  }
  $self->lock('shared');
  $self->_open_databases($write,$create,$autoindex);
  $self->_permissions($write,$create);
  return $self;
}
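
# can_store_parentage() returns true because parent/child links are kept in a
# dedicated "parentage" BTREE (opened with R_DUP so one parent ID can map to
# many child IDs); see _add_SeqFeature() and _fetch_SeqFeatures() below.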
sub can_store_parentage { 1 }
sub auto_reindex {
  my $self    = shift;
  my $autodir = shift;
  my $result  = $self->needs_auto_reindexing($autodir);

  if ($result && %$result) {
    $self->flag_autoindexing(1);
    $self->lock('exclusive');
    $self->reindex_wigfiles($result->{wig},$autodir) if $result->{wig};
    $self->reindex_ffffiles($result->{fff},$autodir) if $result->{fff};
    $self->reindex_gfffiles($result->{gff},$autodir) if $result->{gff};
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
    $self->unlock;
    $self->flag_autoindexing(0);
  }

  else {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
  }
}

sub autoindex_flagfile {
  return File::Spec->catfile(shift->directory,'autoindex.pid');
}
sub auto_index_in_process {
  my $self = shift;
  my $flag_file = $self->autoindex_flagfile;
  return unless -e $flag_file;

  # if flagfile exists, then check that PID still exists
  open my $fh,'<',$flag_file or die "Couldn't open $flag_file: $!";
  my $pid = <$fh>;
  close $fh;
  return 1 if kill 0=>$pid;
  warn "Autoindexing seems to be running in another process, but the process has gone away. Trying to override...";
  if (unlink $flag_file) {
    warn "Successfully removed stale PID file." if $self->verbose;
    warn "Assuming partial reindexing process. Rebuilding indexes from scratch..." if $self->verbose;
    my $glob = File::Spec->catfile($self->directory,'*');
    unlink glob($glob);
    return;
  } else {
    croak ("Cannot recover from apparent aborted autoindex process. Please remove files in ",
           $self->directory,
           " and allow the adaptor to reindex");
    return 1;
  }
}
sub flag_autoindexing {
  my $self = shift;
  my $doit = shift;
  my $flag_file = $self->autoindex_flagfile;
  if ($doit) {
    open my $fh,'>',$flag_file or die "Couldn't open $flag_file: $!";
    print $fh $$;
    close $fh;
  } else {
    unlink $flag_file;
  }
}

sub reindex_gfffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing GFF files...\n" if $self->verbose;
  my $exists = -e $self->_features_path;

  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,!$exists);
  require Bio::DB::SeqFeature::Store::GFF3Loader
    unless Bio::DB::SeqFeature::Store::GFF3Loader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store    => $self,
                                                           -sf_class => $self->seqfeature_class,
                                                           -verbose  => $self->verbose,
                                                          )
    or $self->throw("Couldn't create GFF3Loader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}

sub reindex_ffffiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing FFF files...\n" if $self->verbose;
  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,1);
  require Bio::DB::SeqFeature::Store::FeatureFileLoader
    unless Bio::DB::SeqFeature::Store::FeatureFileLoader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::FeatureFileLoader->new(-store    => $self,
                                                                  -sf_class => $self->seqfeature_class,
                                                                  -verbose  => $self->verbose,
                                                                 )
    or $self->throw("Couldn't create FeatureFileLoader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}
sub reindex_wigfiles {
  my $self    = shift;
  my $files   = shift;
  my $autodir = shift;

  warn "Reindexing wig files...\n" if $self->verbose;

  unless (Bio::Graphics::Wiggle::Loader->can('new')) {
    eval "require Bio::Graphics::Wiggle::Loader; 1"
      or return;
  }

  for my $wig (@$files) {
    warn "Reindexing $wig...\n" if $self->verbose;
    my ($wib_name) = fileparse($wig,qr/\.[^.]*/);
    my $gff3_name  = "$wib_name.gff3";

    # unlink all wib files that share the basename
    my $wib_glob = File::Spec->catfile($self->directory,"$wib_name*.wib");
    unlink glob($wib_glob);

    my $loader = Bio::Graphics::Wiggle::Loader->new($self->directory,$wib_name);
    my $fh     = IO::File->new($wig) or die "Can't open $wig: $!";
    $loader->load($fh);  # will create one or more .wib files
    $fh->close;
    my $gff3_data = $loader->featurefile('gff3','microarray_oligo',$wib_name);
    my $gff3_path = File::Spec->catfile($autodir,$gff3_name);
    $fh = IO::File->new($gff3_path,'>')
      or die "Can't open $gff3_path for writing: $!";
    $fh->print($gff3_data);
    $fh->close;
    my $conf_path = File::Spec->catfile($autodir,"$wib_name.conf");
    $fh = IO::File->new($conf_path,'>')
      or die "Can't open $conf_path for writing: $!";
    $fh->print($loader->conf_stanzas('microarray_oligo',$wib_name));
    $fh->close;
  }
}
# returns the following hashref
# empty hash if nothing needs reindexing
# {fasta => 1}                    if the DNA (FASTA) database needs reindexing
# {gff   => [list,of,gff,paths]}  if gff3 files need reindexing
# {fff   => [list,of,fff,paths]}  if feature (.fff) files need reindexing
# {wig   => [list,of,wig,paths]}  if wig files need reindexing
sub needs_auto_reindexing {
  my $self    = shift;
  my $autodir = shift;
  my $result  = {};

  # don't allow two processes to reindex simultaneously
  $self->auto_index_in_process and croak "Autoindexing in process. Try again later";

  # first interrogate the GFF3 files, using the timestamp file
  # as modification comparison
  my (@gff3,@fff,@wig,$fasta,$fasta_index_time);
  opendir (my $D,$autodir)
    or $self->throw("Couldn't open directory $autodir for reading: $!");

  my $maxtime        = 0;
  my $timestamp_time = _mtime($self->_mtime_path) || 0;
  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = File::Spec->catfile($autodir,$node);
    next unless -f $path;

    if ($path =~ /\.gff\d?$/i) {
      my $mtime = _mtime(\*_);  # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @gff3,$path;
    }

    elsif ($path =~ /\.fff?$/i) {
      my $mtime = _mtime(\*_);  # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @fff,$path;
    }

    elsif ($path =~ /\.wig$/i) {
      my $wig = $path;
      (my $gff_file = $wig) =~ s/\.wig$/\.gff3/i;
      next if -e $gff_file && _mtime($gff_file) > _mtime($path);
      push @wig,$wig;
      push @gff3,$gff_file;
      $maxtime = time();
    }

    elsif ($path =~ /\.(fa|fasta|dna)$/i) {
      $fasta_index_time =
        _mtime(File::Spec->catfile($self->directory,'fasta.index'))||0
          unless defined $fasta_index_time;
      $fasta++ if _mtime($path) > $fasta_index_time;
    }
  }
  closedir $D;

  $result->{gff}   = \@gff3 if $maxtime > $timestamp_time;
  $result->{wig}   = \@wig  if @wig;
  $result->{fff}   = \@fff  if @fff;
  $result->{fasta}++        if $fasta;

  return $result;
}

sub verbose {
  my $self = shift;
  my $d    = $self->{verbose};
  $self->{verbose} = shift if @_;
  return $d;
}

sub locking {
  my $self = shift;
  my $d    = $self->{locking};
  $self->{locking} = shift if @_;
  return $d;
}

sub lockfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'lock');
}
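
# Advisory locking (enabled with -locking): readers take a shared flock() on
# the "lock" file inside the index directory, and the autoindexer upgrades to
# an exclusive lock while it rebuilds the indexes.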
sub lock {
  my $self = shift;
  my $mode = shift;
  return unless $self->locking;

  my $flag     = $mode eq 'exclusive' ? LOCK_EX : LOCK_SH;
  my $lockfile = $self->lockfile;
  my $fh       = $self->_flock_fh;
  unless ($fh) {
    my $open = -e $lockfile ? '<' : '>';
    $fh      = IO::File->new($lockfile,$open) or die "Cannot open $lockfile: $!";
  }
  flock($fh,$flag);
  $self->_flock_fh($fh);
}

sub unlock {
  my $self = shift;
  return unless $self->locking;

  my $fh = $self->_flock_fh or return;
  flock($fh,LOCK_UN);
  undef $self->{flock_fh};
}

sub _flock_fh {
  my $self = shift;
  my $d    = $self->{flock_fh};
  $self->{flock_fh} = shift if @_;
  $d;
}
sub _open_databases {
  my $self = shift;
  my ($write,$create,$ignore_errors) = @_;
  return if $self->db;  # already open - don't reopen

  my $directory = $self->directory;
  unless (-d $directory) {  # directory does not exist
    $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
    mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
  }

  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  # Create the main database; this is a DB_HASH implementation
  my %h;
  my $result = tie (%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH);

  unless ($result) {
    return if $ignore_errors;  # autoindex set, so defer this
    $self->throw("Couldn't tie: ".$self->_features_path . " $!");
  }
  if ($create) {
    %h = ();
    $h{'.next_id'} = 1;
    $h{'.version'} = $VERSION;
  }
  $self->db(\%h);

  # Create the index databases; these are DB_BTREE implementations with duplicates allowed.
  local $DB_BTREE->{flags} = R_DUP;
  $DB_BTREE->{compare} = sub { lc($_[0]) cmp lc($_[1]) };
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    my %db;
    tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE)
      or $self->throw("Couldn't tie $path: $!");
    %db = () if $create;
    $self->index_db($idx=>\%db);
  }

  # Create the parentage database
  my %p;
  tie (%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
  %p = () if $create;
  $self->parentage_db(\%p);

  if (-e $self->_fasta_path) {
    my $dna_db = Bio::DB::Fasta::Subdir->new($self->_fasta_path)
      or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }

  my $mode = $write  ? "+>>"
           : $create ? "+>"
           : "<";

  open (my $F,$mode,$self->_notes_path) or $self->throw($self->_notes_path.": $!");
  $self->notes_db($F);
}
sub commit { # reindex fasta files
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->{fasta_file}));
  } elsif (-d $self->directory) {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->directory));
  }
}

sub _close_databases {
  my $self = shift;
  $self->db(undef);
  $self->dna_db(undef);
  $self->notes_db(undef);
  $self->index_db($_=>undef) foreach $self->_index_files;
}

# do nothing -- new() with -create=>1 will do the trick
sub _init_database { }

sub _delete_databases {
  my $self = shift;
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    unlink $path;
  }
  unlink $self->_parentage_path;
  unlink $self->_fasta_path;
  unlink $self->_features_path;
  unlink $self->_mtime_path;
}
sub _touch_timestamp {
  my $self = shift;
  my $tsf  = $self->_mtime_path;
  open (my $F,'>',$tsf) or $self->throw("Couldn't open $tsf: $!");
  print $F scalar(localtime);
  close $F;
}
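
# _store() serializes each object with the freeze() method inherited from
# Bio::DB::SeqFeature::Store and files it in the main DB_HASH under its
# primary ID; new IDs are handed out from the special '.next_id' key.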
sub _store {
  my $self    = shift;
  my $indexed = shift;
  my $db      = $self->db;
  my $count   = 0;
  for my $obj (@_) {
    my $primary_id = $obj->primary_id;
    $self->_delete_indexes($obj,$primary_id) if $indexed && $primary_id;
    $primary_id = $db->{'.next_id'}++ unless defined $primary_id;
    $db->{$primary_id} = $self->freeze($obj);
    $obj->primary_id($primary_id);
    $self->_update_indexes($obj) if $indexed;
    $count++;
  }
  $count;
}

sub _delete_indexes {
  my $self = shift;
  my ($obj,$id) = @_;

  # the additional "1" causes the index to be deleted
  $self->_update_name_index($obj,$id,1);
  $self->_update_type_index($obj,$id,1);
  $self->_update_location_index($obj,$id,1);
  $self->_update_attribute_index($obj,$id,1);
  $self->_update_note_index($obj,$id,1);
}

sub _fetch {
  my $self = shift;
  my $id   = shift;
  my $db   = $self->db;
  my $obj  = $self->thaw($db->{$id},$id);
  $obj;
}
sub _add_SeqFeature {
  my $self     = shift;
  my $parent   = shift;
  my @children = @_;
  my $parent_id = (ref $parent ? $parent->primary_id : $parent)
    or $self->throw("$parent should have a primary_id");
  my $p = $self->parentage_db;
  for my $child (@children) {
    my $child_id = ref $child ? $child->primary_id : $child;
    defined $child_id or $self->throw("no primary ID known for $child");
    $p->{$parent_id} = $child_id;
  }
}

sub _fetch_SeqFeatures {
  my $self   = shift;
  my $parent = shift;
  my @types  = @_;
  my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
  my $index = $self->parentage_db;
  my $db    = tied %$index;

  my @children_ids = $db->get_dup($parent_id);
  my @children     = map {$self->fetch($_)} @children_ids;

  if (@types) {
    my $regexp = join '|',map {quotemeta($_)} $self->find_types(@types);
    return grep {($_->primary_tag.':'.$_->source_tag) =~ /^$regexp$/i} @children;
  } else {
    return @children;
  }
}
sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id = $obj->primary_id) or return;
  $self->_update_name_index($obj,$id);
  $self->_update_type_index($obj,$id);
  $self->_update_location_index($obj,$id);
  $self->_update_attribute_index($obj,$id);
  $self->_update_note_index($obj,$id);
}

sub _update_name_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
  my ($names,$aliases) = $self->feature_names($obj);

  # little stinky - needs minor refactoring
  foreach (@$names) {
    my $key = lc $_;
    $self->update_or_delete($delete,$db,$key,$id);
  }

  foreach (@$aliases) {
    my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
    $self->update_or_delete($delete,$db,$key,$id);
  }
}
sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");
  my $primary_tag = $obj->primary_tag;
  my $source_tag  = $obj->source_tag || '';
  return unless defined $primary_tag;

  $primary_tag .= ":$source_tag";
  my $key       = lc $primary_tag;
  $self->update_or_delete($delete,$db,$key,$id);
}

# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
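# For illustration, with the default BINSIZE of 10,000 a feature on Chr1
# spanning 123,456..130,000 overlaps bins 12 and 13, so it is filed under the
# keys "chr1.000012" and "chr1.000013", each holding the packed
# (id,start,end,strand) value.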
sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id  = $obj->seq_id || '';
  my $start   = $obj->start  || '';
  my $end     = $obj->end    || '';
  my $strand  = $obj->strand;
  my $bin_min = int $start/BINSIZE;
  my $bin_max = int $end/BINSIZE;

  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = sprintf("%s.%06d",lc($seq_id),$bin);
    $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
  }
}

sub _update_attribute_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('attributes')
    or $self->throw("Couldn't find 'attributes' index file");

  for my $tag ($obj->get_all_tags) {
    for my $value ($obj->get_tag_values($tag)) {
      my $key = "${tag}:${value}";
      $self->update_or_delete($delete,$db,$key,$id);
    }
  }
}
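
# The notes "index" is not a Berkeley DB at all but a flat text file: each
# line holds the note text, a tab, and the feature's primary ID encoded with
# pack("u*",...). search_notes() scans this file sequentially.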
sub _update_note_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  return if $delete; # we don't know how to do this

  my $fh = $self->notes_db;
  my @notes = $obj->has_tag('Note') ? $obj->get_tag_values('Note') : ();

  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
    foreach @notes;
}
sub update_or_delete {
  my $self = shift;
  my ($delete,$db,$key,$id) = @_;
  if ($delete) {
    tied(%$db)->del_dup($key,$id);
  } else {
    $db->{$key} = $id;
  }
}

# these methods return pointers to....
# the database that stores serialized objects
sub db {
  my $self = shift;
  my $d = $self->setting('db');
  $self->setting(db=>shift) if @_;
  $d;
}

sub parentage_db {
  my $self = shift;
  my $d = $self->setting('parentage_db');
  $self->setting(parentage_db=>shift) if @_;
  $d;
}

# the Bio::DB::Fasta object
sub dna_db {
  my $self = shift;
  my $d = $self->setting('dna_db');
  $self->setting(dna_db=>shift) if @_;
  $d;
}

# the specialized notes database
sub notes_db {
  my $self = shift;
  my $d = $self->setting('notes_db');
  $self->setting(notes_db=>shift) if @_;
  $d;
}

# The indicated index berkeley db
sub index_db {
  my $self = shift;
  my $index_name = shift;
  my $d = $self->setting($index_name);
  $self->setting($index_name=>shift) if @_;
  $d;
}

sub _mtime {
  my $file = shift;
  my @stat = stat($file);
  return $stat[9];
}

# return names of all the indexes
sub _index_files {
  return qw(names types locations attributes);
}

# the directory in which we store our indexes
sub directory {
  my $self = shift;
  my $d = $self->setting('directory');
  $self->setting(directory=>shift) if @_;
  $d;
}

# flag indicating that we are a temporary database
sub temporary {
  my $self = shift;
  my $d = $self->setting('temporary');
  $self->setting(temporary=>shift) if @_;
  $d;
}

sub _permissions {
  my $self = shift;
  my $d = $self->setting('permissions') or return;
  if (@_) {
    my ($write,$create) = @_;
    $self->setting(permissions=>[$write,$create]);
  }
  @$d;
}

# file name utilities...
sub _qualify {
  my $self = shift;
  my $file = shift;
  return $self->directory .'/' . $file;
}

sub _features_path {
  shift->_qualify('features.bdb');
}

sub _parentage_path {
  shift->_qualify('parentage.bdb');
}

sub _type_path {
  shift->_qualify('types.idx');
}

sub _location_path {
  shift->_qualify('locations.idx');
}

sub _attribute_path {
  shift->_qualify('attributes.idx');
}

sub _notes_path {
  shift->_qualify('notes.idx');
}

sub _fasta_path {
  shift->_qualify('sequence.fa');
}

sub _mtime_path {
  shift->_qualify('mtime.stamp');
}

###########################################
# searching
###########################################
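
# Generic query entry point. Each defined selector (name, location, type,
# attributes) intersects its hits into the shared %found set of primary IDs;
# the surviving IDs are then fetched, or wrapped in an iterator if -iterator
# was requested.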
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                    'NAME','CLASS','ALIASES',
                    ['TYPES','TYPE','PRIMARY_TAG'],
                    ['ATTRIBUTES','ATTRIBUTE'],
                    'RANGE_TYPE',
                    'ITERATOR',
                   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my @result;
  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
    @result = grep {!/^\./} keys %{$self->db};
  }

  my %found  = ();
  my $result = 1;

  if (defined($name)) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) {
    $result &&= $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
  }

  if (defined $types) {
    $result &&= $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $result &&= $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found if $result;
  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}
sub filter_by_name {
  my $self = shift;
  my ($name,$allow_aliases,$filter) = @_;

  my $index = $self->index_db('names');
  my $db    = tied(%$index);

  my ($stem,$regexp) = $self->glob_match($name);
  $stem   ||= $name;
  $regexp ||= $name;
  $regexp  .= "(?:_2)?" if $allow_aliases;

  my $key = $stem;
  my $value;
  my @results;
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0 and $key =~ /^$regexp$/i;
       $status = $db->seq($key,$value,R_NEXT)) {
    next if %$filter && !$filter->{$value};  # don't bother
    push @results,$value;
  }
  $self->update_filter($filter,\@results);
}

sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    $source_tag ||= '';
    my $key = lc "$primary_tag:$source_tag";
    my $value;

    # If filter is already provided, then it is usually faster to
    # fetch each object.
    if (%$filter) {
      for my $id (keys %$filter) {
        my $obj = $self->_fetch($id) or next;
        push @results,$id if $obj->type =~ /$match/i;
      }
    }

    else {
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /$match/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value};  # don't even bother
        push @results,$value;
      }
    }
  }
  $self->update_filter($filter,\@results);
}
sub filter_by_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;
  $strand ||= 0;

  my $index = $self->index_db('locations');
  my $db    = tied(%$index);

  my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
  my $binend   = defined $end   ? sprintf("%06d",int $end/BINSIZE)   : 'z';  # beyond a number

  my %seenit;
  my @results;

  $start = MININT if !defined $start;
  $end   = MAXINT if !defined $end;
  my $version_2 = $self->version > 1;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $key     = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $keystop = $version_2 ? "\L$seq_id\E.$binend"   : "\L$seq_id\E$binend";
    my $value;

    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      if ($range_type eq 'overlaps') {
        next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
        next unless $fstart >= $start && $fend <= $end;
      }
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $key     = $version_2 ? "\L$seq_id." : "\L$seq_id";
    my $keystop = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }

    # do the right part of the range
    $key = "\L$seq_id\E.$binend";
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }
  }

  $self->update_filter($filter,\@results);
}
sub attributes {
  my $self  = shift;
  my $index = $self->index_db('attributes');
  my %a     = map {s/:.+$//; $_=> 1} keys %$index;
  return keys %a;
}

sub filter_by_attribute {
  my $self = shift;
  my ($attributes,$filter) = @_;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);
  my $result;

  for my $att_name (keys %$attributes) {
    my @result;
    my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
                         ? @{$attributes->{$att_name}} : $attributes->{$att_name};

    for my $v (@search_terms) {
      my ($stem,$regexp) = $self->glob_match($v);
      $stem   ||= $v;
      $regexp ||= $v;
      my $key = "\L${att_name}:${stem}\E";
      my $value;
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /^$att_name:$regexp$/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value};  # don't bother
        push @result,$value;
      }
    }
    $result ||= $self->update_filter($filter,\@result);
  }

  $result;
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_array,$limit) = @_;
  $search_string =~ tr/*?//d;
  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

  my (%results,%notes);

  for my $tag (@$attribute_array) {
    my $id;
    my $key = "\L$tag:\E";
    for (my $status = $db->seq($key,$id,R_CURSOR);
         $status == 0 and $key =~ /^$tag:(.*)/i;
         $status = $db->seq($key,$id,R_NEXT)) {
      my $text = $1;
      next unless $text =~ /$search/;
      for my $w (@words) {
        my @hits = $text =~ /($w)/ig or next;
        $results{$id} += @hits;
      }
      $notes{$id} .= "$text ";
    }
  }

  my @results;
  for my $id (keys %results) {
    my $hits = $results{$id};
    my $note = $notes{$id};
    $note =~ s/\s+$//;
    my $relevance = 10 * $hits;
    my $feature   = $self->fetch($id) or next;
    my $name      = $feature->display_name or next;
    my $type      = $feature->type;
    push @results,[$name,$note,$relevance,$type,$id];
  }

  return @results;
}
sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;

  my @results;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my (%found,$found);
  my $note_index = $self->notes_db;
  seek($note_index,0,0);  # back to start
  while (<$note_index>) {
    next unless /$search/;
    chomp;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->fetch($idx) or next;
    my @values  = $feature->has_tag('Note') ? $feature->get_tag_values('Note') : ();
    my $value   = "@values";

    my $hits;
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  for (my $i=0; $i<@matches; $i++) {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $note;
    $note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
    push @results,[$feature->display_name,$note,$relevance];
  }

  return @results;
}
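
# glob_match() below turns a user-supplied glob pattern into a (stem, regexp)
# pair: the literal stem is used to position the BTREE cursor and the regexp
# is used to test candidate keys. For example, glob_match('kin*se') returns
# ('kin', 'kin.*se'); terms without wildcards return an empty list.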
sub glob_match {
  my $self = shift;
  my $term = shift;
  return unless $term =~ /([^*?]*)(?:^|[^\\])?[*?]/;
  my $stem = $1;
  $term =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
  $term =~ s/(^|[^\\])\*/$1.*/g;
  $term =~ s/(^|[^\\])\?/$1./g;
  return ($stem,$term);
}
sub update_filter {
  my $self = shift;
  my ($filter,$results) = @_;
  return unless @$results;

  if (%$filter) {
    my @filtered = grep {$filter->{$_}} @$results;
    %$filter     = map {$_=>1} @filtered;
  } else {
    %$filter     = map {$_=>1} @$results;
  }
}

sub types {
  my $self = shift;
  eval "require Bio::DB::GFF::Typename"
    unless Bio::DB::GFF::Typename->can('new');

  my $index = $self->index_db('types');
  my $db    = tied(%$index);
  return map {Bio::DB::GFF::Typename->new($_)} keys %$index;
}
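
# Raw DNA handling: insert_sequence() appends FASTA records to a private
# sequence.fa file in the index directory, and commit() (or a bulk load)
# reindexes it with Bio::DB::Fasta::Subdir so that fetch_sequence() can
# extract subsequences.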
# this is ugly
sub _insert_sequence {
  my $self = shift;
  my ($seqid,$seq,$offset) = @_;
  my $dna_fh = $self->private_fasta_file or return;
  if ($offset == 0) { # start of the sequence
    print $dna_fh ">$seqid\n";
  }
  print $dna_fh $seq,"\n";
}

sub _fetch_sequence {
  my $self = shift;
  my ($seqid,$start,$end) = @_;
  my $db = $self->dna_db or return;
  return $db->seq($seqid,$start,$end);
}

sub private_fasta_file {
  my $self = shift;
  return $self->{fasta_fh} if exists $self->{fasta_fh};
  $self->{fasta_file} = $self->_qualify("sequence.fa");
  return $self->{fasta_fh} = IO::File->new($self->{fasta_file},">");
}

sub finish_bulk_update {
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->{fasta_db} = Bio::DB::Fasta::Subdir->new($self->{fasta_file});
  }
}

sub version {
  my $self = shift;
  my $db   = $self->db;
  return $db->{'.version'} || 1.00;
}
sub DESTROY {
  my $self = shift;
  $self->_close_databases();
  rmtree($self->directory,0,1) if $self->temporary;
}

# TIE interface -- a little annoying because we are storing magic ".variable"
# meta-variables in the same data structure as the IDs, so these variables
# must be skipped.
sub _firstid {
  my $self = shift;
  my $db   = $self->db;
  my ($key,$value);
  while ( ($key,$value) = each %{$db}) {
    last unless $key =~ /^\./;
  }
  $key;
}

sub _nextid {
  my $self = shift;
  my $id   = shift;
  my $db   = $self->db;
  my ($key,$value);
  while ( ($key,$value) = each %$db) {
    last unless $key =~ /^\./;
  }
  $key;
}

sub _existsid {
  my $self = shift;
  my $id   = shift;
  return exists $self->db->{$id};
}

sub _deleteid {
  my $self = shift;
  my $id   = shift;
  my $obj  = $self->fetch($id) or return;
  $self->_delete_indexes($obj,$id);
  delete $self->db->{$id};
}

sub _clearall {
  my $self = shift;
  $self->_close_databases();
  $self->_delete_databases();
  my ($write,$create) = $self->_permissions;
  $self->_open_databases($write,$create);
}

sub _featurecount {
  my $self = shift;
  return scalar %{$self->db};
}
package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;

sub new {
  my $class = shift;
  my $store = shift;
  my $ids   = shift;
  return bless {store => $store,
                ids   => $ids},ref($class) || $class;
}

sub next_seq {
  my $self  = shift;
  my $store = $self->{store} or return;
  my $id    = shift @{$self->{ids}};
  defined $id or return;
  return $store->fetch($id);
}
package Bio::DB::Fasta::Subdir;

use base 'Bio::DB::Fasta';

# alter calling arguments so that the fasta file is placed in a subdirectory
# named "indexes"

sub index_name {
  my $self = shift;
  my ($path,$isdir) = @_;
  return $self->SUPER::index_name($path,$isdir)
    unless $isdir;
  return File::Spec->catfile($path,'indexes','fasta.index');
}

1;

__END__
=head1 BUGS

This is an early version, so there are certainly some bugs. Please
use the BioPerl bug tracking system to report bugs.

=head1 SEE ALSO

L<bioperl>,
L<Bio::DB::SeqFeature>,
L<Bio::DB::SeqFeature::Store>,
L<Bio::DB::SeqFeature::Store::GFF3Loader>,
L<Bio::DB::SeqFeature::Segment>,
L<Bio::DB::SeqFeature::Store::memory>,
L<Bio::DB::SeqFeature::Store::DBI::mysql>

=head1 AUTHOR

Lincoln Stein E<lt>lstein@cshl.orgE<gt>.

Copyright (c) 2006 Cold Spring Harbor Laboratory.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut