Massive check of file open lines. Changed bareword filehandles
[bioperl-live.git] / Bio / DB / GFF / Adaptor / berkeleydb.pm
blobfdf86d8ac6c5c0628bea3ae72711be5e65bc608f
1 package Bio::DB::GFF::Adaptor::berkeleydb;
4 =head1 NAME
6 Bio::DB::GFF::Adaptor::berkeleydb -- Bio::DB::GFF database adaptor for in-memory databases
8 =head1 SYNOPSIS
10 use Bio::DB::GFF;
11 my $db = Bio::DB::GFF->new(-adaptor=> 'berkeleydb',
12 -create => 1, # on initial build you need this
13 -dsn => '/usr/local/share/gff/dmel');
15 # initialize an empty database, then load GFF and FASTA files
16 $db->initialize(1);
17 $db->load_gff('/home/drosophila_R3.2.gff');
18 $db->load_fasta('/home/drosophila_R3.2.fa');
20 # do queries
21 my $segment = $db->segment(Chromosome => '1R');
22 my $subseg = $segment->subseq(5000,6000);
23 my @features = $subseg->features('gene');
25 See L<Bio::DB::GFF> for other methods.
27 =head1 DESCRIPTION
29 This adaptor implements a berkeleydb-indexed version of Bio::DB::GFF.
30 It requires the DB_File and Storable modules. It can be used to store
31 and retrieve short to medium-length GFF files of several million
32 features in length.
34 =head1 CONSTRUCTOR
36 Use Bio::DB::GFF-E<gt>new() to construct new instances of this class.
37 Three named arguments are recommended:
39 Argument Description
40 -------- -----------
42 -adaptor Set to "berkeleydb" to create an instance of this class.
44 -dsn Path to directory where the database index files will be stored (alias -db)
46 -autoindex Monitor the indicated directory path for FASTA and GFF files, and update the
47 indexes automatically if they change (alias -dir)
49 -write Set to a true value in order to update the database.
51 -create Set to a true value to create the database the first time
52 (implies -write)
54 -tmp Location of temporary directory for storing intermediate files
55 during certain queries.
57 -preferred_groups Specify the grouping tag. See L<Bio::DB::GFF>
59 The -dsn argument selects the directory in which to store the database
60 index files. If the directory does not exist it will be created
61 automatically, provided that the current process has sufficient
62 privileges. If no -dsn argument is specified, a database named "test"
63 will be created in your system's temporary files directory.
65 The -tmp argument specifies the temporary directory to use for storing
66 intermediate search results. If not specified, your system's temporary
67 files directory will be used. On Unix systems, the TMPDIR environment
68 variable is honored. Note that some queries can require a lot of
69 space.
71 The -autoindex argument, if present, selects a directory to be
72 monitored for GFF and FASTA files (which can be compressed with the
73 gzip program if desired). Whenever any file in this directory is
74 changed, the index files will be updated. Note that the indexing can
75 take a long time to run: anywhere from 5 to 10 minutes for a million
76 features. An alias for this argument is -dir, which gives this adaptor
77 a similar flavor to the "memory" adaptor.
79 -dsn and -dir can point to the same directory. If -dir is given but
80 -dsn is absent the index files will be stored into the directory
81 containing the source files. For autoindexing to work, you must
82 specify the same -dir path each time you open the database.
84 If you do not choose autoindexing, then you will want to load the
85 database using the bp_load_gff.pl command-line tool. For example:
87 bp_load_gff.pl -a berkeleydb -c -d /usr/local/share/gff/dmel dna1.fa dna2.fa features.gff
89 =head1 METHODS
91 See L<Bio::DB::GFF> for inherited methods
93 =head1 BUGS
95 The various get_Stream_* methods and the features() method with the
96 -iterator argument only return an iterator after the query runs
97 completely and the module has been able to generate a temporary
98 results file on disk. This means that iteration is not as big a win as
99 it is for the relational-database adaptors.
101 Like the dbi::mysqlopt adaptor, this module uses a binning scheme to
102 speed up range-based searches. The binning scheme used here imposes a
103 hard-coded 1 gigabase (1000 Mbase) limit on the size of the largest
104 chromosome or other reference sequence.
106 =head1 SEE ALSO
108 L<Bio::DB::GFF>, L<bioperl>
110 =head1 AUTHORS
112 Vsevolod (Simon) Ilyushchenko E<lt>simonf@cshl.eduE<gt>
113 Lincoln Stein E<lt>lstein@cshl.eduE<gt>
115 Copyright (c) 2005 Cold Spring Harbor Laboratory.
117 This library is free software; you can redistribute it and/or modify
118 it under the same terms as Perl itself.
120 =cut
122 use strict;
124 use DB_File;
125 use File::Path 'mkpath';
126 use File::Spec;
127 use File::Temp 'tempfile';
129 use Bio::DB::GFF::Util::Rearrange; # for rearrange()
130 use Bio::DB::GFF::Util::Binning;
131 use Bio::DB::Fasta;
132 use Bio::DB::GFF::Adaptor::berkeleydb::iterator;
133 use Bio::DB::GFF::Adaptor::memory::feature_serializer; # qw(feature2string string2feature @hash2array_map);
# this is the smallest bin (1 K)
use constant MIN_BIN => 1000;

# this is the largest that any reference sequence can be (1000 megabases)
use constant MAX_BIN     => 1_000_000_000;
use constant MAX_SEGMENT => 1_000_000_000; # the largest a segment can get

# We have to define a limit because BerkeleyDB sorts in lexicographic order,
# so all the numbers have to have the same length.
use constant MAX_NUM_LENGTH => length(MAX_BIN);
145 use base 'Bio::DB::GFF::Adaptor::memory';
# Constructor.  See the POD above for the recognized arguments; -create
# implies -write, and -dir doubles as the database directory when -dsn
# is absent.
sub new {
  my $class = shift;
  my ($dbdir,$preferred_groups,$autoindex,$write,$create,$tmpdir) = rearrange([
       [qw(DSN DB)],
       'PREFERRED_GROUPS',
       [qw(DIR AUTOINDEX)],
       [qw(WRITE WRITABLE)],
       'CREATE',
       'TMP',
      ],@_);

  # Apply the documented fallbacks.
  $tmpdir ||= File::Spec->tmpdir;
  $dbdir  ||= $autoindex;
  $dbdir  ||= "$tmpdir/test";
  $write  ||= $create;

  my $self = bless {},$class;
  $self->dsn($dbdir);
  $self->tmpdir($tmpdir);
  $self->preferred_groups($preferred_groups) if defined $preferred_groups;
  $self->_autoindex($autoindex)              if $autoindex;
  $self->_open_databases($write,$create);
  return $self;
}
# Scan the monitored directory; if any source file is newer than our
# timestamp (or any index file is missing), rebuild the indexes.
sub _autoindex {
  my $self    = shift;
  my $autodir = shift;

  my $dir = $self->dsn;

  # Never treat our own index/data files as monitored source files.
  my %ignore = map {$_=>1} ($self->_index_file,$self->_data_file,
                            $self->_fasta_file,$self->_temp_file,
                            $self->_notes_file,
                            $self->_timestamp_file);

  my $maxtime   = 0;  # newest mtime among all monitored files
  my $maxfatime = 0;  # newest mtime among FASTA files only

  opendir (my $D,$autodir) or $self->throw("Couldn't open directory $autodir for reading: $!");
  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    # NOTE(review): paths are built under dsn ($dir), not $autodir; the POD
    # documents the two as the same directory -- confirm before supporting
    # distinct -dsn/-dir values here.
    my $path = "$dir/$node";
    next if $ignore{$path};
    next unless -f $path;
    my $mtime = _mtime(\*_); # not a typo -- reuses the stat buffer filled by -f
    $maxtime   = $mtime if $mtime > $maxtime;
    $maxfatime = $mtime if $mtime > $maxfatime && $node =~ /\.(?:fa|fasta|dna)(?:\.gz)?$/;
  }
  closedir $D;  # bug fix: was close() on a directory handle

  my $timestamp_time  = _mtime($self->_timestamp_file) || 0;
  my $all_files_exist = -e $self->_index_file && -e $self->_data_file && (-e $self->_fasta_file || !$maxfatime);

  # to avoid rebuilding FASTA files if not changed
  my $spare_fasta = $maxfatime > 0 && $maxfatime < $timestamp_time && -e $self->_fasta_file;

  if ($maxtime > $timestamp_time || !$all_files_exist) {
    print STDERR __PACKAGE__,": Reindexing files in $dir. This may take a while....\n";
    $self->do_initialize(1,$spare_fasta);
    $self->load_gff($autodir,1);
    $self->load_fasta($autodir,1) unless $spare_fasta;
    print STDERR __PACKAGE__,": Reindexing done\n";
  }
  else {
    $self->_open_databases();
  }
}
# Open (optionally creating) the BTree index, the FeatureStore data file,
# the FASTA database if present, and the flat notes index.
sub _open_databases {
  my $self = shift;
  my ($write,$create) = @_;

  my $dsn = $self->dsn;
  unless (-d $dsn) { # directory does not exist
    $create or $self->throw("Directory $dsn does not exist and you did not specify the -create flag");
    mkpath($dsn) or $self->throw("Couldn't create database directory $dsn: $!");
  }

  my %db;
  local $DB_BTREE->{flags} = R_DUP;                           # allow duplicate keys
  $DB_BTREE->{compare}     = sub { lc($_[0]) cmp lc($_[1]) }; # case-insensitive ordering
  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  tie(%db,'DB_File',$self->_index_file,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie ".$self->_index_file.": $!");
  $self->{db}   = \%db;
  $self->{data} = FeatureStore->new($self->_data_file,$write,$create);

  if (-e $self->_fasta_file) {
    my $dna_db = Bio::DB::Fasta->new($self->_fasta_file) or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }

  # Notes file: append for update, truncate on create, else read-only.
  my $mode = $write  ? "+>>"
           : $create ? "+>"
           : "<";

  my $notes_file = $self->_notes_file;
  open my $F, $mode, $notes_file or $self->throw("Could not open file '$notes_file': $!");
  $self->{notes} = $F;
}
# Release all handles: dropping the references unties the BTree and
# closes the FeatureStore and notes filehandles.
sub _close_databases {
  my $self = shift;
  delete $self->{db};
  delete $self->{data};
  delete $self->{notes};
}
# Delete the given feature ids from the data file and every secondary
# index entry that points at them.  Returns the number removed.
sub _delete_features {
  my $self        = shift;
  my @feature_ids = @_;
  my $removed     = 0;
  my $last_id     = $self->{data}->last_id;
  for my $id (@feature_ids) {
    next unless $id >= 0 && $id < $last_id;   # ignore ids outside the data file
    my $feat = $self->{data}->get($id) or next;
    $self->{data}->remove($id);
    $self->_bump_class_count($feat->{gclass},-1);
    my @keys = $self->_secondary_keys($feat);
    $self->db->del_dup($_,$id) foreach @keys;
    $removed++;
  }
  return $removed;
}
# The list of secondary-index keys under which a feature is filed:
# by name, by (ref,bin), by type, and one key per attribute.
sub _secondary_keys {
  my $self = shift;
  my $feat = shift;
  my @keys = (
    "__name__" . lc(join ":", $feat->{gclass}, $feat->{gname}),
    "__bin__"  . lc("$feat->{ref}$;$feat->{bin}"),
    "__type__" . join(':', $feat->{method}, $feat->{source}),
  );
  push @keys, map { "__attr__" . lc(join ':', $_->[0], $_->[1]) } @{$feat->{attributes}};
  return @keys;
}
# Delete features.  Restricted deletes are delegated to the superclass;
# an unrestricted delete wipes the whole database and requires -force.
sub _delete {
  my $self        = shift;
  my $delete_spec = shift;
  return $self->SUPER::_delete($delete_spec)
    if @{$delete_spec->{segments}} or @{$delete_spec->{types}};
  $self->throw("This operation would delete all feature data and -force not specified")
    unless $delete_spec->{force};
  my $deleted = $self->{db}{__count__};
  $self->{data} = FeatureStore->new($self->_data_file,1,1);  # truncate the data file
  %{$self->{db}} = ();                                       # empty the index
  return $deleted;
}
# With R_DUP enabled a plain $db->{__count__}++ would insert a duplicate
# key instead of replacing the value, so the key is deleted before each
# store.  With an argument: setter.  Without: post-increment, returning
# the previous count.
sub _bump_feature_count {
  my $self = shift;
  my $db   = $self->{db};
  if (@_) {
    delete $db->{__count__};
    return $db->{__count__} = shift;
  }
  my $previous = $db->{__count__};
  delete $db->{__count__};
  $db->{__count__} = ($previous || 0) + 1;
  return $previous;
}
# Adjust the per-class feature tally stored under "__class__<class>".
# $count defaults to +1; delete-then-store avoids R_DUP duplicates.
sub _bump_class_count {
  my $self = shift;
  my ($class,$count) = @_;
  $count ||= 1;
  my $db       = $self->{db};
  my $key      = "__class__$class";
  my $newcount = ($db->{$key} || 0) + $count;
  delete $db->{$key};
  $db->{$key} = $newcount;
}
# Return the sorted list of classes that currently have a positive tally.
sub classes {
  my $self = shift;
  my $db   = $self->db;
  my ($key,$value) = ('__class__',undef);
  my %classes;
  # Cursor over the "__class__*" prefix; bail at the first key outside it.
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0;
       $status = $db->seq($key,$value,R_NEXT)) {
    my ($class) = $key =~ /^__class__(.+)/ or last;
    $classes{$class}++ if $value > 0;
  }
  my @classes = sort keys %classes;
  return @classes;
}
# (Re)initialize the database.  With $erase true, all index/data files are
# removed (FASTA files are kept when $spare_fasta is set) and the databases
# are reopened in create mode.
sub do_initialize {
  my $self        = shift;
  my $erase       = shift;
  my $spare_fasta = shift; # used internally only!

  if ($erase) {
    $self->_close_databases;
    unlink $self->_index_file;
    unlink $self->_data_file;
    unlink $self->_notes_file;
    unless ($spare_fasta) {
      unlink $self->_fasta_file;
      unlink $self->_fasta_file.'.index';
    }
    unlink $self->_timestamp_file;
    $self->_open_databases(1,1);
  }
  1;
}
# load_sequence($fasta_filehandle,$first_sequence_id)
# Append FASTA data to the sequence file and rebuild the Bio::DB::Fasta
# index.  Returns the number of sequences loaded.
sub load_sequence {
  my $self = shift;
  my ($io_handle,$id) = @_;
  my $file   = $self->_fasta_file;
  my $loaded = 0;

  open my $F, '>>', $file or $self->throw("Could not append file '$file': $!");

  # An explicit id means the caller is streaming raw sequence: emit its header.
  if (defined $id) {
    print $F ">$id\n";
    $loaded++;
  }

  while (<$io_handle>) {
    $loaded++ if /^>/;
    print $F $_;
  }
  close $F;

  my $dna_db = Bio::DB::Fasta->new($file) or $self->throw("Can't reindex sequence file: $@");
  $self->dna_db($dna_db);
  $self->_touch_timestamp;
  return $loaded;
}
# Modification time of a file path or filehandle glob; undef if stat fails.
sub _mtime {
  my $target = shift;
  return (stat($target))[9];
}
# Path of the BTree feature index inside the database directory.
sub _index_file {
  my $self = shift;
  return $self->dsn . "/bdb_features.btree";
}
# Path of the serialized feature data file.
sub _data_file {
  my $self = shift;
  return $self->dsn . "/bdb_features.data";
}
# Path of the concatenated FASTA sequence file.
sub _fasta_file {
  my $self = shift;
  return $self->dsn . "/bdb_sequence.fa";
}
# Path of the flat searchable-notes index.
sub _notes_file {
  my $self = shift;
  return $self->dsn . "/bdb_notes.idx";
}
# Reserve a unique temporary-file name (OPEN=>0 returns the name only).
sub _temp_file {
  my $self = shift;
  local $^W = 0;  # tempfile() with OPEN=>0 warns on some File::Temp versions
  my (undef,$filename) = tempfile("bdb_temp_XXXXXX",DIR=>$self->tmpdir,OPEN=>0);
  return $filename;
}
# Path of the timestamp file used by the autoindex freshness check.
sub _timestamp_file {
  my $self = shift;
  return $self->dsn . "/bdb_timestamp";
}
# Return the DB_File object beneath the tied index hash, or undef when
# the database is closed.
sub db {
  my $hashref = shift()->{db} or return;
  return tied(%$hashref);
}
# Accessor/mutator for the database directory; returns the previous value.
sub dsn {
  my $self     = shift;
  my $previous = $self->{dsn};
  $self->{dsn} = shift if @_;
  return $previous;
}
# Accessor/mutator for the temporary directory; returns the previous value.
sub tmpdir {
  my $self       = shift;
  my $previous   = $self->{tmpdir};
  $self->{tmpdir} = shift if @_;
  return $previous;
}
# Index one parsed GFF line: assign its bin, store the serialized feature
# in the data file, file its id under every secondary key, and append any
# searchable Notes to the flat notes index.
# (Removed: unused locals $start/$stop/$type and a normalizeNumber() call
# whose result was never used.)
sub load_gff_line {
  my ($self, $feat) = @_;

  # GFF uses '.' for "no value" in the strand/phase columns.
  $feat->{strand} = '' if $feat->{strand} && $feat->{strand} eq '.';
  $feat->{phase}  = '' if $feat->{phase}  && $feat->{phase}  eq '.';

  # Assign the feature to its smallest covering bin.
  $feat->{bin} = bin($feat->{start},$feat->{stop},MIN_BIN);

  my $id = $self->{data}->put($feat);

  my $db = $self->{db};
  for my $skey ($self->_secondary_keys($feat)) {
    $db->{$skey} = $id;
  }

  # save searchable notes to separate index
  my $fh    = $self->{notes};
  my @notes = map {$_->[1]} grep {lc $_->[0] eq 'note'} @{$feat->{attributes}};
  # uuencoded ids ("u*") end with "\n", so each note lands on its own line
  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating indexes: $!")
    foreach @notes;

  $self->{records_loaded}++;
  $self->_bump_feature_count();
  $self->_bump_class_count($feat->{gclass});
}
# Called before a bulk load begins; resets the per-load record counter.
sub setup_load {
  my $self = shift;
  $self->{records_loaded} = 0;
}
# Flush the BTree to disk and stamp the load time.  Returns the number of
# records loaded; throws if the sync fails.
sub finish_load {
  my $self = shift;
  # DB_File::sync returns 0 on success, so a true value means failure.
  $self->db->sync && $self->throw("An error occurred while updating indexes: $!");
  $self->_touch_timestamp;
  return $self->{records_loaded};
}
# Record "now" in the timestamp file consulted by _autoindex().
sub _touch_timestamp {
  my $self = shift;
  my $tsf  = $self->_timestamp_file;
  open my $F, '>', $tsf or $self->throw("Could not write file '$tsf': $!");
  print $F scalar(localtime);
  close $F;
}
# given sequence name, return (reference,start,stop,strand)
# For each reference sequence the named features fall on, returns a tuple
# [ref,class,min_start,max_stop,strand,name].
# (Removed: dead statements that recomputed keys/@found/($strand,...) just
# before the loop that shadows them, and an unused $regexp flag; the inner
# $name is renamed $gname to stop it shadowing the method argument.)
sub get_abscoords {
  my $self = shift;
  my ($name,$class,$refseq) = @_;
  my %refs;

  # Wildcards are turned into a regexp form of the name.
  if ($name =~ /[*?]/) {  # uh oh regexp time
    $name = quotemeta($name);
    $name =~ s/\\\*/.*/g;
    $name =~ s/\\\?/.?/g;
  }

  # Find all features that have the requested name and class.
  # Sort them by reference point.
  my @features = @{$self->retrieve_features(-table => 'name', -key=>"$class:$name")};
  if (!@features) {  # nothing matched exactly, so try aliases
    @features = @{$self->retrieve_features(-table=>'attr',-key=>"Alias:$name")};
  }

  foreach my $feature (@features) {
    push @{$refs{$feature->{ref}}},$feature;
  }

  # find out how many reference points we recovered
  if (! %refs) {
    $self->error("$name not found in database");
    return;
  }

  my @found_segments;
  foreach my $ref (keys %refs) {
    next if defined($refseq) and $ref ne $refseq;
    my @found = @{$refs{$ref}};
    my ($strand,$start,$stop,$gname);
    foreach (@found) {
      $strand ||= $_->{strand};
      $strand = '+' if $strand && $strand eq '.';
      $start = $_->{start} if !defined($start) || $start > $_->{start};
      $stop  = $_->{stop}  if !defined($stop)  || $stop  < $_->{stop};
      $gname ||= $_->{gname};
    }
    push @found_segments,[$ref,$class,$start,$stop,$strand,$gname];
  }

  return \@found_segments;
}
# Return the feature types in the database, either as a count hash
# ($want_count) or as a list of Bio::DB::GFF::Typename objects.  Without a
# source sequence the "__type__" key space is scanned directly; otherwise
# a range search is performed.
sub get_types {
  my $self = shift;
  my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_;
  my (%obj,%result,$key,$value);
  $key = "__type__";

  if (!$srcseq) { # optimized full type list
    my $db = $self->db;
    my $status = $db->seq($key,$value,R_CURSOR);

    while ($status == 0 && $key =~ /^__type__(.+)/) {
      my $type = $1;
      my ($method,$source) = split ':',$type;
      $obj{$type} = Bio::DB::GFF::Typename->new($method,$source);
      $result{$type}++;

      if ($want_count) {
        $status = $db->seq($key,$value,R_NEXT);
      } else { # skip to next key set
        $key .= "\0";
        $status = $db->seq($key,$value,R_CURSOR);
      }
    }
  }
  else { # range search
    # NOTE(review): $typelist is accepted but does not appear to be forwarded
    # to the search -- confirm against callers before relying on it here.
    for my $feature (@{$self->_get_features_by_search_options(
                          {rangetype => 'overlaps',
                           refseq    => $srcseq,
                           refclass  => ($class || undef),
                           start     => ($start || undef),
                           stop      => ($stop  || undef),
                          },
                          {})}) {
      my $type = Bio::DB::GFF::Typename->new($feature->{method},$feature->{source});
      $obj{$type} = $type;
      $result{$type}++;
    }
  }
  return $want_count ? %result : values %obj;
}
601 # Low level implementation of fetching a named feature.
602 # GFF annotations are named using the group class and name fields.
603 # May return zero, one, or several Bio::DB::GFF::Feature objects.
605 =head2 _feature_by_name
607 Title : _feature_by_name
608 Usage : $db->get_features_by_name($class,$name,$callback)
609 Function: get a list of features by name and class
610 Returns : count of number of features retrieved
611 Args : name of feature, class of feature, and a callback
612 Status : protected
614 This method is used internally. The callback arguments are those used
615 by make_feature().
617 =cut
sub _feature_by_name {
  my $self = shift;
  my ($class,$name,$location,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');

  my $count = 0;
  my $id    = -1;
  my ($use_regexp, $use_glob, $using_alias_search);

  if ($name =~ /[*?]/) {  # uh oh regexp time
    # If there is only one trailing *, do a range search
    if ($name =~ /^([^\*]+)\*$/) {
      $name = $1;
      $use_glob++;
    }
    else {
      $name = quotemeta($name);
      $name =~ s/\\\*/.*/g;
      $name =~ s/\\\?/.?/g;
      $use_regexp++;
    }
  }

  my @features;
  if ($use_glob) {
    # Range scan while names still share the requested prefix.
    # (Renamed from $callback to avoid shadowing the method argument.)
    my $do_while = sub {my $feat = shift; $feat->{gname} =~ /^$name/i};
    @features = @{$self->retrieve_features_range (-table    => 'name',
                                                  -start    => "$class:$name",
                                                  -do_while => $do_while)};
  }
  elsif ($use_regexp) {
    # Full scan with a regexp filter.
    my $filter = sub {my $feat = shift; $feat->{gname} =~ /$name/i};
    @features = @{$self->filter_features(-table =>'name', -filter => $filter)};
  }
  else {
    # Exact lookup.
    @features = @{$self->retrieve_features(-table=>'name', -key => "$class:$name")};
  }

  unless (@features) {
    $using_alias_search++;
    @features = @{$self->retrieve_features(-table=>'attr', -key=>"Alias:$name")};
  }

  foreach my $feature (@features) {
    $id++;
    next unless $using_alias_search || $feature->{gclass} eq $class;

    # Optional [ref,start,stop] restriction.
    if ($location) {
      next if $location->[0] ne $feature->{ref};
      next if $location->[1] && $location->[1] > $feature->{stop};
      next if $location->[2] && $location->[2] < $feature->{start};
    }
    $count++;
    $callback->(@{$feature}{@hash2array_map},0);
  }
  return $count;
}
#sub get_feature_by_attribute{
# Invoke $callback for every feature filed under each key:value attribute
# pair in %$attributes.
sub _feature_by_attribute {
  my $self = shift;
  my ($attributes,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');
  my $count = 0;
  my $feature_group_id = undef;

  # there could be more than one set of attributes......
  while (my ($key, $value) = each %$attributes) {
    my @features = @{$self->retrieve_features
                       (-table => "attr", -key => "$key:$value")};
    for my $feature (@features) {
      $callback->(@{$feature}{@hash2array_map},$feature_group_id);
      $count++;
    }
  }
  # NOTE(review): unlike _feature_by_name, no "return $count" is visible
  # here -- confirm callers ignore this method's return value.
}
# Full-text search over the notes index.  Returns a list of
# [$featname,$note,$relevance,$type] tuples, at most $limit candidates.
sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;   # strip wildcard characters

  my @results;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  # Pass 1: scan the flat notes file for candidate feature ids.
  my (%found,$found);
  my $note_index = $self->{notes};
  seek($note_index,0,0); # back to start
  while (<$note_index>) {
    next unless /$search/;
    chomp;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;           # ids were uuencoded at load time
    last if $limit && ++$found >= $limit;
  }

  # Pass 2: fetch each candidate and count how often the search words hit.
  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature    = $self->{data}->get($idx) or next;
    my @attributes = @{$feature->{attributes}};
    my @values     = map {lc $_->[0] eq 'note' ? $_->[1] : ()} @attributes;
    my $value      = "@values";

    my $hits;
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  # Pass 3: build the scored result tuples.
  for (my $i=0; $i<@matches; $i++) {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $featname  = Bio::DB::GFF::Featname->new($feature->{gclass}=>$feature->{gname});
    my $type      = Bio::DB::GFF::Typename->new($feature->{method}=>$feature->{source});
    my $note      = join ' ',map {$_->[1]} grep {$_->[0] eq 'Note'} @{$feature->{attributes}};
    push @results,[$featname,$note,$relevance,$type];
  }

  return @results;
}
# Core search dispatcher.  Builds a shared filter closure and then chooses
# the cheapest index: binned range scan, type scan, attribute lookup, or a
# full sequential scan.  Results go into an array ref, or into a tied
# DB_File hash when $options->{temp_file} is set (iterator support).
# (Removed: an unused $count counter and a normalizeNumber() call whose
# result was never used.)
sub _get_features_by_search_options {
  # The $data argument is not used and is preserved for superclass compatibility
  my ($self, $search, $options) = @_;

  my ($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$temp_file) =
    (@{$search}{qw(rangetype refseq refclass start stop types)},
     @{$options}{qw(sparse sort_by_group ATTRIBUTES temp_file)});

  $start = 0       unless defined($start);
  $stop  = MAX_BIN unless defined($stop);

  my ($results,@features,%found,%results_table);

  if ($temp_file) {
    local $DB_BTREE->{flags} = R_DUP;
    # note: there is a race condition possible here, if someone reuses the
    # same name between the time we get the tmpfile name and the time we
    # ask DB_File to open it.
    tie(%results_table,'DB_File',$temp_file,O_RDWR|O_CREAT,0666,$DB_BTREE)
      or $self->throw("Couldn't tie temporary file ".$temp_file." for writing: $!");
    $results = \%results_table;
  } else {
    $results = \@features;
  }

  # Shared filter: rejects duplicates and anything outside the requested
  # range, type list, or attribute set.
  my $filter = sub {
    my $feature = shift;

    my $ref           = $feature->{ref};
    my $feature_start = $feature->{start};
    my $feature_stop  = $feature->{stop};
    my $feature_id    = $feature->{feature_id};

    return 0 if $found{$feature_id}++;    # duplicate

    if (defined $refseq) {
      return 0 unless lc $refseq eq lc $ref;
      $start = 0           unless defined($start);
      $stop  = MAX_SEGMENT unless defined($stop);

      if ($rangetype eq 'overlaps') {
        return 0 unless $feature_stop >= $start && $feature_start <= $stop;
      } elsif ($rangetype eq 'contains') {
        return 0 unless $feature_start >= $start && $feature_stop <= $stop;
      } elsif ($rangetype eq 'contained_in') {
        return 0 unless $feature_start <= $start && $feature_stop >= $stop;
      } else {
        return 0 unless $feature_start == $start && $feature_stop == $stop;
      }
    }

    my $feature_source = $feature->{source};
    my $feature_method = $feature->{method};

    if (defined $types && @$types) {
      return 0 unless $self->_matching_typelist($feature_method,$feature_source,$types);
    }

    my $feature_attributes = $feature->{attributes};
    if (defined $attributes) {
      return 0 unless $self->_matching_attributes($feature_attributes,$attributes);
    }

    return 1;
  };

  if (defined $refseq && !$sparse) {
    # Binned range query: walk each tier from coarsest to finest.
    my $tier = MAX_BIN;
    while ($tier >= MIN_BIN) {
      my ($tier_start,$tier_stop) = (bin_bot($tier,$start),bin_top($tier,$stop));
      # warn "Using $tier_start $tier_stop\n";
      if ($tier_start == $tier_stop) {
        $self->retrieve_features(-table  => "bin",
                                 -key    => "$refseq$;$tier_start",
                                 -filter => $filter,
                                 -result => $results);
      } else {
        my $callback = sub {my $feat = shift; $feat->{bin} <= $tier_stop};
        $self->retrieve_features_range(-table    => "bin",
                                       -start    => "$refseq$;$tier_start",
                                       -do_while => $callback,
                                       -filter   => $filter,
                                       -result   => $results);
      }
      $tier /= 10;
    }
  }
  elsif (@$types) {
    # Type-driven query: range-scan each requested method:source prefix.
    foreach (@$types) {
      my $type = join ':',@$_;
      $self->retrieve_features_range(-table    => 'type',
                                     -start    => $type,
                                     -filter   => $filter,
                                     -do_while => sub { my $f = shift;
                                                        lc($f->{method}) eq lc($_->[0])
                                                          &&
                                                        lc($f->{source}||$_->[1]||'') eq lc($_->[1]||'')
                                                      },
                                     -result   => $results);
    }
  }
  elsif (defined $attributes) {
    my ($attribute_name,$attribute_value) = each %$attributes; # pick first one
    $self->retrieve_features(-table  => 'attr',
                             -key    => "${attribute_name}:${attribute_value}",
                             -filter => $filter,
                             -result => $results);
  }
  else {
    # Nothing usable as an index: sequential scan of the whole data file.
    $self->filter_features(-filter => $filter,-result=>$results);
  }
  return $results;
}
# Fetch every feature filed under one secondary key, optionally filtering.
# Results accumulate in -result (array ref, or hash ref keyed by
# "class:name" with flattened values); a filter return of -1 stops early.
sub retrieve_features {
  my $self = shift;
  my ($table, $key, $filter, $result) = rearrange(['TABLE','KEY','FILTER', 'RESULT'],@_);

  my @accumulated;
  $result ||= \@accumulated;

  # All ids under this key (duplicates allowed by R_DUP).
  my @ids  = $self->db->get_dup("__".lc($table)."__".lc($key));
  my $data = $self->{data};
  local $^W = 0; # because _hash_to_array() will generate lots of uninit values

  foreach my $id (@ids) {
    my $feat          = $data->get($id);
    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;
    if (ref $result eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join ($;,$self->_hash_to_array($feat));
    } else {
      push @$result, $feat;
    }
    last if $filter_result == -1;   # filter asked us to stop
  }
  return $result;
}
# Cursor scan starting at "__<table>__<start>", collecting features until
# the -do_while callback returns false.  Same -filter/-result conventions
# as retrieve_features().
sub retrieve_features_range {
  my $self = shift;
  my ($table, $start, $do_while, $filter, $result) = rearrange(['TABLE','START','DO_WHILE', 'FILTER', 'RESULT'],@_);
  local $^W = 0; # because _hash_to_array will generate lots of uninit warnings

  my @accumulated;
  $result ||= \@accumulated;
  my ($key, $value);

  $key = "__".$table."__".$start;
  my $db = $self->db;

  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0;
       $status = $db->seq($key,$value,R_NEXT)) {

    my $feat = $self->{data}->get($value);
    last unless $do_while->($feat,$key);

    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;

    if (ref $result eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join($;,$self->_hash_to_array($feat));
    } else {
      push @$result,$feat;
    }
    last if $filter_result == -1;   # filter asked us to stop
  }
  return $result;
}
# Sequentially scan the entire data file, keeping features the -filter
# accepts.  Same -result conventions as retrieve_features().
sub filter_features {
  my $self = shift;
  my ($filter,$result) = rearrange(['FILTER','RESULT'],@_);

  my @accumulated;
  $result ||= \@accumulated;

  my $data = $self->{data};
  $data->reset;
  while (my $feat = $data->next) {

    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;

    if (ref($result) eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join($;,$self->_hash_to_array($feat));
    } else {
      push @$result,$feat;
    }
    last if $filter_result == -1;   # filter asked us to stop
  }
  return $result;
}
# Fetch one or more features by raw id.  Accepts a single id or an
# array ref of ids; returns a list (or the first feature in scalar context).
sub _basic_features_by_id {
  my $self = shift;
  my ($ids) = @_;

  # Bug fix: "ref $ids =~ /ARRAY/" parsed as ref($ids =~ /ARRAY/) because
  # =~ binds more tightly than the named unary ref(), so the guard was
  # always false and an array-ref argument got wrapped a second time.
  $ids = [$ids] unless ref($ids) eq 'ARRAY';

  my @result;
  my $data = $self->{data};
  for my $feature_id (@$ids) {
    push @result, $data->get($feature_id);
  }

  return wantarray() ? @result : $result[0];
}
# Left-pad a bin number with zeros to MAX_NUM_LENGTH digits so that the
# BTree's lexicographic ordering matches numeric ordering.
sub normalizeNumber {
  my ($self, $num) = @_;
  $num = "0" . $num while length($num) < MAX_NUM_LENGTH;
  return $num;
}
# Run the search into a temporary results file, then hand back an iterator
# over it.  The callback is applied per feature, as for make_feature().
sub get_features_iterator {
  my $self = shift;
  my ($search,$options,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');

  $options->{temp_file} = $self->_temp_file;
  my $results = $self->_get_features_by_search_options($search,$options);
  return Bio::DB::GFF::Adaptor::berkeleydb::iterator->new($results,$callback,$options->{temp_file});
}
1003 #--------------------------------------------------------------------------#
1005 package FeatureStore;
1007 # This is a very specialized package that stores serialized features onto a file-based
1008 # array. The array is indexed by the physical offset to the beginning of each serialized
1009 # feature.
1011 use strict;
1012 use Fcntl qw(SEEK_SET SEEK_END);
1013 use base 'Bio::Root::Root';
1014 use Bio::DB::GFF::Adaptor::memory::feature_serializer; # qw(feature2string string2feature @hash2array_map);
# Open (or create/truncate) the record file backing this store.
sub new {
  my $class  = shift;
  my $dbname = shift or $class->throw("must provide a filepath argument");
  my ($write,$create) = @_;

  # -create truncates; -write appends; otherwise read-only.
  my $mode = $create ? "+>"
           : $write  ? "+>>"
           : "<";

  open my $F, $mode, $dbname or $class->throw("Could not open file '$dbname': $!");
  my $self = bless {
    fh       => $F,
    next_idx => 0,   # offset cursor used by next()/reset()
    last_id  => 0,   # offset of the most recently appended record
  },$class;
  return $self;
}
# Append a serialized feature; its id is its packed ("L") byte offset.
# Records are length-prefixed with a 16-bit big-endian count ("n/a*").
sub put {
  my $self    = shift;
  my $feature = shift;
  my $fh      = $self->{fh};
  seek($fh,0,SEEK_END);
  my $offset = tell($fh) || 0;

  $self->{last_id} = $offset;

  my $id = pack("L",$offset);
  $feature->{feature_id} = $id;
  my $value = feature2string($feature);
  print $fh pack("n/a*",$value) or $self->throw("An error occurred while updating the data file: $!");

  return $id;
}
# Offset of the most recently appended record.
sub last_id {
  my $self = shift;
  return $self->{last_id};
}
# Fetch the record whose packed-offset id is $idx.  Returns undef at EOF
# or when the record has been nulled out by remove().
sub get {
  my $self   = shift;
  my $idx    = shift;
  my $offset = unpack("L",$idx);
  my $fh     = $self->{fh};

  my ($value,$length);
  $offset ||= 0;
  seek($fh,$offset,SEEK_SET);
  return unless read($fh,$length,2);                  # 16-bit length prefix
  return unless read($fh,$value,unpack("n",$length));
  $self->{next_idx} = tell($fh);
  return if substr($value,0,1) eq "\0";               # tombstone: deleted record
  return string2feature($value);
}
# Sequential read: return the next live feature after the current cursor,
# or undef (and rewind the cursor) at end of file.
sub next {
  my $self = shift;
  my $fh   = $self->{fh};
  my $result;
  do {
    $result = $self->get(pack("L",$self->{next_idx}));
  } until $result || eof($fh);
  $self->{next_idx} = 0 unless $result;
  return $result;
}
# Mark the record at packed-offset $id as deleted by overwriting its
# payload with NUL bytes (get() treats a leading "\0" as a tombstone).
sub remove {
  my $self   = shift;
  my $id     = shift;
  my $offset = unpack("L",$id);
  my $fh     = $self->{fh};
  my ($value,$length);
  seek($fh,$offset,SEEK_SET);
  return unless read($fh,$length,2);
  # Bug fix: $length holds the two raw prefix bytes; using the string as a
  # numeric repeat count evaluated to 0, so nothing was actually nulled.
  # Decode the 16-bit count first.
  print $fh "\0" x unpack("n",$length); # null it out
  # NOTE(review): on handles opened in append mode ("+>>") writes land at
  # EOF on many platforms regardless of seek -- confirm remove() is only
  # used on handles opened "+>"/read-write.
}
# Position the handle at the record with packed-offset $idx and sync the
# sequential cursor to match.
sub _seek {
  my $self   = shift;
  my $idx    = shift;
  my $offset = unpack("L",$idx);
  seek($self->{fh},$offset,SEEK_SET);
  $self->{next_idx} = tell($self->{fh});
}
# Rewind the sequential cursor to the first record.
sub reset {
  my $self = shift;
  $self->_seek(pack("L",0));
}
# Legacy serializer kept for reference; the live code path uses
# feature2string() imported from the feature_serializer module.
sub _feature2string {
  my $feature = shift;
  my @fields = @{$feature}{@hash2array_map};
  push @fields, map {@$_} @{$feature->{attributes}} if $feature->{attributes};
  return join $;,@fields;
}
# Legacy deserializer counterpart of _feature2string(): rebuilds the
# feature hash and its [key,value] attribute pairs.
sub _string2feature {
  my $string = shift;
  my (%feature,@attributes);

  (@feature{@hash2array_map},@attributes) = split $;,$string;
  while (@attributes) {
    my ($key,$value) = splice(@attributes,0,2);
    push @{$feature{attributes}},[$key,$value];
  }
  $feature{group_id} = undef;
  return \%feature;
}