maint: update changes from 1f7b54a281a8
[bioperl-live.git] / lib / Bio / DB / SeqFeature / Store / berkeleydb3.pm
blob10fd39762b303831623f7a2e448b498f40fd6dfa
1 package Bio::DB::SeqFeature::Store::berkeleydb3;
3 # $Id: berkeleydb3.pm 15987 2009-08-18 21:08:55Z lstein $
4 # faster implementation of berkeleydb
6 =head1 NAME
8 Bio::DB::SeqFeature::Store::berkeleydb3 -- Storage and retrieval of sequence
9 annotation data in Berkeleydb files
11 =head1 SYNOPSIS
13 # Create a feature database from scratch
14 $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
15 -dsn => '/var/databases/fly4.3',
16 -create => 1);
18 # get a feature from somewhere
19 my $feature = Bio::SeqFeature::Generic->new(...);
21 # store it
22 $db->store($feature) or die "Couldn't store!";
24 =head1 DESCRIPTION
26 This is a faster version of the berkeleydb storage adaptor for
27 Bio::DB::SeqFeature::Store. It is used automatically when you create a
28 new database with the original berkeleydb adaptor. When opening a
29 database created under the original adaptor, the old code is used for
30 backward compatibility.
32 Please see L<Bio::DB::SeqFeature::Store::berkeleydb> for full usage
33 instructions.
35 =head1 BUGS
37 This is an early version, so there are certainly some bugs. Please
38 use the BioPerl bug tracking system to report bugs.
40 =head1 SEE ALSO
42 L<bioperl>,
43 L<Bio::DB::SeqFeature>,
44 L<Bio::DB::SeqFeature::Store>,
45 L<Bio::DB::SeqFeature::GFF3Loader>,
46 L<Bio::DB::SeqFeature::Segment>,
47 L<Bio::DB::SeqFeature::Store::memory>,
48 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
50 =head1 AUTHOR
52 Lincoln Stein E<lt>lincoln.stein@gmail.comE<gt>.
54 Copyright (c) 2009 Ontario Institute for Cancer Research
56 This library is free software; you can redistribute it and/or modify
57 it under the same terms as Perl itself.
59 =cut
61 use strict;
62 use base 'Bio::DB::SeqFeature::Store::berkeleydb';
63 use DB_File;
64 use Fcntl qw(O_RDWR O_CREAT :flock);
65 use Bio::DB::GFF::Util::Rearrange 'rearrange';
67 # can't have more sequence ids than this
68 use constant MAX_SEQUENCES => 1_000_000_000;
70 # used to construct the bin key
71 use constant C1 => 500_000_000; # limits chromosome length to 500 megabases
72 use constant C2 => 1000*C1; # at most 1000 chromosomes
74 use constant BINSIZE => 10_000;
75 use constant MININT => -999_999_999_999;
76 use constant MAXINT => 999_999_999_999;
77 use constant SUMMARY_BIN_SIZE => 1000;
79 sub version { return 3.0 }
81 sub open_index_dbs {
82 my $self = shift;
83 my ($flags,$create) = @_;
85 # Create the main index databases; these are DB_BTREE implementations with duplicates allowed.
86 $DB_BTREE->{flags} = R_DUP;
88 my $string_cmp = DB_File::BTREEINFO->new;
89 $string_cmp->{flags} = R_DUP;
90 $string_cmp->{compare} = sub { lc $_[0] cmp lc $_[1] };
92 my $numeric_cmp = DB_File::BTREEINFO->new;
93 $numeric_cmp->{flags} = R_DUP;
94 $numeric_cmp->{compare} = sub { $_[0] <=> $_[1] };
96 for my $idx ($self->_index_files) {
97 my $path = $self->_qualify("$idx.idx");
98 my %db;
99 my $dbtype = $idx eq 'locations' ? $numeric_cmp
100 :$idx eq 'summary' ? $numeric_cmp
101 :$idx eq 'types' ? $numeric_cmp
102 :$idx eq 'seqids' ? $DB_HASH
103 :$idx eq 'typeids' ? $DB_HASH
104 :$string_cmp;
106 tie(%db,'DB_File',$path,$flags,0666,$dbtype)
107 or $self->throw("Couldn't tie $path: $!");
108 %db = () if $create;
109 $self->index_db($idx=>\%db);
114 sub seqid_db { shift->index_db('seqids') }
115 sub typeid_db { shift->index_db('typeids') }
117 sub _delete_databases {
118 my $self = shift;
119 $self->SUPER::_delete_databases;
122 # given a seqid (name), return its denormalized numeric representation
123 sub seqid_id {
124 my $self = shift;
125 my $seqid = shift;
126 my $db = $self->seqid_db;
127 return $db->{lc $seqid};
130 sub add_seqid {
131 my $self = shift;
132 my $seqid = shift;
134 my $db = $self->seqid_db;
135 my $key = lc $seqid;
136 $db->{$key} = ++$db->{'.nextid'} unless exists $db->{$key};
137 die "Maximum number of sequence ids exceeded. This module can handle up to ",
138 MAX_SEQUENCES," unique ids" if $db->{$key} > MAX_SEQUENCES;
139 return $db->{$key};
142 # given a seqid (name), return its denormalized numeric representation
143 sub type_id {
144 my $self = shift;
145 my $typeid = shift;
146 my $db = $self->typeid_db;
147 return $db->{$typeid};
150 sub add_typeid {
151 my $self = shift;
152 my $typeid = shift;
154 my $db = $self->typeid_db;
155 my $key = lc $typeid;
156 $db->{$key} = ++$db->{'.nextid'} unless exists $db->{$key};
157 return $db->{$key};
160 sub _seq_ids {
161 my $self = shift;
162 if (my $fa = $self->{fasta_db}) {
163 if (my @s = eval {$fa->ids}) {
164 return @s;
167 my $l = $self->seqid_db or return;
168 return grep {!/^\./} keys %$l;
171 sub _index_files {
172 return (shift->SUPER::_index_files,'seqids','typeids','summary');
175 sub _update_indexes {
176 my $self = shift;
177 my $obj = shift;
178 defined (my $id = $obj->primary_id) or return;
179 $self->SUPER::_update_indexes($obj);
180 $self->_update_seqid_index($obj,$id);
183 sub _update_seqid_index {
184 my $self = shift;
185 my ($obj,$id,$delete) = @_;
186 my $seq_name = $obj->seq_id;
187 $self->add_seqid(lc $seq_name);
190 sub _update_type_index {
191 my $self = shift;
192 my ($obj,$id,$delete) = @_;
193 my $db = $self->index_db('types')
194 or $self->throw("Couldn't find 'types' index file");
196 my $key = $self->_obj_to_type($obj);
197 my $typeid = $self->add_typeid($key);
198 $self->update_or_delete($delete,$db,$typeid,$id);
201 sub _obj_to_type {
202 my $self = shift;
203 my $obj = shift;
204 my $tag = $obj->primary_tag;
205 my $source_tag = $obj->source_tag || '';
206 return unless defined $tag;
208 $tag .= ":$source_tag";
209 return lc $tag;
212 sub types {
213 my $self = shift;
214 eval "require Bio::DB::GFF::Typename"
215 unless Bio::DB::GFF::Typename->can('new');
216 my $db = $self->typeid_db;
217 return grep {!/^\./} map {Bio::DB::GFF::Typename->new($_)} keys %$db;
220 sub _id2type {
221 my $self = shift;
222 my $wanted_id = shift;
224 my $db = $self->typeid_db;
225 while (my($key,$id) = each %$db) {
226 next if $key =~ /^\./;
227 return $key if $id == $wanted_id;
229 return;
232 # return a hash of typeids that match a human-readable type
233 sub _matching_types {
234 my $self = shift;
235 my $types = shift;
236 my @types = ref $types eq 'ARRAY' ? @$types : $types;
237 my $db = $self->typeid_db;
239 my %result;
240 my @all_types;
242 for my $type (@types) {
243 my ($primary_tag,$source_tag);
244 if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
245 $primary_tag = $type->method;
246 $source_tag = $type->source;
247 } else {
248 ($primary_tag,$source_tag) = split ':',$type,2;
250 if (defined $source_tag) {
251 my $id = $db->{lc "$primary_tag:$source_tag"};
252 $result{$id}++ if defined $id;
253 } else {
254 @all_types = $self->types unless @all_types;
255 $result{$db->{$_}}++ foreach grep {/^$primary_tag:/} @all_types;
258 return \%result;
261 sub _update_location_index {
262 my $self = shift;
263 my ($obj,$id,$delete) = @_;
265 my $db = $self->index_db('locations')
266 or $self->throw("Couldn't find 'locations' index file");
268 my $seq_id = $obj->seq_id || '';
269 my $start = $obj->start || '';
270 my $end = $obj->end || '';
271 my $strand = $obj->strand;
272 my $bin_min = int $start/BINSIZE;
273 my $bin_max = int $end/BINSIZE;
275 my $typeid = $self->add_typeid($self->_obj_to_type($obj));
276 my $seq_no = $self->add_seqid($seq_id);
278 for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
279 my $key = $seq_no * MAX_SEQUENCES + $bin;
280 $self->update_or_delete($delete,$db,$key,pack("i5",$id,$start,$end,$strand,$typeid));
285 sub _features {
286 my $self = shift;
287 my ($seq_id,$start,$end,$strand,
288 $name,$class,$allow_aliases,
289 $types,
290 $attributes,
291 $range_type,
292 $iterator
293 ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
294 'NAME','CLASS','ALIASES',
295 ['TYPES','TYPE','PRIMARY_TAG'],
296 ['ATTRIBUTES','ATTRIBUTE'],
297 'RANGE_TYPE',
298 'ITERATOR',
299 ],@_);
301 my (@from,@where,@args,@group);
302 $range_type ||= 'overlaps';
304 my @result;
305 unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
306 my $is_indexed = $self->index_db('is_indexed');
307 @result = $is_indexed ? grep {$is_indexed->{$_}} keys %{$self->db}
308 : grep { !/^\./ }keys %{$self->db};
311 my %found = ();
312 my $result = 1;
314 if (defined($name)) {
315 # hacky backward compatibility workaround
316 undef $class if $class && $class eq 'Sequence';
317 $name = "$class:$name" if defined $class && length $class > 0;
318 $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
321 if (defined $seq_id) { # location with or without types
322 my $typelist = defined $types ? $self->_matching_types($types) : undef;
323 $result &&= $self->filter_by_type_and_location(
324 $seq_id, $start, $end, $strand, $range_type, $typelist, \%found
328 elsif (defined $types) { # types without location
329 $result &&= $self->filter_by_type($types,\%found);
332 if (defined $attributes) {
333 $result &&= $self->filter_by_attribute($attributes,\%found);
336 push @result,keys %found if $result;
337 return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
338 : map {$self->fetch($_)} @result;
341 sub filter_by_type {
342 my $self = shift;
343 my ($types,$filter) = @_;
344 my @types = ref $types eq 'ARRAY' ? @$types : $types;
346 my $index = $self->index_db('types');
347 my $db = tied(%$index);
349 my @results;
351 for my $type (@types) {
352 my ($primary_tag,$source_tag);
353 if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
354 $primary_tag = $type->method;
355 $source_tag = $type->source;
356 } else {
357 ($primary_tag,$source_tag) = split ':',$type,2;
359 $source_tag ||= '';
360 $primary_tag = quotemeta($primary_tag);
361 $source_tag = quotemeta($source_tag);
362 my $match = length $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
363 my $key = lc "$primary_tag:$source_tag";
364 my $value;
366 # If filter is already provided, then it is usually faster to
367 # fetch each object.
368 if (%$filter) {
369 for my $id (keys %$filter) {
370 my $obj = $self->_fetch($id) or next;
371 push @results,$id if $obj->type =~ /$match/i;
376 else {
377 my $types = $self->typeid_db;
378 my @typeids = map {$types->{$_}} grep {/$match/} keys %$types;
379 for my $t (@typeids) {
380 my $k = $t;
381 for (my $status = $db->seq($k,$value,R_CURSOR);
382 $status == 0 && $k == $t;
383 $status = $db->seq($k,$value,R_NEXT)) {
384 next if %$filter && !$filter->{$value}; # don't even bother
385 push @results,$value;
390 $self->update_filter($filter,\@results);
393 sub filter_by_type_and_location {
394 my $self = shift;
395 my ($seq_id,$start,$end,$strand,$range_type,$typelist,$filter) = @_;
396 $strand ||= 0;
398 my $index = $self->index_db('locations');
399 my $db = tied(%$index);
401 my $binstart = defined $start ? int $start/BINSIZE : 0;
402 my $binend = defined $end ? int $end/BINSIZE : MAX_SEQUENCES-1;
404 my %seenit;
405 my @results;
407 $start = MININT if !defined $start;
408 $end = MAXINT if !defined $end;
410 my $seq_no = $self->seqid_id($seq_id);
411 return unless defined $seq_no;
413 if ($range_type eq 'overlaps' or $range_type eq 'contains') {
414 my $keystart = $seq_no * MAX_SEQUENCES + $binstart;
415 my $keystop = $seq_no * MAX_SEQUENCES + $binend;
416 my $value;
418 for (my $status = $db->seq($keystart,$value,R_CURSOR);
419 $status == 0 && $keystart <= $keystop;
420 $status = $db->seq($keystart,$value,R_NEXT)) {
421 my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
422 next if $seenit{$id}++;
423 next if $strand && $fstrand != $strand;
424 next if $typelist && !$typelist->{$ftype};
425 if ($range_type eq 'overlaps') {
426 next unless $fend >= $start && $fstart <= $end;
428 elsif ($range_type eq 'contains') {
429 next unless $fstart >= $start && $fend <= $end;
431 next if %$filter && !$filter->{$id}; # don't bother
432 push @results,$id;
436 # for contained in, we look for features originating and terminating outside the specified range
437 # this is incredibly inefficient, but fortunately the query is rare (?)
438 elsif ($range_type eq 'contained_in') {
439 my $keystart = $seq_no * MAX_SEQUENCES;
440 my $keystop = $seq_no * MAX_SEQUENCES + $binstart;
441 my $value;
443 # do the left part of the range
444 for (my $status = $db->seq($keystart,$value,R_CURSOR);
445 $status == 0 && $keystart <= $keystop;
446 $status = $db->seq($keystart,$value,R_NEXT)) {
447 my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
448 next if $seenit{$id}++;
449 next if $strand && $fstrand != $strand;
450 next if $typelist && !$typelist->{$ftype};
451 next unless $fstart <= $start && $fend >= $end;
452 next if %$filter && !$filter->{$id}; # don't bother
453 push @results,$id;
456 # do the right part of the range
457 $keystart = $seq_no*MAX_SEQUENCES+$binend;
458 for (my $status = $db->seq($keystart,$value,R_CURSOR);
459 $status == 0;
460 $status = $db->seq($keystart,$value,R_NEXT)) {
461 my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
462 next if $seenit{$id}++;
463 next if $strand && $fstrand != $strand;
464 next unless $fstart <= $start && $fend >= $end;
465 next if $typelist && !$typelist->{$ftype};
466 next if %$filter && !$filter->{$id}; # don't bother
467 push @results,$id;
472 $self->update_filter($filter,\@results);
475 sub build_summary_statistics {
476 my $self = shift;
478 my $insert = $self->index_db('summary');
479 %$insert = ();
481 my $current_bin = -1;
482 my (%residuals,$last_bin);
484 my $le = -t \*STDERR ? "\r" : "\n";
486 print STDERR "\n";
488 # iterate through all the indexed features
489 my $sbs = SUMMARY_BIN_SIZE;
491 # Sadly we have to do this in two steps. In the first step, we sort
492 # features by typeid,seqid,start. In the second step, we read through
493 # this sorted list. To avoid running out of memory, we use a db_file
494 # temporary database
495 my $fh = File::Temp->new() or $self->throw("Could not create temporary file for sorting: $!");
496 my $name = $fh->filename;
497 my %sort;
498 my $num_cmp_tree = DB_File::BTREEINFO->new;
499 $num_cmp_tree->{compare} = sub { $_[0] <=> $_[1] };
500 $num_cmp_tree->{flags} = R_DUP;
501 my $s = tie %sort, 'DB_File', $name, O_CREAT|O_RDWR, 0666, $num_cmp_tree
502 or $self->throw("Could not create Berkeley DB in temporary file '$name': $!");
504 my $index = $self->index_db('locations');
505 my $db = tied(%$index);
506 my $keystart = 0;
507 my ($value,$count);
508 my %seenit;
510 for (my $status = $db->seq($keystart,$value,R_CURSOR);
511 $status == 0;
512 $status = $db->seq($keystart,$value,R_NEXT)) {
513 my ($id,$start,$end,$strand,$typeid) = unpack('i5',$value);
514 next if $seenit{$id}++;
516 print STDERR $count," features sorted$le" if ++$count % 1000 == 0;
517 my $seqid = int($keystart / MAX_SEQUENCES);
518 my $key = $self->_encode_summary_key($typeid,$seqid,$start-1);
519 $sort{$key}=$end;
521 print STDERR "COUNT = $count\n";
523 my ($current_type,$current_seqid,$end);
524 my $cum_count = 0;
526 $keystart = 0;
527 $count = 0;
529 # the second step allows us to iterate through this
530 for (my $status = $s->seq($keystart,$end,R_CURSOR);
531 $status == 0;
532 $status = $s->seq($keystart,$end,R_NEXT)) {
534 print STDERR $count," features processed$le" if ++$count % 1000 == 0;
535 my ($typeid,$seqid,$start) = $self->_decode_summary_key($keystart);
537 my $bin = int($start/$sbs);
539 # because the input is sorted by start, no more features will contribute to the
540 # current bin so we can dispose of it
541 if ($bin != $current_bin) {
542 if ($seqid != $current_seqid or $typeid != $current_type) {
543 # load all bins left over
544 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
545 %residuals = () ;
546 $cum_count = 0;
547 } else {
548 # load all up to current one
549 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid,$current_bin);
553 $last_bin = $current_bin;
554 ($current_seqid,$current_type,$current_bin) = ($seqid,$typeid,$bin);
556 # summarize across entire spanned region
557 my $last_bin = int(($end-1)/$sbs);
558 for (my $b=$bin;$b<=$last_bin;$b++) {
559 $residuals{$b}++;
563 # handle tail case
564 # load all bins left over
565 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
567 undef %sort;
568 undef $fh;
571 sub _load_bins {
572 my $self = shift;
573 my ($insert,$residuals,$cum_count,$typeid,$seqid,$stop_after) = @_;
574 for my $b (sort {$a<=>$b} keys %$residuals) {
575 last if defined $stop_after and $b > $stop_after;
576 $$cum_count += $residuals->{$b};
577 my $key = $self->_encode_summary_key($typeid,$seqid,$b);
578 $insert->{$key} = $$cum_count;
579 delete $residuals->{$b}; # no longer needed
583 sub coverage_array {
584 my $self = shift;
585 my ($seq_name,$start,$end,$types,$bins) =
586 rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
587 ['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_);
589 $bins ||= 1000;
590 $start ||= 1;
591 unless ($end) {
592 my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name");
593 $end = $segment->end;
596 my $binsize = ($end-$start+1)/$bins;
597 my $seqid = $self->seqid_id($seq_name) || 0;
599 return [] unless $seqid;
601 # where each bin starts
602 my @his_bin_array = map {$start + $binsize * $_} (0..$bins);
603 my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array;
605 my $interval_stats_idx = $self->index_db('summary');
606 my $db = tied(%$interval_stats_idx);
607 my $t = $self->_matching_types($types);
609 my (%bins,$report_tag);
610 for my $typeid (sort keys %$t) {
611 $report_tag ||= $typeid;
613 for (my $i=0;$i<@sum_bin_array;$i++) {
614 my $cum_count;
615 my $bin = $sum_bin_array[$i];
616 my $key = $self->_encode_summary_key($typeid,$seqid,$bin);
617 my $status = $db->seq($key,$cum_count,R_CURSOR);
618 next unless $status == 0;
619 push @{$bins{$typeid}},[$bin,$cum_count];
623 my @merged_bins;
624 my $firstbin = int(($start-1)/$binsize);
625 for my $type (keys %bins) {
626 my $arry = $bins{$type};
627 my $last_count = $arry->[0][1]-1;
628 my $last_bin = -1;
629 my $i = 0;
630 my $delta;
631 for my $b (@$arry) {
632 my ($bin,$count) = @$b;
633 $delta = $count - $last_count if $bin > $last_bin;
634 $merged_bins[$i++] = $delta;
635 $last_count = $count;
636 $last_bin = $bin;
639 my $returned_type = $self->_id2type($report_tag);
640 return wantarray ? (\@merged_bins,$returned_type) : \@merged_bins;
644 sub _encode_summary_key {
645 my $self = shift;
646 my ($typeid,$seqid,$bin) = @_;
647 $self->throw('Cannot index chromosomes larger than '.C1*SUMMARY_BIN_SIZE/1e6.' megabases')
648 if $bin > C1;
649 return ($typeid-1)*C2 + ($seqid-1)*C1 + $bin;
652 sub _decode_summary_key {
653 my $self = shift;
654 my $key = shift;
655 my $typeid = int($key/C2);
656 my $residual = $key%C2;
657 my $seqid = int($residual/C1);
658 my $bin = $residual%C1;
659 return ($typeid+1,$seqid+1,$bin);