1 package Bio
::DB
::GFF
::Adaptor
::memory
;
5 Bio::DB::GFF::Adaptor::memory -- Bio::DB::GFF database adaptor for in-memory databases
10 my $db = Bio::DB::GFF->new(-adaptor=> 'memory',
11 -gff => 'my_features.gff',
17 my $db = Bio::DB::GFF->new(-adaptor=>'memory');
18 $db->load_gff_file('my_features.gff');
19 $db->load_fasta_file('my_dna.fa');
21 See L<Bio::DB::GFF> for other methods.
25 This adaptor implements an in-memory version of Bio::DB::GFF. It can be used to
26 store and retrieve SHORT GFF files. It inherits from Bio::DB::GFF.
30 Use Bio::DB::GFF-E<gt>new() to construct new instances of this class.
31 Three named arguments are recommended:
35 -adaptor Set to "memory" to create an instance of this class.
36 -gff Read the indicated file or directory of .gff file.
37 -fasta Read the indicated file or directory of fasta files.
38 -dir Indicates a directory containing .gff and .fa files
40 If you use the -dir option and the indicated directory is writable by
41 the current process, then this library will create a FASTA file index
42 that greatly diminishes the memory usage of this module.
44 Alternatively you may create an empty in-memory object using just the
45 -adaptor=E<gt>'memory' argument and then call the load_gff_file() and
46 load_fasta_file() methods to load GFF and/or sequence
47 information. This is recommended in CGI/mod_perl/fastCGI environments
48 because these methods do not modify STDIN, unlike the constructor.
52 See L<Bio::DB::GFF> for inherited methods.
60 L<Bio::DB::GFF>, L<bioperl>
64 Shuly Avraham E<lt>avraham@cshl.orgE<gt>.
66 Copyright (c) 2002 Cold Spring Harbor Laboratory.
68 This library is free software; you can redistribute it and/or modify
69 it under the same terms as Perl itself.
74 # AUTHOR: Shulamit Avraham
75 # This module needs to be cleaned up and documented
77 # Bio::DB::GFF::Adaptor::memory -- in-memory db adaptor
78 # implements the low level handling of data which stored in memory.
79 # This adaptor implements a specific in memory schema that is compatible with Bio::DB::GFF.
80 # Inherits from Bio::DB::GFF.
83 use Bio
::DB
::GFF
::Util
::Rearrange
; # for rearrange()
84 use Bio
::DB
::GFF
::Adaptor
::memory
::iterator
;
85 use File
::Basename
'dirname';
86 use Bio
::DB
::GFF
::Adaptor
::memory
::feature_serializer
qw(@hash2array_map);
89 use constant MAX_SEGMENT => 1_000_000_000; # the largest a segment can get
91 use base qw(Bio::DB::GFF);
95 my ($file,$fasta,$dbdir,$preferred_groups) = rearrange
([
98 [qw(DSN DB DIR DIRECTORY)],
103 my $self = bless{ data
=> [] },$class;
104 $self->preferred_groups($preferred_groups) if defined $preferred_groups;
107 $self->load_gff($file) if $file;
108 $self->load_or_store_fasta($fasta) if $fasta;
112 sub load_or_store_fasta
{
115 if ((-f
$fasta && -w dirname
($fasta))
117 (-d
$fasta && -w
$fasta)) {
118 require Bio
::DB
::Fasta
;
119 my $dna_db = eval {Bio
::DB
::Fasta
->new($fasta);}
120 or warn "$@\nCan't open sequence file(s). Use -gff instead of -dir if you wish to load features without sequence.\n";
121 $dna_db && $self->dna_db($dna_db);
123 $self->load_fasta($fasta);
129 my $d = $self->{dna_db
};
130 $self->{dna_db
} = shift if @_;
134 sub insert_sequence
{
136 my($id,$offset,$seq) = @_;
137 $self->{dna
}{$id} .= $seq;
140 # low-level fetch of a DNA substring given its
141 # name, class and the desired range.
144 my ($id,$start,$stop,$class) = @_;
145 if (my $dna_db = $self->dna_db) {
146 return $dna_db->seq($id,$start=>$stop);
148 return '' unless $self->{dna
};
150 return $self->{dna
}{$id} unless defined $start || defined $stop;
151 $start = 1 if !defined $start;
154 if ($start > $stop) {
156 ($start,$stop) = ($stop,$start);
158 my $dna = substr($self->{dna
}{$id},$start-1,$stop-$start+1);
160 $dna =~ tr/gatcGATC/ctagCTAG/;
177 foreach my $arrayref (values %{$self->{tmp
}}) {
178 foreach (@
$arrayref) {$_->{feature_id
} = $idx++; }
179 push @
{$self->{data
}},@
$arrayref;
184 # this method loads the feature as a hash into memory -
185 # keeps an array of features-hashes as an in-memory db
188 my $feature_hash = shift;
189 $feature_hash->{strand
} = '' if $feature_hash->{strand
} && $feature_hash->{strand
} eq '.';
190 $feature_hash->{phase
} = '' if $feature_hash->{phase
} && $feature_hash->{phase
} eq '.';
191 $feature_hash->{gclass
} = 'Sequence' unless length $feature_hash->{gclass
} > 0;
192 # sort by group please
193 push @
{$self->{tmp
}{$feature_hash->{gclass
},$feature_hash->{gname
}}},$feature_hash;
196 # given sequence name, return (reference,start,stop,strand)
199 my ($name,$class,$refseq) = @_;
203 if ($name =~ /[*?]/) { # uh oh regexp time
204 $name = quotemeta($name);
205 $name =~ s/\\\*/.*/g;
206 $name =~ s/\\\?/.?/g;
210 # Find all features that have the requested name and class.
211 # Sort them by reference point.
212 for my $feature (@
{$self->{data
}}) {
214 my $no_match_class_name;
215 my $empty_class_name;
216 my $class_matches = !defined($feature->{gclass
}) ||
217 length($feature->{gclass
}) == 0 ||
218 $feature->{gclass
} eq $class;
220 if (defined $feature->{gname
}) {
221 my $matches = $class_matches
222 && ($regexp ?
$feature->{gname
} =~ /$name/i : lc($feature->{gname
}) eq lc($name));
223 $no_match_class_name = !$matches; # to accomodate Shuly's interesting logic
227 $empty_class_name = 1;
230 if ($no_match_class_name){
231 my $feature_attributes = $feature->{attributes
};
232 my $attributes = {Alias
=> $name};
233 if (!$self->_matching_attributes($feature_attributes,$attributes)){
238 push @
{$refs{$feature->{ref}}},$feature;
241 # find out how many reference points we recovered
243 $self->error("$name not found in database");
247 # compute min and max
248 my ($ref) = keys %refs;
249 my @found = @
{$refs{$ref}};
250 my ($strand,$start,$stop);
253 foreach my $ref (keys %refs) {
254 next if defined($refseq) and lc($ref) ne lc($refseq);
255 my @found = @
{$refs{$ref}};
256 my ($strand,$start,$stop,$name);
258 $strand ||= $_->{strand
};
259 $strand = '+' if $strand && $strand eq '.';
260 $start = $_->{start
} if !defined($start) || $start > $_->{start
};
261 $stop = $_->{stop
} if !defined($stop) || $stop < $_->{stop
};
262 $name ||= $_->{gname
};
264 push @found_segments,[$ref,$class,$start,$stop,$strand,$name];
268 return \
@found_segments;
273 my ($search_string,$limit) = @_;
275 $search_string =~ tr/*?//d;
278 my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g;
279 my $search = join '|',@words;
281 for my $feature (@
{$self->{data
}}) {
282 next unless defined $feature->{gclass
} && defined $feature->{gname
}; # ignore NULL objects
283 next unless $feature->{attributes
};
284 my @attributes = @
{$feature->{attributes
}};
285 my @values = map {$_->[1]} @attributes;
286 my $value = "@values";
289 my @hits = $value =~ /($w)/ig;
292 next unless $matches;
294 my $relevance = 10 * $matches;
295 my $featname = Bio
::DB
::GFF
::Featname
->new($feature->{gclass
}=>$feature->{gname
});
297 $note = join ' ',map {$_->[1]} grep {$_->[0] eq 'Note'} @
{$feature->{attributes
}};
298 $note .= join ' ',grep /$search/,map {$_->[1]} grep {$_->[0] ne 'Note'} @
{$feature->{attributes
}};
299 my $type = Bio
::DB
::GFF
::Typename
->new($feature->{method
},$feature->{source
});
300 push @results,[$featname,$note,$relevance,$type];
301 last if defined $limit && @results >= $limit;
304 #added result filtering so that this method returns the expected results
305 #this section of code used to be in GBrowse's do_keyword_search method
307 my $match_sub = 'sub {';
308 foreach (split /\s+/,$search_string) {
309 $match_sub .= "return unless \$_[0] =~ /\Q$_\E/i; ";
312 my $match = eval $match_sub;
314 my @matches = grep { $match->($_->[1]) } @results;
319 sub _delete_features
{
321 my @feature_ids = sort {$b<=>$a} @_;
323 foreach (@feature_ids) {
324 next unless $_ >= 0 && $_ < @
{$self->{data
}};
325 $removed += defined splice(@
{$self->{data
}},$_,1);
332 my $delete_spec = shift;
333 my $ranges = $delete_spec->{segments
} || [];
334 my $types = $delete_spec->{types
} || [];
335 my $force = $delete_spec->{force
};
336 my $range_type = $delete_spec->{range_type
};
340 my @args = @
$types ?
(-type
=>$types) : ();
341 push @args,(-range_type
=> $range_type);
342 my %ids_to_remove = map {$_->id => 1} map {$_->features(@args)} @
$ranges;
343 $deleted = $self->delete_features(keys %ids_to_remove);
345 my %ids_to_remove = map {$_->id => 1} $self->features(-type
=>$types);
346 $deleted = $self->delete_features(keys %ids_to_remove);
348 $self->throw("This operation would delete all feature data and -force not specified")
350 $deleted = @
{$self->{data
}};
351 @
{$self->{data
}} = ();
358 # Some GFF version 2 files use the groups column to store a series of
359 # attribute/value pairs. In this interpretation of GFF, the first such
360 # pair is treated as the primary group for the feature; subsequent pairs
361 # are treated as attributes. Two attributes have special meaning:
362 # "Note" is for backward compatibility and is used for unstructured text
363 # remarks. "Alias" is considered as a synonym for the feature name.
364 # If no name is provided, then attributes() returns a flattened hash, of
365 # attribute=>value pairs.
369 my ($feature_id,$tag) = @_;
372 #my $feature = ${$self->{data}}[$feature_id];
373 my $feature = $self->_basic_features_by_id($feature_id);
376 for my $attr (@
{$feature->{attributes
}}) {
377 my ($attr_name,$attr_value) = @
$attr ;
378 if (defined($tag) && lc($attr_name) eq lc($tag)){push @result,$attr_value;}
379 elsif (!defined($tag)) {push @result,($attr_name,$attr_value);}
385 #sub get_feature_by_attribute{
386 sub _feature_by_attribute
{
388 my ($attributes,$callback) = @_;
389 $callback || $self->throw('must provide a callback argument');
392 my $feature_group_id = undef;
394 for my $feature (@
{$self->{data
}}) {
397 for my $attr (@
{$feature->{attributes
}}) {
398 my ($attr_name,$attr_value) = @
$attr ;
399 #there could be more than one set of attributes......
400 foreach (keys %$attributes) {
401 if (lc($_) eq lc($attr_name) && lc($attributes->{$_}) eq lc($attr_value)) {
402 $callback->($self->_hash_to_array($feature));
412 # This is the low-level method that is called to retrieve GFF lines from
413 # the database. It is responsible for retrieving features that satisfy
414 # range and feature type criteria, and passing the GFF fields to a
415 # callback subroutine.
420 my ($search,$options,$callback) = @_;
424 $found_features = $self->_get_features_by_search_options($search,$options);
426 # only true if the sort by group option was specified
427 @
{$found_features} = sort {lc("$a->{gclass}:$a->{gname}") cmp lc("$b->{gclass}:$b->{gname}")}
428 @
{$found_features} if $options->{sort_by_group
} ;
430 for my $feature (@
{$found_features}) { # only true if the sort by group option was specified
433 $self->_hash_to_array($feature)
441 # Low level implementation of fetching a named feature.
442 # GFF annotations are named using the group class and name fields.
443 # May return zero, one, or several Bio::DB::GFF::Feature objects.
445 =head2 _feature_by_name
447 Title : _feature_by_name
448 Usage : $db->get_features_by_name($name,$class,$callback)
449 Function: get a list of features by name and class
450 Returns : count of number of features retrieved
451 Args : name of feature, class of feature, and a callback
454 This method is used internally. The callback arguments are those used
459 sub _feature_by_name
{
461 my ($class,$name,$location,$callback) = @_;
462 $callback || $self->throw('must provide a callback argument');
466 if ($name =~ /[*?]/) { # uh oh regexp time
467 $name = quotemeta($name);
468 $name =~ s/\\\*/.*/g;
469 $name =~ s/\\\?/.?/g;
473 for my $feature (@
{$self->{data
}}) {
474 next unless ($regexp && $feature->{gname
} =~ /$name/i) || lc($feature->{gname
}) eq lc($name);
475 next if defined($feature->{gclass
}) && length($feature->{gclass
}) > 0 && $feature->{gclass
} ne $class;
478 next if $location->[0] ne $feature->{ref};
479 next if $location->[1] && $location->[1] > $feature->{stop
};
480 next if $location->[2] && $location->[2] < $feature->{start
};
483 $callback->($self->_hash_to_array($feature),0);
488 # Low level implementation of fetching a feature by it's id.
489 # The id of the feature as implemented in the in-memory db, is the location of the
490 # feature in the features hash array.
493 my ($ids,$type,$callback) = @_;
494 $callback || $self->throw('must provide a callback argument');
496 my $feature_group_id = undef;
499 if ($type eq 'feature'){
500 for my $feature_id (@
$ids){
501 my $feature = $self->_basic_features_by_id($feature_id);
502 $callback->($self->_hash_to_array($feature)) if $callback;
508 sub _basic_features_by_id
{
512 $ids = [$ids] unless ref $ids =~ /ARRAY/;
515 for my $feature_id (@
$ids){
516 push @result, ${$self->{data
}}[$feature_id];
518 return wantarray() ?
@result : $result[0];
521 # This method is similar to get_features(), except that it returns an
522 # iterator across the query.
523 # See Bio::DB::GFF::Adaptor::memory::iterator.
525 sub get_features_iterator
{
527 my ($search,$options,$callback) = @_;
528 $callback || $self->throw('must provide a callback argument');
530 my $results = $self->_get_features_by_search_options($search,$options);
531 my $results_array = $self->_convert_feature_hash_to_array($results);
533 return Bio
::DB
::GFF
::Adaptor
::memory
::iterator
->new($results_array,$callback);
537 # This method is responsible for fetching the list of feature type names.
538 # The query may be limited to a particular range, in
539 # which case the range is indicated by a landmark sequence name and
540 # class and its subrange, if any. These arguments may be undef if it is
541 # desired to retrieve all feature types.
543 # If the count flag is false, the method returns a simple list of
544 # Bio::DB::GFF::Typename objects. If $count is true, the method returns
545 # a list of $name=>$count pairs, where $count indicates the number of
546 # times this feature occurs in the range.
550 my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_;
554 for my $feature (@
{$self->{data
}}) {
555 my $feature_start = $feature->{start
};
556 my $feature_stop = $feature->{stop
};
557 my $feature_ref = $feature->{ref};
558 my $feature_class = $feature->{class};
559 my $feature_method = $feature->{method
};
560 my $feature_source = $feature->{source
};
562 if (defined $srcseq){
563 next unless lc($feature_ref) eq lc($srcseq);
567 next unless defined $feature_class && $feature_class eq $class ;
570 # the requested range should OVERLAP the retrieved features
571 if (defined $start or defined $stop) {
572 $start = 1 unless defined $start;
573 $stop = MAX_SEGMENT
unless defined $stop;
574 next unless $feature_stop >= $start && $feature_start <= $stop;
577 if (defined $typelist && @
$typelist){
578 next unless $self->_matching_typelist($feature_method,$feature_source,$typelist);
581 my $type = Bio
::DB
::GFF
::Typename
->new($feature_method,$feature_source);
587 return $want_count ?
%result : values %obj;
593 for my $feature (@
{$self->{data
}}) {
594 $classes{$feature->{gclass
}}++;
596 my @classes = sort keys %classes;
600 # Internal method that performs a search on the features array,
601 # sequentialy retrieves the features, and performs a check on each feature
602 # according to the search options.
603 sub _get_features_by_search_options
{
605 my ($self, $search,$options) = @_;
606 my ($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes) =
607 (@
{$search}{qw(rangetype refseq refclass start stop types)},
608 @
{$options}{qw(sparse sort_by_group ATTRIBUTES)}) ;
611 my $data = $self->{data
};
613 my $feature_id = -1 ;
614 my $feature_group_id = undef;
616 for my $feature (@
{$data}) {
620 my $feature_start = $feature->{start
};
621 my $feature_stop = $feature->{stop
};
622 my $feature_ref = $feature->{ref};
624 if (defined $refseq){
625 next unless lc($feature_ref) eq lc($refseq);
628 if (defined $start or defined $stop) {
629 $start = 0 unless defined($start);
630 $stop = MAX_SEGMENT
unless defined($stop);
632 if ($rangetype eq 'overlaps') {
633 next unless $feature_stop >= $start && $feature_start <= $stop;
634 } elsif ($rangetype eq 'contains') {
635 next unless $feature_start >= $start && $feature_stop <= $stop;
636 } elsif ($rangetype eq 'contained_in') {
637 next unless $feature_start <= $start && $feature_stop >= $stop;
639 next unless $feature_start == $start && $feature_stop == $stop;
644 my $feature_source = $feature->{source
};
645 my $feature_method = $feature->{method
};
647 if (defined $types && @
$types){
648 next unless $self->_matching_typelist($feature_method,$feature_source,$types);
651 my $feature_attributes = $feature->{attributes
};
652 if (defined $attributes){
653 next unless $self->_matching_attributes($feature_attributes,$attributes);
656 # if we get here, then we have a feature that meets the criteria.
657 # Then we just push onto an array
658 # of found features and continue.
660 my $found_feature = $feature ;
661 $found_feature->{feature_id
} = $feature_id;
662 $found_feature->{group_id
} = $feature_group_id;
663 push @found_features,$found_feature;
666 return \
@found_features;
671 my ($self,$feature_hash) = @_;
672 my @array = @
{$feature_hash}{@hash2array_map};
673 return wantarray ?
@array : \
@array;
676 # this subroutine is needed for convertion of the feature from hash to array in order to
677 # pass it to the callback subroutine
678 sub _convert_feature_hash_to_array
{
679 my ($self, $feature_hash_array) = @_;
680 my @features_array_array = map {scalar $self->_hash_to_array($_)} @
$feature_hash_array;
681 return \
@features_array_array;
684 sub _matching_typelist
{
685 my ($self, $feature_method,$feature_source,$typelist) = @_;
686 foreach (@
$typelist) {
687 my ($search_method,$search_source) = @
$_;
688 next if lc($search_method) ne lc($feature_method);
689 next if defined($search_source) && lc($search_source) ne lc($feature_source);
695 sub _matching_attributes
{
696 my ($self, $feature_attributes,$attributes) = @_ ;
697 foreach (keys %$attributes) {
698 return 0 if !_match_all_attr_in_feature
($_,$attributes->{$_},$feature_attributes)
703 sub _match_all_attr_in_feature
{
704 my ($attr_name,$attr_value,$feature_attributes) = @_;
705 for my $attr (@
$feature_attributes) {
706 my ($feature_attr_name,$feature_attr_value) = @
$attr ;
707 next if ($attr_name ne $feature_attr_name || $attr_value ne $feature_attr_value);
714 sub do_initialize
{ 1; }
715 sub get_feature_by_group_id
{ 1; }