tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / GFF / Adaptor / memory.pm
blobf7a6d1b6f013ba9c02298722964b07d1fb103e3d
1 package Bio::DB::GFF::Adaptor::memory;
3 =head1 NAME
5 Bio::DB::GFF::Adaptor::memory -- Bio::DB::GFF database adaptor for in-memory databases
7 =head1 SYNOPSIS
9 use Bio::DB::GFF;
10 my $db = Bio::DB::GFF->new(-adaptor=> 'memory',
11 -gff => 'my_features.gff',
12 -fasta => 'my_dna.fa'
17 my $db = Bio::DB::GFF->new(-adaptor=>'memory');
18 $db->load_gff_file('my_features.gff');
19 $db->load_fasta_file('my_dna.fa');
21 See L<Bio::DB::GFF> for other methods.
23 =head1 DESCRIPTION
25 This adaptor implements an in-memory version of Bio::DB::GFF. It can be used to
26 store and retrieve SHORT GFF files. It inherits from Bio::DB::GFF.
28 =head1 CONSTRUCTOR
30 Use Bio::DB::GFF-E<gt>new() to construct new instances of this class.
31 Three named arguments are recommended:
33 Argument Description
35 -adaptor Set to "memory" to create an instance of this class.
36 -gff Read the indicated file or directory of .gff file.
37 -fasta Read the indicated file or directory of fasta files.
38 -dir Indicates a directory containing .gff and .fa files
40 If you use the -dir option and the indicated directory is writable by
41 the current process, then this library will create a FASTA file index
42 that greatly diminishes the memory usage of this module.
44 Alternatively you may create an empty in-memory object using just the
45 -adaptor=E<gt>'memory' argument and then call the load_gff_file() and
46 load_fasta_file() methods to load GFF and/or sequence
47 information. This is recommended in CGI/mod_perl/fastCGI environments
48 because these methods do not modify STDIN, unlike the constructor.
50 =head1 METHODS
52 See L<Bio::DB::GFF> for inherited methods.
54 =head1 BUGS
56 none ;-)
58 =head1 SEE ALSO
60 L<Bio::DB::GFF>, L<bioperl>
62 =head1 AUTHOR
64 Shuly Avraham E<lt>avraham@cshl.orgE<gt>.
66 Copyright (c) 2002 Cold Spring Harbor Laboratory.
68 This library is free software; you can redistribute it and/or modify
69 it under the same terms as Perl itself.
71 =cut
73 use strict;
74 # $Id$
75 # AUTHOR: Shulamit Avraham
76 # This module needs to be cleaned up and documented
78 # Bio::DB::GFF::Adaptor::memory -- in-memory db adaptor
79 # implements the low level handling of data which stored in memory.
80 # This adaptor implements a specific in memory schema that is compatible with Bio::DB::GFF.
81 # Inherits from Bio::DB::GFF.
84 use Bio::DB::GFF::Util::Rearrange; # for rearrange()
85 use Bio::DB::GFF::Adaptor::memory::iterator;
86 use File::Basename 'dirname';
87 use Bio::DB::GFF::Adaptor::memory::feature_serializer qw(@hash2array_map);
90 use constant MAX_SEGMENT => 1_000_000_000; # the largest a segment can get
92 use base qw(Bio::DB::GFF);
94 sub new {
95 my $class = shift ;
96 my ($file,$fasta,$dbdir,$preferred_groups) = rearrange([
97 [qw(GFF FILE)],
98 'FASTA',
99 [qw(DSN DB DIR DIRECTORY)],
100 'PREFERRED_GROUPS',
101 ],@_);
103 # fill in object
104 my $self = bless{ data => [] },$class;
105 $self->preferred_groups($preferred_groups) if defined $preferred_groups;
106 $file ||= $dbdir;
107 $fasta ||= $dbdir;
108 $self->load_gff($file) if $file;
109 $self->load_or_store_fasta($fasta) if $fasta;
110 return $self;
113 sub load_or_store_fasta {
114 my $self = shift;
115 my $fasta = shift;
116 if ((-f $fasta && -w dirname($fasta))
118 (-d $fasta && -w $fasta)) {
119 require Bio::DB::Fasta;
120 my $dna_db = eval {Bio::DB::Fasta->new($fasta);}
121 or warn "$@\nCan't open sequence file(s). Use -gff instead of -dir if you wish to load features without sequence.\n";
122 $dna_db && $self->dna_db($dna_db);
123 } else {
124 $self->load_fasta($fasta);
128 sub dna_db {
129 my $self = shift;
130 my $d = $self->{dna_db};
131 $self->{dna_db} = shift if @_;
135 sub insert_sequence {
136 my $self = shift;
137 my($id,$offset,$seq) = @_;
138 $self->{dna}{$id} .= $seq;
141 # low-level fetch of a DNA substring given its
142 # name, class and the desired range.
143 sub get_dna {
144 my $self = shift;
145 my ($id,$start,$stop,$class) = @_;
146 if (my $dna_db = $self->dna_db) {
147 return $dna_db->seq($id,$start=>$stop);
149 return '' unless $self->{dna};
151 return $self->{dna}{$id} unless defined $start || defined $stop;
152 $start = 1 if !defined $start;
154 my $reversed = 0;
155 if ($start > $stop) {
156 $reversed++;
157 ($start,$stop) = ($stop,$start);
159 my $dna = substr($self->{dna}{$id},$start-1,$stop-$start+1);
160 if ($reversed) {
161 $dna =~ tr/gatcGATC/ctagCTAG/;
162 $dna = reverse $dna;
165 $dna;
168 sub setup_load {
169 my $self = shift;
170 $self->{tmp} = {};
171 $self->{data} = [];
175 sub finish_load {
176 my $self = shift;
177 my $idx = 0;
178 foreach my $arrayref (values %{$self->{tmp}}) {
179 foreach (@$arrayref) {$_->{feature_id} = $idx++; }
180 push @{$self->{data}},@$arrayref;
185 # this method loads the feature as a hash into memory -
186 # keeps an array of features-hashes as an in-memory db
187 sub load_gff_line {
188 my $self = shift;
189 my $feature_hash = shift;
190 $feature_hash->{strand} = '' if $feature_hash->{strand} && $feature_hash->{strand} eq '.';
191 $feature_hash->{phase} = '' if $feature_hash->{phase} && $feature_hash->{phase} eq '.';
192 $feature_hash->{gclass} = 'Sequence' unless length $feature_hash->{gclass} > 0;
193 # sort by group please
194 push @{$self->{tmp}{$feature_hash->{gclass},$feature_hash->{gname}}},$feature_hash;
197 # given sequence name, return (reference,start,stop,strand)
198 sub get_abscoords {
199 my $self = shift;
200 my ($name,$class,$refseq) = @_;
201 my %refs;
202 my $regexp;
204 if ($name =~ /[*?]/) { # uh oh regexp time
205 $name = quotemeta($name);
206 $name =~ s/\\\*/.*/g;
207 $name =~ s/\\\?/.?/g;
208 $regexp++;
211 # Find all features that have the requested name and class.
212 # Sort them by reference point.
213 for my $feature (@{$self->{data}}) {
215 my $no_match_class_name;
216 my $empty_class_name;
217 my $class_matches = !defined($feature->{gclass}) ||
218 length($feature->{gclass}) == 0 ||
219 $feature->{gclass} eq $class;
221 if (defined $feature->{gname}) {
222 my $matches = $class_matches
223 && ($regexp ? $feature->{gname} =~ /$name/i : lc($feature->{gname}) eq lc($name));
224 $no_match_class_name = !$matches; # to accomodate Shuly's interesting logic
227 else{
228 $empty_class_name = 1;
231 if ($no_match_class_name){
232 my $feature_attributes = $feature->{attributes};
233 my $attributes = {Alias => $name};
234 if (!$self->_matching_attributes($feature_attributes,$attributes)){
235 next;
239 push @{$refs{$feature->{ref}}},$feature;
242 # find out how many reference points we recovered
243 if (! %refs) {
244 $self->error("$name not found in database");
245 return;
248 # compute min and max
249 my ($ref) = keys %refs;
250 my @found = @{$refs{$ref}};
251 my ($strand,$start,$stop);
253 my @found_segments;
254 foreach my $ref (keys %refs) {
255 next if defined($refseq) and lc($ref) ne lc($refseq);
256 my @found = @{$refs{$ref}};
257 my ($strand,$start,$stop,$name);
258 foreach (@found) {
259 $strand ||= $_->{strand};
260 $strand = '+' if $strand && $strand eq '.';
261 $start = $_->{start} if !defined($start) || $start > $_->{start};
262 $stop = $_->{stop} if !defined($stop) || $stop < $_->{stop};
263 $name ||= $_->{gname};
265 push @found_segments,[$ref,$class,$start,$stop,$strand,$name];
269 return \@found_segments;
272 sub search_notes {
273 my $self = shift;
274 my ($search_string,$limit) = @_;
276 $search_string =~ tr/*?//d;
278 my @results;
279 my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g;
280 my $search = join '|',@words;
282 for my $feature (@{$self->{data}}) {
283 next unless defined $feature->{gclass} && defined $feature->{gname}; # ignore NULL objects
284 next unless $feature->{attributes};
285 my @attributes = @{$feature->{attributes}};
286 my @values = map {$_->[1]} @attributes;
287 my $value = "@values";
288 my $matches = 0;
289 for my $w (@words) {
290 my @hits = $value =~ /($w)/ig;
291 $matches += @hits;
293 next unless $matches;
295 my $relevance = 10 * $matches;
296 my $featname = Bio::DB::GFF::Featname->new($feature->{gclass}=>$feature->{gname});
297 my $note;
298 $note = join ' ',map {$_->[1]} grep {$_->[0] eq 'Note'} @{$feature->{attributes}};
299 $note .= join ' ',grep /$search/,map {$_->[1]} grep {$_->[0] ne 'Note'} @{$feature->{attributes}};
300 my $type = Bio::DB::GFF::Typename->new($feature->{method},$feature->{source});
301 push @results,[$featname,$note,$relevance,$type];
302 last if defined $limit && @results >= $limit;
305 #added result filtering so that this method returns the expected results
306 #this section of code used to be in GBrowse's do_keyword_search method
308 my $match_sub = 'sub {';
309 foreach (split /\s+/,$search_string) {
310 $match_sub .= "return unless \$_[0] =~ /\Q$_\E/i; ";
312 $match_sub .= "};";
313 my $match = eval $match_sub;
315 my @matches = grep { $match->($_->[1]) } @results;
317 return @matches;
320 sub _delete_features {
321 my $self = shift;
322 my @feature_ids = sort {$b<=>$a} @_;
323 my $removed = 0;
324 foreach (@feature_ids) {
325 next unless $_ >= 0 && $_ < @{$self->{data}};
326 $removed += defined splice(@{$self->{data}},$_,1);
328 $removed;
331 sub _delete {
332 my $self = shift;
333 my $delete_spec = shift;
334 my $ranges = $delete_spec->{segments} || [];
335 my $types = $delete_spec->{types} || [];
336 my $force = $delete_spec->{force};
337 my $range_type = $delete_spec->{range_type};
339 my $deleted = 0;
340 if (@$ranges) {
341 my @args = @$types ? (-type=>$types) : ();
342 push @args,(-range_type => $range_type);
343 my %ids_to_remove = map {$_->id => 1} map {$_->features(@args)} @$ranges;
344 $deleted = $self->delete_features(keys %ids_to_remove);
345 } elsif (@$types) {
346 my %ids_to_remove = map {$_->id => 1} $self->features(-type=>$types);
347 $deleted = $self->delete_features(keys %ids_to_remove);
348 } else {
349 $self->throw("This operation would delete all feature data and -force not specified")
350 unless $force;
351 $deleted = @{$self->{data}};
352 @{$self->{data}} = ();
354 $deleted;
357 # attributes -
359 # Some GFF version 2 files use the groups column to store a series of
360 # attribute/value pairs. In this interpretation of GFF, the first such
361 # pair is treated as the primary group for the feature; subsequent pairs
362 # are treated as attributes. Two attributes have special meaning:
363 # "Note" is for backward compatibility and is used for unstructured text
364 # remarks. "Alias" is considered as a synonym for the feature name.
365 # If no name is provided, then attributes() returns a flattened hash, of
366 # attribute=>value pairs.
368 sub do_attributes{
369 my $self = shift;
370 my ($feature_id,$tag) = @_;
371 my $attr ;
373 #my $feature = ${$self->{data}}[$feature_id];
374 my $feature = $self->_basic_features_by_id($feature_id);
376 my @result;
377 for my $attr (@{$feature->{attributes}}) {
378 my ($attr_name,$attr_value) = @$attr ;
379 if (defined($tag) && lc($attr_name) eq lc($tag)){push @result,$attr_value;}
380 elsif (!defined($tag)) {push @result,($attr_name,$attr_value);}
382 return @result;
386 #sub get_feature_by_attribute{
387 sub _feature_by_attribute{
388 my $self = shift;
389 my ($attributes,$callback) = @_;
390 $callback || $self->throw('must provide a callback argument');
391 my $count = 0;
392 my $feature_id = -1;
393 my $feature_group_id = undef;
395 for my $feature (@{$self->{data}}) {
397 $feature_id++;
398 for my $attr (@{$feature->{attributes}}) {
399 my ($attr_name,$attr_value) = @$attr ;
400 #there could be more than one set of attributes......
401 foreach (keys %$attributes) {
402 if (lc($_) eq lc($attr_name) && lc($attributes->{$_}) eq lc($attr_value)) {
403 $callback->($self->_hash_to_array($feature));
404 $count++;
413 # This is the low-level method that is called to retrieve GFF lines from
414 # the database. It is responsible for retrieving features that satisfy
415 # range and feature type criteria, and passing the GFF fields to a
416 # callback subroutine.
418 sub get_features{
419 my $self = shift;
420 my $count = 0;
421 my ($search,$options,$callback) = @_;
423 my $found_features;
425 $found_features = $self->_get_features_by_search_options($search,$options);
427 # only true if the sort by group option was specified
428 @{$found_features} = sort {lc("$a->{gclass}:$a->{gname}") cmp lc("$b->{gclass}:$b->{gname}")}
429 @{$found_features} if $options->{sort_by_group} ;
431 for my $feature (@{$found_features}) { # only true if the sort by group option was specified
432 $count++;
433 $callback->(
434 $self->_hash_to_array($feature)
438 return $count;
442 # Low level implementation of fetching a named feature.
443 # GFF annotations are named using the group class and name fields.
444 # May return zero, one, or several Bio::DB::GFF::Feature objects.
446 =head2 _feature_by_name
448 Title : _feature_by_name
449 Usage : $db->get_features_by_name($name,$class,$callback)
450 Function: get a list of features by name and class
451 Returns : count of number of features retrieved
452 Args : name of feature, class of feature, and a callback
453 Status : protected
455 This method is used internally. The callback arguments are those used
456 by make_feature().
458 =cut
460 sub _feature_by_name {
461 my $self = shift;
462 my ($class,$name,$location,$callback) = @_;
463 $callback || $self->throw('must provide a callback argument');
464 my $count = 0;
465 my $regexp;
467 if ($name =~ /[*?]/) { # uh oh regexp time
468 $name = quotemeta($name);
469 $name =~ s/\\\*/.*/g;
470 $name =~ s/\\\?/.?/g;
471 $regexp++;
474 for my $feature (@{$self->{data}}) {
475 next unless ($regexp && $feature->{gname} =~ /$name/i) || lc($feature->{gname}) eq lc($name);
476 next if defined($feature->{gclass}) && length($feature->{gclass}) > 0 && $feature->{gclass} ne $class;
478 if ($location) {
479 next if $location->[0] ne $feature->{ref};
480 next if $location->[1] && $location->[1] > $feature->{stop};
481 next if $location->[2] && $location->[2] < $feature->{start};
483 $count++;
484 $callback->($self->_hash_to_array($feature),0);
486 return $count;
489 # Low level implementation of fetching a feature by it's id.
490 # The id of the feature as implemented in the in-memory db, is the location of the
491 # feature in the features hash array.
492 sub _feature_by_id{
493 my $self = shift;
494 my ($ids,$type,$callback) = @_;
495 $callback || $self->throw('must provide a callback argument');
497 my $feature_group_id = undef;
499 my $count = 0;
500 if ($type eq 'feature'){
501 for my $feature_id (@$ids){
502 my $feature = $self->_basic_features_by_id($feature_id);
503 $callback->($self->_hash_to_array($feature)) if $callback;
504 $count++;
509 sub _basic_features_by_id{
510 my $self = shift;
511 my ($ids) = @_;
513 $ids = [$ids] unless ref $ids =~ /ARRAY/;
515 my @result;
516 for my $feature_id (@$ids){
517 push @result, ${$self->{data}}[$feature_id];
519 return wantarray() ? @result : $result[0];
522 # This method is similar to get_features(), except that it returns an
523 # iterator across the query.
524 # See Bio::DB::GFF::Adaptor::memory::iterator.
526 sub get_features_iterator {
527 my $self = shift;
528 my ($search,$options,$callback) = @_;
529 $callback || $self->throw('must provide a callback argument');
531 my $results = $self->_get_features_by_search_options($search,$options);
532 my $results_array = $self->_convert_feature_hash_to_array($results);
534 return Bio::DB::GFF::Adaptor::memory::iterator->new($results_array,$callback);
538 # This method is responsible for fetching the list of feature type names.
539 # The query may be limited to a particular range, in
540 # which case the range is indicated by a landmark sequence name and
541 # class and its subrange, if any. These arguments may be undef if it is
542 # desired to retrieve all feature types.
544 # If the count flag is false, the method returns a simple list of
545 # Bio::DB::GFF::Typename objects. If $count is true, the method returns
546 # a list of $name=>$count pairs, where $count indicates the number of
547 # times this feature occurs in the range.
549 sub get_types {
550 my $self = shift;
551 my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_;
553 my(%result,%obj);
555 for my $feature (@{$self->{data}}) {
556 my $feature_start = $feature->{start};
557 my $feature_stop = $feature->{stop};
558 my $feature_ref = $feature->{ref};
559 my $feature_class = $feature->{class};
560 my $feature_method = $feature->{method};
561 my $feature_source = $feature->{source};
563 if (defined $srcseq){
564 next unless lc($feature_ref) eq lc($srcseq);
567 if (defined $class){
568 next unless defined $feature_class && $feature_class eq $class ;
571 # the requested range should OVERLAP the retrieved features
572 if (defined $start or defined $stop) {
573 $start = 1 unless defined $start;
574 $stop = MAX_SEGMENT unless defined $stop;
575 next unless $feature_stop >= $start && $feature_start <= $stop;
578 if (defined $typelist && @$typelist){
579 next unless $self->_matching_typelist($feature_method,$feature_source,$typelist);
582 my $type = Bio::DB::GFF::Typename->new($feature_method,$feature_source);
583 $result{$type}++;
584 $obj{$type} = $type;
586 } #end features loop
588 return $want_count ? %result : values %obj;
591 sub classes {
592 my $self = shift;
593 my %classes;
594 for my $feature (@{$self->{data}}) {
595 $classes{$feature->{gclass}}++;
597 my @classes = sort keys %classes;
598 return @classes;
601 # Internal method that performs a search on the features array,
602 # sequentialy retrieves the features, and performs a check on each feature
603 # according to the search options.
604 sub _get_features_by_search_options{
605 my $count = 0;
606 my ($self, $search,$options) = @_;
607 my ($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes) =
608 (@{$search}{qw(rangetype refseq refclass start stop types)},
609 @{$options}{qw(sparse sort_by_group ATTRIBUTES)}) ;
611 my @found_features;
612 my $data = $self->{data};
614 my $feature_id = -1 ;
615 my $feature_group_id = undef;
617 for my $feature (@{$data}) {
619 $feature_id++;
621 my $feature_start = $feature->{start};
622 my $feature_stop = $feature->{stop};
623 my $feature_ref = $feature->{ref};
625 if (defined $refseq){
626 next unless lc($feature_ref) eq lc($refseq);
629 if (defined $start or defined $stop) {
630 $start = 0 unless defined($start);
631 $stop = MAX_SEGMENT unless defined($stop);
633 if ($rangetype eq 'overlaps') {
634 next unless $feature_stop >= $start && $feature_start <= $stop;
635 } elsif ($rangetype eq 'contains') {
636 next unless $feature_start >= $start && $feature_stop <= $stop;
637 } elsif ($rangetype eq 'contained_in') {
638 next unless $feature_start <= $start && $feature_stop >= $stop;
639 } else {
640 next unless $feature_start == $start && $feature_stop == $stop;
645 my $feature_source = $feature->{source};
646 my $feature_method = $feature->{method};
648 if (defined $types && @$types){
649 next unless $self->_matching_typelist($feature_method,$feature_source,$types);
652 my $feature_attributes = $feature->{attributes};
653 if (defined $attributes){
654 next unless $self->_matching_attributes($feature_attributes,$attributes);
657 # if we get here, then we have a feature that meets the criteria.
658 # Then we just push onto an array
659 # of found features and continue.
661 my $found_feature = $feature ;
662 $found_feature->{feature_id} = $feature_id;
663 $found_feature->{group_id} = $feature_group_id;
664 push @found_features,$found_feature;
667 return \@found_features;
671 sub _hash_to_array {
672 my ($self,$feature_hash) = @_;
673 my @array = @{$feature_hash}{@hash2array_map};
674 return wantarray ? @array : \@array;
677 # this subroutine is needed for convertion of the feature from hash to array in order to
678 # pass it to the callback subroutine
679 sub _convert_feature_hash_to_array{
680 my ($self, $feature_hash_array) = @_;
681 my @features_array_array = map {scalar $self->_hash_to_array($_)} @$feature_hash_array;
682 return \@features_array_array;
685 sub _matching_typelist{
686 my ($self, $feature_method,$feature_source,$typelist) = @_;
687 foreach (@$typelist) {
688 my ($search_method,$search_source) = @$_;
689 next if lc($search_method) ne lc($feature_method);
690 next if defined($search_source) && lc($search_source) ne lc($feature_source);
691 return 1;
693 return 0;
696 sub _matching_attributes {
697 my ($self, $feature_attributes,$attributes) = @_ ;
698 foreach (keys %$attributes) {
699 return 0 if !_match_all_attr_in_feature($_,$attributes->{$_},$feature_attributes)
701 return 1;
704 sub _match_all_attr_in_feature{
705 my ($attr_name,$attr_value,$feature_attributes) = @_;
706 for my $attr (@$feature_attributes) {
707 my ($feature_attr_name,$feature_attr_value) = @$attr ;
708 next if ($attr_name ne $feature_attr_name || $attr_value ne $feature_attr_value);
709 return 1;
711 return 0;
715 sub do_initialize { 1; }
716 sub get_feature_by_group_id{ 1; }