2 # BioPerl module for Bio::SeqFeature::Collection
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqFeature::Collection - A container class for SeqFeatures
17 suitable for performing operations such as finding features within a
18 range, that match a certain feature type, etc.
22 use Bio::SeqFeature::Collection;
23 use Bio::Location::Simple;
27 # let's first input some features
28 my $gffio = Bio::Tools::GFF->new(-file => File::Spec->catfile
29 ("t","data","myco_sites.gff"),
32 # loop over the input stream
33 while(my $feature = $gffio->next_feature()) {
34 # do something with feature
35 push @features, $feature;
38 # build the Collection object
39 my $col = Bio::SeqFeature::Collection->new();
40 # add these features to the object
41 my $totaladded = $col->add_features(\@features);
43 my @subset = $col->features_in_range(-start => 1,
47 # subset should have 18 entries for this dataset
48 print "size is ", scalar @subset, "\n";
49 @subset = $col->features_in_range(-range => Bio::Location::Simple->new
54 -strandmatch => 'strong');
56 # subset should have 22 entries for this dataset
57 print "size is ", scalar @subset, "\n";
58 print "total number of features in collection is ",
59 $col->feature_count(),"\n";
63 This object will efficiently allow one for query subsets of ranges
64 within a large collection of sequence features (in fact the objects
65 just have to be Bio::RangeI compliant). This is done by the creation
66 of bins which are stored in order in a B-Tree data structure as
67 provided by the DB_File interface to the Berkeley DB.
69 This is based on work done by Lincoln for storage in a mysql instance
70 - this is intended to be an embedded in-memory implementation for
71 easily querying for subsets of a large range set.
73 Collections can be made persistent by keeping the indexfile and
74 passing in the -keep flag like this:
76 my $collection = Bio::SeqFeature::Collection->new(-keep => 1,
78 $collaction->add_features(\@features);
81 # To reuse this collection, next time you initialize a Collection object
82 # specify the filename and the index will be reused.
83 $collection = Bio::SeqFeature::Collection->new(-keep => 1,
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to
94 the Bioperl mailing list. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
101 Please direct usage questions or support issues to the mailing list:
103 I<bioperl-l@bioperl.org>
105 rather than to the module maintainer directly. Many experienced and
106 reponsive experts will be able look at the problem and quickly
107 address it. Please include a thorough description of the problem
108 with code and data examples if at all possible.
110 =head2 Reporting Bugs
112 Report bugs to the Bioperl bug tracking system to help us keep track
113 of the bugs and their resolution. Bug reports can be submitted via
116 https://github.com/bioperl/bioperl-live/issues
118 =head1 AUTHOR - Jason Stajich
120 Email jason@bioperl.org
124 Using code and strategy developed by Lincoln Stein (lstein@cshl.org)
125 in Bio::DB::GFF implementation. Credit also to Lincoln for suggesting
126 using Storable to serialize features rather than my previous implementation
127 which kept the features in memory.
131 The rest of the documentation details each of the object methods.
132 Internal methods are usually preceded with a _
137 # Let the code begin...
140 package Bio
::SeqFeature
::Collection
;
143 # Object preamble - inherits from Bio::Root::Root
145 use Bio
::DB
::GFF
::Util
::Binning
;
147 use Bio
::Location
::Simple
;
148 use Bio
::SeqFeature
::Generic
;
149 use Storable
qw(freeze thaw);
151 use base
qw(Bio::Root::Root Bio::SeqFeature::CollectionI);
154 # This may need to get re-optimized for BDB usage as these
155 # numbers were derived empirically by Lincoln on a mysql srv
156 # running on his laptop
158 # this is the largest that any reference sequence can be (100 megabases)
159 use constant MAX_BIN
=> 100_000_000
;
161 # this is the smallest bin (1 K)
162 use constant MIN_BIN
=> 1_000
;
167 Usage : my $obj = Bio::SeqFeature::Collection->new();
168 Function: Builds a new Bio::SeqFeature::Collection object
169 Returns : Bio::SeqFeature::Collection
172 -minbin minimum value to use for binning
173 (default is 100,000,000)
174 -maxbin maximum value to use for binning
176 -file filename to store/read the
177 BTREE from rather than an in-memory structure
178 (default is false and in-memory).
179 -keep boolean, will not remove index file on
181 -features Array ref of features to add initially
186 my($class,@args) = @_;
188 my $self = $class->SUPER::new
(@args);
189 my ($maxbin,$minbin, $file, $keep,
190 $features) = $self->_rearrange([qw(MAXBIN MINBIN FILE KEEP
193 defined $maxbin && $self->max_bin($maxbin);
194 defined $minbin && $self->min_bin($minbin);
196 defined $features && $self->add_features($features);
197 $DB_BTREE->{'flags'} = R_DUP
;
198 $DB_BTREE->{'compare'} = \
&_compare
;
199 $self->{'_btreehash'} = {};
201 $self->debug("using file $file");
202 $self->indexfile($file);
205 $self->{'_btree'} = tie
%{$self->{'_btreehash'}},
206 'DB_File', $self->indexfile, O_RDWR
|O_CREAT
, 0640, $DB_BTREE;
207 $self->{'_btree'} || $self->throw("Unable to tie DB_File handle");
215 Usage : $collection->add_features(\@features);
217 Returns : number of features added
218 Args : arrayref of Bio::SeqFeatureI objects to index
224 my ($self,$feats) = @_;
225 if( ref($feats) !~ /ARRAY/i ) {
226 $self->warn("Must provide a valid Array reference to add_features");
230 foreach my $f ( @
$feats ) {
231 if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) {
232 $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n");
235 my $bin = bin
($f->start,$f->end,$self->min_bin);
236 my $serialized = &feature_freeze
($f);
237 $self->{'_btree'}->put($bin,$serialized);
238 if( $f->isa('Bio::SeqFeature::Generic') ) {
239 $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n");
247 =head2 features_in_range
249 Title : features_in_range
250 Usage : my @features = $collection->features_in_range($range)
251 Function: Retrieves a list of features which were contained or overlap the
252 the requested range (see Args for way to specify overlap or
253 only those containe)d
254 Returns : List of Bio::SeqFeatureI objects
255 Args : -range => Bio::RangeI object defining range to search,
261 -contain => boolean - true if feature must be completely
263 OR false if should include features that simply overlap
264 the range. Default: true.
265 -strandmatch => 'strong', ranges must have the same strand
266 'weak', ranges must have the same
268 'ignore', ignore strand information
273 sub features_in_range
{
276 my ($range, $contain, $strandmatch,$start,$end,$strand);
278 $range = shift @args;
280 ($start,$end,$strand,$range,
281 $contain,$strandmatch) = $self->_rearrange([qw(START END
286 $contain = 1 unless defined $contain;
288 $strand = 1 unless defined $strand;
289 if( $strand !~ /^([\-\+])$/ &&
290 $strand !~ /^[\-\+]?1$/ ) {
291 $self->warn("must provide a valid numeric or +/- for strand");
294 if( defined $1 ) { $strand .= 1; }
296 if( !defined $start && !defined $end ) {
297 if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") )
299 $self->warn("Must defined a valid Range for the method feature_in_range");
302 ($start,$end,$strand) = ($range->start,$range->end,$range->strand);
304 my $r = Bio
::Location
::Simple
->new(-start
=> $start,
309 my $maxbin = $self->max_bin;
310 my $minbin = $self->min_bin;
312 my ($k,$v,@bins) = ("",undef);
313 while ($tier >= $minbin) {
314 my ($tier_start,$tier_stop) = (bin_bot
($tier,$start),
315 bin_top
($tier,$end));
316 if( $tier_start == $tier_stop ) {
317 my @vals = $self->{'_btree'}->get_dup($tier_start);
318 if( scalar @vals > 0 ) {
319 push @bins, map { thaw
($_) } @vals;
324 for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR
);
326 $rc = $self->{'_btree'}->seq($k,$v, R_NEXT
) ) {
327 last if( $k > $tier_stop || $k < $tier_start);
328 push @bins, thaw
($v);
334 foreach my $t ( map { ref($_) } @bins) {
339 $self->warn("Trying to thaw a stored feature $t which does not appear in your Perl library. $@");
343 $strandmatch = 'ignore' unless defined $strandmatch;
344 return ( $contain ) ?
grep { $r->contains($_,$strandmatch) } @bins :
345 grep { $r->overlaps($_,$strandmatch)} @bins;
348 =head2 remove_features
350 Title : remove_features
351 Usage : $collection->remove_features(\@array)
352 Function: Removes the requested sequence features (based on features
353 which have the same location)
354 Returns : Number of features removed
355 Args : Arrayref of Bio::RangeI objects
361 my ($self,$feats) = @_;
362 if( ref($feats) !~ /ARRAY/i ) {
363 $self->warn("Must provide a valid Array reference to remove_features");
366 my $countprocessed = 0;
368 foreach my $f ( @
$feats ) {
369 next if ! ref($f) || ! $f->isa('Bio::RangeI');
370 my $bin = bin
($f->start,$f->end,$self->min_bin);
371 my @vals = $self->{'_btree'}->get_dup($bin);
372 my $vcount = scalar @vals;
374 foreach my $v ( @vals ) {
375 # Once we have uniquely identifiable field
376 # I think it will work better.
377 if( $v eq &feature_freeze
($f) ) {
378 $self->{'_btree'}->del_dup($bin,$v);
384 $self->{'_btree'}->del($bin);
391 =head2 get_all_features
393 Title : get_all_features
394 Usage : my @f = $col->get_all_features()
395 Function: Return all the features stored in this collection (Could be large)
396 Returns : Array of Bio::RangeI objects
402 sub get_all_features
{
406 for (my $status = $self->{'_btree'}->seq($key, $value, R_FIRST
) ;
408 $status = $self->{'_btree'}->seq($key, $value, R_NEXT
) )
409 { next unless defined $value;
410 push @features, &thaw
($value);
412 if( scalar @features != $self->feature_count() ) {
413 $self->warn("feature count does not match actual count\n");
422 Usage : my $minbin= $self->min_bin;
423 Function: Get/Set the minimum value to use for binning
425 Args : [optional] minimum bin value
431 my ($self,$min) = @_;
433 $self->{'_min_bin'} = $min;
435 return $self->{'_min_bin'} || MIN_BIN
;
441 Usage : my $maxbin= $self->max_bin;
442 Function: Get/Set the maximum value to use for binning
444 Args : [optional] maximum bin value
450 my ($self,$max) = @_;
452 $self->{'_max_bin'} = $max;
454 return $self->{'max_bin'} || MAX_BIN
;
459 Title : feature_count
460 Usage : my $c = $col->feature_count()
461 Function: Retrieve the total number of features in the collection
471 for ( keys %{$self->{'_btreehash'}} ) {
472 my $v = $self->{'_btreehash'}->{$_};
473 next unless defined $v;
482 Usage : $obj->indexfile($newval)
483 Function: Get/set the filename where index is kept
484 Returns : value of indexfile (a filename string)
485 Args : on set, new value (a filename string )
493 return $self->{'indexfile'} = shift if @_;
494 return $self->{'indexfile'};
500 Usage : $obj->keep($newval)
501 Function: Get/set boolean flag to keep the indexfile after
504 Returns : value of keep (boolean)
505 Args : on set, new value (boolean)
513 return $self->{'keep'} = shift if @_;
514 return $self->{'keep'};
518 if( defined $_[0] && ! defined $_[1]) {
520 } elsif ( defined $_[1] && ! defined $_[0]) {
528 _remove_cleanup_methods
($obj);
532 sub _remove_cleanup_methods
{
535 # we have to remove any cleanup methods here for Storable
536 for my $funcref ( $obj->_cleanup_methods ) {
537 $obj->_unregister_for_cleanup($funcref);
540 # ... and the same for any contained features; hopefully any implementations
541 # adhere to implementing Bio::SeqFeatureI::sub_SeqFeature
543 for my $contained ($obj->sub_SeqFeature) {
544 _remove_cleanup_methods
($contained);
556 $self->{'_btree'} = undef;
557 untie(%{$self->{'_btreehash'}});
558 if( ! $self->keep && $self->indexfile ) {
559 my $f = $self->indexfile;
560 $self->debug( "unlinking ".$f. "\n");
564 $self->SUPER::DESTROY
();