changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / SeqFeature / Collection.pm
blob09f548264d8a1d13d4ccc352ffd243674f49616a
2 # BioPerl module for Bio::SeqFeature::Collection
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqFeature::Collection - A container class for SeqFeatures
17 suitable for performing operations such as finding features within a
18 range, that match a certain feature type, etc.
20 =head1 SYNOPSIS
22 use Bio::SeqFeature::Collection;
23 use Bio::Location::Simple;
24 use Bio::Tools::GFF;
25 use Bio::Root::IO;
26 use File::Spec;
27 # let's first input some features
28 my $gffio = Bio::Tools::GFF->new(-file => File::Spec->catfile
29 ("t","data","myco_sites.gff"),
30 -gff_version => 2);
31 my @features = ();
32 # loop over the input stream
33 while(my $feature = $gffio->next_feature()) {
34 # do something with feature
35 push @features, $feature;
37 $gffio->close();
38 # build the Collection object
39 my $col = Bio::SeqFeature::Collection->new();
40 # add these features to the object
41 my $totaladded = $col->add_features(\@features);
43 my @subset = $col->features_in_range(-start => 1,
44 -end => 25000,
45 -strand => 1,
46 -contain => 0);
47 # subset should have 18 entries for this dataset
48 print "size is ", scalar @subset, "\n";
49 @subset = $col->features_in_range(-range => Bio::Location::Simple->new
50 (-start => 70000,
51 -end => 150000,
52 -strand => -1),
53 -contain => 1,
54 -strandmatch => 'strong');
56 # subset should have 22 entries for this dataset
57 print "size is ", scalar @subset, "\n";
58 print "total number of features in collection is ",
59 $col->feature_count(),"\n";
61 =head1 DESCRIPTION
63 This object will efficiently allow one for query subsets of ranges
64 within a large collection of sequence features (in fact the objects
65 just have to be Bio::RangeI compliant). This is done by the creation
66 of bins which are stored in order in a B-Tree data structure as
67 provided by the DB_File interface to the Berkeley DB.
69 This is based on work done by Lincoln for storage in a mysql instance
70 - this is intended to be an embedded in-memory implementation for
71 easily querying for subsets of a large range set.
73 Collections can be made persistent by keeping the indexfile and
74 passing in the -keep flag like this:
76 my $collection = Bio::SeqFeature::Collection->new(-keep => 1,
77 -file => 'col.idx');
78 $collaction->add_features(\@features);
79 undef $collection;
81 # To reuse this collection, next time you initialize a Collection object
82 # specify the filename and the index will be reused.
83 $collection = Bio::SeqFeature::Collection->new(-keep => 1,
84 -file => 'col.idx');
88 =head1 FEEDBACK
90 =head2 Mailing Lists
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to
94 the Bioperl mailing list. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 =head2 Support
101 Please direct usage questions or support issues to the mailing list:
103 I<bioperl-l@bioperl.org>
105 rather than to the module maintainer directly. Many experienced and
106 reponsive experts will be able look at the problem and quickly
107 address it. Please include a thorough description of the problem
108 with code and data examples if at all possible.
110 =head2 Reporting Bugs
112 Report bugs to the Bioperl bug tracking system to help us keep track
113 of the bugs and their resolution. Bug reports can be submitted via
114 the web:
116 https://github.com/bioperl/bioperl-live/issues
118 =head1 AUTHOR - Jason Stajich
120 Email jason@bioperl.org
122 =head1 CONTRIBUTORS
124 Using code and strategy developed by Lincoln Stein (lstein@cshl.org)
125 in Bio::DB::GFF implementation. Credit also to Lincoln for suggesting
126 using Storable to serialize features rather than my previous implementation
127 which kept the features in memory.
129 =head1 APPENDIX
131 The rest of the documentation details each of the object methods.
132 Internal methods are usually preceded with a _
134 =cut
137 # Let the code begin...
140 package Bio::SeqFeature::Collection;
141 use strict;
143 # Object preamble - inherits from Bio::Root::Root
145 use Bio::DB::GFF::Util::Binning;
146 use DB_File;
147 use Bio::Location::Simple;
148 use Bio::SeqFeature::Generic;
149 use Storable qw(freeze thaw);
151 use base qw(Bio::Root::Root Bio::SeqFeature::CollectionI);
154 # This may need to get re-optimized for BDB usage as these
155 # numbers were derived empirically by Lincoln on a mysql srv
156 # running on his laptop
158 # this is the largest that any reference sequence can be (100 megabases)
159 use constant MAX_BIN => 100_000_000;
161 # this is the smallest bin (1 K)
162 use constant MIN_BIN => 1_000;
164 =head2 new
166 Title : new
167 Usage : my $obj = Bio::SeqFeature::Collection->new();
168 Function: Builds a new Bio::SeqFeature::Collection object
169 Returns : Bio::SeqFeature::Collection
170 Args :
172 -minbin minimum value to use for binning
173 (default is 100,000,000)
174 -maxbin maximum value to use for binning
175 (default is 1,000)
176 -file filename to store/read the
177 BTREE from rather than an in-memory structure
178 (default is false and in-memory).
179 -keep boolean, will not remove index file on
180 object destruction.
181 -features Array ref of features to add initially
183 =cut
185 sub new {
186 my($class,@args) = @_;
188 my $self = $class->SUPER::new(@args);
189 my ($maxbin,$minbin, $file, $keep,
190 $features) = $self->_rearrange([qw(MAXBIN MINBIN FILE KEEP
191 FEATURES)],@args);
193 defined $maxbin && $self->max_bin($maxbin);
194 defined $minbin && $self->min_bin($minbin);
196 defined $features && $self->add_features($features);
197 $DB_BTREE->{'flags'} = R_DUP ;
198 $DB_BTREE->{'compare'} = \&_compare;
199 $self->{'_btreehash'} = {};
200 if( $file ) {
201 $self->debug("using file $file");
202 $self->indexfile($file);
204 $self->keep($keep);
205 $self->{'_btree'} = tie %{$self->{'_btreehash'}},
206 'DB_File', $self->indexfile, O_RDWR|O_CREAT, 0640, $DB_BTREE;
207 $self->{'_btree'} || $self->throw("Unable to tie DB_File handle");
208 return $self;
212 =head2 add_features
214 Title : add_features
215 Usage : $collection->add_features(\@features);
216 Function:
217 Returns : number of features added
218 Args : arrayref of Bio::SeqFeatureI objects to index
221 =cut
223 sub add_features{
224 my ($self,$feats) = @_;
225 if( ref($feats) !~ /ARRAY/i ) {
226 $self->warn("Must provide a valid Array reference to add_features");
227 return 0;
229 my $count = 0;
230 foreach my $f ( @$feats ) {
231 if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) {
232 $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n");
233 next;
235 my $bin = bin($f->start,$f->end,$self->min_bin);
236 my $serialized = &feature_freeze($f);
237 $self->{'_btree'}->put($bin,$serialized);
238 if( $f->isa('Bio::SeqFeature::Generic') ) {
239 $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n");
241 $count++;
243 return $count;
247 =head2 features_in_range
249 Title : features_in_range
250 Usage : my @features = $collection->features_in_range($range)
251 Function: Retrieves a list of features which were contained or overlap the
252 the requested range (see Args for way to specify overlap or
253 only those containe)d
254 Returns : List of Bio::SeqFeatureI objects
255 Args : -range => Bio::RangeI object defining range to search,
257 -start => start,
258 -end => end,
259 -strand => strand
261 -contain => boolean - true if feature must be completely
262 contained with range
263 OR false if should include features that simply overlap
264 the range. Default: true.
265 -strandmatch => 'strong', ranges must have the same strand
266 'weak', ranges must have the same
267 strand or no strand
268 'ignore', ignore strand information
269 Default. 'ignore'.
271 =cut
273 sub features_in_range{
274 my $self = shift;
275 my (@args) = @_;
276 my ($range, $contain, $strandmatch,$start,$end,$strand);
277 if( @args == 1 ) {
278 $range = shift @args;
279 } else {
280 ($start,$end,$strand,$range,
281 $contain,$strandmatch) = $self->_rearrange([qw(START END
282 STRAND
283 RANGE CONTAIN
284 STRANDMATCH)],
285 @args);
286 $contain = 1 unless defined $contain;
288 $strand = 1 unless defined $strand;
289 if( $strand !~ /^([\-\+])$/ &&
290 $strand !~ /^[\-\+]?1$/ ) {
291 $self->warn("must provide a valid numeric or +/- for strand");
292 return ();
294 if( defined $1 ) { $strand .= 1; }
296 if( !defined $start && !defined $end ) {
297 if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") )
299 $self->warn("Must defined a valid Range for the method feature_in_range");
300 return ();
302 ($start,$end,$strand) = ($range->start,$range->end,$range->strand);
304 my $r = Bio::Location::Simple->new(-start => $start,
305 -end => $end,
306 -strand => $strand);
308 my @features;
309 my $maxbin = $self->max_bin;
310 my $minbin = $self->min_bin;
311 my $tier = $maxbin;
312 my ($k,$v,@bins) = ("",undef);
313 while ($tier >= $minbin) {
314 my ($tier_start,$tier_stop) = (bin_bot($tier,$start),
315 bin_top($tier,$end));
316 if( $tier_start == $tier_stop ) {
317 my @vals = $self->{'_btree'}->get_dup($tier_start);
318 if( scalar @vals > 0 ) {
319 push @bins, map { thaw($_) } @vals;
321 } else {
322 $k = $tier_start;
323 my @vals;
324 for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR);
325 $rc == 0;
326 $rc = $self->{'_btree'}->seq($k,$v, R_NEXT) ) {
327 last if( $k > $tier_stop || $k < $tier_start);
328 push @bins, thaw($v);
331 $tier /= 10;
333 my %seen = ();
334 foreach my $t ( map { ref($_) } @bins) {
335 next if $seen{$t}++;
336 eval "require $t";
338 if( $@ ) {
339 $self->warn("Trying to thaw a stored feature $t which does not appear in your Perl library. $@");
340 next;
343 $strandmatch = 'ignore' unless defined $strandmatch;
344 return ( $contain ) ? grep { $r->contains($_,$strandmatch) } @bins :
345 grep { $r->overlaps($_,$strandmatch)} @bins;
348 =head2 remove_features
350 Title : remove_features
351 Usage : $collection->remove_features(\@array)
352 Function: Removes the requested sequence features (based on features
353 which have the same location)
354 Returns : Number of features removed
355 Args : Arrayref of Bio::RangeI objects
358 =cut
360 sub remove_features{
361 my ($self,$feats) = @_;
362 if( ref($feats) !~ /ARRAY/i ) {
363 $self->warn("Must provide a valid Array reference to remove_features");
364 return 0;
366 my $countprocessed = 0;
368 foreach my $f ( @$feats ) {
369 next if ! ref($f) || ! $f->isa('Bio::RangeI');
370 my $bin = bin($f->start,$f->end,$self->min_bin);
371 my @vals = $self->{'_btree'}->get_dup($bin);
372 my $vcount = scalar @vals;
374 foreach my $v ( @vals ) {
375 # Once we have uniquely identifiable field
376 # I think it will work better.
377 if( $v eq &feature_freeze($f) ) {
378 $self->{'_btree'}->del_dup($bin,$v);
379 $vcount--;
380 $countprocessed++;
383 if( $vcount == 0 ) {
384 $self->{'_btree'}->del($bin);
387 $countprocessed;
391 =head2 get_all_features
393 Title : get_all_features
394 Usage : my @f = $col->get_all_features()
395 Function: Return all the features stored in this collection (Could be large)
396 Returns : Array of Bio::RangeI objects
397 Args : None
400 =cut
402 sub get_all_features{
403 my ($self) = @_;
404 my @features;
405 my ($key,$value);
406 for (my $status = $self->{'_btree'}->seq($key, $value, R_FIRST) ;
407 $status == 0 ;
408 $status = $self->{'_btree'}->seq($key, $value, R_NEXT) )
409 { next unless defined $value;
410 push @features, &thaw($value);
412 if( scalar @features != $self->feature_count() ) {
413 $self->warn("feature count does not match actual count\n");
415 return @features;
419 =head2 min_bin
421 Title : min_bin
422 Usage : my $minbin= $self->min_bin;
423 Function: Get/Set the minimum value to use for binning
424 Returns : integer
425 Args : [optional] minimum bin value
428 =cut
430 sub min_bin {
431 my ($self,$min) = @_;
432 if( defined $min ) {
433 $self->{'_min_bin'} = $min;
435 return $self->{'_min_bin'} || MIN_BIN;
438 =head2 max_bin
440 Title : max_bin
441 Usage : my $maxbin= $self->max_bin;
442 Function: Get/Set the maximum value to use for binning
443 Returns : integer
444 Args : [optional] maximum bin value
447 =cut
449 sub max_bin {
450 my ($self,$max) = @_;
451 if( defined $max ) {
452 $self->{'_max_bin'} = $max;
454 return $self->{'max_bin'} || MAX_BIN;
457 =head2 feature_count
459 Title : feature_count
460 Usage : my $c = $col->feature_count()
461 Function: Retrieve the total number of features in the collection
462 Returns : integer
463 Args : none
466 =cut
468 sub feature_count {
469 my $self = shift;
470 my $count = 0;
471 for ( keys %{$self->{'_btreehash'}} ) {
472 my $v = $self->{'_btreehash'}->{$_};
473 next unless defined $v;
474 $count++;
476 $count;
479 =head2 indexfile
481 Title : indexfile
482 Usage : $obj->indexfile($newval)
483 Function: Get/set the filename where index is kept
484 Returns : value of indexfile (a filename string)
485 Args : on set, new value (a filename string )
488 =cut
490 sub indexfile{
491 my $self = shift;
493 return $self->{'indexfile'} = shift if @_;
494 return $self->{'indexfile'};
497 =head2 keep
499 Title : keep
500 Usage : $obj->keep($newval)
501 Function: Get/set boolean flag to keep the indexfile after
502 exiting program
503 Example :
504 Returns : value of keep (boolean)
505 Args : on set, new value (boolean)
508 =cut
510 sub keep{
511 my $self = shift;
513 return $self->{'keep'} = shift if @_;
514 return $self->{'keep'};
517 sub _compare{
518 if( defined $_[0] && ! defined $_[1]) {
519 return -1;
520 } elsif ( defined $_[1] && ! defined $_[0]) {
521 return 1;
523 $_[0] <=> $_[1];
526 sub feature_freeze {
527 my $obj = shift;
528 _remove_cleanup_methods($obj);
529 return freeze($obj);
532 sub _remove_cleanup_methods {
533 my $obj = shift;
535 # we have to remove any cleanup methods here for Storable
536 for my $funcref ( $obj->_cleanup_methods ) {
537 $obj->_unregister_for_cleanup($funcref);
540 # ... and the same for any contained features; hopefully any implementations
541 # adhere to implementing Bio::SeqFeatureI::sub_SeqFeature
543 for my $contained ($obj->sub_SeqFeature) {
544 _remove_cleanup_methods($contained);
550 sub feature_thaw {
551 return thaw(shift);
554 sub DESTROY {
555 my $self = shift;
556 $self->{'_btree'} = undef;
557 untie(%{$self->{'_btreehash'}});
558 if( ! $self->keep && $self->indexfile ) {
559 my $f = $self->indexfile;
560 $self->debug( "unlinking ".$f. "\n");
561 close($f);
562 unlink($f);
564 $self->SUPER::DESTROY();