2 # BioPerl module for Bio::Seq::LargeLocatableSeq
4 # Cared for by Albert Vilella
6 # based on the Bio::LargePrimarySeq module
7 # by Ewan Birney <birney@sanger.ac.uk>
9 # and the Bio::LocatableSeq module
10 # by Ewan Birney <birney@sanger.ac.uk>
12 # Copyright Albert Vilella
14 # You may distribute this module under the same terms as perl itself
16 # POD documentation - main docs before the code
20 Bio::Seq::LargeLocatableSeq - LocatableSeq object that stores sequence as
25 # normal primary seq usage
26 use Bio::Seq::LargeLocatableSeq;
27 my $seq = Bio::Seq::LargeLocatableSeq->new(-seq => "CAGT-GGT",
34 Bio::Seq::LargeLocatableSeq - object with start/end points on it that
35 can be projected into a MSA or have coordinates relative to another
38 This object, unlike Bio::LocatableSeq, stores a sequence as a series
39 of files in a temporary directory. The aim is to allow someone the
40 ability to store very large sequences (eg, E<gt> 100MBases) in a file
41 system without running out of memory (eg, on a 64 MB real memory
44 Of course, to actually make use of this functionality, the programs
45 which use this object B<must> not call $primary_seq-E<gt>seq otherwise
46 the entire sequence will come out into memory and probably crash your
47 machine. However, calls like $primary_seq-E<gt>subseq(10,100) will cause
48 only 90 characters to be brought into real memory.
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to
56 the Bioperl mailing list. Your participation is much appreciated.
58 bioperl-l@bioperl.org - General discussion
59 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
63 Report bugs to the Bioperl bug tracking system to help us keep track
64 of the bugs and their resolution. Bug reports can be submitted via
67 http://bugzilla.open-bio.org/
69 =head1 AUTHOR - Albert Vilella
71 Email avilella-AT-gmail-DOT-com
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
81 # Let the code begin...
83 package Bio
::Seq
::LargeLocatableSeq
;
84 use vars
qw($AUTOLOAD);
88 use base qw(Bio::Seq::LargePrimarySeq Bio::LocatableSeq Bio::Root::IO);
94 Usage : my $obj = Bio::Seq::LargeLocatableSeq->new();
95 Function: Builds a new Bio::Seq::LargeLocatableSeq object
96 Returns : an instance of Bio::Seq::LargeLocatableSeq
103 my ($class, %params) = @_;
105 # don't let PrimarySeq set seq until we have
108 my $seq = $params{'-seq'} || $params{'-SEQ'};
110 delete $params{'-seq'};
111 delete $params{'-SEQ'};
113 my $self = $class->SUPER::new
(%params);
114 $self->_initialize_io(%params);
115 my $tempdir = $self->tempdir( CLEANUP
=> 1);
116 my ($tfh,$file) = $self->tempfile( DIR
=> $tempdir );
118 $tfh && $self->_fh($tfh);
119 $file && $self->_filename($file);
121 $seq && $self->seq($seq);
140 my ($obj,$value) = @_;
141 if( defined $value) {
142 $obj->{'length'} = $value;
145 return (defined $obj->{'length'}) ?
$obj->{'length'} : 0;
161 my ($self, $data) = @_;
162 if( defined $data ) {
163 if( $self->length() == 0) {
164 $self->add_sequence_as_string($data);
166 $self->warn("Trying to reset the seq string, cannot do this with a LargeLocatableSeq - must allocate a new object");
169 return $self->subseq(1,$self->length);
186 my ($self,$start,$end) = @_;
188 my $fh = $self->_fh();
190 if( ref($start) && $start->isa('Bio::LocationI') ) {
192 if( $loc->length == 0 ) {
193 $self->warn("Expect location lengths to be > 0");
195 } elsif( $loc->end < $loc->start ) {
196 # what about circular seqs
197 $self->warn("Expect location start to come before location end");
200 if( $loc->isa('Bio::Location::SplitLocationI') ) {
201 foreach my $subloc ( $loc->sub_Location ) {
202 if(! seek($fh,$subloc->start() - 1,0)) {
203 $self->throw("Unable to seek on file $start:$end $!");
205 my $ret = read($fh, $string, $subloc->length());
206 if( !defined $ret ) {
207 $self->throw("Unable to read $start:$end $!");
209 if( $subloc->strand < 0 ) {
210 # $string = Bio::PrimarySeq->new(-seq => $string)->revcom()->seq();
211 $string = Bio
::Seq
::LargePrimarySeq
->new(-seq
=> $string)->revcom()->seq();
216 if(! seek($fh,$loc->start()-1,0)) {
217 $self->throw("Unable to seek on file ".$loc->start.":".
220 my $ret = read($fh, $string, $loc->length());
221 if( !defined $ret ) {
222 $self->throw("Unable to read ".$loc->start.":".
227 if( defined $loc->strand &&
229 # $seq = Bio::PrimarySeq->new(-seq => $string)->revcom()->seq();
230 $seq = Bio
::Seq
::LargePrimarySeq
->new(-seq
=> $seq)->revcom()->seq();
234 if( $start <= 0 || $end > $self->length ) {
235 $self->throw("Attempting to get a subseq out of range $start:$end vs ".
238 if( $end < $start ) {
239 $self->throw("Attempting to subseq with end ($end) less than start ($start). To revcom use the revcom function with trunc");
242 if(! seek($fh,$start-1,0)) {
243 $self->throw("Unable to seek on file $start:$end $!");
245 my $ret = read($fh, $string, $end-$start+1);
246 if( !defined $ret ) {
247 $self->throw("Unable to read $start:$end $!");
253 =head2 add_sequence_as_string
255 Title : add_sequence_as_string
256 Usage : $seq->add_sequence_as_string("CATGAT");
257 Function: Appends additional residues to an existing LargeLocatableSeq object.
258 This allows one to build up a large sequence without storing
259 entire object in memory.
260 Returns : Current length of sequence
261 Args : string to append
265 sub add_sequence_as_string
{
266 my ($self,$str) = @_;
267 my $len = $self->length + CORE
::length($str);
268 my $fh = $self->_fh();
269 if(! seek($fh,0,2)) {
270 $self->throw("Unable to seek end of file: $!");
280 Usage : $obj->_filename($newval)
283 Returns : value of _filename
284 Args : newvalue (optional)
290 my ($obj,$value) = @_;
291 if( defined $value) {
292 $obj->{'_filename'} = $value;
294 return $obj->{'_filename'};
302 Usage : $obj->alphabet($newval)
305 Returns : value of alphabet
306 Args : newvalue (optional)
312 my ($self,$value) = @_;
313 if( defined $value) {
314 $self->SUPER::alphabet
($value);
316 return $self->SUPER::alphabet
() || 'dna';
324 Usage : $obj->end($newval)
326 Returns : value of end
327 Args : newvalue (optional)
335 my $string = $self->seq;
337 my $len = $self->_ungapped_len;
339 $self->warn("In sequence $id residue count gives end value $len.
340 Overriding value [$value] with value $len for Bio::LargeLocatableSeq::end().")
341 and $value = $len if $len != $value and $self->verbose > 0;
344 $self->{'end'} = $value;
347 return $self->{'end'} || $self->_ungapped_len;
353 my $fh = $self->_fh();
354 close($fh) if( defined $fh );
355 # this should be handled by Tempfile removal, but we'll unlink anyways.
356 unlink $self->_filename() if defined $self->_filename() && -e
$self->_filename;
357 $self->SUPER::DESTROY
();