3 # BioPerl module for Bio::LocatableSeq
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::LocatableSeq - A Sequence object with start/end points on it
16 that can be projected into a MSA or have coordinates relative to
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
31 # a normal sequence object
35 # has start,end points
39 # inherits off RangeI, so range operations possible
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 The locatable sequence object was developed mainly because the
54 SimpleAlign object requires this functionality, and in the rewrite
55 of the Sequence object we had to decide what to do with this.
57 It is, to be honest, not well integrated with the rest of bioperl, for
58 example, the trunc() function does not return a LocatableSeq object,
59 as some might have thought. There are all sorts of nasty gotcha's
60 about interactions between coordinate systems when these sort of
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 the bugs and their resolution. Bug reports can be submitted via the
70 http://bugzilla.open-bio.org/
75 The rest of the documentation details each of the object
76 methods. Internal methods are usually preceded with a _
81 # Let the code begin...
83 package Bio
::LocatableSeq
;
86 use Bio
::Location
::Simple
;
87 use Bio
::Location
::Fuzzy
;
88 use vars
qw($MATCHPATTERN);
90 $MATCHPATTERN = '0-9A-Za-z\-\.\*\?=~';
93 use base qw(Bio::PrimarySeq Bio::RangeI);
96 my ($class, @args) = @_;
97 my $self = $class->SUPER::new
(@args);
99 my ($start,$end,$strand) =
100 $self->_rearrange( [qw(START END STRAND)],
103 defined $start && $self->start($start);
104 defined $end && $self->end($end);
105 defined $strand && $self->strand($strand);
107 return $self; # success - we hope!
113 Usage : $obj->start($newval)
114 Function: Get/set the 1-based start position of this sequence in the original
115 sequence. '0' means before the original sequence starts.
116 Returns : value of start
117 Args : newvalue (optional)
125 $self->{'start'} = $value;
127 return $self->{'start'} if defined $self->{'start'};
128 return 1 if $self->seq;
135 Usage : $obj->end($newval)
136 Function: Get/set the 1-based end position of this sequence in the original
137 sequence. '0' means before the original sequence starts.
138 Returns : value of end
139 Args : newvalue (optional)
147 my $string = $self->seq;
149 my $len = $self->_ungapped_len;
151 $self->warn("In sequence $id residue count gives end value $len.
152 Overriding value [$value] with value $len for Bio::LocatableSeq::end().")
153 and $value = $len if $len != $value and $self->verbose > 0;
156 $self->{'end'} = $value;
159 return defined $self->{'end'} ?
$self->{'end'} : $self->_ungapped_len;
164 my $string = $self->seq || '';
165 $string =~ s/[\.\-]+//g;
166 $self->seq ?
(return $self->start + CORE
::length($string) - 1 ) : undef;
172 Usage : $obj->strand($newval)
174 Returns : value of strand
175 Args : newvalue (optional)
183 $self->{'strand'} = $value;
185 return $self->{'strand'};
192 Function: read-only name of form id/start-end
200 my ($self,$char1,$char2) = @_;
205 $self->throw("Attribute id not set") unless defined($self->id());
206 $self->throw("Attribute start not set") unless defined($self->start());
207 $self->throw("Attribute end not set") unless defined($self->end());
209 return $self->id() . $char1 . $self->start . $char2 . $self->end ;
217 Usage :$self->no_gaps('.')
220 Gets number of gaps in the sequence. The count excludes
221 leading or trailing gap characters.
223 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
224 these, '.' and '-' are counted as gap characters unless an
225 optional argument specifies one of them.
227 Returns : number of internal gaps in the sequnce.
228 Args : a gap character (optional)
233 my ($self,$char) = @_;
234 my ($seq, $count) = (undef, 0);
236 # default gap characters
239 $self->warn("I hope you know what you are doing setting gap to [$char]")
240 unless $char =~ /[-.]/;
243 return 0 unless $seq; # empty sequence does not have gaps
245 $seq =~ s/^([$char]+)//;
246 $seq =~ s/([$char]+)$//;
247 $count++ while $seq =~ /[$char]+/g;
254 =head2 column_from_residue_number
256 Title : column_from_residue_number
257 Usage : $col = $seq->column_from_residue_number($resnumber)
260 This function gives the position in the alignment
261 (i.e. column number) of the given residue number in the
262 sequence. For example, for the sequence
264 Seq1/91-97 AC..DEF.GH
266 column_from_residue_number(94) returns 5.
268 An exception is thrown if the residue number would lie
269 outside the length of the aligment
270 (e.g. column_from_residue_number( "Seq2", 22 )
272 Returns : A column number for the position of the
273 given residue in the given sequence (1 = first column)
274 Args : A residue number in the whole sequence (not just that
275 segment of it in the alignment)
279 sub column_from_residue_number
{
280 my ($self, $resnumber) = @_;
282 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
283 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
285 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
286 my @residues = split //, $self->seq;
287 my $count = $self->start();
289 my ($start,$end,$inc,$test);
290 my $strand = $self->strand || 0;
291 # the following bit of "magic" allows the main loop logic to be the
292 # same regardless of the strand of the sequence
293 ($start,$end,$inc,$test)= ($strand == -1)?
294 (scalar(@residues-1),0,-1,sub{$i >= $end}) :
295 (0,scalar(@residues-1),1,sub{$i <= $end});
297 for ($i=$start; $test->(); $i+= $inc) {
298 if ($residues[$i] ne '.' and $residues[$i] ne '-') {
299 $count == $resnumber and last;
303 # $i now holds the index of the column.
304 # The actual column number is this index + 1
309 $self->throw("Could not find residue number $resnumber");
314 =head2 location_from_column
316 Title : location_from_column
317 Usage : $loc = $ali->location_from_column($column_number)
320 This function gives the residue number for a given position
321 in the alignment (i.e. column number) of the given. Gaps
322 complicate this process and force the output to be a
323 L<Bio::Range> where values can be undefined. For example,
326 Seq/91-97 .AC..DEF.G.
328 location_from_column( 3 ) position 93
329 location_from_column( 2 ) position 92^93
330 location_from_column(10 ) position 97^98
331 location_from_column( 1 ) position undef
333 An exact position returns a Bio::Location::Simple object
334 where where location_type() returns 'EXACT', if a position
335 is between bases location_type() returns 'IN-BETWEEN'.
336 Column before the first residue returns undef. Note that if
337 the position is after the last residue in the alignment,
338 that there is no guarantee that the original sequence has
339 residues after that position.
341 An exception is thrown if the column number is not within
344 Returns : Bio::Location::Simple or undef
345 Args : A column number
346 Throws : If column is not within the sequence
348 See L<Bio::Location::Simple> for more.
352 sub location_from_column
{
353 my ($self, $column) = @_;
355 $self->throw("Column number has to be a positive integer, not [$column]")
356 unless $column =~ /^\d+$/ and $column > 0;
357 $self->throw("Column number [$column] is larger than".
358 " sequence length [". $self->length. "]")
359 unless $column <= $self->length;
362 my $s = $self->subseq(1,$column);
363 $s =~ s/[^a-zA-Z\*]//g;
365 my $pos = CORE
::length $s;
367 my $start = $self->start || 0 ;
368 my $strand = $self->strand() || 1;
369 my $relative_pos = ($strand == -1)
370 ?
($self->end - $pos + 1)
371 : ($pos + $start - 1);
372 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
373 $loc = Bio
::Location
::Simple
->new
374 (-start
=> $relative_pos,
375 -end
=> $relative_pos,
378 } elsif ($pos == 0 and $self->start == 1) {
380 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
382 ($start,$end) = ($end,$start);
384 $loc = Bio
::Location
::Simple
->new
388 -location_type
=> 'IN-BETWEEN'
397 Usage : $rev = $seq->revcom()
398 Function: Produces a new Bio::LocatableSeq object which
399 has the reversed complement of the sequence. For protein
400 sequences this throws an exception of "Sequence is a
401 protein. Cannot revcom"
403 Returns : A new Bio::LocatableSeq object
411 my $new = $self->SUPER::revcom
;
412 $new->strand($self->strand * -1) if $self->strand;
413 $new->start($self->start) if $self->start;
414 $new->end($self->end) if $self->end;
422 Usage : $subseq = $myseq->trunc(10,100);
423 Function: Provides a truncation of a sequence,
426 Returns : a fresh Bio::PrimarySeqI implementing object
427 Args : Two integers denoting first and last columns of the
428 sequence to be included into sub-sequence.
434 my ($self, $start, $end) = @_;
435 my $new = $self->SUPER::trunc
($start, $end);
436 $new->strand($self->strand);
438 # end will be automatically calculated
439 $start = $end if $self->strand == -1;
441 $start = $self->location_from_column($start);
442 $start ?
($start = $start->end) : ($start = 1);
443 $new->start($start) if $start;
451 Usage : if(! $seq->validate_seq($seq_str) ) {
452 print "sequence $seq_str is not valid for an object of
453 alphabet ",$seq->alphabet, "\n";
455 Function: Validates a given sequence string. A validating sequence string
456 must be accepted by seq(). A string that does not validate will
457 lead to an exception if passed to seq().
459 The implementation provided here does not take alphabet() into
460 account. Allowed are all letters (A-Z), numbers [0-9]
461 and '-','.','*','?','=',and '~'.
464 Returns : 1 if the supplied sequence string is valid for the object, and
466 Args : The sequence string to be validated.
472 my ($self,$seqstr) = @_;
473 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
474 return 0 unless( defined $seqstr);
476 if((CORE
::length($seqstr) > 0) &&
477 ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
478 $self->warn("seq doesn't validate, mismatch is " .
479 join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));