3 # BioPerl module for Bio::LocatableSeq
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
16 that can be projected into a MSA or have coordinates relative to
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
31 # a normal sequence object
35 # has start,end points
39 # inherits off RangeI, so range operations possible
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 The locatable sequence object was developed mainly because the
54 SimpleAlign object requires this functionality, and in the rewrite
55 of the Sequence object we had to decide what to do with this.
57 It is, to be honest, not well integrated with the rest of bioperl, for
58 example, the trunc() function does not return a LocatableSeq object,
59 as some might have thought. There are all sorts of nasty gotcha's
60 about interactions between coordinate systems when these sort of
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 the bugs and their resolution. Bug reports can be submitted via the
70 http://bugzilla.open-bio.org/
75 The rest of the documentation details each of the object
76 methods. Internal methods are usually preceded with a _
81 # Let the code begin...
83 package Bio
::LocatableSeq
;
86 use Bio
::Location
::Simple
;
87 use Bio
::Location
::Fuzzy
;
88 use vars
qw($GAP_SYMBOLS $OTHER_SYMBOLS $MATCHPATTERN);
90 # should we change these to non-globals? (I can see this
91 # causing problems down the road...) - cjfields
93 $GAP_SYMBOLS = '\-\.=~';
95 $OTHER_SYMBOLS = '\*\?';
97 $MATCHPATTERN = '0-9A-Za-z'.$GAP_SYMBOLS.$OTHER_SYMBOLS;
99 use base qw(Bio::PrimarySeq Bio::RangeI);
102 my ($class, @args) = @_;
103 my $self = $class->SUPER::new
(@args);
105 my ($start,$end,$strand, $mapping) =
106 $self->_rearrange( [qw(START END STRAND MAPPING)],
110 $self->mapping($mapping);
111 defined $start && $self->start($start);
112 defined $end && $self->end($end);
113 defined $strand && $self->strand($strand);
115 return $self; # success - we hope!
121 Usage : $obj->start($newval)
122 Function: Get/set the 1-based start position of this sequence in the original
123 sequence. '0' means before the original sequence starts.
124 Returns : value of start
125 Args : newvalue (optional)
133 $self->{'start'} = $value;
135 return $self->{'start'} if defined $self->{'start'};
136 return 1 if $self->seq;
143 Usage : $obj->end($newval)
144 Function: Get/set the 1-based end position of this sequence in the original
145 sequence. '0' means before the original sequence starts.
146 Returns : value of end
147 Args : newvalue (optional)
155 my $st = $self->start;
156 # start of 0 usually means the sequence is all gaps but maps to
157 # other sequences in an alignment
158 if ($self->seq && $st != 0 ) {
159 my $len = $self->_ungapped_len;
160 my $calend = $len + $st - 1;
162 if ($calend != $value) {
163 $self->warn("In sequence $id residue count gives end value ".
164 "$calend. \nOverriding value [$value] with value $calend for ".
165 "Bio::LocatableSeq::end().\n".$self->seq);
170 $self->{'end'} = $value;
173 if (defined $self->{'end'}) {
174 return $self->{'end'}
175 } elsif (my $len = $self->_ungapped_len) {
176 return $self->{'end'} = $len + $self->start - 1;
182 # changed 08.10.26 to return ungapped length, not the calculated end
186 return unless my $string = $self->seq;
187 my ($map_res, $map_coord) = $self->mapping;
188 $string =~ s/[$GAP_SYMBOLS]+//g;
189 return CORE
::length($string)/($map_res/$map_coord);
195 Usage : $obj->strand($newval)
196 Function: return or set the strandedness
197 Returns : the value of the strandedness (-1, 0 or 1)
198 Args : the value of the strandedness (-1, 0 or 1)
206 $self->{'strand'} = $value;
208 return $self->{'strand'};
214 Usage : $obj->mapping($newval)
215 Function: return or set the mapping indices (indicates # residues mapped to
217 Returns : two-element array (# res, # pos)
218 Args : two elements (# res, # pos); this can also be passed in as an
219 array reference of the two elements (as might be passed
220 upon Bio::LocatableSeq instantiation, for instance).
227 my @mapping = (ref($_[0]) eq 'ARRAY') ? @
{$_[0]} : @_;
228 $self->throw("Must pass two values (# residues mapped to # positions)")
229 unless @mapping == 2;
230 $self->{'_mapping'} = \
@mapping;
232 $self->throw('Arggh!!!') if !exists $self->{'_mapping'};
233 return @
{ $self->{'_mapping'} };
240 Function: read-only name of form id/start-end
248 my ($self,$char1,$char2) = @_;
253 $self->throw("Attribute id not set") unless defined($self->id());
254 $self->throw("Attribute start not set") unless defined($self->start());
255 $self->throw("Attribute end not set") unless defined($self->end());
257 return $self->id() . $char1 . $self->start . $char2 . $self->end ;
265 Usage :$self->no_gaps('.')
268 Gets number of gaps in the sequence. The count excludes
269 leading or trailing gap characters.
271 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
272 these, '.' and '-' are counted as gap characters unless an
273 optional argument specifies one of them.
275 Returns : number of internal gaps in the sequnce.
276 Args : a gap character (optional)
281 my ($self,$char) = @_;
282 my ($seq, $count) = (undef, 0);
284 # default gap characters
285 $char ||= $GAP_SYMBOLS;
287 $self->warn("I hope you know what you are doing setting gap to [$char]")
288 unless $char =~ /[-.]/;
291 return 0 unless $seq; # empty sequence does not have gaps
293 $seq =~ s/^([$char]+)//;
294 $seq =~ s/([$char]+)$//;
295 $count++ while $seq =~ /[$char]+/g;
302 =head2 column_from_residue_number
304 Title : column_from_residue_number
305 Usage : $col = $seq->column_from_residue_number($resnumber)
308 This function gives the position in the alignment
309 (i.e. column number) of the given residue number in the
310 sequence. For example, for the sequence
312 Seq1/91-97 AC..DEF.GH
314 column_from_residue_number(94) returns 6.
316 An exception is thrown if the residue number would lie
317 outside the length of the aligment
318 (e.g. column_from_residue_number( "Seq2", 22 )
320 Returns : A column number for the position of the
321 given residue in the given sequence (1 = first column)
322 Args : A residue number in the whole sequence (not just that
323 segment of it in the alignment)
327 sub column_from_residue_number
{
328 my ($self, $resnumber) = @_;
330 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
331 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
333 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
334 my @residues = split //, $self->seq;
335 my $count = $self->start();
337 my ($start,$end,$inc,$test);
338 my $strand = $self->strand || 0;
339 # the following bit of "magic" allows the main loop logic to be the
340 # same regardless of the strand of the sequence
341 ($start,$end,$inc,$test)= ($strand == -1)?
342 (scalar(@residues-1),0,-1,sub{$i >= $end}) :
343 (0,scalar(@residues-1),1,sub{$i <= $end});
345 for ($i=$start; $test->(); $i+= $inc) {
346 if ($residues[$i] ne '.' and $residues[$i] ne '-') {
347 $count == $resnumber and last;
351 # $i now holds the index of the column.
352 # The actual column number is this index + 1
357 $self->throw("Could not find residue number $resnumber");
361 =head2 location_from_column
363 Title : location_from_column
364 Usage : $loc = $ali->location_from_column($column_number)
367 This function gives the residue number for a given position
368 in the alignment (i.e. column number) of the given. Gaps
369 complicate this process and force the output to be a
370 L<Bio::Range> where values can be undefined. For example,
373 Seq/91-96 .AC..DEF.G.
375 location_from_column( 3 ) position 92
376 location_from_column( 4 ) position 92^93
377 location_from_column( 9 ) position 95^96
378 location_from_column( 1 ) position undef
380 An exact position returns a Bio::Location::Simple object
381 where where location_type() returns 'EXACT', if a position
382 is between bases location_type() returns 'IN-BETWEEN'.
383 Column before the first residue returns undef. Note that if
384 the position is after the last residue in the alignment,
385 that there is no guarantee that the original sequence has
386 residues after that position.
388 An exception is thrown if the column number is not within
391 Returns : Bio::Location::Simple or undef
392 Args : A column number
393 Throws : If column is not within the sequence
395 See L<Bio::Location::Simple> for more.
399 sub location_from_column
{
400 my ($self, $column) = @_;
402 $self->throw("Column number has to be a positive integer, not [$column]")
403 unless $column =~ /^\d+$/ and $column > 0;
404 $self->throw("Column number [$column] is larger than".
405 " sequence length [". $self->length. "]")
406 unless $column <= $self->length;
409 my $s = $self->subseq(1,$column);
410 $s =~ s/[^a-zA-Z\*]//g;
412 my $pos = CORE
::length $s;
414 my $start = $self->start || 0 ;
415 my $strand = $self->strand() || 1;
416 my $relative_pos = ($strand == -1)
417 ?
($self->end - $pos + 1)
418 : ($pos + $start - 1);
419 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
420 $loc = Bio
::Location
::Simple
->new
421 (-start
=> $relative_pos,
422 -end
=> $relative_pos,
425 } elsif ($pos == 0 and $self->start == 1) {
427 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
429 ($start,$end) = ($end,$start);
431 $loc = Bio
::Location
::Simple
->new
435 -location_type
=> 'IN-BETWEEN'
444 Usage : $rev = $seq->revcom()
445 Function: Produces a new Bio::LocatableSeq object which
446 has the reversed complement of the sequence. For protein
447 sequences this throws an exception of "Sequence is a
448 protein. Cannot revcom"
450 Returns : A new Bio::LocatableSeq object
458 my $new = $self->SUPER::revcom
;
459 $new->strand($self->strand * -1) if $self->strand;
460 $new->start($self->start) if $self->start;
461 $new->end($self->end) if $self->end;
469 Usage : $subseq = $myseq->trunc(10,100);
470 Function: Provides a truncation of a sequence,
473 Returns : a fresh Bio::PrimarySeqI implementing object
474 Args : Two integers denoting first and last columns of the
475 sequence to be included into sub-sequence.
481 my ($self, $start, $end) = @_;
482 my $new = $self->SUPER::trunc
($start, $end);
483 $new->strand($self->strand);
485 # end will be automatically calculated
486 $start = $end if $self->strand == -1;
488 $start = $self->location_from_column($start);
489 $start ?
($start = $start->end) : ($start = 1);
490 $new->start($start) if $start;
498 Usage : if(! $seq->validate_seq($seq_str) ) {
499 print "sequence $seq_str is not valid for an object of
500 alphabet ",$seq->alphabet, "\n";
502 Function: Validates a given sequence string. A validating sequence string
503 must be accepted by seq(). A string that does not validate will
504 lead to an exception if passed to seq().
506 The implementation provided here does not take alphabet() into
507 account. Allowed are all letters (A-Z), numbers [0-9]
508 and '-','.','*','?','=',and '~'.
511 Returns : 1 if the supplied sequence string is valid for the object, and
513 Args : The sequence string to be validated.
519 my ($self,$seqstr) = @_;
520 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
521 return 0 unless( defined $seqstr);
523 if((CORE
::length($seqstr) > 0) &&
524 ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
525 $self->warn("seq doesn't validate, mismatch is " .
526 join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));