* seq_inds is not defined for Model-based HSPs
[bioperl-live.git] / Bio / LocatableSeq.pm
blobd3e680f377a80dc07066d4a63d427c1b264b9cbb
1 # $Id$
3 # BioPerl module for Bio::LocatableSeq
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
16 that can be projected into a MSA or have coordinates relative to
17 another seq.
19 =head1 SYNOPSIS
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
24 -id => "seq1",
25 -start => 1,
26 -end => 7);
29 =head1 DESCRIPTION
31 # a normal sequence object
32 $locseq->seq();
33 $locseq->id();
35 # has start,end points
36 $locseq->start();
37 $locseq->end();
39 # inherits off RangeI, so range operations possible
41 =head1 FEEDBACK
44 =head2 Mailing Lists
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 The locatable sequence object was developed mainly because the
54 SimpleAlign object requires this functionality, and in the rewrite
55 of the Sequence object we had to decide what to do with this.
57 It is, to be honest, not well integrated with the rest of bioperl, for
58 example, the trunc() function does not return a LocatableSeq object,
59 as some might have thought. There are all sorts of nasty gotcha's
60 about interactions between coordinate systems when these sort of
61 objects are used.
64 =head2 Reporting Bugs
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 the bugs and their resolution. Bug reports can be submitted via the
68 web:
70 http://bugzilla.open-bio.org/
73 =head1 APPENDIX
75 The rest of the documentation details each of the object
76 methods. Internal methods are usually preceded with a _
78 =cut
81 # Let the code begin...
83 package Bio::LocatableSeq;
84 use strict;
86 use Bio::Location::Simple;
87 use Bio::Location::Fuzzy;
88 use vars qw($GAP_SYMBOLS $OTHER_SYMBOLS $MATCHPATTERN);
90 # should we change these to non-globals? (I can see this
91 # causing problems down the road...) - cjfields
93 $GAP_SYMBOLS = '\-\.=~';
95 $OTHER_SYMBOLS = '\*\?';
97 $MATCHPATTERN = '0-9A-Za-z'.$GAP_SYMBOLS.$OTHER_SYMBOLS;
99 use base qw(Bio::PrimarySeq Bio::RangeI);
101 sub new {
102 my ($class, @args) = @_;
103 my $self = $class->SUPER::new(@args);
105 my ($start,$end,$strand, $mapping) =
106 $self->_rearrange( [qw(START END STRAND MAPPING)],
107 @args);
109 $mapping ||= [1,1];
110 $self->mapping($mapping);
111 defined $start && $self->start($start);
112 defined $end && $self->end($end);
113 defined $strand && $self->strand($strand);
115 return $self; # success - we hope!
118 =head2 start
120 Title : start
121 Usage : $obj->start($newval)
122 Function: Get/set the 1-based start position of this sequence in the original
123 sequence. '0' means before the original sequence starts.
124 Returns : value of start
125 Args : newvalue (optional)
127 =cut
129 sub start{
130 my $self = shift;
131 if( @_ ) {
132 my $value = shift;
133 $self->{'start'} = $value;
135 return $self->{'start'} if defined $self->{'start'};
136 return 1 if $self->seq;
137 return;
140 =head2 end
142 Title : end
143 Usage : $obj->end($newval)
144 Function: Get/set the 1-based end position of this sequence in the original
145 sequence. '0' means before the original sequence starts.
146 Returns : value of end
147 Args : newvalue (optional)
149 =cut
151 sub end {
152 my $self = shift;
153 if( @_ ) {
154 my $value = shift;
155 my $st = $self->start;
156 # start of 0 usually means the sequence is all gaps but maps to
157 # other sequences in an alignment
158 if ($self->seq && $st != 0 ) {
159 my $len = $self->_ungapped_len;
160 my $calend = $len + $st - 1;
161 my $id = $self->id;
162 if ($calend != $value) {
163 $self->warn("In sequence $id residue count gives end value ".
164 "$calend. \nOverriding value [$value] with value $calend for ".
165 "Bio::LocatableSeq::end().\n".$self->seq);
166 $value = $calend;
170 $self->{'end'} = $value;
173 if (defined $self->{'end'}) {
174 return $self->{'end'}
175 } elsif (my $len = $self->_ungapped_len) {
176 return $self->{'end'} = $len + $self->start - 1;
177 } else {
178 return
182 # changed 08.10.26 to return ungapped length, not the calculated end
183 # of the sequence
184 sub _ungapped_len {
185 my $self = shift;
186 return unless my $string = $self->seq;
187 my ($map_res, $map_coord) = $self->mapping;
188 $string =~ s/[$GAP_SYMBOLS]+//g;
189 return CORE::length($string)/($map_res/$map_coord);
192 =head2 strand
194 Title : strand
195 Usage : $obj->strand($newval)
196 Function: return or set the strandedness
197 Returns : the value of the strandedness (-1, 0 or 1)
198 Args : the value of the strandedness (-1, 0 or 1)
200 =cut
202 sub strand{
203 my $self = shift;
204 if( @_ ) {
205 my $value = shift;
206 $self->{'strand'} = $value;
208 return $self->{'strand'};
211 =head2 mapping
213 Title : mapping
214 Usage : $obj->mapping($newval)
215 Function: return or set the mapping indices (indicates # residues mapped to
216 # of positions)
217 Returns : two-element array (# res, # pos)
218 Args : two elements (# res, # pos); this can also be passed in as an
219 array reference of the two elements (as might be passed
220 upon Bio::LocatableSeq instantiation, for instance).
222 =cut
224 sub mapping {
225 my $self = shift;
226 if( @_ ) {
227 my @mapping = (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_;
228 $self->throw("Must pass two values (# residues mapped to # positions)")
229 unless @mapping == 2;
230 $self->{'_mapping'} = \@mapping;
232 $self->throw('Arggh!!!') if !exists $self->{'_mapping'};
233 return @{ $self->{'_mapping'} };
236 =head2 get_nse
238 Title : get_nse
239 Usage :
240 Function: read-only name of form id/start-end
241 Example :
242 Returns :
243 Args :
245 =cut
247 sub get_nse{
248 my ($self,$char1,$char2) = @_;
250 $char1 ||= "/";
251 $char2 ||= "-";
253 $self->throw("Attribute id not set") unless defined($self->id());
254 $self->throw("Attribute start not set") unless defined($self->start());
255 $self->throw("Attribute end not set") unless defined($self->end());
257 return $self->id() . $char1 . $self->start . $char2 . $self->end ;
262 =head2 no_gap
264 Title : no_gaps
265 Usage :$self->no_gaps('.')
266 Function:
268 Gets number of gaps in the sequence. The count excludes
269 leading or trailing gap characters.
271 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
272 these, '.' and '-' are counted as gap characters unless an
273 optional argument specifies one of them.
275 Returns : number of internal gaps in the sequnce.
276 Args : a gap character (optional)
278 =cut
280 sub no_gaps {
281 my ($self,$char) = @_;
282 my ($seq, $count) = (undef, 0);
284 # default gap characters
285 $char ||= $GAP_SYMBOLS;
287 $self->warn("I hope you know what you are doing setting gap to [$char]")
288 unless $char =~ /[-.]/;
290 $seq = $self->seq;
291 return 0 unless $seq; # empty sequence does not have gaps
293 $seq =~ s/^([$char]+)//;
294 $seq =~ s/([$char]+)$//;
295 $count++ while $seq =~ /[$char]+/g;
297 return $count;
302 =head2 column_from_residue_number
304 Title : column_from_residue_number
305 Usage : $col = $seq->column_from_residue_number($resnumber)
306 Function:
308 This function gives the position in the alignment
309 (i.e. column number) of the given residue number in the
310 sequence. For example, for the sequence
312 Seq1/91-97 AC..DEF.GH
314 column_from_residue_number(94) returns 6.
316 An exception is thrown if the residue number would lie
317 outside the length of the aligment
318 (e.g. column_from_residue_number( "Seq2", 22 )
320 Returns : A column number for the position of the
321 given residue in the given sequence (1 = first column)
322 Args : A residue number in the whole sequence (not just that
323 segment of it in the alignment)
325 =cut
327 sub column_from_residue_number {
328 my ($self, $resnumber) = @_;
330 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
331 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
333 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
334 my @residues = split //, $self->seq;
335 my $count = $self->start();
336 my $i;
337 my ($start,$end,$inc,$test);
338 my $strand = $self->strand || 0;
339 # the following bit of "magic" allows the main loop logic to be the
340 # same regardless of the strand of the sequence
341 ($start,$end,$inc,$test)= ($strand == -1)?
342 (scalar(@residues-1),0,-1,sub{$i >= $end}) :
343 (0,scalar(@residues-1),1,sub{$i <= $end});
345 for ($i=$start; $test->(); $i+= $inc) {
346 if ($residues[$i] ne '.' and $residues[$i] ne '-') {
347 $count == $resnumber and last;
348 $count++;
351 # $i now holds the index of the column.
352 # The actual column number is this index + 1
354 return $i+1;
357 $self->throw("Could not find residue number $resnumber");
361 =head2 location_from_column
363 Title : location_from_column
364 Usage : $loc = $ali->location_from_column($column_number)
365 Function:
367 This function gives the residue number for a given position
368 in the alignment (i.e. column number) of the given. Gaps
369 complicate this process and force the output to be a
370 L<Bio::Range> where values can be undefined. For example,
371 for the sequence:
373 Seq/91-96 .AC..DEF.G.
375 location_from_column( 3 ) position 92
376 location_from_column( 4 ) position 92^93
377 location_from_column( 9 ) position 95^96
378 location_from_column( 1 ) position undef
380 An exact position returns a Bio::Location::Simple object
381 where where location_type() returns 'EXACT', if a position
382 is between bases location_type() returns 'IN-BETWEEN'.
383 Column before the first residue returns undef. Note that if
384 the position is after the last residue in the alignment,
385 that there is no guarantee that the original sequence has
386 residues after that position.
388 An exception is thrown if the column number is not within
389 the sequence.
391 Returns : Bio::Location::Simple or undef
392 Args : A column number
393 Throws : If column is not within the sequence
395 See L<Bio::Location::Simple> for more.
397 =cut
399 sub location_from_column {
400 my ($self, $column) = @_;
402 $self->throw("Column number has to be a positive integer, not [$column]")
403 unless $column =~ /^\d+$/ and $column > 0;
404 $self->throw("Column number [$column] is larger than".
405 " sequence length [". $self->length. "]")
406 unless $column <= $self->length;
408 my ($loc);
409 my $s = $self->subseq(1,$column);
410 $s =~ s/[^a-zA-Z\*]//g;
412 my $pos = CORE::length $s;
414 my $start = $self->start || 0 ;
415 my $strand = $self->strand() || 1;
416 my $relative_pos = ($strand == -1)
417 ? ($self->end - $pos + 1)
418 : ($pos + $start - 1);
419 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
420 $loc = Bio::Location::Simple->new
421 (-start => $relative_pos,
422 -end => $relative_pos,
423 -strand => 1,
425 } elsif ($pos == 0 and $self->start == 1) {
426 } else {
427 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
428 if ($strand == -1) {
429 ($start,$end) = ($end,$start);
431 $loc = Bio::Location::Simple->new
432 (-start => $start,
433 -end => $end,
434 -strand => 1,
435 -location_type => 'IN-BETWEEN'
438 return $loc;
441 =head2 revcom
443 Title : revcom
444 Usage : $rev = $seq->revcom()
445 Function: Produces a new Bio::LocatableSeq object which
446 has the reversed complement of the sequence. For protein
447 sequences this throws an exception of "Sequence is a
448 protein. Cannot revcom"
450 Returns : A new Bio::LocatableSeq object
451 Args : none
453 =cut
455 sub revcom {
456 my ($self) = @_;
458 my $new = $self->SUPER::revcom;
459 $new->strand($self->strand * -1) if $self->strand;
460 $new->start($self->start) if $self->start;
461 $new->end($self->end) if $self->end;
462 return $new;
466 =head2 trunc
468 Title : trunc
469 Usage : $subseq = $myseq->trunc(10,100);
470 Function: Provides a truncation of a sequence,
472 Example :
473 Returns : a fresh Bio::PrimarySeqI implementing object
474 Args : Two integers denoting first and last columns of the
475 sequence to be included into sub-sequence.
478 =cut
480 sub trunc {
481 my ($self, $start, $end) = @_;
482 my $new = $self->SUPER::trunc($start, $end);
483 $new->strand($self->strand);
485 # end will be automatically calculated
486 $start = $end if $self->strand == -1;
488 $start = $self->location_from_column($start);
489 $start ? ($start = $start->end) : ($start = 1);
490 $new->start($start) if $start;
492 return $new;
495 =head2 validate_seq
497 Title : validate_seq
498 Usage : if(! $seq->validate_seq($seq_str) ) {
499 print "sequence $seq_str is not valid for an object of
500 alphabet ",$seq->alphabet, "\n";
502 Function: Validates a given sequence string. A validating sequence string
503 must be accepted by seq(). A string that does not validate will
504 lead to an exception if passed to seq().
506 The implementation provided here does not take alphabet() into
507 account. Allowed are all letters (A-Z), numbers [0-9]
508 and '-','.','*','?','=',and '~'.
510 Example :
511 Returns : 1 if the supplied sequence string is valid for the object, and
512 0 otherwise.
513 Args : The sequence string to be validated.
516 =cut
518 sub validate_seq {
519 my ($self,$seqstr) = @_;
520 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
521 return 0 unless( defined $seqstr);
523 if((CORE::length($seqstr) > 0) &&
524 ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
525 $self->warn("seq doesn't validate, mismatch is " .
526 join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
527 return 0;
529 return 1;