[bug 2714]
[bioperl-live.git] / Bio / LocatableSeq.pm
blob3f9101b50cc38c3c55e4b5e7c4472b16a842c553
1 # $Id$
3 # BioPerl module for Bio::LocatableSeq
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
16 that can be projected into a MSA or have coordinates relative to
17 another seq.
19 =head1 SYNOPSIS
21 use Bio::LocatableSeq;
22 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
23 -id => "seq1",
24 -start => 1,
25 -end => 7);
27 # a normal sequence object
28 $locseq->seq();
29 $locseq->id();
31 # has start,end points
32 $locseq->start();
33 $locseq->end();
35 # inherits off RangeI, so range operations possible
37 =head1 DESCRIPTION
39 The LocatableSeq sequence object was developed mainly because the SimpleAlign
40 object requires this functionality, and in the rewrite of the Sequence object we
41 had to decide what to do with this.
43 It is, to be honest, not well integrated with the rest of bioperl. For example,
44 the trunc() function does not return a LocatableSeq object, as some might have
45 thought. Also, the sequence is not a Bio::SeqI, so the location is simply
46 inherited from Bio::RangeI and is not stored in a Bio::Location.
48 There are all sorts of nasty gotcha's about interactions between coordinate
49 systems when these sort of objects are used. Some mapping now occurs to deal
50 with HSP data, however it can probably be integrated in better and most methods
51 do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
52 trunc(), etc.) do not behave as expected and must be used with care.
54 Due to this, LocatableSeq functionality is to be refactored in a future BioPerl
55 release. However, for alignment functionality it works adequately for the time
56 being
58 =head1 FEEDBACK
60 =head2 Mailing Lists
62 User feedback is an integral part of the evolution of this and other
63 Bioperl modules. Send your comments and suggestions preferably to one
64 of the Bioperl mailing lists. Your participation is much appreciated.
66 bioperl-l@bioperl.org - General discussion
67 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
69 =head2 Reporting Bugs
71 Report bugs to the Bioperl bug tracking system to help us keep track
72 the bugs and their resolution. Bug reports can be submitted via the
73 web:
75 http://bugzilla.open-bio.org/
77 =head1 APPENDIX
79 The rest of the documentation details each of the object
80 methods. Internal methods are usually preceded with a _
82 =cut
85 # Let the code begin...
87 package Bio::LocatableSeq;
88 use strict;
90 use Bio::Location::Simple;
91 use Bio::Location::Fuzzy;
92 use vars qw($GAP_SYMBOLS $OTHER_SYMBOLS $FRAMESHIFT_SYMBOLS $RESIDUE_SYMBOLS $MATCHPATTERN);
94 # The following global variables contain symbols used to represent gaps,
95 # frameshifts, residues, and other valid symbols. These are set at compile-time;
96 # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see
97 # LocatableSeq.t)
99 $GAP_SYMBOLS = '\-\.=~';
100 $FRAMESHIFT_SYMBOLS = '\\\/';
101 $OTHER_SYMBOLS = '\?';
102 $RESIDUE_SYMBOLS = '0-9A-Za-z\*';
103 $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS;
105 use base qw(Bio::PrimarySeq Bio::RangeI);
107 sub new {
108 my ($class, @args) = @_;
109 my $self = $class->SUPER::new(@args);
111 my ($start,$end,$strand, $mapping, $fs, $nse) =
112 $self->_rearrange( [qw(START
114 STRAND
115 MAPPING
116 FRAMESHIFTS
117 FORCE_NSE
119 @args);
121 $mapping ||= [1,1];
122 $self->mapping($mapping);
123 $nse || 0;
124 $self->force_nse($nse);
125 defined $fs && $self->frameshifts($fs);
126 defined $start && $self->start($start);
127 defined $end && $self->end($end);
128 defined $strand && $self->strand($strand);
130 return $self; # success - we hope!
133 =head2 start
135 Title : start
136 Usage : $obj->start($newval)
137 Function: Get/set the 1-based start position of this sequence in the original
138 sequence. '0' means before the original sequence starts.
139 Returns : value of start
140 Args : newvalue (optional)
142 =cut
144 sub start{
145 my $self = shift;
146 if( @_ ) {
147 my $value = shift;
148 $self->{'start'} = $value;
150 return $self->{'start'} if defined $self->{'start'};
151 return 1 if $self->seq;
152 return;
155 =head2 end
157 Title : end
158 Usage : $obj->end($newval)
159 Function: Get/set the 1-based end position of this sequence in the original
160 sequence. '0' means before the original sequence starts.
161 Returns : value of end
162 Args : newvalue (optional)
163 Note : although this is a get/set, it checks passed values against the
164 calculated end point ( derived from the sequence and based on
165 $GAP_SYMBOLS and possible frameshifts() ). If there is no match,
166 it will warn and set the proper value. Probably best used for
167 debugging proper sequence calculations.
169 =cut
171 sub end {
172 my $self = shift;
173 if( @_ ) {
174 my $value = shift;
175 my $st = $self->start;
176 # start of 0 usually means the sequence is all gaps but maps to
177 # other sequences in an alignment
178 if ($self->seq && $st != 0 ) {
179 my $len = $self->_ungapped_len;
180 my $calend = $st + $len - 1;
181 my $id = $self->id || 'unknown';
182 if ($calend != $value) {
183 $self->warn("In sequence $id residue count gives end value ".
184 "$calend. \nOverriding value [$value] with value $calend for ".
185 "Bio::LocatableSeq::end().\n".$self->seq);
186 $value = $calend;
189 $self->{'end'} = $value;
192 if (defined $self->{'end'}) {
193 return $self->{'end'}
194 } elsif ( my $len = $self->_ungapped_len) {
195 return $len + $self->start - 1;
196 } else {
197 return;
201 # changed 08.10.26 to return ungapped length, not the calculated end
202 # of the sequence
203 sub _ungapped_len {
204 my $self = shift;
205 return unless my $string = $self->seq;
206 my ($map_res, $map_coord) = $self->mapping;
207 my $offset = 0;
208 if (my %data = $self->frameshifts) {
209 map {$offset += $_} values %data;
211 $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
212 return CORE::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
215 =head2 strand
217 Title : strand
218 Usage : $obj->strand($newval)
219 Function: return or set the strandedness
220 Returns : the value of the strandedness (-1, 0 or 1)
221 Args : the value of the strandedness (-1, 0 or 1)
223 =cut
225 sub strand{
226 my $self = shift;
227 if( @_ ) {
228 my $value = shift;
229 $self->{'strand'} = $value;
231 return $self->{'strand'};
234 =head2 mapping
236 Title : mapping
237 Usage : $obj->mapping($newval)
238 Function: return or set the mapping indices (indicates # symbols/positions in
239 the source string mapping to # of coordinate positions)
240 Returns : two-element array (# symbols => # coordinate pos)
241 Args : two elements (# symbols => # coordinate pos); this can also be
242 passed in as an array reference of the two elements (as might be
243 passed upon Bio::LocatableSeq instantiation, for instance).
245 =cut
247 sub mapping {
248 my $self = shift;
249 if( @_ ) {
250 my @mapping = (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_;
251 $self->throw("Must pass two values (# residues mapped to # positions)")
252 if @mapping != 2;
253 if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
254 $self->throw("Mapping values other than 1 or 3 are not currently supported")
256 $self->{'_mapping'} = \@mapping;
258 $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
259 return @{ $self->{'_mapping'} };
262 =head2 frameshifts
264 Title : frameshifts
265 Usage : $obj->frameshifts($newval)
266 Function: get/set the frameshift hash, which contains sequence positions as
267 keys and the shift (-2, -1, 1, 2) as the value
268 Returns : hash
269 Args : hash or hash reference
271 =cut
273 sub frameshifts {
274 my $self = shift;
275 if( @_ ) {
276 if (ref $_[0] eq 'HASH') {
277 $self->{_frameshifts} = $_[0];
278 } else {
279 # assume this is a full list to be converted to a hash
280 $self->{_frameshifts} = \%{@_} # coerce into hash ref
283 (defined $self->{_frameshifts} && ref $self->{_frameshifts} eq 'HASH') ?
284 return %{$self->{_frameshifts}} : return ();
287 =head2 get_nse
289 Title : get_nse
290 Usage :
291 Function: read-only name of form id/start-end
292 Example :
293 Returns :
294 Args :
296 =cut
298 sub get_nse{
299 my ($self,$char1,$char2) = @_;
301 $char1 ||= "/";
302 $char2 ||= "-";
304 my ($id, $st, $end) = ($self->id(), $self->start(), $self->end());
306 if ($self->force_nse) {
307 $id ||= '';
308 $st ||= 0;
309 $end ||= 0;
312 $self->throw("Attribute id not set") unless defined($id);
313 $self->throw("Attribute start not set") unless defined($st);
314 $self->throw("Attribute end not set") unless defined($end);
316 #Stockholm Rfam includes version if present so it is optional
317 my $v = $self->version ? '.'.$self->version : '';
318 return $id . $v. $char1 . $st . $char2 . $end ;
321 =head2 force_nse
323 Title : force_nse
324 Usage : $ls->force_nse()
325 Function: Boolean which forces get_nse() to build an NSE, regardless
326 of whether id(), start(), or end() is set
327 Returns : Boolean value
328 Args : (optional) Boolean (1 or 0)
329 Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
330 respectively
332 =cut
334 sub force_nse {
335 my ($self, $flag) = @_;
336 if (defined $flag) {
337 $flag ? (return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
339 return $self->{'_force_nse'};
342 =head2 no_gap
344 Title : no_gaps
345 Usage :$self->no_gaps('.')
346 Function:Gets number of gaps in the sequence. The count excludes
347 leading or trailing gap characters.
349 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
350 these, '.' and '-' are counted as gap characters unless an
351 optional argument specifies one of them.
353 Returns : number of internal gaps in the sequence.
354 Args : a gap character (optional)
356 =cut
358 sub no_gaps {
359 my ($self,$char) = @_;
360 my ($seq, $count) = (undef, 0);
362 # default gap characters
363 $char ||= $GAP_SYMBOLS;
365 $self->warn("I hope you know what you are doing setting gap to [$char]")
366 unless $char =~ /[$GAP_SYMBOLS]/;
368 $seq = $self->seq;
369 return 0 unless $seq; # empty sequence does not have gaps
371 $seq =~ s/^([$char]+)//;
372 $seq =~ s/([$char]+)$//;
373 while ( $seq =~ /[$char]+/g ) {
374 $count++;
377 return $count;
381 =head2 column_from_residue_number
383 Title : column_from_residue_number
384 Usage : $col = $seq->column_from_residue_number($resnumber)
385 Function:
387 This function gives the position in the alignment
388 (i.e. column number) of the given residue number in the
389 sequence. For example, for the sequence
391 Seq1/91-97 AC..DEF.GH
393 column_from_residue_number(94) returns 6.
395 An exception is thrown if the residue number would lie
396 outside the length of the aligment
397 (e.g. column_from_residue_number( "Seq2", 22 )
399 Returns : A column number for the position of the
400 given residue in the given sequence (1 = first column)
401 Args : A residue number in the whole sequence (not just that
402 segment of it in the alignment)
404 =cut
406 sub column_from_residue_number {
407 my ($self, $resnumber) = @_;
409 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
410 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
412 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
413 my @residues = split //, $self->seq;
414 my $count = $self->start();
415 my $i;
416 my ($start,$end,$inc,$test);
417 my $strand = $self->strand || 0;
418 # the following bit of "magic" allows the main loop logic to be the
419 # same regardless of the strand of the sequence
420 ($start,$end,$inc,$test)= ($strand == -1)?
421 (scalar(@residues-1),0,-1,sub{$i >= $end}) :
422 (0,scalar(@residues-1),1,sub{$i <= $end});
424 for ($i=$start; $test->(); $i+= $inc) {
425 if ($residues[$i] ne '.' and $residues[$i] ne '-') {
426 $count == $resnumber and last;
427 $count++;
430 # $i now holds the index of the column.
431 # The actual column number is this index + 1
433 return $i+1;
436 $self->throw("Could not find residue number $resnumber");
440 =head2 location_from_column
442 Title : location_from_column
443 Usage : $loc = $ali->location_from_column($column_number)
444 Function:
446 This function gives the residue number for a given position
447 in the alignment (i.e. column number) of the given. Gaps
448 complicate this process and force the output to be a
449 L<Bio::Range> where values can be undefined. For example,
450 for the sequence:
452 Seq/91-96 .AC..DEF.G.
454 location_from_column( 3 ) position 92
455 location_from_column( 4 ) position 92^93
456 location_from_column( 9 ) position 95^96
457 location_from_column( 1 ) position undef
459 An exact position returns a Bio::Location::Simple object
460 where where location_type() returns 'EXACT', if a position
461 is between bases location_type() returns 'IN-BETWEEN'.
462 Column before the first residue returns undef. Note that if
463 the position is after the last residue in the alignment,
464 that there is no guarantee that the original sequence has
465 residues after that position.
467 An exception is thrown if the column number is not within
468 the sequence.
470 Returns : Bio::Location::Simple or undef
471 Args : A column number
472 Throws : If column is not within the sequence
474 See L<Bio::Location::Simple> for more.
476 =cut
478 sub location_from_column {
479 my ($self, $column) = @_;
481 $self->throw("Column number has to be a positive integer, not [$column]")
482 unless $column =~ /^\d+$/ and $column > 0;
483 $self->throw("Column number [$column] is larger than".
484 " sequence length [". $self->length. "]")
485 unless $column <= $self->length;
487 my ($loc);
488 my $s = $self->subseq(1,$column);
489 $s =~ s/[^a-zA-Z\*]//g;
491 my $pos = CORE::length $s;
493 my $start = $self->start || 0 ;
494 my $strand = $self->strand() || 1;
495 my $relative_pos = ($strand == -1)
496 ? ($self->end - $pos + 1)
497 : ($pos + $start - 1);
498 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
499 $loc = Bio::Location::Simple->new
500 (-start => $relative_pos,
501 -end => $relative_pos,
502 -strand => 1,
504 } elsif ($pos == 0 and $self->start == 1) {
505 } else {
506 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
507 if ($strand == -1) {
508 ($start,$end) = ($end,$start);
510 $loc = Bio::Location::Simple->new
511 (-start => $start,
512 -end => $end,
513 -strand => 1,
514 -location_type => 'IN-BETWEEN'
517 return $loc;
520 =head2 revcom
522 Title : revcom
523 Usage : $rev = $seq->revcom()
524 Function: Produces a new Bio::LocatableSeq object which
525 has the reversed complement of the sequence. For protein
526 sequences this throws an exception of "Sequence is a
527 protein. Cannot revcom"
529 Returns : A new Bio::LocatableSeq object
530 Args : none
532 =cut
534 sub revcom {
535 my ($self) = @_;
536 # since we don't know whether sequences without 1 => 1 correlation can be
537 # revcom'd, kick back
538 if (grep {$_ != 1} $self->mapping) {
539 $self->warn('revcom() not supported for sequences with mapped values of > 1');
540 return;
542 my $new = $self->SUPER::revcom;
543 $new->strand($self->strand * -1) if $self->strand;
544 $new->start($self->start) if $self->start;
545 $new->end($self->end) if $self->end;
546 return $new;
549 =head2 trunc
551 Title : trunc
552 Usage : $subseq = $myseq->trunc(10,100);
553 Function: Provides a truncation of a sequence,
555 Example :
556 Returns : a fresh Bio::PrimarySeqI implementing object
557 Args : Two integers denoting first and last columns of the
558 sequence to be included into sub-sequence.
561 =cut
563 sub trunc {
564 my ($self, $start, $end) = @_;
565 my $new = $self->SUPER::trunc($start, $end);
566 $new->strand($self->strand);
568 # end will be automatically calculated
569 $start = $end if $self->strand == -1;
571 $start = $self->location_from_column($start);
572 $start ? ($start = $start->end) : ($start = 1);
573 $new->start($start) if $start;
575 return $new;
578 =head2 validate_seq
580 Title : validate_seq
581 Usage : if(! $seq->validate_seq($seq_str) ) {
582 print "sequence $seq_str is not valid for an object of
583 alphabet ",$seq->alphabet, "\n";
585 Function: Validates a given sequence string. A validating sequence string
586 must be accepted by seq(). A string that does not validate will
587 lead to an exception if passed to seq().
589 The implementation provided here does not take alphabet() into
590 account. Allowed are all letters (A-Z), numbers [0-9]
591 and common symbols used for gaps, stop codons, unknown residues,
592 and frameshifts, including '-','.','*','?','=',and '~'.
594 Example :
595 Returns : 1 if the supplied sequence string is valid for the object, and
596 0 otherwise.
597 Args : The sequence string to be validated.
599 =cut
601 sub validate_seq {
602 my ($self,$seqstr) = @_;
603 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
604 return 0 unless( defined $seqstr);
606 if((CORE::length($seqstr) > 0) &&
607 ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
608 $self->warn("seq doesn't validate with [$MATCHPATTERN], mismatch is " .
609 join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
610 return 0;
612 return 1;