tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / LocatableSeq.pm
blobeac59cf6a8e2ccec2446ebaf75a5f82374d3c377
1 # $Id$
3 # BioPerl module for Bio::LocatableSeq
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@ebi.ac.uk>
9 # Copyright Ewan Birney
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
18 that can be projected into a MSA or have coordinates relative to
19 another seq.
21 =head1 SYNOPSIS
23 use Bio::LocatableSeq;
24 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
25 -id => "seq1",
26 -start => 1,
27 -end => 7);
29 # a normal sequence object
30 $locseq->seq();
31 $locseq->id();
33 # has start,end points
34 $locseq->start();
35 $locseq->end();
37 # inherits off RangeI, so range operations possible
39 =head1 DESCRIPTION
41 The LocatableSeq sequence object was developed mainly because the SimpleAlign
42 object requires this functionality, and in the rewrite of the Sequence object we
43 had to decide what to do with this.
45 It is, to be honest, not well integrated with the rest of bioperl. For example,
46 the trunc() function does not return a LocatableSeq object, as some might have
47 thought. Also, the sequence is not a Bio::SeqI, so the location is simply
48 inherited from Bio::RangeI and is not stored in a Bio::Location.
50 There are all sorts of nasty gotcha's about interactions between coordinate
51 systems when these sort of objects are used. Some mapping now occurs to deal
52 with HSP data, however it can probably be integrated in better and most methods
53 do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
54 trunc(), etc.) do not behave as expected and must be used with care.
56 Due to this, LocatableSeq functionality is to be refactored in a future BioPerl
57 release. However, for alignment functionality it works adequately for the time
58 being
60 =head1 FEEDBACK
62 =head2 Mailing Lists
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to one
66 of the Bioperl mailing lists. Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
71 =head2 Support
73 Please direct usage questions or support issues to the mailing list:
75 I<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
82 =head2 Reporting Bugs
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 the bugs and their resolution. Bug reports can be submitted via the
86 web:
88 http://bugzilla.open-bio.org/
90 =head1 APPENDIX
92 The rest of the documentation details each of the object
93 methods. Internal methods are usually preceded with a _
95 =cut
98 # Let the code begin...
100 package Bio::LocatableSeq;
101 use strict;
103 use Bio::Location::Simple;
104 use Bio::Location::Fuzzy;
105 use vars qw($GAP_SYMBOLS $OTHER_SYMBOLS $FRAMESHIFT_SYMBOLS $RESIDUE_SYMBOLS $MATCHPATTERN);
107 # The following global variables contain symbols used to represent gaps,
108 # frameshifts, residues, and other valid symbols. These are set at compile-time;
109 # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see
110 # LocatableSeq.t)
112 $GAP_SYMBOLS = '\-\.=~';
113 $FRAMESHIFT_SYMBOLS = '\\\/';
114 $OTHER_SYMBOLS = '\?';
115 $RESIDUE_SYMBOLS = '0-9A-Za-z\*';
116 $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS;
118 use base qw(Bio::PrimarySeq Bio::RangeI);
120 sub new {
121 my ($class, @args) = @_;
122 my $self = $class->SUPER::new(@args);
124 my ($start,$end,$strand, $mapping, $fs, $nse) =
125 $self->_rearrange( [qw(START
127 STRAND
128 MAPPING
129 FRAMESHIFTS
130 FORCE_NSE
132 @args);
134 $mapping ||= [1,1];
135 $self->mapping($mapping);
136 $nse || 0;
137 $self->force_nse($nse);
138 defined $fs && $self->frameshifts($fs);
139 defined $start && $self->start($start);
140 defined $end && $self->end($end);
141 defined $strand && $self->strand($strand);
143 return $self; # success - we hope!
146 =head2 start
148 Title : start
149 Usage : $obj->start($newval)
150 Function: Get/set the 1-based start position of this sequence in the original
151 sequence. '0' means before the original sequence starts.
152 Returns : value of start
153 Args : newvalue (optional)
155 =cut
157 sub start{
158 my $self = shift;
159 if( @_ ) {
160 my $value = shift;
161 $self->{'start'} = $value;
163 return $self->{'start'} if defined $self->{'start'};
164 return 1 if $self->seq;
165 return;
168 =head2 end
170 Title : end
171 Usage : $obj->end($newval)
172 Function: Get/set the 1-based end position of this sequence in the original
173 sequence. '0' means before the original sequence starts.
174 Returns : value of end
175 Args : newvalue (optional)
176 Note : although this is a get/set, it checks passed values against the
177 calculated end point ( derived from the sequence and based on
178 $GAP_SYMBOLS and possible frameshifts() ). If there is no match,
179 it will warn and set the proper value. Probably best used for
180 debugging proper sequence calculations.
182 =cut
184 sub end {
185 my $self = shift;
186 if( @_ ) {
187 my $value = shift;
188 my $st = $self->start;
189 # start of 0 usually means the sequence is all gaps but maps to
190 # other sequences in an alignment
191 if ($self->seq && $st != 0 ) {
192 my $len = $self->_ungapped_len;
193 my $calend = $st + $len - 1;
194 my $id = $self->id || 'unknown';
195 if ($calend != $value) {
196 $self->warn("In sequence $id residue count gives end value ".
197 "$calend. \nOverriding value [$value] with value $calend for ".
198 "Bio::LocatableSeq::end().\n".$self->seq);
199 $value = $calend;
202 $self->{'end'} = $value;
205 if (defined $self->{'end'}) {
206 return $self->{'end'}
207 } elsif ( my $len = $self->_ungapped_len) {
208 return $len + $self->start - 1;
209 } else {
210 return;
214 # changed 08.10.26 to return ungapped length, not the calculated end
215 # of the sequence
216 sub _ungapped_len {
217 my $self = shift;
218 return unless my $string = $self->seq;
219 my ($map_res, $map_coord) = $self->mapping;
220 my $offset = 0;
221 if (my %data = $self->frameshifts) {
222 map {$offset += $_} values %data;
224 $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
225 return CORE::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
228 =head2 strand
230 Title : strand
231 Usage : $obj->strand($newval)
232 Function: return or set the strandedness
233 Returns : the value of the strandedness (-1, 0 or 1)
234 Args : the value of the strandedness (-1, 0 or 1)
236 =cut
238 sub strand{
239 my $self = shift;
240 if( @_ ) {
241 my $value = shift;
242 $self->{'strand'} = $value;
244 return $self->{'strand'};
247 =head2 mapping
249 Title : mapping
250 Usage : $obj->mapping($newval)
251 Function: return or set the mapping indices (indicates # symbols/positions in
252 the source string mapping to # of coordinate positions)
253 Returns : two-element array (# symbols => # coordinate pos)
254 Args : two elements (# symbols => # coordinate pos); this can also be
255 passed in as an array reference of the two elements (as might be
256 passed upon Bio::LocatableSeq instantiation, for instance).
258 =cut
260 sub mapping {
261 my $self = shift;
262 if( @_ ) {
263 my @mapping = (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_;
264 $self->throw("Must pass two values (# residues mapped to # positions)")
265 if @mapping != 2;
266 if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
267 $self->throw("Mapping values other than 1 or 3 are not currently supported")
269 $self->{'_mapping'} = \@mapping;
271 $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
272 return @{ $self->{'_mapping'} };
275 =head2 frameshifts
277 Title : frameshifts
278 Usage : $obj->frameshifts($newval)
279 Function: get/set the frameshift hash, which contains sequence positions as
280 keys and the shift (-2, -1, 1, 2) as the value
281 Returns : hash
282 Args : hash or hash reference
284 =cut
286 sub frameshifts {
287 my $self = shift;
288 if( @_ ) {
289 if (ref $_[0] eq 'HASH') {
290 $self->{_frameshifts} = $_[0];
291 } else {
292 # assume this is a full list to be converted to a hash
293 $self->{_frameshifts} = \%{@_} # coerce into hash ref
296 (defined $self->{_frameshifts} && ref $self->{_frameshifts} eq 'HASH') ?
297 return %{$self->{_frameshifts}} : return ();
300 =head2 get_nse
302 Title : get_nse
303 Usage :
304 Function: read-only name of form id/start-end
305 Example :
306 Returns :
307 Args :
309 =cut
311 sub get_nse{
312 my ($self,$char1,$char2) = @_;
314 $char1 ||= "/";
315 $char2 ||= "-";
317 my ($id, $st, $end) = ($self->id(), $self->start(), $self->end());
319 if ($self->force_nse) {
320 $id ||= '';
321 $st ||= 0;
322 $end ||= 0;
325 $self->throw("Attribute id not set") unless defined($id);
326 $self->throw("Attribute start not set") unless defined($st);
327 $self->throw("Attribute end not set") unless defined($end);
329 #Stockholm Rfam includes version if present so it is optional
330 my $v = $self->version ? '.'.$self->version : '';
331 return $id . $v. $char1 . $st . $char2 . $end ;
334 =head2 force_nse
336 Title : force_nse
337 Usage : $ls->force_nse()
338 Function: Boolean which forces get_nse() to build an NSE, regardless
339 of whether id(), start(), or end() is set
340 Returns : Boolean value
341 Args : (optional) Boolean (1 or 0)
342 Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
343 respectively
345 =cut
347 sub force_nse {
348 my ($self, $flag) = @_;
349 if (defined $flag) {
350 $flag ? (return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
352 return $self->{'_force_nse'};
355 =head2 num_gaps
357 Title : num_gaps
358 Usage :$self->num_gaps('.')
359 Function:Gets number of gaps in the sequence. The count excludes
360 leading or trailing gap characters.
362 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
363 these, '.' and '-' are counted as gap characters unless an
364 optional argument specifies one of them.
366 Returns : number of internal gaps in the sequence.
367 Args : a gap character (optional)
368 Status : Stable
369 Note : replaces no_gaps
371 =cut
373 sub num_gaps {
374 my ($self,$char) = @_;
375 my ($seq, $count) = (undef, 0);
377 # default gap characters
378 $char ||= $GAP_SYMBOLS;
380 $self->warn("I hope you know what you are doing setting gap to [$char]")
381 unless $char =~ /[$GAP_SYMBOLS]/;
383 $seq = $self->seq;
384 return 0 unless $seq; # empty sequence does not have gaps
386 $seq =~ s/^([$char]+)//;
387 $seq =~ s/([$char]+)$//;
388 while ( $seq =~ /[$char]+/g ) {
389 $count++;
392 return $count;
396 =head2 column_from_residue_number
398 Title : column_from_residue_number
399 Usage : $col = $seq->column_from_residue_number($resnumber)
400 Function:
402 This function gives the position in the alignment
403 (i.e. column number) of the given residue number in the
404 sequence. For example, for the sequence
406 Seq1/91-97 AC..DEF.GH
408 column_from_residue_number(94) returns 6.
410 An exception is thrown if the residue number would lie
411 outside the length of the aligment
412 (e.g. column_from_residue_number( "Seq2", 22 )
414 Returns : A column number for the position of the
415 given residue in the given sequence (1 = first column)
416 Args : A residue number in the whole sequence (not just that
417 segment of it in the alignment)
419 =cut
421 sub column_from_residue_number {
422 my ($self, $resnumber) = @_;
424 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
425 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
427 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
428 my @chunks;
429 my $column_incr;
430 my $current_column;
431 my $current_residue = $self->start - 1;
432 my $seq = $self->seq;
433 my $strand = $self->strand || 0;
435 if ($strand == -1) {
436 # @chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
437 @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
438 $column_incr = -1;
439 $current_column = (CORE::length $seq) + 1;
441 else {
442 # @chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
443 @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
444 $column_incr = 1;
445 $current_column = 0;
448 while (my $chunk = shift @chunks) {
449 # if ($chunk =~ m|^[\.\-]|o) {
450 if ($chunk =~ m|^[$GAP_SYMBOLS]|o) {
451 $current_column += $column_incr * CORE::length($chunk);
453 else {
454 if ($current_residue + CORE::length($chunk) < $resnumber) {
455 $current_column += $column_incr * CORE::length($chunk);
456 $current_residue += CORE::length($chunk);
458 else {
459 if ($strand == -1) {
460 $current_column -= $resnumber - $current_residue;
462 else {
463 $current_column += $resnumber - $current_residue;
465 return $current_column;
471 $self->throw("Could not find residue number $resnumber");
475 =head2 location_from_column
477 Title : location_from_column
478 Usage : $loc = $ali->location_from_column($column_number)
479 Function:
481 This function gives the residue number for a given position
482 in the alignment (i.e. column number) of the given. Gaps
483 complicate this process and force the output to be a
484 L<Bio::Location::Simple> where values can be undefined.
485 For example, for the sequence:
487 Seq/91-96 .AC..DEF.G.
489 location_from_column( 3 ) position 92
490 location_from_column( 4 ) position 92^93
491 location_from_column( 9 ) position 95^96
492 location_from_column( 1 ) position undef
494 An exact position returns a Bio::Location::Simple object
495 where where location_type() returns 'EXACT', if a position
496 is between bases location_type() returns 'IN-BETWEEN'.
497 Column before the first residue returns undef. Note that if
498 the position is after the last residue in the alignment,
499 that there is no guarantee that the original sequence has
500 residues after that position.
502 An exception is thrown if the column number is not within
503 the sequence.
505 Returns : Bio::Location::Simple or undef
506 Args : A column number
507 Throws : If column is not within the sequence
509 See L<Bio::Location::Simple> for more.
511 =cut
513 sub location_from_column {
514 my ($self, $column) = @_;
516 $self->throw("Column number has to be a positive integer, not [$column]")
517 unless $column =~ /^\d+$/ and $column > 0;
518 $self->throw("Column number [$column] is larger than".
519 " sequence length [". $self->length. "]")
520 unless $column <= $self->length;
522 my ($loc);
523 my $s = $self->subseq(1,$column);
524 $s =~ s/[^a-zA-Z\*]//g;
526 my $pos = CORE::length $s;
528 my $start = $self->start || 0 ;
529 my $strand = $self->strand() || 1;
530 my $relative_pos = ($strand == -1)
531 ? ($self->end - $pos + 1)
532 : ($pos + $start - 1);
533 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
534 $loc = Bio::Location::Simple->new
535 (-start => $relative_pos,
536 -end => $relative_pos,
537 -strand => 1,
539 } elsif ($pos == 0 and $self->start == 1) {
540 } else {
541 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
542 if ($strand == -1) {
543 ($start,$end) = ($end,$start);
545 $loc = Bio::Location::Simple->new
546 (-start => $start,
547 -end => $end,
548 -strand => 1,
549 -location_type => 'IN-BETWEEN'
552 return $loc;
555 =head2 revcom
557 Title : revcom
558 Usage : $rev = $seq->revcom()
559 Function: Produces a new Bio::LocatableSeq object which
560 has the reversed complement of the sequence. For protein
561 sequences this throws an exception of "Sequence is a
562 protein. Cannot revcom"
564 Returns : A new Bio::LocatableSeq object
565 Args : none
567 =cut
569 sub revcom {
570 my ($self) = @_;
571 # since we don't know whether sequences without 1 => 1 correlation can be
572 # revcom'd, kick back
573 if (grep {$_ != 1} $self->mapping) {
574 $self->warn('revcom() not supported for sequences with mapped values of > 1');
575 return;
577 my $new = $self->SUPER::revcom;
578 $new->strand($self->strand * -1) if $self->strand;
579 $new->start($self->start) if $self->start;
580 $new->end($self->end) if $self->end;
581 return $new;
584 =head2 trunc
586 Title : trunc
587 Usage : $subseq = $myseq->trunc(10,100);
588 Function: Provides a truncation of a sequence,
590 Example :
591 Returns : a fresh Bio::PrimarySeqI implementing object
592 Args : Two integers denoting first and last columns of the
593 sequence to be included into sub-sequence.
596 =cut
598 sub trunc {
599 my ($self, $start, $end) = @_;
600 my $new = $self->SUPER::trunc($start, $end);
601 $new->strand($self->strand);
603 # end will be automatically calculated
604 $start = $end if $self->strand == -1;
606 $start = $self->location_from_column($start);
607 $start ? ($start = $start->end) : ($start = 1);
608 $new->start($start) if $start;
610 return $new;
613 =head2 validate_seq
615 Title : validate_seq
616 Usage : if(! $seq->validate_seq($seq_str) ) {
617 print "sequence $seq_str is not valid for an object of
618 alphabet ",$seq->alphabet, "\n";
620 Function: Validates a given sequence string. A validating sequence string
621 must be accepted by seq(). A string that does not validate will
622 lead to an exception if passed to seq().
624 The implementation provided here does not take alphabet() into
625 account. Allowed are all letters (A-Z), numbers [0-9]
626 and common symbols used for gaps, stop codons, unknown residues,
627 and frameshifts, including '-','.','*','?','=',and '~'.
629 Example :
630 Returns : 1 if the supplied sequence string is valid for the object, and
631 0 otherwise.
632 Args : The sequence string to be validated.
634 =cut
636 sub validate_seq {
637 my ($self,$seqstr) = @_;
638 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
639 return 0 unless( defined $seqstr);
641 if((CORE::length($seqstr) > 0) &&
642 ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
643 $self->warn("seq doesn't validate with [$MATCHPATTERN], mismatch is " .
644 join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
645 return 0;
647 return 1;
650 ################## DEPRECATED METHODS ##################
652 =head2 no_gap
654 Title : no_gaps
655 Usage : $self->no_gaps('.')
656 Function : Gets number of gaps in the sequence. The count excludes
657 leading or trailing gap characters.
659 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
660 these, '.' and '-' are counted as gap characters unless an
661 optional argument specifies one of them.
663 Returns : number of internal gaps in the sequence.
664 Args : a gap character (optional)
665 Status : Deprecated (in favor of num_gaps())
667 =cut
669 sub no_gaps {
670 my $self = shift;
671 $self->deprecated(-warn_version => 1.0069,
672 -throw_version => 1.0075,
673 -message => 'Use of method no_gaps() is deprecated, use num_gaps() instead');
674 $self->num_gaps(@_);
677 =head2 no_sequences
679 Title : no_sequences
680 Usage : $gaps = $seq->no_sequences
681 Function : number of sequence in the sequence alignment
682 Returns : integer
683 Argument :
684 Status : Deprecated (in favor of num_sequences())
686 =cut
688 sub no_sequences {
689 my $self = shift;
690 $self->deprecated(-warn_version => 1.0069,
691 -throw_version => 1.0075,
692 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
693 $self->num_sequences(@_);