2 # BioPerl module for Bio::LocatableSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
17 that can be projected into a MSA or have coordinates relative to
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
28 # a normal sequence object
32 # has start,end points
36 # inherits off RangeI, so range operations possible
40 The LocatableSeq sequence object was developed mainly because the SimpleAlign
41 object requires this functionality, and in the rewrite of the Sequence object we
42 had to decide what to do with this.
44 It is, to be honest, not well integrated with the rest of bioperl. For example,
45 the trunc() function does not return a LocatableSeq object, as some might have
46 thought. Also, the sequence is not a Bio::SeqI, so the location is simply
47 inherited from Bio::RangeI and is not stored in a Bio::Location.
49 There are all sorts of nasty gotcha's about interactions between coordinate
50 systems when these sort of objects are used. Some mapping now occurs to deal
51 with HSP data, however it can probably be integrated in better and most methods
52 do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
53 trunc(), etc.) do not behave as expected and must be used with care. Due to this,
54 LocatableSeq functionality is to be refactored in a future BioPerl release.
55 However, for alignment functionality it works adequately for the time being.
57 If you do not need alignment functionality, L<Bio::SeqfeatureI>-implementing
58 modules may be a suitable alternative to L<Bio::LocatableSeq>. For example,
59 L<Bio::SeqFeature::Generic> and L<Bio::SeqFeature::Lite> provide methods to
60 attach a sequence to a specific region of a parent sequence and to set other
67 User feedback is an integral part of the evolution of this and other
68 Bioperl modules. Send your comments and suggestions preferably to one
69 of the Bioperl mailing lists. Your participation is much appreciated.
71 bioperl-l@bioperl.org - General discussion
72 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76 Please direct usage questions or support issues to the mailing list:
78 I<bioperl-l@bioperl.org>
80 rather than to the module maintainer directly. Many experienced and
81 reponsive experts will be able look at the problem and quickly
82 address it. Please include a thorough description of the problem
83 with code and data examples if at all possible.
87 Report bugs to the Bioperl bug tracking system to help us keep track
88 the bugs and their resolution. Bug reports can be submitted via the
91 https://github.com/bioperl/bioperl-live/issues
95 The rest of the documentation details each of the object
96 methods. Internal methods are usually preceded with a _
102 package Bio
::LocatableSeq
;
105 use Bio
::Location
::Simple
;
106 use Bio
::Location
::Fuzzy
;
107 use vars
qw($GAP_SYMBOLS $OTHER_SYMBOLS $FRAMESHIFT_SYMBOLS $RESIDUE_SYMBOLS $MATCHPATTERN);
109 # The following global variables contain symbols used to represent gaps,
110 # frameshifts, residues, and other valid symbols. These are set at compile-time;
111 # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see
114 $GAP_SYMBOLS = '\-\.=~';
115 $FRAMESHIFT_SYMBOLS = '\\\/';
116 $OTHER_SYMBOLS = '\?';
117 $RESIDUE_SYMBOLS = '0-9A-Za-z\*';
118 $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS;
120 use base qw(Bio::PrimarySeq Bio::RangeI);
124 my ($class, @args) = @_;
125 my $self = $class->SUPER::new
(@args);
127 my ($start,$end,$strand, $mapping, $fs, $nse) =
128 $self->_rearrange( [qw(START
138 $self->mapping($mapping);
140 $self->force_nse($nse);
141 defined $fs && $self->frameshifts($fs);
142 defined $start && $self->start($start);
143 defined $end && $self->end($end);
144 defined $strand && $self->strand($strand);
146 return $self; # success - we hope!
153 Usage : $obj->start($newval)
154 Function: Get/set the 1-based start position of this sequence in the original
155 sequence. '0' means before the original sequence starts.
156 Returns : value of start
157 Args : newvalue (optional)
165 $self->{'start'} = $value;
167 return $self->{'start'} if defined $self->{'start'};
168 return 1 if $self->seq;
176 Usage : $obj->end($newval)
177 Function: Get/set the 1-based end position of this sequence in the original
178 sequence. '0' means before the original sequence starts.
179 Returns : value of end
180 Args : newvalue (optional)
181 Note : although this is a get/set, it checks passed values against the
182 calculated end point ( derived from the sequence and based on
183 $GAP_SYMBOLS and possible frameshifts() ). If there is no match,
184 it will warn and set the proper value. Probably best used for
185 debugging proper sequence calculations.
193 my $st = $self->start;
194 # start of 0 usually means the sequence is all gaps but maps to
195 # other sequences in an alignment
196 if ($self->seq && $st != 0 ) {
197 my $len = $self->_ungapped_len;
198 my $calend = $st + $len - 1;
199 my $id = $self->id || 'unknown';
200 if ($calend != $value) {
201 $self->warn("In sequence $id residue count gives end value ".
202 "$calend. \nOverriding value [$value] with value $calend for ".
203 "Bio::LocatableSeq::end().\n".$self->seq);
207 $self->{'end'} = $value;
210 if (defined $self->{'end'}) {
211 return $self->{'end'}
212 } elsif ( my $len = $self->_ungapped_len) {
213 return $len + $self->start - 1;
220 # changed 08.10.26 to return ungapped length, not the calculated end
224 return unless my $string = $self->seq;
225 my ($map_res, $map_coord) = $self->mapping;
227 if (my %data = $self->frameshifts) {
228 map {$offset += $_} values %data;
230 $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
231 return CORE
::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
236 # return unless my $string = $self->seq;
237 # $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
238 # return CORE::length($string);
245 Usage : $obj->strand($newval)
246 Function: return or set the strandedness
247 Returns : the value of the strandedness (-1, 0 or 1)
248 Args : the value of the strandedness (-1, 0 or 1)
256 $self->{'strand'} = $value;
258 return $self->{'strand'};
265 Usage : $obj->mapping($newval)
266 Function: return or set the mapping indices (indicates # symbols/positions in
267 the source string mapping to # of coordinate positions)
268 Returns : two-element array (# symbols => # coordinate pos)
269 Args : two elements (# symbols => # coordinate pos); this can also be
270 passed in as an array reference of the two elements (as might be
271 passed upon Bio::LocatableSeq instantiation, for instance).
278 my @mapping = (ref($_[0]) eq 'ARRAY') ? @
{$_[0]} : @_;
279 $self->throw("Must pass two values (# residues mapped to # positions)")
281 if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
282 $self->throw("Mapping values other than 1 or 3 are not currently supported")
284 $self->{'_mapping'} = \
@mapping;
286 $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
287 return @
{ $self->{'_mapping'} };
294 Usage : $obj->frameshifts($newval)
295 Function: get/set the frameshift hash, which contains sequence positions as
296 keys and the shift (-2, -1, 1, 2) as the value
298 Args : hash or hash reference
305 if (ref $_[0] eq 'HASH') {
306 $self->{_frameshifts
} = $_[0];
308 # assume this is a full list to be converted to a hash
309 $self->{_frameshifts
} = \
%{@_} # coerce into hash ref
312 (defined $self->{_frameshifts
} && ref $self->{_frameshifts
} eq 'HASH') ?
313 return %{$self->{_frameshifts
}} : return ();
321 Function: read-only name of form id/start-end
329 my ($self,$char1,$char2) = @_;
334 my ($id, $st, $end, $strand) = ($self->id(), $self->start(),
335 $self->end(), $self->strand || 0);
337 if ($self->force_nse) {
343 $self->throw("Attribute id not set") unless defined($id);
344 $self->throw("Attribute start not set") unless defined($st);
345 $self->throw("Attribute end not set") unless defined($end);
347 if ($strand && $strand == -1) {
348 ($st, $end) = ($end, $st);
351 #Stockholm Rfam includes version if present so it is optional
352 my $v = $self->version ?
'.'.$self->version : '';
353 return join('',$id, $v, $char1, $st, $char2, $end);
360 Usage : $ls->force_nse()
361 Function: Boolean which forces get_nse() to build an NSE, regardless
362 of whether id(), start(), or end() is set
363 Returns : Boolean value
364 Args : (optional) Boolean (1 or 0)
365 Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
371 my ($self, $flag) = @_;
373 $flag ?
(return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
375 return $self->{'_force_nse'};
382 Usage :$self->num_gaps('.')
383 Function:Gets number of gaps in the sequence. The count excludes
384 leading or trailing gap characters.
386 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
387 these, '.' and '-' are counted as gap characters unless an
388 optional argument specifies one of them.
390 Returns : number of internal gaps in the sequence.
391 Args : a gap character (optional)
393 Note : replaces no_gaps
398 my ($self,$char) = @_;
399 my ($seq, $count) = (undef, 0);
401 # default gap characters
402 $char ||= $GAP_SYMBOLS;
404 $self->warn("I hope you know what you are doing setting gap to [$char]")
405 unless $char =~ /[$GAP_SYMBOLS]/;
408 return 0 unless $seq; # empty sequence does not have gaps
410 $seq =~ s/^([$char]+)//;
411 $seq =~ s/([$char]+)$//;
412 while ( $seq =~ /[$char]+/g ) {
420 =head2 column_from_residue_number
422 Title : column_from_residue_number
423 Usage : $col = $seq->column_from_residue_number($resnumber)
426 This function gives the position in the alignment
427 (i.e. column number) of the given residue number in the
428 sequence. For example, for the sequence
430 Seq1/91-97 AC..DEF.GH
432 column_from_residue_number(94) returns 6.
434 An exception is thrown if the residue number would lie
435 outside the length of the aligment
436 (e.g. column_from_residue_number( "Seq2", 22 )
438 Returns : A column number for the position of the
439 given residue in the given sequence (1 = first column)
440 Args : A residue number in the whole sequence (not just that
441 segment of it in the alignment)
445 sub column_from_residue_number
{
446 my ($self, $resnumber) = @_;
448 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
449 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
451 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
455 my $current_residue = $self->start - 1;
456 my $seq = $self->seq;
457 my $strand = $self->strand || 0;
460 #@chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
461 @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
463 $current_column = (CORE
::length $seq) + 1;
466 #@chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
467 @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
472 while (my $chunk = shift @chunks) {
473 #if ($chunk =~ m|^[\.\-]|o) {
474 if ($chunk =~ m
|^[$GAP_SYMBOLS]|o
) {
475 $current_column += $column_incr * CORE
::length($chunk);
478 if ($current_residue + CORE
::length($chunk) < $resnumber) {
479 $current_column += $column_incr * CORE
::length($chunk);
480 $current_residue += CORE
::length($chunk);
484 $current_column -= $resnumber - $current_residue;
487 $current_column += $resnumber - $current_residue;
489 return $current_column;
495 $self->throw("Could not find residue number $resnumber");
500 =head2 location_from_column
502 Title : location_from_column
503 Usage : $loc = $ali->location_from_column($column_number)
506 This function gives the residue number for a given position
507 in the alignment (i.e. column number) of the given. Gaps
508 complicate this process and force the output to be a
509 L<Bio::Location::Simple> where values can be undefined.
510 For example, for the sequence:
512 Seq/91-96 .AC..DEF.G.
514 location_from_column( 3 ) position 92
515 location_from_column( 4 ) position 92^93
516 location_from_column( 9 ) position 95^96
517 location_from_column( 1 ) position undef
519 An exact position returns a Bio::Location::Simple object
520 where where location_type() returns 'EXACT', if a position
521 is between bases location_type() returns 'IN-BETWEEN'.
522 Column before the first residue returns undef. Note that if
523 the position is after the last residue in the alignment,
524 that there is no guarantee that the original sequence has
525 residues after that position.
527 An exception is thrown if the column number is not within
530 Returns : Bio::Location::Simple or undef
531 Args : A column number
532 Throws : If column is not within the sequence
534 See L<Bio::Location::Simple> for more.
538 sub location_from_column
{
539 my ($self, $column) = @_;
541 $self->throw("Column number has to be a positive integer, not [$column]")
542 unless $column =~ /^\d+$/ and $column > 0;
543 $self->throw("Column number [$column] is larger than".
544 " sequence length [". $self->length. "]")
545 unless $column <= $self->length;
548 my $s = $self->subseq(1,$column);
549 $s =~ s/[^a-zA-Z\*]//g;
551 my $pos = CORE
::length $s;
553 my $start = $self->start || 0 ;
554 my $strand = $self->strand() || 1;
555 my $relative_pos = ($strand == -1)
556 ?
($self->end - $pos + 1)
557 : ($pos + $start - 1);
558 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
559 $loc = Bio
::Location
::Simple
->new
560 (-start
=> $relative_pos,
561 -end
=> $relative_pos,
564 } elsif ($pos == 0 and $self->start == 1) {
566 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
568 ($start,$end) = ($end,$start);
570 $loc = Bio
::Location
::Simple
->new
574 -location_type
=> 'IN-BETWEEN'
584 Usage : $rev = $seq->revcom()
585 Function: Produces a new Bio::LocatableSeq object which
586 has the reversed complement of the sequence. For protein
587 sequences this throws an exception of "Sequence is a
588 protein. Cannot revcom"
590 Returns : A new Bio::LocatableSeq object
597 # since we don't know whether sequences without 1 => 1 correlation can be
598 # revcom'd, kick back
599 if (grep {$_ != 1} $self->mapping) {
600 $self->warn('revcom() not supported for sequences with mapped values of > 1');
603 my $new = $self->SUPER::revcom
;
604 $new->strand($self->strand * -1) if $self->strand;
605 $new->start($self->start) if $self->start;
606 $new->end($self->end) if $self->end;
614 Usage : $subseq = $myseq->trunc(10,100);
615 Function: Provides a truncation of a sequence,
616 Returns : a fresh Bio::PrimarySeqI implementing object
617 Args : Two integers denoting first and last columns of the
618 sequence to be included into sub-sequence.
623 my ($self, $start, $end) = @_;
624 my $new = $self->SUPER::trunc
($start, $end);
625 $new->strand($self->strand);
627 # end will be automatically calculated
628 $start = $end if $self->strand && $self->strand == -1;
630 $start = $self->location_from_column($start);
631 $start ?
($start = $start->end) : ($start = 1);
632 $new->start($start) if $start;
641 Usage : if(! $seqobj->validate_seq($seq_str) ) {
642 print "sequence $seq_str is not valid for an object of
643 alphabet ",$seqobj->alphabet, "\n";
645 Function: Test that the given sequence is valid, i.e. contains only valid
646 characters. The allowed characters are all letters (A-Z) and '-','.',
647 '*','?','=' and '~'. Spaces are not valid. Note that this
648 implementation does not take alphabet() into account.
649 Returns : 1 if the supplied sequence string is valid, 0 otherwise.
650 Args : - Sequence string to be validated
651 - Boolean to throw an error if the sequence is invalid
656 my ($self, $seqstr, $throw) = @_;
657 $seqstr = '' if not defined $seqstr;
658 $throw = 0 if not defined $throw ; # 0 for backward compatibility
659 if ( (CORE
::length $seqstr > 0 ) &&
660 ($seqstr !~ /^([$MATCHPATTERN]+)$/) ) {
662 $self->throw("Failed validation of sequence '".(defined($self->id) ||
663 '[unidentified sequence]')."'. Invalid characters were: " .
664 join('',($seqstr =~ /([^$MATCHPATTERN]+)/g)));
672 ################## DEPRECATED METHODS ##################
678 Usage : $self->no_gaps('.')
679 Function : Gets number of gaps in the sequence. The count excludes
680 leading or trailing gap characters.
682 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
683 these, '.' and '-' are counted as gap characters unless an
684 optional argument specifies one of them.
686 Returns : number of internal gaps in the sequence.
687 Args : a gap character (optional)
688 Status : Deprecated (in favor of num_gaps())
694 $self->deprecated( -warn_version
=> 1.0069,
695 -throw_version
=> 1.0075,
696 -message
=> 'Use of method no_gaps() is deprecated, use num_gaps() instead' );
697 return $self->num_gaps(@_);
704 Usage : $gaps = $seq->no_sequences
705 Function : number of sequence in the sequence alignment
708 Status : Deprecated (in favor of num_sequences())
714 $self->deprecated( -warn_version
=> 1.0069,
715 -throw_version
=> 1.0075,
716 -message
=> 'Use of method no_sequences() is deprecated, use num_sequences() instead' );
717 return $self->num_sequences(@_);