2 # BioPerl module for Bio::LocatableSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
17 that can be projected into a MSA or have coordinates relative to
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
28 # a normal sequence object
32 # has start,end points
36 # inherits off RangeI, so range operations possible
40 The LocatableSeq sequence object was developed mainly because the SimpleAlign
41 object requires this functionality, and in the rewrite of the Sequence object we
42 had to decide what to do with this.
44 It is, to be honest, not well integrated with the rest of bioperl. For example,
45 the trunc() function does not return a LocatableSeq object, as some might have
46 thought. Also, the sequence is not a Bio::SeqI, so the location is simply
47 inherited from Bio::RangeI and is not stored in a Bio::Location.
49 There are all sorts of nasty gotcha's about interactions between coordinate
50 systems when these sort of objects are used. Some mapping now occurs to deal
51 with HSP data, however it can probably be integrated in better and most methods
52 do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
53 trunc(), etc.) do not behave as expected and must be used with care.
55 Due to this, LocatableSeq functionality is to be refactored in a future BioPerl
56 release. However, for alignment functionality it works adequately for the time
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to one
65 of the Bioperl mailing lists. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 Please direct usage questions or support issues to the mailing list:
74 I<bioperl-l@bioperl.org>
76 rather than to the module maintainer directly. Many experienced and
77 reponsive experts will be able look at the problem and quickly
78 address it. Please include a thorough description of the problem
79 with code and data examples if at all possible.
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 the bugs and their resolution. Bug reports can be submitted via the
87 https://redmine.open-bio.org/projects/bioperl/
91 The rest of the documentation details each of the object
92 methods. Internal methods are usually preceded with a _
97 # Let the code begin...
99 package Bio
::LocatableSeq
;
102 use Bio
::Location
::Simple
;
103 use Bio
::Location
::Fuzzy
;
104 use vars
qw($GAP_SYMBOLS $OTHER_SYMBOLS $FRAMESHIFT_SYMBOLS $RESIDUE_SYMBOLS $MATCHPATTERN);
106 # The following global variables contain symbols used to represent gaps,
107 # frameshifts, residues, and other valid symbols. These are set at compile-time;
108 # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see
111 $GAP_SYMBOLS = '\-\.=~';
112 $FRAMESHIFT_SYMBOLS = '\\\/';
113 $OTHER_SYMBOLS = '\?';
114 $RESIDUE_SYMBOLS = '0-9A-Za-z\*';
115 $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS;
117 use base qw(Bio::PrimarySeq Bio::RangeI);
120 my ($class, @args) = @_;
121 my $self = $class->SUPER::new
(@args);
123 my ($start,$end,$strand, $mapping, $fs, $nse) =
124 $self->_rearrange( [qw(START
134 $self->mapping($mapping);
136 $self->force_nse($nse);
137 defined $fs && $self->frameshifts($fs);
138 defined $start && $self->start($start);
139 defined $end && $self->end($end);
140 defined $strand && $self->strand($strand);
142 return $self; # success - we hope!
148 Usage : $obj->start($newval)
149 Function: Get/set the 1-based start position of this sequence in the original
150 sequence. '0' means before the original sequence starts.
151 Returns : value of start
152 Args : newvalue (optional)
160 $self->{'start'} = $value;
162 return $self->{'start'} if defined $self->{'start'};
163 return 1 if $self->seq;
170 Usage : $obj->end($newval)
171 Function: Get/set the 1-based end position of this sequence in the original
172 sequence. '0' means before the original sequence starts.
173 Returns : value of end
174 Args : newvalue (optional)
175 Note : although this is a get/set, it checks passed values against the
176 calculated end point ( derived from the sequence and based on
177 $GAP_SYMBOLS and possible frameshifts() ). If there is no match,
178 it will warn and set the proper value. Probably best used for
179 debugging proper sequence calculations.
187 my $st = $self->start;
188 # start of 0 usually means the sequence is all gaps but maps to
189 # other sequences in an alignment
190 if ($self->seq && $st != 0 ) {
191 my $len = $self->_ungapped_len;
192 my $calend = $st + $len - 1;
193 my $id = $self->id || 'unknown';
194 if ($calend != $value) {
195 $self->warn("In sequence $id residue count gives end value ".
196 "$calend. \nOverriding value [$value] with value $calend for ".
197 "Bio::LocatableSeq::end().\n".$self->seq);
201 $self->{'end'} = $value;
204 if (defined $self->{'end'}) {
205 return $self->{'end'}
206 } elsif ( my $len = $self->_ungapped_len) {
207 return $len + $self->start - 1;
213 # changed 08.10.26 to return ungapped length, not the calculated end
217 return unless my $string = $self->seq;
218 my ($map_res, $map_coord) = $self->mapping;
220 if (my %data = $self->frameshifts) {
221 map {$offset += $_} values %data;
223 $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
224 return CORE
::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
229 # return unless my $string = $self->seq;
230 # $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
231 # return CORE::length($string);
237 Usage : $obj->strand($newval)
238 Function: return or set the strandedness
239 Returns : the value of the strandedness (-1, 0 or 1)
240 Args : the value of the strandedness (-1, 0 or 1)
248 $self->{'strand'} = $value;
250 return $self->{'strand'};
256 Usage : $obj->mapping($newval)
257 Function: return or set the mapping indices (indicates # symbols/positions in
258 the source string mapping to # of coordinate positions)
259 Returns : two-element array (# symbols => # coordinate pos)
260 Args : two elements (# symbols => # coordinate pos); this can also be
261 passed in as an array reference of the two elements (as might be
262 passed upon Bio::LocatableSeq instantiation, for instance).
269 my @mapping = (ref($_[0]) eq 'ARRAY') ? @
{$_[0]} : @_;
270 $self->throw("Must pass two values (# residues mapped to # positions)")
272 if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
273 $self->throw("Mapping values other than 1 or 3 are not currently supported")
275 $self->{'_mapping'} = \
@mapping;
277 $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
278 return @
{ $self->{'_mapping'} };
284 Usage : $obj->frameshifts($newval)
285 Function: get/set the frameshift hash, which contains sequence positions as
286 keys and the shift (-2, -1, 1, 2) as the value
288 Args : hash or hash reference
295 if (ref $_[0] eq 'HASH') {
296 $self->{_frameshifts
} = $_[0];
298 # assume this is a full list to be converted to a hash
299 $self->{_frameshifts
} = \
%{@_} # coerce into hash ref
302 (defined $self->{_frameshifts
} && ref $self->{_frameshifts
} eq 'HASH') ?
303 return %{$self->{_frameshifts
}} : return ();
310 Function: read-only name of form id/start-end
318 my ($self,$char1,$char2) = @_;
323 my ($id, $st, $end, $strand) = ($self->id(), $self->start(),
324 $self->end(), $self->strand || 0);
326 if ($self->force_nse) {
332 $self->throw("Attribute id not set") unless defined($id);
333 $self->throw("Attribute start not set") unless defined($st);
334 $self->throw("Attribute end not set") unless defined($end);
336 if ($strand && $strand == -1) {
337 ($st, $end) = ($end, $st);
340 #Stockholm Rfam includes version if present so it is optional
341 my $v = $self->version ?
'.'.$self->version : '';
342 return join('',$id, $v, $char1, $st, $char2, $end);
348 Usage : $ls->force_nse()
349 Function: Boolean which forces get_nse() to build an NSE, regardless
350 of whether id(), start(), or end() is set
351 Returns : Boolean value
352 Args : (optional) Boolean (1 or 0)
353 Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
359 my ($self, $flag) = @_;
361 $flag ?
(return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
363 return $self->{'_force_nse'};
369 Usage :$self->num_gaps('.')
370 Function:Gets number of gaps in the sequence. The count excludes
371 leading or trailing gap characters.
373 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
374 these, '.' and '-' are counted as gap characters unless an
375 optional argument specifies one of them.
377 Returns : number of internal gaps in the sequence.
378 Args : a gap character (optional)
380 Note : replaces no_gaps
385 my ($self,$char) = @_;
386 my ($seq, $count) = (undef, 0);
388 # default gap characters
389 $char ||= $GAP_SYMBOLS;
391 $self->warn("I hope you know what you are doing setting gap to [$char]")
392 unless $char =~ /[$GAP_SYMBOLS]/;
395 return 0 unless $seq; # empty sequence does not have gaps
397 $seq =~ s/^([$char]+)//;
398 $seq =~ s/([$char]+)$//;
399 while ( $seq =~ /[$char]+/g ) {
407 =head2 column_from_residue_number
409 Title : column_from_residue_number
410 Usage : $col = $seq->column_from_residue_number($resnumber)
413 This function gives the position in the alignment
414 (i.e. column number) of the given residue number in the
415 sequence. For example, for the sequence
417 Seq1/91-97 AC..DEF.GH
419 column_from_residue_number(94) returns 6.
421 An exception is thrown if the residue number would lie
422 outside the length of the aligment
423 (e.g. column_from_residue_number( "Seq2", 22 )
425 Returns : A column number for the position of the
426 given residue in the given sequence (1 = first column)
427 Args : A residue number in the whole sequence (not just that
428 segment of it in the alignment)
432 sub column_from_residue_number
{
433 my ($self, $resnumber) = @_;
435 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
436 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
438 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
442 my $current_residue = $self->start - 1;
443 my $seq = $self->seq;
444 my $strand = $self->strand || 0;
447 # @chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
448 @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
450 $current_column = (CORE
::length $seq) + 1;
453 # @chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
454 @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
459 while (my $chunk = shift @chunks) {
460 # if ($chunk =~ m|^[\.\-]|o) {
461 if ($chunk =~ m
|^[$GAP_SYMBOLS]|o
) {
462 $current_column += $column_incr * CORE
::length($chunk);
465 if ($current_residue + CORE
::length($chunk) < $resnumber) {
466 $current_column += $column_incr * CORE
::length($chunk);
467 $current_residue += CORE
::length($chunk);
471 $current_column -= $resnumber - $current_residue;
474 $current_column += $resnumber - $current_residue;
476 return $current_column;
482 $self->throw("Could not find residue number $resnumber");
486 =head2 location_from_column
488 Title : location_from_column
489 Usage : $loc = $ali->location_from_column($column_number)
492 This function gives the residue number for a given position
493 in the alignment (i.e. column number) of the given. Gaps
494 complicate this process and force the output to be a
495 L<Bio::Location::Simple> where values can be undefined.
496 For example, for the sequence:
498 Seq/91-96 .AC..DEF.G.
500 location_from_column( 3 ) position 92
501 location_from_column( 4 ) position 92^93
502 location_from_column( 9 ) position 95^96
503 location_from_column( 1 ) position undef
505 An exact position returns a Bio::Location::Simple object
506 where where location_type() returns 'EXACT', if a position
507 is between bases location_type() returns 'IN-BETWEEN'.
508 Column before the first residue returns undef. Note that if
509 the position is after the last residue in the alignment,
510 that there is no guarantee that the original sequence has
511 residues after that position.
513 An exception is thrown if the column number is not within
516 Returns : Bio::Location::Simple or undef
517 Args : A column number
518 Throws : If column is not within the sequence
520 See L<Bio::Location::Simple> for more.
524 sub location_from_column
{
525 my ($self, $column) = @_;
527 $self->throw("Column number has to be a positive integer, not [$column]")
528 unless $column =~ /^\d+$/ and $column > 0;
529 $self->throw("Column number [$column] is larger than".
530 " sequence length [". $self->length. "]")
531 unless $column <= $self->length;
534 my $s = $self->subseq(1,$column);
535 $s =~ s/[^a-zA-Z\*]//g;
537 my $pos = CORE
::length $s;
539 my $start = $self->start || 0 ;
540 my $strand = $self->strand() || 1;
541 my $relative_pos = ($strand == -1)
542 ?
($self->end - $pos + 1)
543 : ($pos + $start - 1);
544 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
545 $loc = Bio
::Location
::Simple
->new
546 (-start
=> $relative_pos,
547 -end
=> $relative_pos,
550 } elsif ($pos == 0 and $self->start == 1) {
552 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
554 ($start,$end) = ($end,$start);
556 $loc = Bio
::Location
::Simple
->new
560 -location_type
=> 'IN-BETWEEN'
569 Usage : $rev = $seq->revcom()
570 Function: Produces a new Bio::LocatableSeq object which
571 has the reversed complement of the sequence. For protein
572 sequences this throws an exception of "Sequence is a
573 protein. Cannot revcom"
575 Returns : A new Bio::LocatableSeq object
582 # since we don't know whether sequences without 1 => 1 correlation can be
583 # revcom'd, kick back
584 if (grep {$_ != 1} $self->mapping) {
585 $self->warn('revcom() not supported for sequences with mapped values of > 1');
588 my $new = $self->SUPER::revcom
;
589 $new->strand($self->strand * -1) if $self->strand;
590 $new->start($self->start) if $self->start;
591 $new->end($self->end) if $self->end;
598 Usage : $subseq = $myseq->trunc(10,100);
599 Function: Provides a truncation of a sequence,
602 Returns : a fresh Bio::PrimarySeqI implementing object
603 Args : Two integers denoting first and last columns of the
604 sequence to be included into sub-sequence.
610 my ($self, $start, $end) = @_;
611 my $new = $self->SUPER::trunc
($start, $end);
612 $new->strand($self->strand);
614 # end will be automatically calculated
615 $start = $end if $self->strand && $self->strand == -1;
617 $start = $self->location_from_column($start);
618 $start ?
($start = $start->end) : ($start = 1);
619 $new->start($start) if $start;
627 Usage : if(! $seq->validate_seq($seq_str) ) {
628 print "sequence $seq_str is not valid for an object of
629 alphabet ",$seq->alphabet, "\n";
631 Function: Validates a given sequence string. A validating sequence string
632 must be accepted by seq(). A string that does not validate will
633 lead to an exception if passed to seq().
635 The implementation provided here does not take alphabet() into
636 account. Allowed are all letters (A-Z), numbers [0-9]
637 and common symbols used for gaps, stop codons, unknown residues,
638 and frameshifts, including '-','.','*','?','=',and '~'.
641 Returns : 1 if the supplied sequence string is valid for the object, and
643 Args : The sequence string to be validated.
648 my ($self,$seqstr) = @_;
649 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
650 return 0 unless( defined $seqstr);
652 if((CORE
::length($seqstr) > 0) &&
653 ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
654 $self->warn("seq doesn't validate with [$MATCHPATTERN], mismatch is " .
655 join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
661 ################## DEPRECATED METHODS ##################
666 Usage : $self->no_gaps('.')
667 Function : Gets number of gaps in the sequence. The count excludes
668 leading or trailing gap characters.
670 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
671 these, '.' and '-' are counted as gap characters unless an
672 optional argument specifies one of them.
674 Returns : number of internal gaps in the sequence.
675 Args : a gap character (optional)
676 Status : Deprecated (in favor of num_gaps())
682 $self->deprecated(-warn_version
=> 1.0069,
683 -throw_version
=> 1.0075,
684 -message
=> 'Use of method no_gaps() is deprecated, use num_gaps() instead');
691 Usage : $gaps = $seq->no_sequences
692 Function : number of sequence in the sequence alignment
695 Status : Deprecated (in favor of num_sequences())
701 $self->deprecated(-warn_version
=> 1.0069,
702 -throw_version
=> 1.0075,
703 -message
=> 'Use of method no_sequences() is deprecated, use num_sequences() instead');
704 $self->num_sequences(@_);