perltidy; fixes per Peter Rice, re: duplicated accessions in the index and unmunged...
[bioperl-live.git] / Bio / LocatableSeq.pm
blob6a06a0f587864ffacb95cd869536e0b780c0b09c
2 # BioPerl module for Bio::LocatableSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
17 that can be projected into a MSA or have coordinates relative to
18 another seq.
20 =head1 SYNOPSIS
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
24 -id => "seq1",
25 -start => 1,
26 -end => 7);
28 # a normal sequence object
29 $locseq->seq();
30 $locseq->id();
32 # has start,end points
33 $locseq->start();
34 $locseq->end();
36 # inherits off RangeI, so range operations possible
38 =head1 DESCRIPTION
40 The LocatableSeq sequence object was developed mainly because the SimpleAlign
41 object requires this functionality, and in the rewrite of the Sequence object we
42 had to decide what to do with this.
44 It is, to be honest, not well integrated with the rest of bioperl. For example,
45 the trunc() function does not return a LocatableSeq object, as some might have
46 thought. Also, the sequence is not a Bio::SeqI, so the location is simply
47 inherited from Bio::RangeI and is not stored in a Bio::Location.
49 There are all sorts of nasty gotcha's about interactions between coordinate
50 systems when these sort of objects are used. Some mapping now occurs to deal
51 with HSP data, however it can probably be integrated in better and most methods
52 do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
53 trunc(), etc.) do not behave as expected and must be used with care.
55 Due to this, LocatableSeq functionality is to be refactored in a future BioPerl
56 release. However, for alignment functionality it works adequately for the time
57 being
59 =head1 FEEDBACK
61 =head2 Mailing Lists
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to one
65 of the Bioperl mailing lists. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
70 =head2 Support
72 Please direct usage questions or support issues to the mailing list:
74 I<bioperl-l@bioperl.org>
76 rather than to the module maintainer directly. Many experienced and
77 reponsive experts will be able look at the problem and quickly
78 address it. Please include a thorough description of the problem
79 with code and data examples if at all possible.
81 =head2 Reporting Bugs
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 the bugs and their resolution. Bug reports can be submitted via the
85 web:
87 https://redmine.open-bio.org/projects/bioperl/
89 =head1 APPENDIX
91 The rest of the documentation details each of the object
92 methods. Internal methods are usually preceded with a _
94 =cut
97 # Let the code begin...
99 package Bio::LocatableSeq;
100 use strict;
102 use Bio::Location::Simple;
103 use Bio::Location::Fuzzy;
104 use vars qw($GAP_SYMBOLS $OTHER_SYMBOLS $FRAMESHIFT_SYMBOLS $RESIDUE_SYMBOLS $MATCHPATTERN);
106 # The following global variables contain symbols used to represent gaps,
107 # frameshifts, residues, and other valid symbols. These are set at compile-time;
108 # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see
109 # LocatableSeq.t)
111 $GAP_SYMBOLS = '\-\.=~';
112 $FRAMESHIFT_SYMBOLS = '\\\/';
113 $OTHER_SYMBOLS = '\?';
114 $RESIDUE_SYMBOLS = '0-9A-Za-z\*';
115 $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS;
117 use base qw(Bio::PrimarySeq Bio::RangeI);
119 sub new {
120 my ($class, @args) = @_;
121 my $self = $class->SUPER::new(@args);
123 my ($start,$end,$strand, $mapping, $fs, $nse) =
124 $self->_rearrange( [qw(START
126 STRAND
127 MAPPING
128 FRAMESHIFTS
129 FORCE_NSE
131 @args);
133 $mapping ||= [1,1];
134 $self->mapping($mapping);
135 $nse || 0;
136 $self->force_nse($nse);
137 defined $fs && $self->frameshifts($fs);
138 defined $start && $self->start($start);
139 defined $end && $self->end($end);
140 defined $strand && $self->strand($strand);
142 return $self; # success - we hope!
145 =head2 start
147 Title : start
148 Usage : $obj->start($newval)
149 Function: Get/set the 1-based start position of this sequence in the original
150 sequence. '0' means before the original sequence starts.
151 Returns : value of start
152 Args : newvalue (optional)
154 =cut
156 sub start{
157 my $self = shift;
158 if( @_ ) {
159 my $value = shift;
160 $self->{'start'} = $value;
162 return $self->{'start'} if defined $self->{'start'};
163 return 1 if $self->seq;
164 return;
167 =head2 end
169 Title : end
170 Usage : $obj->end($newval)
171 Function: Get/set the 1-based end position of this sequence in the original
172 sequence. '0' means before the original sequence starts.
173 Returns : value of end
174 Args : newvalue (optional)
175 Note : although this is a get/set, it checks passed values against the
176 calculated end point ( derived from the sequence and based on
177 $GAP_SYMBOLS and possible frameshifts() ). If there is no match,
178 it will warn and set the proper value. Probably best used for
179 debugging proper sequence calculations.
181 =cut
183 sub end {
184 my $self = shift;
185 if( @_ ) {
186 my $value = shift;
187 my $st = $self->start;
188 # start of 0 usually means the sequence is all gaps but maps to
189 # other sequences in an alignment
190 if ($self->seq && $st != 0 ) {
191 my $len = $self->_ungapped_len;
192 my $calend = $st + $len - 1;
193 my $id = $self->id || 'unknown';
194 if ($calend != $value) {
195 $self->warn("In sequence $id residue count gives end value ".
196 "$calend. \nOverriding value [$value] with value $calend for ".
197 "Bio::LocatableSeq::end().\n".$self->seq);
198 $value = $calend;
201 $self->{'end'} = $value;
204 if (defined $self->{'end'}) {
205 return $self->{'end'}
206 } elsif ( my $len = $self->_ungapped_len) {
207 return $len + $self->start - 1;
208 } else {
209 return;
213 # changed 08.10.26 to return ungapped length, not the calculated end
214 # of the sequence
215 sub _ungapped_len {
216 my $self = shift;
217 return unless my $string = $self->seq;
218 my ($map_res, $map_coord) = $self->mapping;
219 my $offset = 0;
220 if (my %data = $self->frameshifts) {
221 map {$offset += $_} values %data;
223 $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
224 return CORE::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
227 #sub length {
228 # my $self = shift;
229 # return unless my $string = $self->seq;
230 # $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
231 # return CORE::length($string);
234 =head2 strand
236 Title : strand
237 Usage : $obj->strand($newval)
238 Function: return or set the strandedness
239 Returns : the value of the strandedness (-1, 0 or 1)
240 Args : the value of the strandedness (-1, 0 or 1)
242 =cut
244 sub strand{
245 my $self = shift;
246 if( @_ ) {
247 my $value = shift;
248 $self->{'strand'} = $value;
250 return $self->{'strand'};
253 =head2 mapping
255 Title : mapping
256 Usage : $obj->mapping($newval)
257 Function: return or set the mapping indices (indicates # symbols/positions in
258 the source string mapping to # of coordinate positions)
259 Returns : two-element array (# symbols => # coordinate pos)
260 Args : two elements (# symbols => # coordinate pos); this can also be
261 passed in as an array reference of the two elements (as might be
262 passed upon Bio::LocatableSeq instantiation, for instance).
264 =cut
266 sub mapping {
267 my $self = shift;
268 if( @_ ) {
269 my @mapping = (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_;
270 $self->throw("Must pass two values (# residues mapped to # positions)")
271 if @mapping != 2;
272 if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
273 $self->throw("Mapping values other than 1 or 3 are not currently supported")
275 $self->{'_mapping'} = \@mapping;
277 $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
278 return @{ $self->{'_mapping'} };
281 =head2 frameshifts
283 Title : frameshifts
284 Usage : $obj->frameshifts($newval)
285 Function: get/set the frameshift hash, which contains sequence positions as
286 keys and the shift (-2, -1, 1, 2) as the value
287 Returns : hash
288 Args : hash or hash reference
290 =cut
292 sub frameshifts {
293 my $self = shift;
294 if( @_ ) {
295 if (ref $_[0] eq 'HASH') {
296 $self->{_frameshifts} = $_[0];
297 } else {
298 # assume this is a full list to be converted to a hash
299 $self->{_frameshifts} = \%{@_} # coerce into hash ref
302 (defined $self->{_frameshifts} && ref $self->{_frameshifts} eq 'HASH') ?
303 return %{$self->{_frameshifts}} : return ();
306 =head2 get_nse
308 Title : get_nse
309 Usage :
310 Function: read-only name of form id/start-end
311 Example :
312 Returns :
313 Args :
315 =cut
317 sub get_nse{
318 my ($self,$char1,$char2) = @_;
320 $char1 ||= "/";
321 $char2 ||= "-";
323 my ($id, $st, $end, $strand) = ($self->id(), $self->start(),
324 $self->end(), $self->strand || 0);
326 if ($self->force_nse) {
327 $id ||= '';
328 $st ||= 0;
329 $end ||= 0;
332 $self->throw("Attribute id not set") unless defined($id);
333 $self->throw("Attribute start not set") unless defined($st);
334 $self->throw("Attribute end not set") unless defined($end);
336 if ($strand && $strand == -1) {
337 ($st, $end) = ($end, $st);
340 #Stockholm Rfam includes version if present so it is optional
341 my $v = $self->version ? '.'.$self->version : '';
342 return join('',$id, $v, $char1, $st, $char2, $end);
345 =head2 force_nse
347 Title : force_nse
348 Usage : $ls->force_nse()
349 Function: Boolean which forces get_nse() to build an NSE, regardless
350 of whether id(), start(), or end() is set
351 Returns : Boolean value
352 Args : (optional) Boolean (1 or 0)
353 Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
354 respectively
356 =cut
358 sub force_nse {
359 my ($self, $flag) = @_;
360 if (defined $flag) {
361 $flag ? (return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
363 return $self->{'_force_nse'};
366 =head2 num_gaps
368 Title : num_gaps
369 Usage :$self->num_gaps('.')
370 Function:Gets number of gaps in the sequence. The count excludes
371 leading or trailing gap characters.
373 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
374 these, '.' and '-' are counted as gap characters unless an
375 optional argument specifies one of them.
377 Returns : number of internal gaps in the sequence.
378 Args : a gap character (optional)
379 Status : Stable
380 Note : replaces no_gaps
382 =cut
384 sub num_gaps {
385 my ($self,$char) = @_;
386 my ($seq, $count) = (undef, 0);
388 # default gap characters
389 $char ||= $GAP_SYMBOLS;
391 $self->warn("I hope you know what you are doing setting gap to [$char]")
392 unless $char =~ /[$GAP_SYMBOLS]/;
394 $seq = $self->seq;
395 return 0 unless $seq; # empty sequence does not have gaps
397 $seq =~ s/^([$char]+)//;
398 $seq =~ s/([$char]+)$//;
399 while ( $seq =~ /[$char]+/g ) {
400 $count++;
403 return $count;
407 =head2 column_from_residue_number
409 Title : column_from_residue_number
410 Usage : $col = $seq->column_from_residue_number($resnumber)
411 Function:
413 This function gives the position in the alignment
414 (i.e. column number) of the given residue number in the
415 sequence. For example, for the sequence
417 Seq1/91-97 AC..DEF.GH
419 column_from_residue_number(94) returns 6.
421 An exception is thrown if the residue number would lie
422 outside the length of the aligment
423 (e.g. column_from_residue_number( "Seq2", 22 )
425 Returns : A column number for the position of the
426 given residue in the given sequence (1 = first column)
427 Args : A residue number in the whole sequence (not just that
428 segment of it in the alignment)
430 =cut
432 sub column_from_residue_number {
433 my ($self, $resnumber) = @_;
435 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
436 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
438 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
439 my @chunks;
440 my $column_incr;
441 my $current_column;
442 my $current_residue = $self->start - 1;
443 my $seq = $self->seq;
444 my $strand = $self->strand || 0;
446 if ($strand == -1) {
447 # @chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
448 @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
449 $column_incr = -1;
450 $current_column = (CORE::length $seq) + 1;
452 else {
453 # @chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
454 @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
455 $column_incr = 1;
456 $current_column = 0;
459 while (my $chunk = shift @chunks) {
460 # if ($chunk =~ m|^[\.\-]|o) {
461 if ($chunk =~ m|^[$GAP_SYMBOLS]|o) {
462 $current_column += $column_incr * CORE::length($chunk);
464 else {
465 if ($current_residue + CORE::length($chunk) < $resnumber) {
466 $current_column += $column_incr * CORE::length($chunk);
467 $current_residue += CORE::length($chunk);
469 else {
470 if ($strand == -1) {
471 $current_column -= $resnumber - $current_residue;
473 else {
474 $current_column += $resnumber - $current_residue;
476 return $current_column;
482 $self->throw("Could not find residue number $resnumber");
486 =head2 location_from_column
488 Title : location_from_column
489 Usage : $loc = $ali->location_from_column($column_number)
490 Function:
492 This function gives the residue number for a given position
493 in the alignment (i.e. column number) of the given. Gaps
494 complicate this process and force the output to be a
495 L<Bio::Location::Simple> where values can be undefined.
496 For example, for the sequence:
498 Seq/91-96 .AC..DEF.G.
500 location_from_column( 3 ) position 92
501 location_from_column( 4 ) position 92^93
502 location_from_column( 9 ) position 95^96
503 location_from_column( 1 ) position undef
505 An exact position returns a Bio::Location::Simple object
506 where where location_type() returns 'EXACT', if a position
507 is between bases location_type() returns 'IN-BETWEEN'.
508 Column before the first residue returns undef. Note that if
509 the position is after the last residue in the alignment,
510 that there is no guarantee that the original sequence has
511 residues after that position.
513 An exception is thrown if the column number is not within
514 the sequence.
516 Returns : Bio::Location::Simple or undef
517 Args : A column number
518 Throws : If column is not within the sequence
520 See L<Bio::Location::Simple> for more.
522 =cut
524 sub location_from_column {
525 my ($self, $column) = @_;
527 $self->throw("Column number has to be a positive integer, not [$column]")
528 unless $column =~ /^\d+$/ and $column > 0;
529 $self->throw("Column number [$column] is larger than".
530 " sequence length [". $self->length. "]")
531 unless $column <= $self->length;
533 my ($loc);
534 my $s = $self->subseq(1,$column);
535 $s =~ s/[^a-zA-Z\*]//g;
537 my $pos = CORE::length $s;
539 my $start = $self->start || 0 ;
540 my $strand = $self->strand() || 1;
541 my $relative_pos = ($strand == -1)
542 ? ($self->end - $pos + 1)
543 : ($pos + $start - 1);
544 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
545 $loc = Bio::Location::Simple->new
546 (-start => $relative_pos,
547 -end => $relative_pos,
548 -strand => 1,
550 } elsif ($pos == 0 and $self->start == 1) {
551 } else {
552 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
553 if ($strand == -1) {
554 ($start,$end) = ($end,$start);
556 $loc = Bio::Location::Simple->new
557 (-start => $start,
558 -end => $end,
559 -strand => 1,
560 -location_type => 'IN-BETWEEN'
563 return $loc;
566 =head2 revcom
568 Title : revcom
569 Usage : $rev = $seq->revcom()
570 Function: Produces a new Bio::LocatableSeq object which
571 has the reversed complement of the sequence. For protein
572 sequences this throws an exception of "Sequence is a
573 protein. Cannot revcom"
575 Returns : A new Bio::LocatableSeq object
576 Args : none
578 =cut
580 sub revcom {
581 my ($self) = @_;
582 # since we don't know whether sequences without 1 => 1 correlation can be
583 # revcom'd, kick back
584 if (grep {$_ != 1} $self->mapping) {
585 $self->warn('revcom() not supported for sequences with mapped values of > 1');
586 return;
588 my $new = $self->SUPER::revcom;
589 $new->strand($self->strand * -1) if $self->strand;
590 $new->start($self->start) if $self->start;
591 $new->end($self->end) if $self->end;
592 return $new;
595 =head2 trunc
597 Title : trunc
598 Usage : $subseq = $myseq->trunc(10,100);
599 Function: Provides a truncation of a sequence,
601 Example :
602 Returns : a fresh Bio::PrimarySeqI implementing object
603 Args : Two integers denoting first and last columns of the
604 sequence to be included into sub-sequence.
607 =cut
609 sub trunc {
610 my ($self, $start, $end) = @_;
611 my $new = $self->SUPER::trunc($start, $end);
612 $new->strand($self->strand);
614 # end will be automatically calculated
615 $start = $end if $self->strand && $self->strand == -1;
617 $start = $self->location_from_column($start);
618 $start ? ($start = $start->end) : ($start = 1);
619 $new->start($start) if $start;
621 return $new;
624 =head2 validate_seq
626 Title : validate_seq
627 Usage : if(! $seq->validate_seq($seq_str) ) {
628 print "sequence $seq_str is not valid for an object of
629 alphabet ",$seq->alphabet, "\n";
631 Function: Validates a given sequence string. A validating sequence string
632 must be accepted by seq(). A string that does not validate will
633 lead to an exception if passed to seq().
635 The implementation provided here does not take alphabet() into
636 account. Allowed are all letters (A-Z), numbers [0-9]
637 and common symbols used for gaps, stop codons, unknown residues,
638 and frameshifts, including '-','.','*','?','=',and '~'.
640 Example :
641 Returns : 1 if the supplied sequence string is valid for the object, and
642 0 otherwise.
643 Args : The sequence string to be validated.
645 =cut
647 sub validate_seq {
648 my ($self,$seqstr) = @_;
649 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
650 return 0 unless( defined $seqstr);
652 if((CORE::length($seqstr) > 0) &&
653 ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
654 $self->warn("seq doesn't validate with [$MATCHPATTERN], mismatch is " .
655 join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
656 return 0;
658 return 1;
661 ################## DEPRECATED METHODS ##################
663 =head2 no_gap
665 Title : no_gaps
666 Usage : $self->no_gaps('.')
667 Function : Gets number of gaps in the sequence. The count excludes
668 leading or trailing gap characters.
670 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
671 these, '.' and '-' are counted as gap characters unless an
672 optional argument specifies one of them.
674 Returns : number of internal gaps in the sequence.
675 Args : a gap character (optional)
676 Status : Deprecated (in favor of num_gaps())
678 =cut
680 sub no_gaps {
681 my $self = shift;
682 $self->deprecated(-warn_version => 1.0069,
683 -throw_version => 1.0075,
684 -message => 'Use of method no_gaps() is deprecated, use num_gaps() instead');
685 $self->num_gaps(@_);
688 =head2 no_sequences
690 Title : no_sequences
691 Usage : $gaps = $seq->no_sequences
692 Function : number of sequence in the sequence alignment
693 Returns : integer
694 Argument :
695 Status : Deprecated (in favor of num_sequences())
697 =cut
699 sub no_sequences {
700 my $self = shift;
701 $self->deprecated(-warn_version => 1.0069,
702 -throw_version => 1.0075,
703 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
704 $self->num_sequences(@_);