bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / LocatableSeq.pm
blobe7efe5c9c31ffba69b409287b9203d18caea157f
1 # $Id$
3 # BioPerl module for Bio::LocatableSeq
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::LocatableSeq - A Sequence object with start/end points on it
16 that can be projected into a MSA or have coordinates relative to
17 another seq.
19 =head1 SYNOPSIS
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
24 -id => "seq1",
25 -start => 1,
26 -end => 7);
29 =head1 DESCRIPTION
31 # a normal sequence object
32 $locseq->seq();
33 $locseq->id();
35 # has start,end points
36 $locseq->start();
37 $locseq->end();
39 # inherits off RangeI, so range operations possible
41 =head1 FEEDBACK
44 =head2 Mailing Lists
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 The locatable sequence object was developed mainly because the
54 SimpleAlign object requires this functionality, and in the rewrite
55 of the Sequence object we had to decide what to do with this.
57 It is, to be honest, not well integrated with the rest of bioperl, for
58 example, the trunc() function does not return a LocatableSeq object,
59 as some might have thought. There are all sorts of nasty gotcha's
60 about interactions between coordinate systems when these sort of
61 objects are used.
64 =head2 Reporting Bugs
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 the bugs and their resolution. Bug reports can be submitted via the
68 web:
70 http://bugzilla.open-bio.org/
73 =head1 APPENDIX
75 The rest of the documentation details each of the object
76 methods. Internal methods are usually preceded with a _
78 =cut
81 # Let the code begin...
83 package Bio::LocatableSeq;
84 use strict;
86 use Bio::Location::Simple;
87 use Bio::Location::Fuzzy;
88 use vars qw($MATCHPATTERN);
90 $MATCHPATTERN = '0-9A-Za-z\-\.\*\?=~';
93 use base qw(Bio::PrimarySeq Bio::RangeI);
95 sub new {
96 my ($class, @args) = @_;
97 my $self = $class->SUPER::new(@args);
99 my ($start,$end,$strand) =
100 $self->_rearrange( [qw(START END STRAND)],
101 @args);
103 defined $start && $self->start($start);
104 defined $end && $self->end($end);
105 defined $strand && $self->strand($strand);
107 return $self; # success - we hope!
110 =head2 start
112 Title : start
113 Usage : $obj->start($newval)
114 Function: Get/set the 1-based start position of this sequence in the original
115 sequence. '0' means before the original sequence starts.
116 Returns : value of start
117 Args : newvalue (optional)
119 =cut
121 sub start{
122 my $self = shift;
123 if( @_ ) {
124 my $value = shift;
125 $self->{'start'} = $value;
127 return $self->{'start'} if defined $self->{'start'};
128 return 1 if $self->seq;
129 return;
132 =head2 end
134 Title : end
135 Usage : $obj->end($newval)
136 Function: Get/set the 1-based end position of this sequence in the original
137 sequence. '0' means before the original sequence starts.
138 Returns : value of end
139 Args : newvalue (optional)
141 =cut
143 sub end {
144 my $self = shift;
145 if( @_ ) {
146 my $value = shift;
147 my $string = $self->seq;
148 if ($self->seq) {
149 my $len = $self->_ungapped_len;
150 my $id = $self->id;
151 $self->warn("In sequence $id residue count gives end value $len.
152 Overriding value [$value] with value $len for Bio::LocatableSeq::end().")
153 and $value = $len if $len != $value and $self->verbose > 0;
156 $self->{'end'} = $value;
159 return defined $self->{'end'} ? $self->{'end'} : $self->_ungapped_len;
162 sub _ungapped_len {
163 my $self = shift;
164 my $string = $self->seq || '';
165 $string =~ s/[\.\-]+//g;
166 $self->seq ? (return $self->start + CORE::length($string) - 1 ) : undef;
169 =head2 strand
171 Title : strand
172 Usage : $obj->strand($newval)
173 Function:
174 Returns : value of strand
175 Args : newvalue (optional)
177 =cut
179 sub strand{
180 my $self = shift;
181 if( @_ ) {
182 my $value = shift;
183 $self->{'strand'} = $value;
185 return $self->{'strand'};
188 =head2 get_nse
190 Title : get_nse
191 Usage :
192 Function: read-only name of form id/start-end
193 Example :
194 Returns :
195 Args :
197 =cut
199 sub get_nse{
200 my ($self,$char1,$char2) = @_;
202 $char1 ||= "/";
203 $char2 ||= "-";
205 $self->throw("Attribute id not set") unless defined($self->id());
206 $self->throw("Attribute start not set") unless defined($self->start());
207 $self->throw("Attribute end not set") unless defined($self->end());
209 return $self->id() . $char1 . $self->start . $char2 . $self->end ;
214 =head2 no_gap
216 Title : no_gaps
217 Usage :$self->no_gaps('.')
218 Function:
220 Gets number of gaps in the sequence. The count excludes
221 leading or trailing gap characters.
223 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
224 these, '.' and '-' are counted as gap characters unless an
225 optional argument specifies one of them.
227 Returns : number of internal gaps in the sequnce.
228 Args : a gap character (optional)
230 =cut
232 sub no_gaps {
233 my ($self,$char) = @_;
234 my ($seq, $count) = (undef, 0);
236 # default gap characters
237 $char ||= '-.';
239 $self->warn("I hope you know what you are doing setting gap to [$char]")
240 unless $char =~ /[-.]/;
242 $seq = $self->seq;
243 return 0 unless $seq; # empty sequence does not have gaps
245 $seq =~ s/^([$char]+)//;
246 $seq =~ s/([$char]+)$//;
247 $count++ while $seq =~ /[$char]+/g;
249 return $count;
254 =head2 column_from_residue_number
256 Title : column_from_residue_number
257 Usage : $col = $seq->column_from_residue_number($resnumber)
258 Function:
260 This function gives the position in the alignment
261 (i.e. column number) of the given residue number in the
262 sequence. For example, for the sequence
264 Seq1/91-97 AC..DEF.GH
266 column_from_residue_number(94) returns 5.
268 An exception is thrown if the residue number would lie
269 outside the length of the aligment
270 (e.g. column_from_residue_number( "Seq2", 22 )
272 Returns : A column number for the position of the
273 given residue in the given sequence (1 = first column)
274 Args : A residue number in the whole sequence (not just that
275 segment of it in the alignment)
277 =cut
279 sub column_from_residue_number {
280 my ($self, $resnumber) = @_;
282 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
283 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
285 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
286 my @residues = split //, $self->seq;
287 my $count = $self->start();
288 my $i;
289 my ($start,$end,$inc,$test);
290 my $strand = $self->strand || 0;
291 # the following bit of "magic" allows the main loop logic to be the
292 # same regardless of the strand of the sequence
293 ($start,$end,$inc,$test)= ($strand == -1)?
294 (scalar(@residues-1),0,-1,sub{$i >= $end}) :
295 (0,scalar(@residues-1),1,sub{$i <= $end});
297 for ($i=$start; $test->(); $i+= $inc) {
298 if ($residues[$i] ne '.' and $residues[$i] ne '-') {
299 $count == $resnumber and last;
300 $count++;
303 # $i now holds the index of the column.
304 # The actual column number is this index + 1
306 return $i+1;
309 $self->throw("Could not find residue number $resnumber");
314 =head2 location_from_column
316 Title : location_from_column
317 Usage : $loc = $ali->location_from_column($column_number)
318 Function:
320 This function gives the residue number for a given position
321 in the alignment (i.e. column number) of the given. Gaps
322 complicate this process and force the output to be a
323 L<Bio::Range> where values can be undefined. For example,
324 for the sequence:
326 Seq/91-97 .AC..DEF.G.
328 location_from_column( 3 ) position 93
329 location_from_column( 2 ) position 92^93
330 location_from_column(10 ) position 97^98
331 location_from_column( 1 ) position undef
333 An exact position returns a Bio::Location::Simple object
334 where where location_type() returns 'EXACT', if a position
335 is between bases location_type() returns 'IN-BETWEEN'.
336 Column before the first residue returns undef. Note that if
337 the position is after the last residue in the alignment,
338 that there is no guarantee that the original sequence has
339 residues after that position.
341 An exception is thrown if the column number is not within
342 the sequence.
344 Returns : Bio::Location::Simple or undef
345 Args : A column number
346 Throws : If column is not within the sequence
348 See L<Bio::Location::Simple> for more.
350 =cut
352 sub location_from_column {
353 my ($self, $column) = @_;
355 $self->throw("Column number has to be a positive integer, not [$column]")
356 unless $column =~ /^\d+$/ and $column > 0;
357 $self->throw("Column number [$column] is larger than".
358 " sequence length [". $self->length. "]")
359 unless $column <= $self->length;
361 my ($loc);
362 my $s = $self->subseq(1,$column);
363 $s =~ s/[^a-zA-Z\*]//g;
365 my $pos = CORE::length $s;
367 my $start = $self->start || 0 ;
368 my $strand = $self->strand() || 1;
369 my $relative_pos = ($strand == -1)
370 ? ($self->end - $pos + 1)
371 : ($pos + $start - 1);
372 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
373 $loc = Bio::Location::Simple->new
374 (-start => $relative_pos,
375 -end => $relative_pos,
376 -strand => 1,
378 } elsif ($pos == 0 and $self->start == 1) {
379 } else {
380 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
381 if ($strand == -1) {
382 ($start,$end) = ($end,$start);
384 $loc = Bio::Location::Simple->new
385 (-start => $start,
386 -end => $end,
387 -strand => 1,
388 -location_type => 'IN-BETWEEN'
391 return $loc;
394 =head2 revcom
396 Title : revcom
397 Usage : $rev = $seq->revcom()
398 Function: Produces a new Bio::LocatableSeq object which
399 has the reversed complement of the sequence. For protein
400 sequences this throws an exception of "Sequence is a
401 protein. Cannot revcom"
403 Returns : A new Bio::LocatableSeq object
404 Args : none
406 =cut
408 sub revcom {
409 my ($self) = @_;
411 my $new = $self->SUPER::revcom;
412 $new->strand($self->strand * -1) if $self->strand;
413 $new->start($self->start) if $self->start;
414 $new->end($self->end) if $self->end;
415 return $new;
419 =head2 trunc
421 Title : trunc
422 Usage : $subseq = $myseq->trunc(10,100);
423 Function: Provides a truncation of a sequence,
425 Example :
426 Returns : a fresh Bio::PrimarySeqI implementing object
427 Args : Two integers denoting first and last columns of the
428 sequence to be included into sub-sequence.
431 =cut
433 sub trunc {
434 my ($self, $start, $end) = @_;
435 my $new = $self->SUPER::trunc($start, $end);
436 $new->strand($self->strand);
438 # end will be automatically calculated
439 $start = $end if $self->strand == -1;
441 $start = $self->location_from_column($start);
442 $start ? ($start = $start->end) : ($start = 1);
443 $new->start($start) if $start;
445 return $new;
448 =head2 validate_seq
450 Title : validate_seq
451 Usage : if(! $seq->validate_seq($seq_str) ) {
452 print "sequence $seq_str is not valid for an object of
453 alphabet ",$seq->alphabet, "\n";
455 Function: Validates a given sequence string. A validating sequence string
456 must be accepted by seq(). A string that does not validate will
457 lead to an exception if passed to seq().
459 The implementation provided here does not take alphabet() into
460 account. Allowed are all letters (A-Z), numbers [0-9]
461 and '-','.','*','?','=',and '~'.
463 Example :
464 Returns : 1 if the supplied sequence string is valid for the object, and
465 0 otherwise.
466 Args : The sequence string to be validated.
469 =cut
471 sub validate_seq {
472 my ($self,$seqstr) = @_;
473 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
474 return 0 unless( defined $seqstr);
476 if((CORE::length($seqstr) > 0) &&
477 ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
478 $self->warn("seq doesn't validate, mismatch is " .
479 join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
480 return 0;
482 return 1;