Merge branch 'bugfix-exonerate' of git://github.com/JeanFred/bioperl-live into topic...
[bioperl-live.git] / Bio / SeqFeatureI.pm
bloba8e3eb5597fe58be2fc9b09ab4fdbf5f0ed3fab5
2 # BioPerl module for Bio::SeqFeatureI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
18 =head1 SYNOPSIS
20 # get a seqfeature somehow, eg, from a Sequence with Features attached
22 foreach $feat ( $seq->get_SeqFeatures() ) {
23 print "Feature from ", $feat->start, "to ",
24 $feat->end, " Primary tag ", $feat->primary_tag,
25 ", produced by ", $feat->source_tag(), "\n";
27 if( $feat->strand == 0 ) {
28 print "Feature applicable to either strand\n";
29 } else {
30 print "Feature on strand ", $feat->strand,"\n"; # -1,1
33 print "feature location is ",$feat->start, "..",
34 $feat->end, " on strand ", $feat->strand, "\n";
35 print "easy utility to print locations in GenBank/EMBL way ",
36 $feat->location->to_FTstring(), "\n";
38 foreach $tag ( $feat->get_all_tags() ) {
39 print "Feature has tag ", $tag, " with values, ",
40 join(' ',$feat->get_tag_values($tag)), "\n";
42 print "new feature\n" if $feat->has_tag('new');
43 # features can have sub features
44 my @subfeat = $feat->get_SeqFeatures();
47 =head1 DESCRIPTION
49 This interface is the functions one can expect for any Sequence
50 Feature, whatever its implementation or whether it is a more complex
51 type (eg, a Gene). This object does not actually provide any
52 implementation, it just provides the definitions of what methods one can
53 call. See Bio::SeqFeature::Generic for a good standard implementation
54 of this object
56 =head1 FEEDBACK
58 User feedback is an integral part of the evolution of this and other
59 Bioperl modules. Send your comments and suggestions preferably to one
60 of the Bioperl mailing lists. Your participation is much appreciated.
62 bioperl-l@bioperl.org - General discussion
63 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
65 =head2 Support
67 Please direct usage questions or support issues to the mailing list:
69 I<bioperl-l@bioperl.org>
71 rather than to the module maintainer directly. Many experienced and
72 reponsive experts will be able look at the problem and quickly
73 address it. Please include a thorough description of the problem
74 with code and data examples if at all possible.
76 =head2 Reporting Bugs
78 Report bugs to the Bioperl bug tracking system to help us keep track
79 the bugs and their resolution. Bug reports can be submitted via the
80 web:
82 https://redmine.open-bio.org/projects/bioperl/
84 =head1 APPENDIX
86 The rest of the documentation details each of the object
87 methods. Internal methods are usually preceded with a _
89 =cut
92 # Let the code begin...
95 package Bio::SeqFeatureI;
96 use vars qw($HasInMemory);
97 use strict;
98 BEGIN {
99 eval { require Bio::DB::InMemoryCache };
100 if( $@ ) { $HasInMemory = 0 }
101 else { $HasInMemory = 1 }
104 use Bio::Seq;
106 use Carp;
108 use base qw(Bio::RangeI);
110 =head1 Bio::SeqFeatureI specific methods
112 New method interfaces.
114 =cut
116 =head2 get_SeqFeatures
118 Title : get_SeqFeatures
119 Usage : @feats = $feat->get_SeqFeatures();
120 Function: Returns an array of sub Sequence Features
121 Returns : An array
122 Args : none
124 =cut
126 sub get_SeqFeatures{
127 my ($self,@args) = @_;
129 $self->throw_not_implemented();
132 =head2 display_name
134 Title : display_name
135 Usage : $name = $feat->display_name()
136 Function: Returns the human-readable name of the feature for displays.
137 Returns : a string
138 Args : none
140 =cut
142 sub display_name {
143 shift->throw_not_implemented();
146 =head2 primary_tag
148 Title : primary_tag
149 Usage : $tag = $feat->primary_tag()
150 Function: Returns the primary tag for a feature,
151 eg 'exon'
152 Returns : a string
153 Args : none
156 =cut
158 sub primary_tag{
159 my ($self,@args) = @_;
161 $self->throw_not_implemented();
165 =head2 source_tag
167 Title : source_tag
168 Usage : $tag = $feat->source_tag()
169 Function: Returns the source tag for a feature,
170 eg, 'genscan'
171 Returns : a string
172 Args : none
175 =cut
177 sub source_tag{
178 my ($self,@args) = @_;
180 $self->throw_not_implemented();
183 =head2 has_tag
185 Title : has_tag
186 Usage : $tag_exists = $self->has_tag('some_tag')
187 Function:
188 Returns : TRUE if the specified tag exists, and FALSE otherwise
189 Args :
191 =cut
193 sub has_tag{
194 my ($self,@args) = @_;
196 $self->throw_not_implemented();
200 =head2 get_tag_values
202 Title : get_tag_values
203 Usage : @values = $self->get_tag_values('some_tag')
204 Function:
205 Returns : An array comprising the values of the specified tag.
206 Args : a string
208 throws an exception if there is no such tag
210 =cut
212 sub get_tag_values {
213 shift->throw_not_implemented();
216 =head2 get_tagset_values
218 Title : get_tagset_values
219 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
220 Function:
221 Returns : An array comprising the values of the specified tags, in order of tags
222 Args : An array of strings
224 does NOT throw an exception if none of the tags are not present
226 this method is useful for getting a human-readable label for a
227 SeqFeatureI; not all tags can be assumed to be present, so a list of
228 possible tags in preferential order is provided
230 =cut
232 # interface + abstract method
233 sub get_tagset_values {
234 my ($self, @args) = @_;
235 my @vals = ();
236 foreach my $arg (@args) {
237 if ($self->has_tag($arg)) {
238 push(@vals, $self->get_tag_values($arg));
241 return @vals;
244 =head2 get_all_tags
246 Title : get_all_tags
247 Usage : @tags = $feat->get_all_tags()
248 Function: gives all tags for this feature
249 Returns : an array of strings
250 Args : none
253 =cut
255 sub get_all_tags{
256 shift->throw_not_implemented();
259 =head2 attach_seq
261 Title : attach_seq
262 Usage : $sf->attach_seq($seq)
263 Function: Attaches a Bio::Seq object to this feature. This
264 Bio::Seq object is for the *entire* sequence: ie
265 from 1 to 10000
267 Note that it is not guaranteed that if you obtain a feature from
268 an object in bioperl, it will have a sequence attached. Also,
269 implementors of this interface can choose to provide an empty
270 implementation of this method. I.e., there is also no guarantee
271 that if you do attach a sequence, seq() or entire_seq() will not
272 return undef.
274 The reason that this method is here on the interface is to enable
275 you to call it on every SeqFeatureI compliant object, and
276 that it will be implemented in a useful way and set to a useful
277 value for the great majority of use cases. Implementors who choose
278 to ignore the call are encouraged to specifically state this in
279 their documentation.
281 Example :
282 Returns : TRUE on success
283 Args : a Bio::PrimarySeqI compliant object
286 =cut
288 sub attach_seq {
289 shift->throw_not_implemented();
292 =head2 seq
294 Title : seq
295 Usage : $tseq = $sf->seq()
296 Function: returns the truncated sequence (if there is a sequence attached)
297 for this feature
298 Example :
299 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
300 bounded by start & end, or undef if there is no sequence attached
301 Args : none
304 =cut
306 sub seq {
307 shift->throw_not_implemented();
310 =head2 entire_seq
312 Title : entire_seq
313 Usage : $whole_seq = $sf->entire_seq()
314 Function: gives the entire sequence that this seqfeature is attached to
315 Example :
316 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
317 sequence attached
318 Args : none
321 =cut
323 sub entire_seq {
324 shift->throw_not_implemented();
328 =head2 seq_id
330 Title : seq_id
331 Usage : $obj->seq_id($newval)
332 Function: There are many cases when you make a feature that you
333 do know the sequence name, but do not know its actual
334 sequence. This is an attribute such that you can store
335 the ID (e.g., display_id) of the sequence.
337 This attribute should *not* be used in GFF dumping, as
338 that should come from the collection in which the seq
339 feature was found.
340 Returns : value of seq_id
341 Args : newvalue (optional)
344 =cut
346 sub seq_id {
347 shift->throw_not_implemented();
350 =head2 gff_string
352 Title : gff_string
353 Usage : $str = $feat->gff_string;
354 $str = $feat->gff_string($gff_formatter);
355 Function: Provides the feature information in GFF format.
357 The implementation provided here returns GFF2 by default. If you
358 want a different version, supply an object implementing a method
359 gff_string() accepting a SeqFeatureI object as argument. E.g., to
360 obtain GFF1 format, do the following:
362 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
363 $gff1str = $feat->gff_string($gff1io);
365 Returns : A string
366 Args : Optionally, an object implementing gff_string().
369 =cut
371 sub gff_string{
372 my ($self,$formatter) = @_;
374 $formatter = $self->_static_gff_formatter unless $formatter;
375 return $formatter->gff_string($self);
378 my $static_gff_formatter = undef;
380 =head2 _static_gff_formatter
382 Title : _static_gff_formatter
383 Usage :
384 Function:
385 Example :
386 Returns :
387 Args :
390 =cut
392 sub _static_gff_formatter{
393 my ($self,@args) = @_;
394 require Bio::Tools::GFF; # on the fly inclusion -- is this better?
395 if( !defined $static_gff_formatter ) {
396 $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2);
398 return $static_gff_formatter;
402 =head1 Decorating methods
404 These methods have an implementation provided by Bio::SeqFeatureI,
405 but can be validly overwritten by subclasses
407 =head2 spliced_seq
409 Title : spliced_seq
411 Usage : $seq = $feature->spliced_seq()
412 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
414 Function: Provides a sequence of the feature which is the most
415 semantically "relevant" feature for this sequence. A default
416 implementation is provided which for simple cases returns just
417 the sequence, but for split cases, loops over the split location
418 to return the sequence. In the case of split locations with
419 remote locations, eg
421 join(AB000123:5567-5589,80..1144)
423 in the case when a database object is passed in, it will attempt
424 to retrieve the sequence from the database object, and "Do the right thing",
425 however if no database object is provided, it will generate the correct
426 number of N's (DNA) or X's (protein, though this is unlikely).
428 This function is deliberately "magical" attempting to second guess
429 what a user wants as "the" sequence for this feature.
431 Implementing classes are free to override this method with their
432 own magic if they have a better idea what the user wants.
434 Args : [optional]
435 -db A L<Bio::DB::RandomAccessI> compliant object if
436 one needs to retrieve remote seqs.
437 -nosort boolean if the locations should not be sorted
438 by start location. This may occur, for instance,
439 in a circular sequence where a gene span starts
440 before the end of the sequence and ends after the
441 sequence start. Example : join(15685..16260,1..207)
442 (default = if sequence is_circular(), 1, otherwise 0)
443 -phase truncates the returned sequence based on the
444 intron phase (0,1,2).
446 Returns : A L<Bio::PrimarySeqI> object
448 =cut
450 sub spliced_seq {
451 my $self = shift;
452 my @args = @_;
453 my ($db, $nosort, $phase) =
454 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
456 # set no_sort based on the parent sequence status
457 if ($self->entire_seq->is_circular) {
458 $nosort = 1;
461 # (added 7/7/06 to allow use old API (with warnings)
462 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ? 1 : 0;
463 if (@args && $old_api) {
464 $self->warn(q(API has changed; please use '-db' or '-nosort' ).
465 qq(for args. See POD for more details.));
466 $db = shift @args if @args;
467 $nosort = shift @args if @args;
468 $phase = shift @args if @args;
471 if (defined($phase) && ($phase < 0 || $phase > 2)) {
472 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
473 $phase = 0;
476 if( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
477 $self->warn("Must pass in a valid Bio::DB::RandomAccessI object".
478 " for access to remote locations for spliced_seq");
479 $db = undef;
480 } elsif( defined $db && $HasInMemory &&
481 $db->isa('Bio::DB::InMemoryCache') ) {
482 $db = Bio::DB::InMemoryCache->new(-seqdb => $db);
485 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
486 if ($phase) {
487 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
488 my $seqstr = substr($self->seq->seq, $phase);
489 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
490 . "_spliced_feat",
491 -seq => $seqstr);
492 return $out;
493 } else {
494 return $self->seq(); # nice and easy!
498 # redundant test, but the above ISA is probably not ideal.
499 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
500 $self->throw("not atomic, not split, yikes, in trouble!");
503 my $seqstr = '';
504 my $seqid = $self->entire_seq->display_id;
505 # This is to deal with reverse strand features
506 # so we are really sorting features 5' -> 3' on their strand
507 # i.e. rev strand features will be sorted largest to smallest
508 # as this how revcom CDSes seem to be annotated in genbank.
509 # Might need to eventually allow this to be programable?
510 # (can I mention how much fun this is NOT! --jason)
512 my ($mixed,$mixedloc, $fstrand) = (0);
514 if( $self->isa('Bio::Das::SegmentI') &&
515 ! $self->absolute ) {
516 $self->warn("Calling spliced_seq with a Bio::Das::SegmentI which does have absolute set to 1 -- be warned you may not be getting things on the correct strand");
519 my @locset = $self->location->each_Location;
520 my @locs;
521 if( ! $nosort ) {
522 @locs = map { $_->[0] }
523 # sort so that most negative is first basically to order
524 # the features on the opposite strand 5'->3' on their strand
525 # rather than they way most are input which is on the fwd strand
527 sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
528 map {
529 $fstrand = $_->strand unless defined $fstrand;
530 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
531 if( defined $_->seq_id ) {
532 $mixedloc = 1 if( $_->seq_id ne $seqid );
534 [ $_, $_->start * ($_->strand || 1)];
535 } @locset;
537 if ( $mixed ) {
538 $self->warn("Mixed strand locations, spliced seq using the input order rather than trying to sort");
539 @locs = @locset;
541 } else {
542 # use the original order instead of trying to sort
543 @locs = @locset;
544 $fstrand = $locs[0]->strand;
547 foreach my $loc ( @locs ) {
548 if( ! $loc->isa("Bio::Location::Atomic") ) {
549 $self->throw("Can only deal with one level deep locations");
551 my $called_seq;
552 if( $fstrand != $loc->strand ) {
553 $self->warn("feature strand is different from location strand!");
555 # deal with remote sequences
557 if( defined $loc->seq_id &&
558 $loc->seq_id ne $seqid ) {
559 if( defined $db ) {
560 my $sid = $loc->seq_id;
561 $sid =~ s/\.\d+$//g;
562 eval {
563 $called_seq = $db->get_Seq_by_acc($sid);
565 if( $@ ) {
566 $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@");
567 $called_seq = undef;
569 } else {
570 $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
571 $called_seq = undef;
573 if( !defined $called_seq ) {
574 $seqstr .= 'N' x $self->length;
575 next;
577 } else {
578 $called_seq = $self->entire_seq;
581 # does the called sequence make sense? Bug 1780
582 if ($called_seq->length < $loc->end) {
583 my $accession = $called_seq->accession;
584 my $end = $loc->end;
585 my $length = $called_seq->length;
586 my $orig_id = $self->seq_id; # originating sequence
587 my ($locus) = $self->get_tagset_values("locus_tag");
588 $self->throw("Location end ($end) exceeds length ($length) of ".
589 "called sequence $accession.\nCheck sequence version used in ".
590 "$locus locus-tagged SeqFeature in $orig_id.");
593 if( $self->isa('Bio::Das::SegmentI') ) {
594 my ($s,$e) = ($loc->start,$loc->end);
595 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
596 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
597 # in turn returns a string. Confused?
598 $seqstr .= $called_seq->subseq($s,$e)->seq()->seq();
599 } else {
600 # This is dumb, subseq should work on locations...
601 if( $loc->strand == 1 ) {
602 $seqstr .= $called_seq->subseq($loc->start,$loc->end);
603 } else {
604 if( $nosort ) {
605 $seqstr = $called_seq->trunc($loc->start,$loc->end)->revcom->seq() . $seqstr;
606 } else {
607 $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq();
613 if (defined($phase)) {
614 $seqstr = substr($seqstr, $phase);
617 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
618 . "_spliced_feat",
619 -seq => $seqstr);
621 return $out;
624 =head2 location
626 Title : location
627 Usage : my $location = $seqfeature->location()
628 Function: returns a location object suitable for identifying location
629 of feature on sequence or parent feature
630 Returns : Bio::LocationI object
631 Args : none
634 =cut
636 sub location {
637 my ($self) = @_;
639 $self->throw_not_implemented();
643 =head2 primary_id
645 Title : primary_id
646 Usage : $obj->primary_id($newval)
647 Function:
648 Example :
649 Returns : value of primary_id (a scalar)
650 Args : on set, new value (a scalar or undef, optional)
652 Primary ID is a synonym for the tag 'ID'
654 =cut
656 sub primary_id{
657 my $self = shift;
658 # note from cjm@fruitfly.org:
659 # I have commented out the following 2 lines:
661 #return $self->{'primary_id'} = shift if @_;
662 #return $self->{'primary_id'};
664 #... and replaced it with the following; see
665 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
666 # for the discussion that lead to this change
668 if (@_) {
669 if ($self->has_tag('ID')) {
670 $self->remove_tag('ID');
672 $self->add_tag_value('ID', shift);
674 my ($id) = $self->get_tagset_values('ID');
675 return $id;
678 sub generate_unique_persistent_id {
679 # DEPRECATED - us IDHandler
680 my $self = shift;
681 require Bio::SeqFeature::Tools::IDHandler;
682 Bio::SeqFeature::Tools::IDHandler->new->generate_unique_persistent_id($self);
686 =head2 phase
688 Title : phase
689 Usage : $obj->phase($newval)
690 Function: get/set this feature's phase.
691 Example :
692 Returns : undef if no phase is set,
693 otherwise 0, 1, or 2 (the only valid values for phase)
694 Args : on set, the new value
696 Most features do not have or need a defined phase.
698 For features representing a CDS, the phase indicates where the feature
699 begins with reference to the reading frame. The phase is one of the
700 integers 0, 1, or 2, indicating the number of bases that should be
701 removed from the beginning of this feature to reach the first base of
702 the next codon. In other words, a phase of "0" indicates that the next
703 codon begins at the first base of the region described by the current
704 line, a phase of "1" indicates that the next codon begins at the
705 second base of this region, and a phase of "2" indicates that the
706 codon begins at the third base of this region. This is NOT to be
707 confused with the frame, which is simply start modulo 3.
709 For forward strand features, phase is counted from the start
710 field. For reverse strand features, phase is counted from the end
711 field.
713 =cut
715 sub phase {
716 my $self = shift;
717 if( @_ ) {
718 $self->remove_tag('phase') if $self->has_tag('phase');
719 my $newphase = shift;
720 $self->throw("illegal phase value '$newphase', phase must be either undef, 0, 1, or 2")
721 unless !defined $newphase || $newphase == 0 || $newphase == 1 || $newphase == 2;
722 $self->add_tag_value('phase', $newphase );
723 return $newphase;
726 return $self->has_tag('phase') ? ($self->get_tag_values('phase'))[0] : undef;
730 =head1 Bio::RangeI methods
732 These methods are inherited from RangeI and can be used
733 directly from a SeqFeatureI interface. Remember that a
734 SeqFeature is-a RangeI, and so wherever you see RangeI you
735 can use a feature ($r in the below documentation).
737 =cut
739 =head2 start()
741 See L<Bio::RangeI>
743 =head2 end()
745 See L<Bio::RangeI>
747 =head2 strand()
749 See L<Bio::RangeI>
751 =head2 overlaps()
753 See L<Bio::RangeI>
755 =head2 contains()
757 See L<Bio::RangeI>
759 =head2 equals()
761 See L<Bio::RangeI>
763 =head2 intersection()
765 See L<Bio::RangeI>
767 =head2 union()
769 See L<Bio::RangeI>
771 =cut