changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / SeqFeatureI.pm
blob86a35a2255423e7561e6801ac59be4b8d10367fc
2 # BioPerl module for Bio::SeqFeatureI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
18 =head1 SYNOPSIS
20 # get a seqfeature somehow, eg, from a Sequence with Features attached
22 foreach $feat ( $seq->get_SeqFeatures() ) {
23 print "Feature from ", $feat->start, "to ",
24 $feat->end, " Primary tag ", $feat->primary_tag,
25 ", produced by ", $feat->source_tag(), "\n";
27 if( $feat->strand == 0 ) {
28 print "Feature applicable to either strand\n";
29 } else {
30 print "Feature on strand ", $feat->strand,"\n"; # -1,1
33 print "feature location is ",$feat->start, "..",
34 $feat->end, " on strand ", $feat->strand, "\n";
35 print "easy utility to print locations in GenBank/EMBL way ",
36 $feat->location->to_FTstring(), "\n";
38 foreach $tag ( $feat->get_all_tags() ) {
39 print "Feature has tag ", $tag, " with values, ",
40 join(' ',$feat->get_tag_values($tag)), "\n";
42 print "new feature\n" if $feat->has_tag('new');
43 # features can have sub features
44 my @subfeat = $feat->get_SeqFeatures();
47 =head1 DESCRIPTION
49 This interface is the functions one can expect for any Sequence
50 Feature, whatever its implementation or whether it is a more complex
51 type (eg, a Gene). This object does not actually provide any
52 implementation, it just provides the definitions of what methods one can
53 call. See Bio::SeqFeature::Generic for a good standard implementation
54 of this object
56 =head1 FEEDBACK
58 User feedback is an integral part of the evolution of this and other
59 Bioperl modules. Send your comments and suggestions preferably to one
60 of the Bioperl mailing lists. Your participation is much appreciated.
62 bioperl-l@bioperl.org - General discussion
63 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
65 =head2 Support
67 Please direct usage questions or support issues to the mailing list:
69 I<bioperl-l@bioperl.org>
71 rather than to the module maintainer directly. Many experienced and
72 reponsive experts will be able look at the problem and quickly
73 address it. Please include a thorough description of the problem
74 with code and data examples if at all possible.
76 =head2 Reporting Bugs
78 Report bugs to the Bioperl bug tracking system to help us keep track
79 the bugs and their resolution. Bug reports can be submitted via the
80 web:
82 https://github.com/bioperl/bioperl-live/issues
84 =head1 APPENDIX
86 The rest of the documentation details each of the object
87 methods. Internal methods are usually preceded with a _
89 =cut
92 # Let the code begin...
95 package Bio::SeqFeatureI;
96 use vars qw($HasInMemory);
97 use strict;
98 BEGIN {
99 eval { require Bio::DB::InMemoryCache };
100 if( $@ ) { $HasInMemory = 0 }
101 else { $HasInMemory = 1 }
104 use Bio::Seq;
106 use Carp;
108 use base qw(Bio::RangeI);
110 =head1 Bio::SeqFeatureI specific methods
112 New method interfaces.
114 =cut
116 =head2 get_SeqFeatures
118 Title : get_SeqFeatures
119 Usage : @feats = $feat->get_SeqFeatures();
120 Function: Returns an array of sub Sequence Features
121 Returns : An array
122 Args : none
124 =cut
126 sub get_SeqFeatures{
127 my ($self,@args) = @_;
129 $self->throw_not_implemented();
132 =head2 display_name
134 Title : display_name
135 Usage : $name = $feat->display_name()
136 Function: Returns the human-readable name of the feature for displays.
137 Returns : a string
138 Args : none
140 =cut
142 sub display_name {
143 shift->throw_not_implemented();
146 =head2 primary_tag
148 Title : primary_tag
149 Usage : $tag = $feat->primary_tag()
150 Function: Returns the primary tag for a feature,
151 eg 'exon'
152 Returns : a string
153 Args : none
156 =cut
158 sub primary_tag{
159 my ($self,@args) = @_;
161 $self->throw_not_implemented();
165 =head2 source_tag
167 Title : source_tag
168 Usage : $tag = $feat->source_tag()
169 Function: Returns the source tag for a feature,
170 eg, 'genscan'
171 Returns : a string
172 Args : none
175 =cut
177 sub source_tag{
178 my ($self,@args) = @_;
180 $self->throw_not_implemented();
183 =head2 has_tag
185 Title : has_tag
186 Usage : $tag_exists = $self->has_tag('some_tag')
187 Function:
188 Returns : TRUE if the specified tag exists, and FALSE otherwise
189 Args :
191 =cut
193 sub has_tag{
194 my ($self,@args) = @_;
196 $self->throw_not_implemented();
200 =head2 get_tag_values
202 Title : get_tag_values
203 Usage : @values = $self->get_tag_values('some_tag')
204 Function:
205 Returns : An array comprising the values of the specified tag.
206 Args : a string
208 throws an exception if there is no such tag
210 =cut
212 sub get_tag_values {
213 shift->throw_not_implemented();
216 =head2 get_tagset_values
218 Title : get_tagset_values
219 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
220 Function:
221 Returns : An array comprising the values of the specified tags, in order of tags
222 Args : An array of strings
224 does NOT throw an exception if none of the tags are not present
226 this method is useful for getting a human-readable label for a
227 SeqFeatureI; not all tags can be assumed to be present, so a list of
228 possible tags in preferential order is provided
230 =cut
232 # interface + abstract method
233 sub get_tagset_values {
234 my ($self, @args) = @_;
235 my @vals = ();
236 foreach my $arg (@args) {
237 if ($self->has_tag($arg)) {
238 push(@vals, $self->get_tag_values($arg));
241 return @vals;
244 =head2 get_all_tags
246 Title : get_all_tags
247 Usage : @tags = $feat->get_all_tags()
248 Function: gives all tags for this feature
249 Returns : an array of strings
250 Args : none
253 =cut
255 sub get_all_tags{
256 shift->throw_not_implemented();
259 =head2 attach_seq
261 Title : attach_seq
262 Usage : $sf->attach_seq($seq)
263 Function: Attaches a Bio::Seq object to this feature. This
264 Bio::Seq object is for the *entire* sequence: ie
265 from 1 to 10000
267 Note that it is not guaranteed that if you obtain a feature from
268 an object in bioperl, it will have a sequence attached. Also,
269 implementors of this interface can choose to provide an empty
270 implementation of this method. I.e., there is also no guarantee
271 that if you do attach a sequence, seq() or entire_seq() will not
272 return undef.
274 The reason that this method is here on the interface is to enable
275 you to call it on every SeqFeatureI compliant object, and
276 that it will be implemented in a useful way and set to a useful
277 value for the great majority of use cases. Implementors who choose
278 to ignore the call are encouraged to specifically state this in
279 their documentation.
281 Example :
282 Returns : TRUE on success
283 Args : a Bio::PrimarySeqI compliant object
286 =cut
288 sub attach_seq {
289 shift->throw_not_implemented();
292 =head2 seq
294 Title : seq
295 Usage : $tseq = $sf->seq()
296 Function: returns the truncated sequence (if there is a sequence attached)
297 for this feature
298 Example :
299 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
300 bounded by start & end, or undef if there is no sequence attached
301 Args : none
304 =cut
306 sub seq {
307 shift->throw_not_implemented();
310 =head2 entire_seq
312 Title : entire_seq
313 Usage : $whole_seq = $sf->entire_seq()
314 Function: gives the entire sequence that this seqfeature is attached to
315 Example :
316 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
317 sequence attached
318 Args : none
321 =cut
323 sub entire_seq {
324 shift->throw_not_implemented();
328 =head2 seq_id
330 Title : seq_id
331 Usage : $obj->seq_id($newval)
332 Function: There are many cases when you make a feature that you
333 do know the sequence name, but do not know its actual
334 sequence. This is an attribute such that you can store
335 the ID (e.g., display_id) of the sequence.
337 This attribute should *not* be used in GFF dumping, as
338 that should come from the collection in which the seq
339 feature was found.
340 Returns : value of seq_id
341 Args : newvalue (optional)
344 =cut
346 sub seq_id {
347 shift->throw_not_implemented();
350 =head2 gff_string
352 Title : gff_string
353 Usage : $str = $feat->gff_string;
354 $str = $feat->gff_string($gff_formatter);
355 Function: Provides the feature information in GFF format.
357 The implementation provided here returns GFF2 by default. If you
358 want a different version, supply an object implementing a method
359 gff_string() accepting a SeqFeatureI object as argument. E.g., to
360 obtain GFF1 format, do the following:
362 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
363 $gff1str = $feat->gff_string($gff1io);
365 Returns : A string
366 Args : Optionally, an object implementing gff_string().
369 =cut
371 sub gff_string{
372 my ($self,$formatter) = @_;
374 $formatter = $self->_static_gff_formatter unless $formatter;
375 return $formatter->gff_string($self);
378 my $static_gff_formatter = undef;
380 =head2 _static_gff_formatter
382 Title : _static_gff_formatter
383 Usage :
384 Function:
385 Example :
386 Returns :
387 Args :
390 =cut
392 sub _static_gff_formatter{
393 my ($self,@args) = @_;
394 require Bio::Tools::GFF; # on the fly inclusion -- is this better?
395 if( !defined $static_gff_formatter ) {
396 $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2);
398 return $static_gff_formatter;
402 =head1 Decorating methods
404 These methods have an implementation provided by Bio::SeqFeatureI,
405 but can be validly overwritten by subclasses
407 =head2 spliced_seq
409 Title : spliced_seq
411 Usage : $seq = $feature->spliced_seq()
412 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
414 Function: Provides a sequence of the feature which is the most
415 semantically "relevant" feature for this sequence. A default
416 implementation is provided which for simple cases returns just
417 the sequence, but for split cases, loops over the split location
418 to return the sequence. In the case of split locations with
419 remote locations, eg
421 join(AB000123:5567-5589,80..1144)
423 in the case when a database object is passed in, it will attempt
424 to retrieve the sequence from the database object, and "Do the right thing",
425 however if no database object is provided, it will generate the correct
426 number of N's (DNA) or X's (protein, though this is unlikely).
428 This function is deliberately "magical" attempting to second guess
429 what a user wants as "the" sequence for this feature.
431 Implementing classes are free to override this method with their
432 own magic if they have a better idea what the user wants.
434 Args : [optional]
435 -db A L<Bio::DB::RandomAccessI> compliant object if
436 one needs to retrieve remote seqs.
437 -nosort boolean if the locations should not be sorted
438 by start location. This may occur, for instance,
439 in a circular sequence where a gene span starts
440 before the end of the sequence and ends after the
441 sequence start. Example : join(15685..16260,1..207)
442 (default = if sequence is_circular(), 1, otherwise 0)
443 -phase truncates the returned sequence based on the
444 intron phase (0,1,2).
446 Returns : A L<Bio::PrimarySeqI> object
448 =cut
450 sub spliced_seq {
451 my $self = shift;
452 my @args = @_;
453 my ($db, $nosort, $phase) =
454 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
456 # set no_sort based on the parent sequence status
457 if ($self->entire_seq->is_circular) {
458 $nosort = 1;
461 # (added 7/7/06 to allow use old API (with warnings)
462 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ? 1 : 0;
463 if (@args && $old_api) {
464 $self->warn(q(API has changed; please use '-db' or '-nosort' ).
465 qq(for args. See POD for more details.));
466 $db = shift @args if @args;
467 $nosort = shift @args if @args;
468 $phase = shift @args if @args;
471 if (defined($phase) && ($phase < 0 || $phase > 2)) {
472 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
473 $phase = 0;
476 if( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
477 $self->warn("Must pass in a valid Bio::DB::RandomAccessI object".
478 " for access to remote locations for spliced_seq");
479 $db = undef;
480 } elsif( defined $db && $HasInMemory &&
481 $db->isa('Bio::DB::InMemoryCache') ) {
482 $db = Bio::DB::InMemoryCache->new(-seqdb => $db);
485 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
486 if ($phase) {
487 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
488 my $seqstr = substr($self->seq->seq, $phase);
489 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
490 . "_spliced_feat",
491 -seq => $seqstr);
492 return $out;
493 } else {
494 return $self->seq(); # nice and easy!
498 # redundant test, but the above ISA is probably not ideal.
499 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
500 $self->throw("not atomic, not split, yikes, in trouble!");
503 my $seqstr = '';
504 my $seqid = $self->entire_seq->display_id;
505 # This is to deal with reverse strand features
506 # so we are really sorting features 5' -> 3' on their strand
507 # i.e. rev strand features will be sorted largest to smallest
508 # as this how revcom CDSes seem to be annotated in genbank.
509 # Might need to eventually allow this to be programable?
510 # (can I mention how much fun this is NOT! --jason)
512 my ($mixed,$mixedloc, $fstrand) = (0);
514 if( $self->isa('Bio::Das::SegmentI') &&
515 ! $self->absolute ) {
516 $self->warn("Calling spliced_seq with a Bio::Das::SegmentI which does have absolute set to 1 -- be warned you may not be getting things on the correct strand");
519 my @locset = $self->location->each_Location;
520 my @locs;
521 if( ! $nosort ) {
522 @locs = map { $_->[0] }
523 # sort so that most negative is first basically to order
524 # the features on the opposite strand 5'->3' on their strand
525 # rather than they way most are input which is on the fwd strand
527 sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
528 map {
529 $fstrand = $_->strand unless defined $fstrand;
530 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
531 if( defined $_->seq_id ) {
532 $mixedloc = 1 if( $_->seq_id ne $seqid );
534 [ $_, $_->start * ($_->strand || 1)];
535 } @locset;
537 if ( $mixed ) {
538 $self->warn("Mixed strand locations, spliced seq using the input order rather than trying to sort");
539 @locs = @locset;
541 } else {
542 # use the original order instead of trying to sort
543 @locs = @locset;
544 $fstrand = $locs[0]->strand;
547 foreach my $loc ( @locs ) {
548 if( ! $loc->isa("Bio::Location::Atomic") ) {
549 $self->throw("Can only deal with one level deep locations");
551 my $called_seq;
552 if( $fstrand != $loc->strand ) {
553 $self->warn("feature strand is different from location strand!");
555 # deal with remote sequences
557 if( defined $loc->seq_id &&
558 $loc->seq_id ne $seqid ) {
559 if( defined $db ) {
560 my $sid = $loc->seq_id;
561 $sid =~ s/\.\d+$//g;
562 eval {
563 $called_seq = $db->get_Seq_by_acc($sid);
565 if( $@ ) {
566 $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@");
567 $called_seq = undef;
569 } else {
570 $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
571 $called_seq = undef;
573 if( !defined $called_seq ) {
574 $seqstr .= 'N' x $self->length;
575 next;
577 } else {
578 $called_seq = $self->entire_seq;
581 # does the called sequence make sense? Bug 1780
582 if ($called_seq->length < $loc->end) {
583 my $accession = $called_seq->accession;
584 my $end = $loc->end;
585 my $length = $called_seq->length;
586 my $orig_id = $self->seq_id; # originating sequence
587 my ($locus) = $self->get_tagset_values("locus_tag");
588 $self->throw("Location end ($end) exceeds length ($length) of ".
589 "called sequence $accession.\nCheck sequence version used in ".
590 "$locus locus-tagged SeqFeature in $orig_id.");
593 if( $self->isa('Bio::Das::SegmentI') ) {
594 my ($s,$e) = ($loc->start,$loc->end);
595 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
596 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
597 # in turn returns a string. Confused?
598 $seqstr .= $called_seq->subseq($s,$e)->seq()->seq();
599 } else {
600 # If guide_strand is defined, assemble the sequence first and revcom later if needed,
601 # if its not defined, apply revcom immediately to proper locations
602 if (defined $self->location->guide_strand) {
603 $seqstr .= $called_seq->subseq($loc->start,$loc->end);
604 } else {
605 my $strand = defined ($loc->strand) ? ($loc->strand) : 0;
606 if ($strand == -1) {
607 $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq;
608 } else {
609 $seqstr .= $called_seq->subseq($loc->start,$loc->end);
614 # Use revcom only after the whole sequence has been assembled
615 my $guide_strand = defined ($self->location->guide_strand) ? ($self->location->guide_strand) : 0;
616 if ($guide_strand == -1) {
617 my $seqstr_obj = Bio::Seq->new(-seq => $seqstr);
618 $seqstr = $seqstr_obj->revcom->seq;
621 if (defined($phase)) {
622 $seqstr = substr($seqstr, $phase);
625 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
626 . "_spliced_feat",
627 -seq => $seqstr);
629 return $out;
632 =head2 location
634 Title : location
635 Usage : my $location = $seqfeature->location()
636 Function: returns a location object suitable for identifying location
637 of feature on sequence or parent feature
638 Returns : Bio::LocationI object
639 Args : none
642 =cut
644 sub location {
645 my ($self) = @_;
647 $self->throw_not_implemented();
651 =head2 primary_id
653 Title : primary_id
654 Usage : $obj->primary_id($newval)
655 Function:
656 Example :
657 Returns : value of primary_id (a scalar)
658 Args : on set, new value (a scalar or undef, optional)
660 Primary ID is a synonym for the tag 'ID'
662 =cut
664 sub primary_id{
665 my $self = shift;
666 # note from cjm@fruitfly.org:
667 # I have commented out the following 2 lines:
669 #return $self->{'primary_id'} = shift if @_;
670 #return $self->{'primary_id'};
672 #... and replaced it with the following; see
673 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
674 # for the discussion that lead to this change
676 if (@_) {
677 if ($self->has_tag('ID')) {
678 $self->remove_tag('ID');
680 $self->add_tag_value('ID', shift);
682 my ($id) = $self->get_tagset_values('ID');
683 return $id;
686 sub generate_unique_persistent_id {
687 # DEPRECATED - us IDHandler
688 my $self = shift;
689 require Bio::SeqFeature::Tools::IDHandler;
690 Bio::SeqFeature::Tools::IDHandler->new->generate_unique_persistent_id($self);
694 =head2 phase
696 Title : phase
697 Usage : $obj->phase($newval)
698 Function: get/set this feature's phase.
699 Example :
700 Returns : undef if no phase is set,
701 otherwise 0, 1, or 2 (the only valid values for phase)
702 Args : on set, the new value
704 Most features do not have or need a defined phase.
706 For features representing a CDS, the phase indicates where the feature
707 begins with reference to the reading frame. The phase is one of the
708 integers 0, 1, or 2, indicating the number of bases that should be
709 removed from the beginning of this feature to reach the first base of
710 the next codon. In other words, a phase of "0" indicates that the next
711 codon begins at the first base of the region described by the current
712 line, a phase of "1" indicates that the next codon begins at the
713 second base of this region, and a phase of "2" indicates that the
714 codon begins at the third base of this region. This is NOT to be
715 confused with the frame, which is simply start modulo 3.
717 For forward strand features, phase is counted from the start
718 field. For reverse strand features, phase is counted from the end
719 field.
721 =cut
723 sub phase {
724 my $self = shift;
725 if( @_ ) {
726 $self->remove_tag('phase') if $self->has_tag('phase');
727 my $newphase = shift;
728 $self->throw("illegal phase value '$newphase', phase must be either undef, 0, 1, or 2")
729 unless !defined $newphase || $newphase == 0 || $newphase == 1 || $newphase == 2;
730 $self->add_tag_value('phase', $newphase );
731 return $newphase;
734 return $self->has_tag('phase') ? ($self->get_tag_values('phase'))[0] : undef;
738 =head1 Bio::RangeI methods
740 These methods are inherited from RangeI and can be used
741 directly from a SeqFeatureI interface. Remember that a
742 SeqFeature is-a RangeI, and so wherever you see RangeI you
743 can use a feature ($r in the below documentation).
745 =cut
747 =head2 start()
749 See L<Bio::RangeI>
751 =head2 end()
753 See L<Bio::RangeI>
755 =head2 strand()
757 See L<Bio::RangeI>
759 =head2 overlaps()
761 See L<Bio::RangeI>
763 =head2 contains()
765 See L<Bio::RangeI>
767 =head2 equals()
769 See L<Bio::RangeI>
771 =head2 intersection()
773 See L<Bio::RangeI>
775 =head2 union()
777 See L<Bio::RangeI>
779 =cut