2 # BioPerl module for Bio::SeqFeatureI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
20 # get a seqfeature somehow, eg, from a Sequence with Features attached
22 foreach $feat ( $seq->get_SeqFeatures() ) {
23 print "Feature from ", $feat->start, "to ",
24 $feat->end, " Primary tag ", $feat->primary_tag,
25 ", produced by ", $feat->source_tag(), "\n";
27 if ( $feat->strand == 0 ) {
28 print "Feature applicable to either strand\n";
31 print "Feature on strand ", $feat->strand,"\n"; # -1,1
34 print "feature location is ",$feat->start, "..",
35 $feat->end, " on strand ", $feat->strand, "\n";
36 print "easy utility to print locations in GenBank/EMBL way ",
37 $feat->location->to_FTstring(), "\n";
39 foreach $tag ( $feat->get_all_tags() ) {
40 print "Feature has tag ", $tag, " with values, ",
41 join(' ',$feat->get_tag_values($tag)), "\n";
43 print "new feature\n" if $feat->has_tag('new');
44 # features can have sub features
45 my @subfeat = $feat->get_SeqFeatures();
50 This interface is the functions one can expect for any Sequence
51 Feature, whatever its implementation or whether it is a more complex
52 type (eg, a Gene). This object does not actually provide any
53 implementation, it just provides the definitions of what methods one can
54 call. See Bio::SeqFeature::Generic for a good standard implementation
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to one
61 of the Bioperl mailing lists. Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
83 https://github.com/bioperl/bioperl-live/issues
87 The rest of the documentation details each of the object
88 methods. Internal methods are usually preceded with a _
93 # Let the code begin...
96 package Bio
::SeqFeatureI
;
97 use vars
qw($HasInMemory);
100 eval { require Bio::DB::InMemoryCache };
101 if( $@ ) { $HasInMemory = 0 }
102 else { $HasInMemory = 1 }
109 use base qw(Bio::RangeI);
111 =head1 Bio::SeqFeatureI specific methods
113 New method interfaces.
117 =head2 get_SeqFeatures
119 Title : get_SeqFeatures
120 Usage : @feats = $feat->get_SeqFeatures();
121 Function: Returns an array of sub Sequence Features
128 my ($self,@args) = @_;
130 $self->throw_not_implemented();
136 Usage : $name = $feat->display_name()
137 Function: Returns the human-readable name of the feature for displays.
144 shift->throw_not_implemented();
150 Usage : $tag = $feat->primary_tag()
151 Function: Returns the primary tag for a feature,
160 my ($self,@args) = @_;
162 $self->throw_not_implemented();
169 Usage : $tag = $feat->source_tag()
170 Function: Returns the source tag for a feature,
179 my ($self,@args) = @_;
181 $self->throw_not_implemented();
187 Usage : $tag_exists = $self->has_tag('some_tag')
189 Returns : TRUE if the specified tag exists, and FALSE otherwise
195 my ($self,@args) = @_;
197 $self->throw_not_implemented();
201 =head2 get_tag_values
203 Title : get_tag_values
204 Usage : @values = $self->get_tag_values('some_tag')
206 Returns : An array comprising the values of the specified tag.
209 throws an exception if there is no such tag
214 shift->throw_not_implemented();
217 =head2 get_tagset_values
219 Title : get_tagset_values
220 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
222 Returns : An array comprising the values of the specified tags, in order of tags
223 Args : An array of strings
225 does NOT throw an exception if none of the tags are not present
227 this method is useful for getting a human-readable label for a
228 SeqFeatureI; not all tags can be assumed to be present, so a list of
229 possible tags in preferential order is provided
233 # interface + abstract method
234 sub get_tagset_values
{
235 my ($self, @args) = @_;
237 foreach my $arg (@args) {
238 if ($self->has_tag($arg)) {
239 push(@vals, $self->get_tag_values($arg));
248 Usage : @tags = $feat->get_all_tags()
249 Function: gives all tags for this feature
250 Returns : an array of strings
257 shift->throw_not_implemented();
263 Usage : $sf->attach_seq($seq)
264 Function: Attaches a Bio::Seq object to this feature. This
265 Bio::Seq object is for the *entire* sequence: ie
268 Note that it is not guaranteed that if you obtain a feature from
269 an object in bioperl, it will have a sequence attached. Also,
270 implementors of this interface can choose to provide an empty
271 implementation of this method. I.e., there is also no guarantee
272 that if you do attach a sequence, seq() or entire_seq() will not
275 The reason that this method is here on the interface is to enable
276 you to call it on every SeqFeatureI compliant object, and
277 that it will be implemented in a useful way and set to a useful
278 value for the great majority of use cases. Implementors who choose
279 to ignore the call are encouraged to specifically state this in
283 Returns : TRUE on success
284 Args : a Bio::PrimarySeqI compliant object
290 shift->throw_not_implemented();
296 Usage : $tseq = $sf->seq()
297 Function: returns the truncated sequence (if there is a sequence attached)
300 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
301 bounded by start & end, or undef if there is no sequence attached
308 shift->throw_not_implemented();
314 Usage : $whole_seq = $sf->entire_seq()
315 Function: gives the entire sequence that this seqfeature is attached to
317 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
325 shift->throw_not_implemented();
332 Usage : $obj->seq_id($newval)
333 Function: There are many cases when you make a feature that you
334 do know the sequence name, but do not know its actual
335 sequence. This is an attribute such that you can store
336 the ID (e.g., display_id) of the sequence.
338 This attribute should *not* be used in GFF dumping, as
339 that should come from the collection in which the seq
341 Returns : value of seq_id
342 Args : newvalue (optional)
348 shift->throw_not_implemented();
354 Usage : $str = $feat->gff_string;
355 $str = $feat->gff_string($gff_formatter);
356 Function: Provides the feature information in GFF format.
358 The implementation provided here returns GFF2 by default. If you
359 want a different version, supply an object implementing a method
360 gff_string() accepting a SeqFeatureI object as argument. E.g., to
361 obtain GFF1 format, do the following:
363 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
364 $gff1str = $feat->gff_string($gff1io);
367 Args : Optionally, an object implementing gff_string().
373 my ($self,$formatter) = @_;
375 $formatter = $self->_static_gff_formatter unless $formatter;
376 return $formatter->gff_string($self);
379 my $static_gff_formatter = undef;
381 =head2 _static_gff_formatter
383 Title : _static_gff_formatter
393 sub _static_gff_formatter
{
394 my ($self,@args) = @_;
395 require Bio
::Tools
::GFF
; # on the fly inclusion -- is this better?
396 if( !defined $static_gff_formatter ) {
397 $static_gff_formatter = Bio
::Tools
::GFF
->new('-gff_version' => 2);
399 return $static_gff_formatter;
403 =head1 Decorating methods
405 These methods have an implementation provided by Bio::SeqFeatureI,
406 but can be validly overwritten by subclasses
412 Usage : $seq = $feature->spliced_seq()
413 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
415 Function: Provides a sequence of the feature which is the most
416 semantically "relevant" feature for this sequence. A default
417 implementation is provided which for simple cases returns just
418 the sequence, but for split cases, loops over the split location
419 to return the sequence. In the case of split locations with
422 join(AB000123:5567-5589,80..1144)
424 in the case when a database object is passed in, it will attempt
425 to retrieve the sequence from the database object, and "Do the right thing",
426 however if no database object is provided, it will generate the correct
427 number of N's (DNA) or X's (protein, though this is unlikely).
429 This function is deliberately "magical" attempting to second guess
430 what a user wants as "the" sequence for this feature.
432 Implementing classes are free to override this method with their
433 own magic if they have a better idea what the user wants.
436 -db A L<Bio::DB::RandomAccessI> compliant object if
437 one needs to retrieve remote seqs.
438 -nosort boolean if the locations should not be sorted
439 by start location. This may occur, for instance,
440 in a circular sequence where a gene span starts
441 before the end of the sequence and ends after the
442 sequence start. Example : join(15685..16260,1..207)
443 (default = if sequence is_circular(), 1, otherwise 0)
444 -phase truncates the returned sequence based on the
445 intron phase (0,1,2).
447 Returns : A L<Bio::PrimarySeqI> object
454 my ($db, $nosort, $phase) =
455 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
457 # set no_sort based on the parent sequence status
458 if ($self->entire_seq->is_circular) {
462 # (added 7/7/06 to allow use old API (with warnings)
463 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ?
1 : 0;
464 if (@args && $old_api) {
465 $self->warn( q
(API has changed
; please
use '-db' or '-nosort' )
466 . qq(for args
. See POD
for more details
.));
467 $db = shift @args if @args;
468 $nosort = shift @args if @args;
469 $phase = shift @args if @args;
472 if (defined($phase) && ($phase < 0 || $phase > 2)) {
473 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
477 if ( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
478 $self->warn( "Must pass in a valid Bio::DB::RandomAccessI object"
479 . " for access to remote locations for spliced_seq");
482 elsif ( defined $db && $HasInMemory && $db->isa('Bio::DB::InMemoryCache') ) {
483 $db = Bio
::DB
::InMemoryCache
->new(-seqdb
=> $db);
486 if ( not $self->location->isa("Bio::Location::SplitLocationI") ) {
488 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
489 my $seqstr = substr($self->seq->seq, $phase);
490 my $out = Bio
::Seq
->new( -id
=> $self->entire_seq->display_id
496 return $self->seq(); # nice and easy!
500 # redundant test, but the above ISA is probably not ideal.
501 if ( not $self->location->isa("Bio::Location::SplitLocationI") ) {
502 $self->throw("not atomic, not split, yikes, in trouble!");
506 my $seqid = $self->entire_seq->display_id;
507 # This is to deal with reverse strand features
508 # so we are really sorting features 5' -> 3' on their strand
509 # i.e. rev strand features will be sorted largest to smallest
510 # as this how revcom CDSes seem to be annotated in genbank.
511 # Might need to eventually allow this to be programable?
512 # (can I mention how much fun this is NOT! --jason)
514 my ($mixed,$mixedloc, $fstrand) = (0);
516 if ( $self->isa('Bio::Das::SegmentI') and not $self->absolute ) {
517 $self->warn( "Calling spliced_seq with a Bio::Das::SegmentI which "
518 . "does have absolute set to 1 -- be warned you may not "
519 . "be getting things on the correct strand");
522 my @locset = $self->location->each_Location;
525 # @locs = map { $_->[0] }
526 # sort so that most negative is first basically to order
527 # the features on the opposite strand 5'->3' on their strand
528 # rather than they way most are input which is on the fwd strand
530 # sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
533 $fstrand = $_->strand unless defined $fstrand;
534 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
536 if( defined $_->seq_id ) {
537 $mixedloc = 1 if( $_->seq_id ne $seqid );
539 [ $_, $_->start * ($_->strand || 1) ];
543 if ( $fstrand == 1 ) {
544 @sort_locs = sort { $a->[1] <=> $b->[1] } @proc_locs; # Yes Tim, Schwartzian transformation
545 }elsif ( $fstrand == -1 ){
546 @sort_locs = sort { $b->[1] <=> $a->[1] } @proc_locs; # Yes Tim, Schwartzian transformation
548 @sort_locs = @proc_locs;
550 @locs = map { $_->[0] } @sort_locs;
553 $self->warn( "Mixed strand locations, spliced seq using the "
554 . "input order rather than trying to sort");
559 # use the original order instead of trying to sort
561 $fstrand = $locs[0]->strand;
566 my $called_seq = undef;
567 # This will be left as undefined if 1) db is remote or 2)seq_id is undefined.
568 # In that case, old code is used to make exon sequence
569 my $called_seq_seq = undef;
570 my $called_seq_len = undef;
572 foreach my $loc ( @locs ) {
573 if ( not $loc->isa("Bio::Location::Atomic") ) {
574 $self->throw("Can only deal with one level deep locations");
577 if ( $fstrand != $loc->strand ) {
578 $self->warn("feature strand is different from location strand!");
582 if ( defined $loc->seq_id ) {
583 $loc_seq_id = $loc->seq_id;
585 # deal with remote sequences
586 if ($loc_seq_id ne $seqid ) {
587 # might be too big to download whole sequence
588 $called_seq_seq = undef;
591 my $sid = $loc_seq_id;
594 $called_seq = $db->get_Seq_by_acc($sid);
597 $self->warn( "In attempting to join a remote location, sequence $sid "
598 . "was not in database. Will provide padding N's. Full exception \n\n$@");
603 $self->warn( "cannot get remote location for ".$loc_seq_id ." without a valid "
604 . "Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
607 if ( !defined $called_seq ) {
608 $seqstr .= 'N' x
$loc->length;
612 # have local sequence available
614 # don't have to pull out source sequence again if it's local unless
615 # it's the first exon or different from previous exon
616 unless (defined(($last_id) && $last_id eq $loc_seq_id )){
617 $called_seq = $self->entire_seq;
618 $called_seq_seq = $called_seq->seq(); # this is slow
622 #undefined $loc->seq->id
624 $called_seq = $self->entire_seq;
625 $called_seq_seq = undef;
628 my ($start,$end) = ($loc->start,$loc->end);
630 # does the called sequence make sense? Bug 1780
633 # can avoid a seq() call on called_seq
634 if (defined($called_seq_seq)) {
635 $called_seq_len = length($called_seq_seq);
637 # can't avoid a seq() call on called_seq
639 $called_seq_len = $called_seq->length # this is slow
642 if ($called_seq_len < $loc->end) {
643 my $accession = $called_seq->accession;
644 my $orig_id = $self->seq_id; # originating sequence
645 my ($locus) = $self->get_tagset_values("locus_tag");
646 $self->throw( "Location end ($end) exceeds length ($called_seq_len) of "
647 . "called sequence $accession.\nCheck sequence version used in "
648 . "$locus locus-tagged SeqFeature in $orig_id.");
651 if ( $self->isa('Bio::Das::SegmentI') ) {
652 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
653 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
654 # in turn returns a string. Confused?
655 $seqstr .= $called_seq->subseq($start,$end)->seq()->seq(); # this is slow
659 if (defined ($called_seq_seq)){
660 $exon_seq = substr($called_seq_seq, $start-1, $end-$start+1); # this is quick
663 $exon_seq = $called_seq->subseq($loc->start,$loc->end); # this is slow
666 # If guide_strand is defined, assemble the sequence first and revcom later if needed,
667 # if its not defined, apply revcom immediately to proper locations
668 if (defined $self->location->guide_strand) {
669 $seqstr .= $exon_seq;
672 my $strand = defined ($loc->strand) ?
($loc->strand) : 0;
676 $exon_seq = reverse($exon_seq);
677 $exon_seq =~ tr/ABCDGHKMNRSTUVWXYabcdghkmnrstuvwxy/TVGHCDMKNYSAABWXRtvghcdmknysaabwxr/;
678 $seqstr .= $exon_seq;
681 $seqstr .= $exon_seq;
686 $last_id = $loc_seq_id if (defined($loc_seq_id));
689 # Use revcom only after the whole sequence has been assembled
690 my $guide_strand = defined ($self->location->guide_strand) ?
($self->location->guide_strand) : 0;
691 if ($guide_strand == -1) {
692 my $seqstr_obj = Bio
::Seq
->new(-seq
=> $seqstr);
693 $seqstr = $seqstr_obj->revcom->seq;
696 if (defined($phase)) {
697 $seqstr = substr($seqstr, $phase);
700 my $out = Bio
::Seq
->new( -id
=> $self->entire_seq->display_id
710 Usage : my $location = $seqfeature->location()
711 Function: returns a location object suitable for identifying location
712 of feature on sequence or parent feature
713 Returns : Bio::LocationI object
722 $self->throw_not_implemented();
729 Usage : $obj->primary_id($newval)
732 Returns : value of primary_id (a scalar)
733 Args : on set, new value (a scalar or undef, optional)
735 Primary ID is a synonym for the tag 'ID'
741 # note from cjm@fruitfly.org:
742 # I have commented out the following 2 lines:
744 #return $self->{'primary_id'} = shift if @_;
745 #return $self->{'primary_id'};
747 #... and replaced it with the following; see
748 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
749 # for the discussion that lead to this change
752 if ($self->has_tag('ID')) {
753 $self->remove_tag('ID');
755 $self->add_tag_value('ID', shift);
757 my ($id) = $self->get_tagset_values('ID');
761 sub generate_unique_persistent_id
{
762 # DEPRECATED - us IDHandler
764 require Bio
::SeqFeature
::Tools
::IDHandler
;
765 Bio
::SeqFeature
::Tools
::IDHandler
->new->generate_unique_persistent_id($self);
772 Usage : $obj->phase($newval)
773 Function: get/set this feature's phase.
775 Returns : undef if no phase is set,
776 otherwise 0, 1, or 2 (the only valid values for phase)
777 Args : on set, the new value
779 Most features do not have or need a defined phase.
781 For features representing a CDS, the phase indicates where the feature
782 begins with reference to the reading frame. The phase is one of the
783 integers 0, 1, or 2, indicating the number of bases that should be
784 removed from the beginning of this feature to reach the first base of
785 the next codon. In other words, a phase of "0" indicates that the next
786 codon begins at the first base of the region described by the current
787 line, a phase of "1" indicates that the next codon begins at the
788 second base of this region, and a phase of "2" indicates that the
789 codon begins at the third base of this region. This is NOT to be
790 confused with the frame, which is simply start modulo 3.
792 For forward strand features, phase is counted from the start
793 field. For reverse strand features, phase is counted from the end
801 $self->remove_tag('phase') if $self->has_tag('phase');
802 my $newphase = shift;
803 $self->throw("illegal phase value '$newphase', phase must be either undef, 0, 1, or 2")
804 unless !defined $newphase || $newphase == 0 || $newphase == 1 || $newphase == 2;
805 $self->add_tag_value('phase', $newphase );
809 return $self->has_tag('phase') ?
($self->get_tag_values('phase'))[0] : undef;
813 =head1 Bio::RangeI methods
815 These methods are inherited from RangeI and can be used
816 directly from a SeqFeatureI interface. Remember that a
817 SeqFeature is-a RangeI, and so wherever you see RangeI you
818 can use a feature ($r in the below documentation).
846 =head2 intersection()