2 # BioPerl module for Bio::SeqFeatureI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
20 # get a seqfeature somehow, eg, from a Sequence with Features attached
22 foreach $feat ( $seq->get_SeqFeatures() ) {
23 print "Feature from ", $feat->start, "to ",
24 $feat->end, " Primary tag ", $feat->primary_tag,
25 ", produced by ", $feat->source_tag(), "\n";
27 if( $feat->strand == 0 ) {
28 print "Feature applicable to either strand\n";
30 print "Feature on strand ", $feat->strand,"\n"; # -1,1
33 print "feature location is ",$feat->start, "..",
34 $feat->end, " on strand ", $feat->strand, "\n";
35 print "easy utility to print locations in GenBank/EMBL way ",
36 $feat->location->to_FTstring(), "\n";
38 foreach $tag ( $feat->get_all_tags() ) {
39 print "Feature has tag ", $tag, " with values, ",
40 join(' ',$feat->get_tag_values($tag)), "\n";
42 print "new feature\n" if $feat->has_tag('new');
43 # features can have sub features
44 my @subfeat = $feat->get_SeqFeatures();
49 This interface is the functions one can expect for any Sequence
50 Feature, whatever its implementation or whether it is a more complex
51 type (eg, a Gene). This object does not actually provide any
52 implementation, it just provides the definitions of what methods one can
53 call. See Bio::SeqFeature::Generic for a good standard implementation
58 User feedback is an integral part of the evolution of this and other
59 Bioperl modules. Send your comments and suggestions preferably to one
60 of the Bioperl mailing lists. Your participation is much appreciated.
62 bioperl-l@bioperl.org - General discussion
63 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
67 Please direct usage questions or support issues to the mailing list:
69 I<bioperl-l@bioperl.org>
71 rather than to the module maintainer directly. Many experienced and
72 reponsive experts will be able look at the problem and quickly
73 address it. Please include a thorough description of the problem
74 with code and data examples if at all possible.
78 Report bugs to the Bioperl bug tracking system to help us keep track
79 the bugs and their resolution. Bug reports can be submitted via the
82 https://redmine.open-bio.org/projects/bioperl/
86 The rest of the documentation details each of the object
87 methods. Internal methods are usually preceded with a _
92 # Let the code begin...
95 package Bio
::SeqFeatureI
;
96 use vars
qw($HasInMemory);
99 eval { require Bio::DB::InMemoryCache };
100 if( $@ ) { $HasInMemory = 0 }
101 else { $HasInMemory = 1 }
108 use base qw(Bio::RangeI);
110 =head1 Bio::SeqFeatureI specific methods
112 New method interfaces.
116 =head2 get_SeqFeatures
118 Title : get_SeqFeatures
119 Usage : @feats = $feat->get_SeqFeatures();
120 Function: Returns an array of sub Sequence Features
127 my ($self,@args) = @_;
129 $self->throw_not_implemented();
135 Usage : $name = $feat->display_name()
136 Function: Returns the human-readable name of the feature for displays.
143 shift->throw_not_implemented();
149 Usage : $tag = $feat->primary_tag()
150 Function: Returns the primary tag for a feature,
159 my ($self,@args) = @_;
161 $self->throw_not_implemented();
168 Usage : $tag = $feat->source_tag()
169 Function: Returns the source tag for a feature,
178 my ($self,@args) = @_;
180 $self->throw_not_implemented();
186 Usage : $tag_exists = $self->has_tag('some_tag')
188 Returns : TRUE if the specified tag exists, and FALSE otherwise
194 my ($self,@args) = @_;
196 $self->throw_not_implemented();
200 =head2 get_tag_values
202 Title : get_tag_values
203 Usage : @values = $self->get_tag_values('some_tag')
205 Returns : An array comprising the values of the specified tag.
208 throws an exception if there is no such tag
213 shift->throw_not_implemented();
216 =head2 get_tagset_values
218 Title : get_tagset_values
219 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
221 Returns : An array comprising the values of the specified tags, in order of tags
222 Args : An array of strings
224 does NOT throw an exception if none of the tags are not present
226 this method is useful for getting a human-readable label for a
227 SeqFeatureI; not all tags can be assumed to be present, so a list of
228 possible tags in preferential order is provided
232 # interface + abstract method
233 sub get_tagset_values
{
234 my ($self, @args) = @_;
236 foreach my $arg (@args) {
237 if ($self->has_tag($arg)) {
238 push(@vals, $self->get_tag_values($arg));
247 Usage : @tags = $feat->get_all_tags()
248 Function: gives all tags for this feature
249 Returns : an array of strings
256 shift->throw_not_implemented();
262 Usage : $sf->attach_seq($seq)
263 Function: Attaches a Bio::Seq object to this feature. This
264 Bio::Seq object is for the *entire* sequence: ie
267 Note that it is not guaranteed that if you obtain a feature from
268 an object in bioperl, it will have a sequence attached. Also,
269 implementors of this interface can choose to provide an empty
270 implementation of this method. I.e., there is also no guarantee
271 that if you do attach a sequence, seq() or entire_seq() will not
274 The reason that this method is here on the interface is to enable
275 you to call it on every SeqFeatureI compliant object, and
276 that it will be implemented in a useful way and set to a useful
277 value for the great majority of use cases. Implementors who choose
278 to ignore the call are encouraged to specifically state this in
282 Returns : TRUE on success
283 Args : a Bio::PrimarySeqI compliant object
289 shift->throw_not_implemented();
295 Usage : $tseq = $sf->seq()
296 Function: returns the truncated sequence (if there is a sequence attached)
299 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
300 bounded by start & end, or undef if there is no sequence attached
307 shift->throw_not_implemented();
313 Usage : $whole_seq = $sf->entire_seq()
314 Function: gives the entire sequence that this seqfeature is attached to
316 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
324 shift->throw_not_implemented();
331 Usage : $obj->seq_id($newval)
332 Function: There are many cases when you make a feature that you
333 do know the sequence name, but do not know its actual
334 sequence. This is an attribute such that you can store
335 the ID (e.g., display_id) of the sequence.
337 This attribute should *not* be used in GFF dumping, as
338 that should come from the collection in which the seq
340 Returns : value of seq_id
341 Args : newvalue (optional)
347 shift->throw_not_implemented();
353 Usage : $str = $feat->gff_string;
354 $str = $feat->gff_string($gff_formatter);
355 Function: Provides the feature information in GFF format.
357 The implementation provided here returns GFF2 by default. If you
358 want a different version, supply an object implementing a method
359 gff_string() accepting a SeqFeatureI object as argument. E.g., to
360 obtain GFF1 format, do the following:
362 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
363 $gff1str = $feat->gff_string($gff1io);
366 Args : Optionally, an object implementing gff_string().
372 my ($self,$formatter) = @_;
374 $formatter = $self->_static_gff_formatter unless $formatter;
375 return $formatter->gff_string($self);
378 my $static_gff_formatter = undef;
380 =head2 _static_gff_formatter
382 Title : _static_gff_formatter
392 sub _static_gff_formatter
{
393 my ($self,@args) = @_;
394 require Bio
::Tools
::GFF
; # on the fly inclusion -- is this better?
395 if( !defined $static_gff_formatter ) {
396 $static_gff_formatter = Bio
::Tools
::GFF
->new('-gff_version' => 2);
398 return $static_gff_formatter;
402 =head1 Decorating methods
404 These methods have an implementation provided by Bio::SeqFeatureI,
405 but can be validly overwritten by subclasses
411 Usage : $seq = $feature->spliced_seq()
412 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
414 Function: Provides a sequence of the feature which is the most
415 semantically "relevant" feature for this sequence. A default
416 implementation is provided which for simple cases returns just
417 the sequence, but for split cases, loops over the split location
418 to return the sequence. In the case of split locations with
421 join(AB000123:5567-5589,80..1144)
423 in the case when a database object is passed in, it will attempt
424 to retrieve the sequence from the database object, and "Do the right thing",
425 however if no database object is provided, it will generate the correct
426 number of N's (DNA) or X's (protein, though this is unlikely).
428 This function is deliberately "magical" attempting to second guess
429 what a user wants as "the" sequence for this feature.
431 Implementing classes are free to override this method with their
432 own magic if they have a better idea what the user wants.
435 -db A L<Bio::DB::RandomAccessI> compliant object if
436 one needs to retrieve remote seqs.
437 -nosort boolean if the locations should not be sorted
438 by start location. This may occur, for instance,
439 in a circular sequence where a gene span starts
440 before the end of the sequence and ends after the
441 sequence start. Example : join(15685..16260,1..207)
442 (default = if sequence is_circular(), 1, otherwise 0)
443 -phase truncates the returned sequence based on the
444 intron phase (0,1,2).
446 Returns : A L<Bio::PrimarySeqI> object
453 my ($db, $nosort, $phase) =
454 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
456 # set no_sort based on the parent sequence status
457 if ($self->entire_seq->is_circular) {
461 # (added 7/7/06 to allow use old API (with warnings)
462 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ?
1 : 0;
463 if (@args && $old_api) {
464 $self->warn(q
(API has changed
; please
use '-db' or '-nosort' ).
465 qq(for args
. See POD
for more details
.));
466 $db = shift @args if @args;
467 $nosort = shift @args if @args;
468 $phase = shift @args if @args;
471 if (defined($phase) && ($phase < 0 || $phase > 2)) {
472 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
476 if( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
477 $self->warn("Must pass in a valid Bio::DB::RandomAccessI object".
478 " for access to remote locations for spliced_seq");
480 } elsif( defined $db && $HasInMemory &&
481 $db->isa('Bio::DB::InMemoryCache') ) {
482 $db = Bio
::DB
::InMemoryCache
->new(-seqdb
=> $db);
485 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
487 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
488 my $seqstr = substr($self->seq->seq, $phase);
489 my $out = Bio
::Seq
->new( -id
=> $self->entire_seq->display_id
494 return $self->seq(); # nice and easy!
498 # redundant test, but the above ISA is probably not ideal.
499 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
500 $self->throw("not atomic, not split, yikes, in trouble!");
504 my $seqid = $self->entire_seq->display_id;
505 # This is to deal with reverse strand features
506 # so we are really sorting features 5' -> 3' on their strand
507 # i.e. rev strand features will be sorted largest to smallest
508 # as this how revcom CDSes seem to be annotated in genbank.
509 # Might need to eventually allow this to be programable?
510 # (can I mention how much fun this is NOT! --jason)
512 my ($mixed,$mixedloc, $fstrand) = (0);
514 if( $self->isa('Bio::Das::SegmentI') &&
515 ! $self->absolute ) {
516 $self->warn("Calling spliced_seq with a Bio::Das::SegmentI which does have absolute set to 1 -- be warned you may not be getting things on the correct strand");
519 my @locset = $self->location->each_Location;
522 @locs = map { $_->[0] }
523 # sort so that most negative is first basically to order
524 # the features on the opposite strand 5'->3' on their strand
525 # rather than they way most are input which is on the fwd strand
527 sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
529 $fstrand = $_->strand unless defined $fstrand;
530 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
531 if( defined $_->seq_id ) {
532 $mixedloc = 1 if( $_->seq_id ne $seqid );
534 [ $_, $_->start * ($_->strand || 1)];
538 $self->warn("Mixed strand locations, spliced seq using the input order rather than trying to sort");
542 # use the original order instead of trying to sort
544 $fstrand = $locs[0]->strand;
547 foreach my $loc ( @locs ) {
548 if( ! $loc->isa("Bio::Location::Atomic") ) {
549 $self->throw("Can only deal with one level deep locations");
552 if( $fstrand != $loc->strand ) {
553 $self->warn("feature strand is different from location strand!");
555 # deal with remote sequences
557 if( defined $loc->seq_id &&
558 $loc->seq_id ne $seqid ) {
560 my $sid = $loc->seq_id;
563 $called_seq = $db->get_Seq_by_acc($sid);
566 $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@");
570 $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
573 if( !defined $called_seq ) {
574 $seqstr .= 'N' x
$self->length;
578 $called_seq = $self->entire_seq;
581 # does the called sequence make sense? Bug 1780
582 if ($called_seq->length < $loc->end) {
583 my $accession = $called_seq->accession;
585 my $length = $called_seq->length;
586 my $orig_id = $self->seq_id; # originating sequence
587 my ($locus) = $self->get_tagset_values("locus_tag");
588 $self->throw("Location end ($end) exceeds length ($length) of ".
589 "called sequence $accession.\nCheck sequence version used in ".
590 "$locus locus-tagged SeqFeature in $orig_id.");
593 if( $self->isa('Bio::Das::SegmentI') ) {
594 my ($s,$e) = ($loc->start,$loc->end);
595 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
596 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
597 # in turn returns a string. Confused?
598 $seqstr .= $called_seq->subseq($s,$e)->seq()->seq();
600 # This is dumb, subseq should work on locations...
601 if( $loc->strand == 1 ) {
602 $seqstr .= $called_seq->subseq($loc->start,$loc->end);
605 $seqstr = $called_seq->trunc($loc->start,$loc->end)->revcom->seq() . $seqstr;
607 $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq();
613 if (defined($phase)) {
614 $seqstr = substr($seqstr, $phase);
617 my $out = Bio
::Seq
->new( -id
=> $self->entire_seq->display_id
627 Usage : my $location = $seqfeature->location()
628 Function: returns a location object suitable for identifying location
629 of feature on sequence or parent feature
630 Returns : Bio::LocationI object
639 $self->throw_not_implemented();
646 Usage : $obj->primary_id($newval)
649 Returns : value of primary_id (a scalar)
650 Args : on set, new value (a scalar or undef, optional)
652 Primary ID is a synonym for the tag 'ID'
658 # note from cjm@fruitfly.org:
659 # I have commented out the following 2 lines:
661 #return $self->{'primary_id'} = shift if @_;
662 #return $self->{'primary_id'};
664 #... and replaced it with the following; see
665 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
666 # for the discussion that lead to this change
669 if ($self->has_tag('ID')) {
670 $self->remove_tag('ID');
672 $self->add_tag_value('ID', shift);
674 my ($id) = $self->get_tagset_values('ID');
678 sub generate_unique_persistent_id
{
679 # DEPRECATED - us IDHandler
681 require Bio
::SeqFeature
::Tools
::IDHandler
;
682 Bio
::SeqFeature
::Tools
::IDHandler
->new->generate_unique_persistent_id($self);
689 Usage : $obj->phase($newval)
690 Function: get/set this feature's phase.
692 Returns : undef if no phase is set,
693 otherwise 0, 1, or 2 (the only valid values for phase)
694 Args : on set, the new value
696 Most features do not have or need a defined phase.
698 For features representing a CDS, the phase indicates where the feature
699 begins with reference to the reading frame. The phase is one of the
700 integers 0, 1, or 2, indicating the number of bases that should be
701 removed from the beginning of this feature to reach the first base of
702 the next codon. In other words, a phase of "0" indicates that the next
703 codon begins at the first base of the region described by the current
704 line, a phase of "1" indicates that the next codon begins at the
705 second base of this region, and a phase of "2" indicates that the
706 codon begins at the third base of this region. This is NOT to be
707 confused with the frame, which is simply start modulo 3.
709 For forward strand features, phase is counted from the start
710 field. For reverse strand features, phase is counted from the end
718 $self->remove_tag('phase') if $self->has_tag('phase');
719 my $newphase = shift;
720 $self->throw("illegal phase value '$newphase', phase must be either undef, 0, 1, or 2")
721 unless !defined $newphase || $newphase == 0 || $newphase == 1 || $newphase == 2;
722 $self->add_tag_value('phase', $newphase );
726 return $self->has_tag('phase') ?
($self->get_tag_values('phase'))[0] : undef;
730 =head1 Bio::RangeI methods
732 These methods are inherited from RangeI and can be used
733 directly from a SeqFeatureI interface. Remember that a
734 SeqFeature is-a RangeI, and so wherever you see RangeI you
735 can use a feature ($r in the below documentation).
763 =head2 intersection()