3 # BioPerl module for Bio::SeqFeatureI
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
19 # get a seqfeature somehow, eg, from a Sequence with Features attached
21 foreach $feat ( $seq->get_SeqFeatures() ) {
22 print "Feature from ", $feat->start, "to ",
23 $feat->end, " Primary tag ", $feat->primary_tag,
24 ", produced by ", $feat->source_tag(), "\n";
26 if( $feat->strand == 0 ) {
27 print "Feature applicable to either strand\n";
29 print "Feature on strand ", $feat->strand,"\n"; # -1,1
32 print "feature location is ",$feat->start, "..",
33 $feat->end, " on strand ", $feat->strand, "\n";
34 print "easy utility to print locations in GenBank/EMBL way ",
35 $feat->location->to_FTstring(), "\n";
37 foreach $tag ( $feat->get_all_tags() ) {
38 print "Feature has tag ", $tag, " with values, ",
39 join(' ',$feat->get_tag_values($tag)), "\n";
41 print "new feature\n" if $feat->has_tag('new');
42 # features can have sub features
43 my @subfeat = $feat->get_SeqFeatures();
48 This interface is the functions one can expect for any Sequence
49 Feature, whatever its implementation or whether it is a more complex
50 type (eg, a Gene). This object does not actually provide any
51 implemention, it just provides the definitions of what methods one can
52 call. See Bio::SeqFeature::Generic for a good standard implementation
57 User feedback is an integral part of the evolution of this and other
58 Bioperl modules. Send your comments and suggestions preferably to one
59 of the Bioperl mailing lists. Your participation is much appreciated.
61 bioperl-l@bioperl.org - General discussion
62 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 the bugs and their resolution. Bug reports can be submitted via the
70 http://bugzilla.open-bio.org/
74 The rest of the documentation details each of the object
75 methods. Internal methods are usually preceded with a _
80 # Let the code begin...
83 package Bio
::SeqFeatureI
;
84 use vars
qw($HasInMemory);
87 eval { require Bio::DB::InMemoryCache };
88 if( $@ ) { $HasInMemory = 0 }
89 else { $HasInMemory = 1 }
96 use base qw(Bio::RangeI);
98 =head1 Bio::SeqFeatureI specific methods
100 New method interfaces.
104 =head2 get_SeqFeatures
106 Title : get_SeqFeatures
107 Usage : @feats = $feat->get_SeqFeatures();
108 Function: Returns an array of sub Sequence Features
115 my ($self,@args) = @_;
117 $self->throw_not_implemented();
123 Usage : $name = $feat->display_name()
124 Function: Returns the human-readable name of the feature for displays.
131 shift->throw_not_implemented();
137 Usage : $tag = $feat->primary_tag()
138 Function: Returns the primary tag for a feature,
147 my ($self,@args) = @_;
149 $self->throw_not_implemented();
156 Usage : $tag = $feat->source_tag()
157 Function: Returns the source tag for a feature,
166 my ($self,@args) = @_;
168 $self->throw_not_implemented();
174 Usage : $tag_exists = $self->has_tag('some_tag')
176 Returns : TRUE if the specified tag exists, and FALSE otherwise
183 my ($self,@args) = @_;
185 $self->throw_not_implemented();
189 =head2 get_tag_values
191 Title : get_tag_values
192 Usage : @values = $self->get_tag_values('some_tag')
194 Returns : An array comprising the values of the specified tag.
197 throws an exception if there is no such tag
202 shift->throw_not_implemented();
205 =head2 get_tagset_values
207 Title : get_tagset_values
208 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
210 Returns : An array comprising the values of the specified tags, in order of tags
211 Args : An array of strings
213 does NOT throw an exception if none of the tags are not present
215 this method is useful for getting a human-readable label for a
216 SeqFeatureI; not all tags can be assumed to be present, so a list of
217 possible tags in preferential order is provided
221 # interface + abstract method
222 sub get_tagset_values
{
223 my ($self, @args) = @_;
225 foreach my $arg (@args) {
226 if ($self->has_tag($arg)) {
227 push(@vals, $self->get_tag_values($arg));
236 Usage : @tags = $feat->get_all_tags()
237 Function: gives all tags for this feature
238 Returns : an array of strings
245 shift->throw_not_implemented();
251 Usage : $sf->attach_seq($seq)
252 Function: Attaches a Bio::Seq object to this feature. This
253 Bio::Seq object is for the *entire* sequence: ie
256 Note that it is not guaranteed that if you obtain a feature from
257 an object in bioperl, it will have a sequence attached. Also,
258 implementors of this interface can choose to provide an empty
259 implementation of this method. I.e., there is also no guarantee
260 that if you do attach a sequence, seq() or entire_seq() will not
263 The reason that this method is here on the interface is to enable
264 you to call it on every SeqFeatureI compliant object, and
265 that it will be implemented in a useful way and set to a useful
266 value for the great majority of use cases. Implementors who choose
267 to ignore the call are encouraged to specifically state this in
271 Returns : TRUE on success
272 Args : a Bio::PrimarySeqI compliant object
278 shift->throw_not_implemented();
284 Usage : $tseq = $sf->seq()
285 Function: returns the truncated sequence (if there is a sequence attached)
288 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
289 bounded by start & end, or undef if there is no sequence attached
296 shift->throw_not_implemented();
302 Usage : $whole_seq = $sf->entire_seq()
303 Function: gives the entire sequence that this seqfeature is attached to
305 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
313 shift->throw_not_implemented();
320 Usage : $obj->seq_id($newval)
321 Function: There are many cases when you make a feature that you
322 do know the sequence name, but do not know its actual
323 sequence. This is an attribute such that you can store
324 the ID (e.g., display_id) of the sequence.
326 This attribute should *not* be used in GFF dumping, as
327 that should come from the collection in which the seq
329 Returns : value of seq_id
330 Args : newvalue (optional)
336 shift->throw_not_implemented();
342 Usage : $str = $feat->gff_string;
343 $str = $feat->gff_string($gff_formatter);
344 Function: Provides the feature information in GFF format.
346 The implementation provided here returns GFF2 by default. If you
347 want a different version, supply an object implementing a method
348 gff_string() accepting a SeqFeatureI object as argument. E.g., to
349 obtain GFF1 format, do the following:
351 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
352 $gff1str = $feat->gff_string($gff1io);
355 Args : Optionally, an object implementing gff_string().
361 my ($self,$formatter) = @_;
363 $formatter = $self->_static_gff_formatter unless $formatter;
364 return $formatter->gff_string($self);
367 my $static_gff_formatter = undef;
369 =head2 _static_gff_formatter
371 Title : _static_gff_formatter
381 sub _static_gff_formatter
{
382 my ($self,@args) = @_;
383 require Bio
::Tools
::GFF
; # on the fly inclusion -- is this better?
384 if( !defined $static_gff_formatter ) {
385 $static_gff_formatter = Bio
::Tools
::GFF
->new('-gff_version' => 2);
387 return $static_gff_formatter;
391 =head1 Decorating methods
393 These methods have an implementation provided by Bio::SeqFeatureI,
394 but can be validly overwritten by subclasses
400 Usage : $seq = $feature->spliced_seq()
401 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
403 Function: Provides a sequence of the feature which is the most
404 semantically "relevant" feature for this sequence. A default
405 implementation is provided which for simple cases returns just
406 the sequence, but for split cases, loops over the split location
407 to return the sequence. In the case of split locations with
410 join(AB000123:5567-5589,80..1144)
412 in the case when a database object is passed in, it will attempt
413 to retrieve the sequence from the database object, and "Do the right thing",
414 however if no database object is provided, it will generate the correct
415 number of N's (DNA) or X's (protein, though this is unlikely).
417 This function is deliberately "magical" attempting to second guess
418 what a user wants as "the" sequence for this feature.
420 Implementing classes are free to override this method with their
421 own magic if they have a better idea what the user wants.
424 -db A L<Bio::DB::RandomAccessI> compliant object if
425 one needs to retrieve remote seqs.
426 -nosort boolean if the locations should not be sorted
427 by start location. This may occur, for instance,
428 in a circular sequence where a gene span starts
429 before the end of the sequence and ends after the
430 sequence start. Example : join(15685..16260,1..207)
431 -phase truncates the returned sequence based on the
432 intron phase (0,1,2).
434 Returns : A L<Bio::PrimarySeqI> object
441 my ($db, $nosort, $phase) =
442 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
444 # (added 7/7/06 to allow use old API (with warnings)
445 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ?
1 : 0;
446 if (@args && $old_api) {
447 $self->warn(q
(API has changed
; please
use '-db' or '-nosort' ).
448 qq(for args
. See POD
for more details
.));
449 $db = shift @args if @args;
450 $nosort = shift @args if @args;
451 $phase = shift @args if @args;
454 if (defined($phase) && ($phase < 0 || $phase > 2)) {
455 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
459 if( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
460 $self->warn("Must pass in a valid Bio::DB::RandomAccessI object".
461 " for access to remote locations for spliced_seq");
463 } elsif( defined $db && $HasInMemory &&
464 $db->isa('Bio::DB::InMemoryCache') ) {
465 $db = Bio
::DB
::InMemoryCache
->new(-seqdb
=> $db);
468 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
470 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
471 my $seqstr = substr($self->seq->seq, $phase);
472 my $out = Bio
::Seq
->new( -id
=> $self->entire_seq->display_id
477 return $self->seq(); # nice and easy!
481 # redundant test, but the above ISA is probably not ideal.
482 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
483 $self->throw("not atomic, not split, yikes, in trouble!");
487 my $seqid = $self->entire_seq->display_id;
488 # This is to deal with reverse strand features
489 # so we are really sorting features 5' -> 3' on their strand
490 # i.e. rev strand features will be sorted largest to smallest
491 # as this how revcom CDSes seem to be annotated in genbank.
492 # Might need to eventually allow this to be programable?
493 # (can I mention how much fun this is NOT! --jason)
495 my ($mixed,$mixedloc, $fstrand) = (0);
497 if( $self->isa('Bio::Das::SegmentI') &&
498 ! $self->absolute ) {
499 $self->warn("Calling spliced_seq with a Bio::Das::SegmentI which does have absolute set to 1 -- be warned you may not be getting things on the correct strand");
502 my @locset = $self->location->each_Location;
505 @locs = map { $_->[0] }
506 # sort so that most negative is first basically to order
507 # the features on the opposite strand 5'->3' on their strand
508 # rather than they way most are input which is on the fwd strand
510 sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
512 $fstrand = $_->strand unless defined $fstrand;
513 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
514 if( defined $_->seq_id ) {
515 $mixedloc = 1 if( $_->seq_id ne $seqid );
517 [ $_, $_->start * ($_->strand || 1)];
521 $self->warn("Mixed strand locations, spliced seq using the input order rather than trying to sort");
525 # use the original order instead of trying to sort
527 $fstrand = $locs[0]->strand;
530 foreach my $loc ( @locs ) {
531 if( ! $loc->isa("Bio::Location::Atomic") ) {
532 $self->throw("Can only deal with one level deep locations");
535 if( $fstrand != $loc->strand ) {
536 $self->warn("feature strand is different from location strand!");
538 # deal with remote sequences
540 if( defined $loc->seq_id &&
541 $loc->seq_id ne $seqid ) {
543 my $sid = $loc->seq_id;
546 $called_seq = $db->get_Seq_by_acc($sid);
549 $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@");
553 $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
556 if( !defined $called_seq ) {
557 $seqstr .= 'N' x
$self->length;
561 $called_seq = $self->entire_seq;
564 # does the called sequence make sense? Bug 1780
565 if ($called_seq->length < $loc->end) {
566 my $accession = $called_seq->accession;
568 my $length = $called_seq->length;
569 my $orig_id = $self->seq_id; # originating sequence
570 my ($locus) = $self->get_tagset_values("locus_tag");
571 $self->throw("Location end ($end) exceeds length ($length) of ".
572 "called sequence $accession.\nCheck sequence version used in ".
573 "$locus locus-tagged SeqFeature in $orig_id.");
576 if( $self->isa('Bio::Das::SegmentI') ) {
577 my ($s,$e) = ($loc->start,$loc->end);
578 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
579 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
580 # in turn returns a string. Confused?
581 $seqstr .= $called_seq->subseq($s,$e)->seq()->seq();
583 # This is dumb, subseq should work on locations...
584 if( $loc->strand == 1 ) {
585 $seqstr .= $called_seq->subseq($loc->start,$loc->end);
588 $seqstr = $called_seq->trunc($loc->start,$loc->end)->revcom->seq() . $seqstr;
590 $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq();
596 if (defined($phase)) {
597 $seqstr = substr($seqstr, $phase);
600 my $out = Bio
::Seq
->new( -id
=> $self->entire_seq->display_id
610 Usage : my $location = $seqfeature->location()
611 Function: returns a location object suitable for identifying location
612 of feature on sequence or parent feature
613 Returns : Bio::LocationI object
622 $self->throw_not_implemented();
629 Usage : $obj->primary_id($newval)
632 Returns : value of primary_id (a scalar)
633 Args : on set, new value (a scalar or undef, optional)
635 Primary ID is a synonym for the tag 'ID'
641 # note from cjm@fruitfly.org:
642 # I have commented out the following 2 lines:
644 #return $self->{'primary_id'} = shift if @_;
645 #return $self->{'primary_id'};
647 #... and replaced it with the following; see
648 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
649 # for the discussion that lead to this change
652 if ($self->has_tag('ID')) {
653 $self->remove_tag('ID');
655 $self->add_tag_value('ID', shift);
657 my ($id) = $self->get_tagset_values('ID');
661 sub generate_unique_persistent_id
{
662 # DEPRECATED - us IDHandler
664 require Bio
::SeqFeature
::Tools
::IDHandler
;
665 Bio
::SeqFeature
::Tools
::IDHandler
->new->generate_unique_persistent_id($self);
668 =head1 Bio::RangeI methods
670 These methods are inherited from RangeI and can be used
671 directly from a SeqFeatureI interface. Remember that a
672 SeqFeature is-a RangeI, and so wherever you see RangeI you
673 can use a feature ($r in the below documentation).
701 =head2 intersection()
709 =head1 Bio::AnnotatableI methods
715 B<Deprecated>. See L<Bio::AnnotatableI>
719 B<Deprecated>. See L<Bio::AnnotatableI>
721 =head2 add_tag_value()
723 B<Deprecated>. See L<Bio::AnnotatableI>
725 =head2 get_tag_values()
727 B<Deprecated>. See L<Bio::AnnotatableI>
729 =head2 get_tagset_values()
731 B<Deprecated>. See L<Bio::AnnotatableI>
733 =head2 get_all_tags()
735 B<Deprecated>. See L<Bio::AnnotatableI>