bug 2356
[bioperl-live.git] / Bio / SeqFeatureI.pm
blob0bbdede4a17cced465b9cbc257e60bc3ecc0f603
1 # $Id$
3 # BioPerl module for Bio::SeqFeatureI
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
17 =head1 SYNOPSIS
19 # get a seqfeature somehow, eg, from a Sequence with Features attached
21 foreach $feat ( $seq->get_SeqFeatures() ) {
22 print "Feature from ", $feat->start, "to ",
23 $feat->end, " Primary tag ", $feat->primary_tag,
24 ", produced by ", $feat->source_tag(), "\n";
26 if( $feat->strand == 0 ) {
27 print "Feature applicable to either strand\n";
28 } else {
29 print "Feature on strand ", $feat->strand,"\n"; # -1,1
32 print "feature location is ",$feat->start, "..",
33 $feat->end, " on strand ", $feat->strand, "\n";
34 print "easy utility to print locations in GenBank/EMBL way ",
35 $feat->location->to_FTstring(), "\n";
37 foreach $tag ( $feat->get_all_tags() ) {
38 print "Feature has tag ", $tag, " with values, ",
39 join(' ',$feat->get_tag_values($tag)), "\n";
41 print "new feature\n" if $feat->has_tag('new');
42 # features can have sub features
43 my @subfeat = $feat->get_SeqFeatures();
46 =head1 DESCRIPTION
48 This interface is the functions one can expect for any Sequence
49 Feature, whatever its implementation or whether it is a more complex
50 type (eg, a Gene). This object does not actually provide any
51 implemention, it just provides the definitions of what methods one can
52 call. See Bio::SeqFeature::Generic for a good standard implementation
53 of this object
55 =head1 FEEDBACK
57 User feedback is an integral part of the evolution of this and other
58 Bioperl modules. Send your comments and suggestions preferably to one
59 of the Bioperl mailing lists. Your participation is much appreciated.
61 bioperl-l@bioperl.org - General discussion
62 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
64 =head2 Reporting Bugs
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 the bugs and their resolution. Bug reports can be submitted via the
68 web:
70 http://bugzilla.open-bio.org/
72 =head1 APPENDIX
74 The rest of the documentation details each of the object
75 methods. Internal methods are usually preceded with a _
77 =cut
80 # Let the code begin...
83 package Bio::SeqFeatureI;
84 use vars qw($HasInMemory);
85 use strict;
86 BEGIN {
87 eval { require Bio::DB::InMemoryCache };
88 if( $@ ) { $HasInMemory = 0 }
89 else { $HasInMemory = 1 }
92 use Bio::Seq;
94 use Carp;
96 use base qw(Bio::RangeI);
98 =head1 Bio::SeqFeatureI specific methods
100 New method interfaces.
102 =cut
104 =head2 get_SeqFeatures
106 Title : get_SeqFeatures
107 Usage : @feats = $feat->get_SeqFeatures();
108 Function: Returns an array of sub Sequence Features
109 Returns : An array
110 Args : none
112 =cut
114 sub get_SeqFeatures{
115 my ($self,@args) = @_;
117 $self->throw_not_implemented();
120 =head2 display_name
122 Title : display_name
123 Usage : $name = $feat->display_name()
124 Function: Returns the human-readable name of the feature for displays.
125 Returns : a string
126 Args : none
128 =cut
130 sub display_name {
131 shift->throw_not_implemented();
134 =head2 primary_tag
136 Title : primary_tag
137 Usage : $tag = $feat->primary_tag()
138 Function: Returns the primary tag for a feature,
139 eg 'exon'
140 Returns : a string
141 Args : none
144 =cut
146 sub primary_tag{
147 my ($self,@args) = @_;
149 $self->throw_not_implemented();
153 =head2 source_tag
155 Title : source_tag
156 Usage : $tag = $feat->source_tag()
157 Function: Returns the source tag for a feature,
158 eg, 'genscan'
159 Returns : a string
160 Args : none
163 =cut
165 sub source_tag{
166 my ($self,@args) = @_;
168 $self->throw_not_implemented();
171 =head2 has_tag
173 Title : has_tag
174 Usage : $tag_exists = $self->has_tag('some_tag')
175 Function:
176 Returns : TRUE if the specified tag exists, and FALSE otherwise
177 Args :
180 =cut
182 sub has_tag{
183 my ($self,@args) = @_;
185 $self->throw_not_implemented();
189 =head2 get_tag_values
191 Title : get_tag_values
192 Usage : @values = $self->get_tag_values('some_tag')
193 Function:
194 Returns : An array comprising the values of the specified tag.
195 Args : a string
197 throws an exception if there is no such tag
199 =cut
201 sub get_tag_values {
202 shift->throw_not_implemented();
205 =head2 get_tagset_values
207 Title : get_tagset_values
208 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
209 Function:
210 Returns : An array comprising the values of the specified tags, in order of tags
211 Args : An array of strings
213 does NOT throw an exception if none of the tags are not present
215 this method is useful for getting a human-readable label for a
216 SeqFeatureI; not all tags can be assumed to be present, so a list of
217 possible tags in preferential order is provided
219 =cut
221 # interface + abstract method
222 sub get_tagset_values {
223 my ($self, @args) = @_;
224 my @vals = ();
225 foreach my $arg (@args) {
226 if ($self->has_tag($arg)) {
227 push(@vals, $self->get_tag_values($arg));
230 return @vals;
233 =head2 get_all_tags
235 Title : get_all_tags
236 Usage : @tags = $feat->get_all_tags()
237 Function: gives all tags for this feature
238 Returns : an array of strings
239 Args : none
242 =cut
244 sub get_all_tags{
245 shift->throw_not_implemented();
248 =head2 attach_seq
250 Title : attach_seq
251 Usage : $sf->attach_seq($seq)
252 Function: Attaches a Bio::Seq object to this feature. This
253 Bio::Seq object is for the *entire* sequence: ie
254 from 1 to 10000
256 Note that it is not guaranteed that if you obtain a feature from
257 an object in bioperl, it will have a sequence attached. Also,
258 implementors of this interface can choose to provide an empty
259 implementation of this method. I.e., there is also no guarantee
260 that if you do attach a sequence, seq() or entire_seq() will not
261 return undef.
263 The reason that this method is here on the interface is to enable
264 you to call it on every SeqFeatureI compliant object, and
265 that it will be implemented in a useful way and set to a useful
266 value for the great majority of use cases. Implementors who choose
267 to ignore the call are encouraged to specifically state this in
268 their documentation.
270 Example :
271 Returns : TRUE on success
272 Args : a Bio::PrimarySeqI compliant object
275 =cut
277 sub attach_seq {
278 shift->throw_not_implemented();
281 =head2 seq
283 Title : seq
284 Usage : $tseq = $sf->seq()
285 Function: returns the truncated sequence (if there is a sequence attached)
286 for this feature
287 Example :
288 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
289 bounded by start & end, or undef if there is no sequence attached
290 Args : none
293 =cut
295 sub seq {
296 shift->throw_not_implemented();
299 =head2 entire_seq
301 Title : entire_seq
302 Usage : $whole_seq = $sf->entire_seq()
303 Function: gives the entire sequence that this seqfeature is attached to
304 Example :
305 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
306 sequence attached
307 Args : none
310 =cut
312 sub entire_seq {
313 shift->throw_not_implemented();
317 =head2 seq_id
319 Title : seq_id
320 Usage : $obj->seq_id($newval)
321 Function: There are many cases when you make a feature that you
322 do know the sequence name, but do not know its actual
323 sequence. This is an attribute such that you can store
324 the ID (e.g., display_id) of the sequence.
326 This attribute should *not* be used in GFF dumping, as
327 that should come from the collection in which the seq
328 feature was found.
329 Returns : value of seq_id
330 Args : newvalue (optional)
333 =cut
335 sub seq_id {
336 shift->throw_not_implemented();
339 =head2 gff_string
341 Title : gff_string
342 Usage : $str = $feat->gff_string;
343 $str = $feat->gff_string($gff_formatter);
344 Function: Provides the feature information in GFF format.
346 The implementation provided here returns GFF2 by default. If you
347 want a different version, supply an object implementing a method
348 gff_string() accepting a SeqFeatureI object as argument. E.g., to
349 obtain GFF1 format, do the following:
351 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
352 $gff1str = $feat->gff_string($gff1io);
354 Returns : A string
355 Args : Optionally, an object implementing gff_string().
358 =cut
360 sub gff_string{
361 my ($self,$formatter) = @_;
363 $formatter = $self->_static_gff_formatter unless $formatter;
364 return $formatter->gff_string($self);
367 my $static_gff_formatter = undef;
369 =head2 _static_gff_formatter
371 Title : _static_gff_formatter
372 Usage :
373 Function:
374 Example :
375 Returns :
376 Args :
379 =cut
381 sub _static_gff_formatter{
382 my ($self,@args) = @_;
383 require Bio::Tools::GFF; # on the fly inclusion -- is this better?
384 if( !defined $static_gff_formatter ) {
385 $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2);
387 return $static_gff_formatter;
391 =head1 Decorating methods
393 These methods have an implementation provided by Bio::SeqFeatureI,
394 but can be validly overwritten by subclasses
396 =head2 spliced_seq
398 Title : spliced_seq
400 Usage : $seq = $feature->spliced_seq()
401 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
403 Function: Provides a sequence of the feature which is the most
404 semantically "relevant" feature for this sequence. A default
405 implementation is provided which for simple cases returns just
406 the sequence, but for split cases, loops over the split location
407 to return the sequence. In the case of split locations with
408 remote locations, eg
410 join(AB000123:5567-5589,80..1144)
412 in the case when a database object is passed in, it will attempt
413 to retrieve the sequence from the database object, and "Do the right thing",
414 however if no database object is provided, it will generate the correct
415 number of N's (DNA) or X's (protein, though this is unlikely).
417 This function is deliberately "magical" attempting to second guess
418 what a user wants as "the" sequence for this feature.
420 Implementing classes are free to override this method with their
421 own magic if they have a better idea what the user wants.
423 Args : [optional]
424 -db A L<Bio::DB::RandomAccessI> compliant object if
425 one needs to retrieve remote seqs.
426 -nosort boolean if the locations should not be sorted
427 by start location. This may occur, for instance,
428 in a circular sequence where a gene span starts
429 before the end of the sequence and ends after the
430 sequence start. Example : join(15685..16260,1..207)
431 -phase truncates the returned sequence based on the
432 intron phase (0,1,2).
434 Returns : A L<Bio::PrimarySeqI> object
436 =cut
438 sub spliced_seq {
439 my $self = shift;
440 my @args = @_;
441 my ($db, $nosort, $phase) =
442 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
444 # (added 7/7/06 to allow use old API (with warnings)
445 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ? 1 : 0;
446 if (@args && $old_api) {
447 $self->warn(q(API has changed; please use '-db' or '-nosort' ).
448 qq(for args. See POD for more details.));
449 $db = shift @args if @args;
450 $nosort = shift @args if @args;
451 $phase = shift @args if @args;
454 if (defined($phase) && ($phase < 0 || $phase > 2)) {
455 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
456 $phase = 0;
459 if( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
460 $self->warn("Must pass in a valid Bio::DB::RandomAccessI object".
461 " for access to remote locations for spliced_seq");
462 $db = undef;
463 } elsif( defined $db && $HasInMemory &&
464 $db->isa('Bio::DB::InMemoryCache') ) {
465 $db = Bio::DB::InMemoryCache->new(-seqdb => $db);
468 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
469 if ($phase) {
470 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
471 my $seqstr = substr($self->seq->seq, $phase);
472 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
473 . "_spliced_feat",
474 -seq => $seqstr);
475 return $out;
476 } else {
477 return $self->seq(); # nice and easy!
481 # redundant test, but the above ISA is probably not ideal.
482 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
483 $self->throw("not atomic, not split, yikes, in trouble!");
486 my $seqstr = '';
487 my $seqid = $self->entire_seq->display_id;
488 # This is to deal with reverse strand features
489 # so we are really sorting features 5' -> 3' on their strand
490 # i.e. rev strand features will be sorted largest to smallest
491 # as this how revcom CDSes seem to be annotated in genbank.
492 # Might need to eventually allow this to be programable?
493 # (can I mention how much fun this is NOT! --jason)
495 my ($mixed,$mixedloc, $fstrand) = (0);
497 if( $self->isa('Bio::Das::SegmentI') &&
498 ! $self->absolute ) {
499 $self->warn("Calling spliced_seq with a Bio::Das::SegmentI which does have absolute set to 1 -- be warned you may not be getting things on the correct strand");
502 my @locset = $self->location->each_Location;
503 my @locs;
504 if( ! $nosort ) {
505 @locs = map { $_->[0] }
506 # sort so that most negative is first basically to order
507 # the features on the opposite strand 5'->3' on their strand
508 # rather than they way most are input which is on the fwd strand
510 sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
511 map {
512 $fstrand = $_->strand unless defined $fstrand;
513 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
514 if( defined $_->seq_id ) {
515 $mixedloc = 1 if( $_->seq_id ne $seqid );
517 [ $_, $_->start * ($_->strand || 1)];
518 } @locset;
520 if ( $mixed ) {
521 $self->warn("Mixed strand locations, spliced seq using the input order rather than trying to sort");
522 @locs = @locset;
524 } else {
525 # use the original order instead of trying to sort
526 @locs = @locset;
527 $fstrand = $locs[0]->strand;
530 foreach my $loc ( @locs ) {
531 if( ! $loc->isa("Bio::Location::Atomic") ) {
532 $self->throw("Can only deal with one level deep locations");
534 my $called_seq;
535 if( $fstrand != $loc->strand ) {
536 $self->warn("feature strand is different from location strand!");
538 # deal with remote sequences
540 if( defined $loc->seq_id &&
541 $loc->seq_id ne $seqid ) {
542 if( defined $db ) {
543 my $sid = $loc->seq_id;
544 $sid =~ s/\.\d+$//g;
545 eval {
546 $called_seq = $db->get_Seq_by_acc($sid);
548 if( $@ ) {
549 $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@");
550 $called_seq = undef;
552 } else {
553 $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
554 $called_seq = undef;
556 if( !defined $called_seq ) {
557 $seqstr .= 'N' x $self->length;
558 next;
560 } else {
561 $called_seq = $self->entire_seq;
564 # does the called sequence make sense? Bug 1780
565 if ($called_seq->length < $loc->end) {
566 my $accession = $called_seq->accession;
567 my $end = $loc->end;
568 my $length = $called_seq->length;
569 my $orig_id = $self->seq_id; # originating sequence
570 my ($locus) = $self->get_tagset_values("locus_tag");
571 $self->throw("Location end ($end) exceeds length ($length) of ".
572 "called sequence $accession.\nCheck sequence version used in ".
573 "$locus locus-tagged SeqFeature in $orig_id.");
576 if( $self->isa('Bio::Das::SegmentI') ) {
577 my ($s,$e) = ($loc->start,$loc->end);
578 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
579 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
580 # in turn returns a string. Confused?
581 $seqstr .= $called_seq->subseq($s,$e)->seq()->seq();
582 } else {
583 # This is dumb, subseq should work on locations...
584 if( $loc->strand == 1 ) {
585 $seqstr .= $called_seq->subseq($loc->start,$loc->end);
586 } else {
587 if( $nosort ) {
588 $seqstr = $called_seq->trunc($loc->start,$loc->end)->revcom->seq() . $seqstr;
589 } else {
590 $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq();
596 if (defined($phase)) {
597 $seqstr = substr($seqstr, $phase);
600 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
601 . "_spliced_feat",
602 -seq => $seqstr);
604 return $out;
607 =head2 location
609 Title : location
610 Usage : my $location = $seqfeature->location()
611 Function: returns a location object suitable for identifying location
612 of feature on sequence or parent feature
613 Returns : Bio::LocationI object
614 Args : none
617 =cut
619 sub location {
620 my ($self) = @_;
622 $self->throw_not_implemented();
626 =head2 primary_id
628 Title : primary_id
629 Usage : $obj->primary_id($newval)
630 Function:
631 Example :
632 Returns : value of primary_id (a scalar)
633 Args : on set, new value (a scalar or undef, optional)
635 Primary ID is a synonym for the tag 'ID'
637 =cut
639 sub primary_id{
640 my $self = shift;
641 # note from cjm@fruitfly.org:
642 # I have commented out the following 2 lines:
644 #return $self->{'primary_id'} = shift if @_;
645 #return $self->{'primary_id'};
647 #... and replaced it with the following; see
648 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
649 # for the discussion that lead to this change
651 if (@_) {
652 if ($self->has_tag('ID')) {
653 $self->remove_tag('ID');
655 $self->add_tag_value('ID', shift);
657 my ($id) = $self->get_tagset_values('ID');
658 return $id;
661 sub generate_unique_persistent_id {
662 # DEPRECATED - us IDHandler
663 my $self = shift;
664 require Bio::SeqFeature::Tools::IDHandler;
665 Bio::SeqFeature::Tools::IDHandler->new->generate_unique_persistent_id($self);
668 =head1 Bio::RangeI methods
670 These methods are inherited from RangeI and can be used
671 directly from a SeqFeatureI interface. Remember that a
672 SeqFeature is-a RangeI, and so wherever you see RangeI you
673 can use a feature ($r in the below documentation).
675 =cut
677 =head2 start()
679 See L<Bio::RangeI>
681 =head2 end()
683 See L<Bio::RangeI>
685 =head2 strand()
687 See L<Bio::RangeI>
689 =head2 overlaps()
691 See L<Bio::RangeI>
693 =head2 contains()
695 See L<Bio::RangeI>
697 =head2 equals()
699 See L<Bio::RangeI>
701 =head2 intersection()
703 See L<Bio::RangeI>
705 =head2 union()
707 See L<Bio::RangeI>
709 =head1 Bio::AnnotatableI methods
711 =cut
713 =head2 has_tag()
715 B<Deprecated>. See L<Bio::AnnotatableI>
717 =head2 remove_tag()
719 B<Deprecated>. See L<Bio::AnnotatableI>
721 =head2 add_tag_value()
723 B<Deprecated>. See L<Bio::AnnotatableI>
725 =head2 get_tag_values()
727 B<Deprecated>. See L<Bio::AnnotatableI>
729 =head2 get_tagset_values()
731 B<Deprecated>. See L<Bio::AnnotatableI>
733 =head2 get_all_tags()
735 B<Deprecated>. See L<Bio::AnnotatableI>
737 =cut