* sync with trunk
[bioperl-live.git] / Bio / SeqFeatureI.pm
blob2d97e0bd8eed4af2578bfe3a7ffb79714b88504b
1 # $Id$
3 # BioPerl module for Bio::SeqFeatureI
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
17 =head1 SYNOPSIS
19 # get a seqfeature somehow, eg, from a Sequence with Features attached
21 foreach $feat ( $seq->get_SeqFeatures() ) {
22 print "Feature from ", $feat->start, "to ",
23 $feat->end, " Primary tag ", $feat->primary_tag,
24 ", produced by ", $feat->source_tag(), "\n";
26 if( $feat->strand == 0 ) {
27 print "Feature applicable to either strand\n";
28 } else {
29 print "Feature on strand ", $feat->strand,"\n"; # -1,1
32 print "feature location is ",$feat->start, "..",
33 $feat->end, " on strand ", $feat->strand, "\n";
34 print "easy utility to print locations in GenBank/EMBL way ",
35 $feat->location->to_FTstring(), "\n";
37 foreach $tag ( $feat->get_all_tags() ) {
38 print "Feature has tag ", $tag, " with values, ",
39 join(' ',$feat->get_tag_values($tag)), "\n";
41 print "new feature\n" if $feat->has_tag('new');
42 # features can have sub features
43 my @subfeat = $feat->get_SeqFeatures();
46 =head1 DESCRIPTION
48 This interface is the functions one can expect for any Sequence
49 Feature, whatever its implementation or whether it is a more complex
50 type (eg, a Gene). This object does not actually provide any
51 implemention, it just provides the definitions of what methods one can
52 call. See Bio::SeqFeature::Generic for a good standard implementation
53 of this object
55 =head1 FEEDBACK
57 User feedback is an integral part of the evolution of this and other
58 Bioperl modules. Send your comments and suggestions preferably to one
59 of the Bioperl mailing lists. Your participation is much appreciated.
61 bioperl-l@bioperl.org - General discussion
62 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
64 =head2 Reporting Bugs
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 the bugs and their resolution. Bug reports can be submitted via the
68 web:
70 http://bugzilla.open-bio.org/
72 =head1 APPENDIX
74 The rest of the documentation details each of the object
75 methods. Internal methods are usually preceded with a _
77 =cut
80 # Let the code begin...
83 package Bio::SeqFeatureI;
84 use vars qw($HasInMemory);
85 use strict;
86 BEGIN {
87 eval { require Bio::DB::InMemoryCache };
88 if( $@ ) { $HasInMemory = 0 }
89 else { $HasInMemory = 1 }
92 use Bio::Seq;
94 use Carp;
96 use base qw(Bio::RangeI);
98 =head1 Bio::SeqFeatureI specific methods
100 New method interfaces.
102 =cut
104 =head2 get_SeqFeatures
106 Title : get_SeqFeatures
107 Usage : @feats = $feat->get_SeqFeatures();
108 Function: Returns an array of sub Sequence Features
109 Returns : An array
110 Args : none
112 =cut
114 sub get_SeqFeatures{
115 my ($self,@args) = @_;
117 $self->throw_not_implemented();
120 =head2 display_name
122 Title : display_name
123 Usage : $name = $feat->display_name()
124 Function: Returns the human-readable name of the feature for displays.
125 Returns : a string
126 Args : none
128 =cut
130 sub display_name {
131 shift->throw_not_implemented();
134 =head2 primary_tag
136 Title : primary_tag
137 Usage : $tag = $feat->primary_tag()
138 Function: Returns the primary tag for a feature,
139 eg 'exon'
140 Returns : a string
141 Args : none
144 =cut
146 sub primary_tag{
147 my ($self,@args) = @_;
149 $self->throw_not_implemented();
153 =head2 source_tag
155 Title : source_tag
156 Usage : $tag = $feat->source_tag()
157 Function: Returns the source tag for a feature,
158 eg, 'genscan'
159 Returns : a string
160 Args : none
163 =cut
165 sub source_tag{
166 my ($self,@args) = @_;
168 $self->throw_not_implemented();
171 =head2 has_tag
173 Title : has_tag
174 Usage : $tag_exists = $self->has_tag('some_tag')
175 Function:
176 Returns : TRUE if the specified tag exists, and FALSE otherwise
177 Args :
180 =cut
182 sub has_tag{
183 my ($self,@args) = @_;
185 $self->throw_not_implemented();
189 =head2 get_tag_values
191 Title : get_tag_values
192 Usage : @values = $self->get_tag_values('some_tag')
193 Function:
194 Returns : An array comprising the values of the specified tag.
195 Args : a string
197 throws an exception if there is no such tag
199 =cut
201 sub get_tag_values {
202 shift->throw_not_implemented();
205 =head2 get_tagset_values
207 Title : get_tagset_values
208 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
209 Function:
210 Returns : An array comprising the values of the specified tags, in order of tags
211 Args : An array of strings
213 does NOT throw an exception if none of the tags are not present
215 this method is useful for getting a human-readable label for a
216 SeqFeatureI; not all tags can be assumed to be present, so a list of
217 possible tags in preferential order is provided
219 =cut
221 # interface + abstract method
222 sub get_tagset_values {
223 my ($self, @args) = @_;
224 my @vals = ();
225 foreach my $arg (@args) {
226 if ($self->has_tag($arg)) {
227 push(@vals, $self->get_tag_values($arg));
230 return @vals;
233 =head2 get_all_tags
235 Title : get_all_tags
236 Usage : @tags = $feat->get_all_tags()
237 Function: gives all tags for this feature
238 Returns : an array of strings
239 Args : none
242 =cut
244 sub get_all_tags{
245 shift->throw_not_implemented();
248 =head2 attach_seq
250 Title : attach_seq
251 Usage : $sf->attach_seq($seq)
252 Function: Attaches a Bio::Seq object to this feature. This
253 Bio::Seq object is for the *entire* sequence: ie
254 from 1 to 10000
256 Note that it is not guaranteed that if you obtain a feature from
257 an object in bioperl, it will have a sequence attached. Also,
258 implementors of this interface can choose to provide an empty
259 implementation of this method. I.e., there is also no guarantee
260 that if you do attach a sequence, seq() or entire_seq() will not
261 return undef.
263 The reason that this method is here on the interface is to enable
264 you to call it on every SeqFeatureI compliant object, and
265 that it will be implemented in a useful way and set to a useful
266 value for the great majority of use cases. Implementors who choose
267 to ignore the call are encouraged to specifically state this in
268 their documentation.
270 Example :
271 Returns : TRUE on success
272 Args : a Bio::PrimarySeqI compliant object
275 =cut
277 sub attach_seq {
278 shift->throw_not_implemented();
281 =head2 seq
283 Title : seq
284 Usage : $tseq = $sf->seq()
285 Function: returns the truncated sequence (if there is a sequence attached)
286 for this feature
287 Example :
288 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
289 bounded by start & end, or undef if there is no sequence attached
290 Args : none
293 =cut
295 sub seq {
296 shift->throw_not_implemented();
299 =head2 entire_seq
301 Title : entire_seq
302 Usage : $whole_seq = $sf->entire_seq()
303 Function: gives the entire sequence that this seqfeature is attached to
304 Example :
305 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
306 sequence attached
307 Args : none
310 =cut
312 sub entire_seq {
313 shift->throw_not_implemented();
317 =head2 seq_id
319 Title : seq_id
320 Usage : $obj->seq_id($newval)
321 Function: There are many cases when you make a feature that you
322 do know the sequence name, but do not know its actual
323 sequence. This is an attribute such that you can store
324 the ID (e.g., display_id) of the sequence.
326 This attribute should *not* be used in GFF dumping, as
327 that should come from the collection in which the seq
328 feature was found.
329 Returns : value of seq_id
330 Args : newvalue (optional)
333 =cut
335 sub seq_id {
336 shift->throw_not_implemented();
339 =head2 gff_string
341 Title : gff_string
342 Usage : $str = $feat->gff_string;
343 $str = $feat->gff_string($gff_formatter);
344 Function: Provides the feature information in GFF format.
346 The implementation provided here returns GFF2 by default. If you
347 want a different version, supply an object implementing a method
348 gff_string() accepting a SeqFeatureI object as argument. E.g., to
349 obtain GFF1 format, do the following:
351 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
352 $gff1str = $feat->gff_string($gff1io);
354 Returns : A string
355 Args : Optionally, an object implementing gff_string().
358 =cut
360 sub gff_string{
361 my ($self,$formatter) = @_;
363 $formatter = $self->_static_gff_formatter unless $formatter;
364 return $formatter->gff_string($self);
367 my $static_gff_formatter = undef;
369 =head2 _static_gff_formatter
371 Title : _static_gff_formatter
372 Usage :
373 Function:
374 Example :
375 Returns :
376 Args :
379 =cut
381 sub _static_gff_formatter{
382 my ($self,@args) = @_;
383 require Bio::Tools::GFF; # on the fly inclusion -- is this better?
384 if( !defined $static_gff_formatter ) {
385 $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2);
387 return $static_gff_formatter;
391 =head1 Decorating methods
393 These methods have an implementation provided by Bio::SeqFeatureI,
394 but can be validly overwritten by subclasses
396 =head2 spliced_seq
398 Title : spliced_seq
400 Usage : $seq = $feature->spliced_seq()
401 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
403 Function: Provides a sequence of the feature which is the most
404 semantically "relevant" feature for this sequence. A default
405 implementation is provided which for simple cases returns just
406 the sequence, but for split cases, loops over the split location
407 to return the sequence. In the case of split locations with
408 remote locations, eg
410 join(AB000123:5567-5589,80..1144)
412 in the case when a database object is passed in, it will attempt
413 to retrieve the sequence from the database object, and "Do the right thing",
414 however if no database object is provided, it will generate the correct
415 number of N's (DNA) or X's (protein, though this is unlikely).
417 This function is deliberately "magical" attempting to second guess
418 what a user wants as "the" sequence for this feature.
420 Implementing classes are free to override this method with their
421 own magic if they have a better idea what the user wants.
423 Args : [optional]
424 -db A L<Bio::DB::RandomAccessI> compliant object if
425 one needs to retrieve remote seqs.
426 -nosort boolean if the locations should not be sorted
427 by start location. This may occur, for instance,
428 in a circular sequence where a gene span starts
429 before the end of the sequence and ends after the
430 sequence start. Example : join(15685..16260,1..207)
431 (default = if sequence is_circular(), 1, otherwise 0)
432 -phase truncates the returned sequence based on the
433 intron phase (0,1,2).
435 Returns : A L<Bio::PrimarySeqI> object
437 =cut
439 sub spliced_seq {
440 my $self = shift;
441 my @args = @_;
442 my ($db, $nosort, $phase) =
443 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
445 # set no_sort based on the parent sequence status
446 if ($self->entire_seq->is_circular) {
447 $nosort = 1;
450 # (added 7/7/06 to allow use old API (with warnings)
451 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ? 1 : 0;
452 if (@args && $old_api) {
453 $self->warn(q(API has changed; please use '-db' or '-nosort' ).
454 qq(for args. See POD for more details.));
455 $db = shift @args if @args;
456 $nosort = shift @args if @args;
457 $phase = shift @args if @args;
460 if (defined($phase) && ($phase < 0 || $phase > 2)) {
461 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
462 $phase = 0;
465 if( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
466 $self->warn("Must pass in a valid Bio::DB::RandomAccessI object".
467 " for access to remote locations for spliced_seq");
468 $db = undef;
469 } elsif( defined $db && $HasInMemory &&
470 $db->isa('Bio::DB::InMemoryCache') ) {
471 $db = Bio::DB::InMemoryCache->new(-seqdb => $db);
474 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
475 if ($phase) {
476 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
477 my $seqstr = substr($self->seq->seq, $phase);
478 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
479 . "_spliced_feat",
480 -seq => $seqstr);
481 return $out;
482 } else {
483 return $self->seq(); # nice and easy!
487 # redundant test, but the above ISA is probably not ideal.
488 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
489 $self->throw("not atomic, not split, yikes, in trouble!");
492 my $seqstr = '';
493 my $seqid = $self->entire_seq->display_id;
494 # This is to deal with reverse strand features
495 # so we are really sorting features 5' -> 3' on their strand
496 # i.e. rev strand features will be sorted largest to smallest
497 # as this how revcom CDSes seem to be annotated in genbank.
498 # Might need to eventually allow this to be programable?
499 # (can I mention how much fun this is NOT! --jason)
501 my ($mixed,$mixedloc, $fstrand) = (0);
503 if( $self->isa('Bio::Das::SegmentI') &&
504 ! $self->absolute ) {
505 $self->warn("Calling spliced_seq with a Bio::Das::SegmentI which does have absolute set to 1 -- be warned you may not be getting things on the correct strand");
508 my @locset = $self->location->each_Location;
509 my @locs;
510 if( ! $nosort ) {
511 @locs = map { $_->[0] }
512 # sort so that most negative is first basically to order
513 # the features on the opposite strand 5'->3' on their strand
514 # rather than they way most are input which is on the fwd strand
516 sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
517 map {
518 $fstrand = $_->strand unless defined $fstrand;
519 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
520 if( defined $_->seq_id ) {
521 $mixedloc = 1 if( $_->seq_id ne $seqid );
523 [ $_, $_->start * ($_->strand || 1)];
524 } @locset;
526 if ( $mixed ) {
527 $self->warn("Mixed strand locations, spliced seq using the input order rather than trying to sort");
528 @locs = @locset;
530 } else {
531 # use the original order instead of trying to sort
532 @locs = @locset;
533 $fstrand = $locs[0]->strand;
536 foreach my $loc ( @locs ) {
537 if( ! $loc->isa("Bio::Location::Atomic") ) {
538 $self->throw("Can only deal with one level deep locations");
540 my $called_seq;
541 if( $fstrand != $loc->strand ) {
542 $self->warn("feature strand is different from location strand!");
544 # deal with remote sequences
546 if( defined $loc->seq_id &&
547 $loc->seq_id ne $seqid ) {
548 if( defined $db ) {
549 my $sid = $loc->seq_id;
550 $sid =~ s/\.\d+$//g;
551 eval {
552 $called_seq = $db->get_Seq_by_acc($sid);
554 if( $@ ) {
555 $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@");
556 $called_seq = undef;
558 } else {
559 $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
560 $called_seq = undef;
562 if( !defined $called_seq ) {
563 $seqstr .= 'N' x $self->length;
564 next;
566 } else {
567 $called_seq = $self->entire_seq;
570 # does the called sequence make sense? Bug 1780
571 if ($called_seq->length < $loc->end) {
572 my $accession = $called_seq->accession;
573 my $end = $loc->end;
574 my $length = $called_seq->length;
575 my $orig_id = $self->seq_id; # originating sequence
576 my ($locus) = $self->get_tagset_values("locus_tag");
577 $self->throw("Location end ($end) exceeds length ($length) of ".
578 "called sequence $accession.\nCheck sequence version used in ".
579 "$locus locus-tagged SeqFeature in $orig_id.");
582 if( $self->isa('Bio::Das::SegmentI') ) {
583 my ($s,$e) = ($loc->start,$loc->end);
584 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
585 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
586 # in turn returns a string. Confused?
587 $seqstr .= $called_seq->subseq($s,$e)->seq()->seq();
588 } else {
589 # This is dumb, subseq should work on locations...
590 if( $loc->strand == 1 ) {
591 $seqstr .= $called_seq->subseq($loc->start,$loc->end);
592 } else {
593 if( $nosort ) {
594 $seqstr = $called_seq->trunc($loc->start,$loc->end)->revcom->seq() . $seqstr;
595 } else {
596 $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq();
602 if (defined($phase)) {
603 $seqstr = substr($seqstr, $phase);
606 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
607 . "_spliced_feat",
608 -seq => $seqstr);
610 return $out;
613 =head2 location
615 Title : location
616 Usage : my $location = $seqfeature->location()
617 Function: returns a location object suitable for identifying location
618 of feature on sequence or parent feature
619 Returns : Bio::LocationI object
620 Args : none
623 =cut
625 sub location {
626 my ($self) = @_;
628 $self->throw_not_implemented();
632 =head2 primary_id
634 Title : primary_id
635 Usage : $obj->primary_id($newval)
636 Function:
637 Example :
638 Returns : value of primary_id (a scalar)
639 Args : on set, new value (a scalar or undef, optional)
641 Primary ID is a synonym for the tag 'ID'
643 =cut
645 sub primary_id{
646 my $self = shift;
647 # note from cjm@fruitfly.org:
648 # I have commented out the following 2 lines:
650 #return $self->{'primary_id'} = shift if @_;
651 #return $self->{'primary_id'};
653 #... and replaced it with the following; see
654 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
655 # for the discussion that lead to this change
657 if (@_) {
658 if ($self->has_tag('ID')) {
659 $self->remove_tag('ID');
661 $self->add_tag_value('ID', shift);
663 my ($id) = $self->get_tagset_values('ID');
664 return $id;
667 sub generate_unique_persistent_id {
668 # DEPRECATED - us IDHandler
669 my $self = shift;
670 require Bio::SeqFeature::Tools::IDHandler;
671 Bio::SeqFeature::Tools::IDHandler->new->generate_unique_persistent_id($self);
674 =head1 Bio::RangeI methods
676 These methods are inherited from RangeI and can be used
677 directly from a SeqFeatureI interface. Remember that a
678 SeqFeature is-a RangeI, and so wherever you see RangeI you
679 can use a feature ($r in the below documentation).
681 =cut
683 =head2 start()
685 See L<Bio::RangeI>
687 =head2 end()
689 See L<Bio::RangeI>
691 =head2 strand()
693 See L<Bio::RangeI>
695 =head2 overlaps()
697 See L<Bio::RangeI>
699 =head2 contains()
701 See L<Bio::RangeI>
703 =head2 equals()
705 See L<Bio::RangeI>
707 =head2 intersection()
709 See L<Bio::RangeI>
711 =head2 union()
713 See L<Bio::RangeI>
715 =head1 Bio::AnnotatableI methods
717 =cut
719 =head2 has_tag()
721 B<Deprecated>. See L<Bio::AnnotatableI>
723 =head2 remove_tag()
725 B<Deprecated>. See L<Bio::AnnotatableI>
727 =head2 add_tag_value()
729 B<Deprecated>. See L<Bio::AnnotatableI>
731 =head2 get_tag_values()
733 B<Deprecated>. See L<Bio::AnnotatableI>
735 =head2 get_tagset_values()
737 B<Deprecated>. See L<Bio::AnnotatableI>
739 =head2 get_all_tags()
741 B<Deprecated>. See L<Bio::AnnotatableI>
743 =cut