tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / SeqFeature / Gene / Transcript.pm
blobdd42b52c42ba1c7363c9ebe846eb7cddbf8eb4cd
1 # $Id$
3 # BioPerl module for Bio::SeqFeature::Gene::Transcript
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Hilmar Lapp <hlapp@gmx.net>
9 # Copyright Hilmar Lapp
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SeqFeature::Gene::Transcript - A feature representing a transcript
19 =head1 SYNOPSIS
21 # See documentation of methods.
23 =head1 DESCRIPTION
25 A feature representing a transcript.
27 =head1 FEEDBACK
29 =head2 Mailing Lists
31 User feedback is an integral part of the evolution of this and other
32 Bioperl modules. Send your comments and suggestions preferably to one
33 of the Bioperl mailing lists. Your participation is much appreciated.
35 bioperl-l@bioperl.org - General discussion
36 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
38 =head2 Support
40 Please direct usage questions or support issues to the mailing list:
42 I<bioperl-l@bioperl.org>
44 rather than to the module maintainer directly. Many experienced and
45 reponsive experts will be able look at the problem and quickly
46 address it. Please include a thorough description of the problem
47 with code and data examples if at all possible.
49 =head2 Reporting Bugs
51 Report bugs to the Bioperl bug tracking system to help us keep track
52 the bugs and their resolution. Bug reports can be submitted via the
53 web:
55 http://bugzilla.open-bio.org/
57 =head1 AUTHOR - Hilmar Lapp
59 Email hlapp@gmx.net
61 =head1 APPENDIX
63 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
65 =cut
68 # Let the code begin...
70 package Bio::SeqFeature::Gene::Transcript;
71 use strict;
74 use Bio::PrimarySeq;
76 use base qw(Bio::SeqFeature::Generic Bio::SeqFeature::Gene::TranscriptI);
78 sub new {
79 my ($caller, @args) = @_;
80 my $self = $caller->SUPER::new(@args);
81 $self->_register_for_cleanup(\&transcript_destroy);
82 my ($primary) = $self->_rearrange([qw(PRIMARY)],@args);
84 $primary = 'transcript' unless $primary;
85 $self->primary_tag($primary);
86 $self->strand(0) if(! defined($self->strand()));
87 return $self;
91 =head2 promoters
93 Title : promoters()
94 Usage : @proms = $transcript->promoters();
95 Function: Get the promoter features/sites of this transcript.
97 Note that OO-modeling of regulatory elements is not stable yet.
98 This means that this method might change or even disappear in a
99 future release. Be aware of this if you use it.
101 Returns : An array of Bio::SeqFeatureI implementing objects representing the
102 promoter regions or sites.
103 Args :
106 =cut
108 sub promoters {
109 my ($self) = @_;
110 return $self->get_feature_type('Bio::SeqFeature::Gene::Promoter');
113 =head2 add_promoter
115 Title : add_promoter()
116 Usage : $transcript->add_promoter($feature);
117 Function: Add a promoter feature/site to this transcript.
120 Note that OO-modeling of regulatory elements is not stable yet.
121 This means that this method might change or even disappear in a
122 future release. Be aware of this if you use it.
124 Returns :
125 Args : A Bio::SeqFeatureI implementing object.
128 =cut
130 sub add_promoter {
131 my ($self, $fea) = @_;
132 $self->_add($fea,'Bio::SeqFeature::Gene::Promoter');
135 =head2 flush_promoters
137 Title : flush_promoters()
138 Usage : $transcript->flush_promoters();
139 Function: Remove all promoter features/sites from this transcript.
141 Note that OO-modeling of regulatory elements is not stable yet.
142 This means that this method might change or even disappear in a
143 future release. Be aware of this if you use it.
145 Returns : the removed features as a list
146 Args : none
149 =cut
151 sub flush_promoters {
152 my ($self) = @_;
153 return $self->_flush('Bio::SeqFeature::Gene::Promoter');
156 =head2 exons
158 Title : exons()
159 Usage : @exons = $gene->exons();
160 ($inital_exon) = $gene->exons('Initial');
161 Function: Get all exon features or all exons of specified type of this
162 transcript.
164 Exon type is treated as a case-insensitive regular expression and
165 is optional. For consistency, use only the following types:
166 initial, internal, terminal.
168 Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects.
169 Args : An optional string specifying the primary_tag of the feature.
172 =cut
174 sub exons {
175 my ($self, $type) = @_;
176 return $self->get_unordered_feature_type('Bio::SeqFeature::Gene::ExonI',
177 $type);
180 =head2 exons_ordered
182 Title : exons_ordered
183 Usage : @exons = $gene->exons_ordered();
184 @exons = $gene->exons_ordered("Internal");
185 Function: Get an ordered list of all exon features or all exons of specified
186 type of this transcript.
188 Exon type is treated as a case-insensitive regular expression and
189 is optional. For consistency, use only the following types:
191 Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects.
192 Args : An optional string specifying the primary_tag of the feature.
194 =cut
196 sub exons_ordered {
197 my ($self,$type) = @_;
198 return $self->get_feature_type('Bio::SeqFeature::Gene::ExonI', $type);
201 =head2 add_exon
203 Title : add_exon()
204 Usage : $transcript->add_exon($exon,'initial');
205 Function: Add a exon feature to this transcript.
207 The second argument denotes the type of exon. Mixing exons with and
208 without a type is likely to cause trouble in exons(). Either
209 leave out the type for all exons or for none.
211 Presently, the following types are known: initial, internal,
212 terminal, utr, utr5prime, and utr3prime (all case-insensitive).
213 UTR should better be added through utrs()/add_utr().
215 If you wish to use other or additional types, you will almost
216 certainly have to call exon_type_sortorder() in order to replace
217 the default sort order, or mrna(), cds(), protein(), and exons()
218 may yield unexpected results.
220 Returns :
221 Args : A Bio::SeqFeature::Gene::ExonI implementing object.
222 A string indicating the type of the exon (optional).
225 =cut
227 sub add_exon {
228 my ($self, $fea, $type) = @_;
229 if(! $fea->isa('Bio::SeqFeature::Gene::ExonI') ) {
230 $self->throw("$fea does not implement Bio::SeqFeature::Gene::ExonI");
232 $self->_add($fea,'Bio::SeqFeature::Gene::Exon', $type);
235 =head2 flush_exons
237 Title : flush_exons()
238 Usage : $transcript->flush_exons();
239 $transcript->flush_exons('terminal');
240 Function: Remove all or a certain type of exon features from this transcript.
242 See add_exon() for documentation about types.
244 Calling without a type will not flush UTRs. Call flush_utrs() for
245 this purpose.
246 Returns : the deleted features as a list
247 Args : A string indicating the type of the exon (optional).
250 =cut
252 sub flush_exons {
253 my ($self, $type) = @_;
254 return $self->_flush('Bio::SeqFeature::Gene::Exon',$type);
257 =head2 introns
259 Title : introns()
260 Usage : @introns = $gene->introns();
261 Function: Get all intron features this gene structure.
263 Note that this implementation generates these features
264 on-the-fly, that is, it simply treats all regions between
265 exons as introns, assuming that exons do not overlap. A
266 consequence is that a consistent correspondence between the
267 elements in the returned array and the array that exons()
268 returns will exist only if the exons are properly sorted
269 within their types (forward for plus- strand and reverse
270 for minus-strand transcripts). To ensure correctness the
271 elements in the array returned will always be sorted.
273 Returns : An array of Bio::SeqFeature::Gene::Intron objects representing
274 the intron regions.
275 Args :
278 =cut
280 sub introns {
281 my ($self) = @_;
282 my @introns = ();
283 my @exons = $self->exons();
284 my ($strand, $rev_order);
286 # if there's 1 or less exons we're done
287 return () unless($#exons > 0);
288 # record strand and order (a minus-strand transcript is likely to have
289 # the exons stacked in reverse order)
290 foreach my $exon (@exons) {
291 $strand = $exon->strand();
292 last if $strand; # we're done if we've got 1 or -1
294 $rev_order = ($exons[0]->end() < $exons[1]->start() ? 0 : 1);
296 # Make sure exons are sorted. Because we assume they don't overlap, we
297 # simply sort by start position.
298 if((! defined($strand)) || ($strand != -1) || (! $rev_order)) {
299 # always sort forward for plus-strand transcripts, and for negative-
300 # strand transcripts that appear to be unsorted or forward sorted
301 @exons = map { $_->[0] } sort { $a->[1] <=> $b->[1] }
302 map { [ $_, $_->start * ($_->strand || 1)] } @exons;
303 } else {
304 # sort in reverse order for transcripts on the negative strand and
305 # found to be in reverse order
306 @exons = map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->start()] } @exons;
308 # loop over all intervening gaps
309 while ((my $exonA = shift (@exons)) &&(my $exonB = shift(@exons))){
310 my $intron = Bio::SeqFeature::Gene::Intron->new(-primary=>'intron');
311 $intron->upstream_Exon($exonA);
312 $intron->downstream_Exon($exonB);
313 $intron->attach_seq($self->entire_seq) if $self->entire_seq;
314 unshift(@exons,$exonB);
315 push @introns,$intron;
317 return @introns;
320 =head2 poly_A_site
322 Title : poly_A_site()
323 Usage : $polyAsite = $transcript->poly_A_site();
324 Function: Get/set the poly-adenylation feature/site of this transcript.
325 Returns : A Bio::SeqFeatureI implementing object representing the
326 poly-adenylation region.
327 Args : A Bio::SeqFeatureI implementing object on set, or FALSE to flush
328 a previously set object.
331 =cut
333 sub poly_A_site {
334 my ($self, $fea) = @_;
335 if ($fea) {
336 $self->_add($fea,'Bio::SeqFeature::Gene::Poly_A_site');
338 return ($self->get_feature_type('Bio::SeqFeature::Gene::Poly_A_site'))[0];
341 =head2 utrs
343 Title : utrs()
344 Usage : @utr_sites = $transcript->utrs('utr3prime');
345 @utr_sites = $transcript->utrs('utr5prime');
346 @utr_sites = $transcript->utrs();
347 Function: Get the features representing untranslated regions (UTR) of this
348 transcript.
350 You may provide an argument specifying the type of UTR. Currently
351 the following types are recognized: utr5prime utr3prime for UTR on the
352 5' and 3' end of the CDS, respectively.
354 Returns : An array of Bio::SeqFeature::Gene::UTR objects
355 representing the UTR regions or sites.
356 Args : Optionally, either utr3prime, or utr5prime for the the type of UTR
357 feature.
360 =cut
362 sub utrs {
363 my ($self, $type) = @_;
364 return $self->get_feature_type('Bio::SeqFeature::Gene::UTR',$type);
368 =head2 add_utr
370 Title : add_utr()
371 Usage : $transcript->add_utr($utrobj, 'utr3prime');
372 $transcript->add_utr($utrobj);
373 Function: Add a UTR feature/site to this transcript.
375 The second parameter is optional and denotes the type of the UTR
376 feature. Presently recognized types include 'utr5prime' and 'utr3prime'
377 for UTR on the 5' and 3' end of a gene, respectively.
379 Calling this method is the same as calling
380 add_exon($utrobj, 'utr'.$type). In this sense a UTR object is a
381 special exon object, which is transcribed, not spliced out, but
382 not translated.
384 Note that the object supplied should return FALSE for is_coding().
385 Otherwise cds() and friends will become confused.
387 Returns :
388 Args : A Bio::SeqFeature::Gene::UTR implementing object.
391 =cut
393 sub add_utr {
394 my ($self, $fea, $type) = @_;
395 $self->_add($fea,'Bio::SeqFeature::Gene::UTR',$type);
398 =head2 flush_utrs
400 Title : flush_utrs()
401 Usage : $transcript->flush_utrs();
402 $transcript->flush_utrs('utr3prime');
403 Function: Remove all or a specific type of UTR features/sites from this
404 transcript.
406 Cf. add_utr() for documentation about recognized types.
407 Returns : a list of the removed features
408 Args : Optionally a string denoting the type of UTR feature.
411 =cut
413 sub flush_utrs {
414 my ($self, $type) = @_;
415 return $self->_flush('Bio::SeqFeature::Gene::UTR',$type);
418 =head2 sub_SeqFeature
420 Title : sub_SeqFeature
421 Usage : @feats = $transcript->sub_SeqFeature();
422 Function: Returns an array of all subfeatures.
424 This method is defined in Bio::SeqFeatureI. We override this here
425 to include the exon etc features.
427 Returns : An array Bio::SeqFeatureI implementing objects.
428 Args : none
431 =cut
433 sub sub_SeqFeature {
434 my ($self) = @_;
435 my @feas;
437 # get what the parent already has
438 @feas = $self->SUPER::sub_SeqFeature();
439 # add the features we have in addition
440 push(@feas, $self->exons()); # this includes UTR features
441 push(@feas, $self->promoters());
442 push(@feas, $self->poly_A_site()) if($self->poly_A_site());
443 return @feas;
446 =head2 flush_sub_SeqFeature
448 Title : flush_sub_SeqFeature
449 Usage : $transcript->flush_sub_SeqFeature();
450 $transcript->flush_sub_SeqFeature(1);
451 Function: Removes all subfeatures.
453 This method is overridden from Bio::SeqFeature::Generic to flush
454 all additional subfeatures like exons, promoters, etc., which is
455 almost certainly not what you want. To remove only features added
456 through $transcript->add_sub_SeqFeature($feature) pass any
457 argument evaluating to TRUE.
459 Example :
460 Returns : none
461 Args : Optionally, an argument evaluating to TRUE will suppress flushing
462 of all transcript-specific subfeatures (exons etc.).
465 =cut
467 sub flush_sub_SeqFeature {
468 my ($self,$fea_only) = @_;
470 $self->SUPER::flush_sub_SeqFeature();
471 if(! $fea_only) {
472 $self->flush_promoters();
473 $self->flush_exons();
474 $self->flush_utrs();
475 $self->poly_A_site(0);
480 =head2 cds
482 Title : cds
483 Usage : $seq = $transcript->cds();
484 Function: Returns the CDS (coding sequence) as defined by the exons
485 of this transcript and the attached sequence.
487 If no sequence is attached this method will return false.
489 Note that the implementation provided here returns a
490 concatenation of all coding exons, thereby assuming that
491 exons do not overlap.
493 Note also that you cannot set the CDS via this method. Set
494 a single CDS feature as a single exon, or derive your own
495 class if you want to store a predicted CDS.
497 Example :
498 Returns : A Bio::PrimarySeqI implementing object.
499 Args :
501 =cut
503 sub cds {
504 my ($self) = @_;
505 my @exons = $self->exons_ordered(); #this is always sorted properly according to strand
506 my $strand;
508 return unless(@exons);
509 # record strand (a minus-strand transcript must have the exons sorted in
510 # reverse order)
511 foreach my $exon (@exons) {
512 if(defined($exon->strand()) && (! $strand)) {
513 $strand = $exon->strand();
515 if($exon->strand() && (($exon->strand() * $strand) < 0)) {
516 $self->throw("Transcript mixes coding exons on plus and minus ".
517 "strand. This makes no sense.");
520 my $cds = $self->_make_cds(@exons);
521 return unless $cds;
522 return Bio::PrimarySeq->new('-id' => $self->seq_id(),
523 '-seq' => $cds,
524 '-alphabet' => "dna");
527 =head2 protein
529 Title : protein()
530 Usage : $protein = $transcript->protein();
531 Function: Get the protein encoded by the transcript as a sequence object.
533 The implementation provided here simply calls translate() on the
534 object returned by cds().
536 Returns : A Bio::PrimarySeqI implementing object.
537 Args :
540 =cut
542 sub protein {
543 my ($self) = @_;
544 my $seq;
546 $seq = $self->cds();
547 return $seq->translate() if $seq;
548 return;
551 =head2 mrna
553 Title : mrna()
554 Usage : $mrna = $transcript->mrna();
555 Function: Get the mRNA of the transcript as a sequence object.
557 The difference to cds() is that the sequence object returned by
558 this methods will also include UTR and the poly-adenylation site,
559 but not promoter sequence (TBD).
561 HL: do we really need this method?
563 Returns : A Bio::PrimarySeqI implementing object.
564 Args :
567 =cut
569 sub mrna {
570 my ($self) = @_;
571 my ($seq, $mrna, $elem);
573 # get the coding part
574 $seq = $self->cds();
575 if(! $seq) {
576 $seq = Bio::PrimarySeq->new('-id' => $self->seq_id(),
577 '-alphabet' => "rna",
578 '-seq' => "");
580 # get and add UTR sequences
581 $mrna = "";
582 foreach $elem ($self->utrs('utr5prime')) {
583 $mrna .= $elem->seq()->seq();
585 $seq->seq($mrna . $seq->seq());
586 $mrna = "";
587 foreach $elem ($self->utrs('utr3prime')) {
588 $mrna .= $elem->seq()->seq();
590 $seq->seq($seq->seq() . $mrna);
591 if($self->poly_A_site()) {
592 $seq->seq($seq->seq() . $self->poly_A_site()->seq()->seq());
594 return if($seq->length() == 0);
595 return $seq;
598 sub _get_typed_keys {
599 my ($self, $keyprefix, $type) = @_;
600 my @keys = ();
601 my @feas = ();
603 # make case-insensitive
604 $type = ($type ? lc($type) : "");
605 # pull out all feature types that exist and match
606 @keys = grep { /^_$keyprefix$type/i; } (keys(%{$self}));
607 return @keys;
610 sub _make_cds {
611 my ($self,@exons) = @_;
612 my $cds = "";
614 foreach my $exon (@exons) {
615 next if((! defined($exon->seq())) || (! $exon->is_coding()));
616 my $phase = length($cds) % 3;
617 # let's check the simple case
618 if((! defined($exon->frame())) || ($phase == $exon->frame())) {
619 # this one fits exactly, or frame of the exon is undefined (should
620 # we warn about that?); we bypass the $exon->cds() here (hmm,
621 # not very clean style, but I don't see where this screws up)
622 $cds .= $exon->seq()->seq();
623 } else {
624 # this one is probably from exon shuffling and needs some work
625 my $seq = $exon->cds(); # now $seq is guaranteed to be in frame 0
626 next if(! $seq);
627 $seq = $seq->seq();
628 # adjustment needed?
629 if($phase > 0) {
630 # how many Ns can we chop off the piece to be added?
631 my $n_crop = 0;
632 if($seq =~ /^(n+)/i) {
633 $n_crop = length($1);
635 if($n_crop >= $phase) {
636 # chop off to match the phase
637 $seq = substr($seq, $phase);
638 } else {
639 # fill in Ns
640 $seq = ("n" x (3-$phase)) . $seq;
643 $cds .= $seq;
646 return $cds;
649 =head2 features
651 Title : features
652 Usage : my @features=$transcript->features;
653 Function: returns all the features associated with this transcript
654 Returns : a list of SeqFeatureI implementing objects
655 Args : none
658 =cut
661 sub features {
662 my $self = shift;
663 return grep { defined } @{$self->{'_features'} || []};
666 =head2 features_ordered
668 Title : features_ordered
669 Usage : my @features=$transcript->features_ordered;
670 Function: returns all the features associated with this transcript,
671 in order by feature start, according to strand
672 Returns : a list of SeqFeatureI implementing objects
673 Args : none
676 =cut
678 sub features_ordered{
679 my ($self) = @_;
680 return $self->_stranded_sort(@{$self->{'_features'} || []});
684 sub get_unordered_feature_type{
685 my ($self, $type, $pri)=@_;
686 my @list;
687 foreach ( $self->features) {
688 if ($_->isa($type)) {
689 if ($pri && $_->primary_tag !~ /$pri/i) {
690 next;
692 push @list,$_;
695 return @list;
699 sub get_feature_type {
700 my ($self)=shift;
701 return $self->_stranded_sort($self->get_unordered_feature_type(@_));
704 #This was fixed by Gene Cutler - the indexing on the list being reversed
705 #fixed a bad bug. Thanks Gene!
706 sub _flush {
707 my ($self, $type, $pri)=@_;
708 my @list=$self->features;
709 my @cut;
710 for (reverse (0..$#list)) {
711 if (defined $list[$_] &&
712 $list[$_]->isa($type)) {
713 if ($pri && $list[$_]->primary_tag !~ /$pri/i) {
714 next;
716 push @cut, splice @list, $_, 1; #remove the element of $type from @list
717 #and return each of them in @cut
720 $self->{'_features'}=\@list;
721 return reverse @cut;
724 sub _add {
725 my ($self, $fea, $type, $pri)=@_;
726 require Bio::SeqFeature::Gene::Promoter;
727 require Bio::SeqFeature::Gene::UTR;
728 require Bio::SeqFeature::Gene::Exon;
729 require Bio::SeqFeature::Gene::Intron;
730 require Bio::SeqFeature::Gene::Poly_A_site;
732 if(! $fea->isa('Bio::SeqFeatureI') ) {
733 $self->throw("$fea does not implement Bio::SeqFeatureI");
735 if(! $fea->isa($type) || $pri) {
736 $fea=$self->_new_of_type($fea,$type,$pri);
738 if (! $self->strand) {
739 $self->strand($fea->strand);
740 } else {
741 if ($self->strand * $fea->strand == -1) {
742 $self->throw("$fea is on opposite strand from $self");
746 $self->_expand_region($fea);
747 if(defined($self->entire_seq()) && (! defined($fea->entire_seq())) &&
748 $fea->can('attach_seq')) {
749 $fea->attach_seq($self->entire_seq());
751 if (defined $self->parent) {
752 $self->parent->_expand_region($fea);
754 push(@{$self->{'_features'}}, $fea);
758 sub _stranded_sort {
759 my ($self,@list)=@_;
760 my $strand;
761 foreach my $fea (@list) {
762 if($fea->strand()) {
763 # defined and != 0
764 $strand = $fea->strand() if(! $strand);
765 if(($fea->strand() * $strand) < 0) {
766 $strand = undef;
767 last;
771 if (defined $strand && $strand == - 1) { #reverse strand
772 return map { $_->[0] } sort {$b->[1] <=> $a->[1]} map { [$_, $_->start] } @list;
773 } else { #undef or forward strand
774 return map { $_->[0] } sort {$a->[1] <=> $b->[1]} map { [$_, $_->start] } @list;
778 sub _new_of_type {
779 my ($self, $fea, $type, $pri)= @_;
780 my $primary;
781 if ($pri) {
782 $primary = $pri; #can set new primary tag if desired
783 } else {
784 ($primary) = $type =~ /.*::(.+)/; #or else primary is just end of type string
786 bless $fea,$type;
787 $fea->primary_tag($primary);
788 return $fea;
791 sub transcript_destroy {
792 my $self = shift;
793 # We're going to be really explicit to insure memory leaks
794 # don't occur
795 foreach my $f ( $self->features ) {
796 $f = undef;
798 $self->parent(undef);