maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / SeqFeature / Gene / Transcript.pm
blob44537cc6de1fb2e0ad58060acf38ab6e48f34b96
2 # BioPerl module for Bio::SeqFeature::Gene::Transcript
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp <hlapp@gmx.net>
8 # Copyright Hilmar Lapp
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqFeature::Gene::Transcript - A feature representing a transcript
18 =head1 SYNOPSIS
20 # See documentation of methods.
22 =head1 DESCRIPTION
24 A feature representing a transcript.
26 =head1 FEEDBACK
28 =head2 Mailing Lists
30 User feedback is an integral part of the evolution of this and other
31 Bioperl modules. Send your comments and suggestions preferably to one
32 of the Bioperl mailing lists. Your participation is much appreciated.
34 bioperl-l@bioperl.org - General discussion
35 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
37 =head2 Support
39 Please direct usage questions or support issues to the mailing list:
41 I<bioperl-l@bioperl.org>
43 rather than to the module maintainer directly. Many experienced and
44 reponsive experts will be able look at the problem and quickly
45 address it. Please include a thorough description of the problem
46 with code and data examples if at all possible.
48 =head2 Reporting Bugs
50 Report bugs to the Bioperl bug tracking system to help us keep track
51 the bugs and their resolution. Bug reports can be submitted via the
52 web:
54 https://github.com/bioperl/bioperl-live/issues
56 =head1 AUTHOR - Hilmar Lapp
58 Email hlapp@gmx.net
60 =head1 APPENDIX
62 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
64 =cut
67 # Let the code begin...
69 package Bio::SeqFeature::Gene::Transcript;
70 use strict;
73 use Bio::PrimarySeq;
75 use base qw(Bio::SeqFeature::Generic Bio::SeqFeature::Gene::TranscriptI);
77 sub new {
78 my ($caller, @args) = @_;
79 my $self = $caller->SUPER::new(@args);
80 $self->_register_for_cleanup(\&transcript_destroy);
81 my ($primary) = $self->_rearrange([qw(PRIMARY)],@args);
83 $primary = 'transcript' unless $primary;
84 $self->primary_tag($primary);
85 $self->strand(0) if(! defined($self->strand()));
86 return $self;
90 =head2 promoters
92 Title : promoters()
93 Usage : @proms = $transcript->promoters();
94 Function: Get the promoter features/sites of this transcript.
96 Note that OO-modeling of regulatory elements is not stable yet.
97 This means that this method might change or even disappear in a
98 future release. Be aware of this if you use it.
100 Returns : An array of Bio::SeqFeatureI implementing objects representing the
101 promoter regions or sites.
102 Args :
105 =cut
107 sub promoters {
108 my ($self) = @_;
109 return $self->get_feature_type('Bio::SeqFeature::Gene::Promoter');
112 =head2 add_promoter
114 Title : add_promoter()
115 Usage : $transcript->add_promoter($feature);
116 Function: Add a promoter feature/site to this transcript.
119 Note that OO-modeling of regulatory elements is not stable yet.
120 This means that this method might change or even disappear in a
121 future release. Be aware of this if you use it.
123 Returns :
124 Args : A Bio::SeqFeatureI implementing object.
127 =cut
129 sub add_promoter {
130 my ($self, $fea) = @_;
131 $self->_add($fea,'Bio::SeqFeature::Gene::Promoter');
134 =head2 flush_promoters
136 Title : flush_promoters()
137 Usage : $transcript->flush_promoters();
138 Function: Remove all promoter features/sites from this transcript.
140 Note that OO-modeling of regulatory elements is not stable yet.
141 This means that this method might change or even disappear in a
142 future release. Be aware of this if you use it.
144 Returns : the removed features as a list
145 Args : none
148 =cut
150 sub flush_promoters {
151 my ($self) = @_;
152 return $self->_flush('Bio::SeqFeature::Gene::Promoter');
155 =head2 exons
157 Title : exons()
158 Usage : @exons = $gene->exons();
159 ($inital_exon) = $gene->exons('Initial');
160 Function: Get all exon features or all exons of specified type of this
161 transcript.
163 Exon type is treated as a case-insensitive regular expression and
164 is optional. For consistency, use only the following types:
165 initial, internal, terminal.
167 Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects.
168 Args : An optional string specifying the primary_tag of the feature.
171 =cut
173 sub exons {
174 my ($self, $type) = @_;
175 return $self->get_unordered_feature_type('Bio::SeqFeature::Gene::ExonI',
176 $type);
179 =head2 exons_ordered
181 Title : exons_ordered
182 Usage : @exons = $gene->exons_ordered();
183 @exons = $gene->exons_ordered("Internal");
184 Function: Get an ordered list of all exon features or all exons of specified
185 type of this transcript.
187 Exon type is treated as a case-insensitive regular expression and
188 is optional. For consistency, use only the following types:
190 Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects.
191 Args : An optional string specifying the primary_tag of the feature.
193 =cut
195 sub exons_ordered {
196 my ($self,$type) = @_;
197 return $self->get_feature_type('Bio::SeqFeature::Gene::ExonI', $type);
200 =head2 add_exon
202 Title : add_exon()
203 Usage : $transcript->add_exon($exon,'initial');
204 Function: Add a exon feature to this transcript.
206 The second argument denotes the type of exon. Mixing exons with and
207 without a type is likely to cause trouble in exons(). Either
208 leave out the type for all exons or for none.
210 Presently, the following types are known: initial, internal,
211 terminal, utr, utr5prime, and utr3prime (all case-insensitive).
212 UTR should better be added through utrs()/add_utr().
214 If you wish to use other or additional types, you will almost
215 certainly have to call exon_type_sortorder() in order to replace
216 the default sort order, or mrna(), cds(), protein(), and exons()
217 may yield unexpected results.
219 Returns :
220 Args : A Bio::SeqFeature::Gene::ExonI implementing object.
221 A string indicating the type of the exon (optional).
224 =cut
226 sub add_exon {
227 my ($self, $fea, $type) = @_;
228 if(! $fea->isa('Bio::SeqFeature::Gene::ExonI') ) {
229 $self->throw("$fea does not implement Bio::SeqFeature::Gene::ExonI");
231 $self->_add($fea,'Bio::SeqFeature::Gene::Exon', $type);
234 =head2 flush_exons
236 Title : flush_exons()
237 Usage : $transcript->flush_exons();
238 $transcript->flush_exons('terminal');
239 Function: Remove all or a certain type of exon features from this transcript.
241 See add_exon() for documentation about types.
243 Calling without a type will not flush UTRs. Call flush_utrs() for
244 this purpose.
245 Returns : the deleted features as a list
246 Args : A string indicating the type of the exon (optional).
249 =cut
251 sub flush_exons {
252 my ($self, $type) = @_;
253 return $self->_flush('Bio::SeqFeature::Gene::Exon',$type);
256 =head2 introns
258 Title : introns()
259 Usage : @introns = $gene->introns();
260 Function: Get all intron features this gene structure.
262 Note that this implementation generates these features
263 on-the-fly, that is, it simply treats all regions between
264 exons as introns, assuming that exons do not overlap. A
265 consequence is that a consistent correspondence between the
266 elements in the returned array and the array that exons()
267 returns will exist only if the exons are properly sorted
268 within their types (forward for plus- strand and reverse
269 for minus-strand transcripts). To ensure correctness the
270 elements in the array returned will always be sorted.
272 Returns : An array of Bio::SeqFeature::Gene::Intron objects representing
273 the intron regions.
274 Args :
277 =cut
279 sub introns {
280 my ($self) = @_;
281 my @introns = ();
282 my @exons = $self->exons();
283 my ($strand, $rev_order);
285 # if there's 1 or less exons we're done
286 return () unless($#exons > 0);
287 # record strand and order (a minus-strand transcript is likely to have
288 # the exons stacked in reverse order)
289 foreach my $exon (@exons) {
290 $strand = $exon->strand();
291 last if $strand; # we're done if we've got 1 or -1
293 $rev_order = ($exons[0]->end() < $exons[1]->start() ? 0 : 1);
295 # Make sure exons are sorted. Because we assume they don't overlap, we
296 # simply sort by start position.
297 if((! defined($strand)) || ($strand != -1) || (! $rev_order)) {
298 # always sort forward for plus-strand transcripts, and for negative-
299 # strand transcripts that appear to be unsorted or forward sorted
300 @exons = map { $_->[0] } sort { $a->[1] <=> $b->[1] }
301 map { [ $_, $_->start * ($_->strand || 1)] } @exons;
302 } else {
303 # sort in reverse order for transcripts on the negative strand and
304 # found to be in reverse order
305 @exons = map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->start()] } @exons;
307 # loop over all intervening gaps
308 while ((my $exonA = shift (@exons)) &&(my $exonB = shift(@exons))){
309 my $intron = Bio::SeqFeature::Gene::Intron->new(-primary=>'intron');
310 $intron->upstream_Exon($exonA);
311 $intron->downstream_Exon($exonB);
312 $intron->attach_seq($self->entire_seq) if $self->entire_seq;
313 unshift(@exons,$exonB);
314 push @introns,$intron;
316 return @introns;
319 =head2 poly_A_site
321 Title : poly_A_site()
322 Usage : $polyAsite = $transcript->poly_A_site();
323 Function: Get/set the poly-adenylation feature/site of this transcript.
324 Returns : A Bio::SeqFeatureI implementing object representing the
325 poly-adenylation region.
326 Args : A Bio::SeqFeatureI implementing object on set, or FALSE to flush
327 a previously set object.
330 =cut
332 sub poly_A_site {
333 my ($self, $fea) = @_;
334 if ($fea) {
335 $self->_add($fea,'Bio::SeqFeature::Gene::Poly_A_site');
337 return ($self->get_feature_type('Bio::SeqFeature::Gene::Poly_A_site'))[0];
340 =head2 utrs
342 Title : utrs()
343 Usage : @utr_sites = $transcript->utrs('utr3prime');
344 @utr_sites = $transcript->utrs('utr5prime');
345 @utr_sites = $transcript->utrs();
346 Function: Get the features representing untranslated regions (UTR) of this
347 transcript.
349 You may provide an argument specifying the type of UTR. Currently
350 the following types are recognized: utr5prime utr3prime for UTR on the
351 5' and 3' end of the CDS, respectively.
353 Returns : An array of Bio::SeqFeature::Gene::UTR objects
354 representing the UTR regions or sites.
355 Args : Optionally, either utr3prime, or utr5prime for the the type of UTR
356 feature.
359 =cut
361 sub utrs {
362 my ($self, $type) = @_;
363 return $self->get_feature_type('Bio::SeqFeature::Gene::UTR',$type);
367 =head2 add_utr
369 Title : add_utr()
370 Usage : $transcript->add_utr($utrobj, 'utr3prime');
371 $transcript->add_utr($utrobj);
372 Function: Add a UTR feature/site to this transcript.
374 The second parameter is optional and denotes the type of the UTR
375 feature. Presently recognized types include 'utr5prime' and 'utr3prime'
376 for UTR on the 5' and 3' end of a gene, respectively.
378 Calling this method is the same as calling
379 add_exon($utrobj, 'utr'.$type). In this sense a UTR object is a
380 special exon object, which is transcribed, not spliced out, but
381 not translated.
383 Note that the object supplied should return FALSE for is_coding().
384 Otherwise cds() and friends will become confused.
386 Returns :
387 Args : A Bio::SeqFeature::Gene::UTR implementing object.
390 =cut
392 sub add_utr {
393 my ($self, $fea, $type) = @_;
394 $self->_add($fea,'Bio::SeqFeature::Gene::UTR',$type);
397 =head2 flush_utrs
399 Title : flush_utrs()
400 Usage : $transcript->flush_utrs();
401 $transcript->flush_utrs('utr3prime');
402 Function: Remove all or a specific type of UTR features/sites from this
403 transcript.
405 Cf. add_utr() for documentation about recognized types.
406 Returns : a list of the removed features
407 Args : Optionally a string denoting the type of UTR feature.
410 =cut
412 sub flush_utrs {
413 my ($self, $type) = @_;
414 return $self->_flush('Bio::SeqFeature::Gene::UTR',$type);
417 =head2 sub_SeqFeature
419 Title : sub_SeqFeature
420 Usage : @feats = $transcript->sub_SeqFeature();
421 Function: Returns an array of all subfeatures.
423 This method is defined in Bio::SeqFeatureI. We override this here
424 to include the exon etc features.
426 Returns : An array Bio::SeqFeatureI implementing objects.
427 Args : none
430 =cut
432 sub sub_SeqFeature {
433 my ($self) = @_;
434 my @feas;
436 # get what the parent already has
437 @feas = $self->SUPER::sub_SeqFeature();
438 # add the features we have in addition
439 push(@feas, $self->exons()); # this includes UTR features
440 push(@feas, $self->promoters());
441 push(@feas, $self->poly_A_site()) if($self->poly_A_site());
442 return @feas;
445 =head2 flush_sub_SeqFeature
447 Title : flush_sub_SeqFeature
448 Usage : $transcript->flush_sub_SeqFeature();
449 $transcript->flush_sub_SeqFeature(1);
450 Function: Removes all subfeatures.
452 This method is overridden from Bio::SeqFeature::Generic to flush
453 all additional subfeatures like exons, promoters, etc., which is
454 almost certainly not what you want. To remove only features added
455 through $transcript->add_sub_SeqFeature($feature) pass any
456 argument evaluating to TRUE.
458 Example :
459 Returns : none
460 Args : Optionally, an argument evaluating to TRUE will suppress flushing
461 of all transcript-specific subfeatures (exons etc.).
464 =cut
466 sub flush_sub_SeqFeature {
467 my ($self,$fea_only) = @_;
469 $self->SUPER::flush_sub_SeqFeature();
470 if(! $fea_only) {
471 $self->flush_promoters();
472 $self->flush_exons();
473 $self->flush_utrs();
474 $self->poly_A_site(0);
479 =head2 cds
481 Title : cds
482 Usage : $seq = $transcript->cds();
483 Function: Returns the CDS (coding sequence) as defined by the exons
484 of this transcript and the attached sequence.
486 If no sequence is attached this method will return false.
488 Note that the implementation provided here returns a
489 concatenation of all coding exons, thereby assuming that
490 exons do not overlap.
492 Note also that you cannot set the CDS via this method. Set
493 a single CDS feature as a single exon, or derive your own
494 class if you want to store a predicted CDS.
496 Example :
497 Returns : A Bio::PrimarySeqI implementing object.
498 Args :
500 =cut
502 sub cds {
503 my ($self) = @_;
504 my @exons = $self->exons_ordered(); #this is always sorted properly according to strand
505 my $strand;
507 return unless(@exons);
508 # record strand (a minus-strand transcript must have the exons sorted in
509 # reverse order)
510 foreach my $exon (@exons) {
511 if(defined($exon->strand()) && (! $strand)) {
512 $strand = $exon->strand();
514 if($exon->strand() && (($exon->strand() * $strand) < 0)) {
515 $self->throw("Transcript mixes coding exons on plus and minus ".
516 "strand. This makes no sense.");
519 my $cds = $self->_make_cds(@exons);
520 return unless $cds;
521 return Bio::PrimarySeq->new('-id' => $self->seq_id(),
522 '-seq' => $cds,
523 '-alphabet' => "dna");
526 =head2 protein
528 Title : protein()
529 Usage : $protein = $transcript->protein();
530 Function: Get the protein encoded by the transcript as a sequence object.
532 The implementation provided here simply calls translate() on the
533 object returned by cds().
535 Returns : A Bio::PrimarySeqI implementing object.
536 Args :
539 =cut
541 sub protein {
542 my ($self) = @_;
543 my $seq;
545 $seq = $self->cds();
546 return $seq->translate() if $seq;
547 return;
550 =head2 mrna
552 Title : mrna()
553 Usage : $mrna = $transcript->mrna();
554 Function: Get the mRNA of the transcript as a sequence object.
556 The difference to cds() is that the sequence object returned by
557 this methods will also include UTR and the poly-adenylation site,
558 but not promoter sequence (TBD).
560 HL: do we really need this method?
562 Returns : A Bio::PrimarySeqI implementing object.
563 Args :
566 =cut
568 sub mrna {
569 my ($self) = @_;
570 my ($seq, $mrna, $elem);
572 # get the coding part
573 $seq = $self->cds();
574 if(! $seq) {
575 $seq = Bio::PrimarySeq->new('-id' => $self->seq_id(),
576 '-alphabet' => "rna",
577 '-seq' => "");
579 # get and add UTR sequences
580 $mrna = "";
581 foreach $elem ($self->utrs('utr5prime')) {
582 $mrna .= $elem->seq()->seq();
584 $seq->seq($mrna . $seq->seq());
585 $mrna = "";
586 foreach $elem ($self->utrs('utr3prime')) {
587 $mrna .= $elem->seq()->seq();
589 $seq->seq($seq->seq() . $mrna);
590 if($self->poly_A_site()) {
591 $seq->seq($seq->seq() . $self->poly_A_site()->seq()->seq());
593 return if($seq->length() == 0);
594 return $seq;
597 sub _get_typed_keys {
598 my ($self, $keyprefix, $type) = @_;
599 my @keys = ();
600 my @feas = ();
602 # make case-insensitive
603 $type = ($type ? lc($type) : "");
604 # pull out all feature types that exist and match
605 @keys = grep { /^_$keyprefix$type/i; } (keys(%{$self}));
606 return @keys;
609 sub _make_cds {
610 my ($self,@exons) = @_;
611 my $cds = "";
613 foreach my $exon (@exons) {
614 next if((! defined($exon->seq())) || (! $exon->is_coding()));
615 my $phase = length($cds) % 3;
616 # let's check the simple case
617 if((! defined($exon->frame())) || ($phase == $exon->frame())) {
618 # this one fits exactly, or frame of the exon is undefined (should
619 # we warn about that?); we bypass the $exon->cds() here (hmm,
620 # not very clean style, but I don't see where this screws up)
621 $cds .= $exon->seq()->seq();
622 } else {
623 # this one is probably from exon shuffling and needs some work
624 my $seq = $exon->cds(); # now $seq is guaranteed to be in frame 0
625 next if(! $seq);
626 $seq = $seq->seq();
627 # adjustment needed?
628 if($phase > 0) {
629 # how many Ns can we chop off the piece to be added?
630 my $n_crop = 0;
631 if($seq =~ /^(n+)/i) {
632 $n_crop = length($1);
634 if($n_crop >= $phase) {
635 # chop off to match the phase
636 $seq = substr($seq, $phase);
637 } else {
638 # fill in Ns
639 $seq = ("n" x (3-$phase)) . $seq;
642 $cds .= $seq;
645 return $cds;
648 =head2 features
650 Title : features
651 Usage : my @features=$transcript->features;
652 Function: returns all the features associated with this transcript
653 Returns : a list of SeqFeatureI implementing objects
654 Args : none
657 =cut
660 sub features {
661 my $self = shift;
662 return grep { defined } @{$self->{'_features'} || []};
665 =head2 features_ordered
667 Title : features_ordered
668 Usage : my @features=$transcript->features_ordered;
669 Function: returns all the features associated with this transcript,
670 in order by feature start, according to strand
671 Returns : a list of SeqFeatureI implementing objects
672 Args : none
675 =cut
677 sub features_ordered{
678 my ($self) = @_;
679 return $self->_stranded_sort(@{$self->{'_features'} || []});
683 sub get_unordered_feature_type{
684 my ($self, $type, $pri)=@_;
685 my @list;
686 foreach ( $self->features) {
687 if ($_->isa($type)) {
688 if ($pri && $_->primary_tag !~ /$pri/i) {
689 next;
691 push @list,$_;
694 return @list;
698 sub get_feature_type {
699 my ($self)=shift;
700 return $self->_stranded_sort($self->get_unordered_feature_type(@_));
703 #This was fixed by Gene Cutler - the indexing on the list being reversed
704 #fixed a bad bug. Thanks Gene!
705 sub _flush {
706 my ($self, $type, $pri)=@_;
707 my @list=$self->features;
708 my @cut;
709 for (reverse (0..$#list)) {
710 if (defined $list[$_] &&
711 $list[$_]->isa($type)) {
712 if ($pri && $list[$_]->primary_tag !~ /$pri/i) {
713 next;
715 push @cut, splice @list, $_, 1; #remove the element of $type from @list
716 #and return each of them in @cut
719 $self->{'_features'}=\@list;
720 return reverse @cut;
723 sub _add {
724 my ($self, $fea, $type, $pri)=@_;
725 require Bio::SeqFeature::Gene::Promoter;
726 require Bio::SeqFeature::Gene::UTR;
727 require Bio::SeqFeature::Gene::Exon;
728 require Bio::SeqFeature::Gene::Intron;
729 require Bio::SeqFeature::Gene::Poly_A_site;
731 if(! $fea->isa('Bio::SeqFeatureI') ) {
732 $self->throw("$fea does not implement Bio::SeqFeatureI");
734 if(! $fea->isa($type) || $pri) {
735 $fea=$self->_new_of_type($fea,$type,$pri);
737 if (! $self->strand) {
738 $self->strand($fea->strand);
739 } else {
740 if ($self->strand * $fea->strand == -1) {
741 $self->throw("$fea is on opposite strand from $self");
745 $self->_expand_region($fea);
746 if(defined($self->entire_seq()) && (! defined($fea->entire_seq())) &&
747 $fea->can('attach_seq')) {
748 $fea->attach_seq($self->entire_seq());
750 if (defined $self->parent) {
751 $self->parent->_expand_region($fea);
753 push(@{$self->{'_features'}}, $fea);
757 sub _stranded_sort {
758 my ($self,@list)=@_;
759 my $strand;
760 foreach my $fea (@list) {
761 if($fea->strand()) {
762 # defined and != 0
763 $strand = $fea->strand() if(! $strand);
764 if(($fea->strand() * $strand) < 0) {
765 $strand = undef;
766 last;
770 if (defined $strand && $strand == - 1) { #reverse strand
771 return map { $_->[0] } sort {$b->[1] <=> $a->[1]} map { [$_, $_->start] } @list;
772 } else { #undef or forward strand
773 return map { $_->[0] } sort {$a->[1] <=> $b->[1]} map { [$_, $_->start] } @list;
777 sub _new_of_type {
778 my ($self, $fea, $type, $pri)= @_;
779 my $primary;
780 if ($pri) {
781 $primary = $pri; #can set new primary tag if desired
782 } else {
783 ($primary) = $type =~ /.*::(.+)/; #or else primary is just end of type string
785 bless $fea,$type;
786 $fea->primary_tag($primary);
787 return $fea;
790 sub transcript_destroy {
791 my $self = shift;
792 # We're going to be really explicit to insure memory leaks
793 # don't occur
794 foreach my $f ( $self->features ) {
795 $f = undef;
797 $self->parent(undef);