3 # bioperl module for Bio::LiveSeq::Transcript
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
9 # Copyright Joseph Insana
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::LiveSeq::Transcript - Transcript class for LiveSeq
21 # documentation needed
25 This stores informations about coding sequences (CDS).
26 The implementation is that a Transcript object accesses a collection of
27 Exon objects, inferring from them the nucleotide structure and sequence.
29 =head1 AUTHOR - Joseph A.L. Insana
31 Email: Insana@ebi.ac.uk, jinsana@gmx.net
35 The rest of the documentation details each of the object
36 methods. Internal methods are usually preceded with a _
40 # Let the code begin...
42 package Bio
::LiveSeq
::Transcript
;
45 # use Carp qw(carp cluck);
46 use Bio
::LiveSeq
::Exon
; # uses Exon to create new exon in case of deletion
47 use base
qw(Bio::LiveSeq::SeqI);
52 Usage : $transcript = Bio::LiveSeq::Transcript->new(-exons => \@obj_refs);
54 Function: generates a new Bio::LiveSeq::Transcript
55 Returns : reference to a new object of class Transcript
57 Args : reference to an array of Exon object references
62 my ($thing, %args) = @_;
63 my $class = ref($thing) || $thing;
64 my ($obj,%transcript);
66 my @exons=@
{$args{-exons
}};
69 $obj = bless $obj, $class;
72 $obj->warn("$class not initialised because exons array empty");
76 # now useless, after start and end methods have been overridden here
77 my $firstexon = $exons[0];
78 #my $lastexon = $exons[-1];
79 #my $start = $firstexon->start;
80 #my $end = $lastexon->end;
81 my $strand = $firstexon->strand;
82 my $seq = $firstexon->{'seq'};
83 $obj->alphabet('rna');
85 unless (_checkexons
(\
@exons)) {
86 $obj->warn("$class not initialised because of problems in the exon structure");
89 $obj->{'strand'}=$strand;
90 $obj->{'exons'}=\
@exons;
93 # set Transcript into each Exon
95 foreach $exon (@exons) {
96 $exon->{'transcript'}=$obj;
105 Usage : $transcript_obj->all_Exons()
106 Function: returns references to all Exon objects the Transcript is composed of
107 Example : foreach $exon ($transcript->all_Exons()) { do_something }
108 Returns : array of object references
115 my $exonsref=$self->{'exons'};
116 my @exons=@
{$exonsref};
119 foreach $exon (@exons) {
120 unless ($exon->obj_valid) {
121 $self->warn("$exon no more valid, start or end label lost, skipping....",1); # ignorable
123 push(@newexons,$exon);
126 if ($#exons != $#newexons) {
128 $self->{'exons'}=\
@newexons;
133 =head2 downstream_seq
135 Title : downstream_seq
136 Usage : $transcript_obj->downstream_seq()
137 : $transcript_obj->downstream_seq(64)
138 Function: returns a string of nucleotides downstream of the end of the
139 CDS. If there is some information of the real mRNA, from features in
140 an attached Gene object, it will return up to those boundaries.
141 Otherwise it will return 1000 nucleotides.
142 If an argument is given it will override the default 1000 number
143 and return instead /that/ requested number of nucleotides.
144 But if a Gene object is attached, this argument will be ignored.
146 Args : an optional integer number of nucleotides to be returned instead of
147 the default if no gene attached
152 my ($self,$howmany)=@_;
154 if (defined ($howmany)) {
155 unless ($howmany > 0) {
156 $self->throw("No sense in asking less than 1 downstream nucleotides!");
159 unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve until the end
160 #$str=$DNAobj->labelsubseq($self->end,undef,undef,"unsecuremoderequested");
161 #return(substr($str,1)); # delete first nucleotide that is the last of Transcript
162 if ($self->gene) { # if there is Gene object attached fetch relevant info
163 $str=$self->{'seq'}->labelsubseq($self->end,undef,$self->gene->maxtranscript->end); # retrieve from end of this Transcript to end of the maxtranscript
164 $str=substr($str,1); # delete first nucleotide that is the last of Transcript
165 if (CORE
::length($str) > 0) {
167 } else { # if there was no downstream through the gene's maxtranscript, go the usual way
175 my @exons=$self->all_Exons;
176 my $strand=$self->strand();
177 my $lastexon=$exons[-1];
178 my $lastexonlength=$lastexon->length;
179 # $howmany nucs after end of last exon
180 #my $downstream_seq=$lastexon->subseq($lastexonlength+1,undef,$howmany);
184 $downstream_seq=substr($lastexon->labelsubseq($self->end,$howmany,undef,"unsecuremoderequested"),1);
187 $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->end,"unsecuremoderequested"),1);
189 $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->start,"unsecuremoderequested"),1);
192 return $downstream_seq;
198 Usage : $transcript_obj->upstream_seq()
199 : $transcript_obj->upstream_seq(64)
200 Function: just like downstream_seq but returns nucleotides before the ATG
201 Note : the default, if no Gene information present and no nucleotides
202 number given, is to return up to 400 nucleotides.
207 my ($self,$howmany)=@_;
208 if (defined ($howmany)) {
209 unless ($howmany > 0) {
210 $self->throw("No sense in asking less than 1 upstream nucleotides!");
213 unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve from the start
214 if ($self->gene) { # if there is Gene object attached fetch relevant info
215 my $str=$self->{'seq'}->labelsubseq($self->gene->maxtranscript->start,undef,$self->start); # retrieve from start of maxtranscript to start of this Transcript
216 chop $str; # delete last nucleotide that is the A of starting ATG
217 if (length($str) > 0) {
219 } else { # if there was no upstream through the gene's maxtranscript, go the usual way
227 my @exons=$self->all_Exons;
228 my $firstexon=$exons[0];
231 my $strand=$self->strand();
233 if ($howmany) {# $howmany nucs before begin of first exon
234 my $labelbefore=$firstexon->label(-$howmany,$firstexon->start);
235 if ($labelbefore < 1) {
237 $labelbefore=$self->{'seq'}->start;
239 $labelbefore=$self->{'seq'}->end;
242 $upstream_seq=$firstexon->labelsubseq($labelbefore,undef,$firstexon->start,"unsecuremoderequested");
246 $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->start,undef,$self->start,"unsecuremoderequested");
247 chop $upstream_seq; # delete last nucleotide that is the A of starting ATG
249 $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->end,undef,$self->start,"unsecuremoderequested");
250 chop $upstream_seq; # delete last nucleotide that is the A of starting ATG
253 return $upstream_seq;
256 # These get redefined here, overriding the SeqI one because they draw their
257 # information from the Exons a Transcript is built of
258 # optional argument: firstlabel. If not given, it checks coordinate_start
259 # This is useful when called by Translation
260 # also used by _delete
262 my ($self,$position,$firstlabel)=@_;
263 unless ($position) { # if position = 0 complain ?
264 $self->warn("Position not given or position 0");
267 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
268 my ($label,@labels,$length,$arraypos);
269 unless (defined ($firstlabel)) {
270 $firstlabel=$self->coordinate_start; # this is inside Transcript obj
272 my $coord_pos=$self->_inside_position($firstlabel);
273 $length=$self->length;
276 $position++; # to account for missing of 0 position
278 $arraypos=$position+$coord_pos-2;
279 #print "\n=-=-=-=-DEBUG: arraypos $arraypos, pos $position, coordpos: $coord_pos";
281 $label=$self->{'seq'}->label($arraypos,$start,$strand); #?
282 } elsif ($arraypos >= $length) {
283 $label=$self->{'seq'}->label($arraypos-$length+2,$end,$strand); #?
284 } else { # inside the Transcript
285 @labels=$self->all_labels;
286 $label=$labels[$arraypos];
292 # returns: position of label according to coord_start
293 # errorcode: 0 label not found
294 # optional argument: firstlabel. If not given, it checks coordinate_start
295 # This is useful when called by Translation
297 my ($self,$label,$firstlabel)=@_;
298 unless ($self->{'seq'}->valid($label)) {
299 $self->warn("label is not valid");
302 unless (defined ($firstlabel)) {
303 $firstlabel=$self->coordinate_start; # this is inside Transcript obj
305 if ($label == $firstlabel) {
308 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
309 my ($position,$in_pos,$out_pos,$coord_pos);
310 my $length=$self->length;
311 $coord_pos=$self->_inside_position($firstlabel);
312 if ($self->valid($label)) { # if label is inside the Transcript
313 $in_pos=$self->_inside_position($label);
314 $position=$in_pos-$coord_pos+1;
315 if ($position <= 0) {
316 return ($position-1); # accounts for the missing of the 0 position
319 if ($self->follows($end,$label)) { # label after end of transcript
320 $out_pos=$self->{'seq'}->position($label,$end,$strand);
321 #print "\n+++++++++DEBUG label $label FOLLOWS end $end outpos $out_pos coordpos $coord_pos";
322 $position=$out_pos+$length-$coord_pos;
323 } elsif ($self->follows($label,$start)) { # label before begin of transcript
324 #print "\n+++++++++DEBUG label $label BEFORE start $start outpos $out_pos coordpos $coord_pos";
325 $out_pos=$self->{'seq'}->position($label,$start,$strand);
326 $position=$out_pos-$coord_pos+1;
327 } else { # label is in intron (not valid, not after, not before)!
328 $self->warn("Cannot give position of label pointing to intron according to CDS numbering!",1);
338 my @exons=$self->all_Exons();
339 foreach $exon (@exons) {
340 $str .= $exon->seq();
348 my @exons=$self->all_Exons();
349 foreach $exon (@exons) {
350 $length += $exon->length();
358 my @exons=$self->all_Exons();
359 foreach $exon (@exons) {
360 push (@labels,$exon->all_labels());
365 # redefined here so that it will retrieve effective subseq without introns
366 # otherwise it would have retrieved an underlying DNA (possibly with introns)
368 # Drawback: this is really bulky, label->position and then a call to
369 # subseq that will do the opposite position-> label
371 # one day this can be rewritten as the main one so that the normal subseq
372 # will rely on this one and hence avoid this double (useless and lengthy)
373 # conversion between labels and positions
374 sub old_labelsubseq
{
375 my ($self,$start,$length,$end)=@_;
378 unless ($self->valid($start)) {
379 $self->warn("Start label not valid"); return (-1);
381 $pos1=$self->position($start);
384 if ($end == $start) {
387 unless ($self->valid($end)) {
388 $self->warn("End label not valid"); return (-1);
390 unless ($self->follows($start,$end) == 1) {
391 $self->warn("End label does not follow Start label!"); return (-1);
393 $pos2=$self->position($end);
397 return ($self->subseq($pos1,$pos2,$length));
400 # rewritten, eventually
403 my ($self,$start,$length,$end,$unsecuremode)=@_;
404 unless (defined $unsecuremode &&
405 $unsecuremode eq "unsecuremoderequested")
406 { # to skip security checks (faster)
408 unless ($self->valid($start)) {
409 $self->warn("Start label not valid"); return (-1);
415 if ($end == $start) {
419 undef $length; # end argument overrides length argument
420 unless ($self->valid($end)) {
421 $self->warn("End label not valid"); return (-1);
423 unless ($self->follows($start,$end) == 1) {
424 $self->warn("End label does not follow Start label!"); return (-1);
431 my ($seq,$exon,$startexon,$endexon); my @exonlabels;
432 my @exons=$self->all_Exons;
434 foreach $exon (@exons) {
435 if ((!(defined($startexon)))&&($exon->valid($start))) { # checks only if not yet found
438 if ($exon->valid($end)) {
441 if ((!(defined($seq)) && (defined($startexon)))) { # initializes only once
442 if ((defined($endexon)) && ($endexon eq $startexon)) { # then perfect, we are finished
444 $seq = $startexon->labelsubseq($start,$length,undef,"unsecuremoderequested");
449 $seq = $startexon->labelsubseq($start,undef,$end,"unsecuremoderequested");
452 } else { # get up to the end of the exon
453 $seq = $startexon->labelsubseq($start,undef,undef,"unsecuremoderequested");
456 if (($startexon)&&($exon ne $startexon)) {
457 if (defined($endexon)) { # we arrived to the last exon
458 $seq .= $endexon->labelsubseq(undef,undef,$end,"unsecuremoderequested"); # get from the start of the exon
461 } elsif (defined($startexon)) { # we are in a whole-exon-in-the-middle case
462 $seq .= $exon->seq; # we add it completely to the seq
463 } # else, we still have to reach the start point, exon useless, we move on
464 if ($length) { # if length argument specified
465 if (($seq && (CORE
::length($seq) >= $length))) {
472 return (substr($seq,0,$length));
480 # returns: the objref and progressive number of the Exon containing that label
483 my ($self,$label)=@_;
485 my @exons=$self->all_Exons;
486 foreach $exon (@exons) {
487 $count++; # 1st exon is numbered "1"
488 if ($exon->valid($label)) {
489 return ($exon,$count)
492 return (-1); # if nothing found
495 # recoded to exploit the new fast labelsubseq()
496 # valid only inside Transcript
498 my ($self,$pos1,$pos2,$length) = @_;
499 my ($str,$startlabel,$endlabel);
500 if (defined ($pos1)) {
501 if ($pos1 == 0) { # if position = 0 complain
502 $self->warn("Position cannot be 0!"); return (-1);
504 if ((defined ($pos2))&&($pos1>$pos2)) {
505 $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1);
507 $startlabel=$self->label($pos1);
508 unless ($self->valid($startlabel)) {
509 $self->warn("Start label not valid"); return (-1);
511 if ($startlabel < 1) {
512 $self->warn("position $pos1 not valid as start of subseq!"); return (-1);
515 $startlabel=$self->start;
517 if (defined ($pos2)) {
518 if ($pos2 == 0) { # if position = 0 complain
519 $self->warn("Position cannot be 0!"); return (-1);
522 if ((defined ($pos1))&&($pos1>$pos2)) {
523 $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1);
525 $endlabel=$self->label($pos2);
526 unless ($self->valid($endlabel)) {
527 $self->warn("End label not valid"); return (-1);
530 $self->warn("position $pos2 not valid as end of subseq!"); return (-1);
533 unless (defined ($length)) {
534 $endlabel=$self->end;
537 return ($self->labelsubseq($startlabel,$length,$endlabel,"unsecuremoderequested"));
540 # works only inside the transcript, complains if asked outside
542 my ($self,$pos1,$pos2,$length) = @_;
543 my ($str,$startcount,$endcount,$seq,$seqlength);
544 if (defined ($length)) {
546 $self->warn("No sense asking for a subseq of length < 1");
550 my $firstlabel=$self->coordinate_start; # this is inside Transcript obj
551 my $coord_pos=$self->_inside_position($firstlabel); # TESTME old
553 $seqlength=CORE
::length($seq);
554 unless (defined ($pos1)) {
555 $startcount=1+$coord_pos-1; # i.e. coord_pos
557 if ($pos1 == 0) { # if position = 0 complain
558 $self->warn("Position cannot be 0!"); return (-1);
559 } elsif ($pos1 < 0) {
562 if ((defined ($pos2))&&($pos1>$pos2)) {
563 $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!");
566 $startcount=$pos1+$coord_pos-1;
568 unless (defined ($pos2)) {
571 if ($pos2 == 0) { # if position = 0 complain
572 $self->warn("Position cannot be 0!"); return (-1);
573 } elsif ($pos2 < 0) {
576 if ((defined ($pos1))&&($pos1>$pos2)) {
577 $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!");
580 $endcount=$pos2+$coord_pos-1;
581 if ($endcount > $seqlength) {
582 #print "\n###DEBUG###: pos1 $pos1 pos2 $pos2 coordpos $coord_pos endcount $endcount seqln $seqlength\n";
583 $self->warn("Cannot access end position after the end of Transcript");
586 $length=$endcount-$startcount+1;
588 #print "\n###DEBUG pos1 $pos1 pos2 $pos2 start $startcount end $endcount length $length coordpos $coord_pos\n";
589 my $offset=$startcount-1;
591 $self->warn("Cannot access startposition before the beginning of Transcript, returning from start",1); # ignorable
592 return (substr($seq,0,$length));
593 } elsif ($offset >= $seqlength) {
594 $self->warn("Cannot access startposition after the end of Transcript");
597 $str=substr($seq,$offset,$length);
598 if (CORE
::length($str) < $length) {
599 $self->warn("Attention, cannot return the length requested ".
600 "for subseq",1) if $self->verbose > 0; # ignorable
606 # redefined so that it doesn't require other methods (after deletions) to
610 my $exonsref=$self->{'exons'};
611 my @exons=@
{$exonsref};
612 return ($exons[0]->start);
617 my $exonsref=$self->{'exons'};
618 my @exons=@
{$exonsref};
619 return ($exons[-1]->end);
623 # internal methods begin here
625 # returns: position of label in transcript's all_labels
626 # with STARTlabel == 1
627 # errorcode 0 -> label not found
629 sub _inside_position
{
630 my ($self,$label)=@_;
631 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
632 my ($position,$checkme);
633 my @labels=$self->all_labels;
634 foreach $checkme (@labels) {
636 if ($label == $checkme) {
643 # returns 1 OK or 0 ERROR
644 # arguments: reference to array of Exon object references
646 my ($exon,$thisstart);
649 my @exons=@
{$exonsref};
651 my $firstexon = $exons[0];
653 unless (ref($firstexon) eq "Bio::LiveSeq::Exon") {
654 $self->warn("Object not of class Exon");
657 my $strand = $firstexon->strand;
659 my $prevend = $firstexon->end;
660 shift @exons; # skip first one
661 foreach $exon (@exons) {
662 unless (ref($exon) eq "Bio::LiveSeq::Exon") { # object class check
663 $self->warn("Object not of class Exon");
666 if ($exon->strand != $strand) { # strand consistency check
667 $self->warn("Exons' strands not consistent when trying to create Transcript");
670 $thisstart = $exon->start;
671 unless ($exon->{'seq'}->follows($prevend,$thisstart,$strand)) {
672 $self->warn("Exons not in correct order when trying to create Transcript");
675 $prevend = $exon->end;
680 =head2 get_Translation
683 Usage : $translation = $obj->get_Translation()
684 Function: retrieves the reference to the object of class Translation (if any)
685 attached to a LiveSeq object
686 Returns : object reference
691 sub get_Translation
{
693 return ($self->{'translation'}); # this is set when Translation->new is called
696 # this checks so that deletion spanning multiple exons is
697 # handled accordingly and correctly
698 # arguments: begin and end label of a deletion
699 # this is called BEFORE any deletion in the chain
701 my ($self,$startlabel,$endlabel)=@_;
702 my $exonsref=$self->{'exons'};
703 my @exons=@
{$exonsref};
704 my ($startexon,$endexon,$exon);
705 $startexon=$endexon=0;
706 foreach $exon (@exons) {
707 if (($startexon == 0)&&($exon->valid($startlabel))) {
708 $startexon=$exon; # exon containing start of deletion
710 if (($endexon == 0)&&($exon->valid($endlabel))) {
711 $endexon=$exon; # exon containing end of deletion
713 if (($startexon)&&($endexon)) {
714 last; # don't check further
717 my $nextend=$self->label(2,$endlabel); # retrieve the next label
718 my $prevstart=$self->label(-1,$startlabel); # retrieve the prev label
720 if ($startexon eq $endexon) { # intra-exon deletion
721 if (($startexon->start eq $startlabel) && ($startexon->end eq $endlabel)) {
722 # let's delete the entire exon
724 foreach $exon (@exons) {
725 unless ($exon eq $startexon) {
726 push(@newexons,$exon);
729 $self->{'exons'}=\
@newexons;
730 } elsif ($startexon->start eq $startlabel) { # special cases
731 $startexon->{'start'}=$nextend; # set a new start of exon
732 } elsif ($startexon->end eq $endlabel) {
733 $startexon->{'end'}=$prevstart; # set a new end of exon
737 } else { # two new exons to be created, inter-exons deletion
740 my $dna=$self->{'seq'};
741 my $strand=$self->strand;
742 my $notmiddle=1; # flag for skipping exons in the middle of deletion
743 foreach $exon (@exons) {
744 if ($exon eq $startexon) {
745 $exonobj=Bio
::LiveSeq
::Exon
->new('-seq'=>$dna,'-start'=>$exon->start,'-end'=>$prevstart,'-strand'=>$strand); # new partial exon
746 push(@newexons,$exonobj);
747 $notmiddle=0; # now we enter totally deleted exons
748 } elsif ($exon eq $endexon) {
749 $exonobj=Bio
::LiveSeq
::Exon
->new('-seq'=>$dna,'-start'=>$nextend,'-end'=>$exon->end,'-strand'=>$strand); # new partial exon
750 push(@newexons,$exonobj);
751 $notmiddle=1; # exiting totally deleted exons
753 if ($notmiddle) { # if before or after exons with deletion
754 push(@newexons,$exon);
758 $self->{'exons'}=\
@newexons;
762 =head2 translation_table
764 Title : translation_table
765 Usage : $name = $obj->translation_table;
766 : $name = $obj->translation_table(11);
767 Function: Returns or sets the translation_table used for translating the
769 If it has never been set, it will return undef.
774 sub translation_table
{
775 my ($self,$value) = @_;
776 if (defined $value) {
777 $self->{'translation_table'} = $value;
779 unless (exists $self->{'translation_table'}) {
782 return $self->{'translation_table'};
789 Usage : $frame = $transcript->frame($label);
790 Function: Returns the frame of a particular nucleotide.
791 Frame can be 0 1 or 2 and means the position in the codon triplet
792 of the particulat nucleotide. 0 is the first codon_position.
793 Codon_position (1 2 3) is simply frame+1.
794 If the label asked for is not inside the Transcript, -1 will be
803 # returns: frame of nucleotide (0 1 2)
806 my ($self,$inputlabel)=@_;
807 my @labels=$self->all_labels;
808 my ($label,$frame,$count);
809 foreach $label (@labels) {
810 if ($inputlabel == $label) {
813 $count++; # 0 1 2 3 4....
815 return (-1); # label not found amid Transcript labels