Merge branch 'master' of git@github.com:bioperl/bioperl-live
[bioperl-live.git] / Bio / PrimarySeqI.pm
blob4844d8cb0cec7bc94c63619aef418ea1a43a0d47
1 # $Id$
3 # BioPerl module for Bio::PrimarySeqI
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@ebi.ac.uk>
9 # Copyright Ewan Birney
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
19 =head1 SYNOPSIS
21 # Bio::PrimarySeqI is the interface class for sequences.
22 # If you are a newcomer to bioperl, you might want to start with
23 # Bio::Seq documentation.
25 # Test if this is a seq object
26 $obj->isa("Bio::PrimarySeqI") ||
27 $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
29 # Accessors
30 $string = $obj->seq();
31 $substring = $obj->subseq(12,50);
32 $display = $obj->display_id(); # for human display
33 $id = $obj->primary_id(); # unique id for this object,
34 # implementation defined
35 $unique_key= $obj->accession_number(); # unique biological id
38 # Object manipulation
39 eval {
40 $rev = $obj->revcom();
42 if( $@ ) {
43 $obj->throw("Could not reverse complement. ".
44 "Probably not DNA. Actual exception\n$@\n");
47 $trunc = $obj->trunc(12,50);
48 # $rev and $trunc are Bio::PrimarySeqI compliant objects
51 =head1 DESCRIPTION
53 This object defines an abstract interface to basic sequence
54 information - for most users of the package the documentation (and
55 methods) in this class are not useful - this is a developers-only
56 class which defines what methods have to be implmented by other Perl
57 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
58 Bio::Seq" or "man Bio::Seq" for more information on the main class for
59 sequences.
61 PrimarySeq is an object just for the sequence and its name(s), nothing
62 more. Seq is the larger object complete with features. There is a pure
63 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
64 use L<Bio::PrimarySeq> objects, then please read that module first. This
65 module defines the interface, and is of more interest to people who
66 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
67 they "are" bioperl sequence objects, even though it is not using Perl
68 to store the sequence etc.
70 This interface defines what bioperl considers necessary to "be" a
71 sequence, without providing an implementation of this, an
72 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
73 a Bio::PrimarySeq-compliant object which in fact wraps another
74 object/database/out-of-perl experience, then this is the correct thing
75 to wrap, generally by providing a wrapper class which would inherit
76 from your object and this Bio::PrimarySeqI interface. The wrapper class
77 then would have methods lists in the "Implementation Specific
78 Functions" which would provide these methods for your object.
80 =head1 FEEDBACK
82 =head2 Mailing Lists
84 User feedback is an integral part of the evolution of this and other
85 Bioperl modules. Send your comments and suggestions preferably to one
86 of the Bioperl mailing lists. Your participation is much appreciated.
88 bioperl-l@bioperl.org - General discussion
89 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
91 =head2 Support
93 Please direct usage questions or support issues to the mailing list:
95 I<bioperl-l@bioperl.org>
97 rather than to the module maintainer directly. Many experienced and
98 reponsive experts will be able look at the problem and quickly
99 address it. Please include a thorough description of the problem
100 with code and data examples if at all possible.
102 =head2 Reporting Bugs
104 Report bugs to the Bioperl bug tracking system to help us keep track
105 the bugs and their resolution. Bug reports can be submitted via the
106 web:
108 http://bugzilla.open-bio.org/
110 =head1 AUTHOR - Ewan Birney
112 Email birney@ebi.ac.uk
114 =head1 APPENDIX
116 The rest of the documentation details each of the object
117 methods. Internal methods are usually preceded with a _
119 =cut
122 # Let the code begin...
125 package Bio::PrimarySeqI;
126 use strict;
127 use Bio::Tools::CodonTable;
129 use base qw(Bio::Root::RootI);
131 =head1 Implementation Specific Functions
133 These functions are the ones that a specific implementation must
134 define.
136 =head2 seq
138 Title : seq
139 Usage : $string = $obj->seq()
140 Function: Returns the sequence as a string of letters. The
141 case of the letters is left up to the implementer.
142 Suggested cases are upper case for proteins and lower case for
143 DNA sequence (IUPAC standard), but implementations are suggested to
144 keep an open mind about case (some users... want mixed case!)
145 Returns : A scalar
146 Status : Virtual
148 =cut
150 sub seq {
151 my ($self) = @_;
152 $self->throw_not_implemented();
155 =head2 subseq
157 Title : subseq
158 Usage : $substring = $obj->subseq(10,40);
159 Function: Returns the subseq from start to end, where the first base
160 is 1 and the number is inclusive, i.e. 1-2 are the first two
161 bases of the sequence.
163 Start cannot be larger than end but can be equal.
165 Returns : A string
166 Args :
167 Status : Virtual
169 =cut
171 sub subseq{
172 my ($self) = @_;
173 $self->throw_not_implemented();
176 =head2 display_id
178 Title : display_id
179 Usage : $id_string = $obj->display_id();
180 Function: Returns the display id, also known as the common name of the Sequence
181 object.
183 The semantics of this is that it is the most likely string
184 to be used as an identifier of the sequence, and likely to
185 have "human" readability. The id is equivalent to the ID
186 field of the GenBank/EMBL databanks and the id field of the
187 Swissprot/sptrembl database. In fasta format, the >(\S+) is
188 presumed to be the id, though some people overload the id
189 to embed other information. Bioperl does not use any
190 embedded information in the ID field, and people are
191 encouraged to use other mechanisms (accession field for
192 example, or extending the sequence object) to solve this.
194 Notice that $seq->id() maps to this function, mainly for
195 legacy/convenience reasons.
196 Returns : A string
197 Args : None
198 Status : Virtual
201 =cut
203 sub display_id {
204 my ($self) = @_;
205 $self->throw_not_implemented();
209 =head2 accession_number
211 Title : accession_number
212 Usage : $unique_biological_key = $obj->accession_number;
213 Function: Returns the unique biological id for a sequence, commonly
214 called the accession_number. For sequences from established
215 databases, the implementors should try to use the correct
216 accession number. Notice that primary_id() provides the
217 unique id for the implemetation, allowing multiple objects
218 to have the same accession number in a particular implementation.
220 For sequences with no accession number, this method should return
221 "unknown".
222 Returns : A string
223 Args : None
224 Status : Virtual
227 =cut
229 sub accession_number {
230 my ($self,@args) = @_;
231 $self->throw_not_implemented();
236 =head2 primary_id
238 Title : primary_id
239 Usage : $unique_implementation_key = $obj->primary_id;
240 Function: Returns the unique id for this object in this
241 implementation. This allows implementations to manage their
242 own object ids in a way the implementaiton can control
243 clients can expect one id to map to one object.
245 For sequences with no accession number, this method should
246 return a stringified memory location.
248 Returns : A string
249 Args : None
250 Status : Virtual
253 =cut
255 sub primary_id {
256 my ($self,@args) = @_;
257 $self->throw_not_implemented();
261 =head2 can_call_new
263 Title : can_call_new
264 Usage : if( $obj->can_call_new ) {
265 $newobj = $obj->new( %param );
267 Function: Can_call_new returns 1 or 0 depending
268 on whether an implementation allows new
269 constructor to be called. If a new constructor
270 is allowed, then it should take the followed hashed
271 constructor list.
273 $myobject->new( -seq => $sequence_as_string,
274 -display_id => $id
275 -accession_number => $accession
276 -alphabet => 'dna',
278 Returns : 1 or 0
279 Args :
282 =cut
284 sub can_call_new{
285 my ($self,@args) = @_;
287 # we default to 0 here
289 return 0;
292 =head2 alphabet
294 Title : alphabet
295 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
296 Function: Returns the type of sequence being one of
297 'dna', 'rna' or 'protein'. This is case sensitive.
299 This is not called "type" because this would cause
300 upgrade problems from the 0.5 and earlier Seq objects.
302 Returns : A string either 'dna','rna','protein'. NB - the object must
303 make a call of the alphabet, if there is no alphabet specified it
304 has to guess.
305 Args : None
306 Status : Virtual
309 =cut
311 sub alphabet{
312 my ( $self ) = @_;
313 $self->throw_not_implemented();
316 =head2 moltype
318 Title : moltype
319 Usage : Deprecated. Use alphabet() instead.
321 =cut
323 sub moltype{
324 my ($self,@args) = @_;
326 $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
327 $self->alphabet(@args);
331 =head1 Optional Implementation Functions
333 The following functions rely on the above functions. An
334 implementing class does not need to provide these functions, as they
335 will be provided by this class, but is free to override these
336 functions.
338 The revcom(), trunc(), and translate() methods create new sequence
339 objects. They will call new() on the class of the sequence object
340 instance passed as argument, unless can_call_new() returns FALSE. In
341 the latter case a Bio::PrimarySeq object will be created. Implementors
342 which really want to control how objects are created (eg, for object
343 persistence over a database, or objects in a CORBA framework), they
344 are encouraged to override these methods
346 =head2 revcom
348 Title : revcom
349 Usage : $rev = $seq->revcom()
350 Function: Produces a new Bio::PrimarySeqI implementing object which
351 is the reversed complement of the sequence. For protein
352 sequences this throws an exception of "Sequence is a
353 protein. Cannot revcom".
355 The id is the same id as the original sequence, and the
356 accession number is also indentical. If someone wants to
357 track that this sequence has be reversed, it needs to
358 define its own extensionsj.
360 To do an inplace edit of an object you can go:
362 $seq = $seq->revcom();
364 This of course, causes Perl to handle the garbage
365 collection of the old object, but it is roughly speaking as
366 efficient as an inplace edit.
368 Returns : A new (fresh) Bio::PrimarySeqI object
369 Args : None
372 =cut
374 sub revcom{
375 my ($self) = @_;
378 # check the type is good first.
379 my $t = $self->alphabet;
381 if( $t eq 'protein' ) {
382 $self->throw("Sequence is a protein. Cannot revcom");
385 if( $t ne 'dna' && $t ne 'rna' ) {
386 if( $self->can('warn') ) {
387 $self->warn("Sequence is not dna or rna, but [$t]. ".
388 "Attempting to revcom, but unsure if this is right");
389 } else {
390 warn("[$self] Sequence is not dna or rna, but [$t]. ".
391 "Attempting to revcom, but unsure if this is right");
395 # yank out the sequence string
397 my $str = $self->seq();
399 # if is RNA - map to DNA then map back
401 if( $t eq 'rna' ) {
402 $str =~ tr/uU/tT/;
405 # revcom etc...
407 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
408 my $revseq = CORE::reverse $str;
410 if( $t eq 'rna' ) {
411 $revseq =~ tr/tT/uU/;
414 my $seqclass;
415 if($self->can_call_new()) {
416 $seqclass = ref($self);
417 } else {
418 $seqclass = 'Bio::PrimarySeq';
419 $self->_attempt_to_load_Seq();
421 my $out = $seqclass->new( '-seq' => $revseq,
422 '-is_circular' => $self->is_circular,
423 '-display_id' => $self->display_id,
424 '-accession_number' => $self->accession_number,
425 '-alphabet' => $self->alphabet,
426 '-desc' => $self->desc(),
427 '-verbose' => $self->verbose
429 return $out;
433 =head2 trunc
435 Title : trunc
436 Usage : $subseq = $myseq->trunc(10,100);
437 Function: Provides a truncation of a sequence.
438 Returns : A fresh Bio::PrimarySeqI implementing object.
439 Args : Two integers denoting first and last base of the sub-sequence.
442 =cut
444 sub trunc{
445 my ($self,$start,$end) = @_;
447 my $str;
448 if( defined $start && ref($start) &&
449 $start->isa('Bio::LocationI') ) {
450 $str = $self->subseq($start); # start is a location actually
451 } elsif( !$end ) {
452 $self->throw("trunc start,end -- there was no end for $start");
453 } elsif( $end < $start ) {
454 my $msg = "start [$start] is greater than end [$end]. \n".
455 "If you want to truncated and reverse complement, \n".
456 "you must call trunc followed by revcom. Sorry.";
457 $self->throw($msg);
458 } else {
459 $str = $self->subseq($start,$end);
462 my $seqclass;
463 if($self->can_call_new()) {
464 $seqclass = ref($self);
465 } else {
466 $seqclass = 'Bio::PrimarySeq';
467 $self->_attempt_to_load_Seq();
470 my $out = $seqclass->new( '-seq' => $str,
471 '-display_id' => $self->display_id,
472 '-accession_number' => $self->accession_number,
473 '-alphabet' => $self->alphabet,
474 '-desc' => $self->desc(),
475 '-verbose' => $self->verbose
477 return $out;
481 =head2 translate
483 Title : translate
484 Usage : $protein_seq_obj = $dna_seq_obj->translate
486 Or if you expect a complete coding sequence (CDS) translation,
487 with inititator at the beginning and terminator at the end:
489 $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
491 Or if you want translate() to find the first initiation
492 codon and return the corresponding protein:
494 $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
496 Function: Provides the translation of the DNA sequence using full
497 IUPAC ambiguities in DNA/RNA and amino acid codes.
499 The complete CDS translation is identical to EMBL/TREMBL
500 database translation. Note that the trailing terminator
501 character is removed before returning the translated protein
502 object.
504 Note: if you set $dna_seq_obj->verbose(1) you will get a
505 warning if the first codon is not a valid initiator.
507 Returns : A Bio::PrimarySeqI implementing object
508 Args : -terminator
509 character for terminator, default '*'
510 -unknown
511 character for unknown, default 'X'
512 -frame
513 positive integer frame shift (in bases), default 0
514 -codontable_id
515 integer codon table id, default 1
516 -complete
517 boolean, if true, complete CDS is expected. default false
518 -throw
519 boolean, throw exception if ORF not complete, default false
520 -orf
521 if 'longest', find longest ORF. other true value, find
522 first ORF. default 0
523 -codontable
524 optional L<Bio::Tools::CodonTable> object to use for
525 translation
526 -start
527 optional three-character string to force as initiation
528 codon (e.g. 'atg'). If unset, start codons are
529 determined by the CodonTable. Case insensitive.
530 -offset
531 optional positive integer offset for fuzzy locations.
532 if set, must be either 1, 2, or 3
534 =head3 Notes
536 The -start argument only applies when -orf is set to 1. By default all
537 initiation codons found in the given codon table are used but when
538 "start" is set to some codon this codon will be used exclusively as
539 the initiation codon. Note that the default codon table (NCBI
540 "Standard") has 3 initiation codons!
542 By default translate() translates termination codons to the some
543 character (default is *), both internal and trailing codons. Setting
544 "-complete" to 1 tells translate() to remove the trailing character.
546 -offset is used for seqfeatures which contain the the \codon_start tag
547 and can be set to 1, 2, or 3. This is the offset by which the
548 sequence translation starts relative to the first base of the feature
550 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
552 Deprecated argument set (v. 1.5.1 and prior versions)
553 where each argument is an element in an array:
555 1: character for terminator (optional), defaults to '*'.
556 2: character for unknown amino acid (optional), defaults to 'X'.
557 3: frame (optional), valid values are 0, 1, 2, defaults to 0.
558 4: codon table id (optional), defaults to 1.
559 5: complete coding sequence expected, defaults to 0 (false).
560 6: boolean, throw exception if not complete coding sequence
561 (true), defaults to warning (false)
562 7: codontable, a custom Bio::Tools::CodonTable object (optional).
564 =cut
566 sub translate {
567 my ($self,@args) = @_;
568 my ($terminator, $unknown, $frame, $codonTableId, $complete, $throw,
569 $codonTable, $orf, $start_codon, $offset);
571 ## new API with named parameters, post 1.5.1
572 if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
573 ($terminator, $unknown, $frame, $codonTableId, $complete, $throw,
574 $codonTable, $orf, $start_codon, $offset) =
575 $self->_rearrange([qw(TERMINATOR
576 UNKNOWN
577 FRAME
578 CODONTABLE_ID
579 COMPLETE
580 THROW
581 CODONTABLE
583 START
584 OFFSET)], @args);
585 ## old API, 1.5.1 and preceding versions
586 } else {
587 ($terminator, $unknown, $frame, $codonTableId,
588 $complete, $throw, $codonTable, $offset) = @args;
591 ## Initialize termination codon, unknown codon, codon table id, frame
592 $terminator = '*' unless (defined($terminator) and $terminator ne '');
593 $unknown = "X" unless (defined($unknown) and $unknown ne '');
594 $frame = 0 unless (defined($frame) and $frame ne '');
595 $codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne '');
597 ## Get a CodonTable, error if custom CodonTable is invalid
598 if ($codonTable) {
599 $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
600 unless $codonTable->isa('Bio::Tools::CodonTable');
601 } else {
602 $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
605 ## Error if alphabet is "protein"
606 $self->throw("Can't translate an amino acid sequence.") if
607 ($self->alphabet =~ /protein/i);
609 ## Error if -start parameter isn't a valid codon
610 if ($start_codon) {
611 $self->throw("Invalid start codon: $start_codon.") if
612 ( $start_codon !~ /^[A-Z]{3}$/i );
615 my $seq;
617 if ($offset) {
618 $self->throw("Offset must be 1, 2, or 3.") if
619 ( $offset !~ /^[123]$/ );
620 my ($start, $end) = ($offset, $self->length);
621 ($seq) = $self->subseq($start, $end);
622 } else {
623 ($seq) = $self->seq();
626 ## ignore frame if an ORF is supposed to be found
627 if ( $orf ) {
628 my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
629 $seq = $self->_orf_sequence( $seq, $orf_region );
630 } else {
631 ## use frame, error if frame is not 0, 1 or 2
632 $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
633 unless ($frame == 0 or $frame == 1 or $frame == 2);
634 $seq = substr($seq,$frame);
637 ## Translate it
638 my $output = $codonTable->translate($seq);
639 # Use user-input terminator/unknown
640 $output =~ s/\*/$terminator/g;
641 $output =~ s/X/$unknown/g;
643 ## Only if we are expecting to translate a complete coding region
644 if ($complete) {
645 my $id = $self->display_id;
646 # remove the terminator character
647 if( substr($output,-1,1) eq $terminator ) {
648 chop $output;
649 } else {
650 $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
651 $self->warn("Seq [$id]: Not using a valid terminator codon!");
653 # test if there are terminator characters inside the protein sequence!
654 if ($output =~ /\Q$terminator\E/) {
655 $id ||= '';
656 $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
657 $self->warn("Seq [$id]: Terminator codon inside CDS!");
659 # if the initiator codon is not ATG, the amino acid needs to be changed to M
660 if ( substr($output,0,1) ne 'M' ) {
661 if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
662 $output = 'M'. substr($output,1);
663 } elsif ($throw) {
664 $self->throw("Seq [$id]: Not using a valid initiator codon!");
665 } else {
666 $self->warn("Seq [$id]: Not using a valid initiator codon!");
671 my $seqclass;
672 if ($self->can_call_new()) {
673 $seqclass = ref($self);
674 } else {
675 $seqclass = 'Bio::PrimarySeq';
676 $self->_attempt_to_load_Seq();
678 my $out = $seqclass->new( '-seq' => $output,
679 '-display_id' => $self->display_id,
680 '-accession_number' => $self->accession_number,
681 # is there anything wrong with retaining the
682 # description?
683 '-desc' => $self->desc(),
684 '-alphabet' => 'protein',
685 '-verbose' => $self->verbose
687 return $out;
690 =head2 transcribe()
692 Title : transcribe
693 Usage : $xseq = $seq->transcribe
694 Function: Convert base T to base U
695 Returns : PrimarySeqI object of alphabet 'rna' or
696 undef if $seq->alphabet ne 'dna'
697 Args :
699 =cut
701 sub transcribe {
702 my $self = shift;
703 return unless $self->alphabet eq 'dna';
704 my $s = $self->seq;
705 $s =~ tr/tT/uU/;
706 my $class;
707 if ($self->can_call_new) {
708 $class = ref($self);
709 } else {
710 $class = 'Bio::PrimarySeq';
711 $self->_attempt_to_load_Seq;
713 my $desc = $self->desc || '';
714 return $class->new(
715 '-seq' => $s,
716 '-alphabet' => 'rna',
717 '-display_id' => $self->display_id,
718 '-accession_number' => $self->accession_number,
719 '-desc' => "${desc}[TRANSCRIBED]",
720 '-verbose' => $self->verbose
724 =head2 rev_transcribe()
726 Title : rev_transcribe
727 Usage : $rtseq = $seq->rev_transcribe
728 Function: Convert base U to base T
729 Returns : PrimarySeqI object of alphabet 'dna' or
730 undef if $seq->alphabet ne 'rna'
731 Args :
733 =cut
735 sub rev_transcribe {
736 my $self = shift;
737 return unless $self->alphabet eq 'rna';
738 my $s = $self->seq;
739 $s =~ tr/uU/tT/;
740 my $class;
741 if ($self->can_call_new) {
742 $class = ref($self);
743 } else {
744 $class = 'Bio::PrimarySeq';
745 $self->_attempt_to_load_Seq;
747 return $class->new(
748 '-seq' => $s,
749 '-alphabet' => 'dna',
750 '-display_id' => $self->display_id,
751 '-accession_number' => $self->accession_number,
752 '-desc' => $self->desc . "[REVERSE TRANSCRIBED]",
753 '-verbose' => $self->verbose
757 =head2 id
759 Title : id
760 Usage : $id = $seq->id()
761 Function: ID of the sequence. This should normally be (and actually is in
762 the implementation provided here) just a synonym for display_id().
763 Returns : A string.
764 Args :
766 =cut
768 sub id {
769 my ($self)= @_;
771 return $self->display_id();
775 =head2 length
777 Title : length
778 Usage : $len = $seq->length()
779 Function:
780 Returns : Integer representing the length of the sequence.
781 Args :
783 =cut
785 sub length {
786 my ($self)= @_;
787 $self->throw_not_implemented();
790 =head2 desc
792 Title : desc
793 Usage : $seq->desc($newval);
794 $description = $seq->desc();
795 Function: Get/set description text for a seq object
796 Returns : Value of desc
797 Args : newvalue (optional)
799 =cut
801 sub desc {
802 shift->throw_not_implemented();
806 =head2 is_circular
808 Title : is_circular
809 Usage : if( $obj->is_circular) { /Do Something/ }
810 Function: Returns true if the molecule is circular
811 Returns : Boolean value
812 Args : none
814 =cut
816 sub is_circular{
817 shift->throw_not_implemented;
820 =head1 Private functions
822 These are some private functions for the PrimarySeqI interface. You do not
823 need to implement these functions
825 =head2 _find_orfs_nucleotide
827 Title : _find_orfs_nucleotide
828 Usage :
829 Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
830 The ORF is not required to have a termination codon.
831 Example :
832 Returns : a list of string coordinates of ORF locations (0-based half-open),
833 sorted descending by length (so that the longest is first)
834 as: [ start, end, frame, length ], [ start, end, frame, length ], ...
835 Args : Nucleotide sequence,
836 CodonTable object,
837 (optional) alternative initiation codon (e.g. 'ATA'),
838 (optional) boolean that, if true, stops after finding the
839 first available ORF
841 =cut
843 sub _find_orfs_nucleotide {
844 my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
845 $sequence = uc $sequence;
846 $start_codon = uc $start_codon if $start_codon;
848 my $is_start = $start_codon
849 ? sub { shift eq $start_codon }
850 : sub { $codon_table->is_start_codon( shift ) };
852 # stores the begin index of the currently-running ORF in each
853 # reading frame
854 my @current_orf_start = (-1,-1,-1);
856 #< stores coordinates of longest observed orf (so far) in each
857 # reading frame
858 my @orfs;
860 # go through each base of the sequence, and each reading frame for each base
861 my $seqlen = CORE::length $sequence;
862 for( my $j = 0; $j <= $seqlen-3; $j++ ) {
863 my $frame = $j % 3;
865 my $this_codon = substr( $sequence, $j, 3 );
867 # if in an orf and this is either a stop codon or the last in-frame codon in the string
868 if ( $current_orf_start[$frame] >= 0 ) {
869 if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
870 # record ORF start, end (half-open), length, and frame
871 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
872 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
874 $self->warn( "Translating partial ORF "
875 .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
876 .' from end of nucleotide sequence'
878 if $first_only && $is_last_codon_in_frame;
880 return \@this_orf if $first_only;
881 push @orfs, \@this_orf;
882 $current_orf_start[$frame] = -1;
885 # if this is a start codon
886 elsif ( $is_start->($this_codon) ) {
887 $current_orf_start[$frame] = $j;
891 return sort { $b->[2] <=> $a->[2] } @orfs;
894 sub _truncate_seq {
895 my ($self,$seq) = @_;
896 return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
898 sub _orf_sequence {
899 my ($self, $seq, $orf ) = @_;
900 return '' unless $orf;
901 return substr( $seq, $orf->[0], $orf->[2] )
904 =head2 _attempt_to_load_Seq
906 Title : _attempt_to_load_Seq
907 Usage :
908 Function:
909 Example :
910 Returns :
911 Args :
914 =cut
916 sub _attempt_to_load_Seq{
917 my ($self) = @_;
919 if( $main::{'Bio::PrimarySeq'} ) {
920 return 1;
921 } else {
922 eval {
923 require Bio::PrimarySeq;
925 if( $@ ) {
926 my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
927 "This indicates that you are using Bio::PrimarySeqI ".
928 "without Bio::PrimarySeq loaded or without providing a ".
929 "complete implementation.\nThe most likely problem is that there ".
930 "has been a misconfiguration of the bioperl environment\n".
931 "Actual exception:\n\n";
932 $self->throw("$text$@\n");
933 return 0;
935 return 1;