see if we can make PAUSE skip indexing these modules
[bioperl-live.git] / Bio / PrimarySeqI.pm
blob84253e76a8a293dba3d7c1b2016db317c0bc6f1e
2 # BioPerl module for Bio::PrimarySeqI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
18 =head1 SYNOPSIS
20 # Bio::PrimarySeqI is the interface class for sequences.
21 # If you are a newcomer to bioperl, you might want to start with
22 # Bio::Seq documentation.
24 # Test if this is a seq object
25 $obj->isa("Bio::PrimarySeqI") ||
26 $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
28 # Accessors
29 $string = $obj->seq();
30 $substring = $obj->subseq(12,50);
31 $display = $obj->display_id(); # for human display
32 $id = $obj->primary_id(); # unique id for this object,
33 # implementation defined
34 $unique_key= $obj->accession_number(); # unique biological id
37 # Object manipulation
38 eval {
39 $rev = $obj->revcom();
41 if( $@ ) {
42 $obj->throw("Could not reverse complement. ".
43 "Probably not DNA. Actual exception\n$@\n");
46 $trunc = $obj->trunc(12,50);
47 # $rev and $trunc are Bio::PrimarySeqI compliant objects
50 =head1 DESCRIPTION
52 This object defines an abstract interface to basic sequence
53 information - for most users of the package the documentation (and
54 methods) in this class are not useful - this is a developers-only
55 class which defines what methods have to be implmented by other Perl
56 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
57 Bio::Seq" or "man Bio::Seq" for more information on the main class for
58 sequences.
60 PrimarySeq is an object just for the sequence and its name(s), nothing
61 more. Seq is the larger object complete with features. There is a pure
62 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
63 use L<Bio::PrimarySeq> objects, then please read that module first. This
64 module defines the interface, and is of more interest to people who
65 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
66 they "are" bioperl sequence objects, even though it is not using Perl
67 to store the sequence etc.
69 This interface defines what bioperl considers necessary to "be" a
70 sequence, without providing an implementation of this, an
71 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
72 a Bio::PrimarySeq-compliant object which in fact wraps another
73 object/database/out-of-perl experience, then this is the correct thing
74 to wrap, generally by providing a wrapper class which would inherit
75 from your object and this Bio::PrimarySeqI interface. The wrapper class
76 then would have methods lists in the "Implementation Specific
77 Functions" which would provide these methods for your object.
79 =head1 FEEDBACK
81 =head2 Mailing Lists
83 User feedback is an integral part of the evolution of this and other
84 Bioperl modules. Send your comments and suggestions preferably to one
85 of the Bioperl mailing lists. Your participation is much appreciated.
87 bioperl-l@bioperl.org - General discussion
88 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
90 =head2 Support
92 Please direct usage questions or support issues to the mailing list:
94 I<bioperl-l@bioperl.org>
96 rather than to the module maintainer directly. Many experienced and
97 reponsive experts will be able look at the problem and quickly
98 address it. Please include a thorough description of the problem
99 with code and data examples if at all possible.
101 =head2 Reporting Bugs
103 Report bugs to the Bioperl bug tracking system to help us keep track
104 the bugs and their resolution. Bug reports can be submitted via the
105 web:
107 https://redmine.open-bio.org/projects/bioperl/
109 =head1 AUTHOR - Ewan Birney
111 Email birney@ebi.ac.uk
113 =head1 APPENDIX
115 The rest of the documentation details each of the object
116 methods. Internal methods are usually preceded with a _
118 =cut
121 # Let the code begin...
124 package Bio::PrimarySeqI;
125 use strict;
126 use Bio::Tools::CodonTable;
128 use base qw(Bio::Root::RootI);
130 =head1 Implementation Specific Functions
132 These functions are the ones that a specific implementation must
133 define.
135 =head2 seq
137 Title : seq
138 Usage : $string = $obj->seq()
139 Function: Returns the sequence as a string of letters. The
140 case of the letters is left up to the implementer.
141 Suggested cases are upper case for proteins and lower case for
142 DNA sequence (IUPAC standard), but implementations are suggested to
143 keep an open mind about case (some users... want mixed case!)
144 Returns : A scalar
145 Status : Virtual
147 =cut
149 sub seq {
150 my ($self) = @_;
151 $self->throw_not_implemented();
154 =head2 subseq
156 Title : subseq
157 Usage : $substring = $obj->subseq(10,40);
158 Function: Returns the subseq from start to end, where the first base
159 is 1 and the number is inclusive, i.e. 1-2 are the first two
160 bases of the sequence.
162 Start cannot be larger than end but can be equal.
164 Returns : A string
165 Args :
166 Status : Virtual
168 =cut
170 sub subseq{
171 my ($self) = @_;
172 $self->throw_not_implemented();
175 =head2 display_id
177 Title : display_id
178 Usage : $id_string = $obj->display_id();
179 Function: Returns the display id, also known as the common name of the Sequence
180 object.
182 The semantics of this is that it is the most likely string
183 to be used as an identifier of the sequence, and likely to
184 have "human" readability. The id is equivalent to the ID
185 field of the GenBank/EMBL databanks and the id field of the
186 Swissprot/sptrembl database. In fasta format, the >(\S+) is
187 presumed to be the id, though some people overload the id
188 to embed other information. Bioperl does not use any
189 embedded information in the ID field, and people are
190 encouraged to use other mechanisms (accession field for
191 example, or extending the sequence object) to solve this.
193 Notice that $seq->id() maps to this function, mainly for
194 legacy/convenience reasons.
195 Returns : A string
196 Args : None
197 Status : Virtual
200 =cut
202 sub display_id {
203 my ($self) = @_;
204 $self->throw_not_implemented();
208 =head2 accession_number
210 Title : accession_number
211 Usage : $unique_biological_key = $obj->accession_number;
212 Function: Returns the unique biological id for a sequence, commonly
213 called the accession_number. For sequences from established
214 databases, the implementors should try to use the correct
215 accession number. Notice that primary_id() provides the
216 unique id for the implemetation, allowing multiple objects
217 to have the same accession number in a particular implementation.
219 For sequences with no accession number, this method should return
220 "unknown".
221 Returns : A string
222 Args : None
223 Status : Virtual
226 =cut
228 sub accession_number {
229 my ($self,@args) = @_;
230 $self->throw_not_implemented();
235 =head2 primary_id
237 Title : primary_id
238 Usage : $unique_implementation_key = $obj->primary_id;
239 Function: Returns the unique id for this object in this
240 implementation. This allows implementations to manage their
241 own object ids in a way the implementaiton can control
242 clients can expect one id to map to one object.
244 For sequences with no accession number, this method should
245 return a stringified memory location.
247 Returns : A string
248 Args : None
249 Status : Virtual
252 =cut
254 sub primary_id {
255 my ($self,@args) = @_;
256 $self->throw_not_implemented();
260 =head2 can_call_new
262 Title : can_call_new
263 Usage : if( $obj->can_call_new ) {
264 $newobj = $obj->new( %param );
266 Function: Can_call_new returns 1 or 0 depending
267 on whether an implementation allows new
268 constructor to be called. If a new constructor
269 is allowed, then it should take the followed hashed
270 constructor list.
272 $myobject->new( -seq => $sequence_as_string,
273 -display_id => $id
274 -accession_number => $accession
275 -alphabet => 'dna',
277 Returns : 1 or 0
278 Args :
281 =cut
283 sub can_call_new{
284 my ($self,@args) = @_;
286 # we default to 0 here
288 return 0;
291 =head2 alphabet
293 Title : alphabet
294 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
295 Function: Returns the type of sequence being one of
296 'dna', 'rna' or 'protein'. This is case sensitive.
298 This is not called "type" because this would cause
299 upgrade problems from the 0.5 and earlier Seq objects.
301 Returns : A string either 'dna','rna','protein'. NB - the object must
302 make a call of the alphabet, if there is no alphabet specified it
303 has to guess.
304 Args : None
305 Status : Virtual
308 =cut
310 sub alphabet{
311 my ( $self ) = @_;
312 $self->throw_not_implemented();
315 =head2 moltype
317 Title : moltype
318 Usage : Deprecated. Use alphabet() instead.
320 =cut
322 sub moltype{
323 my ($self,@args) = @_;
325 $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
326 $self->alphabet(@args);
330 =head1 Optional Implementation Functions
332 The following functions rely on the above functions. An
333 implementing class does not need to provide these functions, as they
334 will be provided by this class, but is free to override these
335 functions.
337 The revcom(), trunc(), and translate() methods create new sequence
338 objects. They will call new() on the class of the sequence object
339 instance passed as argument, unless can_call_new() returns FALSE. In
340 the latter case a Bio::PrimarySeq object will be created. Implementors
341 which really want to control how objects are created (eg, for object
342 persistence over a database, or objects in a CORBA framework), they
343 are encouraged to override these methods
345 =head2 revcom
347 Title : revcom
348 Usage : $rev = $seq->revcom()
349 Function: Produces a new Bio::PrimarySeqI implementing object which
350 is the reversed complement of the sequence. For protein
351 sequences this throws an exception of "Sequence is a
352 protein. Cannot revcom".
354 The id is the same id as the original sequence, and the
355 accession number is also indentical. If someone wants to
356 track that this sequence has be reversed, it needs to
357 define its own extensionsj.
359 To do an inplace edit of an object you can go:
361 $seq = $seq->revcom();
363 This of course, causes Perl to handle the garbage
364 collection of the old object, but it is roughly speaking as
365 efficient as an inplace edit.
367 Returns : A new (fresh) Bio::PrimarySeqI object
368 Args : None
371 =cut
373 sub revcom{
374 my ($self) = @_;
377 # check the type is good first.
378 my $t = $self->alphabet;
380 if( $t eq 'protein' ) {
381 $self->throw("Sequence is a protein. Cannot revcom");
384 if( $t ne 'dna' && $t ne 'rna' ) {
385 if( $self->can('warn') ) {
386 $self->warn("Sequence is not dna or rna, but [$t]. ".
387 "Attempting to revcom, but unsure if this is right");
388 } else {
389 warn("[$self] Sequence is not dna or rna, but [$t]. ".
390 "Attempting to revcom, but unsure if this is right");
394 # yank out the sequence string
396 my $str = $self->seq();
398 # if is RNA - map to DNA then map back
400 if( $t eq 'rna' ) {
401 $str =~ tr/uU/tT/;
404 # revcom etc...
406 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
407 my $revseq = CORE::reverse $str;
409 if( $t eq 'rna' ) {
410 $revseq =~ tr/tT/uU/;
413 my $seqclass;
414 if($self->can_call_new()) {
415 $seqclass = ref($self);
416 } else {
417 $seqclass = 'Bio::PrimarySeq';
418 $self->_attempt_to_load_Seq();
420 my $out = $seqclass->new( '-seq' => $revseq,
421 '-is_circular' => $self->is_circular,
422 '-display_id' => $self->display_id,
423 '-accession_number' => $self->accession_number,
424 '-alphabet' => $self->alphabet,
425 '-desc' => $self->desc(),
426 '-verbose' => $self->verbose
428 return $out;
432 =head2 trunc
434 Title : trunc
435 Usage : $subseq = $myseq->trunc(10,100);
436 Function: Provides a truncation of a sequence.
437 Returns : A fresh Bio::PrimarySeqI implementing object.
438 Args : Two integers denoting first and last base of the sub-sequence.
441 =cut
443 sub trunc{
444 my ($self,$start,$end) = @_;
446 my $str;
447 if( defined $start && ref($start) &&
448 $start->isa('Bio::LocationI') ) {
449 $str = $self->subseq($start); # start is a location actually
450 } elsif( !$end ) {
451 $self->throw("trunc start,end -- there was no end for $start");
452 } elsif( $end < $start ) {
453 my $msg = "start [$start] is greater than end [$end]. \n".
454 "If you want to truncated and reverse complement, \n".
455 "you must call trunc followed by revcom. Sorry.";
456 $self->throw($msg);
457 } else {
458 $str = $self->subseq($start,$end);
461 my $seqclass;
462 if($self->can_call_new()) {
463 $seqclass = ref($self);
464 } else {
465 $seqclass = 'Bio::PrimarySeq';
466 $self->_attempt_to_load_Seq();
469 my $out = $seqclass->new( '-seq' => $str,
470 '-display_id' => $self->display_id,
471 '-accession_number' => $self->accession_number,
472 '-alphabet' => $self->alphabet,
473 '-desc' => $self->desc(),
474 '-verbose' => $self->verbose
476 return $out;
480 =head2 translate
482 Title : translate
483 Usage : $protein_seq_obj = $dna_seq_obj->translate
485 Or if you expect a complete coding sequence (CDS) translation,
486 with initiator at the beginning and terminator at the end:
488 $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
490 Or if you want translate() to find the first initiation
491 codon and return the corresponding protein:
493 $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
495 Function: Provides the translation of the DNA sequence using full
496 IUPAC ambiguities in DNA/RNA and amino acid codes.
498 The complete CDS translation is identical to EMBL/TREMBL
499 database translation. Note that the trailing terminator
500 character is removed before returning the translated protein
501 object.
503 Note: if you set $dna_seq_obj->verbose(1) you will get a
504 warning if the first codon is not a valid initiator.
506 Returns : A Bio::PrimarySeqI implementing object
507 Args : -terminator
508 character for terminator, default '*'
509 -unknown
510 character for unknown, default 'X'
511 -frame
512 positive integer frame shift (in bases), default 0
513 -codontable_id
514 integer codon table id, default 1
515 -complete
516 boolean, if true, complete CDS is expected. default false
517 -complete_codons
518 boolean, if true, codons which are incomplete are translated if a
519 suitable amino acid is found. For instance, if the incomplete
520 codon is 'GG', the completed codon is 'GGN', which is glycine
521 (G). Defaults to 'false'; setting '-complete' also makes this
522 true.
523 -throw
524 boolean, throw exception if ORF not complete, default false
525 -orf
526 if 'longest', find longest ORF. other true value, find
527 first ORF. default 0
528 -codontable
529 optional L<Bio::Tools::CodonTable> object to use for
530 translation
531 -start
532 optional three-character string to force as initiation
533 codon (e.g. 'atg'). If unset, start codons are
534 determined by the CodonTable. Case insensitive.
535 -offset
536 optional positive integer offset for fuzzy locations.
537 if set, must be either 1, 2, or 3
539 =head3 Notes
541 The -start argument only applies when -orf is set to 1. By default all
542 initiation codons found in the given codon table are used but when
543 "start" is set to some codon this codon will be used exclusively as
544 the initiation codon. Note that the default codon table (NCBI
545 "Standard") has 3 initiation codons!
547 By default translate() translates termination codons to the some
548 character (default is *), both internal and trailing codons. Setting
549 "-complete" to 1 tells translate() to remove the trailing character.
551 -offset is used for seqfeatures which contain the the \codon_start tag
552 and can be set to 1, 2, or 3. This is the offset by which the
553 sequence translation starts relative to the first base of the feature
555 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
557 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
558 element in an array:
560 1: character for terminator (optional), defaults to '*'.
561 2: character for unknown amino acid (optional), defaults to 'X'.
562 3: frame (optional), valid values are 0, 1, 2, defaults to 0.
563 4: codon table id (optional), defaults to 1.
564 5: complete coding sequence expected, defaults to 0 (false).
565 6: boolean, throw exception if not complete coding sequence
566 (true), defaults to warning (false)
567 7: codontable, a custom Bio::Tools::CodonTable object (optional).
569 =cut
571 sub translate {
572 my ($self,@args) = @_;
573 my ($terminator, $unknown, $frame, $codonTableId, $complete,
574 $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
576 ## new API with named parameters, post 1.5.1
577 if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
578 ($terminator, $unknown, $frame, $codonTableId, $complete,
579 $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
580 $self->_rearrange([qw(TERMINATOR
581 UNKNOWN
582 FRAME
583 CODONTABLE_ID
584 COMPLETE
585 COMPLETE_CODONS
586 THROW
587 CODONTABLE
589 START
590 OFFSET)], @args);
591 ## old API, 1.5.1 and preceding versions
592 } else {
593 ($terminator, $unknown, $frame, $codonTableId,
594 $complete, $throw, $codonTable, $offset) = @args;
597 ## Initialize termination codon, unknown codon, codon table id, frame
598 $terminator = '*' unless (defined($terminator) and $terminator ne '');
599 $unknown = "X" unless (defined($unknown) and $unknown ne '');
600 $frame = 0 unless (defined($frame) and $frame ne '');
601 $codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne '');
602 $complete_codons ||= $complete || 0;
604 ## Get a CodonTable, error if custom CodonTable is invalid
605 if ($codonTable) {
606 $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
607 unless $codonTable->isa('Bio::Tools::CodonTable');
608 } else {
610 # shouldn't this be cached? Seems wasteful to have a new instance
611 # every time...
612 $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
615 ## Error if alphabet is "protein"
616 $self->throw("Can't translate an amino acid sequence.") if
617 ($self->alphabet =~ /protein/i);
619 ## Error if -start parameter isn't a valid codon
620 if ($start_codon) {
621 $self->throw("Invalid start codon: $start_codon.") if
622 ( $start_codon !~ /^[A-Z]{3}$/i );
625 my $seq;
627 if ($offset) {
628 $self->throw("Offset must be 1, 2, or 3.") if
629 ( $offset !~ /^[123]$/ );
630 my ($start, $end) = ($offset, $self->length);
631 ($seq) = $self->subseq($start, $end);
632 } else {
633 ($seq) = $self->seq();
636 ## ignore frame if an ORF is supposed to be found
637 if ( $orf ) {
638 my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
639 $seq = $self->_orf_sequence( $seq, $orf_region );
640 } else {
641 ## use frame, error if frame is not 0, 1 or 2
642 $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
643 unless ($frame == 0 or $frame == 1 or $frame == 2);
644 $seq = substr($seq,$frame);
647 ## Translate it
648 my $output = $codonTable->translate($seq, $complete_codons);
649 # Use user-input terminator/unknown
650 $output =~ s/\*/$terminator/g;
651 $output =~ s/X/$unknown/g;
653 ## Only if we are expecting to translate a complete coding region
654 if ($complete) {
655 my $id = $self->display_id;
656 # remove the terminator character
657 if( substr($output,-1,1) eq $terminator ) {
658 chop $output;
659 } else {
660 $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
661 $self->warn("Seq [$id]: Not using a valid terminator codon!");
663 # test if there are terminator characters inside the protein sequence!
664 if ($output =~ /\Q$terminator\E/) {
665 $id ||= '';
666 $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
667 $self->warn("Seq [$id]: Terminator codon inside CDS!");
669 # if the initiator codon is not ATG, the amino acid needs to be changed to M
670 if ( substr($output,0,1) ne 'M' ) {
671 if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
672 $output = 'M'. substr($output,1);
673 } elsif ($throw) {
674 $self->throw("Seq [$id]: Not using a valid initiator codon!");
675 } else {
676 $self->warn("Seq [$id]: Not using a valid initiator codon!");
681 my $seqclass;
682 if ($self->can_call_new()) {
683 $seqclass = ref($self);
684 } else {
685 $seqclass = 'Bio::PrimarySeq';
686 $self->_attempt_to_load_Seq();
688 my $out = $seqclass->new( '-seq' => $output,
689 '-display_id' => $self->display_id,
690 '-accession_number' => $self->accession_number,
691 # is there anything wrong with retaining the
692 # description?
693 '-desc' => $self->desc(),
694 '-alphabet' => 'protein',
695 '-verbose' => $self->verbose
697 return $out;
700 =head2 transcribe()
702 Title : transcribe
703 Usage : $xseq = $seq->transcribe
704 Function: Convert base T to base U
705 Returns : PrimarySeqI object of alphabet 'rna' or
706 undef if $seq->alphabet ne 'dna'
707 Args :
709 =cut
711 sub transcribe {
712 my $self = shift;
713 return unless $self->alphabet eq 'dna';
714 my $s = $self->seq;
715 $s =~ tr/tT/uU/;
716 my $class;
717 if ($self->can_call_new) {
718 $class = ref($self);
719 } else {
720 $class = 'Bio::PrimarySeq';
721 $self->_attempt_to_load_Seq;
723 my $desc = $self->desc || '';
724 return $class->new(
725 '-seq' => $s,
726 '-alphabet' => 'rna',
727 '-display_id' => $self->display_id,
728 '-accession_number' => $self->accession_number,
729 '-desc' => "${desc}[TRANSCRIBED]",
730 '-verbose' => $self->verbose
734 =head2 rev_transcribe()
736 Title : rev_transcribe
737 Usage : $rtseq = $seq->rev_transcribe
738 Function: Convert base U to base T
739 Returns : PrimarySeqI object of alphabet 'dna' or
740 undef if $seq->alphabet ne 'rna'
741 Args :
743 =cut
745 sub rev_transcribe {
746 my $self = shift;
747 return unless $self->alphabet eq 'rna';
748 my $s = $self->seq;
749 $s =~ tr/uU/tT/;
750 my $class;
751 if ($self->can_call_new) {
752 $class = ref($self);
753 } else {
754 $class = 'Bio::PrimarySeq';
755 $self->_attempt_to_load_Seq;
757 return $class->new(
758 '-seq' => $s,
759 '-alphabet' => 'dna',
760 '-display_id' => $self->display_id,
761 '-accession_number' => $self->accession_number,
762 '-desc' => $self->desc . "[REVERSE TRANSCRIBED]",
763 '-verbose' => $self->verbose
767 =head2 id
769 Title : id
770 Usage : $id = $seq->id()
771 Function: ID of the sequence. This should normally be (and actually is in
772 the implementation provided here) just a synonym for display_id().
773 Returns : A string.
774 Args :
776 =cut
778 sub id {
779 my ($self)= @_;
781 return $self->display_id();
785 =head2 length
787 Title : length
788 Usage : $len = $seq->length()
789 Function:
790 Returns : Integer representing the length of the sequence.
791 Args :
793 =cut
795 sub length {
796 my ($self)= @_;
797 $self->throw_not_implemented();
800 =head2 desc
802 Title : desc
803 Usage : $seq->desc($newval);
804 $description = $seq->desc();
805 Function: Get/set description text for a seq object
806 Returns : Value of desc
807 Args : newvalue (optional)
809 =cut
811 sub desc {
812 shift->throw_not_implemented();
816 =head2 is_circular
818 Title : is_circular
819 Usage : if( $obj->is_circular) { /Do Something/ }
820 Function: Returns true if the molecule is circular
821 Returns : Boolean value
822 Args : none
824 =cut
826 sub is_circular{
827 shift->throw_not_implemented;
830 =head1 Private functions
832 These are some private functions for the PrimarySeqI interface. You do not
833 need to implement these functions
835 =head2 _find_orfs_nucleotide
837 Title : _find_orfs_nucleotide
838 Usage :
839 Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
840 The ORF is not required to have a termination codon.
841 Example :
842 Returns : a list of string coordinates of ORF locations (0-based half-open),
843 sorted descending by length (so that the longest is first)
844 as: [ start, end, frame, length ], [ start, end, frame, length ], ...
845 Args : Nucleotide sequence,
846 CodonTable object,
847 (optional) alternative initiation codon (e.g. 'ATA'),
848 (optional) boolean that, if true, stops after finding the
849 first available ORF
851 =cut
853 sub _find_orfs_nucleotide {
854 my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
855 $sequence = uc $sequence;
856 $start_codon = uc $start_codon if $start_codon;
858 my $is_start = $start_codon
859 ? sub { shift eq $start_codon }
860 : sub { $codon_table->is_start_codon( shift ) };
862 # stores the begin index of the currently-running ORF in each
863 # reading frame
864 my @current_orf_start = (-1,-1,-1);
866 #< stores coordinates of longest observed orf (so far) in each
867 # reading frame
868 my @orfs;
870 # go through each base of the sequence, and each reading frame for each base
871 my $seqlen = CORE::length $sequence;
872 for( my $j = 0; $j <= $seqlen-3; $j++ ) {
873 my $frame = $j % 3;
875 my $this_codon = substr( $sequence, $j, 3 );
877 # if in an orf and this is either a stop codon or the last in-frame codon in the string
878 if ( $current_orf_start[$frame] >= 0 ) {
879 if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
880 # record ORF start, end (half-open), length, and frame
881 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
882 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
884 $self->warn( "Translating partial ORF "
885 .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
886 .' from end of nucleotide sequence'
888 if $first_only && $is_last_codon_in_frame;
890 return \@this_orf if $first_only;
891 push @orfs, \@this_orf;
892 $current_orf_start[$frame] = -1;
895 # if this is a start codon
896 elsif ( $is_start->($this_codon) ) {
897 $current_orf_start[$frame] = $j;
901 return sort { $b->[2] <=> $a->[2] } @orfs;
904 sub _truncate_seq {
905 my ($self,$seq) = @_;
906 return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
908 sub _orf_sequence {
909 my ($self, $seq, $orf ) = @_;
910 return '' unless $orf;
911 return substr( $seq, $orf->[0], $orf->[2] )
914 =head2 _attempt_to_load_Seq
916 Title : _attempt_to_load_Seq
917 Usage :
918 Function:
919 Example :
920 Returns :
921 Args :
924 =cut
926 sub _attempt_to_load_Seq{
927 my ($self) = @_;
929 if( $main::{'Bio::PrimarySeq'} ) {
930 return 1;
931 } else {
932 eval {
933 require Bio::PrimarySeq;
935 if( $@ ) {
936 my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
937 "This indicates that you are using Bio::PrimarySeqI ".
938 "without Bio::PrimarySeq loaded or without providing a ".
939 "complete implementation.\nThe most likely problem is that there ".
940 "has been a misconfiguration of the bioperl environment\n".
941 "Actual exception:\n\n";
942 $self->throw("$text$@\n");
943 return 0;
945 return 1;