3 # bioperl module for Bio::Tools::CodonTable
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
9 # Copyright Heikki Lehvaslaiho
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::CodonTable - Codon table object
21 # This is a read-only class for all known codon tables. The IDs are
22 # the ones used by nucleotide sequence databases. All common IUPAC
23 # ambiguity codes for DNA, RNA and amino acids are recognized.
25 use Bio::Tools::CodonTable;
27 # defaults to ID 1 "Standard"
28 $myCodonTable = Bio::Tools::CodonTable->new();
29 $myCodonTable2 = Bio::Tools::CodonTable->new( -id => 3 );
35 print join (' ', "The name of the codon table no.", $myCodonTable->id(4),
36 "is:", $myCodonTable->name(), "\n");
38 # print possible codon tables
39 $tables = Bio::Tools::CodonTable->tables;
40 while ( ($id,$name) = each %{$tables} ) {
41 print "$id = $name\n";
45 $aa = $myCodonTable->translate('ACU');
46 $aa = $myCodonTable->translate('act');
47 $aa = $myCodonTable->translate('ytr');
49 # reverse translate an amino acid
50 @codons = $myCodonTable->revtranslate('A');
51 @codons = $myCodonTable->revtranslate('Ser');
52 @codons = $myCodonTable->revtranslate('Glx');
53 @codons = $myCodonTable->revtranslate('cYS', 'rna');
55 # reverse translate an entire amino acid sequence into a IUPAC
58 my $seqobj = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
59 my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
62 print "Is a start\n" if $myCodonTable->is_start_codon('ATG');
63 print "Is a terminator\n" if $myCodonTable->is_ter_codon('tar');
64 print "Is a unknown\n" if $myCodonTable->is_unknown_codon('JTG');
68 Codon tables are also called translation tables or genetic codes
69 since that is what they represent. A bit more complete picture
70 of the full complexity of codon usage in various taxonomic groups
71 is presented at the NCBI Genetic Codes Home page.
73 CodonTable is a BioPerl class that knows all current translation
74 tables that are used by primary nucleotide sequence databases
75 (GenBank, EMBL and DDBJ). It provides methods to output information
76 about tables and relationships between codons and amino acids.
78 This class and its methods recognized all common IUPAC ambiguity codes
79 for DNA, RNA and animo acids. The translation method follows the
80 conventions in EMBL and TREMBL databases.
82 It is a nuisance to separate RNA and cDNA representations of nucleic
83 acid transcripts. The CodonTable object accepts codons of both type as
84 input and allows the user to set the mode for output when reverse
85 translating. Its default for output is DNA.
89 This class deals primarily with individual codons and amino
90 acids. However in the interest of speed you can L<translate>
91 longer sequence, too. The full complexity of protein translation
92 is tackled by L<Bio::PrimarySeqI::translate>.
95 The amino acid codes are IUPAC recommendations for common amino acids:
112 O Pyl Pyrrolysine (22nd amino acid)
113 U Sec Selenocysteine (21st amino acid)
119 B Asx Aspartic acid or Asparagine
120 Z Glx Glutamine or Glutamic acid
121 J Xle Isoleucine or Valine (mass spec ambiguity)
122 X Xaa Any or unknown amino acid
125 It is worth noting that, "Bacterial" codon table no. 11 produces an
126 polypeptide that is, confusingly, identical to the standard one. The
127 only differences are in available initiator codons.
130 NCBI Genetic Codes home page:
131 http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
133 EBI Translation Table Viewer:
134 http://www.ebi.ac.uk/cgi-bin/mutations/trtables.cgi
136 Amended ASN.1 version with ids 16 and 21 is at:
137 ftp://ftp.ebi.ac.uk/pub/databases/geneticcode/
139 Thanks to Matteo diTomasso for the original Perl implementation
146 User feedback is an integral part of the evolution of this and other
147 Bioperl modules. Send your comments and suggestions preferably to the
148 Bioperl mailing lists Your participation is much appreciated.
150 bioperl-l@bioperl.org - General discussion
151 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
155 Please direct usage questions or support issues to the mailing list:
157 L<bioperl-l@bioperl.org>
159 rather than to the module maintainer directly. Many experienced and
160 reponsive experts will be able look at the problem and quickly
161 address it. Please include a thorough description of the problem
162 with code and data examples if at all possible.
164 =head2 Reporting Bugs
166 Report bugs to the Bioperl bug tracking system to help us keep track
167 the bugs and their resolution. Bug reports can be submitted via the
170 http://bugzilla.open-bio.org/
172 =head1 AUTHOR - Heikki Lehvaslaiho
174 Email: heikki-at-bioperl-dot-org
178 The rest of the documentation details each of the object
179 methods. Internal methods are usually preceded with a _
184 # Let the code begin...
186 package Bio
::Tools
::CodonTable
;
187 use vars
qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
188 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
191 # Object preamble - inherits from Bio::Root::Root
192 use Bio::Tools::IUPAC;
195 use base qw(Bio::Root::Root);
198 # first set internal values for all translation tables
201 use constant CODONSIZE
=> 3;
203 $CODONGAP = $GAP x CODONSIZE
;
208 'Vertebrate Mitochondrial',#2
209 'Yeast Mitochondrial',# 3
210 'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',#4
211 'Invertebrate Mitochondrial',#5
212 'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
214 'Echinoderm Mitochondrial',#9
215 'Euplotid Nuclear',#10
217 'Alternative Yeast Nuclear',# 12
218 'Ascidian Mitochondrial',# 13
219 'Flatworm Mitochondrial',# 14
220 'Blepharisma Nuclear',# 15
221 'Chlorophycean Mitochondrial',# 16
223 'Trematode Mitochondrial',# 21
224 'Scenedesmus obliquus Mitochondrial', #22
225 'Thraustochytrium Mitochondrial' #23
230 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
231 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
232 FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
233 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
234 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
235 FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
237 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
238 FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
239 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
240 FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
241 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
242 FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
243 FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
244 FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
246 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
247 FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
248 FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
254 ---M---------------M---------------M----------------------------
255 --------------------------------MMMM---------------M------------
256 ----------------------------------MM----------------------------
257 --MM---------------M------------MMMM---------------M------------
258 ---M----------------------------MMMM---------------M------------
259 -----------------------------------M----------------------------
261 -----------------------------------M----------------------------
262 -----------------------------------M----------------------------
263 ---M---------------M------------MMMM---------------M------------
264 -------------------M---------------M----------------------------
265 -----------------------------------M----------------------------
266 -----------------------------------M----------------------------
267 -----------------------------------M----------------------------
268 -----------------------------------M----------------------------
270 -----------------------------------M---------------M------------
271 -----------------------------------M----------------------------
272 --------------------------------M--M---------------M------------
275 my @nucs = qw(t c a g);
277 ($CODONS, $TRCOL) = ({}, {});
281 my $codon = "$i$j$k";
282 $CODONS->{$codon} = $x;
283 $TRCOL->{$x} = $codon;
288 %IUPAC_DNA = Bio
::Tools
::IUPAC
->iupac_iub();
289 %IUPAC_AA = Bio
::Tools
::IUPAC
->iupac_iup();
290 %THREELETTERSYMBOLS = Bio
::SeqUtils
->valid_aa(2);
291 $VALID_PROTEIN = '['.join('',Bio
::SeqUtils
->valid_aa(0)).']';
296 my($class,@args) = @_;
297 my $self = $class->SUPER::new
(@args);
300 $self->_rearrange([qw(ID
304 $id = 1 if ( ! $id );
305 $id && $self->id($id);
306 return $self; # success - we hope!
312 Usage : $obj->id(3); $id_integer = $obj->id();
313 Function: Sets or returns the id of the translation table. IDs are
314 integers from 1 to 15, excluding 7 and 8 which have been
315 removed as redundant. If an invalid ID is given the method
318 Returns : value of id, a scalar, 0 if not a valid
319 Args : newvalue (optional)
324 my ($self,$value) = @_;
325 if( defined $value) {
326 if ( !(defined $TABLES[$value-1]) or $TABLES[$value-1] eq '') {
327 $self->warn("Not a valid codon table ID [$value] ");
330 $self->{'id'} = $value;
332 return $self->{'id'};
339 Function: returns the descriptive name of the translation table
350 my ($id) = $self->{'id'};
351 return $NAMES[$id-1];
357 Usage : $obj->tables() or Bio::Tools::CodonTable->tables()
358 Function: returns a hash reference where each key is a valid codon
359 table id() number, and each value is the corresponding
360 codon table name() string
370 for my $id (1 .. @NAMES) {
371 my $name = $NAMES[$id-1];
372 $tables{$id} = $name if $name;
380 Usage : $obj->translate('YTR')
381 Function: Returns a string of one letter amino acid codes from
382 nucleotide sequence input. The imput can be of any length.
384 Returns 'X' for unknown codons and codons that code for
385 more than one amino acid. Returns an empty string if input
386 is not three characters long. Exceptions for these are:
388 - IUPAC amino acid code B for Aspartic Acid and
390 - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
392 - if the codon is two nucleotides long and if by adding
393 an a third character 'N', it codes for a single amino
394 acid (with exceptions above), return that, otherwise
397 Returns empty string for other input strings that are not
398 three characters long.
401 Returns : a string of one letter ambiguous IUPAC amino acid codes
402 Args : ambiguous IUPAC nucleotide string
408 my ($self, $seq) = @_;
409 $self->throw("Calling translate without a seq argument!") unless defined $seq;
410 return '' unless $seq;
414 $partial = 2 if length($seq) % CODONSIZE
== 2;
419 if ($seq =~ /[^actg]/ ) { #ambiguous chars
420 for (my $i = 0; $i < (length($seq) - (CODONSIZE
-1)); $i+= CODONSIZE
) {
421 my $triplet = substr($seq, $i, CODONSIZE
);
422 if( $triplet eq $CODONGAP ) {
424 } elsif (exists $CODONS->{$triplet}) {
425 $protein .= substr($TABLES[$id-1],
426 $CODONS->{$triplet},1);
428 $protein .= $self->_translate_ambiguous_codon($triplet);
431 } else { # simple, strict translation
432 for (my $i = 0; $i < (length($seq) - (CODONSIZE
-1)); $i+=CODONSIZE
) {
433 my $triplet = substr($seq, $i, CODONSIZE
);
434 if( $triplet eq $CODONGAP ) {
436 } if (exists $CODONS->{$triplet}) {
437 $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1);
443 if ($partial == 2) { # 2 overhanging nucleotides
444 my $triplet = substr($seq, ($partial -4)). "n";
445 if( $triplet eq $CODONGAP ) {
447 } elsif (exists $CODONS->{$triplet}) {
448 my $aa = substr($TABLES[$id-1], $CODONS->{$triplet},1);
451 $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
457 sub _translate_ambiguous_codon
{
458 my ($self, $triplet, $partial) = @_;
462 my @codons = _unambiquous_codons
($triplet);
464 foreach my $codon (@codons) {
465 $aas{substr($TABLES[$id-1],$CODONS->{$codon},1)} = 1;
467 my $count = scalar keys %aas;
469 $aa = (keys %aas)[0];
471 elsif ( $count == 2 ) {
472 if ($aas{'D'} and $aas{'N'}) {
475 elsif ($aas{'E'} and $aas{'Q'}) {
478 $partial ?
($aa = '') : ($aa = 'X');
481 $partial ?
($aa = '') : ($aa = 'X');
486 =head2 translate_strict
488 Title : translate_strict
489 Usage : $obj->translate_strict('ACT')
490 Function: returns one letter amino acid code for a codon input
492 Fast and simple translation. User is responsible to resolve
493 ambiguous nucleotide codes before calling this
494 method. Returns 'X' for unknown codons and an empty string
495 for input strings that are not three characters long.
497 It is not recommended to use this method in a production
498 environment. Use method translate, instead.
502 Args : a codon = a three nucleotide character string
507 sub translate_strict
{
508 my ($self, $value) = @_;
509 my ($id) = $self->{'id'};
514 if (length $value != 3 ) {
517 elsif (!(defined $CODONS->{$value})) {
521 return substr($TABLES[$id-1],$CODONS->{$value},1);
528 Usage : $obj->revtranslate('G')
529 Function: returns codons for an amino acid
531 Returns an empty string for unknown amino acid
532 codes. Ambiquous IUPAC codes Asx,B, (Asp,D; Asn,N) and
533 Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
534 letter amino acid codes are accepted. '*' and 'Ter' are
537 By default, the output codons are shown in DNA. If the
538 output is needed in RNA (tr/t/u/), add a second argument
541 Example : $obj->revtranslate('Gly', 'RNA')
542 Returns : An array of three lower case letter strings i.e. codons
543 Args : amino acid, 'RNA'
548 my ($self, $value, $coding) = @_;
549 my ($id) = $self->{'id'};
553 if (length($value) == 3 ) {
555 $value = ucfirst $value;
556 $value = $THREELETTERSYMBOLS{$value};
558 if ( defined $value and $value =~ /$VALID_PROTEIN/
559 and length($value) == 1 ) {
561 @aas = @
{$IUPAC_AA{$value}};
562 foreach my $aa (@aas) {
564 $aa = '\*' if $aa eq '*';
565 while ($TABLES[$id-1] =~ m/$aa/g) {
566 $p = pos $TABLES[$id-1];
567 push (@codons, $TRCOL->{--$p});
572 if ($coding and uc ($coding) eq 'RNA') {
573 for my $i (0..$#codons) {
574 $codons[$i] =~ tr/t/u/;
581 =head2 reverse_translate_all
583 Title : reverse_translate_all
584 Usage : my $iup_str = $cttable->reverse_translate_all($seq_object)
585 my $iup_str = $cttable->reverse_translate_all($seq_object,
588 Function: reverse translates a protein sequence into IUPAC nucleotide
589 sequence. An 'X' in the protein sequence is converted to 'NNN'
590 in the nucleotide sequence.
592 Args : a Bio::PrimarySeqI compatible object (mandatory)
593 a Bio::CodonUsage::Table object and a threshold if only
594 codons with a relative frequency above the threshold are
598 sub reverse_translate_all
{
600 my ($self, $obj, $cut, $threshold) = @_;
604 if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
605 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
608 if($obj->alphabet ne 'protein') {
609 $self->throw("Cannot reverse translate, need an amino acid sequence .".
610 "This sequence is of type [" . $obj->alphabet ."]");
613 my @seq = split '', $obj->seq;
615 ## if we're not supplying a codon usage table...
616 if( !$cut && !$threshold) {
617 ## get lists of possible codons for each aa.
620 push @data, (['NNN']);
622 my @cods = $self->revtranslate($aa);
627 #else we are supplying a codon usage table, we just want common codons
629 if(!$cut->isa('Bio::CodonUsage::Table')) {
630 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
633 my $cod_ref = $cut->probable_codons($threshold);
636 push @data, (['NNN']);
639 push @data, $cod_ref->{$aa};
643 return $self->_make_iupac_string(\
@data);
647 =head2 reverse_translate_best
649 Title : reverse_translate_best
650 Usage : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
651 Function: Reverse translates a protein sequence into plain nucleotide
652 sequence (GATC), uses the most common codon for each amino acid
654 Args : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
658 sub reverse_translate_best
{
660 my ($self, $obj, $cut) = @_;
662 if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
663 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
666 if ($obj->alphabet ne 'protein') {
667 $self->throw("Cannot reverse translate, need an amino acid sequence .".
668 "This sequence is of type [" . $obj->alphabet ."]");
670 if ( !$cut | !$cut->isa('Bio::CodonUsage::Table')) {
671 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
676 my @seq = split '', $obj->seq;
678 my $cod_ref = $cut->most_common_codons();
680 for my $aa ( @seq ) {
685 if ( defined $cod_ref->{$aa} ) {
686 $str .= $cod_ref->{$aa};
688 $self->throw("Input sequence contains invalid character: $aa");
694 =head2 is_start_codon
696 Title : is_start_codon
697 Usage : $obj->is_start_codon('ATG')
698 Function: returns true (1) for all codons that can be used as a
699 translation start, false (0) for others.
700 Example : $myCodonTable->is_start_codon('ATG')
707 my ($self, $value) = @_;
708 my ($id) = $self->{'id'};
713 if (length $value != 3 ) {
718 my @ms = map { substr($STARTS[$id-1],$CODONS->{$_},1) } _unambiquous_codons
($value);
719 foreach my $c (@ms) {
720 $result = 0 if $c ne 'M';
731 Usage : $obj->is_ter_codon('GAA')
732 Function: returns true (1) for all codons that can be used as a
733 translation tarminator, false (0) for others.
734 Example : $myCodonTable->is_ter_codon('ATG')
741 my ($self, $value) = @_;
742 my ($id) = $self->{'id'};
747 if (length $value != 3 ) {
752 my @ms = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons
($value);
753 foreach my $c (@ms) {
754 $result = 0 if $c ne $TERMINATOR;
760 =head2 is_unknown_codon
762 Title : is_unknown_codon
763 Usage : $obj->is_unknown_codon('GAJ')
764 Function: returns false (0) for all codons that are valid,
766 Example : $myCodonTable->is_unknown_codon('NTG')
773 sub is_unknown_codon
{
774 my ($self, $value) = @_;
775 my ($id) = $self->{'id'};
780 if (length $value != 3 ) {
785 my @cs = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons
($value);
786 $result = 1 if scalar @cs == 0;
791 =head2 _unambiquous_codons
793 Title : _unambiquous_codons
794 Usage : @codons = _unambiquous_codons('ACN')
797 Returns : array of strings (one letter unambiguous amino acid codes)
798 Args : a codon = a three IUPAC nucleotide character string
802 sub _unambiquous_codons
{
807 @nts = map { $IUPAC_DNA{uc $_} } split(//, $value);
808 for my $i (@
{$nts[0]}) {
809 for my $j (@
{$nts[1]}) {
810 for my $k (@
{$nts[2]}) {
811 push @codons, lc "$i$j$k";
821 Usage : $newid = $ct->add_table($name, $table, $starts)
822 Function: Add a custom Codon Table into the object.
823 Know what you are doing, only the length of
824 the argument strings is checked!
825 Returns : the id of the new codon table
826 Args : name, a string, optional (can be empty)
827 table, a string of 64 characters
828 startcodons, a string of 64 characters, defaults to standard
833 my ($self, $name, $table, $starts) = @_;
835 $name ||= 'Custom'. scalar @NAMES + 1;
836 $starts ||= $STARTS[0];
837 $self->throw('Suspect input!')
838 unless length($table) == 64 and length($starts) == 64;
841 push @TABLES, $table;
842 push @STARTS, $starts;
844 return scalar @NAMES;
848 sub _make_iupac_string
{
850 my ($self, $cod_ref) = @_;
851 if(ref($cod_ref) ne 'ARRAY') {
852 $self->throw(" I need a reference to a list of references to codons, ".
853 " not a [". ref($cod_ref) . "].");
855 my %iupac_hash = Bio
::Tools
::IUPAC
->iupac_rev_iub();
856 my $iupac_string = ''; ## the string to be returned
857 for my $aa (@
$cod_ref) {
859 ## scan through codon positions, record the differing values,
860 # then look up in the iub hash
861 for my $index(0..2) {
863 map { my $k = substr($_,$index,1);
864 $h{$k} = undef;} @
$aa;
865 my $lookup_key = join '', sort{$a cmp $b}keys %h;
868 $iupac_string .= $iupac_hash{uc$lookup_key};
871 return $iupac_string;