3 # bioperl module for Bio::Tools::CodonTable
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Heikki Lehvaslaiho
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Tools::CodonTable - Codon table object
19 # This is a read-only class for all known codon tables. The IDs are
20 # the ones used by nucleotide sequence databases. All common IUPAC
21 # ambiguity codes for DNA, RNA and amino acids are recognized.
23 use Bio::Tools::CodonTable;
25 # defaults to ID 1 "Standard"
26 $myCodonTable = Bio::Tools::CodonTable->new();
27 $myCodonTable2 = Bio::Tools::CodonTable->new( -id => 3 );
33 print join (' ', "The name of the codon table no.", $myCodonTable->id(4),
34 "is:", $myCodonTable->name(), "\n");
36 # print possible codon tables
37 $tables = Bio::Tools::CodonTable->tables;
38 while ( ($id,$name) = each %{$tables} ) {
39 print "$id = $name\n";
43 $aa = $myCodonTable->translate('ACU');
44 $aa = $myCodonTable->translate('act');
45 $aa = $myCodonTable->translate('ytr');
47 # reverse translate an amino acid
48 @codons = $myCodonTable->revtranslate('A');
49 @codons = $myCodonTable->revtranslate('Ser');
50 @codons = $myCodonTable->revtranslate('Glx');
51 @codons = $myCodonTable->revtranslate('cYS', 'rna');
53 # reverse translate an entire amino acid sequence into a IUPAC
56 my $seqobj = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
57 my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
60 print "Is a start\n" if $myCodonTable->is_start_codon('ATG');
61 print "Is a terminator\n" if $myCodonTable->is_ter_codon('tar');
62 print "Is a unknown\n" if $myCodonTable->is_unknown_codon('JTG');
66 Codon tables are also called translation tables or genetic codes
67 since that is what they represent. A bit more complete picture
68 of the full complexity of codon usage in various taxonomic groups
69 is presented at the NCBI Genetic Codes Home page.
71 CodonTable is a BioPerl class that knows all current translation
72 tables that are used by primary nucleotide sequence databases
73 (GenBank, EMBL and DDBJ). It provides methods to output information
74 about tables and relationships between codons and amino acids.
76 This class and its methods recognized all common IUPAC ambiguity codes
77 for DNA, RNA and animo acids. The translation method follows the
78 conventions in EMBL and TREMBL databases.
80 It is a nuisance to separate RNA and cDNA representations of nucleic
81 acid transcripts. The CodonTable object accepts codons of both type as
82 input and allows the user to set the mode for output when reverse
83 translating. Its default for output is DNA.
87 This class deals primarily with individual codons and amino
88 acids. However in the interest of speed you can L<translate>
89 longer sequence, too. The full complexity of protein translation
90 is tackled by L<Bio::PrimarySeqI::translate>.
93 The amino acid codes are IUPAC recommendations for common amino acids:
110 O Pyl Pyrrolysine (22nd amino acid)
111 U Sec Selenocysteine (21st amino acid)
117 B Asx Aspartic acid or Asparagine
118 Z Glx Glutamine or Glutamic acid
119 J Xle Isoleucine or Valine (mass spec ambiguity)
120 X Xaa Any or unknown amino acid
123 It is worth noting that, "Bacterial" codon table no. 11 produces an
124 polypeptide that is, confusingly, identical to the standard one. The
125 only differences are in available initiator codons.
128 NCBI Genetic Codes home page:
129 http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
131 EBI Translation Table Viewer:
132 http://www.ebi.ac.uk/cgi-bin/mutations/trtables.cgi
134 Amended ASN.1 version with ids 16 and 21 is at:
135 ftp://ftp.ebi.ac.uk/pub/databases/geneticcode/
137 Thanks to Matteo diTomasso for the original Perl implementation
144 User feedback is an integral part of the evolution of this and other
145 Bioperl modules. Send your comments and suggestions preferably to the
146 Bioperl mailing lists Your participation is much appreciated.
148 bioperl-l@bioperl.org - General discussion
149 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
151 =head2 Reporting Bugs
153 Report bugs to the Bioperl bug tracking system to help us keep track
154 the bugs and their resolution. Bug reports can be submitted via the
157 http://bugzilla.open-bio.org/
159 =head1 AUTHOR - Heikki Lehvaslaiho
161 Email: heikki-at-bioperl-dot-org
165 The rest of the documentation details each of the object
166 methods. Internal methods are usually preceded with a _
171 # Let the code begin...
173 package Bio
::Tools
::CodonTable
;
174 use vars
qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
175 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
178 # Object preamble - inherits from Bio::Root::Root
179 use Bio::Tools::IUPAC;
182 use base qw(Bio::Root::Root);
185 # first set internal values for all translation tables
188 use constant CODONSIZE
=> 3;
190 $CODONGAP = $GAP x CODONSIZE
;
195 'Vertebrate Mitochondrial',#2
196 'Yeast Mitochondrial',# 3
197 'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',#4
198 'Invertebrate Mitochondrial',#5
199 'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
201 'Echinoderm Mitochondrial',#9
202 'Euplotid Nuclear',#10
204 'Alternative Yeast Nuclear',# 12
205 'Ascidian Mitochondrial',# 13
206 'Flatworm Mitochondrial',# 14
207 'Blepharisma Nuclear',# 15
208 'Chlorophycean Mitochondrial',# 16
210 'Trematode Mitochondrial',# 21
211 'Scenedesmus obliquus Mitochondrial', #22
212 'Thraustochytrium Mitochondrial' #23
217 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
218 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
219 FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
220 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
221 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
222 FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
224 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
225 FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
226 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
227 FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
228 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
229 FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
230 FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
231 FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
233 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
234 FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
235 FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
241 ---M---------------M---------------M----------------------------
242 --------------------------------MMMM---------------M------------
243 ----------------------------------MM----------------------------
244 --MM---------------M------------MMMM---------------M------------
245 ---M----------------------------MMMM---------------M------------
246 -----------------------------------M----------------------------
248 -----------------------------------M----------------------------
249 -----------------------------------M----------------------------
250 ---M---------------M------------MMMM---------------M------------
251 -------------------M---------------M----------------------------
252 -----------------------------------M----------------------------
253 -----------------------------------M----------------------------
254 -----------------------------------M----------------------------
255 -----------------------------------M----------------------------
257 -----------------------------------M---------------M------------
258 -----------------------------------M----------------------------
259 --------------------------------M--M---------------M------------
262 my @nucs = qw(t c a g);
264 ($CODONS, $TRCOL) = ({}, {});
268 my $codon = "$i$j$k";
269 $CODONS->{$codon} = $x;
270 $TRCOL->{$x} = $codon;
275 %IUPAC_DNA = Bio
::Tools
::IUPAC
->iupac_iub();
276 %IUPAC_AA = Bio
::Tools
::IUPAC
->iupac_iup();
277 %THREELETTERSYMBOLS = Bio
::SeqUtils
->valid_aa(2);
278 $VALID_PROTEIN = '['.join('',Bio
::SeqUtils
->valid_aa(0)).']';
283 my($class,@args) = @_;
284 my $self = $class->SUPER::new
(@args);
287 $self->_rearrange([qw(ID
291 $id = 1 if ( ! $id );
292 $id && $self->id($id);
293 return $self; # success - we hope!
299 Usage : $obj->id(3); $id_integer = $obj->id();
300 Function: Sets or returns the id of the translation table. IDs are
301 integers from 1 to 15, excluding 7 and 8 which have been
302 removed as redundant. If an invalid ID is given the method
305 Returns : value of id, a scalar, 0 if not a valid
306 Args : newvalue (optional)
311 my ($self,$value) = @_;
312 if( defined $value) {
313 if ( !(defined $TABLES[$value-1]) or $TABLES[$value-1] eq '') {
314 $self->warn("Not a valid codon table ID [$value] ");
317 $self->{'id'} = $value;
319 return $self->{'id'};
326 Function: returns the descriptive name of the translation table
337 my ($id) = $self->{'id'};
338 return $NAMES[$id-1];
344 Usage : $obj->tables() or Bio::Tools::CodonTable->tables()
345 Function: returns a hash reference where each key is a valid codon
346 table id() number, and each value is the corresponding
347 codon table name() string
357 for my $id (1 .. @NAMES) {
358 my $name = $NAMES[$id-1];
359 $tables{$id} = $name if $name;
367 Usage : $obj->translate('YTR')
368 Function: Returns a string of one letter amino acid codes from
369 nucleotide sequence input. The imput can be of any length.
371 Returns 'X' for unknown codons and codons that code for
372 more than one amino acid. Returns an empty string if input
373 is not three characters long. Exceptions for these are:
375 - IUPAC amino acid code B for Aspartic Acid and
377 - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
379 - if the codon is two nucleotides long and if by adding
380 an a third character 'N', it codes for a single amino
381 acid (with exceptions above), return that, otherwise
384 Returns empty string for other input strings that are not
385 three characters long.
388 Returns : a string of one letter ambiguous IUPAC amino acid codes
389 Args : ambiguous IUPAC nucleotide string
395 my ($self, $seq) = @_;
396 $self->throw("Calling translate without a seq argument!") unless defined $seq;
397 return '' unless $seq;
401 $partial = 2 if length($seq) % CODONSIZE
== 2;
406 if ($seq =~ /[^actg]/ ) { #ambiguous chars
407 for (my $i = 0; $i < (length($seq) - (CODONSIZE
-1)); $i+= CODONSIZE
) {
408 my $triplet = substr($seq, $i, CODONSIZE
);
409 if( $triplet eq $CODONGAP ) {
411 } elsif (exists $CODONS->{$triplet}) {
412 $protein .= substr($TABLES[$id-1],
413 $CODONS->{$triplet},1);
415 $protein .= $self->_translate_ambiguous_codon($triplet);
418 } else { # simple, strict translation
419 for (my $i = 0; $i < (length($seq) - (CODONSIZE
-1)); $i+=CODONSIZE
) {
420 my $triplet = substr($seq, $i, CODONSIZE
);
421 if( $triplet eq $CODONGAP ) {
423 } if (exists $CODONS->{$triplet}) {
424 $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1);
430 if ($partial == 2) { # 2 overhanging nucleotides
431 my $triplet = substr($seq, ($partial -4)). "n";
432 if( $triplet eq $CODONGAP ) {
434 } elsif (exists $CODONS->{$triplet}) {
435 my $aa = substr($TABLES[$id-1], $CODONS->{$triplet},1);
438 $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
444 sub _translate_ambiguous_codon
{
445 my ($self, $triplet, $partial) = @_;
449 my @codons = _unambiquous_codons
($triplet);
451 foreach my $codon (@codons) {
452 $aas{substr($TABLES[$id-1],$CODONS->{$codon},1)} = 1;
454 my $count = scalar keys %aas;
456 $aa = (keys %aas)[0];
458 elsif ( $count == 2 ) {
459 if ($aas{'D'} and $aas{'N'}) {
462 elsif ($aas{'E'} and $aas{'Q'}) {
465 $partial ?
($aa = '') : ($aa = 'X');
468 $partial ?
($aa = '') : ($aa = 'X');
473 =head2 translate_strict
475 Title : translate_strict
476 Usage : $obj->translate_strict('ACT')
477 Function: returns one letter amino acid code for a codon input
479 Fast and simple translation. User is responsible to resolve
480 ambiguous nucleotide codes before calling this
481 method. Returns 'X' for unknown codons and an empty string
482 for input strings that are not three characters long.
484 It is not recommended to use this method in a production
485 environment. Use method translate, instead.
489 Args : a codon = a three nucleotide character string
494 sub translate_strict
{
495 my ($self, $value) = @_;
496 my ($id) = $self->{'id'};
501 if (length $value != 3 ) {
504 elsif (!(defined $CODONS->{$value})) {
508 return substr($TABLES[$id-1],$CODONS->{$value},1);
515 Usage : $obj->revtranslate('G')
516 Function: returns codons for an amino acid
518 Returns an empty string for unknown amino acid
519 codes. Ambiquous IUPAC codes Asx,B, (Asp,D; Asn,N) and
520 Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
521 letter amino acid codes are accepted. '*' and 'Ter' are
524 By default, the output codons are shown in DNA. If the
525 output is needed in RNA (tr/t/u/), add a second argument
528 Example : $obj->revtranslate('Gly', 'RNA')
529 Returns : An array of three lower case letter strings i.e. codons
530 Args : amino acid, 'RNA'
535 my ($self, $value, $coding) = @_;
536 my ($id) = $self->{'id'};
540 if (length($value) == 3 ) {
542 $value = ucfirst $value;
543 $value = $THREELETTERSYMBOLS{$value};
545 if ( defined $value and $value =~ /$VALID_PROTEIN/
546 and length($value) == 1 ) {
548 @aas = @
{$IUPAC_AA{$value}};
549 foreach my $aa (@aas) {
551 $aa = '\*' if $aa eq '*';
552 while ($TABLES[$id-1] =~ m/$aa/g) {
553 $p = pos $TABLES[$id-1];
554 push (@codons, $TRCOL->{--$p});
559 if ($coding and uc ($coding) eq 'RNA') {
560 for my $i (0..$#codons) {
561 $codons[$i] =~ tr/t/u/;
568 =head2 reverse_translate_all
570 Title : reverse_translate_all
571 Usage : my $iup_str = $cttable->reverse_translate_all($seq_object)
572 my $iup_str = $cttable->reverse_translate_all($seq_object,
575 Function: reverse translates a protein sequence into IUPAC nucleotide
576 sequence. An 'X' in the protein sequence is converted to 'NNN'
577 in the nucleotide sequence.
579 Args : a Bio::PrimarySeqI compatible object (mandatory)
580 a Bio::CodonUsage::Table object and a threshold if only
581 codons with a relative frequency above the threshold are
585 sub reverse_translate_all
{
587 my ($self, $obj, $cut, $threshold) = @_;
591 if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
592 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
595 if($obj->alphabet ne 'protein') {
596 $self->throw("Cannot reverse translate, need an amino acid sequence .".
597 "This sequence is of type [" . $obj->alphabet ."]");
600 my @seq = split '', $obj->seq;
602 ## if we're not supplying a codon usage table...
603 if( !$cut && !$threshold) {
604 ## get lists of possible codons for each aa.
607 push @data, (['NNN']);
609 my @cods = $self->revtranslate($aa);
614 #else we are supplying a codon usage table, we just want common codons
616 if(!$cut->isa('Bio::CodonUsage::Table')) {
617 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
620 my $cod_ref = $cut->probable_codons($threshold);
623 push @data, (['NNN']);
626 push @data, $cod_ref->{$aa};
630 return $self->_make_iupac_string(\
@data);
634 =head2 reverse_translate_best
636 Title : reverse_translate_best
637 Usage : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
638 Function: Reverse translates a protein sequence into plain nucleotide
639 sequence (GATC), uses the most common codon for each amino acid
641 Args : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
645 sub reverse_translate_best
{
647 my ($self, $obj, $cut) = @_;
649 if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
650 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
653 if ($obj->alphabet ne 'protein') {
654 $self->throw("Cannot reverse translate, need an amino acid sequence .".
655 "This sequence is of type [" . $obj->alphabet ."]");
657 if ( !$cut | !$cut->isa('Bio::CodonUsage::Table')) {
658 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
663 my @seq = split '', $obj->seq;
665 my $cod_ref = $cut->most_common_codons();
667 for my $aa ( @seq ) {
672 if ( defined $cod_ref->{$aa} ) {
673 $str .= $cod_ref->{$aa};
675 $self->throw("Input sequence contains invalid character: $aa");
681 =head2 is_start_codon
683 Title : is_start_codon
684 Usage : $obj->is_start_codon('ATG')
685 Function: returns true (1) for all codons that can be used as a
686 translation start, false (0) for others.
687 Example : $myCodonTable->is_start_codon('ATG')
694 my ($self, $value) = @_;
695 my ($id) = $self->{'id'};
700 if (length $value != 3 ) {
705 my @ms = map { substr($STARTS[$id-1],$CODONS->{$_},1) } _unambiquous_codons
($value);
706 foreach my $c (@ms) {
707 $result = 0 if $c ne 'M';
718 Usage : $obj->is_ter_codon('GAA')
719 Function: returns true (1) for all codons that can be used as a
720 translation tarminator, false (0) for others.
721 Example : $myCodonTable->is_ter_codon('ATG')
728 my ($self, $value) = @_;
729 my ($id) = $self->{'id'};
734 if (length $value != 3 ) {
739 my @ms = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons
($value);
740 foreach my $c (@ms) {
741 $result = 0 if $c ne $TERMINATOR;
747 =head2 is_unknown_codon
749 Title : is_unknown_codon
750 Usage : $obj->is_unknown_codon('GAJ')
751 Function: returns false (0) for all codons that are valid,
753 Example : $myCodonTable->is_unknown_codon('NTG')
760 sub is_unknown_codon
{
761 my ($self, $value) = @_;
762 my ($id) = $self->{'id'};
767 if (length $value != 3 ) {
772 my @cs = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons
($value);
773 $result = 1 if scalar @cs == 0;
778 =head2 _unambiquous_codons
780 Title : _unambiquous_codons
781 Usage : @codons = _unambiquous_codons('ACN')
784 Returns : array of strings (one letter unambiguous amino acid codes)
785 Args : a codon = a three IUPAC nucleotide character string
789 sub _unambiquous_codons
{
794 @nts = map { $IUPAC_DNA{uc $_} } split(//, $value);
795 for my $i (@
{$nts[0]}) {
796 for my $j (@
{$nts[1]}) {
797 for my $k (@
{$nts[2]}) {
798 push @codons, lc "$i$j$k";
808 Usage : $newid = $ct->add_table($name, $table, $starts)
809 Function: Add a custom Codon Table into the object.
810 Know what you are doing, only the length of
811 the argument strings is checked!
812 Returns : the id of the new codon table
813 Args : name, a string, optional (can be empty)
814 table, a string of 64 characters
815 startcodons, a string of 64 characters, defaults to standard
820 my ($self, $name, $table, $starts) = @_;
822 $name ||= 'Custom'. scalar @NAMES + 1;
823 $starts ||= $STARTS[0];
824 $self->throw('Suspect input!')
825 unless length($table) == 64 and length($starts) == 64;
828 push @TABLES, $table;
829 push @STARTS, $starts;
831 return scalar @NAMES;
835 sub _make_iupac_string
{
837 my ($self, $cod_ref) = @_;
838 if(ref($cod_ref) ne 'ARRAY') {
839 $self->throw(" I need a reference to a list of references to codons, ".
840 " not a [". ref($cod_ref) . "].");
842 my %iupac_hash = Bio
::Tools
::IUPAC
->iupac_rev_iub();
843 my $iupac_string = ''; ## the string to be returned
844 for my $aa (@
$cod_ref) {
846 ## scan through codon positions, record the differing values,
847 # then look up in the iub hash
848 for my $index(0..2) {
850 map { my $k = substr($_,$index,1);
851 $h{$k} = undef;} @
$aa;
852 my $lookup_key = join '', sort{$a cmp $b}keys %h;
855 $iupac_string .= $iupac_hash{uc$lookup_key};
858 return $iupac_string;