From 8d4cc1b26d35a524986fe5b4ee724806298241dc Mon Sep 17 00:00:00 2001 From: "Francisco J. Ossandon" Date: Sun, 16 Nov 2014 20:56:55 -0300 Subject: [PATCH] CodonTable.pm: Fix for issue #90. Added new codon tables 24 & 25 and moved special 'Strict' table to ID 0, so it doesn't interfere with new tables in the future. Deleted a dead link and updated other tables according to the latest definitions. Since ID 0 now exists and is used by 'Strict', the 'id' subroutine now returns 1 (standard table) for invalid ids (in addition to emitting the warning). Also fixed some indentation. CodonTable.t: Added tests to cover all the changes plus a few extras to improve code coverage after looking at the Coveralls report. --- Bio/Tools/CodonTable.pm | 190 ++++++++++++++++++++++++------------------------ t/SeqTools/CodonTable.t | 67 +++++++++++------ 2 files changed, 141 insertions(+), 116 deletions(-) diff --git a/Bio/Tools/CodonTable.pm b/Bio/Tools/CodonTable.pm index d008e311b..1ee516484 100644 --- a/Bio/Tools/CodonTable.pm +++ b/Bio/Tools/CodonTable.pm @@ -127,13 +127,11 @@ only differences are in available initiator codons. NCBI Genetic Codes home page: + (Last update of the Genetic Codes: April 30, 2013) http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c -EBI Translation Table Viewer: - http://www.ebi.ac.uk/cgi-bin/mutations/trtables.cgi - -Amended ASN.1 version with ids 16 and 21 is at: - ftp://ftp.ebi.ac.uk/pub/databases/geneticcode/ +ASN.1 version with ids 1 to 25 is at: + ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt Thanks to Matteo diTomasso for the original Perl implementation of these tables. 
@@ -202,31 +200,34 @@ BEGIN { @NAMES = #id ( + 'Strict', #0, special option for ATG-only start 'Standard', #1 'Vertebrate Mitochondrial',#2 'Yeast Mitochondrial',# 3 - 'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',#4 + 'Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma',#4 'Invertebrate Mitochondrial',#5 'Ciliate, Dasycladacean and Hexamita Nuclear',# 6 '', '', - 'Echinoderm Mitochondrial',#9 + 'Echinoderm and Flatworm Mitochondrial',#9 'Euplotid Nuclear',#10 - '"Bacterial"',# 11 + 'Bacterial, Archaeal and Plant Plastid',# 11 'Alternative Yeast Nuclear',# 12 'Ascidian Mitochondrial',# 13 - 'Flatworm Mitochondrial',# 14 + 'Alternative Flatworm Mitochondriall',# 14 'Blepharisma Nuclear',# 15 'Chlorophycean Mitochondrial',# 16 '', '', '', '', 'Trematode Mitochondrial',# 21 'Scenedesmus obliquus Mitochondrial', #22 'Thraustochytrium Mitochondrial', #23 - 'Strict', #24, option for only ATG start + 'Pterobranchia Mitochondrial', #24 + 'Candidate Division SR1 and Gracilibacteria', #25 ); @TABLES = qw( FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG @@ -245,7 +246,8 @@ BEGIN { FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG - FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG + FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG ); # (bases used for these tables, for reference) @@ -255,6 +257,7 @@ BEGIN { @STARTS = qw( + -----------------------------------M---------------------------- 
---M---------------M---------------M---------------------------- --------------------------------MMMM---------------M------------ ----------------------------------MM---------------------------- @@ -262,11 +265,11 @@ BEGIN { ---M----------------------------MMMM---------------M------------ -----------------------------------M---------------------------- '' '' - -----------------------------------M---------------------------- + -----------------------------------M---------------M------------ -----------------------------------M---------------------------- ---M---------------M------------MMMM---------------M------------ -------------------M---------------M---------------------------- - -----------------------------------M---------------------------- + ---M------------------------------MM---------------M------------ -----------------------------------M---------------------------- -----------------------------------M---------------------------- -----------------------------------M---------------------------- @@ -274,22 +277,23 @@ BEGIN { -----------------------------------M---------------M------------ -----------------------------------M---------------------------- --------------------------------M--M---------------M------------ - -----------------------------------M---------------------------- + ---M---------------M---------------M---------------M------------ + ---M-------------------------------M---------------M------------ ); my @nucs = qw(t c a g); my $x = 0; ($CODONS, $TRCOL) = ({}, {}); for my $i (@nucs) { - for my $j (@nucs) { - for my $k (@nucs) { - my $codon = "$i$j$k"; - $CODONS->{$codon} = $x; - $TRCOL->{$x} = $codon; - $x++; + for my $j (@nucs) { + for my $k (@nucs) { + my $codon = "$i$j$k"; + $CODONS->{$codon} = $x; + $TRCOL->{$x} = $codon; + $x++; + } } } - } %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub(); %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup(); %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2); @@ -316,25 +320,26 @@ sub new { Title : id Usage : 
$obj->id(3); $id_integer = $obj->id(); Function: Sets or returns the id of the translation table. IDs are - integers from 1 to 15, excluding 7 and 8 which have been - removed as redundant. If an invalid ID is given the method - returns 0, false. + integers from 0 (special ATG-only start) to 25, excluding + 7-8 and 17-20 which have been removed. If an invalid ID is + given the method returns 1, the standard table. Example : - Returns : value of id, a scalar, 0 if not a valid + Returns : value of id, a scalar, warn and fall back to 1 (standard table) + if specified id is not valid Args : newvalue (optional) =cut sub id{ - my ($self,$value) = @_; - if( defined $value) { - if ( !(defined $TABLES[$value-1]) or $TABLES[$value-1] eq '') { - $self->warn("Not a valid codon table ID [$value] "); - $value = 0; - } - $self->{'id'} = $value; - } - return $self->{'id'}; + my ($self,$value) = @_; + if( defined $value) { + if ( not defined $TABLES[$value] or $TABLES[$value] eq '') { + $self->warn("Not a valid codon table ID [$value], using [1] instead "); + $value = 1; + } + $self->{'id'} = $value; + } + return $self->{'id'}; } =head2 name @@ -353,7 +358,7 @@ sub name{ my ($self) = @_; my ($id) = $self->{'id'}; - return $NAMES[$id-1]; + return $NAMES[$id]; } =head2 tables @@ -372,8 +377,8 @@ sub name{ sub tables{ my %tables; - for my $id (1 .. @NAMES) { - my $name = $NAMES[$id-1]; + for my $id (0 .. 
$#NAMES) { + my $name = $NAMES[$id]; $tables{$id} = $name if $name; } return \%tables; @@ -424,37 +429,38 @@ sub translate { if ($seq =~ /[^actg]/ ) { #ambiguous chars for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) { my $triplet = substr($seq, $i, CODONSIZE); - if( $triplet eq $CODONGAP ) { - $protein .= $GAP; - } elsif (exists $CODONS->{$triplet}) { - $protein .= substr($TABLES[$id-1], - $CODONS->{$triplet},1); - } else { - $protein .= $self->_translate_ambiguous_codon($triplet); + if( $triplet eq $CODONGAP ) { + $protein .= $GAP; + } elsif (exists $CODONS->{$triplet}) { + $protein .= substr($TABLES[$id], + $CODONS->{$triplet},1); + } else { + $protein .= $self->_translate_ambiguous_codon($triplet); + } } - } } else { # simple, strict translation - for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) { - my $triplet = substr($seq, $i, CODONSIZE); + for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) { + my $triplet = substr($seq, $i, CODONSIZE); if( $triplet eq $CODONGAP ) { - $protein .= $GAP; - } if (exists $CODONS->{$triplet}) { - $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1); - } else { + $protein .= $GAP; + } + if (exists $CODONS->{$triplet}) { + $protein .= substr($TABLES[$id], $CODONS->{$triplet}, 1); + } else { $protein .= 'X'; } } } if ($partial == 2 && $complete_codon) { # 2 overhanging nucleotides - my $triplet = substr($seq, ($partial -4)). "n"; - if( $triplet eq $CODONGAP ) { - $protein .= $GAP; - } elsif (exists $CODONS->{$triplet}) { - my $aa = substr($TABLES[$id-1], $CODONS->{$triplet},1); - $protein .= $aa; - } else { - $protein .= $self->_translate_ambiguous_codon($triplet, $partial); - } + my $triplet = substr($seq, ($partial -4)). 
"n"; + if( $triplet eq $CODONGAP ) { + $protein .= $GAP; + } elsif (exists $CODONS->{$triplet}) { + my $aa = substr($TABLES[$id], $CODONS->{$triplet},1); + $protein .= $aa; + } else { + $protein .= $self->_translate_ambiguous_codon($triplet, $partial); + } } return $protein; } @@ -467,23 +473,23 @@ sub _translate_ambiguous_codon { my @codons = $self->unambiguous_codons($triplet); my %aas =(); foreach my $codon (@codons) { - $aas{substr($TABLES[$id-1],$CODONS->{$codon},1)} = 1; + $aas{substr($TABLES[$id],$CODONS->{$codon},1)} = 1; } my $count = scalar keys %aas; if ( $count == 1 ) { - $aa = (keys %aas)[0]; + $aa = (keys %aas)[0]; } elsif ( $count == 2 ) { - if ($aas{'D'} and $aas{'N'}) { - $aa = 'B'; - } - elsif ($aas{'E'} and $aas{'Q'}) { - $aa = 'Z'; - } else { - $partial ? ($aa = '') : ($aa = 'X'); - } + if ($aas{'D'} and $aas{'N'}) { + $aa = 'B'; + } + elsif ($aas{'E'} and $aas{'Q'}) { + $aa = 'Z'; + } else { + $partial ? ($aa = '') : ($aa = 'X'); + } } else { - $partial ? ($aa = '') : ($aa = 'X'); + $partial ? 
($aa = '') : ($aa = 'X'); } return $aa; } @@ -520,7 +526,7 @@ sub translate_strict{ return 'X' unless defined $CODONS->{$value}; - return substr( $TABLES[$id-1], $CODONS->{$value}, 1 ); + return substr( $TABLES[$id], $CODONS->{$value}, 1 ); } =head2 revtranslate @@ -554,8 +560,9 @@ sub revtranslate { $value = ucfirst $value; $value = $THREELETTERSYMBOLS{$value}; } - if ( defined $value and $value =~ /$VALID_PROTEIN/ - and length($value) == 1 ) { + if ( defined $value and $value =~ /$VALID_PROTEIN/ + and length($value) == 1 + ) { my $id = $self->{'id'}; $value = uc $value; @@ -563,18 +570,18 @@ sub revtranslate { foreach my $aa (@aas) { #print $aa, " -2\n"; $aa = '\*' if $aa eq '*'; - while ($TABLES[$id-1] =~ m/$aa/g) { - my $p = pos $TABLES[$id-1]; - push (@codons, $TRCOL->{--$p}); - } + while ($TABLES[$id] =~ m/$aa/g) { + my $p = pos $TABLES[$id]; + push (@codons, $TRCOL->{--$p}); + } } } - if ($coding and uc ($coding) eq 'RNA') { - for my $i (0..$#codons) { - $codons[$i] =~ tr/t/u/; - } - } + if ($coding and uc ($coding) eq 'RNA') { + for my $i (0..$#codons) { + $codons[$i] =~ tr/t/u/; + } + } return @codons; } @@ -597,7 +604,6 @@ sub revtranslate { =cut sub reverse_translate_all { - my ($self, $obj, $cut, $threshold) = @_; ## check args are OK @@ -642,7 +648,6 @@ sub reverse_translate_all { } return $self->_make_iupac_string(\@data); - } =head2 reverse_translate_best @@ -689,7 +694,7 @@ sub reverse_translate_best { $self->throw("Input sequence contains invalid character: $aa"); } } - $str; + return $str; } =head2 is_start_codon @@ -742,7 +747,7 @@ sub _codon_is { my $id = $self->{'id'}; for my $c ( $self->unambiguous_codons($value) ) { - my $m = substr( $table->[$id-1], $CODONS->{$c}, 1 ); + my $m = substr( $table->[$id], $CODONS->{$c}, 1 ); return 0 unless $m eq $key; } return 1; @@ -818,21 +823,19 @@ sub _unambiquous_codons { sub add_table { my ($self, $name, $table, $starts) = @_; - $name ||= 'Custom'. 
scalar @NAMES + 1; - $starts ||= $STARTS[0]; + $name ||= 'Custom' . $#NAMES + 1; + $starts ||= $STARTS[1]; $self->throw('Suspect input!') unless length($table) == 64 and length($starts) == 64; - push @NAMES, $name; + push @NAMES, $name; push @TABLES, $table; push @STARTS, $starts; - return scalar @NAMES; - + return $#NAMES; } sub _make_iupac_string { - my ($self, $cod_ref) = @_; if(ref($cod_ref) ne 'ARRAY') { $self->throw(" I need a reference to a list of references to codons, ". @@ -855,7 +858,6 @@ sub _make_iupac_string { } } return $iupac_string; - } diff --git a/t/SeqTools/CodonTable.t b/t/SeqTools/CodonTable.t index 4b4f06bf3..11652a7c3 100644 --- a/t/SeqTools/CodonTable.t +++ b/t/SeqTools/CodonTable.t @@ -3,11 +3,11 @@ use strict; -BEGIN { +BEGIN { use lib '.'; use Bio::Root::Test; - test_begin(-tests => 71); + test_begin(-tests => 81); use_ok('Bio::Tools::CodonTable'); use_ok('Bio::CodonUsage::IO'); @@ -23,6 +23,17 @@ isa_ok $myCodonTable, 'Bio::Tools::CodonTable'; $myCodonTable = Bio::Tools::CodonTable->new(); is $myCodonTable->id(), 1; +# invalid table should produce a warn and set default table (1) +my $stderr = ''; +{ + # capture stderr output + local *STDERR; + open STDERR, '>', \$stderr; + $myCodonTable->id(99); +} +like $stderr, qr/Not a valid codon table ID/; +is $myCodonTable->id, 1; + # change codon table $myCodonTable->id(10); is $myCodonTable->id, 10; @@ -30,12 +41,12 @@ is $myCodonTable->name(), 'Euplotid Nuclear'; # enumerate tables as object method my $table = $myCodonTable->tables(); -cmp_ok (keys %{$table}, '>=', 17); # currently 17 known tables -is $table->{11}, q{"Bacterial"}; +cmp_ok (keys %{$table}, '>=', 19); # currently 19 known tables +is $table->{11}, 'Bacterial, Archaeal and Plant Plastid'; # enumerate tables as class method $table = Bio::Tools::CodonTable->tables; -cmp_ok (values %{$table}, '>=', 17); # currently 17 known tables +cmp_ok (values %{$table}, '>=', 19); # currently 19 known tables is $table->{23}, 'Thraustochytrium 
Mitochondrial'; # translate codons @@ -74,7 +85,7 @@ is $myCodonTable->translate('jj',1), ''; is $myCodonTable->translate('jjg'), 'X'; is $myCodonTable->translate('jjg',1), 'X'; -is $myCodonTable->translate('gt'), ''; +is $myCodonTable->translate('gt'), ''; is $myCodonTable->translate('gt',1), 'V'; is $myCodonTable->translate('g'), ''; @@ -89,20 +100,20 @@ ggkggyggsggvgghggdggbggxgtmgtrgtwgtkgtygtsgtvgthgtdgtbgtxtartaytcmtcrtcwt cktcytcstcvtchtcdtcbtcxtgyttrttytramgamggmgrracratrayytaytgytrsaasagsartaa; SEQ $seq =~ s/\s+//g; -@ii = grep { length == 3 } split /(.{3})/, $seq; +@ii = grep { length == 3 } split /(.{3})/, $seq; print join (' ', @ii), "\n" if( $DEBUG); my $prot = <translate($ii[$i]) ) { - $test = 0; + $test = 0; print $ii[$i], ": |", $res[$i], "| ne |", $myCodonTable->translate($ii[$i]), "| @ $i\n" if( $DEBUG); last ; @@ -110,12 +121,14 @@ for my $i (0..$#ii) { } ok $test; -# reverse translate amino acids +# reverse translate amino acids is $myCodonTable->revtranslate('U'), 0; is $myCodonTable->revtranslate('O'), 0; is $myCodonTable->revtranslate('J'), 9; is $myCodonTable->revtranslate('I'), 3; +my @RNA_codons = $myCodonTable->revtranslate('M', 'RNA'); +is $RNA_codons[0], 'aug'; # test RNA output @ii = qw(A l ACN Thr sER ter Glx); @res = ( @@ -144,10 +157,10 @@ $test = 1; } ok $test; -# boolean tests -$myCodonTable->id(1); +# boolean tests +$myCodonTable->id(1); # Standard table -ok $myCodonTable->is_start_codon('ATG'); +ok $myCodonTable->is_start_codon('ATG'); is $myCodonTable->is_start_codon('GGH'), 0; ok $myCodonTable->is_start_codon('HTG'); is $myCodonTable->is_start_codon('CCC'), 0; @@ -164,20 +177,17 @@ is $myCodonTable->is_unknown_codon('UAG'), 0; is $myCodonTable->translate_strict('ATG'), 'M'; - - # # adding a custom codon table # - my @custom_table = ( 'test1', 'FFLLSSSSYY**CC*WLLLL**PPHHQQR*RRIIIMT*TT*NKKSSRRV*VVAA*ADDEE*GGG' ); ok my $custct = $myCodonTable->add_table(@custom_table); -is $custct, 25; +is $custct, 26; is 
$myCodonTable->translate('atgaaraayacmacracwacka'), 'MKNTTTT'; ok $myCodonTable->id($custct); is $myCodonTable->translate('atgaaraayacmacracwacka'), 'MKXXTTT'; @@ -202,18 +212,19 @@ is $myCodonTable->reverse_translate_all($seq), 'GCBWSNNNNTTYCAYAARYTN'; # # test reverse_translate_best(), requires a Bio::CodonUsage::Table object -# +# -ok $seq = Bio::PrimarySeq->new(-seq =>'ACDEFGHIKLMNPQRSTVWY'); +ok $seq = Bio::PrimarySeq->new(-seq =>'ACDEFGHIKLMNPQRSTVWYX'); ok my $io = Bio::CodonUsage::IO->new(-file => test_input_file('MmCT')); ok my $cut = $io->next_data(); -is $myCodonTable->reverse_translate_best($seq,$cut), 'GCCTGCGACGAGTTCGGCCACATCAAGCTGATGAACCCCCAGCGCTCCACCGTGTGGTAC'; +is $myCodonTable->reverse_translate_best($seq,$cut), 'GCCTGCGACGAGTTCGGCCACATCAAGCTGATGAACCCCCAGCGCTCCACCGTGTGGTACNNN'; +is $myCodonTable->reverse_translate_all($seq, $cut, 15), 'GCNTGYGAYGARTTYGGVCAYATYAARCTSATGAAYCCNCARMGVWSYACHGTSTGGTAYNNN'; # # test 'Strict' table, requires a Bio::CodonUsage::Table object # -$myCodonTable = Bio::Tools::CodonTable->new(); +$myCodonTable = Bio::Tools::CodonTable->new(); # Default Standard table # boolean tests is $myCodonTable->is_start_codon('ATG'), 1; @@ -222,10 +233,22 @@ is $myCodonTable->is_start_codon('TTG'), 1; is $myCodonTable->is_start_codon('CTG'), 1; is $myCodonTable->is_start_codon('CCC'), 0; -$myCodonTable->id(24); +$myCodonTable->id(0); # Special 'Strict' table (ATG-only start) is $myCodonTable->is_start_codon('ATG'), 1; is $myCodonTable->is_start_codon('GTG'), 0; is $myCodonTable->is_start_codon('TTG'), 0; is $myCodonTable->is_start_codon('CTG'), 0; is $myCodonTable->is_start_codon('CCC'), 0; + +# Pterobranchia Mitochondrial codon table +$myCodonTable->id(24); +is $myCodonTable->is_start_codon('GTG'), 1; +is $myCodonTable->is_start_codon('CTG'), 1; +is $myCodonTable->translate_strict('TGA'), 'W'; + +# Candidate Division SR1 and Gracilibacteria codon table +$myCodonTable->id(25); +is $myCodonTable->is_start_codon('GTG'), 1; +is 
$myCodonTable->is_start_codon('CTG'), 0; +is $myCodonTable->translate_strict('TGA'), 'G'; -- 2.11.4.GIT