Bio/Tools/CodonTable.pm

   1 #
   2 # bioperl module for Bio::Tools::CodonTable
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   7 #
   8 # Copyright Heikki Lehvaslaiho
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::CodonTable - Codon table object
  17
  18 =head1 SYNOPSIS
  19
  20   # This is a read-only class for all known codon tables.  The IDs are
  21   # the ones used by nucleotide sequence databases.  All common IUPAC
  22   # ambiguity codes for DNA, RNA and amino acids are recognized.
  23
  24   use Bio::Tools::CodonTable;
  25
  26   # defaults to ID 1 "Standard"
  27   $myCodonTable   = Bio::Tools::CodonTable->new();
  28   $myCodonTable2  = Bio::Tools::CodonTable->new( -id => 3 );
  29
  30   # change codon table
  31   $myCodonTable->id(5);
  32
  33   # examine codon table
  34   print  join (' ', "The name of the codon table no.", $myCodonTable->id(4),
  35            "is:", $myCodonTable->name(), "\n");
  36
  37   # print possible codon tables
  38   $tables = Bio::Tools::CodonTable->tables;
  39   while ( ($id,$name) = each %{$tables} ) {
  40     print "$id = $name\n";
  41   }
  42
  43   # translate a codon
  44   $aa = $myCodonTable->translate('ACU');
  45   $aa = $myCodonTable->translate('act');
  46   $aa = $myCodonTable->translate('ytr');
  47
  48   # reverse translate an amino acid
  49   @codons = $myCodonTable->revtranslate('A');
  50   @codons = $myCodonTable->revtranslate('Ser');
  51   @codons = $myCodonTable->revtranslate('Glx');
  52   @codons = $myCodonTable->revtranslate('cYS', 'rna');
  53
  54   # reverse translate an entire amino acid sequence into a IUPAC
  55   # nucleotide string
  56
  57   my $seqobj    = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
  58   my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
  59
  60   # boolean tests
  61   print "Is a start\n"       if $myCodonTable->is_start_codon('ATG');
  62   print "Is a terminator\n" if $myCodonTable->is_ter_codon('tar');
  63   print "Is a unknown\n"     if $myCodonTable->is_unknown_codon('JTG');
  64
  65 =head1 DESCRIPTION
  66
  67 Codon tables are also called translation tables or genetic codes
  68 since that is what they represent. A bit more complete picture
  69 of the full complexity of codon usage in various taxonomic groups
  70 is presented at the NCBI Genetic Codes Home page.
  71
  72 CodonTable is a BioPerl class that knows all current translation
  73 tables that are used by primary nucleotide sequence databases
  74 (GenBank, EMBL and DDBJ). It provides methods to output information
  75 about tables and relationships between codons and amino acids.
  76
  77 This class and its methods recognized all common IUPAC ambiguity codes
  78 for DNA, RNA and animo acids. The translation method follows the
  79 conventions in EMBL and TREMBL databases.
  80
  81 It is a nuisance to separate RNA and cDNA representations of nucleic
  82 acid transcripts. The CodonTable object accepts codons of both type as
  83 input and allows the user to set the mode for output when reverse
  84 translating. Its default for output is DNA.
  85
  86 Note:
  87
  88 This class deals primarily with individual codons and amino
  89 acids. However in the interest of speed you can L<translate>
  90 longer sequence, too. The full complexity of protein translation
  91 is tackled by L<Bio::PrimarySeqI::translate>.
  92
  93
  94 The amino acid codes are IUPAC recommendations for common amino acids:
  95
  96           A           Ala            Alanine
  97           R           Arg            Arginine
  98           N           Asn            Asparagine
  99           D           Asp            Aspartic acid
 100           C           Cys            Cysteine
 101           Q           Gln            Glutamine
 102           E           Glu            Glutamic acid
 103           G           Gly            Glycine
 104           H           His            Histidine
 105           I           Ile            Isoleucine
 106           L           Leu            Leucine
 107           K           Lys            Lysine
 108           M           Met            Methionine
 109           F           Phe            Phenylalanine
 110           P           Pro            Proline
 111           O           Pyl            Pyrrolysine (22nd amino acid)
 112           U           Sec            Selenocysteine (21st amino acid)
 113           S           Ser            Serine
 114           T           Thr            Threonine
 115           W           Trp            Tryptophan
 116           Y           Tyr            Tyrosine
 117           V           Val            Valine
 118           B           Asx            Aspartic acid or Asparagine
 119           Z           Glx            Glutamine or Glutamic acid
 120           J           Xle            Isoleucine or Valine (mass spec ambiguity)
 121           X           Xaa            Any or unknown amino acid
 122
 123
 124 It is worth noting that, "Bacterial" codon table no. 11 produces an
 125 polypeptide that is, confusingly, identical to the standard one. The
 126 only differences are in available initiator codons.
 127
 128
 129 NCBI Genetic Codes home page:
 130      (Last update of the Genetic Codes: April 30, 2013)
 131      https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
 132
 133 ASN.1 version with ids 1 to 25 is at:
 134      ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
 135
 136 Thanks to Matteo diTomasso for the original Perl implementation
 137 of these tables.
 138
 139 =head1 FEEDBACK
 140
 141 =head2 Mailing Lists
 142
 143 User feedback is an integral part of the evolution of this and other
 144 Bioperl modules. Send your comments and suggestions preferably to the
 145 Bioperl mailing lists  Your participation is much appreciated.
 146
 147   bioperl-l@bioperl.org                  - General discussion
 148   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 149
 150 =head2 Support
 151
 152 Please direct usage questions or support issues to the mailing list:
 153
 154 I<bioperl-l@bioperl.org>
 155
 156 rather than to the module maintainer directly. Many experienced and
 157 reponsive experts will be able look at the problem and quickly
 158 address it. Please include a thorough description of the problem
 159 with code and data examples if at all possible.
 160
 161 =head2 Reporting Bugs
 162
 163 Report bugs to the Bioperl bug tracking system to help us keep track
 164 the bugs and their resolution.  Bug reports can be submitted via the
 165 web:
 166
 167   https://github.com/bioperl/bioperl-live/issues
 168
 169 =head1 AUTHOR - Heikki Lehvaslaiho
 170
 171 Email:  heikki-at-bioperl-dot-org
 172
 173 =head1 APPENDIX
 174
 175 The rest of the documentation details each of the object
 176 methods. Internal methods are usually preceded with a _
 177
 178 =cut
 179
 180 # Let the code begin...
 181
 182 package Bio::Tools::CodonTable;
 183 use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
 184                 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
 185 use strict;
 186
 187 # Object preamble - inherits from Bio::Root::Root
 188 use Bio::Tools::IUPAC;
 189 use Bio::SeqUtils;
 190
 191 use base qw(Bio::Root::Root);
 192
 193
 194 # first set internal values for all translation tables
 195
 196 BEGIN {
 197     use constant CODONSIZE => 3;
 198     $GAP = '-';
 199     $CODONGAP = $GAP x CODONSIZE;
 200
 201     @NAMES =            #id
 202     (
 203      'Strict', #0, special option for ATG-only start
 204      'Standard',        #1
 205      'Vertebrate Mitochondrial',#2
 206      'Yeast Mitochondrial',# 3
 207      'Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma',#4
 208      'Invertebrate Mitochondrial',#5
 209      'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
 210      '', '',
 211      'Echinoderm and Flatworm Mitochondrial',#9
 212      'Euplotid Nuclear',#10
 213      'Bacterial, Archaeal and Plant Plastid',# 11
 214      'Alternative Yeast Nuclear',# 12
 215      'Ascidian Mitochondrial',# 13
 216      'Alternative Flatworm Mitochondrial',# 14
 217      'Blepharisma Nuclear',# 15
 218      'Chlorophycean Mitochondrial',# 16
 219      '', '',  '', '',
 220      'Trematode Mitochondrial',# 21
 221      'Scenedesmus obliquus Mitochondrial', #22
 222      'Thraustochytrium Mitochondrial', #23
 223      'Pterobranchia Mitochondrial', #24
 224      'Candidate Division SR1 and Gracilibacteria', #25
 225      );
 226
 227     @TABLES =
 228     qw(
 229        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 230        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 231        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
 232        FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 233        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 234        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
 235        FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 236        '' ''
 237        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 238        FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 239        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 240        FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 241        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
 242        FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 243        FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 244        FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 245        '' '' '' ''
 246        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 247        FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 248        FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 249        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG
 250        FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 251        );
 252
 253    #           (bases used for these tables, for reference)
 254    # 1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 255    # 2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 256    # 3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 257
 258     @STARTS =
 259     qw(
 260        -----------------------------------M----------------------------
 261        ---M---------------M---------------M----------------------------
 262        --------------------------------MMMM---------------M------------
 263        ----------------------------------MM----------------------------
 264        --MM---------------M------------MMMM---------------M------------
 265        ---M----------------------------MMMM---------------M------------
 266        -----------------------------------M----------------------------
 267        '' ''
 268        -----------------------------------M---------------M------------
 269        -----------------------------------M----------------------------
 270        ---M---------------M------------MMMM---------------M------------
 271        -------------------M---------------M----------------------------
 272        ---M------------------------------MM---------------M------------
 273        -----------------------------------M----------------------------
 274        -----------------------------------M----------------------------
 275        -----------------------------------M----------------------------
 276        '' ''  '' ''
 277        -----------------------------------M---------------M------------
 278        -----------------------------------M----------------------------
 279        --------------------------------M--M---------------M------------
 280        ---M---------------M---------------M---------------M------------
 281        ---M-------------------------------M---------------M------------
 282        );
 283
 284     my @nucs = qw(t c a g);
 285     my $x = 0;
 286     ($CODONS, $TRCOL) = ({}, {});
 287     for my $i (@nucs) {
 288         for my $j (@nucs) {
 289             for my $k (@nucs) {
 290                 my $codon = "$i$j$k";
 291                 $CODONS->{$codon} = $x;
 292                 $TRCOL->{$x} = $codon;
 293                 $x++;
 294             }
 295         }
 296     }
 297     %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
 298     %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
 299     %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
 300     $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
 301     $TERMINATOR = '*';
 302 }
 303
 304 sub new {
 305     my($class,@args) = @_;
 306     my $self = $class->SUPER::new(@args);
 307
 308     my($id) =
 309     $self->_rearrange([qw(ID
 310                  )],
 311              @args);
 312
 313     $id = 1 if ( ! $id );
 314     $id  && $self->id($id);
 315     return $self; # success - we hope!
 316 }
 317
 318 =head2 id
 319
 320  Title   : id
 321  Usage   : $obj->id(3); $id_integer = $obj->id();
 322  Function: Sets or returns the id of the translation table.  IDs are
 323            integers from 0 (special ATG-only start) to 25, excluding
 324            7-8 and 17-20 which have been removed. If an invalid ID is
 325            given the method returns 1, the standard table.
 326  Example :
 327  Returns : value of id, a scalar, warn and fall back to 1 (standard table)
 328            if specified id is not valid
 329  Args    : newvalue (optional)
 330
 331 =cut
 332
 333 sub id{
 334     my ($self,$value) = @_;
 335     if( defined $value) {
 336         if (  not defined $TABLES[$value] or $TABLES[$value] eq '') {
 337             $self->warn("Not a valid codon table ID [$value], using [1] instead ");
 338             $value = 1;
 339         }
 340         $self->{'id'} = $value;
 341     }
 342     return $self->{'id'};
 343 }
 344
 345 =head2 name
 346
 347  Title   : name
 348  Usage   : $obj->name()
 349  Function: returns the descriptive name of the translation table
 350  Example :
 351  Returns : A string
 352  Args    : None
 353
 354
 355 =cut
 356
 357 sub name{
 358    my ($self) = @_;
 359
 360    my ($id) = $self->{'id'};
 361    return $NAMES[$id];
 362 }
 363
 364 =head2 tables
 365
 366  Title   : tables
 367  Usage   : $obj->tables()  or  Bio::Tools::CodonTable->tables()
 368  Function: returns a hash reference where each key is a valid codon
 369            table id() number, and each value is the corresponding
 370            codon table name() string
 371  Example :
 372  Returns : A hashref
 373  Args    : None
 374
 375
 376 =cut
 377
 378 sub tables{
 379   my %tables;
 380   for my $id (0 .. $#NAMES) {
 381     my $name = $NAMES[$id];
 382     $tables{$id} = $name if $name;
 383   }
 384   return \%tables;
 385 }
 386
 387 =head2 translate
 388
 389  Title   : translate
 390  Usage   : $obj->translate('YTR')
 391  Function: Returns a string of one letter amino acid codes from
 392            nucleotide sequence input. The imput can be of any length.
 393
 394            Returns 'X' for unknown codons and codons that code for
 395            more than one amino acid. Returns an empty string if input
 396            is not three characters long. Exceptions for these are:
 397
 398              - IUPAC amino acid code B for Aspartic Acid and
 399                Asparagine, is used.
 400              - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
 401                used.
 402              - if the codon is two nucleotides long and if by adding
 403                an a third character 'N', it codes for a single amino
 404                acid (with exceptions above), return that, otherwise
 405                return empty string.
 406
 407            Returns empty string for other input strings that are not
 408            three characters long.
 409
 410  Example :
 411  Returns : a string of one letter ambiguous IUPAC amino acid codes
 412  Args    : ambiguous IUPAC nucleotide string
 413
 414
 415 =cut
 416
 417 sub translate {
 418     my ($self, $seq, $complete_codon) = @_;
 419     $self->throw("Calling translate without a seq argument!") unless defined $seq;
 420     return '' unless $seq;
 421
 422     my $id = $self->id;
 423     my ($partial) = 0;
 424     $partial = 2 if length($seq) % CODONSIZE == 2;
 425
 426     $seq = lc $seq;
 427     $seq =~ tr/u/t/;
 428     my $protein = "";
 429     if ($seq =~ /[^actg]/ ) { #ambiguous chars
 430         for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) {
 431             my $triplet = substr($seq, $i, CODONSIZE);
 432             if( $triplet eq $CODONGAP ) {
 433                 $protein .= $GAP;
 434             } elsif (exists $CODONS->{$triplet}) {
 435                 $protein .= substr($TABLES[$id],
 436                            $CODONS->{$triplet},1);
 437             } else {
 438                 $protein .= $self->_translate_ambiguous_codon($triplet);
 439             }
 440         }
 441     } else { # simple, strict translation
 442         for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) {
 443             my $triplet = substr($seq, $i, CODONSIZE);
 444             if( $triplet eq $CODONGAP ) {
 445                 $protein .= $GAP;
 446             }
 447             if (exists $CODONS->{$triplet}) {
 448                 $protein .= substr($TABLES[$id], $CODONS->{$triplet}, 1);
 449             } else {
 450                 $protein .= 'X';
 451             }
 452         }
 453     }
 454     if ($partial == 2 && $complete_codon) { # 2 overhanging nucleotides
 455         my $triplet = substr($seq, ($partial -4)). "n";
 456         if( $triplet eq $CODONGAP ) {
 457             $protein .= $GAP;
 458         } elsif (exists $CODONS->{$triplet}) {
 459             my $aa = substr($TABLES[$id], $CODONS->{$triplet},1);
 460             $protein .= $aa;
 461         } else {
 462             $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
 463         }
 464     }
 465     return $protein;
 466 }
 467
 468 sub _translate_ambiguous_codon {
 469     my ($self, $triplet, $partial) = @_;
 470     $partial ||= 0;
 471     my $id = $self->id;
 472     my $aa;
 473     my @codons = $self->unambiguous_codons($triplet);
 474     my %aas =();
 475     foreach my $codon (@codons) {
 476         $aas{substr($TABLES[$id],$CODONS->{$codon},1)} = 1;
 477     }
 478     my $count = scalar keys %aas;
 479     if ( $count == 1 ) {
 480         $aa = (keys %aas)[0];
 481     }
 482     elsif ( $count == 2 ) {
 483         if ($aas{'D'} and $aas{'N'}) {
 484             $aa = 'B';
 485         }
 486         elsif ($aas{'E'} and $aas{'Q'}) {
 487             $aa = 'Z';
 488         } else {
 489             $partial ? ($aa = '') : ($aa = 'X');
 490         }
 491     } else {
 492         $partial ? ($aa = '') :  ($aa = 'X');
 493     }
 494     return $aa;
 495 }
 496
 497 =head2 translate_strict
 498
 499  Title   : translate_strict
 500  Usage   : $obj->translate_strict('ACT')
 501  Function: returns one letter amino acid code for a codon input
 502
 503            Fast and simple translation. User is responsible to resolve
 504            ambiguous nucleotide codes before calling this
 505            method. Returns 'X' for unknown codons and an empty string
 506            for input strings that are not three characters long.
 507
 508            It is not recommended to use this method in a production
 509            environment. Use method translate, instead.
 510
 511  Example :
 512  Returns : A string
 513  Args    : a codon = a three nucleotide character string
 514
 515
 516 =cut
 517
 518 sub translate_strict{
 519    my ($self, $value) = @_;
 520    my $id = $self->{'id'};
 521
 522    $value  = lc $value;
 523    $value  =~ tr/u/t/;
 524
 525    return '' unless length $value == 3;
 526
 527    return 'X' unless defined $CODONS->{$value};
 528
 529    return substr( $TABLES[$id], $CODONS->{$value}, 1 );
 530 }
 531
 532 =head2 revtranslate
 533
 534  Title   : revtranslate
 535  Usage   : $obj->revtranslate('G')
 536  Function: returns codons for an amino acid
 537
 538            Returns an empty string for unknown amino acid
 539            codes. Ambiguous IUPAC codes Asx,B, (Asp,D; Asn,N) and
 540            Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
 541            letter amino acid codes are accepted. '*' and 'Ter' are
 542            used for terminator.
 543
 544            By default, the output codons are shown in DNA.  If the
 545            output is needed in RNA (tr/t/u/), add a second argument
 546            'RNA'.
 547
 548  Example : $obj->revtranslate('Gly', 'RNA')
 549  Returns : An array of three lower case letter strings i.e. codons
 550  Args    : amino acid, 'RNA'
 551
 552 =cut
 553
 554 sub revtranslate {
 555     my ($self, $value, $coding) = @_;
 556     my @codons;
 557
 558     if (length($value) == 3 ) {
 559         $value = lc $value;
 560         $value = ucfirst $value;
 561         $value = $THREELETTERSYMBOLS{$value};
 562     }
 563     if (    defined $value and $value =~ /$VALID_PROTEIN/
 564         and length($value) == 1
 565         ) {
 566         my $id = $self->{'id'};
 567
 568         $value = uc $value;
 569         my @aas = @{$IUPAC_AA{$value}};
 570         foreach my $aa (@aas) {
 571             #print $aa, " -2\n";
 572             $aa = '\*' if $aa eq '*';
 573             while ($TABLES[$id] =~ m/$aa/g) {
 574                 my $p = pos $TABLES[$id];
 575                 push (@codons, $TRCOL->{--$p});
 576             }
 577         }
 578     }
 579
 580     if ($coding and uc ($coding) eq 'RNA') {
 581         for my $i (0..$#codons)  {
 582             $codons[$i] =~ tr/t/u/;
 583         }
 584     }
 585
 586    return @codons;
 587 }
 588
 589 =head2 reverse_translate_all
 590
 591  Title   : reverse_translate_all
 592  Usage   : my $iup_str = $cttable->reverse_translate_all($seq_object)
 593            my $iup_str = $cttable->reverse_translate_all($seq_object,
 594                                                          $cutable,
 595                                                          15);
 596  Function: reverse translates a protein sequence into IUPAC nucleotide
 597            sequence. An 'X' in the protein sequence is converted to 'NNN'
 598            in the nucleotide sequence.
 599  Returns : a string
 600  Args    : a Bio::PrimarySeqI compatible object (mandatory)
 601            a Bio::CodonUsage::Table object and a threshold if only
 602              codons with a relative frequency above the threshold are
 603              to be considered.
 604 =cut
 605
 606 sub reverse_translate_all {
 607     my ($self, $obj, $cut, $threshold) = @_;
 608
 609     ## check args are OK
 610
 611     if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 612         $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 613                         ref($obj) . "]");
 614         }
 615     if($obj->alphabet ne 'protein') {
 616         $self->throw("Cannot reverse translate, need an amino acid sequence .".
 617                      "This sequence is of type [" . $obj->alphabet ."]");
 618         }
 619     my @data;
 620     my @seq = split '', $obj->seq;
 621
 622     ## if we're not supplying a codon usage table...
 623     if( !$cut && !$threshold) {
 624         ## get lists of possible codons for each aa.
 625         for my $aa (@seq) {
 626             if ($aa =~ /x/i) {
 627                 push @data, (['NNN']);
 628             }else {
 629                 my @cods = $self->revtranslate($aa);
 630                 push @data, \@cods;
 631             }
 632         }
 633     }else{
 634     #else we are supplying a codon usage table, we just want common codons
 635     #check args first.
 636         if(!$cut->isa('Bio::CodonUsage::Table'))    {
 637             $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 638                      ref($cut). "].");
 639             }
 640         my $cod_ref = $cut->probable_codons($threshold);
 641         for my $aa (@seq) {
 642             if ($aa =~ /x/i) {
 643                 push @data, (['NNN']);
 644                 next;
 645                 }
 646             push @data, $cod_ref->{$aa};
 647         }
 648     }
 649
 650     return $self->_make_iupac_string(\@data);
 651 }
 652
 653 =head2 reverse_translate_best
 654
 655  Title   : reverse_translate_best
 656  Usage   : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
 657  Function: Reverse translates a protein sequence into plain nucleotide
 658            sequence (GATC), uses the most common codon for each amino acid
 659  Returns : A string
 660  Args    : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
 661
 662 =cut
 663
 664 sub reverse_translate_best {
 665
 666     my ($self, $obj, $cut) = @_;
 667
 668     if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 669         $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 670                          ref($obj) . "]");
 671     }
 672     if ($obj->alphabet ne 'protein')    {
 673         $self->throw("Cannot reverse translate, need an amino acid sequence .".
 674                          "This sequence is of type [" . $obj->alphabet ."]");
 675     }
 676     if ( !$cut | !$cut->isa('Bio::CodonUsage::Table'))  {
 677         $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 678                          ref($cut). "].");
 679     }
 680
 681     my $str = '';
 682     my @seq = split '', $obj->seq;
 683
 684     my $cod_ref = $cut->most_common_codons();
 685
 686     for my $aa ( @seq ) {
 687         if ($aa =~ /x/i) {
 688             $str .= 'NNN';
 689             next;
 690         }
 691         if ( defined $cod_ref->{$aa} ) {
 692             $str .= $cod_ref->{$aa};
 693         } else {
 694             $self->throw("Input sequence contains invalid character: $aa");
 695         }
 696     }
 697    return $str;
 698 }
 699
 700 =head2 is_start_codon
 701
 702  Title   : is_start_codon
 703  Usage   : $obj->is_start_codon('ATG')
 704  Function: returns true (1) for all codons that can be used as a
 705            translation start, false (0) for others.
 706  Example : $myCodonTable->is_start_codon('ATG')
 707  Returns : boolean
 708  Args    : codon
 709
 710 =cut
 711
 712 sub is_start_codon{
 713    shift->_codon_is( shift, \@STARTS, 'M' );
 714 }
 715
 716 =head2 is_ter_codon
 717
 718  Title   : is_ter_codon
 719  Usage   : $obj->is_ter_codon('GAA')
 720  Function: returns true (1) for all codons that can be used as a
 721            translation tarminator, false (0) for others.
 722  Example : $myCodonTable->is_ter_codon('ATG')
 723  Returns : boolean
 724  Args    : codon
 725
 726 =cut
 727
 728 sub is_ter_codon{
 729    my ($self, $value) = @_;
 730    my $id = $self->{'id'};
 731
 732    # We need to ensure U is mapped to T (ie. UAG)
 733    $value = uc $value;
 734    $value =~ tr/U/T/;
 735
 736    if (length $value != 3  )  {
 737        # Incomplete codons are not stop codons
 738        return 0;
 739    } else {
 740        my $result = 0;
 741
 742        # For all the possible codons, if any are not a stop
 743        # codon, fail immediately
 744        for my $c ( $self->unambiguous_codons($value) ) {
 745            my $m = substr( $TABLES[$id], $CODONS->{$c}, 1 );
 746            if($m eq $TERMINATOR) {
 747                $result = 1;
 748            } else {
 749                return 0;
 750            }
 751        }
 752        return $result;
 753    }
 754 }
 755
 756 # desc: compares the passed value with a single entry in the given
 757 #       codon table
 758 # args: a value (typically a three-char string like 'atg'),
 759 #       a reference to the appropriate set of codon tables,
 760 #       a single-character value to check for at the position in the
 761 #       given codon table
 762 # ret:  boolean, true if the given codon table contains the $key at the
 763 #       position corresponding to $value
 764 sub _codon_is {
 765    my ($self, $value, $table, $key ) = @_;
 766
 767    return 0 unless length $value == 3;
 768
 769    $value  = lc $value;
 770    $value  =~ tr/u/t/;
 771
 772    my $id = $self->{'id'};
 773    for my $c ( $self->unambiguous_codons($value) ) {
 774        my $m = substr( $table->[$id], $CODONS->{$c}, 1 );
 775        if ($m eq $key) { return 1; }
 776    }
 777    return 0;
 778 }
 779
 780 =head2 is_unknown_codon
 781
 782  Title   : is_unknown_codon
 783  Usage   : $obj->is_unknown_codon('GAJ')
 784  Function: returns false (0) for all codons that are valid,
 785         true (1) for others.
 786  Example : $myCodonTable->is_unknown_codon('NTG')
 787  Returns : boolean
 788  Args    : codon
 789
 790
 791 =cut
 792
 793 sub is_unknown_codon{
 794    my ($self, $value) = @_;
 795    $value  = lc $value;
 796    $value  =~ tr/u/t/;
 797    return 1 unless $self->unambiguous_codons($value);
 798    return 0;
 799 }
 800
 801 =head2 unambiguous_codons
 802
 803  Title   : unambiguous_codons
 804  Usage   : @codons = $self->unambiguous_codons('ACN')
 805  Returns : array of strings (one-letter unambiguous amino acid codes)
 806  Args    : a codon = a three IUPAC nucleotide character string
 807
 808 =cut
 809
 810 sub unambiguous_codons{
 811     my ($self,$value) = @_;
 812     my @nts = map { $IUPAC_DNA{uc $_} }  split(//, $value);
 813
 814     my @codons;
 815     for my $i ( @{$nts[0]} ) {
 816     for my $j ( @{$nts[1]} ) {
 817     for my $k ( @{$nts[2]} ) {
 818         push @codons, lc "$i$j$k";
 819     }}}
 820     return @codons;
 821 }
 822
 823 =head2 _unambiquous_codons
 824
 825 deprecated, now an alias for unambiguous_codons
 826
 827 =cut
 828
 829 sub _unambiquous_codons {
 830     unambiguous_codons( undef, @_ );
 831 }
 832
 833 =head2 add_table
 834
 835  Title   : add_table
 836  Usage   : $newid = $ct->add_table($name, $table, $starts)
 837  Function: Add a custom Codon Table into the object.
 838            Know what you are doing, only the length of
 839            the argument strings is checked!
 840  Returns : the id of the new codon table
 841  Args    : name, a string, optional (can be empty)
 842            table, a string of 64 characters
 843            startcodons, a string of 64 characters, defaults to standard
 844
 845 =cut
 846
 847 sub add_table {
 848     my ($self, $name, $table, $starts) = @_;
 849
 850     $name   ||= 'Custom' . $#NAMES + 1;
 851     $starts ||= $STARTS[1];
 852     $self->throw('Suspect input!')
 853         unless length($table) == 64 and length($starts) == 64;
 854
 855     push @NAMES,  $name;
 856     push @TABLES, $table;
 857     push @STARTS, $starts;
 858
 859     return $#NAMES;
 860 }
 861
 862 sub _make_iupac_string {
 863     my ($self, $cod_ref) = @_;
 864     if(ref($cod_ref) ne 'ARRAY') {
 865         $self->throw(" I need a reference to a list of references to codons, ".
 866                      " not a [". ref($cod_ref) . "].");
 867         }
 868     my %iupac_hash   = Bio::Tools::IUPAC->iupac_rev_iub();
 869     my $iupac_string = ''; ## the string to be returned
 870     for my $aa (@$cod_ref) {
 871
 872         ## scan through codon positions, record the differing values,
 873         # then look up in the iub hash
 874         for my $index(0..2) {
 875             my %h;
 876             map { my $k = substr($_,$index,1);
 877                 $h{$k}  = undef;} @$aa;
 878             my $lookup_key = join '', sort{$a cmp $b}keys %h;
 879
 880             ## extend string
 881             $iupac_string .= $iupac_hash{uc$lookup_key};
 882         }
 883     }
 884     return $iupac_string;
 885 }
 886
 887
 888 1;