Bio/Tools/CodonTable.pm

   1 # $Id$
   2 #
   3 # bioperl module for Bio::Tools::CodonTable
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   8 #
   9 # Copyright Heikki Lehvaslaiho
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::Tools::CodonTable - Codon table object
  18
  19 =head1 SYNOPSIS
  20
  21   # This is a read-only class for all known codon tables.  The IDs are
  22   # the ones used by nucleotide sequence databases.  All common IUPAC
  23   # ambiguity codes for DNA, RNA and amino acids are recognized.
  24
  25   use Bio::Tools::CodonTable;
  26
  27   # defaults to ID 1 "Standard"
  28   $myCodonTable   = Bio::Tools::CodonTable->new();
  29   $myCodonTable2  = Bio::Tools::CodonTable->new( -id => 3 );
  30
  31   # change codon table
  32   $myCodonTable->id(5);
  33
  34   # examine codon table
  35   print  join (' ', "The name of the codon table no.", $myCodonTable->id(4),
  36                "is:", $myCodonTable->name(), "\n");
  37
  38   # print possible codon tables
  39   $tables = Bio::Tools::CodonTable->tables;
  40   while ( ($id,$name) = each %{$tables} ) {
  41     print "$id = $name\n";
  42   }
  43
  44   # translate a codon
  45   $aa = $myCodonTable->translate('ACU');
  46   $aa = $myCodonTable->translate('act');
  47   $aa = $myCodonTable->translate('ytr');
  48
  49   # reverse translate an amino acid
  50   @codons = $myCodonTable->revtranslate('A');
  51   @codons = $myCodonTable->revtranslate('Ser');
  52   @codons = $myCodonTable->revtranslate('Glx');
  53   @codons = $myCodonTable->revtranslate('cYS', 'rna');
  54
  55   # reverse translate an entire amino acid sequence into a IUPAC
  56   # nucleotide string
  57
  58   my $seqobj    = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
  59   my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
  60
  61   # boolean tests
  62   print "Is a start\n"       if $myCodonTable->is_start_codon('ATG');
  63   print "Is a terminator\n" if $myCodonTable->is_ter_codon('tar');
  64   print "Is a unknown\n"     if $myCodonTable->is_unknown_codon('JTG');
  65
  66 =head1 DESCRIPTION
  67
  68 Codon tables are also called translation tables or genetic codes
  69 since that is what they represent. A bit more complete picture
  70 of the full complexity of codon usage in various taxonomic groups
  71 is presented at the NCBI Genetic Codes Home page.
  72
  73 CodonTable is a BioPerl class that knows all current translation
  74 tables that are used by primary nucleotide sequence databases
  75 (GenBank, EMBL and DDBJ). It provides methods to output information
  76 about tables and relationships between codons and amino acids.
  77
  78 This class and its methods recognized all common IUPAC ambiguity codes
  79 for DNA, RNA and animo acids. The translation method follows the
  80 conventions in EMBL and TREMBL databases.
  81
  82 It is a nuisance to separate RNA and cDNA representations of nucleic
  83 acid transcripts. The CodonTable object accepts codons of both type as
  84 input and allows the user to set the mode for output when reverse
  85 translating. Its default for output is DNA.
  86
  87 Note:
  88
  89 This class deals primarily with individual codons and amino
  90 acids. However in the interest of speed you can L<translate>
  91 longer sequence, too. The full complexity of protein translation
  92 is tackled by L<Bio::PrimarySeqI::translate>.
  93
  94
  95 The amino acid codes are IUPAC recommendations for common amino acids:
  96
  97           A           Ala            Alanine
  98           R           Arg            Arginine
  99           N           Asn            Asparagine
 100           D           Asp            Aspartic acid
 101           C           Cys            Cysteine
 102           Q           Gln            Glutamine
 103           E           Glu            Glutamic acid
 104           G           Gly            Glycine
 105           H           His            Histidine
 106           I           Ile            Isoleucine
 107           L           Leu            Leucine
 108           K           Lys            Lysine
 109           M           Met            Methionine
 110           F           Phe            Phenylalanine
 111           P           Pro            Proline
 112           O           Pyl            Pyrrolysine (22nd amino acid)
 113           U           Sec            Selenocysteine (21st amino acid)
 114           S           Ser            Serine
 115           T           Thr            Threonine
 116           W           Trp            Tryptophan
 117           Y           Tyr            Tyrosine
 118           V           Val            Valine
 119           B           Asx            Aspartic acid or Asparagine
 120           Z           Glx            Glutamine or Glutamic acid
 121           J           Xle            Isoleucine or Valine (mass spec ambiguity)
 122           X           Xaa            Any or unknown amino acid
 123
 124
 125 It is worth noting that, "Bacterial" codon table no. 11 produces an
 126 polypeptide that is, confusingly, identical to the standard one. The
 127 only differences are in available initiator codons.
 128
 129
 130 NCBI Genetic Codes home page:
 131      http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
 132
 133 EBI Translation Table Viewer:
 134      http://www.ebi.ac.uk/cgi-bin/mutations/trtables.cgi
 135
 136 Amended ASN.1 version with ids 16 and 21 is at:
 137      ftp://ftp.ebi.ac.uk/pub/databases/geneticcode/
 138
 139 Thanks to Matteo diTomasso for the original Perl implementation
 140 of these tables.
 141
 142 =head1 FEEDBACK
 143
 144 =head2 Mailing Lists
 145
 146 User feedback is an integral part of the evolution of this and other
 147 Bioperl modules. Send your comments and suggestions preferably to the
 148 Bioperl mailing lists  Your participation is much appreciated.
 149
 150   bioperl-l@bioperl.org                  - General discussion
 151   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 152
 153 =head2 Support
 154
 155 Please direct usage questions or support issues to the mailing list:
 156
 157 L<bioperl-l@bioperl.org>
 158
 159 rather than to the module maintainer directly. Many experienced and
 160 reponsive experts will be able look at the problem and quickly
 161 address it. Please include a thorough description of the problem
 162 with code and data examples if at all possible.
 163
 164 =head2 Reporting Bugs
 165
 166 Report bugs to the Bioperl bug tracking system to help us keep track
 167 the bugs and their resolution.  Bug reports can be submitted via the
 168 web:
 169
 170   http://bugzilla.open-bio.org/
 171
 172 =head1 AUTHOR - Heikki Lehvaslaiho
 173
 174 Email:  heikki-at-bioperl-dot-org
 175
 176 =head1 APPENDIX
 177
 178 The rest of the documentation details each of the object
 179 methods. Internal methods are usually preceded with a _
 180
 181 =cut
 182
 183
 184 # Let the code begin...
 185
 186 package Bio::Tools::CodonTable;
 187 use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
 188                                 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
 189 use strict;
 190
 191 # Object preamble - inherits from Bio::Root::Root
 192 use Bio::Tools::IUPAC;
 193 use Bio::SeqUtils;
 194
 195 use base qw(Bio::Root::Root);
 196
 197
 198 # first set internal values for all translation tables
 199
 200 BEGIN {
 201     use constant CODONSIZE => 3;
 202     $GAP = '-';
 203     $CODONGAP = $GAP x CODONSIZE;
 204
 205     @NAMES =                    #id
 206         (
 207          'Standard',            #1
 208          'Vertebrate Mitochondrial',#2
 209          'Yeast Mitochondrial',# 3
 210          'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',#4
 211          'Invertebrate Mitochondrial',#5
 212          'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
 213          '', '',
 214          'Echinoderm Mitochondrial',#9
 215          'Euplotid Nuclear',#10
 216          '"Bacterial"',# 11
 217          'Alternative Yeast Nuclear',# 12
 218          'Ascidian Mitochondrial',# 13
 219          'Flatworm Mitochondrial',# 14
 220          'Blepharisma Nuclear',# 15
 221          'Chlorophycean Mitochondrial',# 16
 222          '', '',  '', '',
 223          'Trematode Mitochondrial',# 21
 224          'Scenedesmus obliquus Mitochondrial', #22
 225          'Thraustochytrium Mitochondrial' #23
 226          );
 227
 228     @TABLES =
 229         qw(
 230            FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 231            FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
 232            FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 233            FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 234            FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
 235            FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 236            '' ''
 237            FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 238            FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 239            FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 240            FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 241            FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
 242            FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 243            FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 244            FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 245            '' '' '' ''
 246            FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 247            FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 248            FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 249            );
 250
 251
 252     @STARTS =
 253         qw(
 254            ---M---------------M---------------M----------------------------
 255            --------------------------------MMMM---------------M------------
 256            ----------------------------------MM----------------------------
 257            --MM---------------M------------MMMM---------------M------------
 258            ---M----------------------------MMMM---------------M------------
 259            -----------------------------------M----------------------------
 260            '' ''
 261            -----------------------------------M----------------------------
 262            -----------------------------------M----------------------------
 263            ---M---------------M------------MMMM---------------M------------
 264            -------------------M---------------M----------------------------
 265            -----------------------------------M----------------------------
 266            -----------------------------------M----------------------------
 267            -----------------------------------M----------------------------
 268            -----------------------------------M----------------------------
 269            '' ''  '' ''
 270            -----------------------------------M---------------M------------
 271            -----------------------------------M----------------------------
 272            --------------------------------M--M---------------M------------
 273            );
 274
 275     my @nucs = qw(t c a g);
 276     my $x = 0;
 277     ($CODONS, $TRCOL) = ({}, {});
 278     for my $i (@nucs) {
 279         for my $j (@nucs) {
 280             for my $k (@nucs) {
 281                 my $codon = "$i$j$k";
 282                 $CODONS->{$codon} = $x;
 283                 $TRCOL->{$x} = $codon;
 284                 $x++;
 285             }
 286         }
 287     }
 288     %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
 289     %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
 290     %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
 291     $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
 292     $TERMINATOR = '*';
 293 }
 294
 295 sub new {
 296     my($class,@args) = @_;
 297     my $self = $class->SUPER::new(@args);
 298
 299     my($id) =
 300         $self->_rearrange([qw(ID
 301                              )],
 302                          @args);
 303
 304     $id = 1 if ( ! $id );
 305     $id  && $self->id($id);
 306     return $self; # success - we hope!
 307 }
 308
 309 =head2 id
 310
 311  Title   : id
 312  Usage   : $obj->id(3); $id_integer = $obj->id();
 313  Function: Sets or returns the id of the translation table.  IDs are
 314            integers from 1 to 15, excluding 7 and 8 which have been
 315            removed as redundant. If an invalid ID is given the method
 316            returns 0, false.
 317  Example :
 318  Returns : value of id, a scalar, 0 if not a valid
 319  Args    : newvalue (optional)
 320
 321 =cut
 322
 323 sub id{
 324    my ($self,$value) = @_;
 325    if( defined $value) {
 326        if (  !(defined $TABLES[$value-1]) or $TABLES[$value-1] eq '') {
 327            $self->warn("Not a valid codon table ID [$value] ");
 328            $value = 0;
 329        }
 330        $self->{'id'} = $value;
 331    }
 332    return $self->{'id'};
 333 }
 334
 335 =head2 name
 336
 337  Title   : name
 338  Usage   : $obj->name()
 339  Function: returns the descriptive name of the translation table
 340  Example :
 341  Returns : A string
 342  Args    : None
 343
 344
 345 =cut
 346
 347 sub name{
 348    my ($self) = @_;
 349
 350    my ($id) = $self->{'id'};
 351    return $NAMES[$id-1];
 352 }
 353
 354 =head2 tables
 355
 356  Title   : tables
 357  Usage   : $obj->tables()  or  Bio::Tools::CodonTable->tables()
 358  Function: returns a hash reference where each key is a valid codon
 359            table id() number, and each value is the corresponding
 360            codon table name() string
 361  Example :
 362  Returns : A hashref
 363  Args    : None
 364
 365
 366 =cut
 367
 368 sub tables{
 369   my %tables;
 370   for my $id (1 .. @NAMES) {
 371     my $name = $NAMES[$id-1];
 372     $tables{$id} = $name if $name;
 373   }
 374   return \%tables;
 375 }
 376
 377 =head2 translate
 378
 379  Title   : translate
 380  Usage   : $obj->translate('YTR')
 381  Function: Returns a string of one letter amino acid codes from
 382            nucleotide sequence input. The imput can be of any length.
 383
 384            Returns 'X' for unknown codons and codons that code for
 385            more than one amino acid. Returns an empty string if input
 386            is not three characters long. Exceptions for these are:
 387
 388              - IUPAC amino acid code B for Aspartic Acid and
 389                Asparagine, is used.
 390              - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
 391                used.
 392              - if the codon is two nucleotides long and if by adding
 393                an a third character 'N', it codes for a single amino
 394                acid (with exceptions above), return that, otherwise
 395                return empty string.
 396
 397            Returns empty string for other input strings that are not
 398            three characters long.
 399
 400  Example :
 401  Returns : a string of one letter ambiguous IUPAC amino acid codes
 402  Args    : ambiguous IUPAC nucleotide string
 403
 404
 405 =cut
 406
 407 sub translate {
 408     my ($self, $seq) = @_;
 409     $self->throw("Calling translate without a seq argument!") unless defined $seq;
 410     return '' unless $seq;
 411
 412     my $id = $self->id;
 413     my ($partial) = 0;
 414     $partial = 2 if length($seq) % CODONSIZE == 2;
 415
 416     $seq = lc $seq;
 417     $seq =~ tr/u/t/;
 418     my $protein = "";
 419     if ($seq =~ /[^actg]/ ) { #ambiguous chars
 420         for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) {
 421             my $triplet = substr($seq, $i, CODONSIZE);
 422             if( $triplet eq $CODONGAP ) {
 423                 $protein .= $GAP;
 424             } elsif (exists $CODONS->{$triplet}) {
 425                 $protein .= substr($TABLES[$id-1],
 426                                    $CODONS->{$triplet},1);
 427             } else {
 428                 $protein .= $self->_translate_ambiguous_codon($triplet);
 429             }
 430         }
 431     } else { # simple, strict translation
 432         for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) {
 433             my $triplet = substr($seq, $i, CODONSIZE);
 434             if( $triplet eq $CODONGAP ) {
 435                 $protein .= $GAP;
 436             } if (exists $CODONS->{$triplet}) {
 437                 $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1);
 438             } else {
 439                 $protein .= 'X';
 440             }
 441         }
 442     }
 443     if ($partial == 2) { # 2 overhanging nucleotides
 444         my $triplet = substr($seq, ($partial -4)). "n";
 445         if( $triplet eq $CODONGAP ) {
 446             $protein .= $GAP;
 447         } elsif (exists $CODONS->{$triplet}) {
 448             my $aa = substr($TABLES[$id-1], $CODONS->{$triplet},1);
 449             $protein .= $aa;
 450         } else {
 451             $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
 452         }
 453     }
 454     return $protein;
 455 }
 456
 457 sub _translate_ambiguous_codon {
 458     my ($self, $triplet, $partial) = @_;
 459     $partial ||= 0;
 460     my $id = $self->id;
 461     my $aa;
 462     my @codons = _unambiquous_codons($triplet);
 463     my %aas =();
 464     foreach my $codon (@codons) {
 465         $aas{substr($TABLES[$id-1],$CODONS->{$codon},1)} = 1;
 466     }
 467     my $count = scalar keys %aas;
 468     if ( $count == 1 ) {
 469         $aa = (keys %aas)[0];
 470     }
 471     elsif ( $count == 2 ) {
 472         if ($aas{'D'} and $aas{'N'}) {
 473             $aa = 'B';
 474         }
 475         elsif ($aas{'E'} and $aas{'Q'}) {
 476             $aa = 'Z';
 477         } else {
 478             $partial ? ($aa = '') : ($aa = 'X');
 479         }
 480     } else {
 481         $partial ? ($aa = '') :  ($aa = 'X');
 482     }
 483     return $aa;
 484 }
 485
 486 =head2 translate_strict
 487
 488  Title   : translate_strict
 489  Usage   : $obj->translate_strict('ACT')
 490  Function: returns one letter amino acid code for a codon input
 491
 492            Fast and simple translation. User is responsible to resolve
 493            ambiguous nucleotide codes before calling this
 494            method. Returns 'X' for unknown codons and an empty string
 495            for input strings that are not three characters long.
 496
 497            It is not recommended to use this method in a production
 498            environment. Use method translate, instead.
 499
 500  Example :
 501  Returns : A string
 502  Args    : a codon = a three nucleotide character string
 503
 504
 505 =cut
 506
 507 sub translate_strict{
 508    my ($self, $value) = @_;
 509    my ($id) = $self->{'id'};
 510
 511    $value  = lc $value;
 512    $value  =~ tr/u/t/;
 513
 514    if (length $value != 3 ) {
 515        return '';
 516    }
 517    elsif (!(defined $CODONS->{$value}))  {
 518        return 'X';
 519    }
 520    else {
 521        return substr($TABLES[$id-1],$CODONS->{$value},1);
 522    }
 523 }
 524
 525 =head2 revtranslate
 526
 527  Title   : revtranslate
 528  Usage   : $obj->revtranslate('G')
 529  Function: returns codons for an amino acid
 530
 531            Returns an empty string for unknown amino acid
 532            codes. Ambiquous IUPAC codes Asx,B, (Asp,D; Asn,N) and
 533            Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
 534            letter amino acid codes are accepted. '*' and 'Ter' are
 535            used for terminator.
 536
 537            By default, the output codons are shown in DNA.  If the
 538            output is needed in RNA (tr/t/u/), add a second argument
 539            'RNA'.
 540
 541  Example : $obj->revtranslate('Gly', 'RNA')
 542  Returns : An array of three lower case letter strings i.e. codons
 543  Args    : amino acid, 'RNA'
 544
 545 =cut
 546
 547 sub revtranslate {
 548         my ($self, $value, $coding) = @_;
 549         my ($id) = $self->{'id'};
 550         my (@aas,  $p);
 551         my (@codons) = ();
 552
 553         if (length($value) == 3 ) {
 554                 $value = lc $value;
 555                 $value = ucfirst $value;
 556                 $value = $THREELETTERSYMBOLS{$value};
 557         }
 558         if ( defined $value and $value =~ /$VALID_PROTEIN/
 559                   and length($value) == 1 ) {
 560                 $value = uc $value;
 561                 @aas = @{$IUPAC_AA{$value}};
 562                 foreach my $aa (@aas) {
 563                         #print $aa, " -2\n";
 564                         $aa = '\*' if $aa eq '*';
 565               while ($TABLES[$id-1] =~ m/$aa/g) {
 566                       $p = pos $TABLES[$id-1];
 567                       push (@codons, $TRCOL->{--$p});
 568               }
 569             }
 570     }
 571
 572    if ($coding and uc ($coding) eq 'RNA') {
 573            for my $i (0..$#codons)  {
 574               $codons[$i] =~ tr/t/u/;
 575            }
 576    }
 577
 578    return @codons;
 579 }
 580
 581 =head2 reverse_translate_all
 582
 583  Title   : reverse_translate_all
 584  Usage   : my $iup_str = $cttable->reverse_translate_all($seq_object)
 585            my $iup_str = $cttable->reverse_translate_all($seq_object,
 586                                                          $cutable,
 587                                                          15);
 588  Function: reverse translates a protein sequence into IUPAC nucleotide
 589            sequence. An 'X' in the protein sequence is converted to 'NNN'
 590            in the nucleotide sequence.
 591  Returns : a string
 592  Args    : a Bio::PrimarySeqI compatible object (mandatory)
 593            a Bio::CodonUsage::Table object and a threshold if only
 594              codons with a relative frequency above the threshold are
 595              to be considered.
 596 =cut
 597
 598 sub reverse_translate_all {
 599
 600         my ($self, $obj, $cut, $threshold) = @_;
 601
 602         ## check args are OK
 603
 604         if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 605                 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 606                                                 ref($obj) . "]");
 607                 }
 608         if($obj->alphabet ne 'protein') {
 609                 $self->throw("Cannot reverse translate, need an amino acid sequence .".
 610                      "This sequence is of type [" . $obj->alphabet ."]");
 611                 }
 612         my @data;
 613         my @seq = split '', $obj->seq;
 614
 615         ## if we're not supplying a codon usage table...
 616         if( !$cut && !$threshold) {
 617                 ## get lists of possible codons for each aa.
 618                 for my $aa (@seq) {
 619                         if ($aa =~ /x/i) {
 620                                 push @data, (['NNN']);
 621                         }else {
 622                                 my @cods = $self->revtranslate($aa);
 623                                 push @data, \@cods;
 624                         }
 625                 }
 626         }else{
 627         #else we are supplying a codon usage table, we just want common codons
 628         #check args first.
 629                 if(!$cut->isa('Bio::CodonUsage::Table'))        {
 630                         $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 631                      ref($cut). "].");
 632                         }
 633                 my $cod_ref = $cut->probable_codons($threshold);
 634                 for my $aa (@seq) {
 635                         if ($aa =~ /x/i) {
 636                                 push @data, (['NNN']);
 637                                 next;
 638                                 }
 639                         push @data, $cod_ref->{$aa};
 640                 }
 641         }
 642
 643         return $self->_make_iupac_string(\@data);
 644
 645 }
 646
 647 =head2 reverse_translate_best
 648
 649  Title   : reverse_translate_best
 650  Usage   : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
 651  Function: Reverse translates a protein sequence into plain nucleotide
 652            sequence (GATC), uses the most common codon for each amino acid
 653  Returns : A string
 654  Args    : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
 655
 656 =cut
 657
 658 sub reverse_translate_best {
 659
 660         my ($self, $obj, $cut) = @_;
 661
 662         if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 663                 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 664                                                  ref($obj) . "]");
 665         }
 666         if ($obj->alphabet ne 'protein')        {
 667                 $self->throw("Cannot reverse translate, need an amino acid sequence .".
 668                                                  "This sequence is of type [" . $obj->alphabet ."]");
 669         }
 670         if ( !$cut | !$cut->isa('Bio::CodonUsage::Table'))      {
 671                 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 672                                                  ref($cut). "].");
 673         }
 674
 675         my $str = '';
 676         my @seq = split '', $obj->seq;
 677
 678         my $cod_ref = $cut->most_common_codons();
 679
 680         for my $aa ( @seq ) {
 681                 if ($aa =~ /x/i) {
 682                         $str .= 'NNN';
 683                         next;
 684                 }
 685                 if ( defined $cod_ref->{$aa} ) {
 686                         $str .= $cod_ref->{$aa};
 687                 } else {
 688                         $self->throw("Input sequence contains invalid character: $aa");
 689                 }
 690         }
 691    $str;
 692 }
 693
 694 =head2 is_start_codon
 695
 696  Title   : is_start_codon
 697  Usage   : $obj->is_start_codon('ATG')
 698  Function: returns true (1) for all codons that can be used as a
 699            translation start, false (0) for others.
 700  Example : $myCodonTable->is_start_codon('ATG')
 701  Returns : boolean
 702  Args    : codon
 703
 704 =cut
 705
 706 sub is_start_codon{
 707    my ($self, $value) = @_;
 708    my ($id) = $self->{'id'};
 709
 710    $value  = lc $value;
 711    $value  =~ tr/u/t/;
 712
 713    if (length $value != 3  )  {
 714        return 0;
 715    }
 716    else {
 717        my $result = 1;
 718        my @ms = map { substr($STARTS[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value);
 719        foreach my $c (@ms) {
 720            $result = 0 if $c ne 'M';
 721        }
 722        return $result;
 723    }
 724 }
 725
 726
 727
 728 =head2 is_ter_codon
 729
 730  Title   : is_ter_codon
 731  Usage   : $obj->is_ter_codon('GAA')
 732  Function: returns true (1) for all codons that can be used as a
 733            translation tarminator, false (0) for others.
 734  Example : $myCodonTable->is_ter_codon('ATG')
 735  Returns : boolean
 736  Args    : codon
 737
 738 =cut
 739
 740 sub is_ter_codon{
 741    my ($self, $value) = @_;
 742    my ($id) = $self->{'id'};
 743
 744    $value  = lc $value;
 745    $value  =~ tr/u/t/;
 746
 747    if (length $value != 3  )  {
 748        return 0;
 749    }
 750    else {
 751        my $result = 1;
 752        my @ms = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value);
 753        foreach my $c (@ms) {
 754            $result = 0 if $c ne $TERMINATOR;
 755        }
 756        return $result;
 757    }
 758 }
 759
 760 =head2 is_unknown_codon
 761
 762  Title   : is_unknown_codon
 763  Usage   : $obj->is_unknown_codon('GAJ')
 764  Function: returns false (0) for all codons that are valid,
 765             true (1) for others.
 766  Example : $myCodonTable->is_unknown_codon('NTG')
 767  Returns : boolean
 768  Args    : codon
 769
 770
 771 =cut
 772
 773 sub is_unknown_codon{
 774    my ($self, $value) = @_;
 775    my ($id) = $self->{'id'};
 776
 777    $value  = lc $value;
 778    $value  =~ tr/u/t/;
 779
 780    if (length $value != 3  )  {
 781        return 1;
 782    }
 783    else {
 784        my $result = 0;
 785        my @cs = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value);
 786        $result = 1 if scalar @cs == 0;
 787        return $result;
 788    }
 789 }
 790
 791 =head2 _unambiquous_codons
 792
 793  Title   : _unambiquous_codons
 794  Usage   : @codons = _unambiquous_codons('ACN')
 795  Function:
 796  Example :
 797  Returns : array of strings (one letter unambiguous amino acid codes)
 798  Args    : a codon = a three IUPAC nucleotide character string
 799
 800 =cut
 801
 802 sub _unambiquous_codons{
 803     my ($value) = @_;
 804     my @nts = ();
 805     my @codons = ();
 806     my ($i, $j, $k);
 807     @nts = map { $IUPAC_DNA{uc $_} }  split(//, $value);
 808     for my $i (@{$nts[0]}) {
 809         for my $j (@{$nts[1]}) {
 810             for my $k (@{$nts[2]}) {
 811                 push @codons, lc "$i$j$k";
 812             }
 813         }
 814     }
 815     return @codons;
 816 }
 817
 818 =head2 add_table
 819
 820  Title   : add_table
 821  Usage   : $newid = $ct->add_table($name, $table, $starts)
 822  Function: Add a custom Codon Table into the object.
 823            Know what you are doing, only the length of
 824            the argument strings is checked!
 825  Returns : the id of the new codon table
 826  Args    : name, a string, optional (can be empty)
 827            table, a string of 64 characters
 828            startcodons, a string of 64 characters, defaults to standard
 829
 830 =cut
 831
 832 sub add_table {
 833     my ($self, $name, $table, $starts) = @_;
 834
 835     $name ||= 'Custom'. scalar @NAMES + 1;
 836     $starts ||= $STARTS[0];
 837     $self->throw('Suspect input!')
 838         unless length($table) == 64 and length($starts) == 64;
 839
 840     push @NAMES, $name;
 841     push @TABLES, $table;
 842     push @STARTS, $starts;
 843
 844     return scalar @NAMES;
 845
 846 }
 847
 848 sub _make_iupac_string {
 849
 850         my ($self, $cod_ref) = @_;
 851         if(ref($cod_ref) ne 'ARRAY') {
 852                 $self->throw(" I need a reference to a list of references to codons, ".
 853                                          " not a [". ref($cod_ref) . "].");
 854                 }
 855     my %iupac_hash   = Bio::Tools::IUPAC->iupac_rev_iub();
 856         my $iupac_string = ''; ## the string to be returned
 857         for my $aa (@$cod_ref) {
 858
 859                 ## scan through codon positions, record the differing values,
 860                 # then look up in the iub hash
 861                 for my $index(0..2) {
 862                         my %h;
 863                         map { my $k = substr($_,$index,1);
 864                                 $h{$k}  = undef;} @$aa;
 865                         my $lookup_key = join '', sort{$a cmp $b}keys %h;
 866
 867             ## extend string
 868                         $iupac_string .= $iupac_hash{uc$lookup_key};
 869                 }
 870         }
 871     return $iupac_string;
 872
 873 }
 874
 875
 876 1;