3 # BioPerl module for Bio::CodonUsage::Table
5 # Cared for by Richard Adams (richard.adams@ed.ac.uk)
7 # Copyright Richard Adams
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::CodonUsage::Table - for access to the Codon usage Database
16 at http://www.kazusa.or.jp/codon.
20 use Bio::CodonUsage::Table;
22 use Bio::CodonUsage::IO;
23 use Bio::Tools::SeqStats;
25 # Get a codon usage table from web database
26 my $cdtable = Bio::DB::CUTG->new(-sp => 'Mus musculus',
30 my $io = Bio::CodonUsage::IO->new(-file => "file");
31 my $cdtable = $io->next_data();
33 # Or create your own from a Bio::PrimarySeq compliant object,
34 # $codonstats is a ref to a hash of codon name /count key-value pairs
35 my $codonstats = Bio::Tools::SeqStats->count_codons($Seq_objct);
37 # '-data' must be specified, '-species' and 'genetic_code' are optional
38 my $CUT = Bio::CodonUsage::Table->new(-data => $codonstats,
39 -species => 'Hsapiens_kinase');
41 print "leu frequency is ", $cdtable->aa_frequency('LEU'), "\n";
42 print "freq of ATG is ", $cdtable->codon_rel_frequency('ttc'), "\n";
43 print "abs freq of ATG is ", $cdtable->codon_abs_frequency('ATG'), "\n";
44 print "number of ATG codons is ", $cdtable->codon_count('ATG'), "\n";
45 print "GC content at position 1 is ", $cdtable->get_coding_gc('1'), "\n";
46 print "total CDSs for Mus musculus is ", $cdtable->cds_count(), "\n";
51 This class provides methods for accessing codon usage table data.
53 All of the methods at present are simple look-ups of the table or are
54 derived from simple calculations from the table. Future methods could
55 include measuring the codon usage of a sequence , for example, or
56 provide methods for examining codon usage in alignments.
60 L<Bio::Tools::CodonTable>,
62 L<Bio::CodonUsage::IO>,
70 User feedback is an integral part of the evolution of this and other
71 Bioperl modules. Send your comments and suggestions preferably to one
72 of the Bioperl mailing lists. Your participation is much appreciated.
74 bioperl-l@bioperl.org - General discussion
75 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
83 http://bugzilla.open-bio.org/
87 Richard Adams, Richard.Adams@ed.ac.uk
91 The rest of the documentation details each of the object
92 methods. Internal methods are usually preceded with a _
97 # Let the code begin...
99 package Bio
::CodonUsage
::Table
;
101 use vars
qw(%STRICTAA @AA);
103 use Bio::Tools::CodonTable;
105 use base qw(Bio::Root::Root);
108 @AA = qw(A C D E F G H I K L M N P Q R S T V W Y *);
109 map {$STRICTAA{$_} = undef} @AA;
115 Usage : my $cut = Bio::CodonUsage::Table->new(-data => $cut_hash_ref,
116 -species => 'H.sapiens_kinase'
118 Returns : a reference to a new Bio::CodonUsage::Table object
119 Args : none or a reference to a hash of codon counts. This constructor is
120 designed to be compatible with the output of
121 Bio::Tools::SeqUtils::count_codons()
122 Species and genetic code parameters can be entered here or via the
123 species() and genetic_code() methods separately.
128 my ($class, @args) = @_;
129 my $self= $class->SUPER::new
(@args);
131 $self->_rearrange([qw(DATA)], @args);
132 shift @args; # get rid of key
133 my $arg = shift @args;
134 $self->throw("need a hash reference, not a [" . ref($arg). "] reference") if ref($arg) ne 'HASH';
135 ### flags to detect argument type, can be either to start with ##
136 my $is_codon_hash = 1;
138 for my $k (keys %$arg) {
141 if (!exists($STRICTAA{$k}) ){
144 elsif ($k =~ /[^ATCGatcg]/) {
148 if (!$is_codon_hash && !$is_Aa_hash) {
149 $self->throw(" invalid key values in CUT hash - must be unique aa or nucleotide identifiers");
151 elsif ($is_Aa_hash) {
152 $self->_init_from_aa($arg);
154 elsif($is_codon_hash) {
155 $self->_init_from_cod($arg);
158 my $key = shift @args;
159 $key =~ s/\-(\w+)/\L$1/;
161 $self->$key(shift @args);
168 =head2 all_aa_frequencies
170 Title : all_aa_frequencies
171 Usage : my $freq = $cdtable->all_aa_frequencies();
172 Returns : a reference to a hash where each key is an amino acid
173 and each value is its frequency in all proteins in that
179 sub all_aa_frequencies
{
182 for my $aa (keys %STRICTAA) {
183 my $freq = $self->aa_frequency($aa);
184 $aa_freqs{$aa} = $freq;
189 =head2 codon_abs_frequency
191 Title : codon_abs_frequency
192 Usage : my $freq = $cdtable->codon_abs_frequency('CTG');
193 Purpose : To return the frequency of that codon as a percentage
194 of all codons in the organism.
195 Returns : a percentage frequency
196 Args : a non-ambiguous codon string
200 sub codon_abs_frequency
{
203 if ($self->_check_codon($cod)) {
204 my $ctable = Bio
::Tools
::CodonTable
->new;
205 $ctable->id($self->genetic_code() );
206 my $aa =$Bio::SeqUtils
::THREECODE
{$ctable->translate($cod)};
208 return $self->{'_table'}{$aa}{$cod}{'per1000'}/10 ;
213 =head2 codon_rel_frequency
215 Title : codon_rel_frequency
216 Usage : my $freq = $cdtable->codon_rel_frequency('CTG');
217 Purpose : To return the frequency of that codon as a percentage
218 of codons coding for the same amino acid. E.g., ATG and TGG
219 would return 100 as those codons are unique.
220 Returns : a percentage frequency
221 Args : a non-ambiguous codon string
226 sub codon_rel_frequency
{
229 if ($self->_check_codon($cod)) {
230 my $ctable = Bio
::Tools
::CodonTable
->new;
231 $ctable->id($self->genetic_code () );
232 my $aa =$Bio::SeqUtils
::THREECODE
{$ctable->translate($cod)};
233 return $self->{'_table'}{$aa}{$cod}{'rel_freq'};
240 =head2 probable_codons
242 Title : probable_codons
243 Usage : my $prob_codons = $cd_table->probable_codons(10);
244 Purpose : to obtain a list of codons for the amino acid above a given
245 threshold % relative frequency
246 Returns : A reference to a hash where keys are 1 letter amino acid codes
247 and values are references to arrays of codons whose frequency
248 is above the threshold.
249 Arguments: a minimum threshold frequency
253 sub probable_codons
{
254 my ($self, $threshold) = @_;
255 if (!$threshold || $threshold < 0 || $threshold > 100) {
256 $self->throw(" I need a threshold percentage ");
259 for my $a(keys %STRICTAA) {
261 my $aa =$Bio::SeqUtils
::THREECODE
{$a};
262 for my $codon (keys %{ $self->{'_table'}{$aa}}) {
263 if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $threshold/100){
264 push @common_codons, $codon;
267 $return_hash{$a} = \
@common_codons;
269 ## check to make sure that all codons are populated ##
270 if (grep{scalar @
{$return_hash{$_}} == 0} keys %return_hash) {
271 $self->warn("Threshold is too high, some amino acids do not" .
272 " have any codon above the threshold!");
274 return \
%return_hash;
278 =head2 most_common_codons
280 Title : most_common_codons
281 Usage : my $common_codons = $cd_table->most_common_codons();
282 Purpose : To obtain the most common codon for a given amino acid
283 Returns : A reference to a hash where keys are 1 letter amino acid codes
284 and the values are the single most common codons for those amino acids
289 sub most_common_codons
{
294 for my $a ( keys %STRICTAA ) {
296 my $common_codon = '';
298 my $aa = $Bio::SeqUtils
::THREECODE
{$a};
300 if ( ! defined $self->{'_table'}{$aa} ) {
301 $self->warn("Amino acid $aa ($a) does not have any codons!");
305 for my $codon ( keys %{ $self->{'_table'}{$aa}} ) {
306 if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $rel_freq ){
307 $common_codon = $codon;
308 $rel_freq = $self->{'_table'}{$aa}{$codon}{'rel_freq'};
311 $return_hash{$a} = $common_codon;
314 return \
%return_hash;
320 Usage : my $count = $cdtable->codon_count('CTG');
321 Purpose : To obtain the absolute number of the codons in the
324 Args : a non-ambiguous codon string
333 if ($self->_check_codon($cod)) {
334 my $ctable = Bio
::Tools
::CodonTable
->new;
335 $ctable->id($self->genetic_code());
336 my $aa =$Bio::SeqUtils
::THREECODE
{$ctable->translate($cod)};
337 return $self->{'_table'}{$aa}{$cod}{'abs_count'};
342 $self->warn(" need to give a codon sequence as a parameter ");
350 Title : get_coding_gc
351 Usage : my $count = $cdtable->get_coding_gc(1);
352 Purpose : To return the percentage GC composition for the organism at
353 codon positions 1,2 or 3, or an average for all coding sequence
355 Returns : a number (%-age GC content) or 0 if these fields are undefined
356 Args : 1,2,3 or 'all'.
363 $self->warn(" no parameters supplied must be a codon position (1,2,3) or 'all'");
368 ##return request if valid ##
369 if ( exists($self->{'_coding_gc'}{$n} ) ) {
370 return sprintf("%.2f", $self->{'_coding_gc'}{$n});
372 ##else return 'all' value if exists
373 elsif (exists($self->{'_coding_gc'}{'all'} )) {
374 $self->warn("coding gc doesn't have value for [$n], returning gc content for all CDSs");
375 return sprintf("%.2f", $self->{'_coding_gc'}{'all'});
379 $self->warn("coding gc values aren't defined, returning 0");
389 Title : set_coding_gc
390 Usage : my $count = $cdtable->set_coding_gc(-1=>55.78);
391 Purpose : To set the percentage GC composition for the organism at
392 codon positions 1,2 or 3, or an average for all coding sequence
395 Args : a hash where the key must be 1,2,3 or 'all' and the value the %age GC
396 at that codon position..
401 my ($self, $key, $value) = @_;
402 my @allowed = qw(1 2 3 all);
404 if (!grep {$key eq $_} @allowed ) {
405 $self->warn ("invalid key! - must be one of [ ". (join " ", @allowed) . "]");
408 $self->{'_coding_gc'}{$key} = $value;
416 Usage : my $sp = $cut->species();
417 Purpose : Get/setter for species name of codon table
418 Returns : Void or species name string
419 Args : None or species name string
426 $self->{'_species'} = shift;
428 return $self->{'_species'} || "unknown";
434 Usage : my $sp = $cut->genetic_code();
435 Purpose : Get/setter for genetic_code name of codon table
436 Returns : Void or genetic_code id, 1 by default
437 Args : None or genetic_code id, 1 by default if invalid argument.
445 if ($val < 0 || $val >16 || $val =~ /[^\d]/
446 || $val ==7 || $val ==8) {
447 $self->warn ("invalid genetic code - must be 1-16 but not 7 or 8,setting to default [1]");
448 $self->{'_genetic_code'} = 1;
451 $self->{'_genetic_code'} = shift;
454 return $self->{'_genetic_code'} || 1;
460 Usage : my $count = $cdtable->cds_count();
461 Purpose : To retrieve the total number of CDSs used to generate the Codon Table
464 Args : none (if retrieving the value) or an integer( if setting ).
473 $self->warn("can't have negative count initializing to 1");
474 $self->{'_cds_count'} = 0.00;
477 $self->{'_cds_count'} = $val;
480 $self->warn("cds_count value is undefined, returning 0")
481 if !exists($self->{'_cds_count'});
483 return $self->{'_cds_count'} || 0.00;
489 Usage : my $freq = $cdtable->aa_frequency('Leu');
490 Purpose : To retrieve the frequency of an amino acid in the organism
491 Returns : a percentage
492 Args : a 1 letter or 3 letter string representing the amino acid
502 ## deal with cases ##
504 $aa =~ s/^(\w)/\U$1/;
505 if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils
::ONECODE
{$aa}) ) {
506 $self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
509 #translate to 3 letter code for Ctable #
510 my $aa3 = $Bio::SeqUtils
::THREECODE
{$aa} || $aa;
512 ## return % of all amino acids in organism ##
514 map {$freq += $self->{'_table'}{$aa3}{$_}{'per1000'} } keys %{$self->{'_table'}{$aa3}};
515 return sprintf("%.2f", $freq/10);
521 Usage : my $freq = $cdtable->common_codon('Leu');
522 Purpose : To retrieve the frequency of the most common codon of that aa
523 Returns : a percentage
524 Args : a 1 letter or 3 letter string representing the amino acid
532 $aa =~ s/^(\w)/\U$1/;
534 if ($self->_check_aa($aa)) {
535 my $aa3 = $Bio::SeqUtils
::THREECODE
{$aa} ;
538 for my $cod (keys %{$self->{'_table'}{$aa3}}) {
539 $max = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} > $max) ?
540 $self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$max;
549 Usage : my $freq = $cdtable->rare_codon('Leu');
550 Purpose : To retrieve the frequency of the least common codon of that aa
551 Returns : a percentage
552 Args : a 1 letter or 3 letter string representing the amino acid
559 $aa =~ s/^(\w)/\U$1/;
560 if ($self->_check_aa($aa)) {
561 my $aa3 = $Bio::SeqUtils
::THREECODE
{$aa};
564 for my $cod (keys %{$self->{'_table'}{$aa3}}) {
565 $min = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} < $min) ?
566 $self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$min;
575 ## internal sub that checks a codon is correct format
577 my ($self, $aa ) = @_;
578 if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils
::ONECODE
{$aa}) ) {
579 $self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
588 my ($self, $cod) = @_;
589 if ($cod =~ /[^ATCG]/ || $cod !~ /\w\w\w/) {
590 $self->warn(" impossible codon - must be 3 letters and just containing ATCG");
597 ## make hash based on aa and then send to _init_from_aa
598 my ($self, $ref) = @_;
599 my $ct = Bio
::Tools
::CodonTable
->new();
601 for my $codon(keys %$ref ) {
602 my $aa = $ct->translate($codon);
603 $aa_hash{$aa}{$codon} = $ref->{$codon};
605 $self->_init_from_aa(\
%aa_hash);
610 my ($self, $ref) = @_;
611 ## abs counts and count codons
612 my $total_codons = 0;
614 map{$threeletter{$Bio::SeqUtils
::THREECODE
{$_}} = $ref->{$_} } keys %$ref;
615 $ref = \
%threeletter;
616 for my $aa (keys %$ref) {
617 for my $cod(keys %{$ref->{$aa}} ) {
618 $self->{'_table'}{$aa}{$cod}{'abs_count'} = $ref->{$aa}{$cod};
619 $total_codons += $ref->{$aa}{$cod};
623 ## now calculate abs codon frequencies
624 for my $aa (keys %$ref) {
625 for my $cod(keys %{$ref->{$aa}} ) {
626 $self->{'_table'}{$aa}{$cod}{'per1000'} =
627 sprintf("%.2f",$ref->{$aa}{$cod} /$total_codons * 1000) ;
630 ## now calculate rel codon_frequencies
631 for my $aa (keys %$ref) {
633 map{$aa_freq += $ref->{$aa}{$_} }
635 for my $cod(keys %{$ref->{$aa}} ) {
636 $self->{'_table'}{$aa}{$cod}{'rel_freq'}=
637 sprintf("%.2f",$ref->{$aa}{$cod}/ $aa_freq );
642 ## now calculate gc fields
644 for my $aa (keys %$ref) {
645 for my $cod(keys %{$ref->{$aa}} ) {
646 for my $index (qw(1 2 3) ) {
647 if (substr ($cod, $index -1, 1) =~ /g|c/oi) {
648 $GC{$index} += $ref->{$aa}{$cod};
654 map{$tot += $GC{$_}} qw(1 2 3);
655 $self->set_coding_gc('all', $tot/(3 *$total_codons) * 100);
656 map{$self->set_coding_gc($_,$GC{$_}/$total_codons * 100)} qw(1 2 3);
664 return $self->{'_gd_db'} || "unknown";