* sync with trunk
[bioperl-live.git] / Bio / CodonUsage / Table.pm
blobbcbc120bcf117e4ebe9b5edc2aa54ed7a129a28f
1 # $Id$
3 # BioPerl module for Bio::CodonUsage::Table
5 # Cared for by Richard Adams (richard.adams@ed.ac.uk)
7 # Copyright Richard Adams
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::CodonUsage::Table - for access to the Codon usage Database
16 at http://www.kazusa.or.jp/codon.
18 =head1 SYNOPSIS
20 use Bio::CodonUsage::Table;
21 use Bio::DB::CUTG;
22 use Bio::CodonUsage::IO;
23 use Bio::Tools::SeqStats;
25 # Get a codon usage table from web database
26 my $cdtable = Bio::DB::CUTG->new(-sp => 'Mus musculus',
27 -gc => 1);
29 # Or from local file
30 my $io = Bio::CodonUsage::IO->new(-file => "file");
31 my $cdtable = $io->next_data();
33 # Or create your own from a Bio::PrimarySeq compliant object,
34 # $codonstats is a ref to a hash of codon name /count key-value pairs
35 my $codonstats = Bio::Tools::SeqStats->count_codons($Seq_objct);
37 # '-data' must be specified, '-species' and 'genetic_code' are optional
38 my $CUT = Bio::CodonUsage::Table->new(-data => $codonstats,
39 -species => 'Hsapiens_kinase');
41 print "leu frequency is ", $cdtable->aa_frequency('LEU'), "\n";
42 print "freq of ATG is ", $cdtable->codon_rel_frequency('ttc'), "\n";
43 print "abs freq of ATG is ", $cdtable->codon_abs_frequency('ATG'), "\n";
44 print "number of ATG codons is ", $cdtable->codon_count('ATG'), "\n";
45 print "GC content at position 1 is ", $cdtable->get_coding_gc('1'), "\n";
46 print "total CDSs for Mus musculus is ", $cdtable->cds_count(), "\n";
48 =head1 DESCRIPTION
51 This class provides methods for accessing codon usage table data.
53 All of the methods at present are simple look-ups of the table or are
54 derived from simple calculations from the table. Future methods could
55 include measuring the codon usage of a sequence , for example, or
56 provide methods for examining codon usage in alignments.
58 =head1 SEE ALSO
60 L<Bio::Tools::CodonTable>,
61 L<Bio::WebAgent>,
62 L<Bio::CodonUsage::IO>,
63 L<Bio::DB::CUTG>
65 =head1 FEEDBACK
67 =head2 Mailing Lists
70 User feedback is an integral part of the evolution of this and other
71 Bioperl modules. Send your comments and suggestions preferably to one
72 of the Bioperl mailing lists. Your participation is much appreciated.
74 bioperl-l@bioperl.org - General discussion
75 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
77 =head2 Reporting Bugs
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
81 web:
83 http://bugzilla.open-bio.org/
85 =head1 AUTHORS
87 Richard Adams, Richard.Adams@ed.ac.uk
89 =head1 APPENDIX
91 The rest of the documentation details each of the object
92 methods. Internal methods are usually preceded with a _
94 =cut
97 # Let the code begin...
99 package Bio::CodonUsage::Table;
100 use strict;
101 use vars qw(%STRICTAA @AA);
102 use Bio::SeqUtils;
103 use Bio::Tools::CodonTable;
105 use base qw(Bio::Root::Root);
107 BEGIN{
108 @AA = qw(A C D E F G H I K L M N P Q R S T V W Y *);
109 map {$STRICTAA{$_} = undef} @AA;
112 =head2 new
114 Title : new
115 Usage : my $cut = Bio::CodonUsage::Table->new(-data => $cut_hash_ref,
116 -species => 'H.sapiens_kinase'
117 -genetic_code =>1);
118 Returns : a reference to a new Bio::CodonUsage::Table object
119 Args : none or a reference to a hash of codon counts. This constructor is
120 designed to be compatible with the output of
121 Bio::Tools::SeqUtils::count_codons()
122 Species and genetic code parameters can be entered here or via the
123 species() and genetic_code() methods separately.
125 =cut
127 sub new {
128 my ($class, @args) = @_;
129 my $self= $class->SUPER::new(@args);
130 if (@args) {
131 $self->_rearrange([qw(DATA)], @args);
132 shift @args; # get rid of key
133 my $arg = shift @args;
134 $self->throw("need a hash reference, not a [" . ref($arg). "] reference") if ref($arg) ne 'HASH';
135 ### flags to detect argument type, can be either to start with ##
136 my $is_codon_hash = 1;
137 my $is_Aa_hash = 1;
138 for my $k (keys %$arg) {
139 ## convert to UC
140 $k =~ s/(\w+)/\U$1/;
141 if (!exists($STRICTAA{$k}) ){
142 $is_Aa_hash = 0;
144 elsif ($k =~ /[^ATCGatcg]/) {
145 $is_codon_hash = 0;
148 if (!$is_codon_hash && !$is_Aa_hash) {
149 $self->throw(" invalid key values in CUT hash - must be unique aa or nucleotide identifiers");
151 elsif ($is_Aa_hash) {
152 $self->_init_from_aa($arg);
154 elsif($is_codon_hash) {
155 $self->_init_from_cod($arg);
157 while (@args) {
158 my $key = shift @args;
159 $key =~ s/\-(\w+)/\L$1/;
161 $self->$key(shift @args);
165 return $self;
168 =head2 all_aa_frequencies
170 Title : all_aa_frequencies
171 Usage : my $freq = $cdtable->all_aa_frequencies();
172 Returns : a reference to a hash where each key is an amino acid
173 and each value is its frequency in all proteins in that
174 species.
175 Args : none
177 =cut
179 sub all_aa_frequencies {
180 my $self = shift;
181 my %aa_freqs =();
182 for my $aa (keys %STRICTAA) {
183 my $freq = $self->aa_frequency($aa);
184 $aa_freqs{$aa} = $freq;
186 return \%aa_freqs;
189 =head2 codon_abs_frequency
191 Title : codon_abs_frequency
192 Usage : my $freq = $cdtable->codon_abs_frequency('CTG');
193 Purpose : To return the frequency of that codon as a percentage
194 of all codons in the organism.
195 Returns : a percentage frequency
196 Args : a non-ambiguous codon string
198 =cut
200 sub codon_abs_frequency {
201 my ($self, $a) = @_;
202 my $cod = uc $a;
203 if ($self->_check_codon($cod)) {
204 my $ctable = Bio::Tools::CodonTable->new;
205 $ctable->id($self->genetic_code() );
206 my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
208 return $self->{'_table'}{$aa}{$cod}{'per1000'}/10 ;
210 else {return 0;}
213 =head2 codon_rel_frequency
215 Title : codon_rel_frequency
216 Usage : my $freq = $cdtable->codon_rel_frequency('CTG');
217 Purpose : To return the frequency of that codon as a percentage
218 of codons coding for the same amino acid. E.g., ATG and TGG
219 would return 100 as those codons are unique.
220 Returns : a percentage frequency
221 Args : a non-ambiguous codon string
223 =cut
226 sub codon_rel_frequency {
227 my ($self, $a) = @_;
228 my $cod = uc $a;
229 if ($self->_check_codon($cod)) {
230 my $ctable = Bio::Tools::CodonTable->new;
231 $ctable->id($self->genetic_code () );
232 my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
233 return $self->{'_table'}{$aa}{$cod}{'rel_freq'};
235 else {
236 return 0;
240 =head2 probable_codons
242 Title : probable_codons
243 Usage : my $prob_codons = $cd_table->probable_codons(10);
244 Purpose : to obtain a list of codons for the amino acid above a given
245 threshold % relative frequency
246 Returns : A reference to a hash where keys are 1 letter amino acid codes
247 and values are references to arrays of codons whose frequency
248 is above the threshold.
249 Arguments: a minimum threshold frequency
251 =cut
253 sub probable_codons {
254 my ($self, $threshold) = @_;
255 if (!$threshold || $threshold < 0 || $threshold > 100) {
256 $self->throw(" I need a threshold percentage ");
258 my %return_hash;
259 for my $a(keys %STRICTAA) {
260 my @common_codons;
261 my $aa =$Bio::SeqUtils::THREECODE{$a};
262 for my $codon (keys %{ $self->{'_table'}{$aa}}) {
263 if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $threshold/100){
264 push @common_codons, $codon;
267 $return_hash{$a} = \@common_codons;
269 ## check to make sure that all codons are populated ##
270 if (grep{scalar @{$return_hash{$_}} == 0} keys %return_hash) {
271 $self->warn("Threshold is too high, some amino acids do not" .
272 " have any codon above the threshold!");
274 return \%return_hash;
278 =head2 most_common_codons
280 Title : most_common_codons
281 Usage : my $common_codons = $cd_table->most_common_codons();
282 Purpose : To obtain the most common codon for a given amino acid
283 Returns : A reference to a hash where keys are 1 letter amino acid codes
284 and the values are the single most common codons for those amino acids
285 Arguments: None
287 =cut
289 sub most_common_codons {
290 my $self = shift;
292 my %return_hash;
294 for my $a ( keys %STRICTAA ) {
296 my $common_codon = '';
297 my $rel_freq = 0;
298 my $aa = $Bio::SeqUtils::THREECODE{$a};
300 if ( ! defined $self->{'_table'}{$aa} ) {
301 $self->warn("Amino acid $aa ($a) does not have any codons!");
302 next;
305 for my $codon ( keys %{ $self->{'_table'}{$aa}} ) {
306 if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $rel_freq ){
307 $common_codon = $codon;
308 $rel_freq = $self->{'_table'}{$aa}{$codon}{'rel_freq'};
311 $return_hash{$a} = $common_codon;
314 return \%return_hash;
317 =head2 codon_count
319 Title : codon_count
320 Usage : my $count = $cdtable->codon_count('CTG');
321 Purpose : To obtain the absolute number of the codons in the
322 organism.
323 Returns : an integer
324 Args : a non-ambiguous codon string
326 =cut
328 sub codon_count {
329 my $self = shift;
330 if (@_) {
331 my $a = shift;
332 my $cod = uc $a;
333 if ($self->_check_codon($cod)) {
334 my $ctable = Bio::Tools::CodonTable->new;
335 $ctable->id($self->genetic_code());
336 my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
337 return $self->{'_table'}{$aa}{$cod}{'abs_count'};
339 else {return 0;}
341 else {
342 $self->warn(" need to give a codon sequence as a parameter ");
343 return 0;
348 =head2 get_coding_gc
350 Title : get_coding_gc
351 Usage : my $count = $cdtable->get_coding_gc(1);
352 Purpose : To return the percentage GC composition for the organism at
353 codon positions 1,2 or 3, or an average for all coding sequence
354 ('all').
355 Returns : a number (%-age GC content) or 0 if these fields are undefined
356 Args : 1,2,3 or 'all'.
358 =cut
360 sub get_coding_gc {
361 my $self = shift;
362 if (! @_) {
363 $self->warn(" no parameters supplied must be a codon position (1,2,3) or 'all'");
364 return 0;
366 else{
367 my $n = shift;
368 ##return request if valid ##
369 if ( exists($self->{'_coding_gc'}{$n} ) ) {
370 return sprintf("%.2f", $self->{'_coding_gc'}{$n});
372 ##else return 'all' value if exists
373 elsif (exists($self->{'_coding_gc'}{'all'} )) {
374 $self->warn("coding gc doesn't have value for [$n], returning gc content for all CDSs");
375 return sprintf("%.2f", $self->{'_coding_gc'}{'all'});
377 ### else return 0,
378 else {
379 $self->warn("coding gc values aren't defined, returning 0");
380 return 0;
383 }#end of outer else
387 =head2 set_coding_gc
389 Title : set_coding_gc
390 Usage : my $count = $cdtable->set_coding_gc(-1=>55.78);
391 Purpose : To set the percentage GC composition for the organism at
392 codon positions 1,2 or 3, or an average for all coding sequence
393 ('all').
394 Returns : void
395 Args : a hash where the key must be 1,2,3 or 'all' and the value the %age GC
396 at that codon position..
398 =cut
400 sub set_coding_gc {
401 my ($self, $key, $value) = @_;
402 my @allowed = qw(1 2 3 all);
403 $key =~ s/\-//;
404 if (!grep {$key eq $_} @allowed ) {
405 $self->warn ("invalid key! - must be one of [ ". (join " ", @allowed) . "]");
406 return;
408 $self->{'_coding_gc'}{$key} = $value;
413 =head2 species
415 Title : species
416 Usage : my $sp = $cut->species();
417 Purpose : Get/setter for species name of codon table
418 Returns : Void or species name string
419 Args : None or species name string
421 =cut
423 sub species {
424 my $self = shift;
425 if (@_ ){
426 $self->{'_species'} = shift;
428 return $self->{'_species'} || "unknown";
431 =head2 genetic_code
433 Title : genetic_code
434 Usage : my $sp = $cut->genetic_code();
435 Purpose : Get/setter for genetic_code name of codon table
436 Returns : Void or genetic_code id, 1 by default
437 Args : None or genetic_code id, 1 by default if invalid argument.
439 =cut
441 sub genetic_code {
442 my $self = shift;
443 if (@_ ){
444 my $val = shift;
445 if ($val < 0 || $val >16 || $val =~ /[^\d]/
446 || $val ==7 || $val ==8) {
447 $self->warn ("invalid genetic code - must be 1-16 but not 7 or 8,setting to default [1]");
448 $self->{'_genetic_code'} = 1;
450 else {
451 $self->{'_genetic_code'} = shift;
454 return $self->{'_genetic_code'} || 1;
457 =head2 cds_count
459 Title : cds_count
460 Usage : my $count = $cdtable->cds_count();
461 Purpose : To retrieve the total number of CDSs used to generate the Codon Table
462 for that organism.
463 Returns : an integer
464 Args : none (if retrieving the value) or an integer( if setting ).
466 =cut
468 sub cds_count {
469 my $self= shift;
470 if (@_) {
471 my $val = shift;
472 if ($val < 0) {
473 $self->warn("can't have negative count initializing to 1");
474 $self->{'_cds_count'} = 0.00;
476 else{
477 $self->{'_cds_count'} = $val;
480 $self->warn("cds_count value is undefined, returning 0")
481 if !exists($self->{'_cds_count'});
483 return $self->{'_cds_count'} || 0.00;
486 =head2 aa_frequency
488 Title : aa_frequency
489 Usage : my $freq = $cdtable->aa_frequency('Leu');
490 Purpose : To retrieve the frequency of an amino acid in the organism
491 Returns : a percentage
492 Args : a 1 letter or 3 letter string representing the amino acid
494 =cut
498 sub aa_frequency {
499 my ($self, $a) = @_;
500 ## process args ##
502 ## deal with cases ##
503 my $aa = lc $a;
504 $aa =~ s/^(\w)/\U$1/;
505 if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils::ONECODE{$aa}) ) {
506 $self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
507 return;
509 #translate to 3 letter code for Ctable #
510 my $aa3 = $Bio::SeqUtils::THREECODE{$aa} || $aa;
512 ## return % of all amino acids in organism ##
513 my $freq = 0;
514 map {$freq += $self->{'_table'}{$aa3}{$_}{'per1000'} } keys %{$self->{'_table'}{$aa3}};
515 return sprintf("%.2f", $freq/10);
518 =head2 common_codon
520 Title : common_codon
521 Usage : my $freq = $cdtable->common_codon('Leu');
522 Purpose : To retrieve the frequency of the most common codon of that aa
523 Returns : a percentage
524 Args : a 1 letter or 3 letter string representing the amino acid
526 =cut
528 sub common_codon{
530 my ($self, $a) = @_;
531 my $aa = lc $a;
532 $aa =~ s/^(\w)/\U$1/;
534 if ($self->_check_aa($aa)) {
535 my $aa3 = $Bio::SeqUtils::THREECODE{$aa} ;
536 $aa3 ||= $aa;
537 my $max = 0;
538 for my $cod (keys %{$self->{'_table'}{$aa3}}) {
539 $max = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} > $max) ?
540 $self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$max;
542 return $max;
543 }else {return 0;}
546 =head2 rare_codon
548 Title : rare_codon
549 Usage : my $freq = $cdtable->rare_codon('Leu');
550 Purpose : To retrieve the frequency of the least common codon of that aa
551 Returns : a percentage
552 Args : a 1 letter or 3 letter string representing the amino acid
554 =cut
556 sub rare_codon {
557 my ($self, $a) = @_;
558 my $aa = lc $a;
559 $aa =~ s/^(\w)/\U$1/;
560 if ($self->_check_aa($aa)) {
561 my $aa3 = $Bio::SeqUtils::THREECODE{$aa};
562 $aa3 ||= $aa;
563 my $min = 1;
564 for my $cod (keys %{$self->{'_table'}{$aa3}}) {
565 $min = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} < $min) ?
566 $self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$min;
568 return $min;
569 }else {return 0;}
575 ## internal sub that checks a codon is correct format
576 sub _check_aa {
577 my ($self, $aa ) = @_;
578 if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils::ONECODE{$aa}) ) {
579 $self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
580 return 0;
581 }else {return 1;}
587 sub _check_codon {
588 my ($self, $cod) = @_;
589 if ($cod =~ /[^ATCG]/ || $cod !~ /\w\w\w/) {
590 $self->warn(" impossible codon - must be 3 letters and just containing ATCG");
591 return 0;
593 else {return 1;}
595 sub _init_from_cod {
597 ## make hash based on aa and then send to _init_from_aa
598 my ($self, $ref) = @_;
599 my $ct = Bio::Tools::CodonTable->new();
600 my %aa_hash;
601 for my $codon(keys %$ref ) {
602 my $aa = $ct->translate($codon);
603 $aa_hash{$aa}{$codon} = $ref->{$codon};
605 $self->_init_from_aa(\%aa_hash);
609 sub _init_from_aa {
610 my ($self, $ref) = @_;
611 ## abs counts and count codons
612 my $total_codons = 0;
613 my %threeletter;
614 map{$threeletter{$Bio::SeqUtils::THREECODE{$_}} = $ref->{$_} } keys %$ref;
615 $ref = \%threeletter;
616 for my $aa (keys %$ref) {
617 for my $cod(keys %{$ref->{$aa}} ) {
618 $self->{'_table'}{$aa}{$cod}{'abs_count'} = $ref->{$aa}{$cod};
619 $total_codons += $ref->{$aa}{$cod};
623 ## now calculate abs codon frequencies
624 for my $aa (keys %$ref) {
625 for my $cod(keys %{$ref->{$aa}} ) {
626 $self->{'_table'}{$aa}{$cod}{'per1000'} =
627 sprintf("%.2f",$ref->{$aa}{$cod} /$total_codons * 1000) ;
630 ## now calculate rel codon_frequencies
631 for my $aa (keys %$ref) {
632 my $aa_freq = 0;
633 map{$aa_freq += $ref->{$aa}{$_} }
634 keys %{$ref->{$aa}};
635 for my $cod(keys %{$ref->{$aa}} ) {
636 $self->{'_table'}{$aa}{$cod}{'rel_freq'}=
637 sprintf("%.2f",$ref->{$aa}{$cod}/ $aa_freq );
642 ## now calculate gc fields
643 my %GC;
644 for my $aa (keys %$ref) {
645 for my $cod(keys %{$ref->{$aa}} ) {
646 for my $index (qw(1 2 3) ) {
647 if (substr ($cod, $index -1, 1) =~ /g|c/oi) {
648 $GC{$index} += $ref->{$aa}{$cod};
653 my $tot = 0;
654 map{$tot += $GC{$_}} qw(1 2 3);
655 $self->set_coding_gc('all', $tot/(3 *$total_codons) * 100);
656 map{$self->set_coding_gc($_,$GC{$_}/$total_codons * 100)} qw(1 2 3);
659 return $self;
662 sub _gb_db {
663 my $self = shift;
664 return $self->{'_gd_db'} || "unknown";