t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / CodonUsage / Table.pm
blobc8e5060499c2daf0f3fffbad461c36ad22a5c212
2 # BioPerl module for Bio::CodonUsage::Table
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Richard Adams (richard.adams@ed.ac.uk)
8 # Copyright Richard Adams
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::CodonUsage::Table - for access to the Codon usage Database
17 at http://www.kazusa.or.jp/codon.
19 =head1 SYNOPSIS
21 use Bio::CodonUsage::Table;
22 use Bio::DB::CUTG;
23 use Bio::CodonUsage::IO;
24 use Bio::Tools::SeqStats;
26 # Get a codon usage table from web database
27 my $cdtable = Bio::DB::CUTG->new(-sp => 'Mus musculus',
28 -gc => 1);
30 # Or from local file
31 my $io = Bio::CodonUsage::IO->new(-file => "file");
32 my $cdtable = $io->next_data();
34 # Or create your own from a Bio::PrimarySeq compliant object,
35 # $codonstats is a ref to a hash of codon name /count key-value pairs
36 my $codonstats = Bio::Tools::SeqStats->count_codons($Seq_objct);
38 # '-data' must be specified, '-species' and 'genetic_code' are optional
39 my $CUT = Bio::CodonUsage::Table->new(-data => $codonstats,
40 -species => 'Hsapiens_kinase');
42 print "leu frequency is ", $cdtable->aa_frequency('LEU'), "\n";
43 print "freq of ATG is ", $cdtable->codon_rel_frequency('ttc'), "\n";
44 print "abs freq of ATG is ", $cdtable->codon_abs_frequency('ATG'), "\n";
45 print "number of ATG codons is ", $cdtable->codon_count('ATG'), "\n";
46 print "GC content at position 1 is ", $cdtable->get_coding_gc('1'), "\n";
47 print "total CDSs for Mus musculus is ", $cdtable->cds_count(), "\n";
49 =head1 DESCRIPTION
52 This class provides methods for accessing codon usage table data.
54 All of the methods at present are simple look-ups of the table or are
55 derived from simple calculations from the table. Future methods could
56 include measuring the codon usage of a sequence , for example, or
57 provide methods for examining codon usage in alignments.
59 =head1 SEE ALSO
61 L<Bio::Tools::CodonTable>,
62 L<Bio::WebAgent>,
63 L<Bio::CodonUsage::IO>,
64 L<Bio::DB::CUTG>
66 =head1 FEEDBACK
68 =head2 Mailing Lists
71 User feedback is an integral part of the evolution of this and other
72 Bioperl modules. Send your comments and suggestions preferably to one
73 of the Bioperl mailing lists. Your participation is much appreciated.
75 bioperl-l@bioperl.org - General discussion
76 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
78 =head2 Support
80 Please direct usage questions or support issues to the mailing list:
82 I<bioperl-l@bioperl.org>
84 rather than to the module maintainer directly. Many experienced and
85 reponsive experts will be able look at the problem and quickly
86 address it. Please include a thorough description of the problem
87 with code and data examples if at all possible.
89 =head2 Reporting Bugs
91 Report bugs to the Bioperl bug tracking system to help us keep track
92 the bugs and their resolution. Bug reports can be submitted via the
93 web:
95 https://github.com/bioperl/bioperl-live/issues
97 =head1 AUTHORS
99 Richard Adams, Richard.Adams@ed.ac.uk
101 =head1 APPENDIX
103 The rest of the documentation details each of the object
104 methods. Internal methods are usually preceded with a _
106 =cut
109 # Let the code begin...
111 package Bio::CodonUsage::Table;
112 use strict;
113 use vars qw(%STRICTAA @AA);
114 use Bio::SeqUtils;
115 use Bio::Tools::CodonTable;
117 use base qw(Bio::Root::Root);
119 BEGIN{
120 @AA = qw(A C D E F G H I K L M N P Q R S T V W Y *);
121 map {$STRICTAA{$_} = undef} @AA;
124 =head2 new
126 Title : new
127 Usage : my $cut = Bio::CodonUsage::Table->new(-data => $cut_hash_ref,
128 -species => 'H.sapiens_kinase'
129 -genetic_code =>1);
130 Returns : a reference to a new Bio::CodonUsage::Table object
131 Args : none or a reference to a hash of codon counts. This constructor is
132 designed to be compatible with the output of
133 Bio::Tools::SeqUtils::count_codons()
134 Species and genetic code parameters can be entered here or via the
135 species() and genetic_code() methods separately.
137 =cut
139 sub new {
140 my ($class, @args) = @_;
141 my $self= $class->SUPER::new(@args);
142 if (@args) {
143 $self->_rearrange([qw(DATA)], @args);
144 shift @args; # get rid of key
145 my $arg = shift @args;
146 $self->throw("need a hash reference, not a [" . ref($arg). "] reference") if ref($arg) ne 'HASH';
147 ### flags to detect argument type, can be either to start with ##
148 my $is_codon_hash = 1;
149 my $is_Aa_hash = 1;
150 for my $k (keys %$arg) {
151 ## convert to UC
152 $k =~ s/(\w+)/\U$1/;
153 if (!exists($STRICTAA{$k}) ){
154 $is_Aa_hash = 0;
156 elsif ($k =~ /[^ATCGatcg]/) {
157 $is_codon_hash = 0;
160 if (!$is_codon_hash && !$is_Aa_hash) {
161 $self->throw(" invalid key values in CUT hash - must be unique aa or nucleotide identifiers");
163 elsif ($is_Aa_hash) {
164 $self->_init_from_aa($arg);
166 elsif($is_codon_hash) {
167 $self->_init_from_cod($arg);
169 while (@args) {
170 my $key = shift @args;
171 $key =~ s/\-(\w+)/\L$1/;
173 $self->$key(shift @args);
177 return $self;
180 =head2 all_aa_frequencies
182 Title : all_aa_frequencies
183 Usage : my $freq = $cdtable->all_aa_frequencies();
184 Returns : a reference to a hash where each key is an amino acid
185 and each value is its frequency in all proteins in that
186 species.
187 Args : none
189 =cut
191 sub all_aa_frequencies {
192 my $self = shift;
193 my %aa_freqs =();
194 for my $aa (keys %STRICTAA) {
195 my $freq = $self->aa_frequency($aa);
196 $aa_freqs{$aa} = $freq;
198 return \%aa_freqs;
201 =head2 codon_abs_frequency
203 Title : codon_abs_frequency
204 Usage : my $freq = $cdtable->codon_abs_frequency('CTG');
205 Purpose : To return the frequency of that codon as a percentage
206 of all codons in the organism.
207 Returns : a percentage frequency
208 Args : a non-ambiguous codon string
210 =cut
212 sub codon_abs_frequency {
213 my ($self, $a) = @_;
214 my $cod = uc $a;
215 if ($self->_check_codon($cod)) {
216 my $ctable = Bio::Tools::CodonTable->new;
217 $ctable->id($self->genetic_code() );
218 my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
220 return $self->{'_table'}{$aa}{$cod}{'per1000'}/10 ;
222 else {return 0;}
225 =head2 codon_rel_frequency
227 Title : codon_rel_frequency
228 Usage : my $freq = $cdtable->codon_rel_frequency('CTG');
229 Purpose : To return the frequency of that codon as a percentage
230 of codons coding for the same amino acid. E.g., ATG and TGG
231 would return 100 as those codons are unique.
232 Returns : a percentage frequency
233 Args : a non-ambiguous codon string
235 =cut
238 sub codon_rel_frequency {
239 my ($self, $a) = @_;
240 my $cod = uc $a;
241 if ($self->_check_codon($cod)) {
242 my $ctable = Bio::Tools::CodonTable->new;
243 $ctable->id($self->genetic_code () );
244 my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
245 return $self->{'_table'}{$aa}{$cod}{'rel_freq'};
247 else {
248 return 0;
252 =head2 probable_codons
254 Title : probable_codons
255 Usage : my $prob_codons = $cd_table->probable_codons(10);
256 Purpose : to obtain a list of codons for the amino acid above a given
257 threshold % relative frequency
258 Returns : A reference to a hash where keys are 1 letter amino acid codes
259 and values are references to arrays of codons whose frequency
260 is above the threshold.
261 Arguments: a minimum threshold frequency
263 =cut
265 sub probable_codons {
266 my ($self, $threshold) = @_;
267 if (!$threshold || $threshold < 0 || $threshold > 100) {
268 $self->throw(" I need a threshold percentage ");
270 my %return_hash;
271 for my $a(keys %STRICTAA) {
272 my @common_codons;
273 my $aa =$Bio::SeqUtils::THREECODE{$a};
274 for my $codon (keys %{ $self->{'_table'}{$aa}}) {
275 if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $threshold/100){
276 push @common_codons, $codon;
279 $return_hash{$a} = \@common_codons;
281 ## check to make sure that all codons are populated ##
282 if (grep{scalar @{$return_hash{$_}} == 0} keys %return_hash) {
283 $self->warn("Threshold is too high, some amino acids do not" .
284 " have any codon above the threshold!");
286 return \%return_hash;
290 =head2 most_common_codons
292 Title : most_common_codons
293 Usage : my $common_codons = $cd_table->most_common_codons();
294 Purpose : To obtain the most common codon for a given amino acid
295 Returns : A reference to a hash where keys are 1 letter amino acid codes
296 and the values are the single most common codons for those amino acids
297 Arguments: None
299 =cut
301 sub most_common_codons {
302 my $self = shift;
304 my %return_hash;
306 for my $a ( keys %STRICTAA ) {
308 my $common_codon = '';
309 my $rel_freq = 0;
310 my $aa = $Bio::SeqUtils::THREECODE{$a};
312 if ( ! defined $self->{'_table'}{$aa} ) {
313 $self->warn("Amino acid $aa ($a) does not have any codons!");
314 next;
317 for my $codon ( keys %{ $self->{'_table'}{$aa}} ) {
318 if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $rel_freq ){
319 $common_codon = $codon;
320 $rel_freq = $self->{'_table'}{$aa}{$codon}{'rel_freq'};
323 $return_hash{$a} = $common_codon;
326 return \%return_hash;
329 =head2 codon_count
331 Title : codon_count
332 Usage : my $count = $cdtable->codon_count('CTG');
333 Purpose : To obtain the absolute number of the codons in the
334 organism.
335 Returns : an integer
336 Args : a non-ambiguous codon string
338 =cut
340 sub codon_count {
341 my $self = shift;
342 if (@_) {
343 my $a = shift;
344 my $cod = uc $a;
345 if ($self->_check_codon($cod)) {
346 my $ctable = Bio::Tools::CodonTable->new;
347 $ctable->id($self->genetic_code());
348 my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
349 return $self->{'_table'}{$aa}{$cod}{'abs_count'};
351 else {return 0;}
353 else {
354 $self->warn(" need to give a codon sequence as a parameter ");
355 return 0;
360 =head2 get_coding_gc
362 Title : get_coding_gc
363 Usage : my $count = $cdtable->get_coding_gc(1);
364 Purpose : To return the percentage GC composition for the organism at
365 codon positions 1,2 or 3, or an average for all coding sequence
366 ('all').
367 Returns : a number (%-age GC content) or 0 if these fields are undefined
368 Args : 1,2,3 or 'all'.
370 =cut
372 sub get_coding_gc {
373 my $self = shift;
374 if (! @_) {
375 $self->warn(" no parameters supplied must be a codon position (1,2,3) or 'all'");
376 return 0;
378 else{
379 my $n = shift;
380 ##return request if valid ##
381 if ( exists($self->{'_coding_gc'}{$n} ) ) {
382 return sprintf("%.2f", $self->{'_coding_gc'}{$n});
384 ##else return 'all' value if exists
385 elsif (exists($self->{'_coding_gc'}{'all'} )) {
386 $self->warn("coding gc doesn't have value for [$n], returning gc content for all CDSs");
387 return sprintf("%.2f", $self->{'_coding_gc'}{'all'});
389 ### else return 0,
390 else {
391 $self->warn("coding gc values aren't defined, returning 0");
392 return 0;
395 }#end of outer else
399 =head2 set_coding_gc
401 Title : set_coding_gc
402 Usage : my $count = $cdtable->set_coding_gc(-1=>55.78);
403 Purpose : To set the percentage GC composition for the organism at
404 codon positions 1,2 or 3, or an average for all coding sequence
405 ('all').
406 Returns : void
407 Args : a hash where the key must be 1,2,3 or 'all' and the value the %age GC
408 at that codon position..
410 =cut
412 sub set_coding_gc {
413 my ($self, $key, $value) = @_;
414 my @allowed = qw(1 2 3 all);
415 $key =~ s/\-//;
416 if (!grep {$key eq $_} @allowed ) {
417 $self->warn ("invalid key! - must be one of [ ". (join " ", @allowed) . "]");
418 return;
420 $self->{'_coding_gc'}{$key} = $value;
425 =head2 species
427 Title : species
428 Usage : my $sp = $cut->species();
429 Purpose : Get/setter for species name of codon table
430 Returns : Void or species name string
431 Args : None or species name string
433 =cut
435 sub species {
436 my $self = shift;
437 if (@_ ){
438 $self->{'_species'} = shift;
440 return $self->{'_species'} || "unknown";
443 =head2 genetic_code
445 Title : genetic_code
446 Usage : my $sp = $cut->genetic_code();
447 Purpose : Get/setter for genetic_code name of codon table
448 Returns : Void or genetic_code id, 1 by default
449 Args : None or genetic_code id, 1 by default if invalid argument.
451 =cut
453 sub genetic_code {
454 my $self = shift;
455 if (@_ ){
456 my $val = shift;
457 if ($val < 0 || $val >16 || $val =~ /[^\d]/
458 || $val ==7 || $val ==8) {
459 $self->warn ("invalid genetic code - must be 1-16 but not 7 or 8,setting to default [1]");
460 $self->{'_genetic_code'} = 1;
462 else {
463 $self->{'_genetic_code'} = shift;
466 return $self->{'_genetic_code'} || 1;
469 =head2 cds_count
471 Title : cds_count
472 Usage : my $count = $cdtable->cds_count();
473 Purpose : To retrieve the total number of CDSs used to generate the Codon Table
474 for that organism.
475 Returns : an integer
476 Args : none (if retrieving the value) or an integer( if setting ).
478 =cut
480 sub cds_count {
481 my $self= shift;
482 if (@_) {
483 my $val = shift;
484 if ($val < 0) {
485 $self->warn("can't have negative count initializing to 1");
486 $self->{'_cds_count'} = 0.00;
488 else{
489 $self->{'_cds_count'} = $val;
492 $self->warn("cds_count value is undefined, returning 0")
493 if !exists($self->{'_cds_count'});
495 return $self->{'_cds_count'} || 0.00;
498 =head2 aa_frequency
500 Title : aa_frequency
501 Usage : my $freq = $cdtable->aa_frequency('Leu');
502 Purpose : To retrieve the frequency of an amino acid in the organism
503 Returns : a percentage
504 Args : a 1 letter or 3 letter string representing the amino acid
506 =cut
510 sub aa_frequency {
511 my ($self, $a) = @_;
512 ## process args ##
514 ## deal with cases ##
515 my $aa = lc $a;
516 $aa =~ s/^(\w)/\U$1/;
517 if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils::ONECODE{$aa}) ) {
518 $self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
519 return;
521 #translate to 3 letter code for Ctable #
522 my $aa3 = $Bio::SeqUtils::THREECODE{$aa} || $aa;
524 ## return % of all amino acids in organism ##
525 my $freq = 0;
526 map {$freq += $self->{'_table'}{$aa3}{$_}{'per1000'} } keys %{$self->{'_table'}{$aa3}};
527 return sprintf("%.2f", $freq/10);
530 =head2 common_codon
532 Title : common_codon
533 Usage : my $freq = $cdtable->common_codon('Leu');
534 Purpose : To retrieve the frequency of the most common codon of that aa
535 Returns : a percentage
536 Args : a 1 letter or 3 letter string representing the amino acid
538 =cut
540 sub common_codon{
542 my ($self, $a) = @_;
543 my $aa = lc $a;
544 $aa =~ s/^(\w)/\U$1/;
546 if ($self->_check_aa($aa)) {
547 my $aa3 = $Bio::SeqUtils::THREECODE{$aa} ;
548 $aa3 ||= $aa;
549 my $max = 0;
550 for my $cod (keys %{$self->{'_table'}{$aa3}}) {
551 $max = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} > $max) ?
552 $self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$max;
554 return $max;
555 }else {return 0;}
558 =head2 rare_codon
560 Title : rare_codon
561 Usage : my $freq = $cdtable->rare_codon('Leu');
562 Purpose : To retrieve the frequency of the least common codon of that aa
563 Returns : a percentage
564 Args : a 1 letter or 3 letter string representing the amino acid
566 =cut
568 sub rare_codon {
569 my ($self, $a) = @_;
570 my $aa = lc $a;
571 $aa =~ s/^(\w)/\U$1/;
572 if ($self->_check_aa($aa)) {
573 my $aa3 = $Bio::SeqUtils::THREECODE{$aa};
574 $aa3 ||= $aa;
575 my $min = 1;
576 for my $cod (keys %{$self->{'_table'}{$aa3}}) {
577 $min = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} < $min) ?
578 $self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$min;
580 return $min;
581 }else {return 0;}
587 ## internal sub that checks a codon is correct format
588 sub _check_aa {
589 my ($self, $aa ) = @_;
590 if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils::ONECODE{$aa}) ) {
591 $self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
592 return 0;
593 }else {return 1;}
599 sub _check_codon {
600 my ($self, $cod) = @_;
601 if ($cod =~ /[^ATCG]/ || $cod !~ /\w\w\w/) {
602 $self->warn(" impossible codon - must be 3 letters and just containing ATCG");
603 return 0;
605 else {return 1;}
607 sub _init_from_cod {
609 ## make hash based on aa and then send to _init_from_aa
610 my ($self, $ref) = @_;
611 my $ct = Bio::Tools::CodonTable->new();
612 my %aa_hash;
613 for my $codon(keys %$ref ) {
614 my $aa = $ct->translate($codon);
615 $aa_hash{$aa}{$codon} = $ref->{$codon};
617 $self->_init_from_aa(\%aa_hash);
621 sub _init_from_aa {
622 my ($self, $ref) = @_;
623 ## abs counts and count codons
624 my $total_codons = 0;
625 my %threeletter;
626 map{$threeletter{$Bio::SeqUtils::THREECODE{$_}} = $ref->{$_} } keys %$ref;
627 $ref = \%threeletter;
628 for my $aa (keys %$ref) {
629 for my $cod(keys %{$ref->{$aa}} ) {
630 $self->{'_table'}{$aa}{$cod}{'abs_count'} = $ref->{$aa}{$cod};
631 $total_codons += $ref->{$aa}{$cod};
635 ## now calculate abs codon frequencies
636 for my $aa (keys %$ref) {
637 for my $cod(keys %{$ref->{$aa}} ) {
638 $self->{'_table'}{$aa}{$cod}{'per1000'} =
639 sprintf("%.2f",$ref->{$aa}{$cod} /$total_codons * 1000) ;
642 ## now calculate rel codon_frequencies
643 for my $aa (keys %$ref) {
644 my $aa_freq = 0;
645 map{$aa_freq += $ref->{$aa}{$_} }
646 keys %{$ref->{$aa}};
647 for my $cod(keys %{$ref->{$aa}} ) {
648 $self->{'_table'}{$aa}{$cod}{'rel_freq'}=
649 sprintf("%.2f",$ref->{$aa}{$cod}/ $aa_freq );
654 ## now calculate gc fields
655 my %GC;
656 for my $aa (keys %$ref) {
657 for my $cod(keys %{$ref->{$aa}} ) {
658 for my $index (qw(1 2 3) ) {
659 if (substr ($cod, $index -1, 1) =~ /g|c/oi) {
660 $GC{$index} += $ref->{$aa}{$cod};
665 my $tot = 0;
666 map{$tot += $GC{$_}} qw(1 2 3);
667 $self->set_coding_gc('all', $tot/(3 *$total_codons) * 100);
668 map{$self->set_coding_gc($_,$GC{$_}/$total_codons * 100)} qw(1 2 3);
671 return $self;
674 sub _gb_db {
675 my $self = shift;
676 return $self->{'_gd_db'} || "unknown";