Bio::Tools::CodonTable and Bio::Tools::IUPAC: use our and drop BEGIN blocks.
[bioperl-live.git] / lib / Bio / Tools / SeqStats.pm
blob0ce46c06d5985f2e92ddf66d5abc76de2e70feb5
2 # BioPerl module for Bio::Tools::SeqStats
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by
8 # Copyright Peter Schattner
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::SeqStats - Object holding statistics for one
17 particular sequence
19 =head1 SYNOPSIS
21 # build a primary nucleic acid or protein sequence object somehow
22 # then build a statistics object from the sequence object
24 $seqobj = Bio::PrimarySeq->new(-seq => 'ACTGTGGCGTCAACTG',
25 -alphabet => 'dna',
26 -id => 'test');
27 $seq_stats = Bio::Tools::SeqStats->new(-seq => $seqobj);
29 # obtain a hash of counts of each type of monomer
30 # (i.e. amino or nucleic acid)
31 print "\nMonomer counts using statistics object\n";
32 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
33 $hash_ref = $seq_stats->count_monomers(); # e.g. for DNA sequence
34 foreach $base (sort keys %$hash_ref) {
35 print "Number of bases of type ", $base, "= ",
36 %$hash_ref->{$base},"\n";
39 # obtain the count directly without creating a new statistics object
40 print "\nMonomer counts without statistics object\n";
41 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
42 foreach $base (sort keys %$hash_ref) {
43 print "Number of bases of type ", $base, "= ",
44 %$hash_ref->{$base},"\n";
48 # obtain hash of counts of each type of codon in a nucleic acid sequence
49 print "\nCodon counts using statistics object\n";
50 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
51 foreach $base (sort keys %$hash_ref) {
52 print "Number of codons of type ", $base, "= ",
53 %$hash_ref->{$base},"\n";
56 # or
57 print "\nCodon counts without statistics object\n";
58 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
59 foreach $base (sort keys %$hash_ref) {
60 print "Number of codons of type ", $base, "= ",
61 %$hash_ref->{$base},"\n";
64 # Obtain the molecular weight of a sequence. Since the sequence
65 # may contain ambiguous monomers, the molecular weight is returned
66 # as a (reference to) a two element array containing greatest lower
67 # bound (GLB) and least upper bound (LUB) of the molecular weight
68 $weight = $seq_stats->get_mol_wt();
69 print "\nMolecular weight (using statistics object) of sequence ",
70 $seqobj->id(), " is between ", $$weight[0], " and " ,
71 $$weight[1], "\n";
73 # or
74 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
75 print "\nMolecular weight (without statistics object) of sequence ",
76 $seqobj->id(), " is between ", $$weight[0], " and " ,
77 $$weight[1], "\n";
79 # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
80 my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
81 -alphabet=>'protein');
82 my $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
83 print "might be hydropathic" if $gravy > 1;
85 =head1 DESCRIPTION
87 Bio::Tools::SeqStats is a lightweight object for the calculation of
88 simple statistical and numerical properties of a sequence. By
89 "lightweight" I mean that only "primary" sequences are handled by the
90 object. The calling script needs to create the appropriate primary
91 sequence to be passed to SeqStats if statistics on a sequence feature
92 are required. Similarly if a codon count is desired for a
93 frame-shifted sequence and/or a negative strand sequence, the calling
94 script needs to create that sequence and pass it to the SeqStats
95 object.
97 Nota that nucleotide sequences in bioperl do not strictly separate RNA
98 and DNA sequences. By convention, sequences from RNA molecules are
99 shown as is they were DNA. Objects are supposed to make the
100 distinction when needed. This class is one of the few where this
101 distinctions needs to be made. Internally, it changes all Ts into Us
102 before weight and monomer count.
104 SeqStats can be called in two distinct manners. If only a single
105 computation is required on a given sequence object, the method can be
106 called easily using the SeqStats object directly:
108 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
110 Alternately, if several computations will be required on a given
111 sequence object, an "instance" statistics object can be constructed
112 and used for the method calls:
114 $seq_stats = Bio::Tools::SeqStats->new($seqobj);
115 $monomers = $seq_stats->count_monomers();
116 $codons = $seq_stats->count_codons();
117 $weight = $seq_stats->get_mol_wt();
118 $gravy = $seq_stats->hydropathicity();
120 As currently implemented the object can return the following values
121 from a sequence:
123 =over
125 =item *
127 The molecular weight of the sequence: get_mol_wt()
129 =item *
131 The number of each type of monomer present: count_monomers()
133 =item *
135 The number of each codon present in a nucleic acid sequence:
136 count_codons()
138 =item *
140 The mean hydropathicity ("gravy" score) of a protein:
141 hydropathicity()
143 =back
145 For DNA and RNA sequences single-stranded weights are returned. The
146 molecular weights are calculated for neutral, or not ionized,
147 nucleic acids. The returned weight is the sum of the
148 base-sugar-phosphate residues of the chain plus one weight of water to
149 to account for the additional OH on the phosphate of the 5' residue
150 and the additional H on the sugar ring of the 3' residue. Note that
151 this leads to a difference of 18 in calculated molecular weights
152 compared to some other available programs (e.g. Informax VectorNTI).
154 Note that since sequences may contain ambiguous monomers (e.g. "M",
155 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
156 returns a two-element array containing the greatest lower bound and
157 least upper bound of the molecule. For a sequence with no ambiguous
158 monomers, the two elements of the returned array will be equal. The
159 method count_codons() handles ambiguous bases by simply counting all
160 ambiguous codons together and issuing a warning to that effect.
163 =head1 DEVELOPERS NOTES
165 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
167 Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
168 molecular weights to have the output match values in SWISS-PROT.
170 Torsten added hydropathicity calculation.
172 =head1 FEEDBACK
174 =head2 Mailing Lists
176 User feedback is an integral part of the evolution of this and other
177 Bioperl modules. Send your comments and suggestions preferably to one
178 of the Bioperl mailing lists. Your participation is much appreciated.
180 bioperl-l@bioperl.org - General discussion
181 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
183 =head2 Support
185 Please direct usage questions or support issues to the mailing list:
187 I<bioperl-l@bioperl.org>
189 rather than to the module maintainer directly. Many experienced and
190 reponsive experts will be able look at the problem and quickly
191 address it. Please include a thorough description of the problem
192 with code and data examples if at all possible.
194 =head2 Reporting Bugs
196 Report bugs to the Bioperl bug tracking system to help us keep track
197 the bugs and their resolution. Bug reports can be submitted the web:
199 https://github.com/bioperl/bioperl-live/issues
201 =head1 AUTHOR - Peter Schattner
203 Email schattner AT alum.mit.edu
205 =head1 CONTRIBUTOR - Torsten Seemann
207 Email torsten.seemann AT infotech.monash.edu.au
209 =head1 APPENDIX
211 The rest of the documentation details each of the object
212 methods. Internal methods are usually preceded with a _
214 =cut
217 package Bio::Tools::SeqStats;
219 use strict;
220 use vars qw(%Alphabets %Alphabets_strict $amino_weights
221 $rna_weights $dna_weights %Weights $amino_hydropathicity);
222 use Bio::Seq;
223 use base qw(Bio::Root::Root);
225 BEGIN {
226 %Alphabets = (
227 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ],
228 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ],
229 'protein' => [ qw(A R N D C Q E G H I L K M F U
230 P S T W X Y V B Z J O *) ], # sac: added B, Z
233 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
234 %Alphabets_strict = (
235 'dna' => [ qw( A C G T ) ],
236 'rna' => [ qw( A C G U ) ],
237 'protein' => [ qw(A R N D C Q E G H I L K M F U
238 P S T W Y V O) ],
242 # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
243 # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
245 # Amino Acid alphabet
247 # ------------------------------------------
248 # Symbol Meaning
249 # ------------------------------------------
251 my $amino_A_wt = 89.09;
252 my $amino_C_wt = 121.15;
253 my $amino_D_wt = 133.1;
254 my $amino_E_wt = 147.13;
255 my $amino_F_wt = 165.19;
256 my $amino_G_wt = 75.07;
257 my $amino_H_wt = 155.16;
258 my $amino_I_wt = 131.17;
259 my $amino_K_wt = 146.19;
260 my $amino_L_wt = 131.17;
261 my $amino_M_wt = 149.21;
262 my $amino_N_wt = 132.12;
263 my $amino_O_wt = 255.31;
264 my $amino_P_wt = 115.13;
265 my $amino_Q_wt = 146.15;
266 my $amino_R_wt = 174.20;
267 my $amino_S_wt = 105.09;
268 my $amino_T_wt = 119.12;
269 my $amino_U_wt = 168.06;
270 my $amino_V_wt = 117.15;
271 my $amino_W_wt = 204.23;
272 my $amino_Y_wt = 181.19;
275 $amino_weights = {
276 'A' => [$amino_A_wt, $amino_A_wt], # Alanine
277 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
278 'C' => [$amino_C_wt, $amino_C_wt], # Cysteine
279 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
280 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
281 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine
282 'G' => [$amino_G_wt, $amino_G_wt], # Glycine
283 'H' => [$amino_H_wt, $amino_H_wt], # Histidine
284 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine
285 'J' => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
286 'K' => [$amino_K_wt, $amino_K_wt], # Lysine
287 'L' => [$amino_L_wt, $amino_L_wt], # Leucine
288 'M' => [$amino_M_wt, $amino_M_wt], # Methionine
289 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine
290 'O' => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
291 'P' => [$amino_P_wt, $amino_P_wt], # Proline
292 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine
293 'R' => [$amino_R_wt, $amino_R_wt], # Arginine
294 'S' => [$amino_S_wt, $amino_S_wt], # Serine
295 'T' => [$amino_T_wt, $amino_T_wt], # Threonine
296 'U' => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
297 'V' => [$amino_V_wt, $amino_V_wt], # Valine
298 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan
299 'X' => [$amino_G_wt, $amino_W_wt], # Unknown
300 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
301 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
304 # Extended Dna / Rna alphabet
305 use vars ( qw($C $O $N $H $P $water) );
306 use vars ( qw($adenine $guanine $cytosine $thymine $uracil));
307 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi));
308 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
309 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt));
310 use vars ( qw($dna_weights $rna_weights %Weights));
312 $C = 12.01;
313 $O = 16.00;
314 $N = 14.01;
315 $H = 1.01;
316 $P = 30.97;
317 $water = 18.015;
319 $adenine = 5 * $C + 5 * $N + 5 * $H;
320 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
321 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
322 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
323 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
325 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
326 # neutral (unionized) form
327 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
329 # the following are single strand molecular weights / base
330 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
331 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
332 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
333 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
335 $rna_A_wt = $adenine + $ribose_phosphate - $water;
336 $rna_C_wt = $cytosine + $ribose_phosphate - $water;
337 $rna_G_wt = $guanine + $ribose_phosphate - $water;
338 $rna_U_wt = $uracil + $ribose_phosphate - $water;
340 $dna_weights = {
341 'A' => [$dna_A_wt,$dna_A_wt], # Adenine
342 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine
343 'G' => [$dna_G_wt,$dna_G_wt], # Guanine
344 'T' => [$dna_T_wt,$dna_T_wt], # Thymine
345 'M' => [$dna_C_wt,$dna_A_wt], # A or C
346 'R' => [$dna_A_wt,$dna_G_wt], # A or G
347 'W' => [$dna_T_wt,$dna_A_wt], # A or T
348 'S' => [$dna_C_wt,$dna_G_wt], # C or G
349 'Y' => [$dna_C_wt,$dna_T_wt], # C or T
350 'K' => [$dna_T_wt,$dna_G_wt], # G or T
351 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G
352 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T
353 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T
354 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T
355 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
356 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
359 $rna_weights = {
360 'A' => [$rna_A_wt,$rna_A_wt], # Adenine
361 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine
362 'G' => [$rna_G_wt,$rna_G_wt], # Guanine
363 'U' => [$rna_U_wt,$rna_U_wt], # Uracil
364 'M' => [$rna_C_wt,$rna_A_wt], # A or C
365 'R' => [$rna_A_wt,$rna_G_wt], # A or G
366 'W' => [$rna_U_wt,$rna_A_wt], # A or U
367 'S' => [$rna_C_wt,$rna_G_wt], # C or G
368 'Y' => [$rna_C_wt,$rna_U_wt], # C or U
369 'K' => [$rna_U_wt,$rna_G_wt], # G or U
370 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G
371 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U
372 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U
373 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U
374 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
375 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
378 %Weights = (
379 'dna' => $dna_weights,
380 'rna' => $rna_weights,
381 'protein' => $amino_weights,
384 # Amino acid scale: Hydropathicity.
385 # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
386 # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
388 $amino_hydropathicity = {
389 A => 1.800,
390 R => -4.500,
391 N => -3.500,
392 D => -3.500,
393 C => 2.500,
394 Q => -3.500,
395 E => -3.500,
396 G => -0.400,
397 H => -3.200,
398 I => 4.500,
399 L => 3.800,
400 K => -3.900,
401 M => 1.900,
402 F => 2.800,
403 P => -1.600,
404 S => -0.800,
405 T => -0.700,
406 W => -0.900,
407 Y => -1.300,
408 V => 4.200,
413 sub new {
414 my($class,@args) = @_;
415 my $self = $class->SUPER::new(@args);
417 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
418 unless ($seqobj->isa("Bio::PrimarySeqI")) {
419 $self->throw("SeqStats works only on PrimarySeqI objects");
421 if ( !defined $seqobj->alphabet ||
422 !defined $Alphabets{$seqobj->alphabet}) {
423 $self->throw("Must have a valid alphabet defined for seq (".
424 join(",",keys %Alphabets));
426 $self->{'_seqref'} = $seqobj;
427 # check the letters in the sequence
428 $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
429 return $self;
432 =head2 count_monomers
434 Title : count_monomers
435 Usage : $rcount = $seq_stats->count_monomers();
436 or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
437 Function: Counts the number of each type of monomer (amino acid or
438 base) in the sequence.
439 Ts are counted as Us in RNA sequences.
440 Example :
441 Returns : Reference to a hash in which keys are letters of the
442 genetic alphabet used and values are number of occurrences
443 of the letter in the sequence.
444 Args : None or reference to sequence object
445 Throws : Throws an exception if type of sequence is unknown (ie amino
446 or nucleic)or if unknown letter in alphabet. Ambiguous
447 elements are allowed.
449 =cut
451 sub count_monomers{
452 my %count = ();
453 my $seqobj;
454 my $_is_strict;
455 my $element = '';
456 my $_is_instance = 1 ;
457 my $self = shift @_;
458 my $object_argument = shift @_;
460 # First we need to determine if the present object is an instance
461 # object or if the sequence object has been passed as an argument
463 if (defined $object_argument) {
464 $_is_instance = 0;
467 # If we are using an instance object...
468 if ($_is_instance) {
469 if ($self->{'_monomer_count'}) {
470 return $self->{'_monomer_count'}; # return count if previously calculated
472 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
473 $seqobj = $self->{'_seqref'};
474 } else {
475 # otherwise...
476 $seqobj = $object_argument;
478 # Following two lines lead to error in "throw" routine
479 $seqobj->isa("Bio::PrimarySeqI") ||
480 $self->throw("SeqStats works only on PrimarySeqI objects");
481 # is alphabet OK? Is it strict?
482 $_is_strict = _is_alphabet_strict($seqobj);
485 my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
486 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters
488 # convert everything to upper case to be safe
489 my $seqstring = uc $seqobj->seq();
491 # Since T is used in RichSeq RNA sequences, do conversion locally
492 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
494 # For each letter, count the number of times it appears in
495 # the sequence
496 LETTER:
497 foreach $element (@$alphabet) {
498 # skip terminator symbol which may confuse regex
499 next LETTER if $element eq '*';
500 $count{$element} = ( $seqstring =~ s/$element/$element/g);
503 if ($_is_instance) {
504 $self->{'_monomer_count'} = \%count; # Save in case called again later
507 return \%count;
510 =head2 get_mol_wt
512 Title : get_mol_wt
513 Usage : $wt = $seqobj->get_mol_wt() or
514 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
515 Function: Calculate molecular weight of sequence
516 Ts are counted as Us in RNA sequences.
517 Example :
519 Returns : Reference to two element array containing lower and upper
520 bounds of molecule molecular weight. For DNA and RNA
521 sequences single-stranded weights are returned. If
522 sequence contains no ambiguous elements, both entries in
523 array are equal to molecular weight of molecule.
524 Args : None or reference to sequence object
525 Throws : Exception if type of sequence is unknown (ie not amino or
526 nucleic) or if unknown letter in alphabet. Ambiguous
527 elements are allowed.
529 =cut
531 sub get_mol_wt {
532 my $seqobj;
533 my $_is_strict;
534 my $element = '';
535 my $_is_instance = 1 ;
536 my $self = shift @_;
537 my $object_argument = shift @_;
538 my ($weight_array, $rcount);
540 if (defined $object_argument) {
541 $_is_instance = 0;
544 if ($_is_instance) {
545 if ($weight_array = $self->{'_mol_wt'}) {
546 # return mol. weight if previously calculated
547 return $weight_array;
549 $seqobj = $self->{'_seqref'};
550 $rcount = $self->count_monomers();
551 } else {
552 $seqobj = $object_argument;
553 $seqobj->isa("Bio::PrimarySeqI") ||
554 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
555 $_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK?
556 $rcount = $self->count_monomers($seqobj);
559 # We will also need to know what type of monomer we are dealing with
560 my $moltype = $seqobj->alphabet();
562 # In general,the molecular weight is bounded below by the sum of the
563 # weights of lower bounds of each alphabet symbol times the number of
564 # occurrences of the symbol in the sequence. A similar upper bound on
565 # the weight is also calculated.
567 # Note that for "strict" (i.e. unambiguous) sequences there is an
568 # inefficiency since the upper bound = the lower bound and there are
569 # two calculations. However, this decrease in performance will be
570 # minor and leads to significantly more readable code.
572 my $weight_lower_bound = 0;
573 my $weight_upper_bound = 0;
574 my $weight_table = $Weights{$moltype};
575 my $total_res;
577 # compute weight of all the residues
578 foreach $element (keys %$rcount) {
579 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
580 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
582 # this tracks only the residues used for counting MW
583 $total_res += $$rcount{$element};
585 if ($moltype =~ /protein/) {
586 # remove H2O during peptide bond formation.
587 $weight_lower_bound -= $water * ($total_res - 1);
588 $weight_upper_bound -= $water * ($total_res - 1);
589 } else {
590 # Correction because phosphate of 5' residue has additional OH and
591 # sugar ring of 3' residue has additional H
592 $weight_lower_bound += $water;
593 $weight_upper_bound += $water;
596 $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
597 $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
599 $weight_array = [$weight_lower_bound, $weight_upper_bound];
601 if ($_is_instance) {
602 $self->{'_mol_wt'} = $weight_array; # Save in case called again later
604 return $weight_array;
608 =head2 count_codons
610 Title : count_codons
611 Usage : $rcount = $seqstats->count_codons() or
612 $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
613 Function: Counts the number of each type of codons for a dna or rna
614 sequence, starting at the 1st triple of the input sequence.
615 Example :
616 Returns : Reference to a hash in which keys are codons of the genetic
617 alphabet used and values are number of occurrences of the
618 codons in the sequence. All codons with "ambiguous" bases
619 are counted together.
620 Args : None or sequence object
621 Throws : an exception if type of sequence is unknown or protein.
623 =cut
625 sub count_codons {
626 my $rcount = {};
627 my $codon ;
628 my $seqobj;
629 my $_is_strict;
630 my $element = '';
631 my $_is_instance = 1 ;
632 my $self = shift @_;
633 my $object_argument = shift @_;
635 if (defined $object_argument) {
636 $_is_instance = 0;
639 if ($_is_instance) {
640 if ($rcount = $self->{'_codon_count'}) {
641 return $rcount; # return count if previously calculated
643 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
644 $seqobj = $self->{'_seqref'};
645 } else {
646 $seqobj = $object_argument;
647 $seqobj->isa("Bio::PrimarySeqI") ||
648 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
649 $_is_strict = _is_alphabet_strict($seqobj);
652 # Codon counts only make sense for nucleic acid sequences
653 my $alphabet = $seqobj->alphabet();
655 unless ($alphabet =~ /[dr]na/i) {
656 $seqobj->throw("Codon counts only meaningful for dna or rna, ".
657 "not for $alphabet sequences.");
660 # If sequence contains ambiguous bases, warn that codons
661 # containing them will all be lumped together in the count.
663 if (!$_is_strict ) {
664 $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
665 " All codons with ambiguous bases will be added together in count.")
666 if $self->verbose >= 0 ;
669 my $seq = $seqobj->seq();
671 # Now step through the string by threes and count the codons
673 CODON:
674 while (length($seq) > 2) {
675 $codon = uc substr($seq,0,3);
676 $seq = substr($seq,3);
677 if ($codon =~ /[^ACTGU]/i) {
678 $$rcount{'ambiguous'}++; #lump together ambiguous codons
679 next CODON;
681 if (!defined $$rcount{$codon}) {
682 $$rcount{$codon}= 1 ;
683 next CODON;
685 $$rcount{$codon}++; # default
688 if ($_is_instance) {
689 $self->{'_codon_count'} = $rcount; # Save in case called again later
692 return $rcount;
696 =head2 hydropathicity
698 Title : hydropathicity
699 Usage : $gravy = $seqstats->hydropathicity(); or
700 $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
702 Function: Calculates the mean Kyte-Doolittle hydropathicity for a
703 protein sequence. Also known as the "gravy" score. Refer to
704 Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
705 Example :
706 Returns : float
707 Args : None or reference to sequence object
709 Throws : an exception if type of sequence is not protein.
711 =cut
713 sub hydropathicity {
714 my $seqobj;
715 my $_is_strict;
716 my $element = '';
717 my $_is_instance = 1 ;
718 my $self = shift @_;
719 my $object_argument = shift @_;
721 if (defined $object_argument) {
722 $_is_instance = 0;
725 if ($_is_instance) {
726 if (my $gravy = $self->{'_hydropathicity'}) {
727 return $gravy; # return value if previously calculated
729 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
730 $seqobj = $self->{'_seqref'};
731 } else {
732 $seqobj = $object_argument;
733 $seqobj->isa("Bio::PrimarySeqI") ||
734 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
735 $_is_strict = _is_alphabet_strict($seqobj);
738 # hydropathicity not menaingful for empty sequences
739 unless ($seqobj->length() > 0) {
740 $seqobj->throw("hydropathicity not defined for zero-length sequences");
743 # hydropathicity only make sense for protein sequences
744 my $alphabet = $seqobj->alphabet();
746 unless ($alphabet =~ /protein/i) {
747 $seqobj->throw("hydropathicity only meaningful for protein, ".
748 "not for $alphabet sequences.");
751 # If sequence contains ambiguous bases, warn that codons
752 # containing them will all be lumped together in the count.
754 unless ($_is_strict ) {
755 $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
756 "Hydropathicity can not be caculated.")
759 my $seq = $seqobj->seq();
761 # Now step through the string and add up the hydropathicity values
763 my $gravy = 0;
764 for my $i ( 0 .. length($seq) ) {
765 my $codon = uc(substr($seq,$i,1));
766 $gravy += $amino_hydropathicity->{$codon}||0; # table look-up
768 $gravy /= length($seq);
771 if ($_is_instance) {
772 $self->{'_hydropathicity'} = $gravy; # Save in case called again later
775 return $gravy;
779 =head2 _is_alphabet_strict
781 Title : _is_alphabet_strict
782 Usage :
783 Function: internal function to determine whether there are
784 any ambiguous elements in the current sequence
785 Example :
786 Returns : 1 if strict alphabet is being used,
787 0 if ambiguous elements are present
788 Args :
790 Throws : an exception if type of sequence is unknown (ie amino or
791 nucleic) or if unknown letter in alphabet. Ambiguous
792 monomers are allowed.
794 =cut
796 sub _is_alphabet_strict {
798 my ($seqobj) = @_;
799 my $moltype = $seqobj->alphabet();
801 # convert everything to upper case to be safe
802 my $seqstring = uc $seqobj->seq();
804 # Since T is used in RichSeq RNA sequences, do conversion locally
805 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
807 # First we check if only the 'strict' letters are present in the
808 # sequence string If not, we check whether the remaining letters
809 # are ambiguous monomers or whether there are illegal letters in
810 # the string
812 # $alpha_array is a ref to an array of the 'strictly' allowed letters
813 my $alpha_array = $Alphabets_strict{$moltype} ;
815 # $alphabet contains the allowed letters in string form
816 my $alphabet = join ('', @$alpha_array) ;
817 unless ($seqstring =~ /[^$alphabet]/) {
818 return 1 ;
821 # Next try to match with the alphabet's ambiguous letters
822 $alpha_array = $Alphabets{$moltype} ;
823 $alphabet = join ('', @$alpha_array) ;
825 unless ($seqstring =~ /[^$alphabet]/) {
826 return 0 ;
829 # If we got here there is an illegal letter in the sequence
830 $seqobj->throw("Alphabet not OK for $seqobj");
833 =head2 _print_data
835 Title : _print_data
836 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
837 Function: Displays dna / rna parameters (used for debugging)
838 Returns : 1
839 Args : None
841 Used for debugging.
843 =cut
845 sub _print_data {
847 print "\n adenine = : $adenine \n";
848 print "\n guanine = : $guanine \n";
849 print "\n cytosine = : $cytosine \n";
850 print "\n thymine = : $thymine \n";
851 print "\n uracil = : $uracil \n";
853 print "\n dna_A_wt = : $dna_A_wt \n";
854 print "\n dna_C_wt = : $dna_C_wt \n";
855 print "\n dna_G_wt = : $dna_G_wt \n";
856 print "\n dna_T_wt = : $dna_T_wt \n";
858 print "\n rna_A_wt = : $rna_A_wt \n";
859 print "\n rna_C_wt = : $rna_C_wt \n";
860 print "\n rna_G_wt = : $rna_G_wt \n";
861 print "\n rna_U_wt = : $rna_U_wt \n";
863 return 1;