tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Tools / SeqStats.pm
blobf82520d59d29682e803c01f5194d7700e724e759
1 # $Id$
3 # BioPerl module for Bio::Tools::SeqStats
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by
9 # Copyright Peter Schattner
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::SeqStats - Object holding statistics for one
18 particular sequence
20 =head1 SYNOPSIS
22 # build a primary nucleic acid or protein sequence object somehow
23 # then build a statistics object from the sequence object
25 $seqobj = Bio::PrimarySeq->new(-seq => 'ACTGTGGCGTCAACTG',
26 -alphabet => 'dna',
27 -id => 'test');
28 $seq_stats = Bio::Tools::SeqStats->new(-seq => $seqobj);
30 # obtain a hash of counts of each type of monomer
31 # (i.e. amino or nucleic acid)
32 print "\nMonomer counts using statistics object\n";
33 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
34 $hash_ref = $seq_stats->count_monomers(); # e.g. for DNA sequence
35 foreach $base (sort keys %$hash_ref) {
36 print "Number of bases of type ", $base, "= ",
37 %$hash_ref->{$base},"\n";
40 # obtain the count directly without creating a new statistics object
41 print "\nMonomer counts without statistics object\n";
42 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
43 foreach $base (sort keys %$hash_ref) {
44 print "Number of bases of type ", $base, "= ",
45 %$hash_ref->{$base},"\n";
49 # obtain hash of counts of each type of codon in a nucleic acid sequence
50 print "\nCodon counts using statistics object\n";
51 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
52 foreach $base (sort keys %$hash_ref) {
53 print "Number of codons of type ", $base, "= ",
54 %$hash_ref->{$base},"\n";
57 # or
58 print "\nCodon counts without statistics object\n";
59 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
60 foreach $base (sort keys %$hash_ref) {
61 print "Number of codons of type ", $base, "= ",
62 %$hash_ref->{$base},"\n";
65 # Obtain the molecular weight of a sequence. Since the sequence
66 # may contain ambiguous monomers, the molecular weight is returned
67 # as a (reference to) a two element array containing greatest lower
68 # bound (GLB) and least upper bound (LUB) of the molecular weight
69 $weight = $seq_stats->get_mol_wt();
70 print "\nMolecular weight (using statistics object) of sequence ",
71 $seqobj->id(), " is between ", $$weight[0], " and " ,
72 $$weight[1], "\n";
74 # or
75 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
76 print "\nMolecular weight (without statistics object) of sequence ",
77 $seqobj->id(), " is between ", $$weight[0], " and " ,
78 $$weight[1], "\n";
80 # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
81 my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
82 -alphabet=>'protein');
83 my $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
84 print "might be hydropathic" if $gravy > 1;
86 =head1 DESCRIPTION
88 Bio::Tools::SeqStats is a lightweight object for the calculation of
89 simple statistical and numerical properties of a sequence. By
90 "lightweight" I mean that only "primary" sequences are handled by the
91 object. The calling script needs to create the appropriate primary
92 sequence to be passed to SeqStats if statistics on a sequence feature
93 are required. Similarly if a codon count is desired for a
94 frame-shifted sequence and/or a negative strand sequence, the calling
95 script needs to create that sequence and pass it to the SeqStats
96 object.
98 Nota that nucleotide sequences in bioperl do not strictly separate RNA
99 and DNA sequences. By convention, sequences from RNA molecules are
100 shown as is they were DNA. Objects are supposed to make the
101 distinction when needed. This class is one of the few where this
102 distinctions needs to be made. Internally, it changes all Ts into Us
103 before weight and monomer count.
105 SeqStats can be called in two distinct manners. If only a single
106 computation is required on a given sequence object, the method can be
107 called easily using the SeqStats object directly:
109 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
111 Alternately, if several computations will be required on a given
112 sequence object, an "instance" statistics object can be constructed
113 and used for the method calls:
115 $seq_stats = Bio::Tools::SeqStats->new($seqobj);
116 $monomers = $seq_stats->count_monomers();
117 $codons = $seq_stats->count_codons();
118 $weight = $seq_stats->get_mol_wt();
119 $gravy = $seq_stats->hydropathicity();
121 As currently implemented the object can return the following values
122 from a sequence:
124 =over
126 =item *
128 The molecular weight of the sequence: get_mol_wt()
130 =item *
132 The number of each type of monomer present: count_monomers()
134 =item *
136 The number of each codon present in a nucleic acid sequence:
137 count_codons()
139 =item *
141 The mean hydropathicity ("gravy" score) of a protein:
142 hydropathicity()
144 =back
146 For DNA and RNA sequences single-stranded weights are returned. The
147 molecular weights are calculated for neutral, or not ionized,
148 nucleic acids. The returned weight is the sum of the
149 base-sugar-phosphate residues of the chain plus one weight of water to
150 to account for the additional OH on the phosphate of the 5' residue
151 and the additional H on the sugar ring of the 3' residue. Note that
152 this leads to a difference of 18 in calculated molecular weights
153 compared to some other available programs (e.g. Informax VectorNTI).
155 Note that since sequences may contain ambiguous monomers (e.g. "M",
156 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
157 returns a two-element array containing the greatest lower bound and
158 least upper bound of the molecule. For a sequence with no ambiguous
159 monomers, the two elements of the returned array will be equal. The
160 method count_codons() handles ambiguous bases by simply counting all
161 ambiguous codons together and issuing a warning to that effect.
164 =head1 DEVELOPERS NOTES
166 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
168 Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
169 molecular weights to have the output match values in SWISS-PROT.
171 Torsten added hydropathicity calculation.
173 =head1 FEEDBACK
175 =head2 Mailing Lists
177 User feedback is an integral part of the evolution of this and other
178 Bioperl modules. Send your comments and suggestions preferably to one
179 of the Bioperl mailing lists. Your participation is much appreciated.
181 bioperl-l@bioperl.org - General discussion
182 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
184 =head2 Support
186 Please direct usage questions or support issues to the mailing list:
188 I<bioperl-l@bioperl.org>
190 rather than to the module maintainer directly. Many experienced and
191 reponsive experts will be able look at the problem and quickly
192 address it. Please include a thorough description of the problem
193 with code and data examples if at all possible.
195 =head2 Reporting Bugs
197 Report bugs to the Bioperl bug tracking system to help us keep track
198 the bugs and their resolution. Bug reports can be submitted the web:
200 http://bugzilla.open-bio.org/
202 =head1 AUTHOR - Peter Schattner
204 Email schattner AT alum.mit.edu
206 =head1 CONTRIBUTOR - Torsten Seemann
208 Email torsten.seemann AT infotech.monash.edu.au
210 =head1 APPENDIX
212 The rest of the documentation details each of the object
213 methods. Internal methods are usually preceded with a _
215 =cut
218 package Bio::Tools::SeqStats;
219 use strict;
220 use vars qw(%Alphabets %Alphabets_strict $amino_weights
221 $rna_weights $dna_weights %Weights $amino_hydropathicity);
222 use Bio::Seq;
223 use base qw(Bio::Root::Root);
225 BEGIN {
226 %Alphabets = (
227 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ],
228 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ],
229 'protein' => [ qw(A R N D C Q E G H I L K M F U
230 P S T W X Y V B Z J O *) ], # sac: added B, Z
233 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
234 %Alphabets_strict = (
235 'dna' => [ qw( A C G T ) ],
236 'rna' => [ qw( A C G U ) ],
237 'protein' => [ qw(A R N D C Q E G H I L K M F U
238 P S T W Y V O) ],
242 # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
243 # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
245 # Amino Acid alphabet
247 # ------------------------------------------
248 # Symbol Meaning
249 # ------------------------------------------
251 my $amino_A_wt = 89.09;
252 my $amino_C_wt = 121.15;
253 my $amino_D_wt = 133.1;
254 my $amino_E_wt = 147.13;
255 my $amino_F_wt = 165.19;
256 my $amino_G_wt = 75.07;
257 my $amino_H_wt = 155.16;
258 my $amino_I_wt = 131.17;
259 my $amino_K_wt = 146.19;
260 my $amino_L_wt = 131.17;
261 my $amino_M_wt = 149.21;
262 my $amino_N_wt = 132.12;
263 my $amino_O_wt = 255.31;
264 my $amino_P_wt = 115.13;
265 my $amino_Q_wt = 146.15;
266 my $amino_R_wt = 174.20;
267 my $amino_S_wt = 105.09;
268 my $amino_T_wt = 119.12;
269 my $amino_U_wt = 168.06;
270 my $amino_V_wt = 117.15;
271 my $amino_W_wt = 204.23;
272 my $amino_Y_wt = 181.19;
275 $amino_weights = {
276 'A' => [$amino_A_wt, $amino_A_wt], # Alanine
277 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
278 'C' => [$amino_C_wt, $amino_C_wt], # Cysteine
279 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
280 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
281 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine
282 'G' => [$amino_G_wt, $amino_G_wt], # Glycine
283 'H' => [$amino_H_wt, $amino_H_wt], # Histidine
284 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine
285 'J' => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
286 'K' => [$amino_K_wt, $amino_K_wt], # Lysine
287 'L' => [$amino_L_wt, $amino_L_wt], # Leucine
288 'M' => [$amino_M_wt, $amino_M_wt], # Methionine
289 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine
290 'O' => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
291 'P' => [$amino_P_wt, $amino_P_wt], # Proline
292 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine
293 'R' => [$amino_R_wt, $amino_R_wt], # Arginine
294 'S' => [$amino_S_wt, $amino_S_wt], # Serine
295 'T' => [$amino_T_wt, $amino_T_wt], # Threonine
296 'U' => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
297 'V' => [$amino_V_wt, $amino_V_wt], # Valine
298 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan
299 'X' => [$amino_G_wt, $amino_W_wt], # Unknown
300 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
301 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
304 # Extended Dna / Rna alphabet
305 use vars ( qw($C $O $N $H $P $water) );
306 use vars ( qw($adenine $guanine $cytosine $thymine $uracil));
307 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi));
308 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
309 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt));
310 use vars ( qw($dna_weights $rna_weights %Weights));
312 $C = 12.01;
313 $O = 16.00;
314 $N = 14.01;
315 $H = 1.01;
316 $P = 30.97;
317 $water = 18.015;
319 $adenine = 5 * $C + 5 * $N + 5 * $H;
320 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
321 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
322 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
323 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
325 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
326 # neutral (unionized) form
327 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
329 # the following are single strand molecular weights / base
330 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
331 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
332 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
333 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
335 $rna_A_wt = $adenine + $ribose_phosphate - $water;
336 $rna_C_wt = $cytosine + $ribose_phosphate - $water;
337 $rna_G_wt = $guanine + $ribose_phosphate - $water;
338 $rna_U_wt = $uracil + $ribose_phosphate - $water;
340 $dna_weights = {
341 'A' => [$dna_A_wt,$dna_A_wt], # Adenine
342 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine
343 'G' => [$dna_G_wt,$dna_G_wt], # Guanine
344 'T' => [$dna_T_wt,$dna_T_wt], # Thymine
345 'M' => [$dna_C_wt,$dna_A_wt], # A or C
346 'R' => [$dna_A_wt,$dna_G_wt], # A or G
347 'W' => [$dna_T_wt,$dna_A_wt], # A or T
348 'S' => [$dna_C_wt,$dna_G_wt], # C or G
349 'Y' => [$dna_C_wt,$dna_T_wt], # C or T
350 'K' => [$dna_T_wt,$dna_G_wt], # G or T
351 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G
352 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T
353 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T
354 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T
355 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
356 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
359 $rna_weights = {
360 'A' => [$rna_A_wt,$rna_A_wt], # Adenine
361 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine
362 'G' => [$rna_G_wt,$rna_G_wt], # Guanine
363 'U' => [$rna_U_wt,$rna_U_wt], # Uracil
364 'M' => [$rna_C_wt,$rna_A_wt], # A or C
365 'R' => [$rna_A_wt,$rna_G_wt], # A or G
366 'W' => [$rna_U_wt,$rna_A_wt], # A or U
367 'S' => [$rna_C_wt,$rna_G_wt], # C or G
368 'Y' => [$rna_C_wt,$rna_U_wt], # C or U
369 'K' => [$rna_U_wt,$rna_G_wt], # G or U
370 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G
371 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U
372 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U
373 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U
374 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
375 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
378 %Weights = (
379 'dna' => $dna_weights,
380 'rna' => $rna_weights,
381 'protein' => $amino_weights,
384 # Amino acid scale: Hydropathicity.
385 # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
386 # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
388 $amino_hydropathicity = {
389 A => 1.800,
390 R => -4.500,
391 N => -3.500,
392 D => -3.500,
393 C => 2.500,
394 Q => -3.500,
395 E => -3.500,
396 G => -0.400,
397 H => -3.200,
398 I => 4.500,
399 L => 3.800,
400 K => -3.900,
401 M => 1.900,
402 F => 2.800,
403 P => -1.600,
404 S => -0.800,
405 T => -0.700,
406 W => -0.900,
407 Y => -1.300,
408 V => 4.200,
413 sub new {
414 my($class,@args) = @_;
415 my $self = $class->SUPER::new(@args);
417 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
418 unless ($seqobj->isa("Bio::PrimarySeqI")) {
419 $self->throw("SeqStats works only on PrimarySeqI objects");
421 if ( !defined $seqobj->alphabet ||
422 !defined $Alphabets{$seqobj->alphabet}) {
423 $self->throw("Must have a valid alphabet defined for seq (".
424 join(",",keys %Alphabets));
426 $self->{'_seqref'} = $seqobj;
427 # check the letters in the sequence
428 $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
429 return $self;
432 =head2 count_monomers
434 Title : count_monomers
435 Usage : $rcount = $seq_stats->count_monomers();
436 or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
437 Function: Counts the number of each type of monomer (amino acid or
438 base) in the sequence.
439 Ts are counted as Us in RNA sequences.
440 Example :
441 Returns : Reference to a hash in which keys are letters of the
442 genetic alphabet used and values are number of occurrences
443 of the letter in the sequence.
444 Args : None or reference to sequence object
445 Throws : Throws an exception if type of sequence is unknown (ie amino
446 or nucleic)or if unknown letter in alphabet. Ambiguous
447 elements are allowed.
449 =cut
451 sub count_monomers{
452 my %count = ();
453 my $seqobj;
454 my $_is_strict;
455 my $element = '';
456 my $_is_instance = 1 ;
457 my $self = shift @_;
458 my $object_argument = shift @_;
460 # First we need to determine if the present object is an instance
461 # object or if the sequence object has been passed as an argument
463 if (defined $object_argument) {
464 $_is_instance = 0;
467 # If we are using an instance object...
468 if ($_is_instance) {
469 if ($self->{'_monomer_count'}) {
470 return $self->{'_monomer_count'}; # return count if previously calculated
472 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
473 $seqobj = $self->{'_seqref'};
474 } else {
475 # otherwise...
476 $seqobj = $object_argument;
478 # Following two lines lead to error in "throw" routine
479 $seqobj->isa("Bio::PrimarySeqI") ||
480 $self->throw("SeqStats works only on PrimarySeqI objects");
481 # is alphabet OK? Is it strict?
482 $_is_strict = _is_alphabet_strict($seqobj);
485 my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
486 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters
488 # convert everything to upper case to be safe
489 my $seqstring = uc $seqobj->seq();
491 # Since T is used in RichSeq RNA sequences, do conversion locally
492 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
494 # For each letter, count the number of times it appears in
495 # the sequence
496 LETTER:
497 foreach $element (@$alphabet) {
498 # skip terminator symbol which may confuse regex
499 next LETTER if $element eq '*';
500 $count{$element} = ( $seqstring =~ s/$element/$element/g);
503 if ($_is_instance) {
504 $self->{'_monomer_count'} = \%count; # Save in case called again later
507 return \%count;
510 =head2 get_mol_wt
512 Title : get_mol_wt
513 Usage : $wt = $seqobj->get_mol_wt() or
514 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
515 Function: Calculate molecular weight of sequence
516 Ts are counted as Us in RNA sequences.
517 Example :
519 Returns : Reference to two element array containing lower and upper
520 bounds of molecule molecular weight. For DNA and RNA
521 sequences single-stranded weights are returned. If
522 sequence contains no ambiguous elements, both entries in
523 array are equal to molecular weight of molecule.
524 Args : None or reference to sequence object
525 Throws : Exception if type of sequence is unknown (ie not amino or
526 nucleic) or if unknown letter in alphabet. Ambiguous
527 elements are allowed.
529 =cut
531 sub get_mol_wt {
532 my $seqobj;
533 my $_is_strict;
534 my $element = '';
535 my $_is_instance = 1 ;
536 my $self = shift @_;
537 my $object_argument = shift @_;
538 my ($weight_array, $rcount);
540 if (defined $object_argument) {
541 $_is_instance = 0;
544 if ($_is_instance) {
545 if ($weight_array = $self->{'_mol_wt'}) {
546 # return mol. weight if previously calculated
547 return $weight_array;
549 $seqobj = $self->{'_seqref'};
550 $rcount = $self->count_monomers();
551 } else {
552 $seqobj = $object_argument;
553 $seqobj->isa("Bio::PrimarySeqI") ||
554 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
555 $_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK?
556 $rcount = $self->count_monomers($seqobj);
559 # We will also need to know what type of monomer we are dealing with
560 my $moltype = $seqobj->alphabet();
562 # In general,the molecular weight is bounded below by the sum of the
563 # weights of lower bounds of each alphabet symbol times the number of
564 # occurrences of the symbol in the sequence. A similar upper bound on
565 # the weight is also calculated.
567 # Note that for "strict" (i.e. unambiguous) sequences there is an
568 # inefficiency since the upper bound = the lower bound and there are
569 # two calculations. However, this decrease in performance will be
570 # minor and leads to significantly more readable code.
572 my $weight_lower_bound = 0;
573 my $weight_upper_bound = 0;
574 my $weight_table = $Weights{$moltype};
576 # compute weight of all the residues
577 foreach $element (keys %$rcount) {
578 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
579 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
581 if ($moltype =~ /protein/) {
582 # remove H2O during peptide bond formation.
583 $weight_lower_bound -= $water * ($seqobj->length - 1);
584 $weight_upper_bound -= $water * ($seqobj->length - 1);
585 } else {
586 # Correction because phosphate of 5' residue has additional OH and
587 # sugar ring of 3' residue has additional H
588 $weight_lower_bound += $water;
589 $weight_upper_bound += $water;
592 $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
593 $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
595 $weight_array = [$weight_lower_bound, $weight_upper_bound];
597 if ($_is_instance) {
598 $self->{'_mol_wt'} = $weight_array; # Save in case called again later
600 return $weight_array;
604 =head2 count_codons
606 Title : count_codons
607 Usage : $rcount = $seqstats->count_codons() or
608 $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
609 Function: Counts the number of each type of codons for a dna or rna
610 sequence, starting at the 1st triple of the input sequence.
611 Example :
612 Returns : Reference to a hash in which keys are codons of the genetic
613 alphabet used and values are number of occurrences of the
614 codons in the sequence. All codons with "ambiguous" bases
615 are counted together.
616 Args : None or sequence object
617 Throws : an exception if type of sequence is unknown or protein.
619 =cut
621 sub count_codons {
622 my $rcount = {};
623 my $codon ;
624 my $seqobj;
625 my $_is_strict;
626 my $element = '';
627 my $_is_instance = 1 ;
628 my $self = shift @_;
629 my $object_argument = shift @_;
631 if (defined $object_argument) {
632 $_is_instance = 0;
635 if ($_is_instance) {
636 if ($rcount = $self->{'_codon_count'}) {
637 return $rcount; # return count if previously calculated
639 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
640 $seqobj = $self->{'_seqref'};
641 } else {
642 $seqobj = $object_argument;
643 $seqobj->isa("Bio::PrimarySeqI") ||
644 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
645 $_is_strict = _is_alphabet_strict($seqobj);
648 # Codon counts only make sense for nucleic acid sequences
649 my $alphabet = $seqobj->alphabet();
651 unless ($alphabet =~ /[dr]na/i) {
652 $seqobj->throw("Codon counts only meaningful for dna or rna, ".
653 "not for $alphabet sequences.");
656 # If sequence contains ambiguous bases, warn that codons
657 # containing them will all be lumped together in the count.
659 if (!$_is_strict ) {
660 $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
661 " All codons with ambiguous bases will be added together in count.")
662 if $self->verbose >= 0 ;
665 my $seq = $seqobj->seq();
667 # Now step through the string by threes and count the codons
669 CODON:
670 while (length($seq) > 2) {
671 $codon = uc substr($seq,0,3);
672 $seq = substr($seq,3);
673 if ($codon =~ /[^ACTGU]/i) {
674 $$rcount{'ambiguous'}++; #lump together ambiguous codons
675 next CODON;
677 if (!defined $$rcount{$codon}) {
678 $$rcount{$codon}= 1 ;
679 next CODON;
681 $$rcount{$codon}++; # default
684 if ($_is_instance) {
685 $self->{'_codon_count'} = $rcount; # Save in case called again later
688 return $rcount;
692 =head2 hydropathicity
694 Title : hydropathicity
695 Usage : $gravy = $seqstats->hydropathicity(); or
696 $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
698 Function: Calculates the mean Kyte-Doolittle hydropathicity for a
699 protein sequence. Also known as the "gravy" score. Refer to
700 Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
701 Example :
702 Returns : float
703 Args : None or reference to sequence object
705 Throws : an exception if type of sequence is not protein.
707 =cut
709 sub hydropathicity {
710 my $seqobj;
711 my $_is_strict;
712 my $element = '';
713 my $_is_instance = 1 ;
714 my $self = shift @_;
715 my $object_argument = shift @_;
717 if (defined $object_argument) {
718 $_is_instance = 0;
721 if ($_is_instance) {
722 if (my $gravy = $self->{'_hydropathicity'}) {
723 return $gravy; # return value if previously calculated
725 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
726 $seqobj = $self->{'_seqref'};
727 } else {
728 $seqobj = $object_argument;
729 $seqobj->isa("Bio::PrimarySeqI") ||
730 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
731 $_is_strict = _is_alphabet_strict($seqobj);
734 # hydropathicity not menaingful for empty sequences
735 unless ($seqobj->length() > 0) {
736 $seqobj->throw("hydropathicity not defined for zero-length sequences");
739 # hydropathicity only make sense for protein sequences
740 my $alphabet = $seqobj->alphabet();
742 unless ($alphabet =~ /protein/i) {
743 $seqobj->throw("hydropathicity only meaningful for protein, ".
744 "not for $alphabet sequences.");
747 # If sequence contains ambiguous bases, warn that codons
748 # containing them will all be lumped together in the count.
750 unless ($_is_strict ) {
751 $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
752 "Hydropathicity can not be caculated.")
755 my $seq = $seqobj->seq();
757 # Now step through the string and add up the hydropathicity values
759 my $gravy = 0;
760 for my $i ( 0 .. length($seq) ) {
761 my $codon = uc(substr($seq,$i,1));
762 $gravy += $amino_hydropathicity->{$codon}||0; # table look-up
764 $gravy /= length($seq);
767 if ($_is_instance) {
768 $self->{'_hydropathicity'} = $gravy; # Save in case called again later
771 return $gravy;
775 =head2 _is_alphabet_strict
777 Title : _is_alphabet_strict
778 Usage :
779 Function: internal function to determine whether there are
780 any ambiguous elements in the current sequence
781 Example :
782 Returns : 1 if strict alphabet is being used,
783 0 if ambiguous elements are present
784 Args :
786 Throws : an exception if type of sequence is unknown (ie amino or
787 nucleic) or if unknown letter in alphabet. Ambiguous
788 monomers are allowed.
790 =cut
792 sub _is_alphabet_strict {
794 my ($seqobj) = @_;
795 my $moltype = $seqobj->alphabet();
797 # convert everything to upper case to be safe
798 my $seqstring = uc $seqobj->seq();
800 # Since T is used in RichSeq RNA sequences, do conversion locally
801 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
803 # First we check if only the 'strict' letters are present in the
804 # sequence string If not, we check whether the remaining letters
805 # are ambiguous monomers or whether there are illegal letters in
806 # the string
808 # $alpha_array is a ref to an array of the 'strictly' allowed letters
809 my $alpha_array = $Alphabets_strict{$moltype} ;
811 # $alphabet contains the allowed letters in string form
812 my $alphabet = join ('', @$alpha_array) ;
813 unless ($seqstring =~ /[^$alphabet]/) {
814 return 1 ;
817 # Next try to match with the alphabet's ambiguous letters
818 $alpha_array = $Alphabets{$moltype} ;
819 $alphabet = join ('', @$alpha_array) ;
821 unless ($seqstring =~ /[^$alphabet]/) {
822 return 0 ;
825 # If we got here there is an illegal letter in the sequence
826 $seqobj->throw("Alphabet not OK for $seqobj");
829 =head2 _print_data
831 Title : _print_data
832 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
833 Function: Displays dna / rna parameters (used for debugging)
834 Returns : 1
835 Args : None
837 Used for debugging.
839 =cut
841 sub _print_data {
843 print "\n adenine = : $adenine \n";
844 print "\n guanine = : $guanine \n";
845 print "\n cytosine = : $cytosine \n";
846 print "\n thymine = : $thymine \n";
847 print "\n uracil = : $uracil \n";
849 print "\n dna_A_wt = : $dna_A_wt \n";
850 print "\n dna_C_wt = : $dna_C_wt \n";
851 print "\n dna_G_wt = : $dna_G_wt \n";
852 print "\n dna_T_wt = : $dna_T_wt \n";
854 print "\n rna_A_wt = : $rna_A_wt \n";
855 print "\n rna_C_wt = : $rna_C_wt \n";
856 print "\n rna_G_wt = : $rna_G_wt \n";
857 print "\n rna_U_wt = : $rna_U_wt \n";
859 return 1;