Bio/Tools/SeqStats.pm

   1 #
   2 # BioPerl module for Bio::Tools::SeqStats
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by
   7 #
   8 # Copyright Peter Schattner
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::SeqStats - Object holding statistics for one
  17 particular sequence
  18
  19 =head1 SYNOPSIS
  20
  21   # build a primary nucleic acid or protein sequence object somehow
  22   # then build a statistics object from the sequence object
  23
  24   $seqobj = Bio::PrimarySeq->new(-seq      => 'ACTGTGGCGTCAACTG',
  25                                  -alphabet => 'dna',
  26                                  -id       => 'test');
  27   $seq_stats  =  Bio::Tools::SeqStats->new(-seq => $seqobj);
  28
  29   # obtain a hash of counts of each type of monomer
  30   # (i.e. amino or nucleic acid)
  31   print "\nMonomer counts using statistics object\n";
  32   $seq_stats  =  Bio::Tools::SeqStats->new(-seq=>$seqobj);
  33   $hash_ref = $seq_stats->count_monomers();  # e.g. for DNA sequence
  34   foreach $base (sort keys %$hash_ref) {
  35       print "Number of bases of type ", $base, "= ",
  36          %$hash_ref->{$base},"\n";
  37   }
  38
  39   # obtain the count directly without creating a new statistics object
  40   print "\nMonomer counts without statistics object\n";
  41   $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
  42   foreach $base (sort keys %$hash_ref) {
  43       print "Number of bases of type ", $base, "= ",
  44          %$hash_ref->{$base},"\n";
  45   }
  46
  47
  48   # obtain hash of counts of each type of codon in a nucleic acid sequence
  49   print "\nCodon counts using statistics object\n";
  50   $hash_ref = $seq_stats-> count_codons();  # for nucleic acid sequence
  51   foreach $base (sort keys %$hash_ref) {
  52       print "Number of codons of type ", $base, "= ",
  53          %$hash_ref->{$base},"\n";
  54   }
  55
  56   #  or
  57   print "\nCodon counts without statistics object\n";
  58   $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
  59   foreach $base (sort keys %$hash_ref) {
  60       print "Number of codons of type ", $base, "= ",
  61          %$hash_ref->{$base},"\n";
  62   }
  63
  64   # Obtain the molecular weight of a sequence. Since the sequence
  65   # may contain ambiguous monomers, the molecular weight is returned
  66   # as a (reference to) a two element array containing greatest lower
  67   # bound (GLB) and least upper bound (LUB) of the molecular weight
  68   $weight = $seq_stats->get_mol_wt();
  69   print "\nMolecular weight (using statistics object) of sequence ",
  70           $seqobj->id(), " is between ", $$weight[0], " and " ,
  71           $$weight[1], "\n";
  72
  73   #  or
  74   $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
  75   print "\nMolecular weight (without statistics object) of sequence ",
  76         $seqobj->id(), " is between ", $$weight[0], " and " ,
  77         $$weight[1], "\n";
  78
  79   # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
  80   my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
  81                                   -alphabet=>'protein');
  82   my $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
  83   print "might be hydropathic" if $gravy > 1;
  84
  85 =head1 DESCRIPTION
  86
  87 Bio::Tools::SeqStats is a lightweight object for the calculation of
  88 simple statistical and numerical properties of a sequence. By
  89 "lightweight" I mean that only "primary" sequences are handled by the
  90 object.  The calling script needs to create the appropriate primary
  91 sequence to be passed to SeqStats if statistics on a sequence feature
  92 are required.  Similarly if a codon count is desired for a
  93 frame-shifted sequence and/or a negative strand sequence, the calling
  94 script needs to create that sequence and pass it to the SeqStats
  95 object.
  96
  97 Nota that nucleotide sequences in bioperl do not strictly separate RNA
  98 and DNA sequences. By convention, sequences from RNA molecules are
  99 shown as is they were DNA. Objects are supposed to make the
 100 distinction when needed. This class is one of the few where this
 101 distinctions needs to be made. Internally, it changes all Ts into Us
 102 before weight and monomer count.
 103
 104 SeqStats can be called in two distinct manners.  If only a single
 105 computation is required on a given sequence object, the method can be
 106 called easily using the SeqStats object directly:
 107
 108   $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
 109
 110 Alternately, if several computations will be required on a given
 111 sequence object, an "instance" statistics object can be constructed
 112 and used for the method calls:
 113
 114   $seq_stats = Bio::Tools::SeqStats->new($seqobj);
 115   $monomers = $seq_stats->count_monomers();
 116   $codons = $seq_stats->count_codons();
 117   $weight = $seq_stats->get_mol_wt();
 118   $gravy = $seq_stats->hydropathicity();
 119
 120 As currently implemented the object can return the following values
 121 from a sequence:
 122
 123 =over
 124
 125 =item *
 126
 127 The molecular weight of the sequence: get_mol_wt()
 128
 129 =item *
 130
 131 The number of each type of monomer present: count_monomers()
 132
 133 =item *
 134
 135 The number of each codon present in a nucleic acid sequence:
 136 count_codons()
 137
 138 =item *
 139
 140 The mean hydropathicity ("gravy" score) of a protein:
 141 hydropathicity()
 142
 143 =back
 144
 145 For DNA and RNA sequences single-stranded weights are returned. The
 146 molecular weights are calculated for neutral, or not ionized,
 147 nucleic acids. The returned weight is the sum of the
 148 base-sugar-phosphate residues of the chain plus one weight of water to
 149 to account for the additional OH on the phosphate of the 5' residue
 150 and the additional H on the sugar ring of the 3' residue.  Note that
 151 this leads to a difference of 18 in calculated molecular weights
 152 compared to some other available programs (e.g. Informax VectorNTI).
 153
 154 Note that since sequences may contain ambiguous monomers (e.g. "M",
 155 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
 156 returns a two-element array containing the greatest lower bound and
 157 least upper bound of the molecule. For a sequence with no ambiguous
 158 monomers, the two elements of the returned array will be equal. The
 159 method count_codons() handles ambiguous bases by simply counting all
 160 ambiguous codons together and issuing a warning to that effect.
 161
 162
 163 =head1 DEVELOPERS NOTES
 164
 165 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
 166
 167 Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
 168 molecular weights to have the output match values in SWISS-PROT.
 169
 170 Torsten added hydropathicity calculation.
 171
 172 =head1 FEEDBACK
 173
 174 =head2 Mailing Lists
 175
 176 User feedback is an integral part of the evolution of this and other
 177 Bioperl modules. Send your comments and suggestions preferably to one
 178 of the Bioperl mailing lists.  Your participation is much appreciated.
 179
 180   bioperl-l@bioperl.org                  - General discussion
 181   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 182
 183 =head2 Support
 184
 185 Please direct usage questions or support issues to the mailing list:
 186
 187 I<bioperl-l@bioperl.org>
 188
 189 rather than to the module maintainer directly. Many experienced and
 190 reponsive experts will be able look at the problem and quickly
 191 address it. Please include a thorough description of the problem
 192 with code and data examples if at all possible.
 193
 194 =head2 Reporting Bugs
 195
 196 Report bugs to the Bioperl bug tracking system to help us keep track
 197 the bugs and their resolution.  Bug reports can be submitted the web:
 198
 199   https://github.com/bioperl/bioperl-live/issues
 200
 201 =head1 AUTHOR - Peter Schattner
 202
 203 Email schattner AT alum.mit.edu
 204
 205 =head1 CONTRIBUTOR - Torsten Seemann
 206
 207 Email torsten.seemann AT infotech.monash.edu.au
 208
 209 =head1 APPENDIX
 210
 211 The rest of the documentation details each of the object
 212 methods. Internal methods are usually preceded with a _
 213
 214 =cut
 215
 216
 217 package Bio::Tools::SeqStats;
 218 use strict;
 219 use vars qw(%Alphabets %Alphabets_strict $amino_weights
 220             $rna_weights $dna_weights %Weights $amino_hydropathicity);
 221 use Bio::Seq;
 222 use base qw(Bio::Root::Root);
 223
 224 BEGIN {
 225         %Alphabets =   (
 226                          'dna'     => [ qw(A C G T R Y M K S W H B V D X N) ],
 227                     'rna'     => [ qw(A C G U R Y M K S W H B V D X N) ],
 228                     'protein' => [ qw(A R N D C Q E G H I L K M F U
 229                                                                          P S T W X Y V B Z J O *) ], # sac: added B, Z
 230                                                 );
 231
 232 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
 233     %Alphabets_strict = (
 234                          'dna'     => [ qw( A C G T ) ],
 235                          'rna'     => [ qw( A C G U ) ],
 236                          'protein'    => [ qw(A R N D C Q E G H I L K M F U
 237                                               P S T W Y V O) ],
 238                          );
 239
 240
 241 #  IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
 242 #   Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
 243
 244 #  Amino Acid alphabet
 245
 246 # ------------------------------------------
 247 # Symbol           Meaning
 248 # ------------------------------------------
 249
 250     my $amino_A_wt = 89.09;
 251     my $amino_C_wt = 121.15;
 252     my $amino_D_wt = 133.1;
 253     my $amino_E_wt = 147.13;
 254     my $amino_F_wt = 165.19;
 255     my $amino_G_wt = 75.07;
 256     my $amino_H_wt = 155.16;
 257     my $amino_I_wt = 131.17;
 258     my $amino_K_wt = 146.19;
 259     my $amino_L_wt = 131.17;
 260     my $amino_M_wt = 149.21;
 261     my $amino_N_wt = 132.12;
 262     my $amino_O_wt = 255.31;
 263     my $amino_P_wt = 115.13;
 264     my $amino_Q_wt = 146.15;
 265     my $amino_R_wt = 174.20;
 266     my $amino_S_wt = 105.09;
 267     my $amino_T_wt = 119.12;
 268     my $amino_U_wt = 168.06;
 269     my $amino_V_wt = 117.15;
 270     my $amino_W_wt = 204.23;
 271     my $amino_Y_wt = 181.19;
 272
 273
 274     $amino_weights = {
 275         'A'     => [$amino_A_wt, $amino_A_wt], # Alanine
 276         'B'     => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
 277         'C'     => [$amino_C_wt, $amino_C_wt], # Cysteine
 278         'D'     => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
 279         'E'     => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
 280         'F'     => [$amino_F_wt, $amino_F_wt], # Phenylalanine
 281         'G'     => [$amino_G_wt, $amino_G_wt], # Glycine
 282         'H'     => [$amino_H_wt, $amino_H_wt], # Histidine
 283         'I'     => [$amino_I_wt, $amino_I_wt], # Isoleucine
 284         'J'     => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
 285         'K'     => [$amino_K_wt, $amino_K_wt], # Lysine
 286         'L'     => [$amino_L_wt, $amino_L_wt], # Leucine
 287         'M'     => [$amino_M_wt, $amino_M_wt], # Methionine
 288         'N'     => [$amino_N_wt, $amino_N_wt], # Asparagine
 289         'O'     => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
 290         'P'     => [$amino_P_wt, $amino_P_wt], # Proline
 291         'Q'     => [$amino_Q_wt, $amino_Q_wt], # Glutamine
 292         'R'     => [$amino_R_wt, $amino_R_wt], # Arginine
 293         'S'     => [$amino_S_wt, $amino_S_wt], # Serine
 294         'T'     => [$amino_T_wt, $amino_T_wt], # Threonine
 295         'U'     => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
 296         'V'     => [$amino_V_wt, $amino_V_wt], # Valine
 297         'W'     => [$amino_W_wt, $amino_W_wt], # Tryptophan
 298         'X'     => [$amino_G_wt, $amino_W_wt], # Unknown
 299         'Y'     => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
 300         'Z'     => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
 301     };
 302
 303     # Extended Dna / Rna alphabet
 304     use vars ( qw($C $O $N $H $P $water) );
 305     use vars ( qw($adenine   $guanine   $cytosine   $thymine   $uracil));
 306     use vars ( qw($ribose_phosphate   $deoxyribose_phosphate   $ppi));
 307     use vars ( qw($dna_A_wt   $dna_C_wt   $dna_G_wt  $dna_T_wt
 308                   $rna_A_wt   $rna_C_wt   $rna_G_wt   $rna_U_wt));
 309     use vars ( qw($dna_weights   $rna_weights   %Weights));
 310
 311     $C = 12.01;
 312     $O = 16.00;
 313     $N = 14.01;
 314     $H = 1.01;
 315     $P = 30.97;
 316     $water = 18.015;
 317
 318     $adenine = 5 * $C + 5 * $N + 5 * $H;
 319     $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
 320     $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
 321     $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
 322     $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
 323
 324     $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
 325     # neutral (unionized) form
 326     $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
 327
 328     # the following are single strand molecular weights / base
 329     $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
 330     $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
 331     $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
 332     $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
 333
 334     $rna_A_wt = $adenine + $ribose_phosphate - $water;
 335     $rna_C_wt = $cytosine + $ribose_phosphate - $water;
 336     $rna_G_wt = $guanine + $ribose_phosphate - $water;
 337     $rna_U_wt = $uracil + $ribose_phosphate - $water;
 338
 339     $dna_weights = {
 340         'A'             => [$dna_A_wt,$dna_A_wt],            # Adenine
 341         'C'             => [$dna_C_wt,$dna_C_wt],            # Cytosine
 342         'G'             => [$dna_G_wt,$dna_G_wt],            # Guanine
 343         'T'             => [$dna_T_wt,$dna_T_wt],            # Thymine
 344         'M'             => [$dna_C_wt,$dna_A_wt],            # A or C
 345         'R'             => [$dna_A_wt,$dna_G_wt],            # A or G
 346         'W'             => [$dna_T_wt,$dna_A_wt],            # A or T
 347         'S'             => [$dna_C_wt,$dna_G_wt],            # C or G
 348         'Y'             => [$dna_C_wt,$dna_T_wt],            # C or T
 349         'K'             => [$dna_T_wt,$dna_G_wt],            # G or T
 350         'V'             => [$dna_C_wt,$dna_G_wt],            # A or C or G
 351         'H'             => [$dna_C_wt,$dna_A_wt],            # A or C or T
 352         'D'             => [$dna_T_wt,$dna_G_wt],            # A or G or T
 353         'B'             => [$dna_C_wt,$dna_G_wt],            # C or G or T
 354         'X'             => [$dna_C_wt,$dna_G_wt],            # G or A or T or C
 355         'N'             => [$dna_C_wt,$dna_G_wt],            # G or A or T or C
 356     };
 357
 358     $rna_weights =  {
 359         'A'             => [$rna_A_wt,$rna_A_wt],            # Adenine
 360         'C'             => [$rna_C_wt,$rna_C_wt],            # Cytosine
 361         'G'             => [$rna_G_wt,$rna_G_wt],            # Guanine
 362         'U'             => [$rna_U_wt,$rna_U_wt],            # Uracil
 363         'M'             => [$rna_C_wt,$rna_A_wt],            # A or C
 364         'R'             => [$rna_A_wt,$rna_G_wt],            # A or G
 365         'W'             => [$rna_U_wt,$rna_A_wt],            # A or U
 366         'S'             => [$rna_C_wt,$rna_G_wt],            # C or G
 367         'Y'             => [$rna_C_wt,$rna_U_wt],            # C or U
 368         'K'             => [$rna_U_wt,$rna_G_wt],            # G or U
 369         'V'             => [$rna_C_wt,$rna_G_wt],            # A or C or G
 370         'H'             => [$rna_C_wt,$rna_A_wt],            # A or C or U
 371         'D'             => [$rna_U_wt,$rna_G_wt],            # A or G or U
 372         'B'             => [$rna_C_wt,$rna_G_wt],            # C or G or U
 373         'X'             => [$rna_C_wt,$rna_G_wt],            # G or A or U or C
 374         'N'             => [$rna_C_wt,$rna_G_wt],            # G or A or U or C
 375     };
 376
 377     %Weights =   (
 378                   'dna'     =>  $dna_weights,
 379                   'rna'     =>  $rna_weights,
 380                   'protein' =>  $amino_weights,
 381                   );
 382
 383         # Amino acid scale: Hydropathicity.
 384         # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
 385         # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
 386
 387         $amino_hydropathicity = {
 388     A =>  1.800,
 389     R => -4.500,
 390     N => -3.500,
 391     D => -3.500,
 392     C =>  2.500,
 393     Q => -3.500,
 394     E => -3.500,
 395     G => -0.400,
 396     H => -3.200,
 397     I =>  4.500,
 398     L =>  3.800,
 399     K => -3.900,
 400     M =>  1.900,
 401     F =>  2.800,
 402     P => -1.600,
 403     S => -0.800,
 404     T => -0.700,
 405     W => -0.900,
 406     Y => -1.300,
 407     V =>  4.200,
 408         };
 409
 410 }
 411
 412 sub new {
 413         my($class,@args) = @_;
 414         my $self = $class->SUPER::new(@args);
 415
 416         my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
 417         unless  ($seqobj->isa("Bio::PrimarySeqI")) {
 418                 $self->throw("SeqStats works only on PrimarySeqI objects");
 419         }
 420         if ( !defined $seqobj->alphabet ||
 421                   !defined $Alphabets{$seqobj->alphabet}) {
 422                 $self->throw("Must have a valid alphabet defined for seq (".
 423                                                  join(",",keys %Alphabets));
 424         }
 425         $self->{'_seqref'} = $seqobj;
 426         # check the letters in the sequence
 427         $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
 428         return $self;
 429 }
 430
 431 =head2 count_monomers
 432
 433  Title   : count_monomers
 434  Usage   : $rcount = $seq_stats->count_monomers();
 435            or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
 436  Function: Counts the number of each type of monomer (amino acid or
 437                 base) in the sequence.
 438            Ts are counted as Us in RNA sequences.
 439  Example :
 440  Returns : Reference to a hash in which keys are letters of the
 441            genetic alphabet used and values are number of occurrences
 442            of the letter in the sequence.
 443  Args    : None or reference to sequence object
 444  Throws  : Throws an exception if type of sequence is unknown (ie amino
 445            or nucleic)or if unknown letter in alphabet. Ambiguous
 446            elements are allowed.
 447
 448 =cut
 449
 450 sub count_monomers{
 451         my %count  = ();
 452         my $seqobj;
 453         my $_is_strict;
 454         my $element = '';
 455         my $_is_instance = 1 ;
 456         my $self = shift @_;
 457         my $object_argument = shift @_;
 458
 459         # First we need to determine if the present object is an instance
 460         # object or if the sequence object has been passed as an argument
 461
 462         if (defined $object_argument) {
 463                 $_is_instance = 0;
 464         }
 465
 466         # If we are using an instance object...
 467         if ($_is_instance) {
 468                 if ($self->{'_monomer_count'}) {
 469                         return $self->{'_monomer_count'}; # return count if previously calculated
 470                 }
 471                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 472                 $seqobj =  $self->{'_seqref'};
 473         } else {
 474                 #  otherwise...
 475                 $seqobj =  $object_argument;
 476
 477                 #  Following two lines lead to error in "throw" routine
 478                 $seqobj->isa("Bio::PrimarySeqI") ||
 479                   $self->throw("SeqStats works only on PrimarySeqI objects");
 480                 # is alphabet OK? Is it strict?
 481                 $_is_strict =  _is_alphabet_strict($seqobj);
 482         }
 483
 484         my $alphabet =  $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
 485           $Alphabets{$seqobj->alphabet}  ; # get array of allowed letters
 486
 487         # convert everything to upper case to be safe
 488         my $seqstring = uc $seqobj->seq();
 489
 490         # Since T is used in RichSeq RNA sequences, do conversion locally
 491         $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
 492
 493         #  For each letter, count the number of times it appears in
 494         #  the sequence
 495  LETTER:
 496         foreach $element (@$alphabet) {
 497                 # skip terminator symbol which may confuse regex
 498                 next LETTER if $element eq '*';
 499                 $count{$element} = ( $seqstring =~ s/$element/$element/g);
 500         }
 501
 502         if ($_is_instance) {
 503                 $self->{'_monomer_count'} = \%count;  # Save in case called again later
 504         }
 505
 506         return \%count;
 507 }
 508
 509 =head2  get_mol_wt
 510
 511  Title   : get_mol_wt
 512  Usage   : $wt = $seqobj->get_mol_wt() or
 513            $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
 514  Function: Calculate molecular weight of sequence
 515            Ts are counted as Us in RNA sequences.
 516  Example :
 517
 518  Returns : Reference to two element array containing lower and upper
 519            bounds of molecule molecular weight. For DNA and RNA
 520            sequences single-stranded weights are returned. If
 521            sequence contains no ambiguous elements, both entries in
 522            array are equal to molecular weight of molecule.
 523  Args    : None or reference to sequence object
 524  Throws  : Exception if type of sequence is unknown (ie not amino or
 525            nucleic) or if unknown letter in alphabet. Ambiguous
 526            elements are allowed.
 527
 528 =cut
 529
 530 sub get_mol_wt {
 531         my $seqobj;
 532         my $_is_strict;
 533         my $element = '';
 534         my $_is_instance = 1 ;
 535         my $self = shift @_;
 536         my $object_argument = shift @_;
 537         my ($weight_array, $rcount);
 538
 539         if (defined $object_argument) {
 540                 $_is_instance = 0;
 541         }
 542
 543         if ($_is_instance) {
 544                 if ($weight_array = $self->{'_mol_wt'}) {
 545                         # return mol. weight if previously calculated
 546                         return $weight_array;
 547                 }
 548                 $seqobj =  $self->{'_seqref'};
 549                 $rcount = $self->count_monomers();
 550         } else {
 551                 $seqobj =  $object_argument;
 552                 $seqobj->isa("Bio::PrimarySeqI") ||
 553                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 554                 $_is_strict =  _is_alphabet_strict($seqobj); # is alphabet OK?
 555                 $rcount =  $self->count_monomers($seqobj);
 556         }
 557
 558         # We will also need to know what type of monomer we are dealing with
 559         my $moltype = $seqobj->alphabet();
 560
 561         # In general,the molecular weight is bounded below by the sum of the
 562         # weights of lower bounds of each alphabet symbol times the number of
 563         # occurrences of the symbol in the sequence. A similar upper bound on
 564         # the weight is also calculated.
 565
 566         # Note that for "strict" (i.e. unambiguous) sequences there is an
 567         # inefficiency since the upper bound = the lower bound and there are
 568         # two calculations.  However, this decrease in performance will be
 569         # minor and leads to significantly more readable code.
 570
 571         my $weight_lower_bound = 0;
 572         my $weight_upper_bound = 0;
 573         my $weight_table =  $Weights{$moltype};
 574     my $total_res;
 575
 576         # compute weight of all the residues
 577         foreach $element (keys %$rcount) {
 578                 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
 579                 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
 580
 581         # this tracks only the residues used for counting MW
 582         $total_res += $$rcount{$element};
 583         }
 584         if ($moltype =~ /protein/) {
 585         # remove H2O during peptide bond formation.
 586         $weight_lower_bound -= $water * ($total_res - 1);
 587         $weight_upper_bound -= $water * ($total_res - 1);
 588         } else {
 589         # Correction because phosphate of 5' residue has additional OH and
 590         # sugar ring of 3' residue has additional H
 591         $weight_lower_bound += $water;
 592         $weight_upper_bound += $water;
 593         }
 594
 595         $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
 596         $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
 597
 598         $weight_array = [$weight_lower_bound, $weight_upper_bound];
 599
 600         if ($_is_instance) {
 601                 $self->{'_mol_wt'} = $weight_array;  # Save in case called again later
 602         }
 603         return $weight_array;
 604 }
 605
 606
 607 =head2  count_codons
 608
 609  Title   : count_codons
 610  Usage   : $rcount = $seqstats->count_codons() or
 611            $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
 612  Function: Counts the number of each type of codons for a dna or rna
 613            sequence, starting at the 1st triple of the input sequence.
 614  Example :
 615  Returns : Reference to a hash in which keys are codons of the genetic
 616            alphabet used and values are number of occurrences of the
 617            codons in the sequence. All codons with "ambiguous" bases
 618            are counted together.
 619  Args    : None or sequence object
 620  Throws  : an exception if type of sequence is unknown or protein.
 621
 622 =cut
 623
 624 sub count_codons {
 625         my $rcount = {};
 626         my $codon ;
 627         my $seqobj;
 628         my $_is_strict;
 629         my $element = '';
 630         my $_is_instance = 1 ;
 631         my $self = shift @_;
 632         my $object_argument = shift @_;
 633
 634         if (defined $object_argument) {
 635                 $_is_instance = 0;
 636         }
 637
 638         if ($_is_instance) {
 639                 if ($rcount = $self->{'_codon_count'}) {
 640                         return $rcount;        # return count if previously calculated
 641                 }
 642                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 643                 $seqobj =  $self->{'_seqref'};
 644         } else {
 645                 $seqobj =  $object_argument;
 646                 $seqobj->isa("Bio::PrimarySeqI") ||
 647                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 648                 $_is_strict =  _is_alphabet_strict($seqobj);
 649         }
 650
 651         # Codon counts only make sense for nucleic acid sequences
 652         my $alphabet = $seqobj->alphabet();
 653
 654         unless ($alphabet =~ /[dr]na/i) {
 655                 $seqobj->throw("Codon counts only meaningful for dna or rna, ".
 656                                                         "not for $alphabet sequences.");
 657         }
 658
 659         # If sequence contains ambiguous bases, warn that codons
 660         # containing them will all be lumped together in the count.
 661
 662         if (!$_is_strict ) {
 663                 $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
 664                 " All codons with ambiguous bases will be added together in count.")
 665                     if $self->verbose >= 0 ;
 666         }
 667
 668         my $seq = $seqobj->seq();
 669
 670         # Now step through the string by threes and count the codons
 671
 672  CODON:
 673         while (length($seq) > 2) {
 674                 $codon = uc substr($seq,0,3);
 675                 $seq = substr($seq,3);
 676                 if ($codon =~ /[^ACTGU]/i) {
 677                         $$rcount{'ambiguous'}++; #lump together ambiguous codons
 678                         next CODON;
 679                 }
 680                 if (!defined $$rcount{$codon}) {
 681                         $$rcount{$codon}= 1 ;
 682                         next CODON;
 683                 }
 684                 $$rcount{$codon}++;  # default
 685         }
 686
 687         if ($_is_instance) {
 688                 $self->{'_codon_count'} = $rcount;  # Save in case called again later
 689         }
 690
 691         return $rcount;
 692 }
 693
 694
 695 =head2  hydropathicity
 696
 697  Title   : hydropathicity
 698  Usage   : $gravy = $seqstats->hydropathicity(); or
 699            $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
 700
 701  Function: Calculates the mean Kyte-Doolittle hydropathicity for a
 702            protein sequence. Also known as the "gravy" score. Refer to
 703            Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
 704  Example :
 705  Returns : float
 706  Args    : None or reference to sequence object
 707
 708  Throws  : an exception if type of sequence is not protein.
 709
 710 =cut
 711
 712 sub hydropathicity {
 713         my $seqobj;
 714         my $_is_strict;
 715         my $element = '';
 716         my $_is_instance = 1 ;
 717         my $self = shift @_;
 718         my $object_argument = shift @_;
 719
 720         if (defined $object_argument) {
 721                 $_is_instance = 0;
 722         }
 723
 724         if ($_is_instance) {
 725                 if (my $gravy = $self->{'_hydropathicity'}) {
 726                         return $gravy;        # return value if previously calculated
 727                 }
 728                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 729                 $seqobj =  $self->{'_seqref'};
 730         } else {
 731                 $seqobj =  $object_argument;
 732                 $seqobj->isa("Bio::PrimarySeqI") ||
 733                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 734                 $_is_strict =  _is_alphabet_strict($seqobj);
 735         }
 736
 737         # hydropathicity not menaingful for empty sequences
 738         unless ($seqobj->length() > 0) {
 739           $seqobj->throw("hydropathicity not defined for zero-length sequences");
 740         }
 741
 742         # hydropathicity only make sense for protein sequences
 743         my $alphabet = $seqobj->alphabet();
 744
 745         unless ($alphabet =~ /protein/i) {
 746                 $seqobj->throw("hydropathicity only meaningful for protein, ".
 747                                                         "not for $alphabet sequences.");
 748         }
 749
 750         # If sequence contains ambiguous bases, warn that codons
 751         # containing them will all be lumped together in the count.
 752
 753         unless ($_is_strict ) {
 754                 $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
 755                 "Hydropathicity can not be caculated.")
 756         }
 757
 758         my $seq = $seqobj->seq();
 759
 760         # Now step through the string and add up the hydropathicity values
 761
 762     my $gravy = 0;
 763     for my $i ( 0 .. length($seq) ) {
 764        my $codon = uc(substr($seq,$i,1));
 765        $gravy += $amino_hydropathicity->{$codon}||0; # table look-up
 766     }
 767     $gravy /= length($seq);
 768
 769
 770         if ($_is_instance) {
 771                 $self->{'_hydropathicity'} = $gravy;  # Save in case called again later
 772         }
 773
 774         return $gravy;
 775 }
 776
 777
 778 =head2  _is_alphabet_strict
 779
 780  Title   :  _is_alphabet_strict
 781  Usage   :
 782  Function: internal function to determine whether there are
 783            any ambiguous elements in the current sequence
 784  Example :
 785  Returns : 1 if strict alphabet is being used,
 786            0 if ambiguous elements are present
 787  Args    :
 788
 789  Throws  : an exception if type of sequence is unknown (ie amino or
 790            nucleic) or if unknown letter in alphabet. Ambiguous
 791            monomers are allowed.
 792
 793 =cut
 794
 795 sub _is_alphabet_strict {
 796
 797         my ($seqobj) = @_;
 798         my $moltype = $seqobj->alphabet();
 799
 800         # convert everything to upper case to be safe
 801         my $seqstring = uc $seqobj->seq();
 802
 803         # Since T is used in RichSeq RNA sequences, do conversion locally
 804         $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
 805
 806         # First we check if only the 'strict' letters are present in the
 807         # sequence string If not, we check whether the remaining letters
 808         # are ambiguous monomers or whether there are illegal letters in
 809         # the string
 810
 811         # $alpha_array is a ref to an array of the 'strictly' allowed letters
 812         my $alpha_array =   $Alphabets_strict{$moltype} ;
 813
 814         # $alphabet contains the allowed letters in string form
 815         my $alphabet = join ('', @$alpha_array) ;
 816         unless ($seqstring =~ /[^$alphabet]/)  {
 817                 return 1 ;
 818         }
 819
 820         # Next try to match with the alphabet's ambiguous letters
 821         $alpha_array =   $Alphabets{$moltype} ;
 822         $alphabet = join ('', @$alpha_array) ;
 823
 824         unless ($seqstring =~ /[^$alphabet]/)  {
 825                 return 0 ;
 826         }
 827
 828         # If we got here there is an illegal letter in the sequence
 829         $seqobj->throw("Alphabet not OK for $seqobj");
 830 }
 831
 832 =head2   _print_data
 833
 834  Title   : _print_data
 835  Usage   : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
 836  Function: Displays dna / rna parameters (used for debugging)
 837  Returns : 1
 838  Args    : None
 839
 840 Used for debugging.
 841
 842 =cut
 843
 844 sub _print_data {
 845
 846     print "\n adenine  = :  $adenine \n";
 847     print "\n guanine  = :  $guanine \n";
 848     print "\n cytosine = :  $cytosine \n";
 849     print "\n thymine  = :  $thymine \n";
 850     print "\n uracil   = :  $uracil \n";
 851
 852     print "\n dna_A_wt = :  $dna_A_wt \n";
 853     print "\n dna_C_wt = :  $dna_C_wt \n";
 854     print "\n dna_G_wt = :  $dna_G_wt \n";
 855     print "\n dna_T_wt = :  $dna_T_wt \n";
 856
 857     print "\n rna_A_wt = :  $rna_A_wt \n";
 858     print "\n rna_C_wt = :  $rna_C_wt \n";
 859     print "\n rna_G_wt = :  $rna_G_wt \n";
 860     print "\n rna_U_wt = :  $rna_U_wt \n";
 861
 862     return 1;
 863 }
 864
 865 1;