lib/Bio/Tools/SeqStats.pm

   1 #
   2 # BioPerl module for Bio::Tools::SeqStats
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by
   7 #
   8 # Copyright Peter Schattner
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::SeqStats - Object holding statistics for one
  17 particular sequence
  18
  19 =head1 SYNOPSIS
  20
  21   # build a primary nucleic acid or protein sequence object somehow
  22   # then build a statistics object from the sequence object
  23
  24   $seqobj = Bio::PrimarySeq->new(-seq      => 'ACTGTGGCGTCAACTG',
  25                                  -alphabet => 'dna',
  26                                  -id       => 'test');
  27   $seq_stats  =  Bio::Tools::SeqStats->new(-seq => $seqobj);
  28
  29   # obtain a hash of counts of each type of monomer
  30   # (i.e. amino or nucleic acid)
  31   print "\nMonomer counts using statistics object\n";
  32   $seq_stats  =  Bio::Tools::SeqStats->new(-seq=>$seqobj);
  33   $hash_ref = $seq_stats->count_monomers();  # e.g. for DNA sequence
  34   foreach $base (sort keys %$hash_ref) {
  35       print "Number of bases of type ", $base, "= ",
  36          %$hash_ref->{$base},"\n";
  37   }
  38
  39   # obtain the count directly without creating a new statistics object
  40   print "\nMonomer counts without statistics object\n";
  41   $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
  42   foreach $base (sort keys %$hash_ref) {
  43       print "Number of bases of type ", $base, "= ",
  44          %$hash_ref->{$base},"\n";
  45   }
  46
  47
  48   # obtain hash of counts of each type of codon in a nucleic acid sequence
  49   print "\nCodon counts using statistics object\n";
  50   $hash_ref = $seq_stats-> count_codons();  # for nucleic acid sequence
  51   foreach $base (sort keys %$hash_ref) {
  52       print "Number of codons of type ", $base, "= ",
  53          %$hash_ref->{$base},"\n";
  54   }
  55
  56   #  or
  57   print "\nCodon counts without statistics object\n";
  58   $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
  59   foreach $base (sort keys %$hash_ref) {
  60       print "Number of codons of type ", $base, "= ",
  61          %$hash_ref->{$base},"\n";
  62   }
  63
  64   # Obtain the molecular weight of a sequence. Since the sequence
  65   # may contain ambiguous monomers, the molecular weight is returned
  66   # as a (reference to) a two element array containing greatest lower
  67   # bound (GLB) and least upper bound (LUB) of the molecular weight
  68   $weight = $seq_stats->get_mol_wt();
  69   print "\nMolecular weight (using statistics object) of sequence ",
  70           $seqobj->id(), " is between ", $$weight[0], " and " ,
  71           $$weight[1], "\n";
  72
  73   #  or
  74   $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
  75   print "\nMolecular weight (without statistics object) of sequence ",
  76         $seqobj->id(), " is between ", $$weight[0], " and " ,
  77         $$weight[1], "\n";
  78
  79   # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
  80   my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
  81                                   -alphabet=>'protein');
  82   my $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
  83   print "might be hydropathic" if $gravy > 1;
  84
  85 =head1 DESCRIPTION
  86
  87 Bio::Tools::SeqStats is a lightweight object for the calculation of
  88 simple statistical and numerical properties of a sequence. By
  89 "lightweight" I mean that only "primary" sequences are handled by the
  90 object.  The calling script needs to create the appropriate primary
  91 sequence to be passed to SeqStats if statistics on a sequence feature
  92 are required.  Similarly if a codon count is desired for a
  93 frame-shifted sequence and/or a negative strand sequence, the calling
  94 script needs to create that sequence and pass it to the SeqStats
  95 object.
  96
  97 Nota that nucleotide sequences in bioperl do not strictly separate RNA
  98 and DNA sequences. By convention, sequences from RNA molecules are
  99 shown as is they were DNA. Objects are supposed to make the
 100 distinction when needed. This class is one of the few where this
 101 distinctions needs to be made. Internally, it changes all Ts into Us
 102 before weight and monomer count.
 103
 104 SeqStats can be called in two distinct manners.  If only a single
 105 computation is required on a given sequence object, the method can be
 106 called easily using the SeqStats object directly:
 107
 108   $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
 109
 110 Alternately, if several computations will be required on a given
 111 sequence object, an "instance" statistics object can be constructed
 112 and used for the method calls:
 113
 114   $seq_stats = Bio::Tools::SeqStats->new($seqobj);
 115   $monomers = $seq_stats->count_monomers();
 116   $codons = $seq_stats->count_codons();
 117   $weight = $seq_stats->get_mol_wt();
 118   $gravy = $seq_stats->hydropathicity();
 119
 120 As currently implemented the object can return the following values
 121 from a sequence:
 122
 123 =over
 124
 125 =item *
 126
 127 The molecular weight of the sequence: get_mol_wt()
 128
 129 =item *
 130
 131 The number of each type of monomer present: count_monomers()
 132
 133 =item *
 134
 135 The number of each codon present in a nucleic acid sequence:
 136 count_codons()
 137
 138 =item *
 139
 140 The mean hydropathicity ("gravy" score) of a protein:
 141 hydropathicity()
 142
 143 =back
 144
 145 For DNA and RNA sequences single-stranded weights are returned. The
 146 molecular weights are calculated for neutral, or not ionized,
 147 nucleic acids. The returned weight is the sum of the
 148 base-sugar-phosphate residues of the chain plus one weight of water to
 149 to account for the additional OH on the phosphate of the 5' residue
 150 and the additional H on the sugar ring of the 3' residue.  Note that
 151 this leads to a difference of 18 in calculated molecular weights
 152 compared to some other available programs (e.g. Informax VectorNTI).
 153
 154 Note that since sequences may contain ambiguous monomers (e.g. "M",
 155 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
 156 returns a two-element array containing the greatest lower bound and
 157 least upper bound of the molecule. For a sequence with no ambiguous
 158 monomers, the two elements of the returned array will be equal. The
 159 method count_codons() handles ambiguous bases by simply counting all
 160 ambiguous codons together and issuing a warning to that effect.
 161
 162
 163 =head1 DEVELOPERS NOTES
 164
 165 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
 166
 167 Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
 168 molecular weights to have the output match values in SWISS-PROT.
 169
 170 Torsten added hydropathicity calculation.
 171
 172 =head1 FEEDBACK
 173
 174 =head2 Mailing Lists
 175
 176 User feedback is an integral part of the evolution of this and other
 177 Bioperl modules. Send your comments and suggestions preferably to one
 178 of the Bioperl mailing lists.  Your participation is much appreciated.
 179
 180   bioperl-l@bioperl.org                  - General discussion
 181   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 182
 183 =head2 Support
 184
 185 Please direct usage questions or support issues to the mailing list:
 186
 187 I<bioperl-l@bioperl.org>
 188
 189 rather than to the module maintainer directly. Many experienced and
 190 reponsive experts will be able look at the problem and quickly
 191 address it. Please include a thorough description of the problem
 192 with code and data examples if at all possible.
 193
 194 =head2 Reporting Bugs
 195
 196 Report bugs to the Bioperl bug tracking system to help us keep track
 197 the bugs and their resolution.  Bug reports can be submitted the web:
 198
 199   https://github.com/bioperl/bioperl-live/issues
 200
 201 =head1 AUTHOR - Peter Schattner
 202
 203 Email schattner AT alum.mit.edu
 204
 205 =head1 CONTRIBUTOR - Torsten Seemann
 206
 207 Email torsten.seemann AT infotech.monash.edu.au
 208
 209 =head1 APPENDIX
 210
 211 The rest of the documentation details each of the object
 212 methods. Internal methods are usually preceded with a _
 213
 214 =cut
 215
 216
 217 package Bio::Tools::SeqStats;
 218
 219 use strict;
 220 use vars qw(%Alphabets %Alphabets_strict $amino_weights
 221             $rna_weights $dna_weights %Weights $amino_hydropathicity);
 222 use Bio::Seq;
 223 use base qw(Bio::Root::Root);
 224
 225 BEGIN {
 226         %Alphabets =   (
 227                          'dna'     => [ qw(A C G T R Y M K S W H B V D X N) ],
 228                     'rna'     => [ qw(A C G U R Y M K S W H B V D X N) ],
 229                     'protein' => [ qw(A R N D C Q E G H I L K M F U
 230                                                                          P S T W X Y V B Z J O *) ], # sac: added B, Z
 231                                                 );
 232
 233 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
 234     %Alphabets_strict = (
 235                          'dna'     => [ qw( A C G T ) ],
 236                          'rna'     => [ qw( A C G U ) ],
 237                          'protein'    => [ qw(A R N D C Q E G H I L K M F U
 238                                               P S T W Y V O) ],
 239                          );
 240
 241
 242 #  IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
 243 #   Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
 244
 245 #  Amino Acid alphabet
 246
 247 # ------------------------------------------
 248 # Symbol           Meaning
 249 # ------------------------------------------
 250
 251     my $amino_A_wt = 89.09;
 252     my $amino_C_wt = 121.15;
 253     my $amino_D_wt = 133.1;
 254     my $amino_E_wt = 147.13;
 255     my $amino_F_wt = 165.19;
 256     my $amino_G_wt = 75.07;
 257     my $amino_H_wt = 155.16;
 258     my $amino_I_wt = 131.17;
 259     my $amino_K_wt = 146.19;
 260     my $amino_L_wt = 131.17;
 261     my $amino_M_wt = 149.21;
 262     my $amino_N_wt = 132.12;
 263     my $amino_O_wt = 255.31;
 264     my $amino_P_wt = 115.13;
 265     my $amino_Q_wt = 146.15;
 266     my $amino_R_wt = 174.20;
 267     my $amino_S_wt = 105.09;
 268     my $amino_T_wt = 119.12;
 269     my $amino_U_wt = 168.06;
 270     my $amino_V_wt = 117.15;
 271     my $amino_W_wt = 204.23;
 272     my $amino_Y_wt = 181.19;
 273
 274
 275     $amino_weights = {
 276         'A'     => [$amino_A_wt, $amino_A_wt], # Alanine
 277         'B'     => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
 278         'C'     => [$amino_C_wt, $amino_C_wt], # Cysteine
 279         'D'     => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
 280         'E'     => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
 281         'F'     => [$amino_F_wt, $amino_F_wt], # Phenylalanine
 282         'G'     => [$amino_G_wt, $amino_G_wt], # Glycine
 283         'H'     => [$amino_H_wt, $amino_H_wt], # Histidine
 284         'I'     => [$amino_I_wt, $amino_I_wt], # Isoleucine
 285         'J'     => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
 286         'K'     => [$amino_K_wt, $amino_K_wt], # Lysine
 287         'L'     => [$amino_L_wt, $amino_L_wt], # Leucine
 288         'M'     => [$amino_M_wt, $amino_M_wt], # Methionine
 289         'N'     => [$amino_N_wt, $amino_N_wt], # Asparagine
 290         'O'     => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
 291         'P'     => [$amino_P_wt, $amino_P_wt], # Proline
 292         'Q'     => [$amino_Q_wt, $amino_Q_wt], # Glutamine
 293         'R'     => [$amino_R_wt, $amino_R_wt], # Arginine
 294         'S'     => [$amino_S_wt, $amino_S_wt], # Serine
 295         'T'     => [$amino_T_wt, $amino_T_wt], # Threonine
 296         'U'     => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
 297         'V'     => [$amino_V_wt, $amino_V_wt], # Valine
 298         'W'     => [$amino_W_wt, $amino_W_wt], # Tryptophan
 299         'X'     => [$amino_G_wt, $amino_W_wt], # Unknown
 300         'Y'     => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
 301         'Z'     => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
 302     };
 303
 304     # Extended Dna / Rna alphabet
 305     use vars ( qw($C $O $N $H $P $water) );
 306     use vars ( qw($adenine   $guanine   $cytosine   $thymine   $uracil));
 307     use vars ( qw($ribose_phosphate   $deoxyribose_phosphate   $ppi));
 308     use vars ( qw($dna_A_wt   $dna_C_wt   $dna_G_wt  $dna_T_wt
 309                   $rna_A_wt   $rna_C_wt   $rna_G_wt   $rna_U_wt));
 310     use vars ( qw($dna_weights   $rna_weights   %Weights));
 311
 312     $C = 12.01;
 313     $O = 16.00;
 314     $N = 14.01;
 315     $H = 1.01;
 316     $P = 30.97;
 317     $water = 18.015;
 318
 319     $adenine = 5 * $C + 5 * $N + 5 * $H;
 320     $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
 321     $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
 322     $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
 323     $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
 324
 325     $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
 326     # neutral (unionized) form
 327     $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
 328
 329     # the following are single strand molecular weights / base
 330     $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
 331     $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
 332     $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
 333     $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
 334
 335     $rna_A_wt = $adenine + $ribose_phosphate - $water;
 336     $rna_C_wt = $cytosine + $ribose_phosphate - $water;
 337     $rna_G_wt = $guanine + $ribose_phosphate - $water;
 338     $rna_U_wt = $uracil + $ribose_phosphate - $water;
 339
 340     $dna_weights = {
 341         'A'             => [$dna_A_wt,$dna_A_wt],            # Adenine
 342         'C'             => [$dna_C_wt,$dna_C_wt],            # Cytosine
 343         'G'             => [$dna_G_wt,$dna_G_wt],            # Guanine
 344         'T'             => [$dna_T_wt,$dna_T_wt],            # Thymine
 345         'M'             => [$dna_C_wt,$dna_A_wt],            # A or C
 346         'R'             => [$dna_A_wt,$dna_G_wt],            # A or G
 347         'W'             => [$dna_T_wt,$dna_A_wt],            # A or T
 348         'S'             => [$dna_C_wt,$dna_G_wt],            # C or G
 349         'Y'             => [$dna_C_wt,$dna_T_wt],            # C or T
 350         'K'             => [$dna_T_wt,$dna_G_wt],            # G or T
 351         'V'             => [$dna_C_wt,$dna_G_wt],            # A or C or G
 352         'H'             => [$dna_C_wt,$dna_A_wt],            # A or C or T
 353         'D'             => [$dna_T_wt,$dna_G_wt],            # A or G or T
 354         'B'             => [$dna_C_wt,$dna_G_wt],            # C or G or T
 355         'X'             => [$dna_C_wt,$dna_G_wt],            # G or A or T or C
 356         'N'             => [$dna_C_wt,$dna_G_wt],            # G or A or T or C
 357     };
 358
 359     $rna_weights =  {
 360         'A'             => [$rna_A_wt,$rna_A_wt],            # Adenine
 361         'C'             => [$rna_C_wt,$rna_C_wt],            # Cytosine
 362         'G'             => [$rna_G_wt,$rna_G_wt],            # Guanine
 363         'U'             => [$rna_U_wt,$rna_U_wt],            # Uracil
 364         'M'             => [$rna_C_wt,$rna_A_wt],            # A or C
 365         'R'             => [$rna_A_wt,$rna_G_wt],            # A or G
 366         'W'             => [$rna_U_wt,$rna_A_wt],            # A or U
 367         'S'             => [$rna_C_wt,$rna_G_wt],            # C or G
 368         'Y'             => [$rna_C_wt,$rna_U_wt],            # C or U
 369         'K'             => [$rna_U_wt,$rna_G_wt],            # G or U
 370         'V'             => [$rna_C_wt,$rna_G_wt],            # A or C or G
 371         'H'             => [$rna_C_wt,$rna_A_wt],            # A or C or U
 372         'D'             => [$rna_U_wt,$rna_G_wt],            # A or G or U
 373         'B'             => [$rna_C_wt,$rna_G_wt],            # C or G or U
 374         'X'             => [$rna_C_wt,$rna_G_wt],            # G or A or U or C
 375         'N'             => [$rna_C_wt,$rna_G_wt],            # G or A or U or C
 376     };
 377
 378     %Weights =   (
 379                   'dna'     =>  $dna_weights,
 380                   'rna'     =>  $rna_weights,
 381                   'protein' =>  $amino_weights,
 382                   );
 383
 384         # Amino acid scale: Hydropathicity.
 385         # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
 386         # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
 387
 388         $amino_hydropathicity = {
 389     A =>  1.800,
 390     R => -4.500,
 391     N => -3.500,
 392     D => -3.500,
 393     C =>  2.500,
 394     Q => -3.500,
 395     E => -3.500,
 396     G => -0.400,
 397     H => -3.200,
 398     I =>  4.500,
 399     L =>  3.800,
 400     K => -3.900,
 401     M =>  1.900,
 402     F =>  2.800,
 403     P => -1.600,
 404     S => -0.800,
 405     T => -0.700,
 406     W => -0.900,
 407     Y => -1.300,
 408     V =>  4.200,
 409         };
 410
 411 }
 412
 413 sub new {
 414         my($class,@args) = @_;
 415         my $self = $class->SUPER::new(@args);
 416
 417         my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
 418         unless  ($seqobj->isa("Bio::PrimarySeqI")) {
 419                 $self->throw("SeqStats works only on PrimarySeqI objects");
 420         }
 421         if ( !defined $seqobj->alphabet ||
 422                   !defined $Alphabets{$seqobj->alphabet}) {
 423                 $self->throw("Must have a valid alphabet defined for seq (".
 424                                                  join(",",keys %Alphabets));
 425         }
 426         $self->{'_seqref'} = $seqobj;
 427         # check the letters in the sequence
 428         $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
 429         return $self;
 430 }
 431
 432 =head2 count_monomers
 433
 434  Title   : count_monomers
 435  Usage   : $rcount = $seq_stats->count_monomers();
 436            or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
 437  Function: Counts the number of each type of monomer (amino acid or
 438                 base) in the sequence.
 439            Ts are counted as Us in RNA sequences.
 440  Example :
 441  Returns : Reference to a hash in which keys are letters of the
 442            genetic alphabet used and values are number of occurrences
 443            of the letter in the sequence.
 444  Args    : None or reference to sequence object
 445  Throws  : Throws an exception if type of sequence is unknown (ie amino
 446            or nucleic)or if unknown letter in alphabet. Ambiguous
 447            elements are allowed.
 448
 449 =cut
 450
 451 sub count_monomers{
 452         my %count  = ();
 453         my $seqobj;
 454         my $_is_strict;
 455         my $element = '';
 456         my $_is_instance = 1 ;
 457         my $self = shift @_;
 458         my $object_argument = shift @_;
 459
 460         # First we need to determine if the present object is an instance
 461         # object or if the sequence object has been passed as an argument
 462
 463         if (defined $object_argument) {
 464                 $_is_instance = 0;
 465         }
 466
 467         # If we are using an instance object...
 468         if ($_is_instance) {
 469                 if ($self->{'_monomer_count'}) {
 470                         return $self->{'_monomer_count'}; # return count if previously calculated
 471                 }
 472                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 473                 $seqobj =  $self->{'_seqref'};
 474         } else {
 475                 #  otherwise...
 476                 $seqobj =  $object_argument;
 477
 478                 #  Following two lines lead to error in "throw" routine
 479                 $seqobj->isa("Bio::PrimarySeqI") ||
 480                   $self->throw("SeqStats works only on PrimarySeqI objects");
 481                 # is alphabet OK? Is it strict?
 482                 $_is_strict =  _is_alphabet_strict($seqobj);
 483         }
 484
 485         my $alphabet =  $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
 486           $Alphabets{$seqobj->alphabet}  ; # get array of allowed letters
 487
 488         # convert everything to upper case to be safe
 489         my $seqstring = uc $seqobj->seq();
 490
 491         # Since T is used in RichSeq RNA sequences, do conversion locally
 492         $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
 493
 494         #  For each letter, count the number of times it appears in
 495         #  the sequence
 496  LETTER:
 497         foreach $element (@$alphabet) {
 498                 # skip terminator symbol which may confuse regex
 499                 next LETTER if $element eq '*';
 500                 $count{$element} = ( $seqstring =~ s/$element/$element/g);
 501         }
 502
 503         if ($_is_instance) {
 504                 $self->{'_monomer_count'} = \%count;  # Save in case called again later
 505         }
 506
 507         return \%count;
 508 }
 509
 510 =head2  get_mol_wt
 511
 512  Title   : get_mol_wt
 513  Usage   : $wt = $seqobj->get_mol_wt() or
 514            $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
 515  Function: Calculate molecular weight of sequence
 516            Ts are counted as Us in RNA sequences.
 517  Example :
 518
 519  Returns : Reference to two element array containing lower and upper
 520            bounds of molecule molecular weight. For DNA and RNA
 521            sequences single-stranded weights are returned. If
 522            sequence contains no ambiguous elements, both entries in
 523            array are equal to molecular weight of molecule.
 524  Args    : None or reference to sequence object
 525  Throws  : Exception if type of sequence is unknown (ie not amino or
 526            nucleic) or if unknown letter in alphabet. Ambiguous
 527            elements are allowed.
 528
 529 =cut
 530
 531 sub get_mol_wt {
 532         my $seqobj;
 533         my $_is_strict;
 534         my $element = '';
 535         my $_is_instance = 1 ;
 536         my $self = shift @_;
 537         my $object_argument = shift @_;
 538         my ($weight_array, $rcount);
 539
 540         if (defined $object_argument) {
 541                 $_is_instance = 0;
 542         }
 543
 544         if ($_is_instance) {
 545                 if ($weight_array = $self->{'_mol_wt'}) {
 546                         # return mol. weight if previously calculated
 547                         return $weight_array;
 548                 }
 549                 $seqobj =  $self->{'_seqref'};
 550                 $rcount = $self->count_monomers();
 551         } else {
 552                 $seqobj =  $object_argument;
 553                 $seqobj->isa("Bio::PrimarySeqI") ||
 554                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 555                 $_is_strict =  _is_alphabet_strict($seqobj); # is alphabet OK?
 556                 $rcount =  $self->count_monomers($seqobj);
 557         }
 558
 559         # We will also need to know what type of monomer we are dealing with
 560         my $moltype = $seqobj->alphabet();
 561
 562         # In general,the molecular weight is bounded below by the sum of the
 563         # weights of lower bounds of each alphabet symbol times the number of
 564         # occurrences of the symbol in the sequence. A similar upper bound on
 565         # the weight is also calculated.
 566
 567         # Note that for "strict" (i.e. unambiguous) sequences there is an
 568         # inefficiency since the upper bound = the lower bound and there are
 569         # two calculations.  However, this decrease in performance will be
 570         # minor and leads to significantly more readable code.
 571
 572         my $weight_lower_bound = 0;
 573         my $weight_upper_bound = 0;
 574         my $weight_table =  $Weights{$moltype};
 575     my $total_res;
 576
 577         # compute weight of all the residues
 578         foreach $element (keys %$rcount) {
 579                 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
 580                 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
 581
 582         # this tracks only the residues used for counting MW
 583         $total_res += $$rcount{$element};
 584         }
 585         if ($moltype =~ /protein/) {
 586         # remove H2O during peptide bond formation.
 587         $weight_lower_bound -= $water * ($total_res - 1);
 588         $weight_upper_bound -= $water * ($total_res - 1);
 589         } else {
 590         # Correction because phosphate of 5' residue has additional OH and
 591         # sugar ring of 3' residue has additional H
 592         $weight_lower_bound += $water;
 593         $weight_upper_bound += $water;
 594         }
 595
 596         $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
 597         $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
 598
 599         $weight_array = [$weight_lower_bound, $weight_upper_bound];
 600
 601         if ($_is_instance) {
 602                 $self->{'_mol_wt'} = $weight_array;  # Save in case called again later
 603         }
 604         return $weight_array;
 605 }
 606
 607
 608 =head2  count_codons
 609
 610  Title   : count_codons
 611  Usage   : $rcount = $seqstats->count_codons() or
 612            $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
 613  Function: Counts the number of each type of codons for a dna or rna
 614            sequence, starting at the 1st triple of the input sequence.
 615  Example :
 616  Returns : Reference to a hash in which keys are codons of the genetic
 617            alphabet used and values are number of occurrences of the
 618            codons in the sequence. All codons with "ambiguous" bases
 619            are counted together.
 620  Args    : None or sequence object
 621  Throws  : an exception if type of sequence is unknown or protein.
 622
 623 =cut
 624
 625 sub count_codons {
 626         my $rcount = {};
 627         my $codon ;
 628         my $seqobj;
 629         my $_is_strict;
 630         my $element = '';
 631         my $_is_instance = 1 ;
 632         my $self = shift @_;
 633         my $object_argument = shift @_;
 634
 635         if (defined $object_argument) {
 636                 $_is_instance = 0;
 637         }
 638
 639         if ($_is_instance) {
 640                 if ($rcount = $self->{'_codon_count'}) {
 641                         return $rcount;        # return count if previously calculated
 642                 }
 643                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 644                 $seqobj =  $self->{'_seqref'};
 645         } else {
 646                 $seqobj =  $object_argument;
 647                 $seqobj->isa("Bio::PrimarySeqI") ||
 648                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 649                 $_is_strict =  _is_alphabet_strict($seqobj);
 650         }
 651
 652         # Codon counts only make sense for nucleic acid sequences
 653         my $alphabet = $seqobj->alphabet();
 654
 655         unless ($alphabet =~ /[dr]na/i) {
 656                 $seqobj->throw("Codon counts only meaningful for dna or rna, ".
 657                                                         "not for $alphabet sequences.");
 658         }
 659
 660         # If sequence contains ambiguous bases, warn that codons
 661         # containing them will all be lumped together in the count.
 662
 663         if (!$_is_strict ) {
 664                 $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
 665                 " All codons with ambiguous bases will be added together in count.")
 666                     if $self->verbose >= 0 ;
 667         }
 668
 669         my $seq = $seqobj->seq();
 670
 671         # Now step through the string by threes and count the codons
 672
 673  CODON:
 674         while (length($seq) > 2) {
 675                 $codon = uc substr($seq,0,3);
 676                 $seq = substr($seq,3);
 677                 if ($codon =~ /[^ACTGU]/i) {
 678                         $$rcount{'ambiguous'}++; #lump together ambiguous codons
 679                         next CODON;
 680                 }
 681                 if (!defined $$rcount{$codon}) {
 682                         $$rcount{$codon}= 1 ;
 683                         next CODON;
 684                 }
 685                 $$rcount{$codon}++;  # default
 686         }
 687
 688         if ($_is_instance) {
 689                 $self->{'_codon_count'} = $rcount;  # Save in case called again later
 690         }
 691
 692         return $rcount;
 693 }
 694
 695
 696 =head2  hydropathicity
 697
 698  Title   : hydropathicity
 699  Usage   : $gravy = $seqstats->hydropathicity(); or
 700            $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
 701
 702  Function: Calculates the mean Kyte-Doolittle hydropathicity for a
 703            protein sequence. Also known as the "gravy" score. Refer to
 704            Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
 705  Example :
 706  Returns : float
 707  Args    : None or reference to sequence object
 708
 709  Throws  : an exception if type of sequence is not protein.
 710
 711 =cut
 712
 713 sub hydropathicity {
 714         my $seqobj;
 715         my $_is_strict;
 716         my $element = '';
 717         my $_is_instance = 1 ;
 718         my $self = shift @_;
 719         my $object_argument = shift @_;
 720
 721         if (defined $object_argument) {
 722                 $_is_instance = 0;
 723         }
 724
 725         if ($_is_instance) {
 726                 if (my $gravy = $self->{'_hydropathicity'}) {
 727                         return $gravy;        # return value if previously calculated
 728                 }
 729                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 730                 $seqobj =  $self->{'_seqref'};
 731         } else {
 732                 $seqobj =  $object_argument;
 733                 $seqobj->isa("Bio::PrimarySeqI") ||
 734                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 735                 $_is_strict =  _is_alphabet_strict($seqobj);
 736         }
 737
 738         # hydropathicity not menaingful for empty sequences
 739         unless ($seqobj->length() > 0) {
 740           $seqobj->throw("hydropathicity not defined for zero-length sequences");
 741         }
 742
 743         # hydropathicity only make sense for protein sequences
 744         my $alphabet = $seqobj->alphabet();
 745
 746         unless ($alphabet =~ /protein/i) {
 747                 $seqobj->throw("hydropathicity only meaningful for protein, ".
 748                                                         "not for $alphabet sequences.");
 749         }
 750
 751         # If sequence contains ambiguous bases, warn that codons
 752         # containing them will all be lumped together in the count.
 753
 754         unless ($_is_strict ) {
 755                 $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
 756                 "Hydropathicity can not be caculated.")
 757         }
 758
 759         my $seq = $seqobj->seq();
 760
 761         # Now step through the string and add up the hydropathicity values
 762
 763     my $gravy = 0;
 764     for my $i ( 0 .. length($seq) ) {
 765        my $codon = uc(substr($seq,$i,1));
 766        $gravy += $amino_hydropathicity->{$codon}||0; # table look-up
 767     }
 768     $gravy /= length($seq);
 769
 770
 771         if ($_is_instance) {
 772                 $self->{'_hydropathicity'} = $gravy;  # Save in case called again later
 773         }
 774
 775         return $gravy;
 776 }
 777
 778
 779 =head2  _is_alphabet_strict
 780
 781  Title   :  _is_alphabet_strict
 782  Usage   :
 783  Function: internal function to determine whether there are
 784            any ambiguous elements in the current sequence
 785  Example :
 786  Returns : 1 if strict alphabet is being used,
 787            0 if ambiguous elements are present
 788  Args    :
 789
 790  Throws  : an exception if type of sequence is unknown (ie amino or
 791            nucleic) or if unknown letter in alphabet. Ambiguous
 792            monomers are allowed.
 793
 794 =cut
 795
 796 sub _is_alphabet_strict {
 797
 798         my ($seqobj) = @_;
 799         my $moltype = $seqobj->alphabet();
 800
 801         # convert everything to upper case to be safe
 802         my $seqstring = uc $seqobj->seq();
 803
 804         # Since T is used in RichSeq RNA sequences, do conversion locally
 805         $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
 806
 807         # First we check if only the 'strict' letters are present in the
 808         # sequence string If not, we check whether the remaining letters
 809         # are ambiguous monomers or whether there are illegal letters in
 810         # the string
 811
 812         # $alpha_array is a ref to an array of the 'strictly' allowed letters
 813         my $alpha_array =   $Alphabets_strict{$moltype} ;
 814
 815         # $alphabet contains the allowed letters in string form
 816         my $alphabet = join ('', @$alpha_array) ;
 817         unless ($seqstring =~ /[^$alphabet]/)  {
 818                 return 1 ;
 819         }
 820
 821         # Next try to match with the alphabet's ambiguous letters
 822         $alpha_array =   $Alphabets{$moltype} ;
 823         $alphabet = join ('', @$alpha_array) ;
 824
 825         unless ($seqstring =~ /[^$alphabet]/)  {
 826                 return 0 ;
 827         }
 828
 829         # If we got here there is an illegal letter in the sequence
 830         $seqobj->throw("Alphabet not OK for $seqobj");
 831 }
 832
 833 =head2   _print_data
 834
 835  Title   : _print_data
 836  Usage   : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
 837  Function: Displays dna / rna parameters (used for debugging)
 838  Returns : 1
 839  Args    : None
 840
 841 Used for debugging.
 842
 843 =cut
 844
 845 sub _print_data {
 846
 847     print "\n adenine  = :  $adenine \n";
 848     print "\n guanine  = :  $guanine \n";
 849     print "\n cytosine = :  $cytosine \n";
 850     print "\n thymine  = :  $thymine \n";
 851     print "\n uracil   = :  $uracil \n";
 852
 853     print "\n dna_A_wt = :  $dna_A_wt \n";
 854     print "\n dna_C_wt = :  $dna_C_wt \n";
 855     print "\n dna_G_wt = :  $dna_G_wt \n";
 856     print "\n dna_T_wt = :  $dna_T_wt \n";
 857
 858     print "\n rna_A_wt = :  $rna_A_wt \n";
 859     print "\n rna_C_wt = :  $rna_C_wt \n";
 860     print "\n rna_G_wt = :  $rna_G_wt \n";
 861     print "\n rna_U_wt = :  $rna_U_wt \n";
 862
 863     return 1;
 864 }
 865
 866 1;