branch-1-6/Bio/Tools/SeqStats.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::Tools::SeqStats
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by
   8 #
   9 # Copyright Peter Schattner
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::Tools::SeqStats - Object holding statistics for one
  18 particular sequence
  19
  20 =head1 SYNOPSIS
  21
  22   # build a primary nucleic acid or protein sequence object somehow
  23   # then build a statistics object from the sequence object
  24
  25   $seqobj = Bio::PrimarySeq->new(-seq      => 'ACTGTGGCGTCAACTG',
  26                                  -alphabet => 'dna',
  27                                  -id       => 'test');
  28   $seq_stats  =  Bio::Tools::SeqStats->new(-seq => $seqobj);
  29
  30   # obtain a hash of counts of each type of monomer
  31   # (i.e. amino or nucleic acid)
  32   print "\nMonomer counts using statistics object\n";
  33   $seq_stats  =  Bio::Tools::SeqStats->new(-seq=>$seqobj);
  34   $hash_ref = $seq_stats->count_monomers();  # e.g. for DNA sequence
  35   foreach $base (sort keys %$hash_ref) {
  36       print "Number of bases of type ", $base, "= ",
  37          %$hash_ref->{$base},"\n";
  38   }
  39
  40   # obtain the count directly without creating a new statistics object
  41   print "\nMonomer counts without statistics object\n";
  42   $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
  43   foreach $base (sort keys %$hash_ref) {
  44       print "Number of bases of type ", $base, "= ",
  45          %$hash_ref->{$base},"\n";
  46   }
  47
  48
  49   # obtain hash of counts of each type of codon in a nucleic acid sequence
  50   print "\nCodon counts using statistics object\n";
  51   $hash_ref = $seq_stats-> count_codons();  # for nucleic acid sequence
  52   foreach $base (sort keys %$hash_ref) {
  53       print "Number of codons of type ", $base, "= ",
  54          %$hash_ref->{$base},"\n";
  55   }
  56
  57   #  or
  58   print "\nCodon counts without statistics object\n";
  59   $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
  60   foreach $base (sort keys %$hash_ref) {
  61       print "Number of codons of type ", $base, "= ",
  62          %$hash_ref->{$base},"\n";
  63   }
  64
  65   # Obtain the molecular weight of a sequence. Since the sequence
  66   # may contain ambiguous monomers, the molecular weight is returned
  67   # as a (reference to) a two element array containing greatest lower
  68   # bound (GLB) and least upper bound (LUB) of the molecular weight
  69   $weight = $seq_stats->get_mol_wt();
  70   print "\nMolecular weight (using statistics object) of sequence ",
  71           $seqobj->id(), " is between ", $$weight[0], " and " ,
  72           $$weight[1], "\n";
  73
  74   #  or
  75   $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
  76   print "\nMolecular weight (without statistics object) of sequence ",
  77         $seqobj->id(), " is between ", $$weight[0], " and " ,
  78         $$weight[1], "\n";
  79
  80   # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
  81   my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
  82                                   -alphabet=>'protein');
  83   my $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
  84   print "might be hydropathic" if $gravy > 1;
  85
  86 =head1 DESCRIPTION
  87
  88 Bio::Tools::SeqStats is a lightweight object for the calculation of
  89 simple statistical and numerical properties of a sequence. By
  90 "lightweight" I mean that only "primary" sequences are handled by the
  91 object.  The calling script needs to create the appropriate primary
  92 sequence to be passed to SeqStats if statistics on a sequence feature
  93 are required.  Similarly if a codon count is desired for a
  94 frame-shifted sequence and/or a negative strand sequence, the calling
  95 script needs to create that sequence and pass it to the SeqStats
  96 object.
  97
  98 Nota that nucleotide sequences in bioperl do not strictly separate RNA
  99 and DNA sequences. By convention, sequences from RNA molecules are
 100 shown as is they were DNA. Objects are supposed to make the
 101 distinction when needed. This class is one of the few where this
 102 distinctions needs to be made. Internally, it changes all Ts into Us
 103 before weight and monomer count.
 104
 105 SeqStats can be called in two distinct manners.  If only a single
 106 computation is required on a given sequence object, the method can be
 107 called easily using the SeqStats object directly:
 108
 109   $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
 110
 111 Alternately, if several computations will be required on a given
 112 sequence object, an "instance" statistics object can be constructed
 113 and used for the method calls:
 114
 115   $seq_stats = Bio::Tools::SeqStats->new($seqobj);
 116   $monomers = $seq_stats->count_monomers();
 117   $codons = $seq_stats->count_codons();
 118   $weight = $seq_stats->get_mol_wt();
 119   $gravy = $seq_stats->hydropathicity();
 120
 121 As currently implemented the object can return the following values
 122 from a sequence:
 123
 124 =over
 125
 126 =item *
 127
 128 The molecular weight of the sequence: get_mol_wt()
 129
 130 =item *
 131
 132 The number of each type of monomer present: count_monomers()
 133
 134 =item *
 135
 136 The number of each codon present in a nucleic acid sequence:
 137 count_codons()
 138
 139 =item *
 140
 141 The mean hydropathicity ("gravy" score) of a protein:
 142 hydropathicity()
 143
 144 =back
 145
 146 For DNA and RNA sequences single-stranded weights are returned. The
 147 molecular weights are calculated for neutral, or not ionized,
 148 nucleic acids. The returned weight is the sum of the
 149 base-sugar-phosphate residues of the chain plus one weight of water to
 150 to account for the additional OH on the phosphate of the 5' residue
 151 and the additional H on the sugar ring of the 3' residue.  Note that
 152 this leads to a difference of 18 in calculated molecular weights
 153 compared to some other available programs (e.g. Informax VectorNTI).
 154
 155 Note that since sequences may contain ambiguous monomers (e.g. "M",
 156 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
 157 returns a two-element array containing the greatest lower bound and
 158 least upper bound of the molecule. For a sequence with no ambiguous
 159 monomers, the two elements of the returned array will be equal. The
 160 method count_codons() handles ambiguous bases by simply counting all
 161 ambiguous codons together and issuing a warning to that effect.
 162
 163
 164 =head1 DEVELOPERS NOTES
 165
 166 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
 167
 168 Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
 169 molecular weights to have the output match values in SWISS-PROT.
 170
 171 Torsten added hydropathicity calculation.
 172
 173 =head1 FEEDBACK
 174
 175 =head2 Mailing Lists
 176
 177 User feedback is an integral part of the evolution of this and other
 178 Bioperl modules. Send your comments and suggestions preferably to one
 179 of the Bioperl mailing lists.  Your participation is much appreciated.
 180
 181   bioperl-l@bioperl.org                  - General discussion
 182   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 183
 184 =head2 Support
 185
 186 Please direct usage questions or support issues to the mailing list:
 187
 188 I<bioperl-l@bioperl.org>
 189
 190 rather than to the module maintainer directly. Many experienced and
 191 reponsive experts will be able look at the problem and quickly
 192 address it. Please include a thorough description of the problem
 193 with code and data examples if at all possible.
 194
 195 =head2 Reporting Bugs
 196
 197 Report bugs to the Bioperl bug tracking system to help us keep track
 198 the bugs and their resolution.  Bug reports can be submitted the web:
 199
 200   http://bugzilla.open-bio.org/
 201
 202 =head1 AUTHOR - Peter Schattner
 203
 204 Email schattner AT alum.mit.edu
 205
 206 =head1 CONTRIBUTOR - Torsten Seemann
 207
 208 Email torsten.seemann AT infotech.monash.edu.au
 209
 210 =head1 APPENDIX
 211
 212 The rest of the documentation details each of the object
 213 methods. Internal methods are usually preceded with a _
 214
 215 =cut
 216
 217
 218 package Bio::Tools::SeqStats;
 219 use strict;
 220 use vars qw(%Alphabets %Alphabets_strict $amino_weights
 221             $rna_weights $dna_weights %Weights $amino_hydropathicity);
 222 use Bio::Seq;
 223 use base qw(Bio::Root::Root);
 224
 225 BEGIN {
 226         %Alphabets =   (
 227                          'dna'     => [ qw(A C G T R Y M K S W H B V D X N) ],
 228                     'rna'     => [ qw(A C G U R Y M K S W H B V D X N) ],
 229                     'protein' => [ qw(A R N D C Q E G H I L K M F U
 230                                                                          P S T W X Y V B Z J O *) ], # sac: added B, Z
 231                                                 );
 232
 233 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
 234     %Alphabets_strict = (
 235                          'dna'     => [ qw( A C G T ) ],
 236                          'rna'     => [ qw( A C G U ) ],
 237                          'protein'    => [ qw(A R N D C Q E G H I L K M F U
 238                                               P S T W Y V O) ],
 239                          );
 240
 241
 242 #  IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
 243 #   Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
 244
 245 #  Amino Acid alphabet
 246
 247 # ------------------------------------------
 248 # Symbol           Meaning
 249 # ------------------------------------------
 250
 251     my $amino_A_wt = 89.09;
 252     my $amino_C_wt = 121.15;
 253     my $amino_D_wt = 133.1;
 254     my $amino_E_wt = 147.13;
 255     my $amino_F_wt = 165.19;
 256     my $amino_G_wt = 75.07;
 257     my $amino_H_wt = 155.16;
 258     my $amino_I_wt = 131.17;
 259     my $amino_K_wt = 146.19;
 260     my $amino_L_wt = 131.17;
 261     my $amino_M_wt = 149.21;
 262     my $amino_N_wt = 132.12;
 263     my $amino_O_wt = 255.31;
 264     my $amino_P_wt = 115.13;
 265     my $amino_Q_wt = 146.15;
 266     my $amino_R_wt = 174.20;
 267     my $amino_S_wt = 105.09;
 268     my $amino_T_wt = 119.12;
 269     my $amino_U_wt = 168.06;
 270     my $amino_V_wt = 117.15;
 271     my $amino_W_wt = 204.23;
 272     my $amino_Y_wt = 181.19;
 273
 274
 275     $amino_weights = {
 276         'A'     => [$amino_A_wt, $amino_A_wt], # Alanine
 277         'B'     => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
 278         'C'     => [$amino_C_wt, $amino_C_wt], # Cysteine
 279         'D'     => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
 280         'E'     => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
 281         'F'     => [$amino_F_wt, $amino_F_wt], # Phenylalanine
 282         'G'     => [$amino_G_wt, $amino_G_wt], # Glycine
 283         'H'     => [$amino_H_wt, $amino_H_wt], # Histidine
 284         'I'     => [$amino_I_wt, $amino_I_wt], # Isoleucine
 285         'J'     => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
 286         'K'     => [$amino_K_wt, $amino_K_wt], # Lysine
 287         'L'     => [$amino_L_wt, $amino_L_wt], # Leucine
 288         'M'     => [$amino_M_wt, $amino_M_wt], # Methionine
 289         'N'     => [$amino_N_wt, $amino_N_wt], # Asparagine
 290         'O'     => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
 291         'P'     => [$amino_P_wt, $amino_P_wt], # Proline
 292         'Q'     => [$amino_Q_wt, $amino_Q_wt], # Glutamine
 293         'R'     => [$amino_R_wt, $amino_R_wt], # Arginine
 294         'S'     => [$amino_S_wt, $amino_S_wt], # Serine
 295         'T'     => [$amino_T_wt, $amino_T_wt], # Threonine
 296         'U'     => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
 297         'V'     => [$amino_V_wt, $amino_V_wt], # Valine
 298         'W'     => [$amino_W_wt, $amino_W_wt], # Tryptophan
 299         'X'     => [$amino_G_wt, $amino_W_wt], # Unknown
 300         'Y'     => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
 301         'Z'     => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
 302     };
 303
 304     # Extended Dna / Rna alphabet
 305     use vars ( qw($C $O $N $H $P $water) );
 306     use vars ( qw($adenine   $guanine   $cytosine   $thymine   $uracil));
 307     use vars ( qw($ribose_phosphate   $deoxyribose_phosphate   $ppi));
 308     use vars ( qw($dna_A_wt   $dna_C_wt   $dna_G_wt  $dna_T_wt
 309                   $rna_A_wt   $rna_C_wt   $rna_G_wt   $rna_U_wt));
 310     use vars ( qw($dna_weights   $rna_weights   %Weights));
 311
 312     $C = 12.01;
 313     $O = 16.00;
 314     $N = 14.01;
 315     $H = 1.01;
 316     $P = 30.97;
 317     $water = 18.015;
 318
 319     $adenine = 5 * $C + 5 * $N + 5 * $H;
 320     $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
 321     $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
 322     $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
 323     $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
 324
 325     $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
 326     # neutral (unionized) form
 327     $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
 328
 329     # the following are single strand molecular weights / base
 330     $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
 331     $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
 332     $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
 333     $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
 334
 335     $rna_A_wt = $adenine + $ribose_phosphate - $water;
 336     $rna_C_wt = $cytosine + $ribose_phosphate - $water;
 337     $rna_G_wt = $guanine + $ribose_phosphate - $water;
 338     $rna_U_wt = $uracil + $ribose_phosphate - $water;
 339
 340     $dna_weights = {
 341         'A'             => [$dna_A_wt,$dna_A_wt],            # Adenine
 342         'C'             => [$dna_C_wt,$dna_C_wt],            # Cytosine
 343         'G'             => [$dna_G_wt,$dna_G_wt],            # Guanine
 344         'T'             => [$dna_T_wt,$dna_T_wt],            # Thymine
 345         'M'             => [$dna_C_wt,$dna_A_wt],            # A or C
 346         'R'             => [$dna_A_wt,$dna_G_wt],            # A or G
 347         'W'             => [$dna_T_wt,$dna_A_wt],            # A or T
 348         'S'             => [$dna_C_wt,$dna_G_wt],            # C or G
 349         'Y'             => [$dna_C_wt,$dna_T_wt],            # C or T
 350         'K'             => [$dna_T_wt,$dna_G_wt],            # G or T
 351         'V'             => [$dna_C_wt,$dna_G_wt],            # A or C or G
 352         'H'             => [$dna_C_wt,$dna_A_wt],            # A or C or T
 353         'D'             => [$dna_T_wt,$dna_G_wt],            # A or G or T
 354         'B'             => [$dna_C_wt,$dna_G_wt],            # C or G or T
 355         'X'             => [$dna_C_wt,$dna_G_wt],            # G or A or T or C
 356         'N'             => [$dna_C_wt,$dna_G_wt],            # G or A or T or C
 357     };
 358
 359     $rna_weights =  {
 360         'A'             => [$rna_A_wt,$rna_A_wt],            # Adenine
 361         'C'             => [$rna_C_wt,$rna_C_wt],            # Cytosine
 362         'G'             => [$rna_G_wt,$rna_G_wt],            # Guanine
 363         'U'             => [$rna_U_wt,$rna_U_wt],            # Uracil
 364         'M'             => [$rna_C_wt,$rna_A_wt],            # A or C
 365         'R'             => [$rna_A_wt,$rna_G_wt],            # A or G
 366         'W'             => [$rna_U_wt,$rna_A_wt],            # A or U
 367         'S'             => [$rna_C_wt,$rna_G_wt],            # C or G
 368         'Y'             => [$rna_C_wt,$rna_U_wt],            # C or U
 369         'K'             => [$rna_U_wt,$rna_G_wt],            # G or U
 370         'V'             => [$rna_C_wt,$rna_G_wt],            # A or C or G
 371         'H'             => [$rna_C_wt,$rna_A_wt],            # A or C or U
 372         'D'             => [$rna_U_wt,$rna_G_wt],            # A or G or U
 373         'B'             => [$rna_C_wt,$rna_G_wt],            # C or G or U
 374         'X'             => [$rna_C_wt,$rna_G_wt],            # G or A or U or C
 375         'N'             => [$rna_C_wt,$rna_G_wt],            # G or A or U or C
 376     };
 377
 378     %Weights =   (
 379                   'dna'     =>  $dna_weights,
 380                   'rna'     =>  $rna_weights,
 381                   'protein' =>  $amino_weights,
 382                   );
 383
 384         # Amino acid scale: Hydropathicity.
 385         # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
 386         # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
 387
 388         $amino_hydropathicity = {
 389     A =>  1.800,
 390     R => -4.500,
 391     N => -3.500,
 392     D => -3.500,
 393     C =>  2.500,
 394     Q => -3.500,
 395     E => -3.500,
 396     G => -0.400,
 397     H => -3.200,
 398     I =>  4.500,
 399     L =>  3.800,
 400     K => -3.900,
 401     M =>  1.900,
 402     F =>  2.800,
 403     P => -1.600,
 404     S => -0.800,
 405     T => -0.700,
 406     W => -0.900,
 407     Y => -1.300,
 408     V =>  4.200,
 409         };
 410
 411 }
 412
 413 sub new {
 414         my($class,@args) = @_;
 415         my $self = $class->SUPER::new(@args);
 416
 417         my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
 418         unless  ($seqobj->isa("Bio::PrimarySeqI")) {
 419                 $self->throw("SeqStats works only on PrimarySeqI objects");
 420         }
 421         if ( !defined $seqobj->alphabet ||
 422                   !defined $Alphabets{$seqobj->alphabet}) {
 423                 $self->throw("Must have a valid alphabet defined for seq (".
 424                                                  join(",",keys %Alphabets));
 425         }
 426         $self->{'_seqref'} = $seqobj;
 427         # check the letters in the sequence
 428         $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
 429         return $self;
 430 }
 431
 432 =head2 count_monomers
 433
 434  Title   : count_monomers
 435  Usage   : $rcount = $seq_stats->count_monomers();
 436            or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
 437  Function: Counts the number of each type of monomer (amino acid or
 438                 base) in the sequence.
 439            Ts are counted as Us in RNA sequences.
 440  Example :
 441  Returns : Reference to a hash in which keys are letters of the
 442            genetic alphabet used and values are number of occurrences
 443            of the letter in the sequence.
 444  Args    : None or reference to sequence object
 445  Throws  : Throws an exception if type of sequence is unknown (ie amino
 446            or nucleic)or if unknown letter in alphabet. Ambiguous
 447            elements are allowed.
 448
 449 =cut
 450
 451 sub count_monomers{
 452         my %count  = ();
 453         my $seqobj;
 454         my $_is_strict;
 455         my $element = '';
 456         my $_is_instance = 1 ;
 457         my $self = shift @_;
 458         my $object_argument = shift @_;
 459
 460         # First we need to determine if the present object is an instance
 461         # object or if the sequence object has been passed as an argument
 462
 463         if (defined $object_argument) {
 464                 $_is_instance = 0;
 465         }
 466
 467         # If we are using an instance object...
 468         if ($_is_instance) {
 469                 if ($self->{'_monomer_count'}) {
 470                         return $self->{'_monomer_count'}; # return count if previously calculated
 471                 }
 472                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 473                 $seqobj =  $self->{'_seqref'};
 474         } else {
 475                 #  otherwise...
 476                 $seqobj =  $object_argument;
 477
 478                 #  Following two lines lead to error in "throw" routine
 479                 $seqobj->isa("Bio::PrimarySeqI") ||
 480                   $self->throw("SeqStats works only on PrimarySeqI objects");
 481                 # is alphabet OK? Is it strict?
 482                 $_is_strict =  _is_alphabet_strict($seqobj);
 483         }
 484
 485         my $alphabet =  $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
 486           $Alphabets{$seqobj->alphabet}  ; # get array of allowed letters
 487
 488         # convert everything to upper case to be safe
 489         my $seqstring = uc $seqobj->seq();
 490
 491         # Since T is used in RichSeq RNA sequences, do conversion locally
 492         $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
 493
 494         #  For each letter, count the number of times it appears in
 495         #  the sequence
 496  LETTER:
 497         foreach $element (@$alphabet) {
 498                 # skip terminator symbol which may confuse regex
 499                 next LETTER if $element eq '*';
 500                 $count{$element} = ( $seqstring =~ s/$element/$element/g);
 501         }
 502
 503         if ($_is_instance) {
 504                 $self->{'_monomer_count'} = \%count;  # Save in case called again later
 505         }
 506
 507         return \%count;
 508 }
 509
 510 =head2  get_mol_wt
 511
 512  Title   : get_mol_wt
 513  Usage   : $wt = $seqobj->get_mol_wt() or
 514            $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
 515  Function: Calculate molecular weight of sequence
 516            Ts are counted as Us in RNA sequences.
 517  Example :
 518
 519  Returns : Reference to two element array containing lower and upper
 520            bounds of molecule molecular weight. For DNA and RNA
 521            sequences single-stranded weights are returned. If
 522            sequence contains no ambiguous elements, both entries in
 523            array are equal to molecular weight of molecule.
 524  Args    : None or reference to sequence object
 525  Throws  : Exception if type of sequence is unknown (ie not amino or
 526            nucleic) or if unknown letter in alphabet. Ambiguous
 527            elements are allowed.
 528
 529 =cut
 530
 531 sub get_mol_wt {
 532         my $seqobj;
 533         my $_is_strict;
 534         my $element = '';
 535         my $_is_instance = 1 ;
 536         my $self = shift @_;
 537         my $object_argument = shift @_;
 538         my ($weight_array, $rcount);
 539
 540         if (defined $object_argument) {
 541                 $_is_instance = 0;
 542         }
 543
 544         if ($_is_instance) {
 545                 if ($weight_array = $self->{'_mol_wt'}) {
 546                         # return mol. weight if previously calculated
 547                         return $weight_array;
 548                 }
 549                 $seqobj =  $self->{'_seqref'};
 550                 $rcount = $self->count_monomers();
 551         } else {
 552                 $seqobj =  $object_argument;
 553                 $seqobj->isa("Bio::PrimarySeqI") ||
 554                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 555                 $_is_strict =  _is_alphabet_strict($seqobj); # is alphabet OK?
 556                 $rcount =  $self->count_monomers($seqobj);
 557         }
 558
 559         # We will also need to know what type of monomer we are dealing with
 560         my $moltype = $seqobj->alphabet();
 561
 562         # In general,the molecular weight is bounded below by the sum of the
 563         # weights of lower bounds of each alphabet symbol times the number of
 564         # occurrences of the symbol in the sequence. A similar upper bound on
 565         # the weight is also calculated.
 566
 567         # Note that for "strict" (i.e. unambiguous) sequences there is an
 568         # inefficiency since the upper bound = the lower bound and there are
 569         # two calculations.  However, this decrease in performance will be
 570         # minor and leads to significantly more readable code.
 571
 572         my $weight_lower_bound = 0;
 573         my $weight_upper_bound = 0;
 574         my $weight_table =  $Weights{$moltype};
 575
 576         # compute weight of all the residues
 577         foreach $element (keys %$rcount) {
 578                 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
 579                 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
 580         }
 581         if ($moltype =~ /protein/) {
 582         # remove H2O during peptide bond formation.
 583         $weight_lower_bound -= $water * ($seqobj->length - 1);
 584         $weight_upper_bound -= $water * ($seqobj->length - 1);
 585         } else {
 586         # Correction because phosphate of 5' residue has additional OH and
 587         # sugar ring of 3' residue has additional H
 588         $weight_lower_bound += $water;
 589         $weight_upper_bound += $water;
 590         }
 591
 592         $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
 593         $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
 594
 595         $weight_array = [$weight_lower_bound, $weight_upper_bound];
 596
 597         if ($_is_instance) {
 598                 $self->{'_mol_wt'} = $weight_array;  # Save in case called again later
 599         }
 600         return $weight_array;
 601 }
 602
 603
 604 =head2  count_codons
 605
 606  Title   : count_codons
 607  Usage   : $rcount = $seqstats->count_codons() or
 608            $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
 609  Function: Counts the number of each type of codons for a dna or rna
 610            sequence, starting at the 1st triple of the input sequence.
 611  Example :
 612  Returns : Reference to a hash in which keys are codons of the genetic
 613            alphabet used and values are number of occurrences of the
 614            codons in the sequence. All codons with "ambiguous" bases
 615            are counted together.
 616  Args    : None or sequence object
 617  Throws  : an exception if type of sequence is unknown or protein.
 618
 619 =cut
 620
 621 sub count_codons {
 622         my $rcount = {};
 623         my $codon ;
 624         my $seqobj;
 625         my $_is_strict;
 626         my $element = '';
 627         my $_is_instance = 1 ;
 628         my $self = shift @_;
 629         my $object_argument = shift @_;
 630
 631         if (defined $object_argument) {
 632                 $_is_instance = 0;
 633         }
 634
 635         if ($_is_instance) {
 636                 if ($rcount = $self->{'_codon_count'}) {
 637                         return $rcount;        # return count if previously calculated
 638                 }
 639                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 640                 $seqobj =  $self->{'_seqref'};
 641         } else {
 642                 $seqobj =  $object_argument;
 643                 $seqobj->isa("Bio::PrimarySeqI") ||
 644                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 645                 $_is_strict =  _is_alphabet_strict($seqobj);
 646         }
 647
 648         # Codon counts only make sense for nucleic acid sequences
 649         my $alphabet = $seqobj->alphabet();
 650
 651         unless ($alphabet =~ /[dr]na/i) {
 652                 $seqobj->throw("Codon counts only meaningful for dna or rna, ".
 653                                                         "not for $alphabet sequences.");
 654         }
 655
 656         # If sequence contains ambiguous bases, warn that codons
 657         # containing them will all be lumped together in the count.
 658
 659         if (!$_is_strict ) {
 660                 $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
 661                 " All codons with ambiguous bases will be added together in count.")
 662                     if $self->verbose >= 0 ;
 663         }
 664
 665         my $seq = $seqobj->seq();
 666
 667         # Now step through the string by threes and count the codons
 668
 669  CODON:
 670         while (length($seq) > 2) {
 671                 $codon = uc substr($seq,0,3);
 672                 $seq = substr($seq,3);
 673                 if ($codon =~ /[^ACTGU]/i) {
 674                         $$rcount{'ambiguous'}++; #lump together ambiguous codons
 675                         next CODON;
 676                 }
 677                 if (!defined $$rcount{$codon}) {
 678                         $$rcount{$codon}= 1 ;
 679                         next CODON;
 680                 }
 681                 $$rcount{$codon}++;  # default
 682         }
 683
 684         if ($_is_instance) {
 685                 $self->{'_codon_count'} = $rcount;  # Save in case called again later
 686         }
 687
 688         return $rcount;
 689 }
 690
 691
 692 =head2  hydropathicity
 693
 694  Title   : hydropathicity
 695  Usage   : $gravy = $seqstats->hydropathicity(); or
 696            $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
 697
 698  Function: Calculates the mean Kyte-Doolittle hydropathicity for a
 699            protein sequence. Also known as the "gravy" score. Refer to
 700            Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
 701  Example :
 702  Returns : float
 703  Args    : None or reference to sequence object
 704
 705  Throws  : an exception if type of sequence is not protein.
 706
 707 =cut
 708
 709 sub hydropathicity {
 710         my $seqobj;
 711         my $_is_strict;
 712         my $element = '';
 713         my $_is_instance = 1 ;
 714         my $self = shift @_;
 715         my $object_argument = shift @_;
 716
 717         if (defined $object_argument) {
 718                 $_is_instance = 0;
 719         }
 720
 721         if ($_is_instance) {
 722                 if (my $gravy = $self->{'_hydropathicity'}) {
 723                         return $gravy;        # return value if previously calculated
 724                 }
 725                 $_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
 726                 $seqobj =  $self->{'_seqref'};
 727         } else {
 728                 $seqobj =  $object_argument;
 729                 $seqobj->isa("Bio::PrimarySeqI") ||
 730                   $self->throw("Error: SeqStats works only on PrimarySeqI objects");
 731                 $_is_strict =  _is_alphabet_strict($seqobj);
 732         }
 733
 734         # hydropathicity not menaingful for empty sequences
 735         unless ($seqobj->length() > 0) {
 736           $seqobj->throw("hydropathicity not defined for zero-length sequences");
 737         }
 738
 739         # hydropathicity only make sense for protein sequences
 740         my $alphabet = $seqobj->alphabet();
 741
 742         unless ($alphabet =~ /protein/i) {
 743                 $seqobj->throw("hydropathicity only meaningful for protein, ".
 744                                                         "not for $alphabet sequences.");
 745         }
 746
 747         # If sequence contains ambiguous bases, warn that codons
 748         # containing them will all be lumped together in the count.
 749
 750         unless ($_is_strict ) {
 751                 $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
 752                 "Hydropathicity can not be caculated.")
 753         }
 754
 755         my $seq = $seqobj->seq();
 756
 757         # Now step through the string and add up the hydropathicity values
 758
 759     my $gravy = 0;
 760     for my $i ( 0 .. length($seq) ) {
 761        my $codon = uc(substr($seq,$i,1));
 762        $gravy += $amino_hydropathicity->{$codon}||0; # table look-up
 763     }
 764     $gravy /= length($seq);
 765
 766
 767         if ($_is_instance) {
 768                 $self->{'_hydropathicity'} = $gravy;  # Save in case called again later
 769         }
 770
 771         return $gravy;
 772 }
 773
 774
 775 =head2  _is_alphabet_strict
 776
 777  Title   :  _is_alphabet_strict
 778  Usage   :
 779  Function: internal function to determine whether there are
 780            any ambiguous elements in the current sequence
 781  Example :
 782  Returns : 1 if strict alphabet is being used,
 783            0 if ambiguous elements are present
 784  Args    :
 785
 786  Throws  : an exception if type of sequence is unknown (ie amino or
 787            nucleic) or if unknown letter in alphabet. Ambiguous
 788            monomers are allowed.
 789
 790 =cut
 791
 792 sub _is_alphabet_strict {
 793
 794         my ($seqobj) = @_;
 795         my $moltype = $seqobj->alphabet();
 796
 797         # convert everything to upper case to be safe
 798         my $seqstring = uc $seqobj->seq();
 799
 800         # Since T is used in RichSeq RNA sequences, do conversion locally
 801         $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
 802
 803         # First we check if only the 'strict' letters are present in the
 804         # sequence string If not, we check whether the remaining letters
 805         # are ambiguous monomers or whether there are illegal letters in
 806         # the string
 807
 808         # $alpha_array is a ref to an array of the 'strictly' allowed letters
 809         my $alpha_array =   $Alphabets_strict{$moltype} ;
 810
 811         # $alphabet contains the allowed letters in string form
 812         my $alphabet = join ('', @$alpha_array) ;
 813         unless ($seqstring =~ /[^$alphabet]/)  {
 814                 return 1 ;
 815         }
 816
 817         # Next try to match with the alphabet's ambiguous letters
 818         $alpha_array =   $Alphabets{$moltype} ;
 819         $alphabet = join ('', @$alpha_array) ;
 820
 821         unless ($seqstring =~ /[^$alphabet]/)  {
 822                 return 0 ;
 823         }
 824
 825         # If we got here there is an illegal letter in the sequence
 826         $seqobj->throw("Alphabet not OK for $seqobj");
 827 }
 828
 829 =head2   _print_data
 830
 831  Title   : _print_data
 832  Usage   : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
 833  Function: Displays dna / rna parameters (used for debugging)
 834  Returns : 1
 835  Args    : None
 836
 837 Used for debugging.
 838
 839 =cut
 840
 841 sub _print_data {
 842
 843     print "\n adenine  = :  $adenine \n";
 844     print "\n guanine  = :  $guanine \n";
 845     print "\n cytosine = :  $cytosine \n";
 846     print "\n thymine  = :  $thymine \n";
 847     print "\n uracil   = :  $uracil \n";
 848
 849     print "\n dna_A_wt = :  $dna_A_wt \n";
 850     print "\n dna_C_wt = :  $dna_C_wt \n";
 851     print "\n dna_G_wt = :  $dna_G_wt \n";
 852     print "\n dna_T_wt = :  $dna_T_wt \n";
 853
 854     print "\n rna_A_wt = :  $rna_A_wt \n";
 855     print "\n rna_C_wt = :  $rna_C_wt \n";
 856     print "\n rna_G_wt = :  $rna_G_wt \n";
 857     print "\n rna_U_wt = :  $rna_U_wt \n";
 858
 859     return 1;
 860 }
 861
 862 1;