branch-1-6/scripts/seqstats/gccalc.PLS

   1 #!perl -w
   2 # $Id$
   3
   4 use strict;
   5
   6 use Bio::SeqIO;
   7 use Bio::Tools::SeqStats;
   8 use Getopt::Long;
   9 my $format = 'fasta';
  10 my $file;
  11 my $help =0;
  12 my $aggregate;
  13 GetOptions(
  14             'f|format:s' => \$format,
  15             'i|in:s'     => \$file,
  16             'h|help|?'    => \$help,
  17             'a|aggregate!'=> \$aggregate,
  18             );
  19
  20
  21 my $USAGE = "usage: gccalc.pl -f format -i filename\n";
  22 if( $help ) {
  23     die $USAGE;
  24 }
  25
  26 $file = shift unless $file;
  27 my $seqin;
  28 if( defined $file ) {
  29     print "Could not open file [$file]\n$USAGE" and exit unless -e $file;
  30     $seqin = new Bio::SeqIO(-format => $format,
  31                             -file   => $file);
  32 } else {
  33     $seqin = new Bio::SeqIO(-format => $format,
  34                             -fh     => \*STDIN);
  35 }
  36 my ($total_base, $total_gc);
  37 while( my $seq = $seqin->next_seq ) {
  38     next if( $seq->length == 0 );
  39     if( $seq->alphabet eq 'protein' ) {
  40         warn("gccalc does not work on amino acid sequences ...skipping this seq");
  41         next;
  42     }
  43
  44     my $seq_stats  =  Bio::Tools::SeqStats->new('-seq'=>$seq);
  45     my $hash_ref = $seq_stats->count_monomers();  # for DNA sequence
  46     print "Seq: ", $seq->display_id, " ";
  47     print $seq->desc if $seq->desc;
  48     print " Len:", $seq->length, "\n";
  49     $total_base += $seq->length;
  50     $total_gc   += $hash_ref->{'G'} + $hash_ref->{'C'};
  51     printf "GC content is %.4f\n", ($hash_ref->{'G'} + $hash_ref->{'C'}) /
  52         $seq->length();
  53
  54     foreach my $base (sort keys %{$hash_ref}) {
  55         print "Number of bases of type ", $base, "= ", $hash_ref->{$base},"\n";
  56     }
  57     print "--\n";
  58 }
  59 if( $aggregate ) {
  60   printf "Total GC content is %.4f out of %d bases\n", $total_gc / $total_base, $total_base;
  61 }
  62 # alternatively one could use code submitted by
  63 # cckim@stanford.edu
  64
  65 sub calcgc {
  66     my $seq = $_[0];
  67     my @seqarray = split('',$seq);
  68     my $count = 0;
  69     foreach my $base (@seqarray) {
  70         $count++ if $base =~ /[G|C]/i;
  71     }
  72
  73     my $len = $#seqarray+1;
  74     return $count / $len;
  75 }
  76
  77
  78 __END__
  79
  80 =head1 NAME
  81
  82 gccalc - GC content of nucleotide sequences
  83
  84 =head1 SYNOPSIS
  85
  86   gccalc [-f/--format FORMAT] [-h/--help] filename
  87   or
  88   gccalc [-f/--format FORMAT] < filename
  89   or
  90   gccalc [-f/--format FORMAT] -i filename
  91
  92 =head1 DESCRIPTION
  93
  94 This scripts prints out the GC content for every nucleotide sequence
  95 from the input file.
  96
  97 =head1 OPTIONS
  98
  99 The default sequence format is fasta.
 100
 101 The sequence input can be provided using any of the three methods:
 102
 103 =over 3
 104
 105 =item unnamed argument
 106
 107   gccalc filename
 108
 109 =item named argument
 110
 111   gccalc -i filename
 112
 113 =item standard input
 114
 115   gccalc < filename
 116
 117 =back
 118
 119 =head1 FEEDBACK
 120
 121 =head2 Mailing Lists
 122
 123 User feedback is an integral part of the evolution of this and other
 124 Bioperl modules. Send your comments and suggestions preferably to
 125 the Bioperl mailing list.  Your participation is much appreciated.
 126
 127   bioperl-l@bioperl.org                  - General discussion
 128   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 129
 130 =head2 Reporting Bugs
 131
 132 Report bugs to the Bioperl bug tracking system to help us keep track
 133 of the bugs and their resolution. Bug reports can be submitted via the
 134 web:
 135
 136   http://bugzilla.open-bio.org/
 137
 138 =head1 AUTHOR - Jason Stajich
 139
 140 Email jason@bioperl.org
 141
 142 =head1 HISTORY
 143
 144 Based on script code (see bottom) submitted by cckim@stanford.edu
 145
 146 Submitted as part of bioperl script project 2001/08/06
 147
 148 =cut