branch-1-6/scripts/seqstats/oligo_count.PLS

   1 #!perl -w
   2 # $Id$
   3 #
   4 # oligomer_freq.pl
   5 # We use this to determine what primers are useful for frequent priming of
   6 # nucleic acid for random labeling
   7 # Input: Sequence file, oligomer length
   8 # Output: Tab-delimited text file of oligomer frequencies
   9 # Written July 2, 2001
  10 # Charles C. Kim
  11
  12 ###########
  13 # MODULES #
  14 ###########
  15 use Bio::Seq;
  16 use Bio::SeqIO;
  17 use Getopt::Long;
  18
  19 #########################
  20 # VARIABLES & FILENAMES #
  21 #########################
  22
  23 use strict;
  24
  25 my ($format, $infile, $help, $outfile, $oligomerlength) = ('fasta');
  26 GetOptions(
  27            'f|format:s'            => \$format,
  28            'i|in|s|sequence:s'     => \$infile,
  29            'h|help|?'              => \$help,
  30            'o|out:s'               => \$outfile,
  31            'length:i'              => \$oligomerlength
  32           );
  33
  34 my $USAGE = "Usage:\toligo_count [-h/--help] [-l/--length OLIGOLENGTH]\n".
  35     "\t[-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]\n".
  36     "\t[-o/--out OUTFILE]\n".
  37     "\tDefault SEQFORMAT is fasta\n";
  38
  39 print $USAGE and exit if $help;
  40
  41 unless ($infile ) {
  42     print 'Enter your concatenated FASTA sequence filename: ';
  43     chomp ($infile=<STDIN>);
  44 }
  45 unless (-e $infile) { die "$infile not found\n"; }
  46
  47 if ($outfile) {
  48     if (-e $outfile) {
  49         print "$outfile already exists!  Overwrite (Y/N)? ";
  50         chomp ($_ = <STDIN>);
  51         while (/[^yn]/i) {
  52             print 'Y or N, please: ';
  53             chomp ($_ = <STDIN>);
  54         }
  55         if (/n/i) { die "$outfile not overwritten.\n"; }
  56     }
  57 #} else {
  58 #    print 'Enter an output filename: ';
  59 #    chomp ($outfile=<STDIN>);
  60 #    if (-e $outfile) {
  61 #       print "$outfile already exists!  Overwrite (Y/N)? ";
  62 #       chomp ($_ = <STDIN>);
  63 #       while (/[^yn]/i) {
  64 #           print 'Y or N, please: ';
  65 #           chomp ($_ = <STDIN>);
  66 #       }
  67 #       if (/n/i) { die "$outfile not overwritten.\n"; }
  68 #    }
  69 }
  70
  71 unless ($oligomerlength) {
  72     while () {
  73         print 'Enter an oligomer length to count: ';
  74         chomp($oligomerlength=<STDIN>);
  75         if ($oligomerlength !~ /\d/) {
  76             print "Value is non-numeric!\n";
  77         }
  78         else {last;}
  79     }
  80 }
  81
  82
  83 ########
  84 # MAIN #
  85 ########
  86
  87 if ($oligomerlength >= 9) {
  88     print "An oligomer length of $oligomerlength will generate ";
  89     print 4 ** $oligomerlength, " combinations,\nwhich could cause ";
  90     print "an out of memory error.  Proceed? (y/n) ";
  91     chomp($_=<STDIN>);
  92     if (/y/i) { ; }
  93     else { die "Program terminated\n"; }
  94 }
  95 my @oligoseqs = &generate_all_oligos($oligomerlength);
  96 my %oligos = ();
  97 foreach  (@oligoseqs) {
  98     $oligos{$_} = 0;
  99 }
 100
 101 my $in = Bio::SeqIO->new( -file => $infile,
 102                        -format => $format);
 103 my $seqnumber = 0;
 104 my $oligocounts = 0;
 105 my $exception;
 106 while (my $seq = $in->next_seq() ) {
 107     my $len = $seq->length();
 108     my $position = 1;
 109     if ($position+$oligomerlength > $len) {
 110         $exception = 2;
 111         next;
 112     }
 113     $seq = uc $seq->seq; #string
 114     $exception = 1 if $seq =~ /[^GATC]/;
 115
 116     while ($position + $oligomerlength-1 <= $len) {
 117         $oligos{substr $seq, $position-1, $oligomerlength}++;
 118         $position++;
 119         if ($position%250000 == 0) {print "$position\n";}
 120     }
 121     $oligocounts += $position-1;
 122     $seqnumber++;
 123 }
 124
 125 if ($outfile) {
 126     open(OUTFILE, ">$outfile") or die "Can't open $outfile\n";
 127 } else {
 128     open OUTFILE, '>-'; # STDOUT
 129 }
 130 print OUTFILE "$seqnumber sequences analyzed\n";
 131 print OUTFILE "$oligocounts total $oligomerlength-mers counted\n";
 132 print OUTFILE "$oligomerlength-mer\tNumber\tFrequency\n";
 133 foreach my $key (sort keys %oligos) {
 134     print OUTFILE "$key\t$oligos{$key}\t", $oligos{$key}/$oligocounts, "\n";
 135 }
 136
 137 if ($exception) {
 138     if ($exception == 1) {
 139         print "Non-standard (non-GATC) bases were found in sequence\n";
 140     }
 141     if ($exception == 2) {
 142         print "Oligomer length greater than sequence length\n";
 143     }
 144 }
 145
 146 #&notify();
 147
 148 ###############
 149 # SUBROUTINES #
 150 ###############
 151
 152 sub generate_all_oligos {
 153     my $oligolength = $_[0];
 154     my $iter = 1;
 155     my @newarray = qw{A C G T};
 156     my @bases = qw{A C G T};
 157
 158     while ($iter < $oligolength) {
 159         my @oldarray = @newarray;
 160         @newarray = ();
 161         foreach my $oligoseq (@oldarray) {
 162             foreach my $newbase (@bases) {
 163                 push @newarray, $oligoseq . $newbase;
 164             }
 165         }
 166         $iter++;
 167     }
 168     return @newarray;
 169 }
 170
 171 # if you wanted to be notified about status of running
 172 #my $EMAILADDRESS = undef;
 173 #die("Must change script to a valid email addres for notification")
 174 #    unless( defined $EMAILADDRESS );
 175
 176 #sub notify {
 177 #    $address = $EMAILADDRESS;
 178 #    $address = $_[0] if $_[0];
 179 #    open(SENDMAIL, "|/usr/lib/sendmail -oi -t") or die "Can't fork for sendmail: $!\n";
 180 #    print SENDMAIL <<"EOF";
 181 #From: Computer
 182 #To: $address
 183 #Subject: Program Finished
 184 #
 185 #EOF
 186 #    close(SENDMAIL) or warn "sendmail didn't close nicely";
 187 #}
 188
 189 __END__
 190
 191 =head1 NAME
 192
 193 oligo_count - oligo count and frequency
 194
 195 =head1 SYNOPSIS
 196
 197   Usage:  oligo_count [-h/--help] [-l/--length OLIGOLENGTH]
 198           [-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]
 199           [-o/--out OUTFILE]
 200
 201 =head1 DESCRIPTION
 202
 203 This scripts counts occurrence and frequency for all oligonucleotides
 204 of given length.
 205
 206 It can be used to determine what primers are useful for
 207 frequent priming of nucleic acid for random labeling.
 208
 209 Note that this script could be run by utilizing the compseq
 210 program which is part of EMBOSS.
 211
 212 =head1 OPTIONS
 213
 214 The default sequence format is fasta. If no outfile is given, the
 215 results will be printed to standard out. All other options can entered
 216 interactively.
 217
 218 =head1 FEEDBACK
 219
 220 =head2 Mailing Lists
 221
 222 User feedback is an integral part of the evolution of this and other
 223 Bioperl modules. Send your comments and suggestions preferably to
 224 the Bioperl mailing list.  Your participation is much appreciated.
 225
 226   bioperl-l@bioperl.org                  - General discussion
 227   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 228
 229 =head2 Reporting Bugs
 230
 231 Report bugs to the Bioperl bug tracking system to help us keep track
 232 of the bugs and their resolution. Bug reports can be submitted via the
 233 web:
 234
 235   http://bugzilla.open-bio.org/
 236
 237 =head1 AUTHOR - Charles C. Kim
 238
 239 Email cckim@stanford.edu
 240
 241 =head1 HISTORY
 242
 243 Written July 2, 2001
 244
 245 Submitted to bioperl scripts project 2001/08/06
 246
 247 E<gt>E<gt> 100 x speed optimization by Heikki Lehvaslaiho
 248
 249 =cut