5 # We use this to determine what primers are useful for frequent priming of
6 # nucleic acid for random labeling
7 # Input: Sequence file, oligomer length
8 # Output: Tab-delimited text file of oligomer frequencies
19 #########################
20 # VARIABLES & FILENAMES #
21 #########################
25 my ($format, $infile, $help, $outfile, $oligomerlength) = ('fasta');
27 'f|format:s' => \$format,
28 'i|in|s|sequence:s' => \$infile,
30 'o|out:s' => \$outfile,
31 'length:i' => \$oligomerlength
34 my $USAGE = "Usage:\toligo_count [-h/--help] [-l/--length OLIGOLENGTH]\n".
35 "\t[-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]\n".
36 "\t[-o/--out OUTFILE]\n".
37 "\tDefault SEQFORMAT is fasta\n";
39 print $USAGE and exit if $help;
42 print 'Enter your concatenated FASTA sequence filename: ';
43 chomp ($infile=<STDIN>);
45 unless (-e $infile) { die "$infile not found\n"; }
49 print "$outfile already exists! Overwrite (Y/N)? ";
52 print 'Y or N, please: ';
55 if (/n/i) { die "$outfile not overwritten.\n"; }
58 # print 'Enter an output filename: ';
59 # chomp ($outfile=<STDIN>);
61 # print "$outfile already exists! Overwrite (Y/N)? ";
62 # chomp ($_ = <STDIN>);
64 # print 'Y or N, please: ';
65 # chomp ($_ = <STDIN>);
67 # if (/n/i) { die "$outfile not overwritten.\n"; }
71 unless ($oligomerlength) {
73 print 'Enter an oligomer length to count: ';
74 chomp($oligomerlength=<STDIN>);
75 if ($oligomerlength !~ /\d/) {
76 print "Value is non-numeric!\n";
87 if ($oligomerlength >= 9) {
88 print "An oligomer length of $oligomerlength will generate ";
89 print 4 ** $oligomerlength, " combinations,\nwhich could cause ";
90 print "an out of memory error. Proceed? (y/n) ";
93 else { die "Program terminated\n"; }
95 my @oligoseqs = &generate_all_oligos($oligomerlength);
97 foreach (@oligoseqs) {
101 my $in = Bio::SeqIO->new( -file => $infile,
106 while (my $seq = $in->next_seq() ) {
107 my $len = $seq->length();
109 if ($position+$oligomerlength > $len) {
113 $seq = uc $seq->seq; #string
114 $exception = 1 if $seq =~ /[^GATC]/;
116 while ($position + $oligomerlength-1 <= $len) {
117 $oligos{substr $seq, $position-1, $oligomerlength}++;
119 if ($position%250000 == 0) {print "$position\n";}
121 $oligocounts += $position-1;
126 open(OUTFILE, ">$outfile") or die "Can't open $outfile\n";
128 open OUTFILE, '>-'; # STDOUT
130 print OUTFILE "$seqnumber sequences analyzed\n";
131 print OUTFILE "$oligocounts total $oligomerlength-mers counted\n";
132 print OUTFILE "$oligomerlength-mer\tNumber\tFrequency\n";
133 foreach my $key (sort keys %oligos) {
134 print OUTFILE "$key\t$oligos{$key}\t", $oligos{$key}/$oligocounts, "\n";
138 if ($exception == 1) {
139 print "Non-standard (non-GATC) bases were found in sequence\n";
141 if ($exception == 2) {
142 print "Oligomer length greater than sequence length\n";
152 sub generate_all_oligos {
153 my $oligolength = $_[0];
155 my @newarray = qw{A C G T};
156 my @bases = qw{A C G T};
158 while ($iter < $oligolength) {
159 my @oldarray = @newarray;
161 foreach my $oligoseq (@oldarray) {
162 foreach my $newbase (@bases) {
163 push @newarray, $oligoseq . $newbase;
171 # if you wanted to be notified about status of running
172 #my $EMAILADDRESS = undef;
173 #die("Must change script to a valid email addres for notification")
174 # unless( defined $EMAILADDRESS );
177 # $address = $EMAILADDRESS;
178 # $address = $_[0] if $_[0];
179 # open(SENDMAIL, "|/usr/lib/sendmail -oi -t") or die "Can't fork for sendmail: $!\n";
180 # print SENDMAIL <<"EOF";
183 #Subject: Program Finished
186 # close(SENDMAIL) or warn "sendmail didn't close nicely";
193 oligo_count - oligo count and frequency
197 Usage: oligo_count [-h/--help] [-l/--length OLIGOLENGTH]
198 [-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]
203 This scripts counts occurrence and frequency for all oligonucleotides
206 It can be used to determine what primers are useful for
207 frequent priming of nucleic acid for random labeling.
209 Note that this script could be run by utilizing the compseq
210 program which is part of EMBOSS.
214 The default sequence format is fasta. If no outfile is given, the
215 results will be printed to standard out. All other options can entered
222 User feedback is an integral part of the evolution of this and other
223 Bioperl modules. Send your comments and suggestions preferably to
224 the Bioperl mailing list. Your participation is much appreciated.
226 bioperl-l@bioperl.org - General discussion
227 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
229 =head2 Reporting Bugs
231 Report bugs to the Bioperl bug tracking system to help us keep track
232 of the bugs and their resolution. Bug reports can be submitted via the
235 http://bugzilla.open-bio.org/
237 =head1 AUTHOR - Charles C. Kim
239 Email cckim@stanford.edu
245 Submitted to bioperl scripts project 2001/08/06
247 E<gt>E<gt> 100 x speed optimization by Heikki Lehvaslaiho