see if we can make PAUSE skip indexing these modules
[bioperl-live.git] / examples / generate_random_seq.pl
blob9737be5b91799481b12c8ea9c5dbb4ba69e5d98e
1 #!/bin/perl -w
2 use strict;
3 use vars qw($USAGE);
5 # random sequence generator #
6 # -c=1 option will cause prot sequences to be built
7 # using vertebrate aa frequencies,
8 # with option -a putting a 1st methionine residues on. Frequencies are
9 # calculated from the NCBI human RefSeq protein sequences
10 # -c and -a only affect protein sequences
11 # -a only works in conjunction with -c
12 # -n number of random sequences, default = 1
14 use Bio::PrimarySeq;
15 use Bio::SeqIO;
16 use Getopt::Long;
17 my ($length,$type,$filename,$comp,$met);
19 $USAGE = 'usage: generate_random_seq.pl --length=1000 --type=dna --filename=/tmp/test.seq --number=50';
21 my %alphabets = ( 'dna' => [qw(C A G T)],
22 'rna' => [qw(C A G U)],
23 'prot'=> [qw( A C D E F G H I K L M N P Q R S T V W Y)],
25 # make random num from 1-10000. numbers in this array reflect the frequency,
26 # e.g., a random number from 1.744 = A, 745-991 = C etc;
27 my @aa_frequencies = qw(744 991 1398 2017 2378 3104 3349 3726 4239 5273 5443
28 5749 6410 6848 7455 8263 8760 9340 9488 9713 10000);
29 my $number = 1;
31 &GetOptions
33 'l|length:s' => \$length,
34 't|type|m|alphabet:s' => \$type,
35 'f|file|filename:s' => \$filename,
36 'c|composition:s' => \$comp,
37 'a|methionine:s' => \$met,
38 'n|number:s' => \$number
41 assert ( $type && defined ($alphabets{lc $type}),
42 $USAGE);
43 assert ( $length && $length =~ /^\d+$/, $USAGE );
45 foreach my $num (1..$number) {
46 my $sequence = "";
47 my $alphabet = $alphabets{lc $type};
48 my $sspace = scalar @$alphabet;
49 if (!$comp || $type ne 'prot') {
50 foreach ( 1..$length ) {
51 $sequence .= $alphabet->[ int rand($sspace) ];
53 }elsif ($type eq 'prot') {
54 $sequence = build_seq($length, \@aa_frequencies);
56 my $seq = Bio::PrimarySeq->new(-seq => $sequence,
57 -display_id => 'randomseq'.$num);
58 my %args = (-format => 'fasta');
59 if( $filename ) { $args{-file} = ">>$filename" }
60 my $seqio = Bio::SeqIO->new(%args);
61 $seqio->write_seq($seq);
64 sub assert { die $_[1] unless( $_[0] ); }
66 sub build_seq {
67 #takes seqlen and ref to frequency data as parameters
68 my ($len, $pf) = @_;
69 my $str = ($met)?'M':'';
70 my $i = ($met)?1:0;
71 for ($i..$len-1) {
72 my $aa = int(rand (10000)) ;
73 my $j = 0;
74 while ($pf->[$j] < $aa && $j <19) {
75 $j++;
77 $str .= $alphabets{'prot'}[$j];
79 print "str is $str\n";
80 return $str;