5 This script will accept FASTA formatted input and generate a shuffled sequence
6 for each input sequence. (Thus, the output will have twice as many sequences
7 as the input.) The prefix 'SHUFFLED_' will be added to the defline of each
8 shuffled sequence. Also, all sequences will be re-wrapped.
10 The script will strip duplicate loci in the input, giving an error if the
11 duplicates, which have the same locus name, do not also have the same
14 The randomness seed defaults to zero. Use different values if you want
15 different shuffles for a given input. Note that the random number generator
16 is reseeded after each sequence is generated. This means that a particular
17 input sequence will always be mapped to the same shuffled sequence,
18 independent of what other sequences exist in the input, which is desirable for
33 # change docstring above if this is changed!
34 SHUFFLE_PREFIX
= 'SHUFFLED_'
38 print >> sys
.stderr
, "warning: %s" % message
41 sys
.exit('error: ' + s
)
44 whitespace
= re
.compile('[ \t]+')
50 parser
= optparse
.OptionParser(usage
="usage: %prog [options] <file>",
52 parser
.add_option("-r", "--reverse", action
="store_true",
54 help="reverse the sequences, instead of shuffling")
55 parser
.add_option("-s", "--seed", type="int", dest
="seed",
56 help="seed for randomness [default 0]",
57 default
=0, metavar
="N")
58 parser
.add_option("-n", "--no-original", action
="store_true",
60 help="don't output original sequences")
61 parser
.add_option("-v", "--verbose", action
="store_true",
62 dest
="verbose", help="be verbose")
63 parser
.add_option("-w", "--wrap", dest
="wrap", type="int",
65 help="wrap sequence to specified width"
66 " [default %s, 0 means don't wrap at all]" % default_wrap
,
68 (options
, args
) = parser
.parse_args()
74 random
.seed(options
.seed
)
76 # locus id -> (defline, hash of sequence)
79 for locusname
, defline
, sequence
, filename \
80 in greylag
.read_fasta_files([args
[0]]):
81 write_locus(options
, seen
, locusname
, defline
, sequence
)
84 def write_locus(options
, seen
, locusname
, defline
, sequence
):
87 sequence_hash
= h
.digest()
90 seen_defline
, seen_hash
= seen
[locusname
]
91 if seen_hash
!= sequence_hash
:
92 error("differing sequence for locus '%s'" % locusname
)
93 if options
.verbose
and seen_defline
!= defline
:
94 warn("differing deflines for locus '%s'" % locusname
)
96 seen
[locusname
] = (defline
, sequence_hash
)
98 s_list
= list(sequence
)
99 random
.seed(options
.seed
)
103 random
.shuffle(s_list
)
104 shuffle_sequence
= ''.join(s_list
)
105 shuffle_defline
= SHUFFLE_PREFIX
+ locusname
+ ' FALSE POSITIVE'
106 for d
, s
in [(defline
, sequence
), (shuffle_defline
, shuffle_sequence
)]:
107 if options
.no_original
and d
== defline
:
111 for start
in range(0, len(s
), options
.wrap
):
112 print s
[start
:start
+options
.wrap
]
117 if __name__
== '__main__':