5 This script will accept FASTA formatted input and generate a shuffled sequence
6 for each input sequence. (Thus, the output will have twice as many sequences
7 as the input.) The prefix 'SHUFFLED_' will be added to the defline of each
8 shuffled sequence. Also, all sequences will be re-wrapped.
The script will strip duplicate loci in the input, giving an error if the
duplicates, which have the same locus name, do not also have the same
sequence.
14 The randomness seed defaults to zero. Use different values if you want
different shuffles for a given input. Note that the random number generator
is reseeded with that same seed immediately before each sequence is shuffled. This means that a particular
17 input sequence will always be mapped to the same shuffled sequence,
18 independent of what other sequences exist in the input, which is desirable for
32 # change docstring above if this is changed!
33 SHUFFLE_PREFIX
= 'SHUFFLED_'
37 print >> sys
.stderr
, "warning: %s [at %s:%s]" \
38 % (message
, fileinput
.filename(), fileinput
.filelineno())
40 sys
.exit('error: ' + s
)
43 whitespace
= re
.compile('[ \t]+')
49 parser
= optparse
.OptionParser(usage
="usage: %prog [options] [<file>...]",
51 parser
.add_option("-r", "--reverse", action
="store_true",
53 help="reverse the sequences, instead of shuffling")
54 parser
.add_option("-s", "--seed", type="int", dest
="seed",
55 help="seed for randomness [default 0]",
56 default
=0, metavar
="N")
57 parser
.add_option("-n", "--no-original", action
="store_true",
59 help="don't output original sequences")
60 parser
.add_option("-v", "--verbose", action
="store_true",
61 dest
="verbose", help="be verbose")
62 parser
.add_option("-w", "--wrap", dest
="wrap", type="int",
64 help="wrap sequence to specified width"
65 " [default %s, 0 means don't wrap at all]" % default_wrap
,
67 (options
, args
) = parser
.parse_args()
69 random
.seed(options
.seed
)
73 # locus id -> (defline, hash of sequence)
75 for line
in fileinput
.input(args
):
79 out(defline
, seqs
, options
, seen
)
81 warn("discarding sequence prior to initial defline")
85 seqs
.append(re
.sub(whitespace
, '', line
))
87 out(defline
, seqs
, options
, seen
)
90 def out(defline
, seqs
, options
, seen
):
91 sequence
= ''.join(seqs
)
92 sequence_hash
= sha
.new(sequence
).digest()
93 locus_id
= re
.split(r
'[ ]+', defline
, 1)[0]
95 seen_defline
, seen_hash
= seen
[locus_id
]
96 if seen_hash
!= sequence_hash
:
97 error("differing sequence for locus '%s'" % locus_id
)
98 if options
.verbose
and seen_defline
!= defline
:
99 warn("differing deflines for locus '%s'" % locus_id
)
101 seen
[locus_id
] = (defline
, sequence_hash
)
103 s_list
= list(sequence
)
104 random
.seed(options
.seed
)
108 random
.shuffle(s_list
)
109 shuffle_sequence
= ''.join(s_list
)
110 shuffle_defline
= '>' + SHUFFLE_PREFIX
+ locus_id
[1:] + ' FALSE POSITIVE'
111 for d
, s
in [(defline
, sequence
), (shuffle_defline
, shuffle_sequence
)]:
112 if options
.no_original
and d
== defline
:
116 for start
in range(0, len(s
), options
.wrap
):
117 print s
[start
:start
+options
.wrap
]
122 if __name__
== '__main__':