Merge branch 'slowbisect'
[greylag.git] / greylag_shuffle_database.py
blobc75a98aac92525845f7c5f31318b137e549ae9bf
1 #!/usr/bin/env python
3 """
5 This script will accept FASTA formatted input and generate a shuffled sequence
6 for each input sequence. (Thus, the output will have twice as many sequences
7 as the input.) The prefix 'SHUFFLED_' will be added to the defline of each
8 shuffled sequence. Also, all sequences will be re-wrapped.
10 The script will strip duplicate loci in the input, giving an error if the
11 duplicates, which have the same locus name, do not also have the same
12 sequence.
14 The randomness seed defaults to zero. Use different values if you want
15 different shuffles for a given input. Note that the random number generator
16 is reseeded after each sequence is generated. This means that a particular
17 input sequence will always be mapped to the same shuffled sequence,
18 independent of what other sequences exist in the input, which is desirable for
19 some purposes.
21 """
24 import fileinput
25 import optparse
26 import random
27 import re
28 import sha
29 import sys
32 # change docstring above if this is changed!
33 SHUFFLE_PREFIX = 'SHUFFLED_'
36 def warn(message):
37 print >> sys.stderr, "warning: %s [at %s:%s]" \
38 % (message, fileinput.filename(), fileinput.filelineno())
39 def error(s):
40 sys.exit('error: ' + s)
43 whitespace = re.compile('[ \t]+')
45 default_wrap=80
48 def main():
49 parser = optparse.OptionParser(usage="usage: %prog [options] [<file>...]",
50 description=__doc__)
51 parser.add_option("-r", "--reverse", action="store_true",
52 dest="reverse",
53 help="reverse the sequences, instead of shuffling")
54 parser.add_option("-s", "--seed", type="int", dest="seed",
55 help="seed for randomness [default 0]",
56 default=0, metavar="N")
57 parser.add_option("-n", "--no-original", action="store_true",
58 dest="no_original",
59 help="don't output original sequences")
60 parser.add_option("-v", "--verbose", action="store_true",
61 dest="verbose", help="be verbose")
62 parser.add_option("-w", "--wrap", dest="wrap", type="int",
63 default=default_wrap,
64 help="wrap sequence to specified width"
65 " [default %s, 0 means don't wrap at all]" % default_wrap,
66 metavar="COLUMNS")
67 (options, args) = parser.parse_args()
69 random.seed(options.seed)
71 defline = None
72 seqs = []
73 # locus id -> (defline, hash of sequence)
74 seen = {}
75 for line in fileinput.input(args):
76 line = line.strip()
77 if line[:1] == '>':
78 if defline:
79 out(defline, seqs, options, seen)
80 elif seqs:
81 warn("discarding sequence prior to initial defline")
82 defline = line
83 seqs = []
84 else:
85 seqs.append(re.sub(whitespace, '', line))
86 if defline:
87 out(defline, seqs, options, seen)
90 def out(defline, seqs, options, seen):
91 sequence = ''.join(seqs)
92 sequence_hash = sha.new(sequence).digest()
93 locus_id = re.split(r'[ ]+', defline, 1)[0]
94 if locus_id in seen:
95 seen_defline, seen_hash = seen[locus_id]
96 if seen_hash != sequence_hash:
97 error("differing sequence for locus '%s'" % locus_id)
98 if options.verbose and seen_defline != defline:
99 warn("differing deflines for locus '%s'" % locus_id)
100 return
101 seen[locus_id] = (defline, sequence_hash)
103 s_list = list(sequence)
104 random.seed(options.seed)
105 if options.reverse:
106 s_list.reverse()
107 else:
108 random.shuffle(s_list)
109 shuffle_sequence = ''.join(s_list)
110 shuffle_defline = '>' + SHUFFLE_PREFIX + locus_id[1:] + ' FALSE POSITIVE'
111 for d, s in [(defline, sequence), (shuffle_defline, shuffle_sequence)]:
112 if options.no_original and d == defline:
113 continue
114 print d
115 if options.wrap:
116 for start in range(0, len(s), options.wrap):
117 print s[start:start+options.wrap]
118 else:
119 print s
122 if __name__ == '__main__':
123 main()