shuffle: use common fasta reader, upcase sequences, code cleanups
[greylag.git] / greylag_shuffle_database.py
blob469ebba0610fead1dfd867dc4ffa794253b64e29
1 #!/usr/bin/env python
3 """
5 This script will accept FASTA formatted input and generate a shuffled sequence
6 for each input sequence. (Thus, the output will have twice as many sequences
7 as the input.) The prefix 'SHUFFLED_' will be added to the defline of each
8 shuffled sequence. Also, all sequences will be re-wrapped.
10 The script will strip duplicate loci in the input, giving an error if the
11 duplicates, which have the same locus name, do not also have the same
12 sequence.
14 The randomness seed defaults to zero. Use different values if you want
15 different shuffles for a given input. Note that the random number generator
16 is reseeded after each sequence is generated. This means that a particular
17 input sequence will always be mapped to the same shuffled sequence,
18 independent of what other sequences exist in the input, which is desirable for
19 some purposes.
21 """
24 import hashlib
25 import optparse
26 import random
27 import re
28 import sys
30 import greylag
33 # change docstring above if this is changed!
34 SHUFFLE_PREFIX = 'SHUFFLED_'
37 def warn(message):
38 print >> sys.stderr, "warning: %s" % message
40 def error(s):
41 sys.exit('error: ' + s)
44 whitespace = re.compile('[ \t]+')
46 default_wrap=80
49 def main():
50 parser = optparse.OptionParser(usage="usage: %prog [options] <file>",
51 description=__doc__)
52 parser.add_option("-r", "--reverse", action="store_true",
53 dest="reverse",
54 help="reverse the sequences, instead of shuffling")
55 parser.add_option("-s", "--seed", type="int", dest="seed",
56 help="seed for randomness [default 0]",
57 default=0, metavar="N")
58 parser.add_option("-n", "--no-original", action="store_true",
59 dest="no_original",
60 help="don't output original sequences")
61 parser.add_option("-v", "--verbose", action="store_true",
62 dest="verbose", help="be verbose")
63 parser.add_option("-w", "--wrap", dest="wrap", type="int",
64 default=default_wrap,
65 help="wrap sequence to specified width"
66 " [default %s, 0 means don't wrap at all]" % default_wrap,
67 metavar="COLUMNS")
68 (options, args) = parser.parse_args()
70 if len(args) != 1:
71 parser.print_help()
72 sys.exit(1)
74 random.seed(options.seed)
76 # locus id -> (defline, hash of sequence)
77 seen = {}
79 for locusname, defline, sequence, filename \
80 in greylag.read_fasta_files([args[0]]):
81 write_locus(options, seen, locusname, defline, sequence)
84 def write_locus(options, seen, locusname, defline, sequence):
85 h = hashlib.sha1()
86 h.update(sequence)
87 sequence_hash = h.digest()
89 if locusname in seen:
90 seen_defline, seen_hash = seen[locusname]
91 if seen_hash != sequence_hash:
92 error("differing sequence for locus '%s'" % locusname)
93 if options.verbose and seen_defline != defline:
94 warn("differing deflines for locus '%s'" % locusname)
95 return
96 seen[locusname] = (defline, sequence_hash)
98 s_list = list(sequence)
99 random.seed(options.seed)
100 if options.reverse:
101 s_list.reverse()
102 else:
103 random.shuffle(s_list)
104 shuffle_sequence = ''.join(s_list)
105 shuffle_defline = SHUFFLE_PREFIX + locusname + ' FALSE POSITIVE'
106 for d, s in [(defline, sequence), (shuffle_defline, shuffle_sequence)]:
107 if options.no_original and d == defline:
108 continue
109 print '>' + d
110 if options.wrap:
111 for start in range(0, len(s), options.wrap):
112 print s[start:start+options.wrap]
113 else:
114 print s
117 if __name__ == '__main__':
118 main()