document release steps
[greylag.git] / greylag_shuffle_database.py
blob388e567f379fde8412039ff87f64fd41498d92ff
1 #!/usr/bin/env python
3 """
5 This script will accept FASTA formatted input and generate a decoy sequence
6 for each input sequence. (Thus, the output will have twice as many sequences
7 as the input.) A prefix (e.g. 'SHUFFLED_') will be added to the defline of
8 each decoy sequence. Also, all sequences will be re-wrapped.
10 The script will strip duplicate loci in the input, giving an error if the
11 duplicates, which have the same locus name, do not also have the same
12 sequence.
14 For shuffling, the randomness seed defaults to zero. Use different values if
15 you want different shuffles for a given input. Note that the random number
16 generator is reseeded after each sequence is generated. This means that a
17 particular input sequence will always be mapped to the same shuffled sequence,
18 independent of what other sequences exist in the input, which is desirable for
19 some purposes.
21 """
23 __copyright__ = '''
24 greylag, a collection of programs for MS/MS protein analysis
25 Copyright (C) 2006-2008 Stowers Institute for Medical Research
27 This program is free software: you can redistribute it and/or modify
28 it under the terms of the GNU General Public License as published by
29 the Free Software Foundation, either version 3 of the License, or
30 (at your option) any later version.
32 This program is distributed in the hope that it will be useful,
33 but WITHOUT ANY WARRANTY; without even the implied warranty of
34 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35 GNU General Public License for more details.
37 You should have received a copy of the GNU General Public License
38 along with this program. If not, see <http://www.gnu.org/licenses/>.
40 Contact: Mike Coleman
41 Stowers Institute for Medical Research
42 1000 East 50th Street
43 Kansas City, Missouri 64110
44 USA
45 '''
48 from collections import defaultdict
49 import hashlib
50 import optparse
51 import random
52 import re
53 import sys
55 import greylag
58 def warn(message):
59 print >> sys.stderr, "warning: %s" % message
61 def error(s):
62 sys.exit('error: ' + s)
64 # errors are fatal
65 greylag.chase_error = error
68 class abstract_decoy_maker:
69 def make(self, s):
70 """Return a decoy sequence of the same length. (Argument is a list of
71 residues, and may be modified by this function.)
72 """
73 error("abstract method not implemented")
75 class reverse_decoy_maker(abstract_decoy_maker):
76 def make(self, s):
77 s.reverse()
78 return s
80 class shuffle_decoy_maker(abstract_decoy_maker):
81 def __init__(self, random_seed):
82 self.random_seed = random_seed
84 def make(self, s):
85 random.seed(self.random_seed)
86 random.shuffle(s)
87 return s
89 class markov_decoy_maker(abstract_decoy_maker):
90 def __init__(self, random_seed, length, original_sequence_file):
91 random.seed(random_seed)
92 self.length = length
94 # for order 0 through self.length:
95 # [ length-mer -> subsequent residues -> count, ... ]
96 self.transition = [ defaultdict(lambda: defaultdict(int))
97 for i in range(length+1) ]
98 for locusname, defline, sequence, filename \
99 in greylag.read_fasta_files([original_sequence_file]):
100 for order in range(length+1):
101 seq = '-' * order + sequence
102 for i in xrange(len(sequence)):
103 self.transition[order][seq[i:i+order]][seq[i+order]] += 1
105 #import pprint
106 #for n, t in enumerate(self.transition):
107 # print '### order', n
108 # for k in sorted(t.keys()):
109 # pprint.pprint((k, dict(t[k])))
112 # FIX: this could be cached for better performance
113 def _choose_next(self, hist):
114 histpairs = hist.items()
115 total = sum(x[1] for x in histpairs)
116 choice = random.randrange(total)
117 cum = 0
118 for r, count in histpairs:
119 cum += count
120 if choice < cum:
121 return r
122 assert False
124 def make(self, s):
125 key = '-' * self.length
126 result = []
127 for i in xrange(len(s)):
128 for order in range(self.length, -1, -1):
129 k = key[-order:] if order else ''
130 if k not in self.transition[order]:
131 continue
132 r = self._choose_next(self.transition[order][k])
133 break
134 else:
135 assert False
137 result.append(r)
138 if key: # if self.length==0, key is ''
139 key = key[1:] + r
140 return result
143 def add_sixmers(sixmer_set, sequence):
144 for i in range(len(sequence) - 6 + 1):
145 sixmer_set.add(sequence[i:i+6])
148 def write_locus(options, decoy_maker, seen, sixmers, locusname, defline,
149 sequence):
150 h = hashlib.sha1()
151 h.update(sequence)
152 sequence_hash = h.digest()
154 if locusname in seen:
155 seen_defline, seen_hash = seen[locusname]
156 if seen_hash != sequence_hash:
157 error("differing sequence for locus '%s'" % locusname)
158 if options.verbose and seen_defline != defline:
159 warn("differing deflines for locus '%s'" % locusname)
160 return
161 seen[locusname] = (defline, sequence_hash)
163 decoy_sequence = ''.join(decoy_maker.make(list(sequence)))
164 decoy_defline = greylag.DEFAULT_DECOY_PREFIX + locusname + ' FALSE POSITIVE'
166 add_sixmers(sixmers[0], sequence)
167 add_sixmers(sixmers[1], decoy_sequence)
169 for d, s in [(defline, sequence), (decoy_defline, decoy_sequence)]:
170 if options.no_original and d == defline:
171 continue
172 print '>' + d
173 if options.wrap:
174 for start in range(0, len(s), options.wrap):
175 print s[start:start+options.wrap]
176 else:
177 print s
180 def main():
181 parser = optparse.OptionParser(usage="usage: %prog [options] <file>",
182 description=__doc__)
183 parser.add_option("-r", "--reverse", action="store_true",
184 dest="reverse",
185 help="reverse the sequences, instead of shuffling")
186 parser.add_option("-m", "--markov", type="int", dest="markov_length",
187 help="generate Markov sequences, with memory of length"
188 " N, instead of shuffling", default=None, metavar="N")
189 parser.add_option("-s", "--seed", type="int", dest="seed",
190 help="seed for randomness [default 0]",
191 default=0, metavar="N")
192 parser.add_option("-n", "--no-original", action="store_true",
193 dest="no_original",
194 help="don't output original sequences")
195 parser.add_option("-v", "--verbose", action="store_true",
196 dest="verbose", help="be verbose")
197 DEFAULT_WRAP = 80
198 parser.add_option("-w", "--wrap", dest="wrap", type="int",
199 default=DEFAULT_WRAP,
200 help="wrap sequence to specified width"
201 " [default %s, 0 means don't wrap at all]" % DEFAULT_WRAP,
202 metavar="COLUMNS")
203 parser.add_option("--copyright", action="store_true", dest="copyright",
204 help="print copyright and exit")
205 (options, args) = parser.parse_args()
207 if (len(args) != 1
208 or options.markov_length != None and options.reverse
209 or options.markov_length != None and options.markov_length < 0):
210 parser.print_help()
211 sys.exit(1)
213 if options.markov_length != None:
214 decoy_maker = markov_decoy_maker(options.seed, options.markov_length,
215 args[0])
216 elif options.reverse:
217 decoy_maker = reverse_decoy_maker()
218 else:
219 decoy_maker = shuffle_decoy_maker(options.seed)
221 # locus id -> (defline, hash of sequence)
222 seen = {}
224 # real and decoy 6-mers seen
225 sixmers = (set(), set())
227 for locusname, defline, sequence, filename \
228 in greylag.read_fasta_files([args[0]]):
229 write_locus(options, decoy_maker, seen, sixmers,
230 locusname, defline, sequence)
232 common_sixmers = sixmers[0] & sixmers[1]
233 print >> sys.stderr, ("six-mers: %s real %s decoy %s both"
234 % (len(sixmers[0]) - len(common_sixmers),
235 len(sixmers[1]) - len(common_sixmers),
236 len(common_sixmers)))
239 if __name__ == '__main__':
240 main()