minor code reorg (use new Python if/else operator)
[greylag.git] / greylag_validate.py
blob150485df2a07e1eeeea091e6ff242444cb26a713
1 #!/usr/bin/env greylag-python
3 """
4 Given a set of sqt files, determine a 'valid' set of identifications that
5 satisfy the specified FDR (false discovery rate). Optionally rewrite the sqt
6 files with validation marks set to 'N' for filtered-out spectra.
8 """
10 from __future__ import with_statement
12 __copyright__ = '''
13 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
15 This program is free software; you can redistribute it and/or modify
16 it under the terms of the GNU General Public License as published by
17 the Free Software Foundation; either version 2 of the License, or
18 (at your option) any later version.
20 This program is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
25 You should have received a copy of the GNU General Public License along
26 with this program; if not, write to the Free Software Foundation, Inc.,
27 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
28 '''
30 __version__ = "0.0"
33 from collections import defaultdict
34 import fileinput
35 import optparse
36 from pprint import pprint
37 import string
38 import sys
41 def warn(s):
42 print >> sys.stderr, 'warning:', s
43 def error(s):
44 sys.exit('error: ' + s)
45 def fileerror(s, *args):
46 error(s + (", at line %s of file '%s'"
47 % (fileinput.filelineno(), fileinput.filename())),
48 *args)
50 def inplace_warning():
51 warn("!!!\nan error occurred while modifying .sqt files in-place--it may"
52 " be necessary to recover some or all of the .sqt files from the"
53 " corresponding .sqt.bak files.\n!!!")
56 def reset_marks(options, sqt_fns):
57 """Rewrite all evaluation marks to 'U', in-place."""
59 try:
60 for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
61 if line.startswith("M\t"):
62 fs = line.split("\t")
63 if len(fs) >= 11:
64 fs[10] = 'U' + fs[10][1:]
65 line = '\t'.join(fs)
66 sys.stdout.write(line)
67 except:
68 inplace_warning()
69 raise
72 def mark(options, thresholds, sp_scores, sqt_fns):
73 """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
74 score and delta thresholds."""
76 spectrum_no = -1
77 mark_spectrum = False
79 try:
80 for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
81 if line.startswith("S\t"):
82 spectrum_no += 1
83 charge, score, delta = sp_scores[spectrum_no]
84 mark_spectrum = (charge in thresholds
85 and score != None
86 and (score < thresholds[charge][0]
87 or delta < thresholds[charge][1]))
88 elif line.startswith("M\t") and mark_spectrum:
89 fs = line.split("\t")
90 if len(fs) >= 11:
91 fs[10] = 'N' + fs[10][1:]
92 line = '\t'.join(fs)
93 sys.stdout.write(line)
94 except:
95 inplace_warning()
96 raise
99 nulltrans = string.maketrans('','')
100 non_aa_chars = string.digits + string.punctuation
102 def stripmods(s):
103 """Given a peptide, return just the unmodified peptide seen (upcased)."""
104 # assuming no control chars present
105 return s.upper().translate(nulltrans, non_aa_chars)
108 def read_sqt_info(decoy_prefix, minimum_trypticity, sqt_fns):
109 """Return a pair, the first a dict mapping each charge to a list of
110 (score, delta, state), where state is 'real' or 'decoy', for all the
111 spectra in sqt_fns, and the second a list of (charge, score, delta), one
112 for each spectrum. Spectra lacking minimum_trypticity are dropped.
115 # charge -> [ (score, delta, state), ... ]
116 # where state is either 'real' or 'decoy'
117 z_scores = defaultdict(list)
119 # [ (charge, score, delta), ... ]
120 sp_scores = []
122 current_charge = None
123 current_score = None
124 current_delta = 0
125 current_peptide_trypticity = None # or one of [ 0, 1, 2 ]
126 current_state = set() # 'real', 'decoy' or both
127 highest_rank_seen = 0
128 best_peptide_seen = None
130 for line in fileinput.input(sqt_fns):
131 fs = line.split('\t')
132 if fs[0] == 'S':
133 if (current_charge != None and
134 current_peptide_trypticity >= minimum_trypticity):
135 if current_score != None:
136 if len(current_state) == 1:
137 z_scores[current_charge].append((current_score,
138 current_delta,
139 current_state.pop()))
140 sp_scores.append((current_charge, current_score,
141 current_delta))
142 current_charge = int(fs[3])
143 current_score = None
144 current_delta = 0
145 current_peptide_trypticity = None
146 current_state = set()
147 highest_rank_seen = 0
148 best_peptide_seen = None
149 elif fs[0] == 'M':
150 rank = int(fs[1])
151 delta = float(fs[4])
152 score = float(fs[5])
153 sp_rank = int(fs[2])
154 peptide = fs[9].strip() # e.g., A.B@CD*.-
155 mass = float(fs[3])
157 if rank > highest_rank_seen + 1:
158 # ignore aux top SpRank hits, because delta confounds
159 continue
160 highest_rank_seen = rank
162 assert peptide[1] == '.' and peptide[-2] == '.'
163 peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
164 peptide = peptide[2:-2] # 'B@CD*'
165 peptide_stripped = stripmods(peptide) # 'BCD'
167 if best_peptide_seen == None:
168 best_peptide_seen = peptide_stripped
170 if delta == 0:
171 assert current_delta == 0
172 current_score = score
173 elif (current_delta == 0 or best_peptide_seen == None
174 or best_peptide_seen == peptide_stripped):
175 current_delta = delta
176 if best_peptide_seen != peptide_stripped:
177 # once we've seen a non-equivalent peptide, don't set
178 # delta if we see further equivalent peptides
179 best_peptide_seen = '####'
180 if current_peptide_trypticity == None:
181 current_peptide_trypticity = 0
182 if (peptide_flanks[0] in ('K', 'R')
183 and peptide_stripped[0] != 'P'):
184 current_peptide_trypticity += 1
185 if (peptide_stripped[-1] in ('K', 'R')
186 and peptide_flanks[1] != 'P'):
187 current_peptide_trypticity += 1
188 elif fs[0] == 'L':
189 if current_delta == 0:
190 if fs[1].startswith(decoy_prefix):
191 current_state.add('decoy')
192 else:
193 current_state.add('real')
195 # handle final spectrum, as above
196 if (current_charge != None and
197 current_peptide_trypticity >= minimum_trypticity):
198 if current_score != None:
199 if len(current_state) == 1:
200 z_scores[current_charge].append((current_score,
201 current_delta,
202 current_state.pop()))
203 sp_scores.append((current_charge, current_score,
204 current_delta))
206 return (z_scores, sp_scores)
209 def specificity(positives, negatives):
210 return float(positives - negatives) / positives
213 def calculate_inner_threshold(fdr, charge, spinfo):
214 specificity_goal = 1 - fdr
216 spinfo = sorted(spinfo, key=lambda x: x[0])
218 epsilon = -1e-9
220 real_count = sum(1 for x in spinfo if x[-1] == 'real')
221 decoy_count = len(spinfo) - real_count
223 current_threshold = -1e100 # allow all spectra
224 for n, sp in enumerate(spinfo):
225 if real_count == 0:
226 return (None, real_count, decoy_count) # give up
228 specificity_est = specificity(real_count, decoy_count)
229 if specificity_est >= specificity_goal:
230 break
231 if sp[-1] == 'real':
232 real_count -= 1
233 else:
234 decoy_count -= 1
235 # set threshold just high enough to exclude this spectrum
236 current_threshold = sp[0] + epsilon
237 else:
238 current_threshold = spinfo[-1][0] + epsilon # couldn't meet goal
240 return (current_threshold, real_count, decoy_count)
243 def calculate_combined_thresholds(options, z_scores):
244 """Find best score/delta thresholds for each charge."""
246 specificity_goal = 1 - options.fdr
248 # Rather than search every possible value of delta, we're only going to
249 # "sample" at this granularity. This cuts search time dramatically (and
250 # makes it O(n) instead of O(n**2). Extra precision wouldn't really be
251 # useful in any case.
252 SEARCH_GRANULARITY = 0.001
254 # charge -> (score, delta, passing_reals, passing_decoys)
255 thresholds = {}
257 for charge, spinfo in z_scores.iteritems():
258 spinfo0 = sorted(spinfo, key=lambda x: x[1], reverse=True)
260 last_value = None
262 while spinfo0:
263 this_value = spinfo0[-1][1] # current delta
264 if (last_value == None
265 or abs(this_value - last_value) >= SEARCH_GRANULARITY):
267 # "inner" is score
268 r = calculate_inner_threshold(options.fdr, charge, spinfo0)
269 if r[0] != None:
270 if options.debug:
271 print '#', charge, r[0], this_value, r[1], r[2]
272 if (charge not in thresholds
273 or r[1] > thresholds[charge][2]):
274 thresholds[charge] = (r[0], this_value, r[1], r[2])
276 last_value = this_value
277 spinfo0.pop()
279 if charge in thresholds:
280 print ("%+d: score %s, delta %s -> %s real ids, %s decoys"
281 " (fdr %.4f)"
282 % (charge, thresholds[charge][0], thresholds[charge][1],
283 thresholds[charge][2], thresholds[charge][3],
284 1 - specificity(thresholds[charge][2],
285 thresholds[charge][3])))
286 else:
287 warn("could not calculate thresholds for %+d" % charge)
289 return thresholds
292 def main(args=sys.argv[1:]):
293 parser = optparse.OptionParser(usage=
294 "usage: %prog [options] <sqt-file>...",
295 description=__doc__, version=__version__)
296 pa = parser.add_option
297 DEFAULT_DECOY_PREFIX = "SHUFFLED_"
298 pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX,
299 help='prefix given to locus name of decoy (e.g., shuffled) database'
300 ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX")
301 DEFAULT_FDR = 0.02
302 pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
303 help="false discovery rate [default=%s]" % DEFAULT_FDR,
304 metavar="PROPORTION")
305 pa("-v", "--verbose", action="store_true", dest="verbose",
306 help="be verbose")
307 pa("--list-peptides", action="store_true", dest="list_peptides",
308 help="print information about passing peptides")
309 pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
310 default='0', dest="minimum_trypticity",
311 help="drop peptides with too few tryptic ends"
312 " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
313 #pa("--graph", dest="graph",
314 # help='create distribution graphs, using the specified prefix',
315 # metavar="PATH PREFIX")
316 pa("--debug", action="store_true", dest="debug",
317 help="show debug output")
318 pa("-m", "--mark", action="store_true", dest="mark",
319 help="rewrite the input files, changing some validation marks to 'N',"
320 " according to filtering")
321 pa("--reset-marks", action="store_true", dest="reset_marks",
322 help="rewrite the input files, changing all validation marks to 'U'")
323 pa("--copyright", action="store_true", dest="copyright",
324 help="print copyright and exit")
325 (options, args) = parser.parse_args(args=args)
327 if options.copyright:
328 print __copyright__
329 sys.exit(0)
331 if len(args) < 1:
332 parser.print_help()
333 sys.exit(1)
335 if not (0.0 <= options.fdr <= 1.0):
336 error("--fdr must be within range [0.0, 1.0]")
337 if options.mark and options.reset_marks:
338 error("only one of --mark and --reset-marks may be specified")
340 if options.reset_marks:
341 reset_marks(options, args)
342 return
344 z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix,
345 int(options.minimum_trypticity),
346 args)
348 if options.debug:
349 print ('%s passing spectra, of which %s are not decoys'
350 % (len(spectrum_scores), sum(len(z_scores[x])
351 for x in z_scores)))
353 thresholds = calculate_combined_thresholds(options, z_scores)
355 if options.debug:
356 pprint(thresholds)
358 for charge, score, delta in spectrum_scores:
359 print '#S', charge, score, delta
361 if options.mark:
362 mark(options, thresholds, spectrum_scores, args)
365 if __name__ == '__main__':
366 main()