read/write pickle files in binary mode
[greylag.git] / greylag_validate.py
blobcdd887eee9fa28fe43aaa659bf313169249c7193
1 #!/usr/bin/env python
3 """
4 Given a set of sqt files, determine a 'valid' set of identifications that
5 satisfy the specified FDR (false discovery rate). Optionally rewrite the sqt
6 files with validation marks set to 'N' for filtered-out spectra.
8 Note that a peptide end with an invalid flanking residue (currently any of
9 BJOUXZ) is not considered tryptic if it's an N-terminal end and is considered
10 tryptic if it's a C-terminal end and otherwise qualifies. (Flanking '-' is
11 always considered tryptic.)
13 """
15 from __future__ import with_statement
17 __copyright__ = '''
18 greylag, a collection of programs for MS/MS protein analysis
19 Copyright (C) 2006-2008 Stowers Institute for Medical Research
21 This program is free software: you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation, either version 3 of the License, or
24 (at your option) any later version.
26 This program is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 GNU General Public License for more details.
31 You should have received a copy of the GNU General Public License
32 along with this program. If not, see <http://www.gnu.org/licenses/>.
34 Contact: Mike Coleman
35 Stowers Institute for Medical Research
36 1000 East 50th Street
37 Kansas City, Missouri 64110
38 USA
39 '''
42 from collections import defaultdict
43 import fileinput
44 import optparse
45 from pprint import pprint
46 import string
47 import sys
50 # allow this script to be run even if not "installed"
51 try:
52 from greylag import VERSION
53 except:
54 VERSION = 'unknown'
57 def warn(s):
58 print >> sys.stderr, 'warning:', s
59 def error(s):
60 sys.exit('error: ' + s)
61 def fileerror(s, *args):
62 error(s + (", at line %s of file '%s'"
63 % (fileinput.filelineno(), fileinput.filename())),
64 *args)
66 def inplace_warning():
67 warn("\n!!!\nan error occurred while modifying .sqt files in-place--it may"
68 " be necessary to recover some or all of the .sqt files from the"
69 " corresponding .sqt.bak files.\n!!!\n")
72 def reset_marks(options, sqt_fns):
73 """Rewrite all evaluation marks to 'U', in-place."""
75 try:
76 for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
77 if line.startswith("M\t"):
78 fs = line.split("\t")
79 if len(fs) >= 11:
80 fs[10] = 'U' + fs[10][1:]
81 line = '\t'.join(fs)
82 sys.stdout.write(line)
83 except:
84 inplace_warning()
85 raise
88 def mark(options, thresholds, sp_scores, sqt_fns):
89 """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
90 score and delta thresholds."""
92 spectrum_no = -1
93 mark_spectrum = False # mark this spectrum 'N'
95 try:
96 for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
97 if line.startswith("S\t"):
98 spectrum_no += 1
99 sps = sp_scores[spectrum_no]
100 if sps:
101 charge, score, delta = sp_scores[spectrum_no]
102 mark_spectrum = (charge in thresholds
103 and score != None
104 and (score < thresholds[charge][0]
105 or delta < thresholds[charge][1]))
106 else:
107 mark_spectrum = True
108 elif line.startswith("M\t") and mark_spectrum:
109 fs = line.split("\t")
110 if len(fs) >= 11:
111 fs[10] = 'N' + fs[10][1:]
112 line = '\t'.join(fs)
113 sys.stdout.write(line)
114 except:
115 inplace_warning()
116 raise
119 def kill(options, thresholds, sp_scores, sqt_fns):
120 """Remove, in-place, spectra not meeting score and delta thresholds."""
122 spectrum_no = -1
123 remove_spectrum = False # remove this spectrum
125 try:
126 for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
127 if fileinput.isfirstline():
128 remove_spectrum = False
129 if line.startswith("S\t"):
130 spectrum_no += 1
131 sps = sp_scores[spectrum_no]
132 if sps:
133 charge, score, delta = sp_scores[spectrum_no]
134 remove_spectrum = (charge in thresholds
135 and score != None
136 and (score < thresholds[charge][0]
137 or delta < thresholds[charge][1]))
138 else:
139 remove_spectrum = True
140 if not remove_spectrum:
141 sys.stdout.write(line)
142 except:
143 inplace_warning()
144 raise
147 nulltrans = string.maketrans('','')
148 non_aa_chars = string.digits + string.punctuation
150 def stripmods(s):
151 """Given a peptide, return just the unmodified peptide seen (upcased)."""
152 # assuming no control chars present
153 return s.upper().translate(nulltrans, non_aa_chars)
156 def read_sqt_info(decoy_prefix, minimum_trypticity, sqt_fns):
157 """Return a pair, the first a dict mapping each charge to a list of
158 (score, delta, state), where state is 'real' or 'decoy', for all the
159 spectra in sqt_fns, and the second a list of (charge, score, delta), one
160 for each spectrum. Spectra lacking minimum_trypticity are dropped.
163 # charge -> [ (score, delta, state, spectrum_name, peptide), ... ]
164 # where state is either 'real' or 'decoy'
165 # and peptide is the peptide as first seen, with mods, no flanks
166 z_scores = defaultdict(list)
168 # information about spectra, in file order--None iff spectrum was filtered
169 # out
170 # [ (charge, score, delta) or None, ... ]
171 sp_scores = []
173 current_charge = None
174 current_score = None
175 current_delta = 0
176 current_peptide_trypticity = None # or one of [ 0, 1, 2 ]
177 current_state = set() # 'real', 'decoy' or both
178 highest_rank_seen = 0
179 best_peptide_seen = None
180 current_peptide = None
181 spectrum_name = None
183 for line in fileinput.input(sqt_fns):
184 fs = line.split('\t')
185 if fs[0] == 'S':
186 sps = None
187 if (current_charge != None and
188 current_peptide_trypticity >= minimum_trypticity):
189 if current_score != None:
190 if len(current_state) == 1:
191 z_scores[current_charge].append((current_score,
192 current_delta,
193 current_state.pop(),
194 spectrum_name,
195 current_peptide))
196 sps = (current_charge, current_score, current_delta)
197 if current_charge != None:
198 sp_scores.append(sps)
199 current_charge = int(fs[3])
200 current_score = None
201 current_delta = 0
202 current_peptide_trypticity = None
203 current_state = set()
204 highest_rank_seen = 0
205 best_peptide_seen = None
206 current_peptide = None
207 spectrum_name = '.'.join((fileinput.filename(),
208 fs[1], fs[2], fs[3]))
209 elif fs[0] == 'M':
210 rank = int(fs[1])
211 delta = float(fs[4])
212 score = float(fs[5])
213 sp_rank = int(fs[2])
214 peptide = fs[9].strip() # e.g., A.B@CD*.-
215 mass = float(fs[3])
217 if rank > highest_rank_seen + 1:
218 # ignore aux top SpRank hits, because delta confounds
219 continue
220 highest_rank_seen = rank
222 assert peptide[1] == '.' and peptide[-2] == '.'
223 peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
224 peptide = peptide[2:-2] # 'B@CD*'
225 peptide_stripped = stripmods(peptide) # 'BCD'
227 if best_peptide_seen == None:
228 best_peptide_seen = peptide_stripped
229 if current_peptide == None:
230 current_peptide = peptide
232 # MyriMatch gives delta=0 when score=0, even if there are better
233 # scores!
234 if score > 0:
235 if delta == 0:
236 assert current_delta == 0
237 current_score = score
238 elif (current_delta == 0 or best_peptide_seen == None
239 or best_peptide_seen == peptide_stripped):
240 current_delta = delta
241 if best_peptide_seen != peptide_stripped:
242 # once we've seen a non-equivalent peptide, don't set
243 # delta if we see further equivalent peptides
244 best_peptide_seen = '####'
245 if current_peptide_trypticity == None:
246 current_peptide_trypticity = 0
247 if (peptide_flanks[0] in ('K', 'R')
248 and peptide_stripped[0] != 'P'
249 or peptide_flanks[0] == '-'):
250 current_peptide_trypticity += 1
251 if (peptide_stripped[-1] in ('K', 'R')
252 and peptide_flanks[1] != 'P'
253 or peptide_flanks[1] == '-'):
254 current_peptide_trypticity += 1
255 elif fs[0] == 'L':
256 if current_delta == 0:
257 if fs[1].startswith(decoy_prefix):
258 current_state.add('decoy')
259 else:
260 current_state.add('real')
262 # handle final spectrum, as above
263 # FIX: factor out this common code (using a generator?)
264 sps = None
265 if (current_charge != None and
266 current_peptide_trypticity >= minimum_trypticity):
267 if current_score != None:
268 if len(current_state) == 1:
269 z_scores[current_charge].append((current_score,
270 current_delta,
271 current_state.pop(),
272 spectrum_name,
273 current_peptide))
274 sps = (current_charge, current_score, current_delta)
275 if current_charge != None:
276 sp_scores.append(sps)
277 return (z_scores, sp_scores)
280 def PPV(reals, decoys):
281 """Returns the estimated Positive Predictive Value (== 1 - FDR), given
282 counts of reals and decoys."""
284 # We know that the decoys are false positives, and we estimate that an
285 # equal number of the "reals" are actually false, too.
286 false_positives = 2*decoys
287 true_positives = reals - decoys
289 return float(true_positives) / (true_positives + false_positives)
292 def calculate_inner_threshold(fdr, charge, spinfo):
293 ppv_goal = 1 - fdr
295 spinfo = sorted(spinfo, key=lambda x: x[0])
297 epsilon = +1e-6
299 real_count = sum(1 for x in spinfo if x[2] == 'real')
300 decoy_count = len(spinfo) - real_count
302 current_threshold = -1e100 # allow all spectra
303 for n, sp in enumerate(spinfo):
304 if real_count == 0:
305 return (None, real_count, decoy_count) # give up
307 ppv_est = PPV(real_count, decoy_count)
308 if ppv_est >= ppv_goal:
309 break
310 if sp[2] == 'real':
311 real_count -= 1
312 else:
313 decoy_count -= 1
314 # set threshold just high enough to exclude this spectrum
315 current_threshold = sp[0] + epsilon
316 else:
317 current_threshold = spinfo[-1][0] + epsilon # couldn't meet goal
319 return (current_threshold, real_count, decoy_count)
322 def calculate_combined_thresholds(options, z_scores):
323 """Find best score/delta thresholds for each charge."""
325 ppv_goal = 1 - options.fdr
327 # Rather than search every possible value of delta, we're only going to
328 # "sample" at this granularity. This cuts search time dramatically (and
329 # makes it O(n) instead of O(n**2). Extra precision wouldn't really be
330 # useful in any case.
331 SEARCH_GRANULARITY = 0.001
333 # charge -> (score, delta, passing_reals, passing_decoys)
334 thresholds = {}
336 total_reals, total_decoys = 0, 0
338 for charge, spinfo in z_scores.iteritems():
339 spinfo0 = sorted(spinfo, key=lambda x: x[1], reverse=True)
341 last_value = None
343 while spinfo0:
344 this_value = spinfo0[-1][1] # current delta
345 if (last_value == None
346 or abs(this_value - last_value) >= SEARCH_GRANULARITY):
348 # "inner" is score
349 r = calculate_inner_threshold(options.fdr, charge, spinfo0)
350 if r[0] != None:
351 if options.debug:
352 print '#', charge, r[0], this_value, r[1], r[2]
353 if (charge not in thresholds
354 or r[1] > thresholds[charge][2]):
355 thresholds[charge] = (r[0], this_value, r[1], r[2])
357 last_value = this_value
358 spinfo0.pop()
360 if charge in thresholds:
361 score_threshold, delta_threshold = thresholds[charge][0:2]
363 if options.list_peptides:
364 for (score, delta, state,
365 spectrum_name, peptide) in z_scores[charge]:
366 if score >= score_threshold and delta >= delta_threshold:
367 print "%+d %s %s %s" % (charge, spectrum_name, state,
368 peptide)
369 else:
370 reals, decoys = thresholds[charge][2], thresholds[charge][3]
371 total_reals += reals
372 total_decoys += decoys
373 print ("%+d: score %s, delta %s -> %s real ids, %s decoys"
374 " (fdr %.4f)"
375 % (charge, score_threshold, delta_threshold,
376 reals, decoys,
377 1 - PPV(thresholds[charge][2],
378 thresholds[charge][3])))
379 else:
380 warn("could not calculate thresholds for %+d" % charge)
382 if not options.list_peptides:
383 print "# total: %s real ids, %s decoys" % (total_reals, total_decoys)
385 return thresholds
388 def main(args=sys.argv[1:]):
389 parser = optparse.OptionParser(usage=
390 "usage: %prog [options] <sqt-file>...",
391 description=__doc__, version=VERSION)
392 pa = parser.add_option
393 DEFAULT_DECOY_PREFIX = "SHUFFLED_"
394 pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX,
395 help='prefix given to locus name of decoy (e.g., shuffled) database'
396 ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX")
397 DEFAULT_FDR = 0.01
398 pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
399 help="false discovery rate [default=%s] (Note that this is the"
400 " final resulting FDR after the decoys have been removed" % DEFAULT_FDR,
401 metavar="PROPORTION")
402 pa("-v", "--verbose", action="store_true", dest="verbose",
403 help="be verbose")
404 pa("--list-peptides", action="store_true", dest="list_peptides",
405 help="print information about passing peptides")
406 pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
407 default='0', dest="minimum_trypticity",
408 help="drop peptides with too few tryptic ends"
409 " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
410 #pa("--graph", dest="graph",
411 # help='create distribution graphs, using the specified prefix',
412 # metavar="PATH PREFIX")
413 pa("--debug", action="store_true", dest="debug",
414 help="show debug output")
415 pa("-m", "--mark", action="store_true", dest="mark",
416 help="rewrite the input files, changing some validation marks to 'N',"
417 " according to filtering")
418 pa("--reset-marks", action="store_true", dest="reset_marks",
419 help="rewrite the input files, changing all validation marks to 'U'")
420 pa("-k", "--kill", action="store_true", dest="kill",
421 help="rewrite the input files, removing spectra that don't pass"
422 " validation, according to filtering")
423 pa("--copyright", action="store_true", dest="copyright",
424 help="print copyright and exit")
425 (options, args) = parser.parse_args(args=args)
427 if options.copyright:
428 print __copyright__
429 sys.exit(0)
431 if len(args) < 1:
432 parser.print_help()
433 sys.exit(1)
435 if not (0.0 <= options.fdr < 0.5):
436 error("--fdr must be within range [0.0, 0.5)")
437 if (sum(1 for x in (options.mark, options.reset_marks, options.kill) if x)
438 > 1):
439 error("only one of --mark, --reset-marks, --kill may be specified")
441 if options.reset_marks:
442 reset_marks(options, args)
443 return
445 # This is to translate the "effective" FDR, which is the rate after decoys
446 # have been stripped out of the results (which is what users really care
447 # about), into our internal FDR, which includes decoys.
448 options.fdr *= 2
450 z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix,
451 int(options.minimum_trypticity),
452 args)
454 if options.debug:
455 print ('%s passing spectra, of which %s are not decoys'
456 % (len(spectrum_scores), sum(len(z_scores[x])
457 for x in z_scores)))
459 thresholds = calculate_combined_thresholds(options, z_scores)
461 if options.debug:
462 pprint(thresholds)
464 for charge, score, delta in spectrum_scores:
465 print '#S', charge, score, delta
467 if options.kill:
468 kill(options, thresholds, spectrum_scores, args)
470 if options.mark:
471 mark(options, thresholds, spectrum_scores, args)
474 if __name__ == '__main__':
475 main()