#!/usr/bin/env python

"""
Given a set of sqt files, determine a 'valid' set of identifications that
satisfy the specified FDR (false discovery rate) and other specified criteria.
Optionally rewrite the sqt files with invalid spectra marked 'N' or deleted
altogether.

Note that a peptide end with an invalid flanking residue (currently any of
BJOUXZ) is not considered tryptic if it's an N-terminal end and is considered
tryptic if it's a C-terminal end and otherwise qualifies.  (Flanking '-' is
always considered tryptic.)

"""

from __future__ import with_statement

__copyright__ = '''
greylag, a collection of programs for MS/MS protein analysis
Copyright (C) 2006-2008  Stowers Institute for Medical Research

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Contact: Mike Coleman
         Stowers Institute for Medical Research
         1000 East 50th Street
         Kansas City, Missouri  64110
         USA
'''


from collections import defaultdict
import itertools
import logging; from logging import debug, info, warning
import optparse
import os
from pprint import pprint
import re
import string
import sys
import time


from greylag import VERSION, set_logging


def error(s, *args):
    "fatal error"
    # if we're unit testing, just throw an exception
    if __name__ != "__main__":
        raise Exception((s + " (fatal error)") % args)
    logging.error(s, *args)
    sys.exit(1)
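
# For orientation, generate_spectra_from_files (below) walks tab-delimited SQT
# text whose line types look roughly like this (a schematic sketch; only the
# fields this script actually reads are named, the rest are elided):
#
#   H  <header lines, passed through unchanged>
#   S  <low scan> <high scan> <charge> ... ... <experimental mass> ...
#   M  <rank> <Sp rank> <calculated mass> ... <score> ... ... ... <X.PEPTIDE.Y> <mark>
#   L  <locus name> ...
#
# Each S line starts a spectrum, each M line is one candidate match (its last
# field is the validation mark), and each L line names a locus containing the
# matched peptide.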


def generate_spectra_from_files(sqt_filenames):
    """Yield tuples (filename, H+_lines, S_line, [(M_line, mod_lines, L_lines),
    ...]) for the given SQT files.  H+_lines, for example, is a list
    containing H lines and possibly other kinds of lines up to the next S
    line.  (mod_lines are non-L lines following an M-line.)
    """
    for sqt_filename in sqt_filenames:
        with open(sqt_filename) as sqtf:
            H_lines = []
            S_line, ML_lines = None, []
            for line in sqtf:
                if line.startswith('L\t'):
                    ML_lines[-1][2].append(line)
                elif line.startswith('M\t'):
                    ML_lines.append((line, [], []))
                elif line.startswith('S\t'):
                    if S_line:
                        yield (sqt_filename, H_lines, S_line, ML_lines)
                        S_line, ML_lines = None, []
                    S_line = line
                else:
                    if not S_line:
                        # read headers up to first S line
                        H_lines.append(line)
                    else:
                        # other kind of line following M line
                        ML_lines[-1][1].append(line)
            if S_line:
                yield (sqt_filename, H_lines, S_line, ML_lines)


def write_spectra_to_files(spectra, filenames):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', write them to the implied files.  If any of
    filenames already exist, they are renamed with a '.bak' suffix.  Temp
    files are written first, then renamed at the end.  (The filenames
    parameter is necessary because spectra may not contain any entries for a
    given filename, thus necessitating its removal.)
    """

    tmp_suffix = '.tmp.%s' % time.time()

    # filename -> file
    filemap = {}

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        sqtf = filemap.get(sqt_filename)
        if sqtf is None:
            sqtf = filemap[sqt_filename] = open(sqt_filename + tmp_suffix, 'w')
            for h in H_lines:
                sqtf.write(h)
        sqtf.write(S_line)
        for m_line, mod_lines, l_lines in ML_lines:
            sqtf.write(m_line)
            for m in mod_lines:
                sqtf.write(m)
            for l in l_lines:
                sqtf.write(l)

    for fd in filemap.values():
        fd.close()

    assert set(filemap.keys()) <= set(filenames)
    for fn in filenames:
        if os.path.exists(fn):
            os.rename(fn, fn + '.bak')
        if os.path.exists(fn + tmp_suffix):
            os.rename(fn + tmp_suffix, fn)


def reset_marks(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) reset to U.

    >>> list(reset_marks([('fn', [], 'S...', [('M... Y', [], [])])]))
    [('fn', [], 'S...', [('M... U', [], [])])]

    """

    def reset_mark(m_line):
        return re.sub(r'(\s+)\S(\s*)$', r'\1U\2', m_line, 1)

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        ML_lines_1 = [ (reset_mark(M_line), mod_lines, L_lines)
                       for M_line, mod_lines, L_lines in ML_lines ]
        yield (sqt_filename, H_lines, S_line, ML_lines_1)


def set_marks(valid_spectrum_info, spectra, kill=False):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) set to 'N', for spectra not listed in
    valid_spectrum_info.  If 'kill', omit marked spectra altogether.

    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])]))
    [('fn', [], 'S 000...', [('M... U', [], [])]), ('fn', [], 'S 111...', [('M... N', [], [])])]
    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])],
    ...                kill=True))
    [('fn', [], 'S 000...', [('M... U', [], [])])]

    """

    def set_mark(m_line):
        return re.sub(r'\S(\s*)$', r'N\1', m_line, 1)

    passing_spectrum_numbers = frozenset(si[0] for si in valid_spectrum_info)

    for spectrum_no, (sqt_filename, H_lines, S_line, ML_lines) \
            in enumerate(spectra):
        mark_spectrum = spectrum_no not in passing_spectrum_numbers
        if mark_spectrum and not kill:
            ML_lines = [ (set_mark(M_line), mod_lines, L_lines)
                         for M_line, mod_lines, L_lines in ML_lines ]
        if not (kill and mark_spectrum):
            yield (sqt_filename, H_lines, S_line, ML_lines)


nulltrans = string.maketrans('', '')
non_aa_chars = string.digits + string.punctuation

def strip_mods(s):
    """Given a peptide, return just the unmodified peptide seen.

    >>> strip_mods('G*REY@LAG')
    'GREYLAG'
    >>> strip_mods('GREYLAG')
    'GREYLAG'

    """
    # assuming no control chars present
    return s.translate(nulltrans, non_aa_chars)


def get_spectrum_info(options, spectra):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield an info tuple for each spectrum that
    passes the criteria specified in options (e.g., options.minimum_trypticity
    and options.maximum_sp_rank).
    """

    for spectrum_no, (sqt_filename,
                      H_lines, S_line, ML_lines) in enumerate(spectra):
        S_fields = S_line.split('\t')
        current_charge = int(S_fields[3])
        actual_mass = float(S_fields[6])
        spectrum_name = '.'.join([os.path.splitext(sqt_filename)[0]]
                                 + S_fields[1:4])

        current_score = None
        current_delta = None
        current_peptide_trypticity = None
        current_sp_rank = None
        current_peptide_mass = None
        current_state = set()
        current_loci = set()
        highest_rank_seen = 0
        best_peptide_seen = None
        current_peptide = None

        for M_line, mod_lines, L_lines in ML_lines:
            M_fields = M_line.split('\t')

            rank = int(M_fields[1])
            sp_rank = int(M_fields[2])
            peptide_mass = float(M_fields[3])
            score = float(M_fields[5])
            peptide = M_fields[9].strip().upper() # e.g., A.B@CD*.-
            assert score >= 0 and rank >= 1 and sp_rank >= 1

            if rank > highest_rank_seen + 1:
                # ignore aux top SpRank hits, because delta confounds
                break
            highest_rank_seen = rank

            if current_score is None:
                if score <= 0:
                    break           # spectrum is total crap
                current_score = score

            # Don't trust delta from input file.
            delta = (current_score - score) / current_score

            assert peptide[1] == '.' and peptide[-2] == '.'
            peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
            peptide = peptide[2:-2]                    # 'B@CD*'

            if current_peptide is None:
                current_peptide = peptide
                current_peptide_mass = peptide_mass
                current_peptide_trypticity = 0
                # NB: K*, R* not tryptic!
                if (peptide_flanks[0] in ('K', 'R') and peptide[0] != 'P'
                    or peptide_flanks[0] == '-'):
                    current_peptide_trypticity += 1
                if (peptide[-1] in ('K', 'R') and peptide_flanks[1] != 'P'
                    or peptide_flanks[1] == '-'):
                    current_peptide_trypticity += 1
                current_sp_rank = sp_rank

            # Choose delta from the first M line whose peptide differs from
            # the peptide in the initial (top) M line.  We consider differently
            # modified versions of the same peptide to be different (at least
            # for now).  But, if two peptides differ only in their choice of
            # I (isoleucine) versus L (leucine), consider them identical and
            # keep looking for a delta.
            if peptide.replace('I', 'L') != current_peptide.replace('I', 'L'):
                current_delta = delta
                break

            for L_line in L_lines:
                locus = L_line.split('\t')[1]
                current_loci.add(locus)
                if locus.startswith(options.decoy_prefix):
                    current_state.add('decoy')
                else:
                    current_state.add('real')

        if (current_score is not None and current_delta is not None
            and current_peptide_trypticity >= options.minimum_trypticity
            and current_sp_rank <= options.maximum_sp_rank):

            if len(current_state) == 2:
                current_state = 'both'
            else:
                assert len(current_state) == 1
                current_state = current_state.pop()

            yield (spectrum_no, spectrum_name, current_charge, current_score,
                   current_delta, current_state, current_peptide,
                   strip_mods(current_peptide), actual_mass,
                   actual_mass - current_peptide_mass, frozenset(current_loci))
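
# The info tuples yielded above (and consumed by everything below) have this
# layout, derived from the yield statement in get_spectrum_info:
#
#   si[0]  spectrum_no        si[6]  peptide (with mods)
#   si[1]  spectrum_name      si[7]  stripped peptide
#   si[2]  charge             si[8]  actual (experimental) mass
#   si[3]  score              si[9]  actual mass - peptide mass
#   si[4]  delta              si[10] frozenset of loci
#   si[5]  state ('real', 'decoy', or 'both')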


# FIX: can this be more succinct?
def remove_charge_aliases(spectrum_generator):
    """Given a generator of spectra like get_spectrum_info, for spectra that
    received multiple id's at different charges, yield only the best id.
    (Probably unnecessary unless the precursor tolerance is wide.)
    """

    sp_info_0 = None
    for sp_info_1 in spectrum_generator:
        assert sp_info_1[2] < 10, "single-digit charge"
        if not sp_info_0:
            sp_info_0 = sp_info_1
            continue
        if sp_info_0[1][:-1] != sp_info_1[1][:-1]:
            yield sp_info_0
            sp_info_0 = sp_info_1
            continue
        # if at least one has a sufficiently small mass delta, choose the
        # one with the smaller delta
        if abs(sp_info_0[9]) < abs(sp_info_1[9]) < 10:
            continue
        if abs(sp_info_1[9]) < abs(sp_info_0[9]) < 10:
            sp_info_0 = sp_info_1
            continue
        # otherwise choose the better-scoring one
        if sp_info_0[3] < sp_info_1[3]:
            sp_info_0 = sp_info_1

    if sp_info_0:
        yield sp_info_0
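
# A hypothetical illustration of the alias rule above: if the same spectrum is
# identified at charge +2 with |mass delta| 0.3 and at charge +3 with
# |mass delta| 2.8, both deltas are under 10, so the +2 id is kept for its
# smaller delta.  If either |mass delta| were 10 or more, the comparison would
# fall through to the score test instead.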


# General algorithm:
# - repeat, adjusting fdr upwards (or downwards?) to meet user goal
#   repeat until set of valid spectra converges:
#   - do pr filter of spectra
#   - if not first time, and no spectra filtered out, break
#   - separately for each charge, choose valid spectra to maximize real ids
#     subject to FDR
#   - saturate (locally or completely)


def PPV(reals, decoys):
    """Returns the estimated Positive Predictive Value (== 1 - FDR), given
    counts of reals and decoys.

    >>> PPV(100, 100)
    0.0
    >>> PPV(99, 100)
    0.0
    >>> PPV(100, 0)
    1.0
    >>> str(PPV(95, 5))
    '0.9'

    """

    # We know that the decoys are false positives, and we estimate that an
    # equal number of the "reals" are actually false, too.
    false_positives = 2*decoys
    true_positives = reals - decoys

    if true_positives <= 0:
        return 0.0

    return float(true_positives) / (true_positives + false_positives)
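
# Spelled out, the estimate above reduces to
#
#   PPV = (reals - decoys) / (reals + decoys)
#   FDR = 1 - PPV = 2*decoys / (reals + decoys)
#
# so, for example, 95 reals and 5 decoys give PPV 0.9 and an estimated FDR of
# 0.1.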


def calculate_inner_threshold(fdr, spinfo):
    ppv_goal = 1 - fdr

    spinfo = sorted(spinfo, key=lambda x: x[3])

    epsilon = +1e-6

    real_count = sum(1 for x in spinfo if x[5] == 'real')
    decoy_count = sum(1 for x in spinfo if x[5] == 'decoy')

    current_threshold = -1e100      # allow all spectra
    for n, sp in enumerate(spinfo):
        if real_count == 0:
            return (None, real_count, decoy_count) # give up

        ppv_est = PPV(real_count, decoy_count)
        if ppv_est >= ppv_goal:
            break
        if sp[5] == 'real':
            real_count -= 1
        elif sp[5] == 'decoy':
            decoy_count -= 1
        # set threshold just high enough to exclude this spectrum
        current_threshold = sp[3] + epsilon
    else:
        current_threshold = spinfo[-1][3] + epsilon # couldn't meet goal

    return (current_threshold, real_count, decoy_count)
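
# A small worked example of the walk above (counts hypothetical): with
# fdr=0.1 (ppv_goal 0.9) and, say, 95 reals and 10 decoys, the initial
# estimate PPV(95, 10) = 85/105 is below goal, so the lowest-scoring spectra
# are discarded one at a time (decrementing whichever count applies) until
# PPV(real_count, decoy_count) reaches 0.9; the returned score threshold sits
# just (epsilon) above the score of the last spectrum discarded.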


def calculate_combined_thresholds(fdr, spectrum_info):
    """Find best score/delta thresholds for each charge."""

    ppv_goal = 1 - fdr

    # Rather than search every possible value of delta, we're only going to
    # "sample" at this granularity.  This cuts search time dramatically (and
    # makes it O(n) instead of O(n**2)).  Extra precision wouldn't really be
    # useful in any case.
    SEARCH_GRANULARITY = 0.001

    # charge -> (score, delta, passing_reals, passing_decoys)
    thresholds = {}

    by_charge = lambda x: x[2]
    spectrum_info.sort(key=by_charge)
    for charge, spinfo in itertools.groupby(spectrum_info, key=by_charge):
        spinfo0 = sorted(spinfo, key=lambda x: x[4], reverse=True)

        last_value = None

        while spinfo0:
            this_value = spinfo0[-1][4] # current delta
            if (last_value is None
                or abs(this_value - last_value) >= SEARCH_GRANULARITY):

                # "inner" is score
                r = calculate_inner_threshold(fdr, spinfo0)
                threshold, real_count, decoy_count = r
                if threshold is not None:
                    debug('# %s %s %s %s %s', charge, threshold, this_value,
                          real_count, decoy_count)
                    if (charge not in thresholds
                        or real_count > thresholds[charge][2]):
                        thresholds[charge] = (threshold, this_value,
                                              real_count, decoy_count)
                last_value = this_value
            spinfo0.pop()

    return thresholds
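
# The mapping returned above has one entry per precursor charge, e.g.
# (hypothetical values):
#
#   thresholds = {2: (1.9172, 0.081, 3046, 21),
#                 3: (2.5013, 0.094, 1852, 14)}
#
# try_fdr (below) then keeps a spectrum only if its charge has an entry and
# its score and delta are both at least that entry's first two values.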


def flatten(gen_of_gens):
    """Yield items from a generator of generators.

    >>> list(flatten([(1,2,3), (4, (5, 'b'), 6)]))
    [1, 2, 3, 4, (5, 'b'), 6]

    """

    for gen in gen_of_gens:
        for item in gen:
            yield item


def redundancy_filter(options, spectrum_info):
    if options.minimum_spectra_per_locus < 2:
        return [ si for si in spectrum_info ]

    # si = (spectrum_no, spectrum_name, charge, score, delta, state,
    #       peptide, stripped_peptide, actual_mass, mass_delta, loci)

    Rs = sorted(flatten(((locus, si) for locus in si[10])
                        for si in spectrum_info))

    result = set()
    for locus, locus_Rs in itertools.groupby(Rs, key=lambda x: x[0]):
        list_locus_si = [ si for locus, si in locus_Rs ]
        if len(list_locus_si) < options.minimum_spectra_per_locus:
            continue
        result |= set(list_locus_si)
    return sorted(result)
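
# In other words (hypothetical example): with --minimum-spectra-per-locus=2,
# a spectrum whose only locus is matched by just that one spectrum is dropped,
# while spectra sharing a locus with at least one other spectrum survive.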


def saturate_by_prefix_suffix(spectrum_info, saturation_spectrum_info):
    PREFIX_LENGTH = 16              # empirically seems good
    peptide_prefixes = set(si[7][:PREFIX_LENGTH] for si in spectrum_info)
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7][:PREFIX_LENGTH] in peptide_prefixes)

    info("saturation by prefix (w/%s): %s -> %s",
         len(saturation_spectrum_info), len(spectrum_info), len(sat_si))
    return sat_si


def saturate(spectrum_info, saturation_spectrum_info):
    peptides = set(si[7] for si in spectrum_info)
    # FIX: does this actually need to be sorted?
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7] in peptides)
    info("saturation (w/%s): %s -> %s", len(saturation_spectrum_info),
         len(spectrum_info), len(sat_si))
    return saturate_by_prefix_suffix(sat_si, saturation_spectrum_info)
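
# Saturation, as implemented above, widens the accepted set: any spectrum from
# the larger saturation pool whose stripped peptide (or, in the second pass,
# its 16-residue prefix) already appears among the accepted spectra is pulled
# back in, even if it did not itself pass the thresholds.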


def fdr_stats(spectrum_info):
    reals = sum(1 for si in spectrum_info if si[5] == 'real')
    decoys = sum(1 for si in spectrum_info if si[5] == 'decoy')
    fdr = 1 - PPV(reals, decoys)
    return fdr, reals, decoys


def debug_fdr(options, msg, si):
    if options.verbose:
        fdr, reals, decoys = fdr_stats(si)
        info('%s %s %s %s', msg, fdr, reals, decoys)


def try_fdr(fdr_guess, options, remaining_spectrum_info,
            saturation_spectrum_info=None):
    info("trying FDR = %s", fdr_guess)
    debug_fdr(options, "initial", remaining_spectrum_info)

    remaining_spectrum_info_1 = None
    while True:
        thresholds = calculate_combined_thresholds(fdr_guess,
                                                   remaining_spectrum_info)
        remaining_spectrum_info = [ si for si in remaining_spectrum_info
                                    if (si[2] in thresholds
                                        and si[3] >= thresholds[si[2]][0]
                                        and si[4] >= thresholds[si[2]][1]) ]
        debug_fdr(options, "after thresholding", remaining_spectrum_info)
        if remaining_spectrum_info == remaining_spectrum_info_1:
            break
        remaining_spectrum_info_1 = redundancy_filter(options,
                                                      remaining_spectrum_info)
        debug_fdr(options, "after redundancy", remaining_spectrum_info_1)
        if len(remaining_spectrum_info_1) == len(remaining_spectrum_info):
            break
        remaining_spectrum_info = remaining_spectrum_info_1

    fdr_result, total_reals, total_decoys = fdr_stats(remaining_spectrum_info_1)
    if not options.no_saturation and saturation_spectrum_info:
        remaining_spectrum_info_1 = saturate(remaining_spectrum_info_1,
                                             saturation_spectrum_info)
        debug_fdr(options, "after saturation", remaining_spectrum_info_1)
        (fdr_result,
         total_reals, total_decoys) = fdr_stats(remaining_spectrum_info_1)

    return (fdr_result, thresholds, total_reals, total_decoys,
            remaining_spectrum_info_1)


# FIX: at various points here we may have a goal of a minimum number of real
# ids, based on goal FDR or whatever.  We can pass this inward to do
# branch-and-bound, which may save time.

# FIX: this is now somewhat slower than before--any way to speed it up?

def search_adjusting_fdr(options, spectrum_info_0):
    """Repeatedly calculate thresholds while adjusting FDR, looking for an FDR
    guess that results in an appropriate goal FDR.  This might be needed
    because the thresholds found for per-charge FDRs will be saturated or have
    quantization errors, resulting in overly conservative thresholds.  Returns
    a list of items from spectrum_info that are considered valid.
    """

    FDR_INFLATION_FACTOR = 3
    assert FDR_INFLATION_FACTOR > 1

    spectrum_info = redundancy_filter(options, spectrum_info_0)

    low_fdr = options.fdr
    # FIX: spectrum_info_0 has already been filtered for tryptic status.
    # Should we use an unfiltered set instead?
    low_results = try_fdr(low_fdr, options, spectrum_info, spectrum_info_0)

    if not options.no_adjust_fdr:
        high_fdr = min(1.0, options.fdr * FDR_INFLATION_FACTOR)
        high_results = try_fdr(high_fdr, options, spectrum_info,
                               spectrum_info_0)
        initial_total_reals = low_results[2]

        for i in range(32):
            debug("adjust infl fdr %s %s -> %s", low_fdr, high_fdr,
                  high_results[0])
            if high_results[0] >= options.fdr:
                break
            low_fdr, low_results = high_fdr, high_results
            high_fdr = min(1.0, high_fdr * FDR_INFLATION_FACTOR)
            if high_fdr == 1.0:
                break
            high_results = try_fdr(high_fdr, options, spectrum_info,
                                   spectrum_info_0)
        else:
            warning("FDR adjustment inflation failed")

        # bisection: l(ow) and h(igh) bracket the goal FDR; g(uess) is the
        # midpoint tried next

        CONVERGENCE_FACTOR = 0.01
        assert CONVERGENCE_FACTOR > 0
        for i in range(32):
            assert high_fdr >= low_fdr
            if (high_fdr - low_fdr) <= CONVERGENCE_FACTOR*options.fdr:
                break

            # - seems to be slower than the naive method?
            # Bias is empirically determined.  It's supposed to speed
            # convergence, but should not affect the results.
            # FIX: better method possible?
            #GUESS_BIAS = 1.15
            #prop = ((options.fdr - low_results[0])
            #        / (high_results[0] - low_results[0]))
            #prop = min(1, max(0, prop))
            #guess_fdr = low_fdr + prop**GUESS_BIAS * (high_fdr - low_fdr)

            guess_fdr = (high_fdr + low_fdr) / 2.0

            guess_results = try_fdr(guess_fdr, options, spectrum_info,
                                    spectrum_info_0)
            debug("adjust fdr %s %s %s -> %s", low_fdr, guess_fdr, high_fdr,
                  guess_results[0])
            if guess_results[0] > options.fdr:
                high_fdr, high_results = guess_fdr, guess_results
            else:
                low_fdr, low_results = guess_fdr, guess_results
        else:
            warning("FDR adjustment convergence failed")

    if not options.no_adjust_fdr:
        info("FDR adjustment found %s extra real ids",
             low_results[2] - initial_total_reals)

    return low_results
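
# The adjustment above works in two phases: the internal FDR guess is first
# inflated by FDR_INFLATION_FACTOR until the resulting measured FDR overshoots
# the requested goal (or the guess reaches 1.0), and the bracket between the
# last under- and over-shooting guesses is then bisected until its width is at
# most CONVERGENCE_FACTOR times the goal FDR.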


def search_with_possible_saturation(options, spectrum_info):

    for saturate in (True, False):
        if saturate and options.no_saturation:
            continue
        results = search_adjusting_fdr(options, spectrum_info)
        (fdr_result, thresholds,
         total_reals, total_decoys, valid_spectrum_info) = results
        if fdr_result <= options.fdr:
            break
        if saturate:
            info('could not achieve FDR with saturation, retrying without')
            options.no_saturation = True

    if options.output_peptides:
        valid_spectrum_info.sort(key=lambda x: x[1])
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            print >> options.output_peptides, \
                ("%+d %s %s %s %.8f %.8f"
                 % (charge, spectrum_name, state, peptide, actual_mass,
                    mass_delta))

    for charge in sorted(thresholds.keys()):
        score_threshold, delta_threshold = thresholds[charge][0:2]
        reals, decoys = thresholds[charge][2:4]
        info("%+d: score %f, delta %.6f -> %s real ids, %s decoys (fdr %.4f)",
             charge, score_threshold, delta_threshold, reals, decoys,
             0.5 * (1 - PPV(reals, decoys)))

    if not options.quiet:
        print ("%s real ids, %s decoys (FDR = %.4f) [p%s]"
               % (total_reals, total_decoys, fdr_result / 2.0,
                  options.minimum_spectra_per_locus))

    return valid_spectrum_info


def print_mass_error_histogram(valid_spectrum_info):
    hist = defaultdict(int)

    for (spectrum_no, spectrum_name, charge, score, delta, state, peptide,
         stripped_peptide, actual_mass, mass_delta,
         loci) in valid_spectrum_info:
        hist[round(mass_delta)] += 1

    pairs_most_first = sorted(((bin, hist[bin]) for bin in hist),
                              key=lambda x: x[1], reverse=True)

    total = sum(count for bin, count in pairs_most_first)

    cumulative_total = 0
    for bin, count in pairs_most_first:
        cumulative_total += count
        print '# %+d\t%s\t%.3f' % (bin, count,
                                   (float(cumulative_total) / total))


def main(args=sys.argv[1:]):
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <sqt-file>...",
                                   description=__doc__, version=VERSION)
    pa = parser.add_option
    DEFAULT_DECOY_PREFIX = "SHUFFLED_"
    pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX,
       help='prefix given to locus name of decoy (e.g., shuffled) database'
       ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX")
    DEFAULT_FDR = 0.01
    pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
       help="false discovery rate [default=%s] (Note that this is the"
       " final resulting FDR after the decoys have been removed.)" % DEFAULT_FDR,
       metavar="PROPORTION")
    pa("--no-adjust-fdr", action="store_true", dest="no_adjust_fdr",
       help="skip adjustment of internal FDR to achieve a final FDR closer to"
       " requested FDR (which takes longer, but may find more real ids in"
       " some cases)")
    pa("--no-saturation", action="store_true", dest="no_saturation",
       help="skip peptide saturation")
    pa("-v", "--verbose", action="store_true", dest="verbose",
       help="be verbose")
    pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
    pa("-l", "--logfile", dest="logfile",
       help="log to FILE instead of stderr", metavar="FILE")
    pa("--output-peptides", dest="output_peptides",
       help="output information about passing peptides to specified file"
       " ('-' for stdout)", metavar="FILENAME")
    pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
       default='0', dest="minimum_trypticity",
       help="drop peptides with too few tryptic ends"
       " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
    DEFAULT_MINIMUM_SPECTRA_PER_LOCUS = 1
    pa("-p", "--minimum-spectra-per-locus", dest="minimum_spectra_per_locus",
       type="int", default=DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
       help="only loci with at least this many spectra are included, and"
       " spectra not associated with such a locus are excluded"
       " [default=%s]" % DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
       metavar="COUNT")
    DEFAULT_MAXIMUM_SP_RANK = 1000000
    pa("--maximum-sp-rank", dest="maximum_sp_rank", type="int",
       default=DEFAULT_MAXIMUM_SP_RANK,
       help="drop peptides with greater Sp rank (the secondary rank in sqt"
       " files, for compatibility with other search programs)"
       " [default=%s]" % DEFAULT_MAXIMUM_SP_RANK, metavar="RANK")
    #pa("--graph", dest="graph",
    #   help='create distribution graphs, using the specified prefix',
    #   metavar="PATH PREFIX")
    pa("-M", "--mass-error-histogram", action="store_true",
       dest="mass_error_histogram",
       help="print a histogram of mass errors (1-Da-wide bins)")
    pa("--debug", action="store_true", dest="debug",
       help="show debug output")
    pa("--mark", action="store_true", dest="mark",
       help="rewrite the input files, changing some validation marks to 'N',"
       " according to filtering")
    pa("--reset-marks", action="store_true", dest="reset_marks",
       help="rewrite the input files, changing all validation marks to 'U'")
    pa("--kill", action="store_true", dest="kill",
       help="rewrite the input files, removing spectra that don't pass"
       " validation, according to filtering")
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    (options, args) = parser.parse_args(args=args)

    if options.copyright:
        print __copyright__
        sys.exit(0)

    if (len(args) < 1 or options.minimum_spectra_per_locus < 0
        or options.maximum_sp_rank < 1):
        parser.print_help()
        sys.exit(1)

    if not (0.0 <= options.fdr < 0.5):
        error("--fdr must be within range [0.0, 0.5)")
    if (sum(1 for x in (options.mark, options.reset_marks, options.kill) if x)
        > 1):
        error("only one of --mark, --reset-marks, --kill may be specified")

    options.minimum_trypticity = int(options.minimum_trypticity)

    set_logging(options)

    if options.reset_marks:
        write_spectra_to_files(reset_marks(generate_spectra_from_files(args)),
                               args)
        return

    if options.output_peptides:
        if options.output_peptides == '-':
            options.output_peptides = sys.stdout
        else:
            options.output_peptides = open(options.output_peptides, 'w')

    # This is to translate the "effective" FDR, which is the rate after decoys
    # have been stripped out of the results (which is what users really care
    # about), into our internal FDR, which includes decoys.
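    # For example, the default --fdr of 0.01 becomes an internal working FDR
    # of 0.02.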
    options.fdr *= 2

    info("reading spectra")
    spectrum_info = list(remove_charge_aliases(get_spectrum_info(options,
                                    generate_spectra_from_files(args))))

    # valid_spectrum_info is the list of spectrum_info elements that have been
    # chosen as "valid"
    valid_spectrum_info = search_with_possible_saturation(options, spectrum_info)

    if options.mass_error_histogram:
        print_mass_error_histogram(valid_spectrum_info)

    if options.debug:
        valid_spectrum_info.sort()
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            debug('#S %s %s %s', charge, score, delta)

    if options.mark or options.kill:
        # Note that we are intentionally reading the input files again here,
        # to avoid having to keep them all in memory.
        info("rewriting spectrum files")
        write_spectra_to_files(set_marks(valid_spectrum_info,
                                         generate_spectra_from_files(args),
                                         options.kill),
                               args)
        info("finished rewriting")


if __name__ == '__main__':
    main()