validate: add --no-saturation flag, back to binary search of FDR, bug fixes
[greylag.git] / greylag_validate.py
blob8c7fc8b74eaf2b876e97c7be5adab46875969446
1 #!/usr/bin/env python
3 """
4 Given a set of sqt files, determine a 'valid' set of identifications that
5 satisfy the specified FDR (false discovery rate) and other specified criteria.
6 Optionally rewrite the sqt files with invalid spectra marked 'N' or deleted
7 altogether.
9 Note that a peptide end with an invalid flanking residue (currently any of
10 BJOUXZ) is not considered tryptic if it's an N-terminal end and is considered
11 tryptic if it's a C-terminal end and otherwise qualifies. (Flanking '-' is
12 always considered tryptic.)
14 """
16 from __future__ import with_statement
18 __copyright__ = '''
19 greylag, a collection of programs for MS/MS protein analysis
20 Copyright (C) 2006-2008 Stowers Institute for Medical Research
22 This program is free software: you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation, either version 3 of the License, or
25 (at your option) any later version.
27 This program is distributed in the hope that it will be useful,
28 but WITHOUT ANY WARRANTY; without even the implied warranty of
29 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 GNU General Public License for more details.
32 You should have received a copy of the GNU General Public License
33 along with this program. If not, see <http://www.gnu.org/licenses/>.
35 Contact: Mike Coleman
36 Stowers Institute for Medical Research
37 1000 East 50th Street
38 Kansas City, Missouri 64110
39 USA
40 '''
43 from collections import defaultdict
44 import itertools
45 import optparse
46 import os
47 from pprint import pprint
48 import re
49 import string
50 import sys
51 import time
# allow this script to be run even if not "installed"
try:
    from greylag import VERSION
except ImportError:
    # greylag package not importable; fall back to a placeholder version
    VERSION = 'unknown'
def warn(s):
    """Print a warning message to stderr."""
    sys.stderr.write('warning: %s\n' % s)
def error(s):
    """Abort the program with an error message (exit status via SystemExit)."""
    raise SystemExit('error: ' + s)
def generate_spectra_from_files(sqt_filenames):
    """Yield tuples (filename, H+_lines, S_line, [(M_line, mod_lines, L_lines),
    ...]) for the given SQT files.  H+_lines, for example, is a list
    containing H lines and possibly other kinds of lines up to the next S
    line.  (mod_lines are non-L lines following an M-line.)
    """
    for fn in sqt_filenames:
        with open(fn) as sqtf:
            # spectra are labeled with the filename minus its extension
            base = os.path.splitext(fn)[0]
            header_lines = []
            current_S = None
            current_MLs = []
            for raw in sqtf:
                if raw.startswith('L\t'):
                    # locus line: attach to the most recent M entry
                    current_MLs[-1][2].append(raw)
                elif raw.startswith('M\t'):
                    # start a new match entry: (M line, mod lines, L lines)
                    current_MLs.append((raw, [], []))
                elif raw.startswith('S\t'):
                    # new spectrum: flush the previous one, if any
                    if current_S is not None:
                        yield (base, header_lines, current_S, current_MLs)
                        current_MLs = []
                    current_S = raw
                elif current_S is None:
                    # header region: everything up to the first S line
                    header_lines.append(raw)
                else:
                    # some other kind of line following an M line
                    current_MLs[-1][1].append(raw)
            if current_S is not None:
                yield (base, header_lines, current_S, current_MLs)
def write_spectra_to_files(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', write them to the implied files.  If any of
    those files already exist, they are renamed with a '.bak' suffix.  Temp
    files are written first, then renamed at the end.
    """
    # unique-ish suffix so a concurrent/previous run's temp files aren't hit
    tmp_suffix = '.tmp.%s' % time.time()

    # filename -> open temp file handle for that filename
    filemap = {}

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        sqtf = filemap.get(sqt_filename)
        if sqtf is None:
            # first spectrum for this file: open temp file, write headers once
            sqtf = filemap[sqt_filename] = open(sqt_filename + tmp_suffix, 'w')
            for h in H_lines:
                sqtf.write(h)
        sqtf.write(S_line)
        for m_line, mod_lines, l_lines in ML_lines:
            sqtf.write(m_line)
            for m in mod_lines:
                sqtf.write(m)
            for l in l_lines:
                sqtf.write(l)

    for fd in filemap.values():
        fd.close()
    # atomically (per-file) move temp files into place, backing up originals
    for fn in filemap:
        if os.path.exists(fn):
            os.rename(fn, fn + '.bak')
        os.rename(fn + tmp_suffix, fn)
def reset_marks(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) reset to U.
    """

    def _remark(m_line):
        # replace the final non-whitespace char (the validation mark) with U
        return re.sub(r'\S(\s*)$', r'U\1', m_line, count=1)

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        remarked = [ (_remark(m), mods, loci)
                     for m, mods, loci in ML_lines ]
        yield (sqt_filename, H_lines, S_line, remarked)
def set_marks(options, valid_spectrum_info, spectra, kill=False):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) set to 'N', for spectra not listed in
    valid_spectrum_info.  If 'kill', omit marked spectra altogether.
    """

    def _mark_invalid(m_line):
        # replace the final non-whitespace char (the validation mark) with N
        return re.sub(r'\S(\s*)$', r'N\1', m_line, count=1)

    passing = frozenset(si[0] for si in valid_spectrum_info)

    for number, (sqt_filename, H_lines, S_line, ML_lines) in enumerate(spectra):
        invalid = number not in passing
        if invalid and kill:
            continue                    # drop the spectrum entirely
        if invalid:
            ML_lines = [ (_mark_invalid(m), mods, loci)
                         for m, mods, loci in ML_lines ]
        yield (sqt_filename, H_lines, S_line, ML_lines)
# Helpers for stripping modification characters from peptide strings, using
# the Python 2 str.translate(table, deletechars) API.
nulltrans = string.maketrans('','')
# characters that cannot be amino-acid codes: digits and punctuation
# (e.g. modification marks like '@', '*', '#')
non_aa_chars = string.digits + string.punctuation
def strip_mods(s):
    """Given a peptide, return just the unmodified peptide seen."""
    # Drop anything that cannot be an amino-acid code: digits and punctuation
    # (modification marks).  Control characters are assumed absent.
    non_aa = string.digits + string.punctuation
    return ''.join(c for c in s if c not in non_aa)
def get_spectrum_info(decoy_prefix, minimum_trypticity, spectra):
    """Return a list of tuples

        (spectrum_no, spectrum_name, charge, score, delta, state,
         peptide, stripped_peptide, actual_mass, mass_delta, loci)

    one per surviving spectrum, given a generator of tuples 'spectra' of the
    form generated by 'generate_spectra_from_files'.  'state' is 'real',
    'decoy', or 'both', according to whether the loci seen for the top-peptide
    M lines are all non-decoy, all decoy, or a mixture; 'loci' is a frozenset
    of those locus names.  Spectra lacking minimum_trypticity, spectra whose
    top hit has non-positive score, and spectra for which no delta could be
    determined are dropped.
    """

    ## charge -> [ (score, delta, state), ... ]
    ## where state is either 'real' or 'decoy'
    #z_scores = defaultdict(list)

    ## information about spectra, in file order--None iff spectrum was filtered
    ## out
    ## [ (charge, score, delta) or None, ... ]
    #sp_scores = []

    # [ (spectrum_no, charge, score, delta, state, stripped_peptide), ... ]
    spectrum_info = []

    for spectrum_no, (sqt_filename,
                      H_lines, S_line, ML_lines) in enumerate(spectra):
        # S line fields: [1:4] are the spectrum id fields, [3] the charge,
        # [6] the observed (actual) mass
        S_fields = S_line.split('\t')
        current_charge = int(S_fields[3])
        actual_mass = float(S_fields[6])
        spectrum_name = '.'.join([sqt_filename] + S_fields[1:4])

        current_score = None
        current_delta = None
        current_peptide_trypticity = None
        current_state = set()
        current_loci = set()
        highest_rank_seen = 0
        best_peptide_seen = None        # (unused)
        current_peptide = None

        for M_line, mod_lines, L_lines in ML_lines:
            M_fields = M_line.split('\t')

            rank = int(M_fields[1])
            predicted_mass = float(M_fields[3])
            score = float(M_fields[5])
            peptide = M_fields[9].strip().upper() # e.g., A.B@CD*.-
            #delta = float(M_fields[4])
            #sp_rank = int(M_fields[2])
            assert score >= 0 and rank >= 1

            if rank > highest_rank_seen + 1:
                # ignore aux top SpRank hits, because delta confounds
                break
            highest_rank_seen = rank

            if current_score == None:
                if score <= 0:
                    break           # spectrum is total crap
                current_score = score

            # Don't trust delta from input file.
            delta = (current_score - score) / current_score

            # peptide is flank.sequence.flank, e.g. 'A.B@CD*.-'
            assert peptide[1] == '.' and peptide[-2] == '.'
            peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
            peptide = peptide[2:-2]                    # 'B@CD*'

            if current_peptide == None:
                # first (top) M line: remember its peptide and score its
                # trypticity (0, 1, or 2 tryptic ends)
                current_peptide = peptide
                current_peptide_trypticity = 0
                # NB: K*, R* not tryptic!
                if (peptide_flanks[0] in ('K', 'R') and peptide[0] != 'P'
                    or peptide_flanks[0] == '-'):
                    current_peptide_trypticity += 1
                if (peptide[-1] in ('K', 'R') and peptide_flanks[1] != 'P'
                    or peptide_flanks[1] == '-'):
                    current_peptide_trypticity += 1

            # Choose delta from first M line such that peptide differs from
            # peptide in initial (top) M line.  We consider differently
            # modified versions of the same peptide to be different (at least
            # for now).
            if peptide != current_peptide:
                current_delta = delta
                break

            # accumulate loci (and real/decoy state) for M lines that match
            # the top peptide
            for L_line in L_lines:
                locus = L_line.split('\t')[1]
                current_loci.add(locus)
                if locus.startswith(decoy_prefix):
                    current_state.add('decoy')
                else:
                    current_state.add('real')

        if (current_score != None and current_delta != None
            and current_peptide_trypticity >= minimum_trypticity):

            if len(current_state) == 2:
                current_state = 'both'
            else:
                assert len(current_state) == 1
                current_state = current_state.pop()

            spectrum_info.append((spectrum_no, spectrum_name, current_charge,
                                  current_score, current_delta, current_state,
                                  current_peptide,
                                  strip_mods(current_peptide),
                                  actual_mass, actual_mass-predicted_mass,
                                  frozenset(current_loci)))

    return spectrum_info
290 # General algorithm:
291 # - repeat, adjusting fdr upwards (or downwards?) to meet user goal
292 # repeat until set of valid spectra converges:
293 # - do pr filter of spectra
294 # - if not first time, and no spectra filtered out, break
295 # - separately for each charge, choose valid spectra to maximize real ids subject to FDR
296 # - saturate (locally or completely)
def PPV(reals, decoys):
    """Returns the estimated Positive Predictive Value (== 1 - FDR), given
    counts of reals and decoys."""

    # Every decoy hit is a known false positive, and we estimate that an
    # equal number of the "real" hits are false as well.
    true_positives = reals - decoys
    false_positives = 2 * decoys

    if true_positives <= 0:
        return 0
    return true_positives / float(true_positives + false_positives)
def calculate_inner_threshold(fdr, charge, spinfo):
    """Return (threshold, reals, decoys): the lowest score threshold, and the
    real/decoy counts at or above it, that achieves an estimated PPV of at
    least 1-fdr, or (None, reals, decoys) if that is impossible.  spinfo is a
    list of spectrum tuples with score at index 3 and state at index 5."""
    ppv_goal = 1 - fdr

    by_score = sorted(spinfo, key=lambda x: x[3])
    epsilon = +1e-6

    reals = sum(1 for x in by_score if x[5] == 'real')
    decoys = sum(1 for x in by_score if x[5] == 'decoy')

    threshold = -1e100                  # i.e., allow all spectra
    for sp in by_score:
        if reals == 0:
            return (None, reals, decoys) # give up
        # PPV estimate: decoys are false positives, plus an estimated equal
        # number of false "reals"
        true_pos = reals - decoys
        if true_pos <= 0:
            ppv_est = 0
        else:
            ppv_est = float(true_pos) / (true_pos + 2*decoys)
        if ppv_est >= ppv_goal:
            break
        # discard this (lowest-scoring) spectrum and raise the threshold just
        # enough to exclude it
        if sp[5] == 'real':
            reals -= 1
        elif sp[5] == 'decoy':
            decoys -= 1
        threshold = sp[3] + epsilon
    else:
        threshold = by_score[-1][3] + epsilon # couldn't meet goal

    return (threshold, reals, decoys)
def calculate_combined_thresholds(fdr, options, spectrum_info):
    """Find best score/delta thresholds for each charge.  Returns a dict
    mapping charge -> (score_threshold, delta_threshold, passing_reals,
    passing_decoys), keeping for each charge the sampled delta threshold that
    maximizes the count of passing real ids.  (Sorts spectrum_info in place
    by charge.)"""

    ppv_goal = 1 - fdr                  # (recomputed in the inner search)

    # Rather than search every possible value of delta, we're only going to
    # "sample" at this granularity.  This cuts search time dramatically (and
    # makes it O(n) instead of O(n**2)).  Extra precision wouldn't really be
    # useful in any case.
    SEARCH_GRANULARITY = 0.001

    # charge -> (score, delta, passing_reals, passing_decoys)
    thresholds = {}

    by_charge = lambda x: x[2]
    spectrum_info.sort(key=by_charge)
    for charge, spinfo in itertools.groupby(spectrum_info, key=by_charge):
        # sorted by delta, descending; the candidate delta threshold is the
        # smallest remaining delta, and candidates below it are popped off
        spinfo0 = sorted(spinfo, key=lambda x: x[4], reverse=True)

        last_value = None

        while spinfo0:
            this_value = spinfo0[-1][4] # current delta
            if (last_value == None
                or abs(this_value - last_value) >= SEARCH_GRANULARITY):

                # "inner" is score
                r = calculate_inner_threshold(fdr, charge, spinfo0)
                threshold, real_count, decoy_count = r
                if threshold != None:
                    if options.debug:
                        print '#', charge, threshold, this_value, \
                            real_count, decoy_count
                    # keep this (score, delta) pair if it passes more reals
                    # than the best seen so far for this charge
                    if (charge not in thresholds
                        or real_count > thresholds[charge][2]):
                        thresholds[charge] = (threshold, this_value,
                                              real_count, decoy_count)
                last_value = this_value
            spinfo0.pop()

    return thresholds
def flatten(gen_of_gens):
    """Yield each item of each sub-iterable of gen_of_gens, in order."""
    return itertools.chain.from_iterable(gen_of_gens)
def redundancy_filter(options, spectrum_info):
    """Return (sorted) the spectra belonging to at least one locus having
    options.minimum_spectra_per_locus spectra or more.  With a minimum below
    2, this is just a copy of spectrum_info."""
    min_count = options.minimum_spectra_per_locus
    if min_count < 2:
        return spectrum_info[:]

    # si = (spectrum_no, spectrum_name, charge, score, delta, state,
    #       peptide, stripped_peptide, actual_mass, mass_delta, loci)

    # one (locus, si) pair for each locus of each spectrum, grouped by locus
    pairs = sorted((locus, si)
                   for si in spectrum_info
                   for locus in si[10])

    keep = set()
    for locus, group in itertools.groupby(pairs, key=lambda p: p[0]):
        group_sis = [ si for _, si in group ]
        if len(group_sis) >= min_count:
            keep.update(group_sis)
    return sorted(keep)
def saturate_by_prefix_suffix(spectrum_info, saturation_spectrum_info):
    """Return (sorted) the saturation spectra whose stripped peptide shares a
    16-residue prefix with some passing stripped peptide."""
    PREFIX_LENGTH = 16                  # empirically seems good
    passing_prefixes = set(si[7][:PREFIX_LENGTH] for si in spectrum_info)
    expanded = [ ssi for ssi in saturation_spectrum_info
                 if ssi[7][:PREFIX_LENGTH] in passing_prefixes ]
    expanded.sort()
    print ("saturation by prefix (w/%s): %s -> %s"
           % (len(saturation_spectrum_info), len(spectrum_info),
              len(expanded)))
    return expanded
def saturate(spectrum_info, saturation_spectrum_info):
    """Expand spectrum_info to all saturation spectra whose stripped peptide
    exactly matches a passing one, then saturate further by peptide prefix."""
    passing_peptides = set(si[7] for si in spectrum_info)
    # FIX: does this actually need to be sorted?
    expanded = sorted(ssi for ssi in saturation_spectrum_info
                      if ssi[7] in passing_peptides)
    print ("saturation (w/%s): %s -> %s"
           % (len(saturation_spectrum_info), len(spectrum_info),
              len(expanded)))
    return saturate_by_prefix_suffix(expanded, saturation_spectrum_info)
def debug_fdr(si):
    """Print the total, real, and decoy counts for the spectrum tuples in si."""
    real_total = sum(1 for x in si if x[5] == 'real')
    decoy_total = sum(1 for x in si if x[5] == 'decoy')
    print ("%s %s %s" % (len(si), real_total, decoy_total))
def try_fdr(fdr_guess, options, remaining_spectrum_info, saturation_spectrum_info=None):
    """Filter spectra using fdr_guess as the per-charge threshold FDR,
    alternating threshold filtering and locus-redundancy filtering until the
    passing set converges; then (unless --no-saturation) saturate against
    saturation_spectrum_info.  Returns (fdr_result, thresholds, total_reals,
    total_decoys, passing_spectra), where fdr_result is the overall estimated
    FDR of the final passing set."""
    print "try_fdr:", fdr_guess
    debug_fdr(remaining_spectrum_info)

    remaining_spectrum_info_1 = None
    while True:
        thresholds = calculate_combined_thresholds(fdr_guess, options,
                                                   remaining_spectrum_info)
        # keep only spectra passing their charge's score and delta thresholds
        remaining_spectrum_info = [ si for si in remaining_spectrum_info
                                    if (si[2] in thresholds
                                        and si[3] >= thresholds[si[2]][0]
                                        and si[4] >= thresholds[si[2]][1]) ]
        debug_fdr(remaining_spectrum_info)
        if remaining_spectrum_info == remaining_spectrum_info_1:
            break                       # converged
        remaining_spectrum_info_1 = redundancy_filter(options,
                                                      remaining_spectrum_info)
        debug_fdr(remaining_spectrum_info_1)
        if len(remaining_spectrum_info_1) == len(remaining_spectrum_info):
            break                       # redundancy filter removed nothing
        remaining_spectrum_info = remaining_spectrum_info_1

    if not options.no_saturation and saturation_spectrum_info:
        remaining_spectrum_info_1 = saturate(remaining_spectrum_info_1,
                                             saturation_spectrum_info)
        debug_fdr(remaining_spectrum_info_1)

    total_reals = sum(1 for si in remaining_spectrum_info_1
                      if si[5] == 'real')
    total_decoys = sum(1 for si in remaining_spectrum_info_1
                       if si[5] == 'decoy')
    #total_reals = sum(thresholds[charge][2] for charge in thresholds)
    #total_decoys = sum(thresholds[charge][3] for charge in thresholds)
    fdr_result = 1 - PPV(total_reals, total_decoys)

    return (fdr_result, thresholds, total_reals, total_decoys,
            remaining_spectrum_info_1)
476 # FIX: at various points here we may have a goal of a minimum number of real
477 # ids, based on goal FDR or whatever. We can pass this inward to do
478 # branch-and-bound, which may save time.
480 # FIX: this is now somewhat slower than before--any way to speed it up?
def search_adjusting_fdr(options, spectrum_info_0):
    """Repeatedly calculate thresholds while adjusting FDR, looking for an FDR
    guess that results in an appropriate goal FDR.  This might be needed
    because the thresholds found for per-charge FDRs will be saturated or have
    quantization errors, resulting in overly conservative thresholds.  Returns
    a list of items from spectrum_info that are considered "valid".
    """

    # FIX: Does this generate enough additional real ids to be worth the
    # computational cost?
    #FDR_INFLATION_FACTOR = 1.1
    FDR_INFLATION_FACTOR = 3
    assert FDR_INFLATION_FACTOR > 1

    spectrum_info = redundancy_filter(options, spectrum_info_0)

    low_fdr = options.fdr
    # FIX: spectrum_info_0 has already been filtered for tryptic status
    # Should we use an unfiltered set instead?
    low_results = try_fdr(low_fdr, options, spectrum_info, spectrum_info_0)

    if options.adjust_fdr:
        # Phase 1 (inflation): grow high_fdr geometrically until its
        # resulting FDR meets or exceeds the goal, bracketing the answer.
        high_fdr = min(1.0, options.fdr * FDR_INFLATION_FACTOR)
        high_results = try_fdr(high_fdr, options, spectrum_info,
                               spectrum_info_0)
        initial_total_reals = low_results[2]

        for i in range(32):
            if options.debug:
                print ("adjust infl fdr %s %s -> %s"
                       % (low_fdr, high_fdr, high_results[0]))
            if high_results[0] >= options.fdr:
                break
            low_fdr, low_results = high_fdr, high_results
            high_fdr = min(1.0, high_fdr * FDR_INFLATION_FACTOR)
            if high_fdr == 1.0:
                break
            high_results = try_fdr(high_fdr, options, spectrum_info,
                                   spectrum_info_0)
        else:
            warn("FDR adjustment inflation failed")

        # Phase 2 (bisection): narrow [low_fdr, high_fdr] until it is within
        # a small fraction of the goal FDR; low stays at or below the goal,
        # high above it.
        CONVERGENCE_FACTOR = 0.01
        assert CONVERGENCE_FACTOR > 0
        for i in range(32):
            assert high_fdr >= low_fdr
            if (high_fdr - low_fdr) <= CONVERGENCE_FACTOR*options.fdr:
                break

            # - seems to be slower than the naive method?
            # Bias is empirically determined.  It's supposed to speed
            # convergence, but should not affect the results.
            # FIX: better method possible?
            #GUESS_BIAS = 1.15
            #prop = ((options.fdr - low_results[0])
            #        / (high_results[0] - low_results[0]))
            #prop = min(1, max(0, prop))
            #guess_fdr = low_fdr + prop**GUESS_BIAS * (high_fdr - low_fdr)

            ####

            guess_fdr = (high_fdr + low_fdr) / 2.0
            guess_results = try_fdr(guess_fdr, options, spectrum_info,
                                    spectrum_info_0)
            if options.debug:
                print ("adjust fdr %s %s %s -> %s"
                       % (low_fdr, guess_fdr, high_fdr, guess_results[0]))
            if guess_results[0] > options.fdr:
                high_fdr, high_results = guess_fdr, guess_results
            else:
                low_fdr, low_results = guess_fdr, guess_results
        else:
            warn("FDR adjustment convergence failed")

    (fdr_result, thresholds, total_reals,
     total_decoys, valid_spectrum_info) = low_results

    if options.output_peptides:
        valid_spectrum_info.sort(key=lambda x: x[1])
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            print >> options.output_peptides, \
                ("%+d %s %s %s %.8f %.8f"
                 % (charge, spectrum_name, state, peptide, actual_mass,
                    mass_delta))

    # Reported rates are halved to convert the internal (decoy-inclusive)
    # FDR back to the user-facing rate; see the doubling in main().
    for charge in sorted(thresholds.keys()):
        score_threshold, delta_threshold = thresholds[charge][0:2]
        reals, decoys = thresholds[charge][2:4]
        print ("%+d: score %f, delta %.6f -> %s real ids, %s decoys"
               " (fdr %.4f)"
               % (charge, score_threshold, delta_threshold,
                  reals, decoys, 0.5 * (1 - PPV(reals, decoys))))
    print ("# total: %s real ids, %s decoys (fdr %.4f)"
           % (total_reals, total_decoys, fdr_result / 2.0))
    if options.adjust_fdr:
        print ("# (FDR adjustment found %s more real ids)"
               % (total_reals - initial_total_reals))

    return valid_spectrum_info
590 def main(args=sys.argv[1:]):
591 parser = optparse.OptionParser(usage=
592 "usage: %prog [options] <sqt-file>...",
593 description=__doc__, version=VERSION)
594 pa = parser.add_option
595 DEFAULT_DECOY_PREFIX = "SHUFFLED_"
596 pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX,
597 help='prefix given to locus name of decoy (e.g., shuffled) database'
598 ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX")
599 DEFAULT_FDR = 0.01
600 pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
601 help="false discovery rate [default=%s] (Note that this is the"
602 " final resulting FDR after the decoys have been removed" % DEFAULT_FDR,
603 metavar="PROPORTION")
604 pa("--adjust-fdr", action="store_true", dest="adjust_fdr",
605 help="try to adjust internal FDR to achieve a final FDR closer to"
606 " requested FDR (this takes longer, but may find more read ids in some"
607 " cases)")
608 pa("--no-saturation", action="store_true", dest="no_saturation",
609 help="skip peptide saturation")
610 pa("-v", "--verbose", action="store_true", dest="verbose",
611 help="be verbose")
612 pa("--output-peptides", dest="output_peptides",
613 help="output information about passing peptides to specified file"
614 " ('-' for stdout)", metavar="FILENAME")
615 pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
616 default='0', dest="minimum_trypticity",
617 help="drop peptides with too few tryptic ends"
618 " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
619 DEFAULT_MINIMUM_SPECTRA_PER_LOCUS = 1
620 pa("-p", "--minimum-spectra-per-locus", dest="minimum_spectra_per_locus",
621 type="int", default=DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
622 help="only loci with at least this many spectra are included, and"
623 " spectra not associated with such a locus are excluded"
624 " [default=%s]" % DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
625 metavar="COUNT")
626 #pa("--graph", dest="graph",
627 # help='create distribution graphs, using the specified prefix',
628 # metavar="PATH PREFIX")
629 pa("--debug", action="store_true", dest="debug",
630 help="show debug output")
631 pa("--mark", action="store_true", dest="mark",
632 help="rewrite the input files, changing some validation marks to 'N',"
633 " according to filtering")
634 pa("--reset-marks", action="store_true", dest="reset_marks",
635 help="rewrite the input files, changing all validation marks to 'U'")
636 pa("--kill", action="store_true", dest="kill",
637 help="rewrite the input files, removing spectra that don't pass"
638 " validation, according to filtering")
639 pa("--copyright", action="store_true", dest="copyright",
640 help="print copyright and exit")
641 (options, args) = parser.parse_args(args=args)
643 if options.copyright:
644 print __copyright__
645 sys.exit(0)
647 if len(args) < 1 or options.minimum_spectra_per_locus < 0:
648 parser.print_help()
649 sys.exit(1)
651 if not (0.0 <= options.fdr < 0.5):
652 error("--fdr must be within range [0.0, 0.5)")
653 if (sum(1 for x in (options.mark, options.reset_marks, options.kill) if x)
654 > 1):
655 error("only one of --mark, --reset-marks, --kill may be specified")
657 if options.reset_marks:
658 write_spectra_to_files(reset_marks(generate_spectra_from_files(args)))
659 return
661 if options.output_peptides:
662 if options.output_peptides == '-':
663 options.output_peptides = sys.stdout
664 else:
665 options.output_peptides = open(options.output_peptides, 'w')
667 # This is to translate the "effective" FDR, which is the rate after decoys
668 # have been stripped out of the results (which is what users really care
669 # about), into our internal FDR, which includes decoys.
670 options.fdr *= 2
672 spectrum_info = get_spectrum_info(options.decoy_prefix,
673 int(options.minimum_trypticity),
674 generate_spectra_from_files(args))
676 if options.debug:
677 print ('%s passing spectra, of which %s are not decoys'
678 % (len(spectrum_info),
679 sum(1 for si in spectrum_info if si[5] == 'real')))
681 # valid_spectrum_info is the list of spectrum_info elements that have been
682 # chosen as "valid"
683 valid_spectrum_info = search_adjusting_fdr(options, spectrum_info)
685 if options.debug:
686 valid_spectrum_info.sort()
687 for (spectrum_no, spectrum_name, charge, score, delta, state,
688 peptide, stripped_peptide, actual_mass,
689 mass_delta, loci) in valid_spectrum_info:
690 print '#S', charge, score, delta
692 if options.mark or options.kill:
693 # Note that we are intentionally reading the input files again here,
694 # to avoid having to keep them all in memory.
695 write_spectra_to_files(set_marks(options, valid_spectrum_info,
696 generate_spectra_from_files(args),
697 options.kill))
# script entry point
if __name__ == '__main__':
    main()