#!/usr/bin/env python

"""
Given a set of sqt files, determine a 'valid' set of identifications that
satisfy the specified FDR (false discovery rate) and other specified criteria.
Optionally rewrite the sqt files with invalid spectra marked 'N' or deleted
altogether.

Note that a peptide end with an invalid flanking residue (currently any of
BJOUXZ) is not considered tryptic if it's an N-terminal end, but is considered
tryptic if it's a C-terminal end and otherwise qualifies.  (Flanking '-' is
always considered tryptic.)

"""

from __future__ import with_statement

__copyright__ = '''
greylag, a collection of programs for MS/MS protein analysis
Copyright (C) 2006-2008  Stowers Institute for Medical Research

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Contact: Mike Coleman
         Stowers Institute for Medical Research
         1000 East 50th Street
         Kansas City, Missouri  64110
         USA
'''


from collections import defaultdict
import itertools
import logging; from logging import debug, info, warning
import optparse
import os
from pprint import pprint
import re
import string
import sys
import time

import greylag


def error(s, *args):
    "fatal error"
    # if we're unit testing, just throw an exception
    if __name__ != "__main__":
        raise Exception((s + " (fatal error)") % args)
    logging.error(s, *args)
    sys.exit(1)

# errors are fatal
greylag.chase_error = error


def generate_spectra_from_files(sqt_filenames):
    """Yield tuples (filename, H+_lines, S_line, [(M_line, mod_lines, L_lines),
    ...]) for the given SQT files.  H+_lines, for example, is a list
    containing H lines and possibly other kinds of lines up to the next S
    line.  (mod_lines are non-L lines following an M-line.)
    """
    for sqt_filename in sqt_filenames:
        with open(sqt_filename) as sqtf:
            H_lines = []
            S_line, ML_lines = None, []
            for line in sqtf:
                if line.startswith('L\t'):
                    ML_lines[-1][2].append(line)
                elif line.startswith('M\t'):
                    ML_lines.append((line, [], []))
                elif line.startswith('S\t'):
                    if S_line:
                        yield (sqt_filename, H_lines, S_line, ML_lines)
                        S_line, ML_lines = None, []
                    S_line = line
                else:
                    if not S_line:
                        # read headers up to first S line
                        H_lines.append(line)
                    else:
                        # other kind of line following M line
                        ML_lines[-1][1].append(line)
            if S_line:
                yield (sqt_filename, H_lines, S_line, ML_lines)


def write_spectra_to_files(spectra, filenames):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', write them to the implied files.  If any of
    filenames already exist, they are renamed with a '.bak' suffix.  Temp
    files are written first, then renamed at the end.  (The filenames
    parameter is necessary because spectra may not contain any entries for a
    given filename, thus necessitating its removal.)
    """

    tmp_suffix = '.tmp.%s' % time.time()

    # filename -> file
    filemap = {}

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        sqtf = filemap.get(sqt_filename)
        if sqtf == None:
            sqtf = filemap[sqt_filename] = open(sqt_filename + tmp_suffix, 'w')
            for h in H_lines:
                sqtf.write(h)
        sqtf.write(S_line)
        for m_line, mod_lines, l_lines in ML_lines:
            sqtf.write(m_line)
            for m in mod_lines:
                sqtf.write(m)
            for l in l_lines:
                sqtf.write(l)

    for fd in filemap.values():
        fd.close()

    assert set(filemap.keys()) <= set(filenames)
    for fn in filenames:
        if os.path.exists(fn):
            os.rename(fn, fn + '.bak')
        if os.path.exists(fn + tmp_suffix):
            os.rename(fn + tmp_suffix, fn)
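
# Illustrative sketch (an assumption, mirroring the way main() composes these
# generators): copying a set of SQT files through unchanged could be written
# as
#
#     filenames = ['a.sqt', 'b.sqt']          # hypothetical inputs
#     write_spectra_to_files(generate_spectra_from_files(filenames),
#                            filenames)
#
# after which the originals remain available as 'a.sqt.bak' and 'b.sqt.bak'.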


def reset_marks(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) reset to U.

    >>> list(reset_marks([('fn', [], 'S...', [('M... Y', [], [])])]))
    [('fn', [], 'S...', [('M... U', [], [])])]

    """

    def reset_mark(m_line):
        return re.sub(r'(\s+)\S(\s*)$', r'\1U\2', m_line, 1)

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        ML_lines_1 = [ (reset_mark(M_line), mod_lines, L_lines)
                       for M_line, mod_lines, L_lines in ML_lines ]
        yield (sqt_filename, H_lines, S_line, ML_lines_1)


def set_marks(valid_spectrum_info, spectra, kill=False):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) set to 'N', for spectra not listed in
    valid_spectrum_info.  If 'kill', omit marked spectra altogether.

    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])]))
    [('fn', [], 'S 000...', [('M... U', [], [])]), ('fn', [], 'S 111...', [('M... N', [], [])])]
    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])],
    ...                kill=True))
    [('fn', [], 'S 000...', [('M... U', [], [])])]

    """

    def set_mark(m_line):
        return re.sub(r'\S(\s*)$', r'N\1', m_line, 1)

    passing_spectrum_numbers = frozenset(si[0] for si in valid_spectrum_info)

    for spectrum_no, (sqt_filename, H_lines, S_line, ML_lines) \
            in enumerate(spectra):
        mark_spectrum = spectrum_no not in passing_spectrum_numbers
        if mark_spectrum and not kill:
            ML_lines = [ (set_mark(M_line), mod_lines, L_lines)
                         for M_line, mod_lines, L_lines in ML_lines ]
        if not (kill and mark_spectrum):
            yield (sqt_filename, H_lines, S_line, ML_lines)


nulltrans = string.maketrans('','')
non_aa_chars = string.digits + string.punctuation

def strip_mods(s):
    """Given a peptide, return just the unmodified peptide seen.

    >>> strip_mods('G*REY@LAG')
    'GREYLAG'
    >>> strip_mods('GREYLAG')
    'GREYLAG'

    """
    # assuming no control chars present
    return s.translate(nulltrans, non_aa_chars)


def get_spectrum_info(options, spectra):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield an info tuple for each spectrum that
    passes the criteria specified in options (e.g., options.minimum_trypticity
    and options.maximum_sp_rank).
    """

    for spectrum_no, (sqt_filename,
                      H_lines, S_line, ML_lines) in enumerate(spectra):
        S_fields = S_line.split('\t')
        current_charge = int(S_fields[3])
        actual_mass = float(S_fields[6])
        spectrum_name = '.'.join([os.path.splitext(sqt_filename)[0]]
                                 + S_fields[1:4])

        current_score = None
        current_delta = None
        current_peptide_trypticity = None
        current_sp_rank = None
        current_peptide_mass = None
        current_state = set()
        current_loci = set()
        highest_rank_seen = 0
        best_peptide_seen = None
        current_peptide = None

        for M_line, mod_lines, L_lines in ML_lines:
            M_fields = M_line.split('\t')

            rank = int(M_fields[1])
            sp_rank = int(M_fields[2])
            peptide_mass = float(M_fields[3])
            score = float(M_fields[5])
            peptide = M_fields[9].strip().upper() # e.g., A.B@CD*.-
            assert score >= 0 and rank >= 1 and sp_rank >= 1

            if rank > highest_rank_seen + 1:
                # ignore aux top SpRank hits, because delta confounds
                break
            highest_rank_seen = rank

            if current_score == None:
                if score <= 0:
                    break       # spectrum is total crap
                current_score = score

            # Don't trust delta from input file.
            delta = (current_score - score) / current_score

            assert peptide[1] == '.' and peptide[-2] == '.'
            peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
            peptide = peptide[2:-2]                    # 'B@CD*'

            if current_peptide == None:
                current_peptide = peptide
                current_peptide_mass = peptide_mass
                current_peptide_trypticity = 0
                # NB: K*, R* not tryptic!
                if (peptide_flanks[0] in ('K', 'R') and peptide[0] != 'P'
                    or peptide_flanks[0] == '-'):
                    current_peptide_trypticity += 1
                if (peptide[-1] in ('K', 'R') and peptide_flanks[1] != 'P'
                    or peptide_flanks[1] == '-'):
                    current_peptide_trypticity += 1
                current_sp_rank = sp_rank

            # Choose delta from first M line such that peptide differs from
            # peptide in initial (top) M line.  We consider differently
            # modified versions of the same peptide to be different (at least
            # for now).  But, if there are two peptides that differ only in
            # their choice of I (isoleucine) or L (leucine), consider them
            # identical and keep looking for a delta.
            if peptide.replace('I', 'L') != current_peptide.replace('I', 'L'):
                current_delta = delta
                break

            for L_line in L_lines:
                locus = L_line.split('\t')[1]
                current_loci.add(locus)
                if locus.startswith(options.decoy_prefix):
                    current_state.add('decoy')
                else:
                    current_state.add('real')

        if (current_score != None and current_delta != None
            and current_peptide_trypticity >= options.minimum_trypticity
            and current_sp_rank <= options.maximum_sp_rank):

            if len(current_state) == 2:
                current_state = 'both'
            else:
                assert len(current_state) == 1
                current_state = current_state.pop()

            yield (spectrum_no, spectrum_name, current_charge, current_score,
                   current_delta, current_state, current_peptide,
                   strip_mods(current_peptide), actual_mass,
                   actual_mass-current_peptide_mass, frozenset(current_loci))
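
# For reference (a restatement of the yield above, not original commentary),
# the fields of each spectrum info tuple are indexed by the code below as
#
#     si[0] spectrum_no    si[4] delta     si[8]  actual_mass
#     si[1] spectrum_name  si[5] state     si[9]  mass_delta
#     si[2] charge         si[6] peptide   si[10] loci
#     si[3] score          si[7] stripped_peptide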


# FIX: can this be more succinct?
def remove_charge_aliases(spectrum_generator):
    """Given a generator of spectra like get_spectrum_info, for spectra that
    received multiple id's at different charges, yield only the best id.
    (Probably unnecessary unless the precursor tolerance is wide.)
    """

    sp_info_0 = None
    for sp_info_1 in spectrum_generator:
        assert sp_info_1[2] < 10, "single-digit charge"
        if not sp_info_0:
            sp_info_0 = sp_info_1
            continue
        if sp_info_0[1][:-1] != sp_info_1[1][:-1]:
            yield sp_info_0
            sp_info_0 = sp_info_1
            continue
        # if at least one has a sufficiently small mass delta, choose the one
        # with the smaller delta
        if abs(sp_info_0[9]) < abs(sp_info_1[9]) < 10:
            continue
        if abs(sp_info_1[9]) < abs(sp_info_0[9]) < 10:
            sp_info_0 = sp_info_1
            continue
        # otherwise choose the better-scoring one
        if sp_info_0[3] < sp_info_1[3]:
            sp_info_0 = sp_info_1

    if sp_info_0:
        yield sp_info_0


# General algorithm:
# - repeat, adjusting fdr upwards (or downwards?) to meet user goal
#   repeat until set of valid spectra converges:
#   - do pr filter of spectra
#     - if not first time, and no spectra filtered out, break
#   - separately for each charge, choose valid spectra to maximize real ids subject to FDR
#   - saturate (locally or completely)


def PPV(reals, decoys):
    """Returns the estimated Positive Predictive Value (== 1 - FDR), given
    counts of reals and decoys.

    >>> PPV(100, 100)
    0.0
    >>> PPV(99, 100)
    0.0
    >>> PPV(100, 0)
    1.0
    >>> str(PPV(95, 5))
    '0.9'

    """

    # We know that the decoys are false positives, and we estimate that an
    # equal number of the "reals" are actually false, too.
    false_positives = 2*decoys
    true_positives = reals - decoys

    if true_positives <= 0:
        return 0.0
    return float(true_positives) / (true_positives + false_positives)
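
# Worked example (restating the doctest above): with 95 reals and 5 decoys we
# assume 5 of the reals are also false, giving 90 true positives and 10 false
# positives, so PPV = 90 / (90 + 10) = 0.9 and the estimated FDR is
# 1 - PPV = 0.1.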


def calculate_inner_threshold(fdr, spinfo):
    ppv_goal = 1 - fdr

    spinfo = sorted(spinfo, key=lambda x: x[3])

    epsilon = +1e-6

    real_count = sum(1 for x in spinfo if x[5] == 'real')
    decoy_count = sum(1 for x in spinfo if x[5] == 'decoy')

    current_threshold = -1e100      # allow all spectra
    for n, sp in enumerate(spinfo):
        if real_count == 0:
            return (None, real_count, decoy_count) # give up

        ppv_est = PPV(real_count, decoy_count)
        if ppv_est >= ppv_goal:
            break
        if sp[5] == 'real':
            real_count -= 1
        elif sp[5] == 'decoy':
            decoy_count -= 1
        # set threshold just high enough to exclude this spectrum
        current_threshold = sp[3] + epsilon
    else:
        current_threshold = spinfo[-1][3] + epsilon # couldn't meet goal

    return (current_threshold, real_count, decoy_count)
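
# Explanatory note (not original commentary): the scan above considers spectra
# worst-score first; each spectrum dropped from the real/decoy counts pushes
# the threshold just above its score, and the loop stops as soon as the
# estimated PPV of the remaining spectra reaches the goal (or gives up if no
# reals remain).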


def calculate_combined_thresholds(fdr, spectrum_info):
    """Find best score/delta thresholds for each charge."""

    ppv_goal = 1 - fdr

    # Rather than search every possible value of delta, we're only going to
    # "sample" at this granularity.  This cuts search time dramatically (and
    # makes it O(n) instead of O(n**2)).  Extra precision wouldn't really be
    # useful in any case.
    SEARCH_GRANULARITY = 0.001

    # charge -> (score, delta, passing_reals, passing_decoys)
    thresholds = {}

    by_charge = lambda x: x[2]
    spectrum_info.sort(key=by_charge)
    for charge, spinfo in itertools.groupby(spectrum_info, key=by_charge):
        spinfo0 = sorted(spinfo, key=lambda x: x[4], reverse=True)

        last_value = None

        while spinfo0:
            this_value = spinfo0[-1][4] # current delta
            if (last_value == None
                or abs(this_value - last_value) >= SEARCH_GRANULARITY):

                # "inner" is score
                r = calculate_inner_threshold(fdr, spinfo0)
                threshold, real_count, decoy_count = r
                if threshold != None:
                    debug('# %s %s %s %s %s', charge, threshold, this_value,
                          real_count, decoy_count)
                    if (charge not in thresholds
                        or real_count > thresholds[charge][2]):
                        thresholds[charge] = (threshold, this_value,
                                              real_count, decoy_count)
                last_value = this_value
            spinfo0.pop()

    return thresholds
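
# For example (hypothetical values), the mapping returned above might look
# like
#
#     {2: (7.25, 0.12, 1400, 14),
#      3: (9.00, 0.15,  600,  6)}
#
# that is, charge -> (score threshold, delta threshold, passing reals,
# passing decoys), as noted in the comment above.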


def flatten(gen_of_gens):
    """Yield items from a generator of generators.

    >>> list(flatten([(1,2,3), (4, (5, 'b'), 6)]))
    [1, 2, 3, 4, (5, 'b'), 6]

    """

    for gen in gen_of_gens:
        for item in gen:
            yield item


def redundancy_filter(options, spectrum_info):
    if options.minimum_spectra_per_locus < 2:
        return [ si for si in spectrum_info ]

    # si = (spectrum_no, spectrum_name, charge, score, delta, state,
    #       peptide, stripped_peptide, actual_mass, mass_delta, loci)

    Rs = sorted(flatten(((locus, si) for locus in si[10])
                        for si in spectrum_info))

    result = set()
    for locus, locus_Rs in itertools.groupby(Rs, key=lambda x: x[0]):
        list_locus_si = [ si for locus, si in locus_Rs ]
        if len(list_locus_si) < options.minimum_spectra_per_locus:
            continue
        result |= set(list_locus_si)
    return sorted(result)


def saturate_by_prefix_suffix(spectrum_info, saturation_spectrum_info):
    PREFIX_LENGTH = 16              # empirically seems good
    peptide_prefixes = set(si[7][:PREFIX_LENGTH] for si in spectrum_info)
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7][:PREFIX_LENGTH] in peptide_prefixes)
    info("saturation by prefix (w/%s): %s -> %s",
         len(saturation_spectrum_info), len(spectrum_info), len(sat_si))
    return sat_si


def saturate(spectrum_info, saturation_spectrum_info):
    peptides = set(si[7] for si in spectrum_info)
    # FIX: does this actually need to be sorted?
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7] in peptides)
    info("saturation (w/%s): %s -> %s", len(saturation_spectrum_info),
         len(spectrum_info), len(sat_si))
    return saturate_by_prefix_suffix(sat_si, saturation_spectrum_info)


def fdr_stats(spectrum_info):
    reals = sum(1 for si in spectrum_info if si[5] == 'real')
    decoys = sum(1 for si in spectrum_info if si[5] == 'decoy')
    fdr = 1 - PPV(reals, decoys)
    return fdr, reals, decoys

def debug_fdr(options, msg, si):
    if options.verbose:
        fdr, reals, decoys = fdr_stats(si)
        info('%s %s %s %s', msg, fdr, reals, decoys)

def try_fdr(fdr_guess, options, remaining_spectrum_info,
            saturation_spectrum_info=None):
    info("trying FDR = %s", fdr_guess)
    debug_fdr(options, "initial", remaining_spectrum_info)

    remaining_spectrum_info_1 = None
    while True:
        thresholds = calculate_combined_thresholds(fdr_guess,
                                                   remaining_spectrum_info)
        remaining_spectrum_info = [ si for si in remaining_spectrum_info
                                    if (si[2] in thresholds
                                        and si[3] >= thresholds[si[2]][0]
                                        and si[4] >= thresholds[si[2]][1]) ]
        debug_fdr(options, "after thresholding", remaining_spectrum_info)
        if remaining_spectrum_info == remaining_spectrum_info_1:
            break
        remaining_spectrum_info_1 = redundancy_filter(options,
                                                      remaining_spectrum_info)
        debug_fdr(options, "after redundancy", remaining_spectrum_info_1)
        if len(remaining_spectrum_info_1) == len(remaining_spectrum_info):
            break
        remaining_spectrum_info = remaining_spectrum_info_1

    fdr_result, total_reals, total_decoys = fdr_stats(remaining_spectrum_info_1)
    if not options.no_saturation and saturation_spectrum_info:
        remaining_spectrum_info_1 = saturate(remaining_spectrum_info_1,
                                             saturation_spectrum_info)
        debug_fdr(options, "after saturation", remaining_spectrum_info_1)
        (fdr_result,
         total_reals, total_decoys) = fdr_stats(remaining_spectrum_info_1)

    return (fdr_result, thresholds, total_reals, total_decoys,
            remaining_spectrum_info_1)


# FIX: at various points here we may have a goal of a minimum number of real
# ids, based on goal FDR or whatever.  We can pass this inward to do
# branch-and-bound, which may save time.

# FIX: this is now somewhat slower than before--any way to speed it up?

def search_adjusting_fdr(options, spectrum_info_0):
    """Repeatedly calculate thresholds while adjusting FDR, looking for an FDR
    guess that results in an appropriate goal FDR.  This might be needed
    because the thresholds found for per-charge FDRs will be saturated or have
    quantization errors, resulting in overly conservative thresholds.  Returns
    a list of items from spectrum_info that are considered valid.
    """

    FDR_INFLATION_FACTOR = 3
    assert FDR_INFLATION_FACTOR > 1

    spectrum_info = redundancy_filter(options, spectrum_info_0)

    low_fdr = options.fdr
    # FIX: spectrum_info_0 has already been filtered for tryptic status
    # Should we use an unfiltered set instead?
    low_results = try_fdr(low_fdr, options, spectrum_info, spectrum_info_0)

    if not options.no_adjust_fdr:
        high_fdr = min(1.0, options.fdr * FDR_INFLATION_FACTOR)
        high_results = try_fdr(high_fdr, options, spectrum_info,
                               spectrum_info_0)
        initial_total_reals = low_results[2]

        for i in range(32):
            debug("adjust infl fdr %s %s -> %s", low_fdr, high_fdr,
                  high_results[0])
            if high_results[0] >= options.fdr:
                break
            low_fdr, low_results = high_fdr, high_results
            high_fdr = min(1.0, high_fdr * FDR_INFLATION_FACTOR)
            if high_fdr == 1.0:
                break
            high_results = try_fdr(high_fdr, options, spectrum_info,
                                   spectrum_info_0)
        else:
            warning("FDR adjustment inflation failed")

        # l        g        h
        # l  g  h

        CONVERGENCE_FACTOR = 0.01
        assert CONVERGENCE_FACTOR > 0
        for i in range(32):
            assert high_fdr >= low_fdr
            if (high_fdr - low_fdr) <= CONVERGENCE_FACTOR*options.fdr:
                break

            # - seems to be slower than the naive method?
            # Bias is empirically determined.  It's supposed to speed
            # convergence, but should not affect the results.
            # FIX: better method possible?
            #GUESS_BIAS = 1.15
            #prop = ((options.fdr - low_results[0])
            #        / (high_results[0] - low_results[0]))
            #prop = min(1, max(0, prop))
            #guess_fdr = low_fdr + prop**GUESS_BIAS * (high_fdr - low_fdr)

            guess_fdr = (high_fdr + low_fdr) / 2.0
            guess_results = try_fdr(guess_fdr, options, spectrum_info,
                                    spectrum_info_0)
            debug("adjust fdr %s %s %s -> %s", low_fdr, guess_fdr, high_fdr,
                  guess_results[0])
            if guess_results[0] > options.fdr:
                high_fdr, high_results = guess_fdr, guess_results
            else:
                low_fdr, low_results = guess_fdr, guess_results
        else:
            warning("FDR adjustment convergence failed")

    if not options.no_adjust_fdr:
        info("FDR adjustment found %s extra real ids",
             low_results[2] - initial_total_reals)

    return low_results
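
# Explanatory note (not original commentary): the adjustment above amounts to
# bracketing plus bisection on the FDR guess.  The guess is inflated by
# FDR_INFLATION_FACTOR until the resulting FDR reaches the goal, and the
# bracket [low_fdr, high_fdr] is then bisected until its width is at most
# CONVERGENCE_FACTOR times the goal FDR.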


def search_with_possible_saturation(options, spectrum_info):

    for saturate in (True, False):
        if saturate and options.no_saturation:
            continue
        results = search_adjusting_fdr(options, spectrum_info)
        (fdr_result, thresholds,
         total_reals, total_decoys, valid_spectrum_info) = results
        if fdr_result <= options.fdr:
            break
        if saturate:
            info('could not achieve FDR with saturation, retrying without')
            options.no_saturation = True

    if options.output_peptides:
        valid_spectrum_info.sort(key=lambda x: x[1])
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            print >> options.output_peptides, \
                ("%+d %s %s %s %.8f %.8f"
                 % (charge, spectrum_name, state, peptide, actual_mass,
                    mass_delta))

    for charge in sorted(thresholds.keys()):
        score_threshold, delta_threshold = thresholds[charge][0:2]
        reals, decoys = thresholds[charge][2:4]
        info("%+d: score %f, delta %.6f -> %s real ids, %s decoys (fdr %.4f)",
             charge, score_threshold, delta_threshold, reals, decoys,
             0.5 * (1 - PPV(reals, decoys)))

    if not options.quiet:
        print ("%s real ids, %s decoys (FDR = %.4f) [p%s]"
               % (total_reals, total_decoys, fdr_result / 2.0,
                  options.minimum_spectra_per_locus))

    return valid_spectrum_info


def print_mass_error_histogram(valid_spectrum_info):
    hist = defaultdict(int)

    for (spectrum_no, spectrum_name, charge, score, delta, state, peptide,
         stripped_peptide, actual_mass, mass_delta,
         loci) in valid_spectrum_info:
        hist[round(mass_delta)] += 1

    pairs_most_first = sorted(((bin, hist[bin]) for bin in hist),
                              key=lambda x: x[1], reverse=True)

    total = sum(count for bin, count in pairs_most_first)

    cumulative_total = 0
    for bin, count in pairs_most_first:
        cumulative_total += count
        print '# %+d\t%s\t%.3f' % (bin, count,
                                   (float(cumulative_total) / total))


def main(args=sys.argv[1:]):
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <sqt-file>...",
                                   description=__doc__,
                                   version=greylag.VERSION)
    pa = parser.add_option
    pa("--decoy-prefix", dest="decoy_prefix",
       default=greylag.DEFAULT_DECOY_PREFIX,
       help='prefix given to locus name of decoy (e.g., shuffled) database'
       ' sequences [default=%r]' % greylag.DEFAULT_DECOY_PREFIX,
       metavar="PREFIX")
    DEFAULT_FDR = 0.01
    pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
       help="false discovery rate [default=%s] (Note that this is the"
       " final resulting FDR after the decoys have been removed.)" % DEFAULT_FDR,
       metavar="PROPORTION")
    pa("--no-adjust-fdr", action="store_true", dest="no_adjust_fdr",
       help="skip adjustment of internal FDR to achieve a final FDR closer to"
       " requested FDR (which takes longer, but may find more real ids in"
       " some cases)")
    pa("--no-saturation", action="store_true", dest="no_saturation",
       help="skip peptide saturation")
    pa("-v", "--verbose", action="store_true", dest="verbose",
       help="be verbose")
    pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
    pa("-l", "--logfile", dest="logfile",
       help="log to FILE instead of stderr", metavar="FILE")
    pa("--output-peptides", dest="output_peptides",
       help="output information about passing peptides to specified file"
       " ('-' for stdout)", metavar="FILENAME")
    pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
       default='0', dest="minimum_trypticity",
       help="drop peptides with too few tryptic ends"
       " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
    DEFAULT_MINIMUM_SPECTRA_PER_LOCUS = 1
    pa("-p", "--minimum-spectra-per-locus", dest="minimum_spectra_per_locus",
       type="int", default=DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
       help="only loci with at least this many spectra are included, and"
       " spectra not associated with such a locus are excluded"
       " [default=%s]" % DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
       metavar="COUNT")
    DEFAULT_MAXIMUM_SP_RANK = 1000000
    pa("--maximum-sp-rank", dest="maximum_sp_rank", type="int",
       default=DEFAULT_MAXIMUM_SP_RANK,
       help="drop peptides with greater Sp rank (the secondary rank in sqt"
       " files, for compatibility with other search programs)"
       " [default=%s]" % DEFAULT_MAXIMUM_SP_RANK, metavar="RANK")
    #pa("--graph", dest="graph",
    #   help='create distribution graphs, using the specified prefix',
    #   metavar="PATH PREFIX")
    pa("-M", "--mass-error-histogram", action="store_true",
       dest="mass_error_histogram",
       help="print a histogram of mass errors (1-Da-wide bins)")
    pa("--debug", action="store_true", dest="debug",
       help="show debug output")
    pa("--mark", action="store_true", dest="mark",
       help="rewrite the input files, changing some validation marks to 'N',"
       " according to filtering")
    pa("--reset-marks", action="store_true", dest="reset_marks",
       help="rewrite the input files, changing all validation marks to 'U'")
    pa("--kill", action="store_true", dest="kill",
       help="rewrite the input files, removing spectra that don't pass"
       " validation, according to filtering")
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    (options, args) = parser.parse_args(args=args)

    if options.copyright:
        print __copyright__
        sys.exit(0)
    if (len(args) < 1 or options.minimum_spectra_per_locus < 0
        or options.maximum_sp_rank < 1):
        parser.print_help()
        sys.exit(1)

    if not (0.0 <= options.fdr < 0.5):
        error("--fdr must be within range [0.0, 0.5)")
    if (sum(1 for x in (options.mark, options.reset_marks, options.kill) if x)
        > 1):
        error("only one of --mark, --reset-marks, --kill may be specified")

    options.minimum_trypticity = int(options.minimum_trypticity)

    greylag.set_logging(options)

    if options.reset_marks:
        write_spectra_to_files(reset_marks(generate_spectra_from_files(args)),
                               args)
        return

    if options.output_peptides:
        if options.output_peptides == '-':
            options.output_peptides = sys.stdout
        else:
            options.output_peptides = open(options.output_peptides, 'w')

    # This is to translate the "effective" FDR, which is the rate after decoys
    # have been stripped out of the results (which is what users really care
    # about), into our internal FDR, which includes decoys.
    options.fdr *= 2

    info("reading spectra")
    spectrum_info = list(remove_charge_aliases(get_spectrum_info(options,
                                     generate_spectra_from_files(args))))

    # valid_spectrum_info is the list of spectrum_info elements that have been
    # chosen as "valid"
    valid_spectrum_info = search_with_possible_saturation(options,
                                                          spectrum_info)

    if options.mass_error_histogram:
        print_mass_error_histogram(valid_spectrum_info)

    if options.debug:
        valid_spectrum_info.sort()
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            debug('#S %s %s %s', charge, score, delta)

    if options.mark or options.kill:
        # Note that we are intentionally reading the input files again here,
        # to avoid having to keep them all in memory.
        info("rewriting spectrum files")
        write_spectra_to_files(set_marks(valid_spectrum_info,
                                         generate_spectra_from_files(args),
                                         options.kill),
                               args)
        info("finished rewriting")


if __name__ == '__main__':
    main()
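
# Illustrative invocation (a sketch; 'sample1.sqt' and 'sample2.sqt' are
# hypothetical input files):
#
#     greylag_validate.py --fdr 0.01 -t 2 --mark sample1.sqt sample2.sqt
#
# This would rewrite the input files in place (keeping '.bak' backups),
# marking spectra that do not pass a 1% FDR at full trypticity with 'N'.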