4 Given a set of sqt files, determine a 'valid' set of identifications that
5 satisfy the specified FDR (false discovery rate) and other specified criteria.
Optionally rewrite the sqt files with invalid spectra marked 'N' or deleted
altogether.
9 Note that a peptide end with an invalid flanking residue (currently any of
10 BJOUXZ) is not considered tryptic if it's an N-terminal end and is considered
11 tryptic if it's a C-terminal end and otherwise qualifies. (Flanking '-' is
12 always considered tryptic.)
16 from __future__
import with_statement
19 greylag, a collection of programs for MS/MS protein analysis
20 Copyright (C) 2006-2008 Stowers Institute for Medical Research
22 This program is free software: you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation, either version 3 of the License, or
25 (at your option) any later version.
27 This program is distributed in the hope that it will be useful,
28 but WITHOUT ANY WARRANTY; without even the implied warranty of
29 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 GNU General Public License for more details.
32 You should have received a copy of the GNU General Public License
33 along with this program. If not, see <http://www.gnu.org/licenses/>.
36 Stowers Institute for Medical Research
38 Kansas City, Missouri 64110
43 from collections
import defaultdict
45 import logging
; from logging
import debug
, info
, warning
48 from pprint
import pprint
55 from greylag
import VERSION
, set_logging
def error(s, *args):
    """Report a fatal error and exit.

    When imported (e.g., under unit test) raise an exception instead of
    exiting, so the test harness can catch it.
    """
    # if we're unit testing, just throw an exception
    if __name__ != "__main__":
        raise Exception((s + " (fatal error)") % args)
    logging.error(s, *args)
    # NOTE(review): exit call reconstructed from garbled source -- confirm
    sys.exit(1)
def generate_spectra_from_files(sqt_filenames):
    """Yield tuples (filename, H+_lines, S_line, [(M_line, mod_lines, L_lines),
    ...]) for the given SQT files.  H+_lines, for example, is a list
    containing H lines and possibly other kinds of lines up to the next S
    line.  (mod_lines are non-L lines following an M-line.)
    """
    for sqt_filename in sqt_filenames:
        with open(sqt_filename) as sqtf:
            # H_lines collects the file's headers; it is shared by every
            # spectrum tuple yielded for this file
            H_lines = []
            S_line, ML_lines = None, []
            for line in sqtf:
                if line.startswith('L\t'):
                    # L line: locus for the most recent M line
                    ML_lines[-1][2].append(line)
                elif line.startswith('M\t'):
                    # M line: start a new (M_line, mod_lines, L_lines) group
                    ML_lines.append((line, [], []))
                elif line.startswith('S\t'):
                    # S line: flush the previous spectrum, if any
                    if S_line:
                        yield (sqt_filename, H_lines, S_line, ML_lines)
                        S_line, ML_lines = None, []
                    S_line = line
                elif not S_line:
                    # read headers up to first S line
                    H_lines.append(line)
                else:
                    # other kind of line following M line
                    ML_lines[-1][1].append(line)
            # flush the final spectrum of the file
            if S_line:
                yield (sqt_filename, H_lines, S_line, ML_lines)
def write_spectra_to_files(spectra, filenames):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', write them to the implied files.  If any of
    filenames already exist, they are renamed with a '.bak' suffix.  Temp
    files are written first, then renamed at the end.  (The filenames
    parameter is necessary because spectra may not contain any entries for a
    given filename, thus necessitating its removal.)
    """

    tmp_suffix = '.tmp.%s' % time.time()

    # filename -> open temp file being written
    filemap = {}

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        sqtf = filemap.get(sqt_filename)
        if sqtf == None:
            # first spectrum for this file: create temp file, write headers
            sqtf = filemap[sqt_filename] = open(sqt_filename + tmp_suffix, 'w')
            for h_line in H_lines:
                sqtf.write(h_line)
        sqtf.write(S_line)
        for m_line, mod_lines, l_lines in ML_lines:
            sqtf.write(m_line)
            for mod_line in mod_lines:
                sqtf.write(mod_line)
            for l_line in l_lines:
                sqtf.write(l_line)

    for fd in filemap.values():
        fd.close()

    # back up the originals, then move the temp files into place; files with
    # no remaining spectra simply lose their non-.bak version
    assert set(filemap.keys()) <= set(filenames)
    for fn in filenames:
        if os.path.exists(fn):
            os.rename(fn, fn + '.bak')
        if os.path.exists(fn + tmp_suffix):
            os.rename(fn + tmp_suffix, fn)
def reset_marks(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) reset to U.

    >>> list(reset_marks([('fn', [], 'S...', [('M... Y', [], [])])]))
    [('fn', [], 'S...', [('M... U', [], [])])]
    """

    # the mark is the final non-whitespace field of the M line
    mark_re = re.compile(r'(\s+)\S(\s*)$')

    for filename, header_lines, s_line, m_groups in spectra:
        remarked = []
        for m_line, mod_lines, l_lines in m_groups:
            remarked.append((mark_re.sub(r'\1U\2', m_line, 1),
                             mod_lines, l_lines))
        yield (filename, header_lines, s_line, remarked)
def set_marks(valid_spectrum_info, spectra, kill=False):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) set to 'N', for spectra not listed in
    valid_spectrum_info.  If 'kill', omit marked spectra altogether.

    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])]))
    [('fn', [], 'S 000...', [('M... U', [], [])]), ('fn', [], 'S 111...', [('M... N', [], [])])]
    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])],
    ...                kill=True))
    [('fn', [], 'S 000...', [('M... U', [], [])])]
    """

    # the mark is the final non-whitespace field of the M line
    mark_re = re.compile(r'\S(\s*)$')
    passing = frozenset(si[0] for si in valid_spectrum_info)

    for number, spectrum in enumerate(spectra):
        if number in passing:
            # passing spectrum: emit unchanged
            yield spectrum
            continue
        if kill:
            # failing spectrum, kill mode: drop it entirely
            continue
        filename, header_lines, s_line, m_groups = spectrum
        marked = [ (mark_re.sub(r'N\1', m_line, 1), mod_lines, l_lines)
                   for m_line, mod_lines, l_lines in m_groups ]
        yield (filename, header_lines, s_line, marked)
# identity translation table (Python 2 string.maketrans), used only so that
# str.translate can delete the characters in non_aa_chars
nulltrans = string.maketrans('','')
# characters that may appear in an annotated peptide but are not residues
non_aa_chars = string.digits + string.punctuation

def strip_mods(s):
    """Given a peptide, return just the unmodified peptide seen.

    >>> strip_mods('G*REY@LAG')
    'GREYLAG'
    >>> strip_mods('GREYLAG')
    'GREYLAG'
    """
    # assuming no control chars present
    return s.translate(nulltrans, non_aa_chars)
def get_spectrum_info(options, spectra):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield an info tuple for each spectrum that
    passes the criteria specified in options (e.g., options.minimum_trypticity
    and options.maximum_sp_rank).

    Each yielded tuple is (spectrum_no, spectrum_name, charge, score, delta,
    state, peptide, stripped_peptide, actual_mass, mass_delta, loci).
    """

    for spectrum_no, (sqt_filename,
                      H_lines, S_line, ML_lines) in enumerate(spectra):
        S_fields = S_line.split('\t')
        current_charge = int(S_fields[3])
        actual_mass = float(S_fields[6])
        # NOTE(review): name layout reconstructed from S line fields -- confirm
        spectrum_name = '.'.join([os.path.splitext(sqt_filename)[0]]
                                 + S_fields[1:4])

        current_score = None
        current_delta = None
        current_peptide_trypticity = None
        current_sp_rank = None
        current_peptide_mass = None
        current_state = set()
        current_loci = set()

        highest_rank_seen = 0
        best_peptide_seen = None
        current_peptide = None

        for M_line, mod_lines, L_lines in ML_lines:
            M_fields = M_line.split('\t')

            rank = int(M_fields[1])
            sp_rank = int(M_fields[2])
            peptide_mass = float(M_fields[3])
            score = float(M_fields[5])
            peptide = M_fields[9].strip().upper() # e.g., A.B@CD*.-
            assert score >= 0 and rank >= 1 and sp_rank >= 1

            if rank > highest_rank_seen + 1:
                # ignore aux top SpRank hits, because delta confounds
                break
            highest_rank_seen = rank

            if current_score == None:
                if score <= 0:
                    break           # spectrum is total crap
                current_score = score

            # Don't trust delta from input file.
            delta = (current_score - score) / current_score

            assert peptide[1] == '.' and peptide[-2] == '.'
            peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
            peptide = peptide[2:-2]                    # 'B@CD*'

            if current_peptide == None:
                current_peptide = peptide
                current_peptide_mass = peptide_mass
                current_peptide_trypticity = 0
                # NB: K*, R* not tryptic!
                if (peptide_flanks[0] in ('K', 'R') and peptide[0] != 'P'
                    or peptide_flanks[0] == '-'):
                    current_peptide_trypticity += 1
                if (peptide[-1] in ('K', 'R') and peptide_flanks[1] != 'P'
                    or peptide_flanks[1] == '-'):
                    current_peptide_trypticity += 1
                current_sp_rank = sp_rank

            # Choose delta from first M line such that peptide differs from
            # peptide in initial (top) M line.  We consider differently
            # modified versions of the same peptide to be different (at least
            # for now).  But, if there are two peptides that differ only in
            # their choice of I (isoleucine) or L (leucine), consider them
            # identical and keep looking for a delta.
            if peptide.replace('I', 'L') != current_peptide.replace('I', 'L'):
                current_delta = delta
                # NOTE(review): break reconstructed -- confirm
                break

            for L_line in L_lines:
                locus = L_line.split('\t')[1]
                current_loci.add(locus)
                if locus.startswith(options.decoy_prefix):
                    current_state.add('decoy')
                else:
                    current_state.add('real')

        if (current_score != None and current_delta != None
            and current_peptide_trypticity >= options.minimum_trypticity
            and current_sp_rank <= options.maximum_sp_rank):

            if len(current_state) == 2:
                current_state = 'both'
            else:
                assert len(current_state) == 1
                current_state = current_state.pop()

            yield (spectrum_no, spectrum_name, current_charge, current_score,
                   current_delta, current_state, current_peptide,
                   strip_mods(current_peptide), actual_mass,
                   actual_mass-current_peptide_mass, frozenset(current_loci))
# FIX: can this be more succinct?
def remove_charge_aliases(spectrum_generator):
    """Given a generator of spectra like get_spectrum_info, for spectra that
    received multiple id's at different charges, yield only the best id.
    (Probably unnecessary unless the precursor tolerance is wide.)
    """

    sp_info_0 = None
    for sp_info_1 in spectrum_generator:
        assert sp_info_1[2] < 10, "single-digit charge"
        if not sp_info_0:
            sp_info_0 = sp_info_1
            continue
        # spectrum names of charge aliases differ only in the trailing
        # (single-digit) charge, so compare the names sans last character
        if sp_info_0[1][:-1] != sp_info_1[1][:-1]:
            yield sp_info_0
            sp_info_0 = sp_info_1
            continue
        # if at least one has sufficiently small mass delta, choose the
        # one with the smaller
        if abs(sp_info_0[9]) < abs(sp_info_1[9]) < 10:
            continue
        if abs(sp_info_1[9]) < abs(sp_info_0[9]) < 10:
            sp_info_0 = sp_info_1
            continue
        # otherwise choose the better-scoring one
        if sp_info_0[3] < sp_info_1[3]:
            sp_info_0 = sp_info_1
    if sp_info_0:
        yield sp_info_0
337 # - repeat, adjusting fdr upwards (or downwards?) to meet user goal
338 # repeat until set of valid spectra converges:
339 # - do pr filter of spectra
340 # - if not first time, and no spectra filtered out, break
341 # - separately for each charge, choose valid spectra to maximize real ids subject to FDR
342 # - saturate (locally or completely)
def PPV(reals, decoys):
    """Returns the estimated Positive Predictive Value (== 1 - FDR), given
    counts of reals and decoys.

    >>> PPV(100, 0)
    1.0
    >>> PPV(10, 10)
    0.0
    >>> PPV(90, 10)
    0.8
    """

    # We know that the decoys are false positives, and we estimate that an
    # equal number of the "reals" are actually false, too.
    false_positives = 2*decoys
    true_positives = reals - decoys

    # no estimated true positives at all (also guards the division below)
    if true_positives <= 0:
        return 0.0

    return float(true_positives) / (true_positives + false_positives)
def calculate_inner_threshold(fdr, spinfo):
    """Find the lowest score threshold giving PPV >= 1-fdr for the given
    spectra.  Returns (threshold, passing_reals, passing_decoys); threshold is
    None if the goal cannot be met.
    """
    ppv_goal = 1 - fdr

    spinfo = sorted(spinfo, key=lambda x: x[3])

    if not spinfo:
        return (None, 0, 0)

    # tiny bump so the threshold strictly excludes a rejected score
    epsilon = +1e-6

    real_count = sum(1 for x in spinfo if x[5] == 'real')
    decoy_count = sum(1 for x in spinfo if x[5] == 'decoy')

    current_threshold = -1e100      # allow all spectra
    for n, sp in enumerate(spinfo):
        if real_count == 0:
            return (None, real_count, decoy_count) # give up

        ppv_est = PPV(real_count, decoy_count)
        if ppv_est >= ppv_goal:
            break
        # drop this (lowest-scoring) spectrum from the counts
        if sp[5] == 'real':
            real_count -= 1
        elif sp[5] == 'decoy':
            decoy_count -= 1
        # set threshold just high enough to exclude this spectrum
        current_threshold = sp[3] + epsilon
    else:
        current_threshold = spinfo[-1][3] + epsilon # couldn't meet goal

    return (current_threshold, real_count, decoy_count)
def calculate_combined_thresholds(fdr, spectrum_info):
    """Find best score/delta thresholds for each charge.

    Returns a map charge -> (score, delta, passing_reals, passing_decoys).
    """

    # Rather than search every possible value of delta, we're only going to
    # "sample" at this granularity.  This cuts search time dramatically (and
    # makes it O(n) instead of O(n**2).  Extra precision wouldn't really be
    # useful in any case.
    SEARCH_GRANULARITY = 0.001

    # charge -> (score, delta, passing_reals, passing_decoys)
    thresholds = {}

    by_charge = lambda x: x[2]
    spectrum_info.sort(key=by_charge)
    for charge, spinfo in itertools.groupby(spectrum_info, key=by_charge):
        # sorted by delta, descending, so the smallest delta is at the end
        spinfo0 = sorted(spinfo, key=lambda x: x[4], reverse=True)

        last_value = None

        # repeatedly strip the smallest-delta spectrum, looking for the
        # delta cutoff whose inner (score) threshold passes the most reals
        while spinfo0:
            this_value = spinfo0[-1][4] # current delta
            if (last_value == None
                or abs(this_value - last_value) >= SEARCH_GRANULARITY):

                r = calculate_inner_threshold(fdr, spinfo0)
                threshold, real_count, decoy_count = r
                if threshold != None:
                    debug('# %s %s %s %s %s', charge, threshold, this_value,
                          real_count, decoy_count)
                    if (charge not in thresholds
                        or real_count > thresholds[charge][2]):
                        thresholds[charge] = (threshold, this_value,
                                              real_count, decoy_count)
                last_value = this_value
            spinfo0.pop()

    return thresholds
def flatten(gen_of_gens):
    """Yield items from a generator of generators.

    >>> list(flatten([(1,2,3), (4, (5, 'b'), 6)]))
    [1, 2, 3, 4, (5, 'b'), 6]
    """

    # flattens exactly one level; inner iterables are yielded as-is
    for gen in gen_of_gens:
        for item in gen:
            yield item
def redundancy_filter(options, spectrum_info):
    """Return a sorted list of the spectra associated with at least one locus
    having options.minimum_spectra_per_locus spectra (trivially all of them,
    if that minimum is less than 2).
    """
    if options.minimum_spectra_per_locus < 2:
        return [ si for si in spectrum_info ]

    # si = (spectrum_no, spectrum_name, charge, score, delta, state,
    #       peptide, stripped_peptide, actual_mass, mass_delta, loci)

    # (locus, si) pairs, sorted so groupby below sees each locus contiguously
    Rs = sorted(flatten(((locus, si) for locus in si[10])
                        for si in spectrum_info))

    result = set()
    for locus, locus_Rs in itertools.groupby(Rs, key=lambda x: x[0]):
        list_locus_si = [ si for locus, si in locus_Rs ]
        if len(list_locus_si) < options.minimum_spectra_per_locus:
            continue
        result |= set(list_locus_si)
    return sorted(result)
def saturate_by_prefix_suffix(spectrum_info, saturation_spectrum_info):
    """Return the (sorted) members of saturation_spectrum_info whose stripped
    peptide prefix matches that of some member of spectrum_info."""
    PREFIX_LENGTH = 16                  # empirically seems good
    peptide_prefixes = set(si[7][:PREFIX_LENGTH] for si in spectrum_info)
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7][:PREFIX_LENGTH] in peptide_prefixes)
    info("saturation by prefix (w/%s): %s -> %s",
         len(saturation_spectrum_info), len(spectrum_info), len(sat_si))
    return sat_si
def saturate(spectrum_info, saturation_spectrum_info):
    """Return the members of saturation_spectrum_info whose stripped peptide
    also appears in spectrum_info, further saturated by peptide prefix."""
    passing_peptides = set(si[7] for si in spectrum_info)
    # FIX: does this actually need to be sorted?
    kept = [ ssi for ssi in saturation_spectrum_info
             if ssi[7] in passing_peptides ]
    kept.sort()
    info("saturation (w/%s): %s -> %s", len(saturation_spectrum_info),
         len(spectrum_info), len(kept))
    return saturate_by_prefix_suffix(kept, saturation_spectrum_info)
def fdr_stats(spectrum_info):
    """Return (fdr, reals, decoys) for the given spectrum info tuples."""
    reals = decoys = 0
    for si in spectrum_info:
        # si[5] is the state; 'both' counts as neither real nor decoy
        if si[5] == 'real':
            reals += 1
        elif si[5] == 'decoy':
            decoys += 1
    fdr = 1 - PPV(reals, decoys)
    return fdr, reals, decoys
def debug_fdr(options, msg, si):
    """Log FDR statistics for si, prefixed by msg."""
    # NOTE(review): the options.debug guard was reconstructed from garbled
    # source -- confirm
    if options.debug:
        fdr, reals, decoys = fdr_stats(si)
        info('%s %s %s %s', msg, fdr, reals, decoys)
def try_fdr(fdr_guess, options, remaining_spectrum_info,
            saturation_spectrum_info=None):
    """Filter spectra at the given FDR guess.  Returns (fdr_result,
    thresholds, total_reals, total_decoys, passing_spectrum_info).
    """
    info("trying FDR = %s", fdr_guess)
    debug_fdr(options, "initial", remaining_spectrum_info)

    remaining_spectrum_info_1 = None
    # alternate thresholding and redundancy filtering until the passing set
    # stops shrinking
    while True:
        thresholds = calculate_combined_thresholds(fdr_guess,
                                                   remaining_spectrum_info)
        remaining_spectrum_info = [ si for si in remaining_spectrum_info
                                    if (si[2] in thresholds
                                        and si[3] >= thresholds[si[2]][0]
                                        and si[4] >= thresholds[si[2]][1]) ]
        debug_fdr(options, "after thresholding", remaining_spectrum_info)
        if remaining_spectrum_info == remaining_spectrum_info_1:
            break
        remaining_spectrum_info_1 = redundancy_filter(options,
                                                      remaining_spectrum_info)
        debug_fdr(options, "after redundancy", remaining_spectrum_info_1)
        if len(remaining_spectrum_info_1) == len(remaining_spectrum_info):
            break
        remaining_spectrum_info = remaining_spectrum_info_1

    fdr_result, total_reals, total_decoys \
        = fdr_stats(remaining_spectrum_info_1)
    if not options.no_saturation and saturation_spectrum_info:
        remaining_spectrum_info_1 = saturate(remaining_spectrum_info_1,
                                             saturation_spectrum_info)
        debug_fdr(options, "after saturation", remaining_spectrum_info_1)
        (fdr_result,
         total_reals, total_decoys) = fdr_stats(remaining_spectrum_info_1)

    return (fdr_result, thresholds, total_reals, total_decoys,
            remaining_spectrum_info_1)
541 # FIX: at various points here we may have a goal of a minimum number of real
542 # ids, based on goal FDR or whatever. We can pass this inward to do
543 # branch-and-bound, which may save time.
545 # FIX: this is now somewhat slower than before--any way to speed it up?
def search_adjusting_fdr(options, spectrum_info_0):
    """Repeatedly calculate thresholds while adjusting FDR, looking for an FDR
    guess that results in an appropriate goal FDR.  This might be needed
    because the thresholds found for per-charge FDRs will be saturated or have
    quantization errors, resulting in overly conservative thresholds.  Returns
    the result tuple of try_fdr for the best guess found.
    """

    FDR_INFLATION_FACTOR = 3
    assert FDR_INFLATION_FACTOR > 1

    spectrum_info = redundancy_filter(options, spectrum_info_0)

    low_fdr = options.fdr
    # FIX: spectrum_info_0 has already been filtered for tryptic status
    # Should we use an unfiltered set instead?
    low_results = try_fdr(low_fdr, options, spectrum_info, spectrum_info_0)

    if not options.no_adjust_fdr:
        high_fdr = min(1.0, options.fdr * FDR_INFLATION_FACTOR)
        high_results = try_fdr(high_fdr, options, spectrum_info,
                               spectrum_info_0)
        initial_total_reals = low_results[2]

        # inflate high_fdr until its resulting FDR brackets the goal
        # NOTE(review): iteration bound reconstructed -- confirm
        for i in range(10):
            debug("adjust infl fdr %s %s -> %s", low_fdr, high_fdr,
                  high_results[0])
            if high_results[0] >= options.fdr:
                break
            if high_fdr == 1.0:
                break
            low_fdr, low_results = high_fdr, high_results
            high_fdr = min(1.0, high_fdr * FDR_INFLATION_FACTOR)
            high_results = try_fdr(high_fdr, options, spectrum_info,
                                   spectrum_info_0)
        else:
            warning("FDR adjustment inflation failed")

        # binary search the bracketed interval until it's narrow enough
        CONVERGENCE_FACTOR = 0.01
        assert CONVERGENCE_FACTOR > 0
        # NOTE(review): iteration bound reconstructed -- confirm
        for i in range(32):
            assert high_fdr >= low_fdr
            if (high_fdr - low_fdr) <= CONVERGENCE_FACTOR*options.fdr:
                break

            # - seems to be slower than the naive method?
            # Bias is empirically determined.  It's supposed to speed
            # convergence, but should not affect the results.
            # FIX: better method possible?
            #prop = ((options.fdr - low_results[0])
            #        / (high_results[0] - low_results[0]))
            #prop = min(1, max(0, prop))
            #guess_fdr = low_fdr + prop**GUESS_BIAS * (high_fdr - low_fdr)

            guess_fdr = (high_fdr + low_fdr) / 2.0
            guess_results = try_fdr(guess_fdr, options, spectrum_info,
                                    spectrum_info_0)
            debug("adjust fdr %s %s %s -> %s", low_fdr, guess_fdr, high_fdr,
                  guess_results[0])
            if guess_results[0] > options.fdr:
                high_fdr, high_results = guess_fdr, guess_results
            else:
                low_fdr, low_results = guess_fdr, guess_results
        else:
            warning("FDR adjustment convergence failed")

    if not options.no_adjust_fdr:
        info("FDR adjustment found %s extra real ids",
             low_results[2] - initial_total_reals)

    return low_results
def search_with_possible_saturation(options, spectrum_info):
    """Run the FDR search, first with saturation (unless disabled), falling
    back to no saturation if the goal FDR cannot be met.  Prints summary
    statistics and returns the valid spectrum info list.
    """

    # NB: the loop variable deliberately shadows the saturate function here
    for saturate in (True, False):
        if saturate and options.no_saturation:
            continue
        results = search_adjusting_fdr(options, spectrum_info)
        (fdr_result, thresholds,
         total_reals, total_decoys, valid_spectrum_info) = results
        if fdr_result <= options.fdr:
            break
        info('could not achieve FDR with saturation, retrying without')
        options.no_saturation = True

    if options.output_peptides:
        valid_spectrum_info.sort(key=lambda x: x[1])
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            print >> options.output_peptides, \
                ("%+d %s %s %s %.8f %.8f"
                 % (charge, spectrum_name, state, peptide, actual_mass,
                    mass_delta))

    for charge in sorted(thresholds.keys()):
        score_threshold, delta_threshold = thresholds[charge][0:2]
        reals, decoys = thresholds[charge][2:4]
        info("%+d: score %f, delta %.6f -> %s real ids, %s decoys (fdr %.4f)",
             charge, score_threshold, delta_threshold, reals, decoys,
             0.5 * (1 - PPV(reals, decoys)))

    if not options.quiet:
        print ("%s real ids, %s decoys (FDR = %.4f) [p%s]"
               % (total_reals, total_decoys, fdr_result / 2.0,
                  options.minimum_spectra_per_locus))

    return valid_spectrum_info
666 def print_mass_error_histogram(valid_spectrum_info
):
667 hist
= defaultdict(int)
669 for (spectrum_no
, spectrum_name
, charge
, score
, delta
, state
, peptide
,
670 stripped_peptide
, actual_mass
, mass_delta
,
671 loci
) in valid_spectrum_info
:
672 hist
[round(mass_delta
)] += 1
674 pairs_most_first
= sorted(((bin
, hist
[bin
]) for bin
in hist
),
675 key
=lambda x
: x
[1], reverse
=True)
677 total
= sum(count
for bin
, count
in pairs_most_first
)
680 for bin
, count
in pairs_most_first
:
681 cumulative_total
+= count
682 print '# %+d\t%s\t%.3f' % (bin
, count
,
683 (float(cumulative_total
) / total
))
686 def main(args
=sys
.argv
[1:]):
687 parser
= optparse
.OptionParser(usage
=
688 "usage: %prog [options] <sqt-file>...",
689 description
=__doc__
, version
=VERSION
)
690 pa
= parser
.add_option
691 DEFAULT_DECOY_PREFIX
= "SHUFFLED_"
692 pa("--decoy-prefix", dest
="decoy_prefix", default
=DEFAULT_DECOY_PREFIX
,
693 help='prefix given to locus name of decoy (e.g., shuffled) database'
694 ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX
, metavar
="PREFIX")
696 pa("--fdr", dest
="fdr", type="float", default
=DEFAULT_FDR
,
697 help="false discovery rate [default=%s] (Note that this is the"
698 " final resulting FDR after the decoys have been removed" % DEFAULT_FDR
,
699 metavar
="PROPORTION")
700 pa("--no-adjust-fdr", action
="store_true", dest
="no_adjust_fdr",
701 help="skip adjustment of internal FDR to achieve a final FDR closer to"
702 " requested FDR (which takes longer, but may find more real ids in"
704 pa("--no-saturation", action
="store_true", dest
="no_saturation",
705 help="skip peptide saturation")
706 pa("-v", "--verbose", action
="store_true", dest
="verbose",
708 pa("-q", "--quiet", action
="store_true", dest
="quiet", help="no warnings")
709 pa("-l", "--logfile", dest
="logfile",
710 help="log to FILE instead of stderr", metavar
="FILE")
711 pa("--output-peptides", dest
="output_peptides",
712 help="output information about passing peptides to specified file"
713 " ('-' for stdout)", metavar
="FILENAME")
714 pa("-t", "--minimum-trypticity", type="choice", choices
=('0', '1', '2'),
715 default
='0', dest
="minimum_trypticity",
716 help="drop peptides with too few tryptic ends"
717 " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar
="TRYPTICITY")
718 DEFAULT_MINIMUM_SPECTRA_PER_LOCUS
= 1
719 pa("-p", "--minimum-spectra-per-locus", dest
="minimum_spectra_per_locus",
720 type="int", default
=DEFAULT_MINIMUM_SPECTRA_PER_LOCUS
,
721 help="only loci with at least this many spectra are included, and"
722 " spectra not associated with such a locus are excluded"
723 " [default=%s]" % DEFAULT_MINIMUM_SPECTRA_PER_LOCUS
,
725 DEFAULT_MAXIMUM_SP_RANK
= 1000000
726 pa("--maximum-sp-rank", dest
="maximum_sp_rank", type="int",
727 default
=DEFAULT_MAXIMUM_SP_RANK
,
728 help="drop peptides with greater Sp rank (the secondary rank in sqt"
729 " files, for compatibility with other search programs)"
730 " [default=%s]" % DEFAULT_MAXIMUM_SP_RANK
, metavar
="RANK")
731 #pa("--graph", dest="graph",
732 # help='create distribution graphs, using the specified prefix',
733 # metavar="PATH PREFIX")
734 pa("-M", "--mass-error-histogram", action
="store_true",
735 dest
="mass_error_histogram",
736 help="print a histogram of mass errors (1-Da-wide bins)")
737 pa("--debug", action
="store_true", dest
="debug",
738 help="show debug output")
739 pa("--mark", action
="store_true", dest
="mark",
740 help="rewrite the input files, changing some validation marks to 'N',"
741 " according to filtering")
742 pa("--reset-marks", action
="store_true", dest
="reset_marks",
743 help="rewrite the input files, changing all validation marks to 'U'")
744 pa("--kill", action
="store_true", dest
="kill",
745 help="rewrite the input files, removing spectra that don't pass"
746 " validation, according to filtering")
747 pa("--copyright", action
="store_true", dest
="copyright",
748 help="print copyright and exit")
749 (options
, args
) = parser
.parse_args(args
=args
)
751 if options
.copyright
:
755 if (len(args
) < 1 or options
.minimum_spectra_per_locus
< 0
756 or options
.maximum_sp_rank
< 1):
760 if not (0.0 <= options
.fdr
< 0.5):
761 error("--fdr must be within range [0.0, 0.5)")
762 if (sum(1 for x
in (options
.mark
, options
.reset_marks
, options
.kill
) if x
)
764 error("only one of --mark, --reset-marks, --kill may be specified")
766 options
.minimum_trypticity
= int(options
.minimum_trypticity
)
770 if options
.reset_marks
:
771 write_spectra_to_files(reset_marks(generate_spectra_from_files(args
)),
775 if options
.output_peptides
:
776 if options
.output_peptides
== '-':
777 options
.output_peptides
= sys
.stdout
779 options
.output_peptides
= open(options
.output_peptides
, 'w')
781 # This is to translate the "effective" FDR, which is the rate after decoys
782 # have been stripped out of the results (which is what users really care
783 # about), into our internal FDR, which includes decoys.
786 info("reading spectra")
787 spectrum_info
= list(remove_charge_aliases(get_spectrum_info(options
,
788 generate_spectra_from_files(args
))))
790 # valid_spectrum_info is the list of spectrum_info elements that have been
792 valid_spectrum_info
= search_with_possible_saturation(options
, spectrum_info
)
794 if options
.mass_error_histogram
:
795 print_mass_error_histogram(valid_spectrum_info
)
798 valid_spectrum_info
.sort()
799 for (spectrum_no
, spectrum_name
, charge
, score
, delta
, state
,
800 peptide
, stripped_peptide
, actual_mass
,
801 mass_delta
, loci
) in valid_spectrum_info
:
802 debug('#S %s %s %s', charge
, score
, delta
)
804 if options
.mark
or options
.kill
:
805 # Note that we are intentionally reading the input files again here,
806 # to avoid having to keep them all in memory.
807 info("rewriting spectrum files")
808 write_spectra_to_files(set_marks(valid_spectrum_info
,
809 generate_spectra_from_files(args
),
812 info("finished rewriting")
815 if __name__
== '__main__':