4 Given a set of sqt files, determine a 'valid' set of identifications that
5 satisfy the specified FDR (false discovery rate). Optionally rewrite the sqt
6 files with validation marks set to 'N' for filtered-out spectra.
8 Note that a peptide end with an invalid flanking residue (currently any of
9 BJOUXZ) is not considered tryptic if it's an N-terminal end and is considered
10 tryptic if it's a C-terminal end and otherwise qualifies. (Flanking '-' is
11 always considered tryptic.)
15 from __future__
import with_statement
18 greylag, a collection of programs for MS/MS protein analysis
19 Copyright (C) 2006-2008 Stowers Institute for Medical Research
21 This program is free software: you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation, either version 3 of the License, or
24 (at your option) any later version.
26 This program is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 GNU General Public License for more details.
31 You should have received a copy of the GNU General Public License
32 along with this program. If not, see <http://www.gnu.org/licenses/>.
35 Stowers Institute for Medical Research
37 Kansas City, Missouri 64110
42 from collections
import defaultdict
45 from pprint
import pprint
50 # allow this script to be run even if not "installed"
52 from greylag
import VERSION
58 print >> sys
.stderr
, 'warning:', s
60 sys
.exit('error: ' + s
)
61 def fileerror(s
, *args
):
62 error(s
+ (", at line %s of file '%s'"
63 % (fileinput
.filelineno(), fileinput
.filename())),
66 def inplace_warning():
67 warn("\n!!!\nan error occurred while modifying .sqt files in-place--it may"
68 " be necessary to recover some or all of the .sqt files from the"
69 " corresponding .sqt.bak files.\n!!!\n")
72 def reset_marks(options
, sqt_fns
):
73 """Rewrite all evaluation marks to 'U', in-place."""
76 for line
in fileinput
.input(sqt_fns
, inplace
=1, backup
='.bak'):
77 if line
.startswith("M\t"):
80 fs
[10] = 'U' + fs
[10][1:]
82 sys
.stdout
.write(line
)
88 def mark(options
, thresholds
, sp_scores
, sqt_fns
):
89 """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
90 score and delta thresholds."""
93 mark_spectrum
= False # mark this spectrum 'N'
96 for line
in fileinput
.input(sqt_fns
, inplace
=1, backup
='.bak'):
97 if line
.startswith("S\t"):
99 sps
= sp_scores
[spectrum_no
]
101 charge
, score
, delta
= sp_scores
[spectrum_no
]
102 mark_spectrum
= (charge
in thresholds
104 and (score
< thresholds
[charge
][0]
105 or delta
< thresholds
[charge
][1]))
108 elif line
.startswith("M\t") and mark_spectrum
:
109 fs
= line
.split("\t")
111 fs
[10] = 'N' + fs
[10][1:]
113 sys
.stdout
.write(line
)
119 def kill(options
, thresholds
, sp_scores
, sqt_fns
):
120 """Remove, in-place, spectra not meeting score and delta thresholds."""
123 remove_spectrum
= False # remove this spectrum
126 for line
in fileinput
.input(sqt_fns
, inplace
=1, backup
='.bak'):
127 if fileinput
.isfirstline():
128 remove_spectrum
= False
129 if line
.startswith("S\t"):
131 sps
= sp_scores
[spectrum_no
]
133 charge
, score
, delta
= sp_scores
[spectrum_no
]
134 remove_spectrum
= (charge
in thresholds
136 and (score
< thresholds
[charge
][0]
137 or delta
< thresholds
[charge
][1]))
139 remove_spectrum
= True
140 if not remove_spectrum
:
141 sys
.stdout
.write(line
)
147 nulltrans
= string
.maketrans('','')
148 non_aa_chars
= string
.digits
+ string
.punctuation
151 """Given a peptide, return just the unmodified peptide seen (upcased)."""
152 # assuming no control chars present
153 return s
.upper().translate(nulltrans
, non_aa_chars
)
156 def read_sqt_info(decoy_prefix
, minimum_trypticity
, sqt_fns
):
157 """Return a pair, the first a dict mapping each charge to a list of
158 (score, delta, state), where state is 'real' or 'decoy', for all the
159 spectra in sqt_fns, and the second a list of (charge, score, delta), one
160 for each spectrum. Spectra lacking minimum_trypticity are dropped.
163 # charge -> [ (score, delta, state, spectrum_name, peptide), ... ]
164 # where state is either 'real' or 'decoy'
165 # and peptide is the peptide as first seen, with mods, no flanks
166 z_scores
= defaultdict(list)
168 # information about spectra, in file order--None iff spectrum was filtered
170 # [ (charge, score, delta) or None, ... ]
173 current_charge
= None
176 current_peptide_trypticity
= None # or one of [ 0, 1, 2 ]
177 current_state
= set() # 'real', 'decoy' or both
178 highest_rank_seen
= 0
179 best_peptide_seen
= None
180 current_peptide
= None
183 for line
in fileinput
.input(sqt_fns
):
184 fs
= line
.split('\t')
187 if (current_charge
!= None and
188 current_peptide_trypticity
>= minimum_trypticity
):
189 if current_score
!= None:
190 if len(current_state
) == 1:
191 z_scores
[current_charge
].append((current_score
,
196 sps
= (current_charge
, current_score
, current_delta
)
197 if current_charge
!= None:
198 sp_scores
.append(sps
)
199 current_charge
= int(fs
[3])
202 current_peptide_trypticity
= None
203 current_state
= set()
204 highest_rank_seen
= 0
205 best_peptide_seen
= None
206 current_peptide
= None
207 spectrum_name
= '.'.join((fileinput
.filename(),
208 fs
[1], fs
[2], fs
[3]))
214 peptide
= fs
[9].strip() # e.g., A.B@CD*.-
217 if rank
> highest_rank_seen
+ 1:
218 # ignore aux top SpRank hits, because delta confounds
220 highest_rank_seen
= rank
222 assert peptide
[1] == '.' and peptide
[-2] == '.'
223 peptide_flanks
= (peptide
[0], peptide
[-1]) # ('A', '-')
224 peptide
= peptide
[2:-2] # 'B@CD*'
225 peptide_stripped
= stripmods(peptide
) # 'BCD'
227 if best_peptide_seen
== None:
228 best_peptide_seen
= peptide_stripped
229 if current_peptide
== None:
230 current_peptide
= peptide
232 # MyriMatch gives delta=0 when score=0, even if there are better
236 assert current_delta
== 0
237 current_score
= score
238 elif (current_delta
== 0 or best_peptide_seen
== None
239 or best_peptide_seen
== peptide_stripped
):
240 current_delta
= delta
241 if best_peptide_seen
!= peptide_stripped
:
242 # once we've seen a non-equivalent peptide, don't set
243 # delta if we see further equivalent peptides
244 best_peptide_seen
= '####'
245 if current_peptide_trypticity
== None:
246 current_peptide_trypticity
= 0
247 if (peptide_flanks
[0] in ('K', 'R')
248 and peptide_stripped
[0] != 'P'
249 or peptide_flanks
[0] == '-'):
250 current_peptide_trypticity
+= 1
251 if (peptide_stripped
[-1] in ('K', 'R')
252 and peptide_flanks
[1] != 'P'
253 or peptide_flanks
[1] == '-'):
254 current_peptide_trypticity
+= 1
256 if current_delta
== 0:
257 if fs
[1].startswith(decoy_prefix
):
258 current_state
.add('decoy')
260 current_state
.add('real')
262 # handle final spectrum, as above
263 # FIX: factor out this common code (using a generator?)
265 if (current_charge
!= None and
266 current_peptide_trypticity
>= minimum_trypticity
):
267 if current_score
!= None:
268 if len(current_state
) == 1:
269 z_scores
[current_charge
].append((current_score
,
274 sps
= (current_charge
, current_score
, current_delta
)
275 if current_charge
!= None:
276 sp_scores
.append(sps
)
277 return (z_scores
, sp_scores
)
280 def PPV(reals
, decoys
):
281 """Returns the estimated Positive Predictive Value (== 1 - FDR), given
282 counts of reals and decoys."""
284 # We know that the decoys are false positives, and we estimate that an
285 # equal number of the "reals" are actually false, too.
286 false_positives
= 2*decoys
287 true_positives
= reals
- decoys
289 return float(true_positives
) / (true_positives
+ false_positives
)
292 def calculate_inner_threshold(fdr
, charge
, spinfo
):
295 spinfo
= sorted(spinfo
, key
=lambda x
: x
[0])
299 real_count
= sum(1 for x
in spinfo
if x
[2] == 'real')
300 decoy_count
= len(spinfo
) - real_count
302 current_threshold
= -1e100
# allow all spectra
303 for n
, sp
in enumerate(spinfo
):
305 return (None, real_count
, decoy_count
) # give up
307 ppv_est
= PPV(real_count
, decoy_count
)
308 if ppv_est
>= ppv_goal
:
314 # set threshold just high enough to exclude this spectrum
315 current_threshold
= sp
[0] + epsilon
317 current_threshold
= spinfo
[-1][0] + epsilon
# couldn't meet goal
319 return (current_threshold
, real_count
, decoy_count
)
322 def calculate_combined_thresholds(options
, z_scores
):
323 """Find best score/delta thresholds for each charge."""
325 ppv_goal
= 1 - options
.fdr
327 # Rather than search every possible value of delta, we're only going to
328 # "sample" at this granularity. This cuts search time dramatically (and
329 # makes it O(n) instead of O(n**2). Extra precision wouldn't really be
330 # useful in any case.
331 SEARCH_GRANULARITY
= 0.001
333 # charge -> (score, delta, passing_reals, passing_decoys)
336 total_reals
, total_decoys
= 0, 0
338 for charge
, spinfo
in z_scores
.iteritems():
339 spinfo0
= sorted(spinfo
, key
=lambda x
: x
[1], reverse
=True)
344 this_value
= spinfo0
[-1][1] # current delta
345 if (last_value
== None
346 or abs(this_value
- last_value
) >= SEARCH_GRANULARITY
):
349 r
= calculate_inner_threshold(options
.fdr
, charge
, spinfo0
)
352 print '#', charge
, r
[0], this_value
, r
[1], r
[2]
353 if (charge
not in thresholds
354 or r
[1] > thresholds
[charge
][2]):
355 thresholds
[charge
] = (r
[0], this_value
, r
[1], r
[2])
357 last_value
= this_value
360 if charge
in thresholds
:
361 score_threshold
, delta_threshold
= thresholds
[charge
][0:2]
363 if options
.list_peptides
:
364 for (score
, delta
, state
,
365 spectrum_name
, peptide
) in z_scores
[charge
]:
366 if score
>= score_threshold
and delta
>= delta_threshold
:
367 print "%+d %s %s %s" % (charge
, spectrum_name
, state
,
370 reals
, decoys
= thresholds
[charge
][2], thresholds
[charge
][3]
372 total_decoys
+= decoys
373 print ("%+d: score %s, delta %s -> %s real ids, %s decoys"
375 % (charge
, score_threshold
, delta_threshold
,
377 1 - PPV(thresholds
[charge
][2],
378 thresholds
[charge
][3])))
380 warn("could not calculate thresholds for %+d" % charge
)
382 if not options
.list_peptides
:
383 print "# total: %s real ids, %s decoys" % (total_reals
, total_decoys
)
388 def main(args
=sys
.argv
[1:]):
389 parser
= optparse
.OptionParser(usage
=
390 "usage: %prog [options] <sqt-file>...",
391 description
=__doc__
, version
=VERSION
)
392 pa
= parser
.add_option
393 DEFAULT_DECOY_PREFIX
= "SHUFFLED_"
394 pa("--decoy-prefix", dest
="decoy_prefix", default
=DEFAULT_DECOY_PREFIX
,
395 help='prefix given to locus name of decoy (e.g., shuffled) database'
396 ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX
, metavar
="PREFIX")
398 pa("--fdr", dest
="fdr", type="float", default
=DEFAULT_FDR
,
399 help="false discovery rate [default=%s] (Note that this is the"
400 " final resulting FDR after the decoys have been removed" % DEFAULT_FDR
,
401 metavar
="PROPORTION")
402 pa("-v", "--verbose", action
="store_true", dest
="verbose",
404 pa("--list-peptides", action
="store_true", dest
="list_peptides",
405 help="print information about passing peptides")
406 pa("-t", "--minimum-trypticity", type="choice", choices
=('0', '1', '2'),
407 default
='0', dest
="minimum_trypticity",
408 help="drop peptides with too few tryptic ends"
409 " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar
="TRYPTICITY")
410 #pa("--graph", dest="graph",
411 # help='create distribution graphs, using the specified prefix',
412 # metavar="PATH PREFIX")
413 pa("--debug", action
="store_true", dest
="debug",
414 help="show debug output")
415 pa("-m", "--mark", action
="store_true", dest
="mark",
416 help="rewrite the input files, changing some validation marks to 'N',"
417 " according to filtering")
418 pa("--reset-marks", action
="store_true", dest
="reset_marks",
419 help="rewrite the input files, changing all validation marks to 'U'")
420 pa("-k", "--kill", action
="store_true", dest
="kill",
421 help="rewrite the input files, removing spectra that don't pass"
422 " validation, according to filtering")
423 pa("--copyright", action
="store_true", dest
="copyright",
424 help="print copyright and exit")
425 (options
, args
) = parser
.parse_args(args
=args
)
427 if options
.copyright
:
435 if not (0.0 <= options
.fdr
< 0.5):
436 error("--fdr must be within range [0.0, 0.5)")
437 if (sum(1 for x
in (options
.mark
, options
.reset_marks
, options
.kill
) if x
)
439 error("only one of --mark, --reset-marks, --kill may be specified")
441 if options
.reset_marks
:
442 reset_marks(options
, args
)
445 # This is to translate the "effective" FDR, which is the rate after decoys
446 # have been stripped out of the results (which is what users really care
447 # about), into our internal FDR, which includes decoys.
450 z_scores
, spectrum_scores
= read_sqt_info(options
.decoy_prefix
,
451 int(options
.minimum_trypticity
),
455 print ('%s passing spectra, of which %s are not decoys'
456 % (len(spectrum_scores
), sum(len(z_scores
[x
])
459 thresholds
= calculate_combined_thresholds(options
, z_scores
)
464 for charge
, score
, delta
in spectrum_scores
:
465 print '#S', charge
, score
, delta
468 kill(options
, thresholds
, spectrum_scores
, args
)
471 mark(options
, thresholds
, spectrum_scores
, args
)
474 if __name__
== '__main__':