1 #!/usr/bin/env greylag-python
4 Given a set of sqt files, determine a 'valid' set of identifications that
5 satisfy the specified FDR (false discovery rate). Optionally rewrite the sqt
6 files with validation marks set to 'N' for filtered-out spectra.
10 from __future__
import with_statement
13 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
15 This program is free software; you can redistribute it and/or modify
16 it under the terms of the GNU General Public License as published by
17 the Free Software Foundation; either version 2 of the License, or
18 (at your option) any later version.
20 This program is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
25 You should have received a copy of the GNU General Public License along
26 with this program; if not, write to the Free Software Foundation, Inc.,
27 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
33 from collections
import defaultdict
36 from pprint
import pprint
42 print >> sys
.stderr
, 'warning:', s
44 sys
.exit('error: ' + s
)
45 def fileerror(s
, *args
):
46 error(s
+ (", at line %s of file '%s'"
47 % (fileinput
.filelineno(), fileinput
.filename())),
50 def inplace_warning():
51 warn("!!!\nan error occurred while modifying .sqt files in-place--it may"
52 " be necessary to recover some or all of the .sqt files from the"
53 " corresponding .sqt.bak files.\n!!!")
56 def reset_marks(options
, sqt_fns
):
57 """Rewrite all evaluation marks to 'U', in-place."""
60 for line
in fileinput
.input(sqt_fns
, inplace
=1, backup
='.bak'):
61 if line
.startswith("M\t"):
64 fs
[10] = 'U' + fs
[10][1:]
66 sys
.stdout
.write(line
)
72 def mark(options
, thresholds
, sp_scores
, sqt_fns
):
73 """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
74 score and delta thresholds."""
80 for line
in fileinput
.input(sqt_fns
, inplace
=1, backup
='.bak'):
81 if line
.startswith("S\t"):
83 charge
, score
, delta
= sp_scores
[spectrum_no
]
84 mark_spectrum
= (charge
in thresholds
86 and (score
< thresholds
[charge
][0]
87 or delta
< thresholds
[charge
][1]))
88 elif line
.startswith("M\t") and mark_spectrum
:
91 fs
[10] = 'N' + fs
[10][1:]
93 sys
.stdout
.write(line
)
99 nulltrans
= string
.maketrans('','')
100 non_aa_chars
= string
.digits
+ string
.punctuation
103 """Given a peptide, return just the unmodified peptide seen (upcased)."""
104 # assuming no control chars present
105 return s
.upper().translate(nulltrans
, non_aa_chars
)
108 def read_sqt_info(decoy_prefix
, minimum_trypticity
, sqt_fns
):
109 """Return a pair, the first a dict mapping each charge to a list of
110 (score, delta, state), where state is 'real' or 'decoy', for all the
111 spectra in sqt_fns, and the second a list of (charge, score, delta), one
112 for each spectrum. Spectra lacking minimum_trypticity are dropped.
115 # charge -> [ (score, delta, state), ... ]
116 # where state is either 'real' or 'decoy'
117 z_scores
= defaultdict(list)
119 # [ (charge, score, delta), ... ]
122 current_charge
= None
125 current_peptide_trypticity
= None # or one of [ 0, 1, 2 ]
126 current_state
= set() # 'real', 'decoy' or both
127 highest_rank_seen
= 0
128 best_peptide_seen
= None
130 for line
in fileinput
.input(sqt_fns
):
131 fs
= line
.split('\t')
133 if (current_charge
!= None and
134 current_peptide_trypticity
>= minimum_trypticity
):
135 if current_score
!= None:
136 if len(current_state
) == 1:
137 z_scores
[current_charge
].append((current_score
,
139 current_state
.pop()))
140 sp_scores
.append((current_charge
, current_score
,
142 current_charge
= int(fs
[3])
145 current_peptide_trypticity
= None
146 current_state
= set()
147 highest_rank_seen
= 0
148 best_peptide_seen
= None
154 peptide
= fs
[9].strip() # e.g., A.B@CD*.-
157 if rank
> highest_rank_seen
+ 1:
158 # ignore aux top SpRank hits, because delta confounds
160 highest_rank_seen
= rank
162 assert peptide
[1] == '.' and peptide
[-2] == '.'
163 peptide_flanks
= (peptide
[0], peptide
[-1]) # ('A', '-')
164 peptide
= peptide
[2:-2] # 'B@CD*'
165 peptide_stripped
= stripmods(peptide
) # 'BCD'
167 if best_peptide_seen
== None:
168 best_peptide_seen
= peptide_stripped
171 assert current_delta
== 0
172 current_score
= score
173 elif (current_delta
== 0 or best_peptide_seen
== None
174 or best_peptide_seen
== peptide_stripped
):
175 current_delta
= delta
176 if best_peptide_seen
!= peptide_stripped
:
177 # once we've seen a non-equivalent peptide, don't set
178 # delta if we see further equivalent peptides
179 best_peptide_seen
= '####'
180 if current_peptide_trypticity
== None:
181 current_peptide_trypticity
= 0
182 if (peptide_flanks
[0] in ('K', 'R')
183 and peptide_stripped
[0] != 'P'):
184 current_peptide_trypticity
+= 1
185 if (peptide_stripped
[-1] in ('K', 'R')
186 and peptide_flanks
[1] != 'P'):
187 current_peptide_trypticity
+= 1
189 if current_delta
== 0:
190 if fs
[1].startswith(decoy_prefix
):
191 current_state
.add('decoy')
193 current_state
.add('real')
195 # handle final spectrum, as above
196 if (current_charge
!= None and
197 current_peptide_trypticity
>= minimum_trypticity
):
198 if current_score
!= None:
199 if len(current_state
) == 1:
200 z_scores
[current_charge
].append((current_score
,
202 current_state
.pop()))
203 sp_scores
.append((current_charge
, current_score
,
206 return (z_scores
, sp_scores
)
209 def specificity(positives
, negatives
):
210 return float(positives
- negatives
) / positives
213 def calculate_inner_threshold(fdr
, charge
, spinfo
):
214 specificity_goal
= 1 - fdr
216 spinfo
= sorted(spinfo
, key
=lambda x
: x
[0])
220 real_count
= sum(1 for x
in spinfo
if x
[-1] == 'real')
221 decoy_count
= len(spinfo
) - real_count
223 current_threshold
= -1e100
# allow all spectra
224 for n
, sp
in enumerate(spinfo
):
226 return (None, real_count
, decoy_count
) # give up
228 specificity_est
= specificity(real_count
, decoy_count
)
229 if specificity_est
>= specificity_goal
:
235 # set threshold just high enough to exclude this spectrum
236 current_threshold
= sp
[0] + epsilon
238 current_threshold
= spinfo
[-1][0] + epsilon
# couldn't meet goal
240 return (current_threshold
, real_count
, decoy_count
)
243 def calculate_combined_thresholds(options
, z_scores
):
244 """Find best score/delta thresholds for each charge."""
246 specificity_goal
= 1 - options
.fdr
248 # Rather than search every possible value of delta, we're only going to
249 # "sample" at this granularity. This cuts search time dramatically (and
250 # makes it O(n) instead of O(n**2). Extra precision wouldn't really be
251 # useful in any case.
252 SEARCH_GRANULARITY
= 0.001
254 # charge -> (score, delta, passing_reals, passing_decoys)
257 for charge
, spinfo
in z_scores
.iteritems():
258 spinfo0
= sorted(spinfo
, key
=lambda x
: x
[1], reverse
=True)
263 this_value
= spinfo0
[-1][1] # current delta
264 if (last_value
== None
265 or abs(this_value
- last_value
) >= SEARCH_GRANULARITY
):
268 r
= calculate_inner_threshold(options
.fdr
, charge
, spinfo0
)
271 print '#', charge
, r
[0], this_value
, r
[1], r
[2]
272 if (charge
not in thresholds
273 or r
[1] > thresholds
[charge
][2]):
274 thresholds
[charge
] = (r
[0], this_value
, r
[1], r
[2])
276 last_value
= this_value
279 if charge
in thresholds
:
280 print ("%+d: score %s, delta %s -> %s real ids, %s decoys"
282 % (charge
, thresholds
[charge
][0], thresholds
[charge
][1],
283 thresholds
[charge
][2], thresholds
[charge
][3],
284 1 - specificity(thresholds
[charge
][2],
285 thresholds
[charge
][3])))
287 warn("could not calculate thresholds for %+d" % charge
)
292 def main(args
=sys
.argv
[1:]):
293 parser
= optparse
.OptionParser(usage
=
294 "usage: %prog [options] <sqt-file>...",
295 description
=__doc__
, version
=__version__
)
296 pa
= parser
.add_option
297 DEFAULT_DECOY_PREFIX
= "SHUFFLED_"
298 pa("--decoy-prefix", dest
="decoy_prefix", default
=DEFAULT_DECOY_PREFIX
,
299 help='prefix given to locus name of decoy (e.g., shuffled) database'
300 ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX
, metavar
="PREFIX")
302 pa("--fdr", dest
="fdr", type="float", default
=DEFAULT_FDR
,
303 help="false discovery rate [default=%s]" % DEFAULT_FDR
,
304 metavar
="PROPORTION")
305 pa("-v", "--verbose", action
="store_true", dest
="verbose",
307 pa("--list-peptides", action
="store_true", dest
="list_peptides",
308 help="print information about passing peptides")
309 pa("-t", "--minimum-trypticity", type="choice", choices
=('0', '1', '2'),
310 default
='0', dest
="minimum_trypticity",
311 help="drop peptides with too few tryptic ends"
312 " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar
="TRYPTICITY")
313 #pa("--graph", dest="graph",
314 # help='create distribution graphs, using the specified prefix',
315 # metavar="PATH PREFIX")
316 pa("--debug", action
="store_true", dest
="debug",
317 help="show debug output")
318 pa("-m", "--mark", action
="store_true", dest
="mark",
319 help="rewrite the input files, changing some validation marks to 'N',"
320 " according to filtering")
321 pa("--reset-marks", action
="store_true", dest
="reset_marks",
322 help="rewrite the input files, changing all validation marks to 'U'")
323 pa("--copyright", action
="store_true", dest
="copyright",
324 help="print copyright and exit")
325 (options
, args
) = parser
.parse_args(args
=args
)
327 if options
.copyright
:
335 if not (0.0 <= options
.fdr
<= 1.0):
336 error("--fdr must be within range [0.0, 1.0]")
337 if options
.mark
and options
.reset_marks
:
338 error("only one of --mark and --reset-marks may be specified")
340 if options
.reset_marks
:
341 reset_marks(options
, args
)
344 z_scores
, spectrum_scores
= read_sqt_info(options
.decoy_prefix
,
345 int(options
.minimum_trypticity
),
349 print ('%s passing spectra, of which %s are not decoys'
350 % (len(spectrum_scores
), sum(len(z_scores
[x
])
353 thresholds
= calculate_combined_thresholds(options
, z_scores
)
358 for charge
, score
, delta
in spectrum_scores
:
359 print '#S', charge
, score
, delta
362 mark(options
, thresholds
, spectrum_scores
, args
)
365 if __name__
== '__main__':