1 #!/usr/bin/env greylag-python
4 Filter a set of sqt files according to FDR and other criteria, optionally
5 rewriting them with the validation marks set to 'N' for filtered-out spectra.
9 from __future__
import with_statement
12 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
14 This program is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 2 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License along
25 with this program; if not, write to the Free Software Foundation, Inc.,
26 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
32 from collections
import defaultdict
35 from pprint
import pprint
40 print >> sys
.stderr
, 'warning:', s
42 sys
.exit('error: ' + s
)
43 def fileerror(s
, *args
):
44 error(s
+ (", at line %s of file '%s'"
45 % (fileinput
.filelineno(), fileinput
.filename())),
48 def inplace_warning():
49 warn("!!!\nan error occurred while modifying .sqt files in-place--it may"
50 " be necessary to recover some or all of the .sqt files from the"
51 " corresponding .sqt.bak files.\n!!!")
54 def reset_marks(options
, sqt_fns
):
55 """Rewrite all evaluation marks to 'U', in-place."""
58 for line
in fileinput
.input(sqt_fns
, inplace
=1, backup
='.bak'):
59 if line
.startswith("M\t"):
62 fs
[10] = 'U' + fs
[10][1:]
64 sys
.stdout
.write(line
)
70 def mark(options
, thresholds
, sp_scores
, sqt_fns
):
71 """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
72 score and delta thresholds."""
78 for line
in fileinput
.input(sqt_fns
, inplace
=1, backup
='.bak'):
79 if line
.startswith("S\t"):
81 charge
, score
, delta
= sp_scores
[spectrum_no
]
82 mark_spectrum
= (charge
in thresholds
84 and (score
< thresholds
[charge
][0]
85 or delta
< thresholds
[charge
][1]))
86 elif line
.startswith("M\t") and mark_spectrum
:
89 fs
[10] = 'N' + fs
[10][1:]
91 sys
.stdout
.write(line
)
97 def read_sqt_info(decoy_prefix
, is_laf
, sqt_fns
):
98 """Return a pair, the first a dict mapping each charge to a list of
99 (score, delta, state), where state is 'real' or 'decoy', for all the
100 spectra in sqt_fns, and the second a list of (charge, score, delta), one
104 # charge -> [ (score, delta, state), ... ]
105 # where state is either 'real' or 'decoy'
106 z_scores
= defaultdict(list)
108 # [ (charge, score, delta), ... ]
111 current_charge
= None
114 current_sp_rank
= None
115 current_peptide_length
= None
116 current_state
= set()
118 for line
in fileinput
.input(sqt_fns
):
119 fs
= line
.split('\t')
121 if (current_charge
!= None and
122 (not is_laf
or (current_sp_rank
<= 10))):
123 if current_score
!= None and len(current_state
) == 1:
124 z_scores
[current_charge
].append((current_score
,
126 current_state
.pop()))
127 if current_charge
!= None:
128 sp_scores
.append((current_charge
, current_score
,
130 current_charge
= int(fs
[3])
133 current_sp_rank
= None
134 current_peptide_length
= None
135 current_state
= set()
137 delta
, score
, sp_rank
, pep_length
= (float(fs
[4]), float(fs
[5]),
138 int(fs
[2]), len(fs
[9])-4)
140 current_score
= score
141 elif current_delta
== 0:
142 current_delta
= delta
143 if current_sp_rank
== None:
144 current_sp_rank
= sp_rank
145 if current_peptide_length
== None:
146 current_peptide_length
= pep_length
148 if current_delta
== 0:
149 if fs
[1].startswith(decoy_prefix
):
150 current_state
.add('decoy')
152 current_state
.add('real')
153 # handle final spectrum, as above
154 if (current_charge
!= None and
155 (not is_laf
or (current_sp_rank
<= 10))):
156 if current_score
!= None and len(current_state
) == 1:
157 z_scores
[current_charge
].append((current_score
, current_delta
,
158 current_state
.pop()))
159 if current_charge
!= None:
160 sp_scores
.append((current_charge
, current_score
, current_delta
))
162 return (z_scores
, sp_scores
)
165 def specificity(positives
, negatives
):
166 return (float(positives
- negatives
)
167 / (positives
+ negatives
))
170 def calculate_inner_threshold(specificity_goal
, charge
, spinfo
):
171 spinfo
= sorted(spinfo
, key
=lambda x
: x
[0])
173 real_count
= sum(1 for x
in spinfo
if x
[-1] == 'real')
174 decoy_count
= len(spinfo
) - real_count
177 return (None, real_count
, decoy_count
) # give up
179 current_threshold
= -1e100
# allow all spectra
180 for n
, sp
in enumerate(spinfo
):
181 specificity_est
= specificity(real_count
, decoy_count
)
182 if specificity_est
>= specificity_goal
:
188 # set threshold just high enough to exclude this spectrum
189 current_threshold
= sp
[0] + 1e-6
191 current_threshold
= spinfo
[-1][0] + 1e-6 # couldn't meet goal
193 return (current_threshold
, real_count
, decoy_count
)
196 def calculate_combined_thresholds(options
, z_scores
):
197 """Find best score/delta thresholds for each charge."""
199 specificity_goal
= 1 - options
.fdr
201 # Rather than search every possible value of delta, we're only going to
202 # "sample" at this granularity. This cuts search time dramatically (and
203 # making it O(n) instead of O(n**2). Extra precision wouldn't really be
204 # useful in any case.
205 SEARCH_GRANULARITY
= 0.001
207 # charge -> (score, delta, passing_reals, passing_decoys)
210 for charge
, spinfo
in z_scores
.iteritems():
211 spinfo0
= sorted(spinfo
, key
=lambda x
: x
[1], reverse
=True)
216 this_value
= spinfo0
[-1][1] # current delta
217 if (last_value
== None
218 or abs(this_value
- last_value
) >= SEARCH_GRANULARITY
):
221 r
= calculate_inner_threshold(specificity_goal
, charge
,
225 print '#', charge
, r
[0], this_value
, r
[1], r
[2]
226 if (charge
not in thresholds
227 or r
[1] > thresholds
[charge
][2]):
228 thresholds
[charge
] = (r
[0], this_value
, r
[1], r
[2])
230 last_value
= this_value
233 if charge
in thresholds
:
235 print ("%+d: score %s, delta %s -> %s real ids (fdr %.4f)"
236 % (charge
, thresholds
[charge
][0], thresholds
[charge
][1],
237 thresholds
[charge
][2],
238 1 - specificity(thresholds
[charge
][2],
239 thresholds
[charge
][3])))
241 warn("could not calculate thresholds for %+d" % charge
)
246 def main(args
=sys
.argv
[1:]):
247 parser
= optparse
.OptionParser(usage
=
248 "usage: %prog [options] <sqt-file>...",
249 description
=__doc__
, version
=__version__
)
250 pa
= parser
.add_option
251 pa("--decoy-prefix", dest
="decoy_prefix", default
="SHUFFLED_",
252 help='prefix given to locus name of decoy (e.g., shuffled) database'
253 ' sequences [default="SHUFFLED_"]', metavar
="PREFIX")
254 pa("--fdr", dest
="fdr", type="float", default
="0.02",
255 help="false discovery rate [default=0.02]", metavar
="PROPORTION")
256 pa("-v", "--verbose", action
="store_true", dest
="verbose",
258 pa("--laf", action
="store_true", dest
="laf",
259 help="drop peptides of length < 7 or Sp rank > 10")
260 pa("--debug", action
="store_true", dest
="debug",
261 help="show debug output")
262 pa("-m", "--mark", action
="store_true", dest
="mark",
263 help="rewrite the input files, changing some validation marks to 'N',"
264 " according to filtering")
265 pa("--reset-marks", action
="store_true", dest
="reset_marks",
266 help="rewrite the input files, changing all validation marks to 'U'")
267 pa("--copyright", action
="store_true", dest
="copyright",
268 help="print copyright and exit")
269 (options
, args
) = parser
.parse_args(args
=args
)
271 if options
.copyright
:
279 if not (0.0 <= options
.fdr
<= 1.0):
280 error("--fdr must be within range [0.0, 1.0]")
281 if options
.mark
and options
.reset_marks
:
282 error("only one of --mark and --reset-marks may be specified")
284 if options
.reset_marks
:
285 reset_marks(options
, args
)
288 z_scores
, spectrum_scores
= read_sqt_info(options
.decoy_prefix
,
291 thresholds
= calculate_combined_thresholds(options
, z_scores
)
296 for charge
, score
, delta
in spectrum_scores
:
297 print '#S', charge
, score
, delta
300 mark(options
, thresholds
, spectrum_scores
, args
)
303 if __name__
== '__main__':