sqt-filter: add temporary --laf argument to filter spectra
[greylag.git] / sqt_filter.py
blob49733df30bf5cb9e13f934d4ed476045dd4ebe6f
1 #!/usr/bin/env greylag-python
3 """
4 Filter a set of sqt files according to FDR and other criteria, optionally
5 rewriting them with the validation marks set to 'N' for filtered-out spectra.
7 """
9 from __future__ import with_statement
11 __copyright__ = '''
12 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
14 This program is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 2 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License along
25 with this program; if not, write to the Free Software Foundation, Inc.,
26 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 '''
29 __version__ = "0.0"
32 from collections import defaultdict
33 import fileinput
34 import optparse
35 from pprint import pprint
36 import sys
39 def warn(s):
40 print >> sys.stderr, 'warning:', s
41 def error(s):
42 sys.exit('error: ' + s)
43 def fileerror(s, *args):
44 error(s + (", at line %s of file '%s'"
45 % (fileinput.filelineno(), fileinput.filename())),
46 *args)
48 def inplace_warning():
49 warn("!!!\nan error occurred while modifying .sqt files in-place--it may"
50 " be necessary to recover some or all of the .sqt files from the"
51 " corresponding .sqt.bak files.\n!!!")
54 def reset_marks(options, sqt_fns):
55 """Rewrite all evaluation marks to 'U', in-place."""
57 try:
58 for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
59 if line.startswith("M\t"):
60 fs = line.split("\t")
61 if len(fs) >= 11:
62 fs[10] = 'U' + fs[10][1:]
63 line = '\t'.join(fs)
64 sys.stdout.write(line)
65 except:
66 inplace_warning()
67 raise
70 def mark(options, thresholds, sp_scores, sqt_fns):
71 """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
72 score and delta thresholds."""
74 spectrum_no = -1
75 mark_spectrum = False
77 try:
78 for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
79 if line.startswith("S\t"):
80 spectrum_no += 1
81 charge, score, delta = sp_scores[spectrum_no]
82 mark_spectrum = (charge in thresholds
83 and score != None
84 and (score < thresholds[charge][0]
85 or delta < thresholds[charge][1]))
86 elif line.startswith("M\t") and mark_spectrum:
87 fs = line.split("\t")
88 if len(fs) >= 11:
89 fs[10] = 'N' + fs[10][1:]
90 line = '\t'.join(fs)
91 sys.stdout.write(line)
92 except:
93 inplace_warning()
94 raise
97 def read_sqt_info(decoy_prefix, is_laf, sqt_fns):
98 """Return a pair, the first a dict mapping each charge to a list of
99 (score, delta, state), where state is 'real' or 'decoy', for all the
100 spectra in sqt_fns, and the second a list of (charge, score, delta), one
101 for each spectrum.
104 # charge -> [ (score, delta, state), ... ]
105 # where state is either 'real' or 'decoy'
106 z_scores = defaultdict(list)
108 # [ (charge, score, delta), ... ]
109 sp_scores = []
111 current_charge = None
112 current_score = None
113 current_delta = 0
114 current_sp_rank = None
115 current_peptide_length = None
116 current_state = set()
118 for line in fileinput.input(sqt_fns):
119 fs = line.split('\t')
120 if fs[0] == 'S':
121 if (current_charge != None and
122 (not is_laf or (current_sp_rank <= 10))):
123 if current_score != None and len(current_state) == 1:
124 z_scores[current_charge].append((current_score,
125 current_delta,
126 current_state.pop()))
127 if current_charge != None:
128 sp_scores.append((current_charge, current_score,
129 current_delta))
130 current_charge = int(fs[3])
131 current_score = None
132 current_delta = 0
133 current_sp_rank = None
134 current_peptide_length = None
135 current_state = set()
136 elif fs[0] == 'M':
137 delta, score, sp_rank, pep_length = (float(fs[4]), float(fs[5]),
138 int(fs[2]), len(fs[9])-4)
139 if delta == 0:
140 current_score = score
141 elif current_delta == 0:
142 current_delta = delta
143 if current_sp_rank == None:
144 current_sp_rank = sp_rank
145 if current_peptide_length == None:
146 current_peptide_length = pep_length
147 elif fs[0] == 'L':
148 if current_delta == 0:
149 if fs[1].startswith(decoy_prefix):
150 current_state.add('decoy')
151 else:
152 current_state.add('real')
153 # handle final spectrum, as above
154 if (current_charge != None and
155 (not is_laf or (current_sp_rank <= 10))):
156 if current_score != None and len(current_state) == 1:
157 z_scores[current_charge].append((current_score, current_delta,
158 current_state.pop()))
159 if current_charge != None:
160 sp_scores.append((current_charge, current_score, current_delta))
162 return (z_scores, sp_scores)
165 def specificity(positives, negatives):
166 return (float(positives - negatives)
167 / (positives + negatives))
170 def calculate_inner_threshold(specificity_goal, charge, spinfo):
171 spinfo = sorted(spinfo, key=lambda x: x[0])
173 real_count = sum(1 for x in spinfo if x[-1] == 'real')
174 decoy_count = len(spinfo) - real_count
176 if real_count == 0:
177 return (None, real_count, decoy_count) # give up
179 current_threshold = -1e100 # allow all spectra
180 for n, sp in enumerate(spinfo):
181 specificity_est = specificity(real_count, decoy_count)
182 if specificity_est >= specificity_goal:
183 break
184 if sp[-1] == 'real':
185 real_count -= 1
186 else:
187 decoy_count -= 1
188 # set threshold just high enough to exclude this spectrum
189 current_threshold = sp[0] + 1e-6
190 else:
191 current_threshold = spinfo[-1][0] + 1e-6 # couldn't meet goal
193 return (current_threshold, real_count, decoy_count)
196 def calculate_combined_thresholds(options, z_scores):
197 """Find best score/delta thresholds for each charge."""
199 specificity_goal = 1 - options.fdr
201 # Rather than search every possible value of delta, we're only going to
202 # "sample" at this granularity. This cuts search time dramatically (and
203 # making it O(n) instead of O(n**2). Extra precision wouldn't really be
204 # useful in any case.
205 SEARCH_GRANULARITY = 0.001
207 # charge -> (score, delta, passing_reals, passing_decoys)
208 thresholds = {}
210 for charge, spinfo in z_scores.iteritems():
211 spinfo0 = sorted(spinfo, key=lambda x: x[1], reverse=True)
213 last_value = None
215 while spinfo0:
216 this_value = spinfo0[-1][1] # current delta
217 if (last_value == None
218 or abs(this_value - last_value) >= SEARCH_GRANULARITY):
220 # "inner" is score
221 r = calculate_inner_threshold(specificity_goal, charge,
222 spinfo0)
223 if r[0] != None:
224 if options.debug:
225 print '#', charge, r[0], this_value, r[1], r[2]
226 if (charge not in thresholds
227 or r[1] > thresholds[charge][2]):
228 thresholds[charge] = (r[0], this_value, r[1], r[2])
230 last_value = this_value
231 spinfo0.pop()
233 if charge in thresholds:
234 if options.verbose:
235 print ("%+d: score %s, delta %s -> %s real ids (fdr %.4f)"
236 % (charge, thresholds[charge][0], thresholds[charge][1],
237 thresholds[charge][2],
238 1 - specificity(thresholds[charge][2],
239 thresholds[charge][3])))
240 else:
241 warn("could not calculate thresholds for %+d" % charge)
243 return thresholds
246 def main(args=sys.argv[1:]):
247 parser = optparse.OptionParser(usage=
248 "usage: %prog [options] <sqt-file>...",
249 description=__doc__, version=__version__)
250 pa = parser.add_option
251 pa("--decoy-prefix", dest="decoy_prefix", default="SHUFFLED_",
252 help='prefix given to locus name of decoy (e.g., shuffled) database'
253 ' sequences [default="SHUFFLED_"]', metavar="PREFIX")
254 pa("--fdr", dest="fdr", type="float", default="0.02",
255 help="false discovery rate [default=0.02]", metavar="PROPORTION")
256 pa("-v", "--verbose", action="store_true", dest="verbose",
257 help="be verbose")
258 pa("--laf", action="store_true", dest="laf",
259 help="drop peptides of length < 7 or Sp rank > 10")
260 pa("--debug", action="store_true", dest="debug",
261 help="show debug output")
262 pa("-m", "--mark", action="store_true", dest="mark",
263 help="rewrite the input files, changing some validation marks to 'N',"
264 " according to filtering")
265 pa("--reset-marks", action="store_true", dest="reset_marks",
266 help="rewrite the input files, changing all validation marks to 'U'")
267 pa("--copyright", action="store_true", dest="copyright",
268 help="print copyright and exit")
269 (options, args) = parser.parse_args(args=args)
271 if options.copyright:
272 print __copyright__
273 sys.exit(0)
275 if len(args) < 1:
276 parser.print_help()
277 sys.exit(1)
279 if not (0.0 <= options.fdr <= 1.0):
280 error("--fdr must be within range [0.0, 1.0]")
281 if options.mark and options.reset_marks:
282 error("only one of --mark and --reset-marks may be specified")
284 if options.reset_marks:
285 reset_marks(options, args)
286 return
288 z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix,
289 options.laf, args)
291 thresholds = calculate_combined_thresholds(options, z_scores)
293 if options.debug:
294 pprint(thresholds)
296 for charge, score, delta in spectrum_scores:
297 print '#S', charge, score, delta
299 if options.mark:
300 mark(options, thresholds, spectrum_scores, args)
303 if __name__ == '__main__':
304 main()