Merge branch 'slowbisect'
[greylag.git] / greylag_sqt.py
blob397e4c8b12c617e92b174b539bfa588e7edddb5b
1 #!/usr/bin/env python
3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
6 '''
8 from __future__ import with_statement
10 __copyright__ = '''
11 greylag, a collection of programs for MS/MS protein analysis
12 Copyright (C) 2006-2008 Stowers Institute for Medical Research
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
27 Contact: Mike Coleman
28 Stowers Institute for Medical Research
29 1000 East 50th Street
30 Kansas City, Missouri 64110
31 USA
32 '''
35 import cPickle as pickle
36 import math
37 import optparse
38 import os.path
39 from pprint import pprint
40 import sys
42 import greylag
45 # gc kills our performance, so disable it. gc only matters for cycles, which
46 # we (hope we) don't create. See the gc module docs.
47 import gc; gc.disable()
50 def warn(s):
51 print >> sys.stderr, 'warning:', s
52 def error(s):
53 sys.exit('error: ' + s)
56 def generate_leaves(tree):
57 """Given a tree made of lists, yield the leaves.
59 >>> list(generate_leaves([(1,2,3)]))
60 [(1, 2, 3)]
61 >>> list(generate_leaves([[[(1,2,3)]], (4,5)]))
62 [(1, 2, 3), (4, 5)]
64 """
65 if not isinstance(tree, list):
66 yield tree
67 return
68 for b in tree:
69 for l in generate_leaves(b):
70 yield l
73 # Example legacy headers:
74 # H Database /foo/bar/db.fasta
75 # H PrecursorMasses AVG
76 # H FragmentMasses MONO
77 # H StaticMod C=160.1388
78 # H DiffMod M*=+16.0
79 # H DiffMod STY#=+80.0
81 def print_legacy_headers(f, r):
82 """Output legacy H lines that readers of traditional SQT files may
83 expect. Warns if multiple databases, or multiple or isotopic mass regimes
84 are present.
85 """
87 if len(r['databases']) > 1:
88 warn("multiple databases present")
89 for db in r['databases']:
90 print >> f, "H\tDatabase\t%s" % db
92 precursor_masses, fragment_masses = 'MONO', 'MONO'
93 mass_regimes = r['parameters']['mass_regimes']
94 if len(mass_regimes) > 1:
95 warn("multiple mass regimes present--legacy SQT headers will be based"
96 "on just the first")
97 parent_regime, fragment_regime = mass_regimes[0]
98 if parent_regime[1] or fragment_regime[1]:
99 warn("isotopic mass regimes present--ignored for legacy SQT headers")
100 assert parent_regime[0] in ('MONO', 'AVG')
101 assert fragment_regime[0] in ('MONO', 'AVG')
102 print >> f, "H\tPrecursorMasses\t%s" % parent_regime[0]
103 print >> f, "H\tFragmentMasses\t%s" % fragment_regime[0]
105 # FIX: merge this with mass calculation code in
106 # greylag.initialize_spectrum_parameters?
107 # FIX: currently assuming MONO only (what would be better?)
108 static = dict((res, greylag.formula_mass(greylag.RESIDUE_FORMULA[res]))
109 for res in greylag.RESIDUES)
110 for m in r['parameters']['pervasive_mods']:
111 if isinstance(m[1], float):
112 static[m[3]] += m[0] * m[1]
113 else:
114 static[m[3]] += m[0] * greylag.formula_mass(m[1])
115 # all residues get a StaticMod because our figures are more exact than the
116 # "default" ones
117 for res in static:
118 print >> f, "H\tStaticMod\t%s=%s" % (res, static[res])
120 diffs = set(generate_leaves(r['parameters']['potential_mods']))
121 for m in diffs:
122 symbol = m[5]
123 if not symbol:
124 warn("unmarked potential mod present--omitting")
125 continue
126 if isinstance(m[1], float):
127 delta = m[0] * m[1]
128 else:
129 delta = m[0] * greylag.formula_mass(m[1])
130 print >> f, "H\tDiffMod\t%s%s=%+f" % (m[3], m[5], delta)
133 def print_header(f, r):
134 """Output H lines."""
135 print >> f, "H\tSQTGenerator\tgreylag"
136 print >> f, "H\tSQTGeneratorVersion\t%s" % greylag.VERSION
138 print_legacy_headers(f, r)
140 for pk, pv in sorted(r['parameters'].items()):
141 print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
144 def print_regime_manifest(f, regime_manifest):
145 """Output R lines."""
146 for regime_no, residue, mass in regime_manifest:
147 print >> f, "R\t%s\t%s\t%s" % (regime_no, residue, mass)
150 def generate_marked_sequence(marks, peptide_sequence):
151 """Yield the characters in a marked version of peptide_sequence. If there
152 is both a terminal mark and a mark on a terminal residue, the latter is
153 omitted.
155 >>> ''.join(generate_marked_sequence({}, 'ASDF'))
156 'ASDF'
157 >>> ''.join(generate_marked_sequence({ 'N' : '^', 1 : '*' }, 'ASDF'))
158 'A^S*DF'
159 >>> ''.join(generate_marked_sequence({ 'N' : '^', 0 : '*' }, 'ASDF'))
160 'A^SDF'
164 for n, c in enumerate(peptide_sequence):
165 yield c
166 if n == 0 and 'N' in marks:
167 yield marks.get('N')
168 elif n == len(peptide_sequence)-1 and 'C' in marks:
169 yield marks.get('C')
170 else:
171 yield marks.get(n, '')
174 def print_spectrum(f, modification_conjucts, result, enhanced=False):
175 """Print the lines (S/M/L/A*) associated with the given spectrum."""
176 spectrum, sp_best_matches = result
178 scan_low, scan_high, charge = spectrum['name'].rsplit('.', 2)
179 print >> f, '\t'.join(str(v) for v in
180 ["S", scan_low, scan_high, charge, 0, 'honk',
181 spectrum['mass'],
182 round(math.log(spectrum['total_ion_current']), 4),
183 0, spectrum['comparisons']])
185 best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
186 if not best_scores:
187 return
189 # score -> rank
190 rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
192 best_score = best_scores[0]
193 for match in sp_best_matches:
194 score = match['score']
195 if score == 0:
196 continue # null entry
197 rank = rank_map[score]
198 score_delta = (best_score - score) / best_score
200 mass_regime_index = match.get('mass_regime_index', 0)
201 conjunct_index = match.get('conjunct_index', 0)
203 am_lines = [] # residue mods
204 marks = {}
205 for mt in match.get('mass_trace', []):
206 conjunct_item = modification_conjucts[conjunct_index][2][mt['conjunct_item_index']]
207 mark = conjunct_item[5]
208 if mark:
209 assert mt['position'] not in marks, "not yet implemented"
210 marks[mt['position']] = mark
211 if enhanced:
212 name = conjunct_item[4] or ''
213 delta = conjunct_item[6][mass_regime_index][1]
214 am_lines.append(["AM", mt['position'], round(delta, 8), name])
216 at_lines = [] # terminal mods
217 N_conjunct_item = modification_conjucts[conjunct_index][0]
218 if N_conjunct_item:
219 assert len(N_conjunct_item) == 1
220 mark = N_conjunct_item[0][5]
221 if mark:
222 marks['N'] = mark
223 if enhanced:
224 name = conjunct_item[4] or ''
225 delta = conjunct_item[6][mass_regime_index][1]
226 at_lines.append(["AT", 'N', round(delta, 8), name])
227 C_conjunct_item = modification_conjucts[conjunct_index][1]
228 if C_conjunct_item:
229 assert len(C_conjunct_item) == 1
230 mark = C_conjunct_item[0][5]
231 if mark:
232 marks['C'] = mark
233 if enhanced:
234 name = conjunct_item[4] or ''
235 delta = conjunct_item[6][mass_regime_index][1]
236 at_lines.append(["AT", 'C', round(delta, 8), name])
238 marked_sequence = match['peptide_sequence']
239 if marks:
240 marked_sequence = ''.join(generate_marked_sequence(marks,
241 marked_sequence))
243 # FIX: also need M lines for fixed mods, terminal mods, isotope mods
244 # (N15), etc.
245 # Note: scores, being log probability, are non-positive, but we
246 # flip the sign in the SQT output
247 print >> f, '\t'.join(str(v) for v in
248 ["M", rank, rank,
249 round(match['predicted_parent_mass'], 5),
250 round(score_delta, 4), round(-score, 4), 0,
251 # FIX: 1 of 2 ions found--keep DTASelect happy
252 1, 2,
253 '.'.join((match['N_peptide_flank'],
254 marked_sequence,
255 match['C_peptide_flank'])),
256 'U'])
258 if enhanced:
259 if mass_regime_index != 0:
260 print >> f, "AR\t%s" % mass_regime_index
261 if match.get('pca_delta', 0) != 0:
262 print >> f, "APCA\t%s" % match['pca_delta']
263 for am_line in sorted(am_lines):
264 print >> f, '\t'.join(str(v) for v in am_line)
265 for at_line in at_lines:
266 print >> f, '\t'.join(str(v) for v in at_line)
268 assert len(match['sequence_name']) == len(match['peptide_begin'])
269 for sn, pb in zip(match['sequence_name'], match['peptide_begin']):
270 print >> f, 'L\t%s\t%s' % (sn, pb)
273 def dump_result_file(result_file):
274 """Pretty-print result file contents."""
275 try:
276 r = None
277 while True:
278 r = pickle.load(result_file)
279 pprint(r)
280 except EOFError, e:
281 if r != 'complete':
282 error("result file truncated or invalid")
285 def generate_results(result_file):
286 while True:
287 try:
288 r = pickle.load(result_file)
289 except EOFError:
290 error("result file truncated or invalid")
291 if r == 'complete':
292 return
293 yield r
296 def main(args=sys.argv[1:]):
297 parser = optparse.OptionParser(usage=
298 "usage: %prog [options] <result-file>",
299 description=__doc__, version=greylag.VERSION)
300 pa = parser.add_option
301 pa("-d", "--output-directory", dest="output_directory",
302 help="directory where output files are written [default is '.']",
303 metavar="DIR")
304 pa("-e", "--enhanced-output", action="store_true", dest="enhanced_output",
305 help="include extra DTASelect-incompatible information in output")
306 pa("-v", "--verbose", action="store_true", dest="verbose",
307 help="be verbose")
308 pa("--dump", action="store_true", dest="dump",
309 help="just dump the result file to stdout (for debugging)")
310 pa("--copyright", action="store_true", dest="copyright",
311 help="print copyright and exit")
312 (options, args) = parser.parse_args(args=args)
314 if options.copyright:
315 print __copyright__
316 sys.exit(0)
318 if len(args) != 1:
319 parser.print_help()
320 sys.exit(1)
322 result_file = open(args[0], 'rb')
324 if options.dump:
325 dump_result_file(result_file)
326 return
328 header = pickle.load(result_file)
330 spectrum_fns = header['spectrum files']
331 assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
332 assert all(os.path.dirname(fn) == '' for fn in spectrum_fns)
334 basenames = [ os.path.splitext(spectrum_fn)[0]
335 for spectrum_fn in spectrum_fns ]
337 # spectrum file index -> sqt file
338 spectrum_filemap = dict((n, open(fn+'.sqt', 'w'))
339 for n, fn in enumerate(basenames))
341 for sqt_file in spectrum_filemap.values():
342 print_header(sqt_file, header)
343 print_regime_manifest(sqt_file, header['mass regime manifest'])
345 for result in generate_results(result_file):
346 if result:
347 print_spectrum(spectrum_filemap[result[0]['file_id']],
348 header['modification conjuncts'], result,
349 options.enhanced_output)
352 if __name__ == '__main__':
353 main()