Merge commit 'v0.1.2'
[greylag.git] / greylag_sqt.py
blob8602c645f613cd56189b5934d92ea0dfe4e7c030
1 #!/usr/bin/env python
3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
6 '''
8 from __future__ import with_statement
10 __copyright__ = '''
11 greylag, a collection of programs for MS/MS protein analysis
12 Copyright (C) 2006-2008 Stowers Institute for Medical Research
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
27 Contact: Mike Coleman
28 Stowers Institute for Medical Research
29 1000 East 50th Street
30 Kansas City, Missouri 64110
31 USA
32 '''
35 import cPickle as pickle
36 import math
37 import optparse
38 import os.path
39 from pprint import pprint
40 import sys
42 import greylag
45 # gc kills our performance, so disable it. gc only matters for cycles, which
46 # we (hope we) don't create. See the gc module docs.
47 import gc; gc.disable()
50 def warn(s):
51 print >> sys.stderr, 'warning:', s
52 def error(s):
53 sys.exit('error: ' + s)
55 # errors are fatal
56 greylag.chase_error = error
59 def generate_leaves(tree):
60 """Given a tree made of lists, yield the leaves.
62 >>> list(generate_leaves([(1,2,3)]))
63 [(1, 2, 3)]
64 >>> list(generate_leaves([[[(1,2,3)]], (4,5)]))
65 [(1, 2, 3), (4, 5)]
67 """
68 if not isinstance(tree, list):
69 yield tree
70 return
71 for b in tree:
72 for l in generate_leaves(b):
73 yield l
76 # Example legacy headers:
77 # H Database /foo/bar/db.fasta
78 # H PrecursorMasses AVG
79 # H FragmentMasses MONO
80 # H StaticMod C=160.1388
81 # H DiffMod M*=+16.0
82 # H DiffMod STY#=+80.0
84 def print_legacy_headers(f, r):
85 """Output legacy H lines that readers of traditional SQT files may
86 expect. Warns if multiple databases, or multiple or isotopic mass regimes
87 are present.
88 """
90 if len(r['databases']) > 1:
91 warn("multiple databases present")
92 for db in r['databases']:
93 print >> f, "H\tDatabase\t%s" % db
95 precursor_masses, fragment_masses = 'MONO', 'MONO'
96 mass_regimes = r['parameters']['mass_regimes']
97 if len(mass_regimes) > 1:
98 warn("multiple mass regimes present--legacy SQT headers will be based"
99 "on just the first")
100 parent_regime, fragment_regime = mass_regimes[0]
101 if parent_regime[1] or fragment_regime[1]:
102 warn("isotopic mass regimes present--ignored for legacy SQT headers")
103 assert parent_regime[0] in ('MONO', 'AVG')
104 assert fragment_regime[0] in ('MONO', 'AVG')
105 print >> f, "H\tPrecursorMasses\t%s" % parent_regime[0]
106 print >> f, "H\tFragmentMasses\t%s" % fragment_regime[0]
108 # FIX: merge this with mass calculation code in
109 # greylag.initialize_spectrum_parameters?
110 # FIX: currently assuming MONO only (what would be better?)
111 static = dict((res, greylag.formula_mass(greylag.RESIDUE_FORMULA[res]))
112 for res in greylag.RESIDUES)
113 for m in r['parameters']['pervasive_mods']:
114 if isinstance(m[1], float):
115 static[m[3]] += m[0] * m[1]
116 else:
117 static[m[3]] += m[0] * greylag.formula_mass(m[1])
118 # all residues get a StaticMod because our figures are more exact than the
119 # "default" ones
120 for res in static:
121 print >> f, "H\tStaticMod\t%s=%s" % (res, static[res])
123 diffs = set(generate_leaves(r['parameters']['potential_mods']))
124 for m in diffs:
125 symbol = m[5]
126 if not symbol:
127 warn("unmarked potential mod present--omitting")
128 continue
129 if isinstance(m[1], float):
130 delta = m[0] * m[1]
131 else:
132 delta = m[0] * greylag.formula_mass(m[1])
133 print >> f, "H\tDiffMod\t%s%s=%+f" % (m[3], m[5], delta)
136 def print_header(f, r):
137 """Output H lines."""
138 print >> f, "H\tSQTGenerator\tgreylag"
139 print >> f, "H\tSQTGeneratorVersion\t%s" % greylag.VERSION
141 print_legacy_headers(f, r)
143 for pk, pv in sorted(r['parameters'].items()):
144 print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
147 def print_regime_manifest(f, regime_manifest):
148 """Output R lines."""
149 for regime_no, residue, mass in regime_manifest:
150 print >> f, "R\t%s\t%s\t%s" % (regime_no, residue, mass)
153 def generate_marked_sequence(marks, peptide_sequence):
154 """Yield the characters in a marked version of peptide_sequence. If there
155 is both a terminal mark and a mark on a terminal residue, the latter is
156 omitted.
158 >>> ''.join(generate_marked_sequence({}, 'ASDF'))
159 'ASDF'
160 >>> ''.join(generate_marked_sequence({ 'N' : '^', 1 : '*' }, 'ASDF'))
161 'A^S*DF'
162 >>> ''.join(generate_marked_sequence({ 'N' : '^', 0 : '*' }, 'ASDF'))
163 'A^SDF'
167 for n, c in enumerate(peptide_sequence):
168 yield c
169 if n == 0 and 'N' in marks:
170 yield marks.get('N')
171 elif n == len(peptide_sequence)-1 and 'C' in marks:
172 yield marks.get('C')
173 else:
174 yield marks.get(n, '')
177 def print_spectrum(f, modification_conjucts, result, enhanced=False):
178 """Print the lines (S/M/L/A*) associated with the given spectrum."""
179 spectrum, sp_best_matches = result
181 scan_low, scan_high, charge = spectrum['name'].rsplit('.', 2)
182 print >> f, '\t'.join(str(v) for v in
183 ["S", scan_low, scan_high, charge, 0, 'honk',
184 spectrum['mass'],
185 round(math.log(spectrum['total_ion_current']), 4),
186 0, spectrum['comparisons']])
188 best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
189 if not best_scores:
190 return
192 # score -> rank
193 rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
195 best_score = best_scores[0]
196 for match in sp_best_matches:
197 score = match['score']
198 if score == 0:
199 continue # null entry
200 rank = rank_map[score]
201 score_delta = (best_score - score) / best_score
203 mass_regime_index = match.get('mass_regime_index', 0)
204 conjunct_index = match.get('conjunct_index', 0)
206 am_lines = [] # residue mods
207 marks = {}
208 for mt in match.get('mass_trace', []):
209 conjunct_item = modification_conjucts[conjunct_index][2][mt['conjunct_item_index']]
210 mark = conjunct_item[5]
211 if mark:
212 assert mt['position'] not in marks, "not yet implemented"
213 marks[mt['position']] = mark
214 if enhanced:
215 name = conjunct_item[4] or ''
216 delta = conjunct_item[6][mass_regime_index][1]
217 am_lines.append(["AM", mt['position'], round(delta, 8), name])
219 at_lines = [] # terminal mods
220 N_conjunct_item = modification_conjucts[conjunct_index][0]
221 if N_conjunct_item:
222 assert len(N_conjunct_item) == 1
223 mark = N_conjunct_item[0][5]
224 if mark:
225 marks['N'] = mark
226 if enhanced:
227 name = conjunct_item[4] or ''
228 delta = conjunct_item[6][mass_regime_index][1]
229 at_lines.append(["AT", 'N', round(delta, 8), name])
230 C_conjunct_item = modification_conjucts[conjunct_index][1]
231 if C_conjunct_item:
232 assert len(C_conjunct_item) == 1
233 mark = C_conjunct_item[0][5]
234 if mark:
235 marks['C'] = mark
236 if enhanced:
237 name = conjunct_item[4] or ''
238 delta = conjunct_item[6][mass_regime_index][1]
239 at_lines.append(["AT", 'C', round(delta, 8), name])
241 marked_sequence = match['peptide_sequence']
242 if marks:
243 marked_sequence = ''.join(generate_marked_sequence(marks,
244 marked_sequence))
246 # FIX: also need M lines for fixed mods, terminal mods, isotope mods
247 # (N15), etc.
248 # Note: scores, being log probability, are non-positive, but we
249 # flip the sign in the SQT output
250 print >> f, '\t'.join(str(v) for v in
251 ["M", rank, rank,
252 round(match['predicted_parent_mass'], 5),
253 round(score_delta, 4), round(-score, 4), 0,
254 # FIX: 1 of 2 ions found--keep DTASelect happy
255 1, 2,
256 '.'.join((match['N_peptide_flank'],
257 marked_sequence,
258 match['C_peptide_flank'])),
259 'U'])
261 if enhanced:
262 if mass_regime_index != 0:
263 print >> f, "AR\t%s" % mass_regime_index
264 if match.get('pca_delta', 0) != 0:
265 print >> f, "APCA\t%s" % match['pca_delta']
266 for am_line in sorted(am_lines):
267 print >> f, '\t'.join(str(v) for v in am_line)
268 for at_line in at_lines:
269 print >> f, '\t'.join(str(v) for v in at_line)
271 for sequence_name, peptide_begin in match['loci']:
272 print >> f, 'L\t%s\t%s' % (sequence_name, peptide_begin)
275 def dump_result_file(result_file):
276 """Pretty-print result file contents."""
277 try:
278 r = None
279 while True:
280 r = pickle.load(result_file)
281 pprint(r)
282 except EOFError, e:
283 if r != 'complete':
284 error("result file truncated or invalid")
287 def generate_results(result_file):
288 while True:
289 try:
290 r = pickle.load(result_file)
291 except EOFError:
292 error("result file truncated or invalid")
293 if r == 'complete':
294 return
295 yield r
298 def main(args=sys.argv[1:]):
299 parser = optparse.OptionParser(usage=
300 "usage: %prog [options] <result-file>",
301 description=__doc__, version=greylag.VERSION)
302 pa = parser.add_option
303 pa("-d", "--output-directory", dest="output_directory",
304 help="directory where output files are written [default is '.']",
305 metavar="DIR")
306 pa("-e", "--enhanced-output", action="store_true", dest="enhanced_output",
307 help="include extra DTASelect-incompatible information in output")
308 pa("-v", "--verbose", action="store_true", dest="verbose",
309 help="be verbose")
310 pa("--dump", action="store_true", dest="dump",
311 help="just dump the result file to stdout (for debugging)")
312 pa("--copyright", action="store_true", dest="copyright",
313 help="print copyright and exit")
314 (options, args) = parser.parse_args(args=args)
316 if options.copyright:
317 print __copyright__
318 sys.exit(0)
320 if len(args) != 1:
321 parser.print_help()
322 sys.exit(1)
324 result_file = open(args[0], 'rb')
326 if options.dump:
327 dump_result_file(result_file)
328 return
330 header = pickle.load(result_file)
332 spectrum_fns = header['spectrum files']
333 assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
334 assert all(os.path.dirname(fn) == '' for fn in spectrum_fns)
336 basenames = [ os.path.splitext(spectrum_fn)[0]
337 for spectrum_fn in spectrum_fns ]
339 # spectrum file index -> sqt file
340 spectrum_filemap = dict((n, open(fn+'.sqt', 'w'))
341 for n, fn in enumerate(basenames))
343 for sqt_file in spectrum_filemap.values():
344 print_header(sqt_file, header)
345 print_regime_manifest(sqt_file, header['mass regime manifest'])
347 for result in generate_results(result_file):
348 if result:
349 print_spectrum(spectrum_filemap[result[0]['file_id']],
350 header['modification conjuncts'], result,
351 options.enhanced_output)
354 if __name__ == '__main__':
355 main()