update mass tracing--now works for basic cases
[greylag.git] / greylag_sqt.py
blobf4e5a05283987286fbfc209cb9d276f2a3d3df96
1 #!/usr/bin/env greylag-python
3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
6 '''
8 from __future__ import with_statement
10 __copyright__ = '''
11 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 '''
28 __version__ = "0.0"
31 from collections import defaultdict
32 import contextlib
33 import cPickle
34 import gzip
35 import math
36 import optparse
37 import os.path
38 from pprint import pprint
39 import sys
42 def warn(s):
43 print >> sys.stderr, 'warning:', s
44 def error(s):
45 sys.exit('error: ' + s)
48 def print_header(f, r):
49 """Output H lines."""
50 print >> f, "H\tSQTGenerator\tgreylag"
51 print >> f, "H\tSQTGeneratorVersion\t%s" % __version__
52 for db in r['databases']:
53 print >> f, "H\tDatabase\t%s" % db
54 for pk, pv in sorted(r['parameters'].items()):
55 print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
56 # FIX: are these necessary?
57 # H StaticMod C=160.1388
58 # H DiffMod TNA*=+2.0
61 def print_regime_manifest(f, regime_manifest):
62 """Output R lines."""
63 for regime_no, residue, mass in regime_manifest:
64 print >> f, "R\t%s\t%s\t%s" % (regime_no, residue, mass)
67 def generate_marked_sequence(match_name_map, mass_trace, peptide_sequence):
68 """Yield the characters in a marked version of peptide_sequence.
70 >>> ''.join(generate_marked_sequence({}, [], 'ASDF'))
71 'ASDF'
72 >>> ''.join(generate_marked_sequence({('S', 80) : ('phosphorylation', '*')},
73 ... [{'position' : 1, 'delta' : 80}],
74 ... 'ASDF'))
75 'AS*DF'
77 """
79 # FIX: handle [ ]
80 trace = sorted(mass_trace,
81 key=lambda x: (x['position'], x['delta']), reverse=True)
82 for n, r in enumerate(peptide_sequence):
83 yield r
84 while trace and trace[-1]['position'] == n:
85 yield match_name_map[(r, trace[-1]['delta'])][1]
86 trace.pop()
89 def print_spectrum(f, mod_name_map, sp_matches):
90 """Print the lines (S/M/L/A*) associated with the given spectrum."""
91 spectrum, sp_best_matches = sp_matches
93 scan_low, scan_high, _rest = spectrum['name'].split('.', 2)
94 print >> f, '\t'.join(str(v) for v in
95 ["S", scan_low, scan_high, spectrum['charge'], 0,
96 'honk', spectrum['mass'],
97 round(math.log(spectrum['total_ion_current']), 4),
98 0, spectrum['comparisons']])
100 best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
101 # score -> rank
102 rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
104 best_score = best_scores[0]
105 for match in sp_best_matches:
106 score = match['score']
107 if score == 0:
108 continue # null entry
109 rank = rank_map[score]
110 score_delta = (best_score - score) / best_score
112 match_name_map = mod_name_map[(match['mass_regime_index'],
113 match['conjunct_index'])]
114 marked_sequence \
115 = ''.join(generate_marked_sequence(match_name_map,
116 match['mass_trace'],
117 match['peptide_sequence']))
119 # FIX: also need M lines for fixed mods, terminal mods, isotope mods
120 # (N15), etc.
121 # Note: scores, being log probability, are non-positive, but we
122 # flip the sign in the SQT output
123 print >> f, '\t'.join(str(v) for v in
124 ["M", rank, rank,
125 round(match['predicted_parent_mass'], 5),
126 round(score_delta, 4), round(-score, 4), 0,
127 # 1 of 2 ions found--keep DTASelect happy
128 1, 2,
129 "-.%s.-" % marked_sequence, 'U'])
130 if match['mass_regime_index'] != 0:
131 print >> f, "AR\t%s" % match['mass_regime_index']
132 if match['pca_delta'] != 0:
133 print >> f, "APCA\t%s" % match['pca_delta']
134 for mt in match['mass_trace']:
135 name = (match_name_map[(match['peptide_sequence'][mt['position']],
136 mt['delta'])][0])
137 fs = ["AM", mt['position'], round(mt['delta'], 5),
138 name if name else '']
139 print >> f, '\t'.join(str(v) for v in fs)
141 assert len(match['sequence_name']) == len(match['peptide_begin'])
142 for sn, pb in zip(match['sequence_name'], match['peptide_begin']):
143 print >> f, 'L\t%s\t%s' % (sn, pb)
146 def main(args=sys.argv[1:]):
147 parser = optparse.OptionParser(usage=
148 "usage: %prog [options] <result-file>",
149 description=__doc__, version=__version__)
150 pa = parser.add_option
151 pa("-d", "--output-directory", dest="output_directory",
152 help="directory where output files are written [default is '.']",
153 metavar="DIR")
154 pa("-v", "--verbose", action="store_true", dest="verbose",
155 help="be verbose")
156 pa("--dump", action="store_true", dest="dump",
157 help="dump the result file (for debugging)")
158 pa("--copyright", action="store_true", dest="copyright",
159 help="print copyright and exit")
160 (options, args) = parser.parse_args(args=args)
162 if options.copyright:
163 print __copyright__
164 sys.exit(0)
166 if len(args) != 1:
167 parser.print_help()
168 sys.exit(1)
170 with contextlib.closing(gzip.open(args[0])) as r_file:
171 r = cPickle.load(r_file)
173 if options.dump:
174 pprint(r)
175 return
177 spectrum_fns = r['spectrum files']
178 assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
180 matches = r['matches'].items()
182 # order matches by (spectrum filename, spectrum name)
183 def filename_specname_order(m):
184 sp = m[1][0]
185 return sp['file_id'], sp['name']
187 matches.sort(key=filename_specname_order)
190 # (regime index, conjunct index) -> (residue, delta)
191 # -> (mod name, mod marker)
192 mod_name_map = defaultdict(dict)
193 for regime in range(len(r['mass regime atomic masses'])):
194 for cj_n, (N_cj, C_cj, R_cj) in enumerate(r['modification conjuncts']):
195 assert not N_cj and not C_cj, "not yet implemented"
196 for cj in R_cj:
197 for res in cj[3]:
198 mod_name_map[(regime, cj_n)][(res, cj[6][regime][1])] \
199 = (cj[4], cj[5])
200 #pprint(mod_name_map.items())
202 for spectrum_n, spectrum_fn in enumerate(spectrum_fns):
203 assert os.path.dirname(spectrum_fn) == ''
204 sqt_fn = os.path.splitext(spectrum_fn)[0] + '.sqt'
205 with open(sqt_fn, 'w') as sqtf:
206 print_header(sqtf, r)
207 print_regime_manifest(sqtf, r['mass regime manifest'])
208 for match in matches:
209 if match[1][0]['file_id'] == spectrum_n:
210 print_spectrum(sqtf, mod_name_map, match[1])
213 if __name__ == '__main__':
214 main()