minor code reorg (use new Python if/else operator)
[greylag.git] / greylag_sqt.py
blobe9951148675c0586bf8bff422b67267d1769e06c
1 #!/usr/bin/env greylag-python
3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
6 '''
8 from __future__ import with_statement
10 __copyright__ = '''
11 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 '''
28 __version__ = "0.0"
31 from collections import defaultdict
32 import contextlib
33 import cPickle
34 import math
35 import optparse
36 import os.path
37 from pprint import pprint
38 import sys
41 def warn(s):
42 print >> sys.stderr, 'warning:', s
43 def error(s):
44 sys.exit('error: ' + s)
47 def print_header(f, r):
48 """Output H lines."""
49 print >> f, "H\tSQTGenerator\tgreylag"
50 print >> f, "H\tSQTGeneratorVersion\t%s" % __version__
51 for db in r['databases']:
52 print >> f, "H\tDatabase\t%s" % db
53 for pk, pv in sorted(r['parameters'].items()):
54 print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
55 # FIX: are these necessary?
56 # H StaticMod C=160.1388
57 # H DiffMod TNA*=+2.0
60 def print_regime_manifest(f, regime_manifest):
61 """Output R lines."""
62 for regime_no, residue, mass in regime_manifest:
63 print >> f, "R\t%s\t%s\t%s" % (regime_no, residue, mass)
66 def generate_marked_sequence(match_name_map, mass_trace, peptide_sequence):
67 """Yield the characters in a marked version of peptide_sequence.
69 >>> ''.join(generate_marked_sequence({}, [], 'ASDF'))
70 'ASDF'
71 >>> ''.join(generate_marked_sequence({('S', 80) : ('phosphorylation', '*')},
72 ... [{'position' : 1, 'delta' : 80}],
73 ... 'ASDF'))
74 'AS*DF'
76 """
78 # FIX: handle [ ]
79 trace = sorted(mass_trace,
80 key=lambda x: (x['position'], x['delta']), reverse=True)
81 for n, r in enumerate(peptide_sequence):
82 yield r
83 while trace and trace[-1]['position'] == n:
84 yield match_name_map[(r, trace[-1]['delta'])][1]
85 trace.pop()
88 def print_spectrum(f, mod_name_map, sp_name, sp_matches):
89 """Print the lines (S/M/L/A*) associated with the given spectrum."""
90 spectrum, sp_best_matches = sp_matches
92 scan_low, scan_high, charge = sp_name.rsplit('.', 2)
93 print >> f, '\t'.join(str(v) for v in
94 ["S", scan_low, scan_high, charge, 0, 'honk',
95 spectrum['mass'],
96 round(math.log(spectrum['total_ion_current']), 4),
97 0, spectrum['comparisons']])
99 best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
100 if not best_scores:
101 return
103 # score -> rank
104 rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
106 best_score = best_scores[0]
107 for match in sp_best_matches:
108 score = match['score']
109 if score == 0:
110 continue # null entry
111 rank = rank_map[score]
112 score_delta = (best_score - score) / best_score
114 match_name_map = mod_name_map[(match.get('mass_regime_index', 0),
115 match.get('conjunct_index', 0))]
116 marked_sequence \
117 = ''.join(generate_marked_sequence(match_name_map,
118 match.get('mass_trace', []),
119 match['peptide_sequence']))
121 # FIX: also need M lines for fixed mods, terminal mods, isotope mods
122 # (N15), etc.
123 # Note: scores, being log probability, are non-positive, but we
124 # flip the sign in the SQT output
125 print >> f, '\t'.join(str(v) for v in
126 ["M", rank, rank,
127 round(match['predicted_parent_mass'], 5),
128 round(score_delta, 4), round(-score, 4), 0,
129 # 1 of 2 ions found--keep DTASelect happy
130 1, 2,
131 "-.%s.-" % marked_sequence, 'U'])
132 if match.get('mass_regime_index', 0) != 0:
133 print >> f, "AR\t%s" % match['mass_regime_index']
134 if match.get('pca_delta', 0) != 0:
135 print >> f, "APCA\t%s" % match['pca_delta']
136 for mt in match.get('mass_trace', []):
137 name = (match_name_map[(match['peptide_sequence'][mt['position']],
138 mt['delta'])][0])
139 fs = ["AM", mt['position'], round(mt['delta'], 5),
140 name if name else '']
141 print >> f, '\t'.join(str(v) for v in fs)
143 assert len(match['sequence_name']) == len(match['peptide_begin'])
144 for sn, pb in zip(match['sequence_name'], match['peptide_begin']):
145 print >> f, 'L\t%s\t%s' % (sn, pb)
148 def main(args=sys.argv[1:]):
149 parser = optparse.OptionParser(usage=
150 "usage: %prog [options] <result-file>",
151 description=__doc__, version=__version__)
152 pa = parser.add_option
153 pa("-d", "--output-directory", dest="output_directory",
154 help="directory where output files are written [default is '.']",
155 metavar="DIR")
156 pa("-v", "--verbose", action="store_true", dest="verbose",
157 help="be verbose")
158 pa("--dump", action="store_true", dest="dump",
159 help="dump the result file (for debugging)")
160 pa("--copyright", action="store_true", dest="copyright",
161 help="print copyright and exit")
162 (options, args) = parser.parse_args(args=args)
164 if options.copyright:
165 print __copyright__
166 sys.exit(0)
168 if len(args) != 1:
169 parser.print_help()
170 sys.exit(1)
172 with contextlib.closing(open(args[0])) as r_file:
173 r = cPickle.load(r_file)
175 if options.dump:
176 pprint(r)
177 return
179 spectrum_fns = r['spectrum files']
180 assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
182 matches = r['matches'].items()
183 matches.sort()
185 # (regime index, conjunct index) -> (residue, delta)
186 # -> (mod name, mod marker)
187 mod_name_map = defaultdict(dict)
188 for regime in range(len(r['mass regime atomic masses'])):
189 for cj_n, (N_cj, C_cj, R_cj) in enumerate(r['modification conjuncts']):
190 assert not N_cj and not C_cj, "not yet implemented"
191 for cj in R_cj:
192 for res in cj[3]:
193 mod_name_map[(regime, cj_n)][(res, cj[6][regime][1])] \
194 = (cj[4], cj[5])
195 #pprint(mod_name_map.items())
197 for spectrum_n, spectrum_fn in enumerate(spectrum_fns):
198 assert os.path.dirname(spectrum_fn) == ''
199 sqt_fn = os.path.splitext(spectrum_fn)[0] + '.sqt'
200 with open(sqt_fn, 'w') as sqtf:
201 print_header(sqtf, r)
202 print_regime_manifest(sqtf, r['mass regime manifest'])
203 for match in matches:
204 if match[0][0] == spectrum_n:
205 print_spectrum(sqtf, mod_name_map, match[0][1], match[1])
208 if __name__ == '__main__':
209 main()