3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
8 from __future__
import with_statement
11 greylag, a collection of programs for MS/MS protein analysis
12 Copyright (C) 2006-2008 Stowers Institute for Medical Research
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
28 Stowers Institute for Medical Research
30 Kansas City, Missouri 64110
35 import cPickle
as pickle
39 from pprint
import pprint
45 # gc kills our performance, so disable it. gc only matters for cycles, which
46 # we (hope we) don't create. See the gc module docs.
47 import gc
; gc
.disable()
51 print >> sys
.stderr
, 'warning:', s
53 sys
.exit('error: ' + s
)
56 def generate_leaves(tree
):
57 """Given a tree made of lists, yield the leaves.
59 >>> list(generate_leaves([(1,2,3)]))
61 >>> list(generate_leaves([[[(1,2,3)]], (4,5)]))
65 if not isinstance(tree
, list):
69 for l
in generate_leaves(b
):
73 # Example legacy headers:
74 # H Database /foo/bar/db.fasta
75 # H PrecursorMasses AVG
76 # H FragmentMasses MONO
77 # H StaticMod C=160.1388
79 # H DiffMod STY#=+80.0
81 def print_legacy_headers(f
, r
):
82 """Output legacy H lines that readers of traditional SQT files may
83 expect. Warns if multiple databases, or multiple or isotopic mass regimes
87 if len(r
['databases']) > 1:
88 warn("multiple databases present")
89 for db
in r
['databases']:
90 print >> f
, "H\tDatabase\t%s" % db
92 precursor_masses
, fragment_masses
= 'MONO', 'MONO'
93 mass_regimes
= r
['parameters']['mass_regimes']
94 if len(mass_regimes
) > 1:
95 warn("multiple mass regimes present--legacy SQT headers will be based"
97 parent_regime
, fragment_regime
= mass_regimes
[0]
98 if parent_regime
[1] or fragment_regime
[1]:
99 warn("isotopic mass regimes present--ignored for legacy SQT headers")
100 assert parent_regime
[0] in ('MONO', 'AVG')
101 assert fragment_regime
[0] in ('MONO', 'AVG')
102 print >> f
, "H\tPrecursorMasses\t%s" % parent_regime
[0]
103 print >> f
, "H\tFragmentMasses\t%s" % fragment_regime
[0]
105 # FIX: merge this with mass calculation code in
106 # greylag.initialize_spectrum_parameters?
107 # FIX: currently assuming MONO only (what would be better?)
108 static
= dict((res
, greylag
.formula_mass(greylag
.RESIDUE_FORMULA
[res
]))
109 for res
in greylag
.RESIDUES
)
110 for m
in r
['parameters']['pervasive_mods']:
111 if isinstance(m
[1], float):
112 static
[m
[3]] += m
[0] * m
[1]
114 static
[m
[3]] += m
[0] * greylag
.formula_mass(m
[1])
115 # all residues get a StaticMod because our figures are more exact than the
118 print >> f
, "H\tStaticMod\t%s=%s" % (res
, static
[res
])
120 diffs
= set(generate_leaves(r
['parameters']['potential_mods']))
124 warn("unmarked potential mod present--omitting")
126 if isinstance(m
[1], float):
129 delta
= m
[0] * greylag
.formula_mass(m
[1])
130 print >> f
, "H\tDiffMod\t%s%s=%+f" % (m
[3], m
[5], delta
)
133 def print_header(f
, r
):
134 """Output H lines."""
135 print >> f
, "H\tSQTGenerator\tgreylag"
136 print >> f
, "H\tSQTGeneratorVersion\t%s" % greylag
.VERSION
138 print_legacy_headers(f
, r
)
140 for pk
, pv
in sorted(r
['parameters'].items()):
141 print >> f
, "H\tParameter\t%s\t%s" % (pk
, pv
)
144 def print_regime_manifest(f
, regime_manifest
):
145 """Output R lines."""
146 for regime_no
, residue
, mass
in regime_manifest
:
147 print >> f
, "R\t%s\t%s\t%s" % (regime_no
, residue
, mass
)
150 def generate_marked_sequence(marks
, peptide_sequence
):
151 """Yield the characters in a marked version of peptide_sequence. If there
152 is both a terminal mark and a mark on a terminal residue, the latter is
155 >>> ''.join(generate_marked_sequence({}, 'ASDF'))
157 >>> ''.join(generate_marked_sequence({ 'N' : '^', 1 : '*' }, 'ASDF'))
159 >>> ''.join(generate_marked_sequence({ 'N' : '^', 0 : '*' }, 'ASDF'))
164 for n
, c
in enumerate(peptide_sequence
):
166 if n
== 0 and 'N' in marks
:
168 elif n
== len(peptide_sequence
)-1 and 'C' in marks
:
171 yield marks
.get(n
, '')
174 def print_spectrum(f
, modification_conjucts
, result
, enhanced
=False):
175 """Print the lines (S/M/L/A*) associated with the given spectrum."""
176 spectrum
, sp_best_matches
= result
178 scan_low
, scan_high
, charge
= spectrum
['name'].rsplit('.', 2)
179 print >> f
, '\t'.join(str(v
) for v
in
180 ["S", scan_low
, scan_high
, charge
, 0, 'honk',
182 round(math
.log(spectrum
['total_ion_current']), 4),
183 0, spectrum
['comparisons']])
185 best_scores
= sorted(list(set(m
['score'] for m
in sp_best_matches
)))
190 rank_map
= dict(zip(best_scores
, range(1,len(best_scores
)+1)))
192 best_score
= best_scores
[0]
193 for match
in sp_best_matches
:
194 score
= match
['score']
196 continue # null entry
197 rank
= rank_map
[score
]
198 score_delta
= (best_score
- score
) / best_score
200 mass_regime_index
= match
.get('mass_regime_index', 0)
201 conjunct_index
= match
.get('conjunct_index', 0)
203 am_lines
= [] # residue mods
205 for mt
in match
.get('mass_trace', []):
206 conjunct_item
= modification_conjucts
[conjunct_index
][2][mt
['conjunct_item_index']]
207 mark
= conjunct_item
[5]
209 assert mt
['position'] not in marks
, "not yet implemented"
210 marks
[mt
['position']] = mark
212 name
= conjunct_item
[4] or ''
213 delta
= conjunct_item
[6][mass_regime_index
][1]
214 am_lines
.append(["AM", mt
['position'], round(delta
, 8), name
])
216 at_lines
= [] # terminal mods
217 N_conjunct_item
= modification_conjucts
[conjunct_index
][0]
219 assert len(N_conjunct_item
) == 1
220 mark
= N_conjunct_item
[0][5]
224 name
= conjunct_item
[4] or ''
225 delta
= conjunct_item
[6][mass_regime_index
][1]
226 at_lines
.append(["AT", 'N', round(delta
, 8), name
])
227 C_conjunct_item
= modification_conjucts
[conjunct_index
][1]
229 assert len(C_conjunct_item
) == 1
230 mark
= C_conjunct_item
[0][5]
234 name
= conjunct_item
[4] or ''
235 delta
= conjunct_item
[6][mass_regime_index
][1]
236 at_lines
.append(["AT", 'C', round(delta
, 8), name
])
238 marked_sequence
= match
['peptide_sequence']
240 marked_sequence
= ''.join(generate_marked_sequence(marks
,
243 # FIX: also need M lines for fixed mods, terminal mods, isotope mods
245 # Note: scores, being log probability, are non-positive, but we
246 # flip the sign in the SQT output
247 print >> f
, '\t'.join(str(v
) for v
in
249 round(match
['predicted_parent_mass'], 5),
250 round(score_delta
, 4), round(-score
, 4), 0,
251 # FIX: 1 of 2 ions found--keep DTASelect happy
253 '.'.join((match
['N_peptide_flank'],
255 match
['C_peptide_flank'])),
259 if mass_regime_index
!= 0:
260 print >> f
, "AR\t%s" % mass_regime_index
261 if match
.get('pca_delta', 0) != 0:
262 print >> f
, "APCA\t%s" % match
['pca_delta']
263 for am_line
in sorted(am_lines
):
264 print >> f
, '\t'.join(str(v
) for v
in am_line
)
265 for at_line
in at_lines
:
266 print >> f
, '\t'.join(str(v
) for v
in at_line
)
268 for sequence_name
, peptide_begin
in match
['loci']:
269 print >> f
, 'L\t%s\t%s' % (sequence_name
, peptide_begin
)
272 def dump_result_file(result_file
):
273 """Pretty-print result file contents."""
277 r
= pickle
.load(result_file
)
281 error("result file truncated or invalid")
284 def generate_results(result_file
):
287 r
= pickle
.load(result_file
)
289 error("result file truncated or invalid")
295 def main(args
=sys
.argv
[1:]):
296 parser
= optparse
.OptionParser(usage
=
297 "usage: %prog [options] <result-file>",
298 description
=__doc__
, version
=greylag
.VERSION
)
299 pa
= parser
.add_option
300 pa("-d", "--output-directory", dest
="output_directory",
301 help="directory where output files are written [default is '.']",
303 pa("-e", "--enhanced-output", action
="store_true", dest
="enhanced_output",
304 help="include extra DTASelect-incompatible information in output")
305 pa("-v", "--verbose", action
="store_true", dest
="verbose",
307 pa("--dump", action
="store_true", dest
="dump",
308 help="just dump the result file to stdout (for debugging)")
309 pa("--copyright", action
="store_true", dest
="copyright",
310 help="print copyright and exit")
311 (options
, args
) = parser
.parse_args(args
=args
)
313 if options
.copyright
:
321 result_file
= open(args
[0], 'rb')
324 dump_result_file(result_file
)
327 header
= pickle
.load(result_file
)
329 spectrum_fns
= header
['spectrum files']
330 assert len(spectrum_fns
) == len(set(spectrum_fns
)) # check uniqueness
331 assert all(os
.path
.dirname(fn
) == '' for fn
in spectrum_fns
)
333 basenames
= [ os
.path
.splitext(spectrum_fn
)[0]
334 for spectrum_fn
in spectrum_fns
]
336 # spectrum file index -> sqt file
337 spectrum_filemap
= dict((n
, open(fn
+'.sqt', 'w'))
338 for n
, fn
in enumerate(basenames
))
340 for sqt_file
in spectrum_filemap
.values():
341 print_header(sqt_file
, header
)
342 print_regime_manifest(sqt_file
, header
['mass regime manifest'])
344 for result
in generate_results(result_file
):
346 print_spectrum(spectrum_filemap
[result
[0]['file_id']],
347 header
['modification conjuncts'], result
,
348 options
.enhanced_output
)
351 if __name__
== '__main__':