1 #!/usr/bin/env greylag-python
3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
8 from __future__
import with_statement
11 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
31 from collections
import defaultdict
37 from pprint
import pprint
42 print >> sys
.stderr
, 'warning:', s
44 sys
.exit('error: ' + s
)
47 def print_header(f
, r
):
49 print >> f
, "H\tSQTGenerator\tgreylag"
50 print >> f
, "H\tSQTGeneratorVersion\t%s" % __version__
51 for db
in r
['databases']:
52 print >> f
, "H\tDatabase\t%s" % db
53 for pk
, pv
in sorted(r
['parameters'].items()):
54 print >> f
, "H\tParameter\t%s\t%s" % (pk
, pv
)
55 # FIX: are these necessary?
56 # H StaticMod C=160.1388
60 def print_regime_manifest(f
, regime_manifest
):
62 for regime_no
, residue
, mass
in regime_manifest
:
63 print >> f
, "R\t%s\t%s\t%s" % (regime_no
, residue
, mass
)
66 def generate_marked_sequence(match_name_map
, mass_trace
, peptide_sequence
):
67 """Yield the characters in a marked version of peptide_sequence.
69 >>> ''.join(generate_marked_sequence({}, [], 'ASDF'))
71 >>> ''.join(generate_marked_sequence({('S', 80) : ('phosphorylation', '*')},
72 ... [{'position' : 1, 'delta' : 80}],
79 trace
= sorted(mass_trace
,
80 key
=lambda x
: (x
['position'], x
['delta']), reverse
=True)
81 for n
, r
in enumerate(peptide_sequence
):
83 while trace
and trace
[-1]['position'] == n
:
84 yield match_name_map
[(r
, trace
[-1]['delta'])][1]
88 def print_spectrum(f
, mod_name_map
, sp_name
, sp_matches
):
89 """Print the lines (S/M/L/A*) associated with the given spectrum."""
90 spectrum
, sp_best_matches
= sp_matches
92 scan_low
, scan_high
, charge
= sp_name
.rsplit('.', 2)
93 print >> f
, '\t'.join(str(v
) for v
in
94 ["S", scan_low
, scan_high
, charge
, 0, 'honk',
96 round(math
.log(spectrum
['total_ion_current']), 4),
97 0, spectrum
['comparisons']])
99 best_scores
= sorted(list(set(m
['score'] for m
in sp_best_matches
)))
104 rank_map
= dict(zip(best_scores
, range(1,len(best_scores
)+1)))
106 best_score
= best_scores
[0]
107 for match
in sp_best_matches
:
108 score
= match
['score']
110 continue # null entry
111 rank
= rank_map
[score
]
112 score_delta
= (best_score
- score
) / best_score
114 match_name_map
= mod_name_map
[(match
.get('mass_regime_index', 0),
115 match
.get('conjunct_index', 0))]
117 = ''.join(generate_marked_sequence(match_name_map
,
118 match
.get('mass_trace', []),
119 match
['peptide_sequence']))
121 # FIX: also need M lines for fixed mods, terminal mods, isotope mods
123 # Note: scores, being log probability, are non-positive, but we
124 # flip the sign in the SQT output
125 print >> f
, '\t'.join(str(v
) for v
in
127 round(match
['predicted_parent_mass'], 5),
128 round(score_delta
, 4), round(-score
, 4), 0,
129 # 1 of 2 ions found--keep DTASelect happy
131 "-.%s.-" % marked_sequence
, 'U'])
132 if match
.get('mass_regime_index', 0) != 0:
133 print >> f
, "AR\t%s" % match
['mass_regime_index']
134 if match
.get('pca_delta', 0) != 0:
135 print >> f
, "APCA\t%s" % match
['pca_delta']
136 for mt
in match
.get('mass_trace', []):
137 name
= (match_name_map
[(match
['peptide_sequence'][mt
['position']],
139 fs
= ["AM", mt
['position'], round(mt
['delta'], 5),
140 name
if name
else '']
141 print >> f
, '\t'.join(str(v
) for v
in fs
)
143 assert len(match
['sequence_name']) == len(match
['peptide_begin'])
144 for sn
, pb
in zip(match
['sequence_name'], match
['peptide_begin']):
145 print >> f
, 'L\t%s\t%s' % (sn
, pb
)
148 def main(args
=sys
.argv
[1:]):
149 parser
= optparse
.OptionParser(usage
=
150 "usage: %prog [options] <result-file>",
151 description
=__doc__
, version
=__version__
)
152 pa
= parser
.add_option
153 pa("-d", "--output-directory", dest
="output_directory",
154 help="directory where output files are written [default is '.']",
156 pa("-v", "--verbose", action
="store_true", dest
="verbose",
158 pa("--dump", action
="store_true", dest
="dump",
159 help="dump the result file (for debugging)")
160 pa("--copyright", action
="store_true", dest
="copyright",
161 help="print copyright and exit")
162 (options
, args
) = parser
.parse_args(args
=args
)
164 if options
.copyright
:
172 with contextlib
.closing(open(args
[0])) as r_file
:
173 r
= cPickle
.load(r_file
)
179 spectrum_fns
= r
['spectrum files']
180 assert len(spectrum_fns
) == len(set(spectrum_fns
)) # check uniqueness
182 matches
= r
['matches'].items()
185 # (regime index, conjunct index) -> (residue, delta)
186 # -> (mod name, mod marker)
187 mod_name_map
= defaultdict(dict)
188 for regime
in range(len(r
['mass regime atomic masses'])):
189 for cj_n
, (N_cj
, C_cj
, R_cj
) in enumerate(r
['modification conjuncts']):
190 assert not N_cj
and not C_cj
, "not yet implemented"
193 mod_name_map
[(regime
, cj_n
)][(res
, cj
[6][regime
][1])] \
195 #pprint(mod_name_map.items())
197 for spectrum_n
, spectrum_fn
in enumerate(spectrum_fns
):
198 assert os
.path
.dirname(spectrum_fn
) == ''
199 sqt_fn
= os
.path
.splitext(spectrum_fn
)[0] + '.sqt'
200 with
open(sqt_fn
, 'w') as sqtf
:
201 print_header(sqtf
, r
)
202 print_regime_manifest(sqtf
, r
['mass regime manifest'])
203 for match
in matches
:
204 if match
[0][0] == spectrum_n
:
205 print_spectrum(sqtf
, mod_name_map
, match
[0][1], match
[1])
208 if __name__
== '__main__':