1 #!/usr/bin/env greylag-python
3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
8 from __future__
import with_statement
11 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
31 from collections
import defaultdict
38 from pprint
import pprint
43 print >> sys
.stderr
, 'warning:', s
45 sys
.exit('error: ' + s
)
48 def print_header(f
, r
):
50 print >> f
, "H\tSQTGenerator\tgreylag"
51 print >> f
, "H\tSQTGeneratorVersion\t%s" % __version__
52 for db
in r
['databases']:
53 print >> f
, "H\tDatabase\t%s" % db
54 for pk
, pv
in sorted(r
['parameters'].items()):
55 print >> f
, "H\tParameter\t%s\t%s" % (pk
, pv
)
56 # FIX: are these necessary?
57 # H StaticMod C=160.1388
61 def print_regime_manifest(f
, regime_manifest
):
63 for regime_no
, residue
, mass
in regime_manifest
:
64 print >> f
, "R\t%s\t%s\t%s" % (regime_no
, residue
, mass
)
67 def generate_marked_sequence(match_name_map
, mass_trace
, peptide_sequence
):
68 """Yield the characters in a marked version of peptide_sequence.
70 >>> ''.join(generate_marked_sequence({}, [], 'ASDF'))
72 >>> ''.join(generate_marked_sequence({('S', 80) : ('phosphorylation', '*')},
73 ... [{'position' : 1, 'delta' : 80}],
80 trace
= sorted(mass_trace
,
81 key
=lambda x
: (x
['position'], x
['delta']), reverse
=True)
82 for n
, r
in enumerate(peptide_sequence
):
84 while trace
and trace
[-1]['position'] == n
:
85 yield match_name_map
[(r
, trace
[-1]['delta'])][1]
89 def print_spectrum(f
, mod_name_map
, sp_matches
):
90 """Print the lines (S/M/L/A*) associated with the given spectrum."""
91 spectrum
, sp_best_matches
= sp_matches
93 scan_low
, scan_high
, _rest
= spectrum
['name'].split('.', 2)
94 print >> f
, '\t'.join(str(v
) for v
in
95 ["S", scan_low
, scan_high
, spectrum
['charge'], 0,
96 'honk', spectrum
['mass'],
97 round(math
.log(spectrum
['total_ion_current']), 4),
98 0, spectrum
['comparisons']])
100 best_scores
= sorted(list(set(m
['score'] for m
in sp_best_matches
)))
102 rank_map
= dict(zip(best_scores
, range(1,len(best_scores
)+1)))
104 best_score
= best_scores
[0]
105 for match
in sp_best_matches
:
106 score
= match
['score']
108 continue # null entry
109 rank
= rank_map
[score
]
110 score_delta
= (best_score
- score
) / best_score
112 match_name_map
= mod_name_map
[(match
['mass_regime_index'],
113 match
['conjunct_index'])]
115 = ''.join(generate_marked_sequence(match_name_map
,
117 match
['peptide_sequence']))
119 # FIX: also need M lines for fixed mods, terminal mods, isotope mods
121 # Note: scores, being log probability, are non-positive, but we
122 # flip the sign in the SQT output
123 print >> f
, '\t'.join(str(v
) for v
in
125 round(match
['predicted_parent_mass'], 5),
126 round(score_delta
, 4), round(-score
, 4), 0,
127 # 1 of 2 ions found--keep DTASelect happy
129 "-.%s.-" % marked_sequence
, 'U'])
130 if match
['mass_regime_index'] != 0:
131 print >> f
, "AR\t%s" % match
['mass_regime_index']
132 if match
['pca_delta'] != 0:
133 print >> f
, "APCA\t%s" % match
['pca_delta']
134 for mt
in match
['mass_trace']:
135 name
= (match_name_map
[(match
['peptide_sequence'][mt
['position']],
137 fs
= ["AM", mt
['position'], round(mt
['delta'], 5),
138 name
if name
else '']
139 print >> f
, '\t'.join(str(v
) for v
in fs
)
141 assert len(match
['sequence_name']) == len(match
['peptide_begin'])
142 for sn
, pb
in zip(match
['sequence_name'], match
['peptide_begin']):
143 print >> f
, 'L\t%s\t%s' % (sn
, pb
)
146 def main(args
=sys
.argv
[1:]):
147 parser
= optparse
.OptionParser(usage
=
148 "usage: %prog [options] <result-file>",
149 description
=__doc__
, version
=__version__
)
150 pa
= parser
.add_option
151 pa("-d", "--output-directory", dest
="output_directory",
152 help="directory where output files are written [default is '.']",
154 pa("-v", "--verbose", action
="store_true", dest
="verbose",
156 pa("--dump", action
="store_true", dest
="dump",
157 help="dump the result file (for debugging)")
158 pa("--copyright", action
="store_true", dest
="copyright",
159 help="print copyright and exit")
160 (options
, args
) = parser
.parse_args(args
=args
)
162 if options
.copyright
:
170 with contextlib
.closing(gzip
.open(args
[0])) as r_file
:
171 r
= cPickle
.load(r_file
)
177 spectrum_fns
= r
['spectrum files']
178 assert len(spectrum_fns
) == len(set(spectrum_fns
)) # check uniqueness
180 matches
= r
['matches'].items()
182 # order matches by (spectrum filename, spectrum name)
183 def filename_specname_order(m
):
185 return sp
['file_id'], sp
['name']
187 matches
.sort(key
=filename_specname_order
)
190 # (regime index, conjunct index) -> (residue, delta)
191 # -> (mod name, mod marker)
192 mod_name_map
= defaultdict(dict)
193 for regime
in range(len(r
['mass regime atomic masses'])):
194 for cj_n
, (N_cj
, C_cj
, R_cj
) in enumerate(r
['modification conjuncts']):
195 assert not N_cj
and not C_cj
, "not yet implemented"
198 mod_name_map
[(regime
, cj_n
)][(res
, cj
[6][regime
][1])] \
200 #pprint(mod_name_map.items())
202 for spectrum_n
, spectrum_fn
in enumerate(spectrum_fns
):
203 assert os
.path
.dirname(spectrum_fn
) == ''
204 sqt_fn
= os
.path
.splitext(spectrum_fn
)[0] + '.sqt'
205 with
open(sqt_fn
, 'w') as sqtf
:
206 print_header(sqtf
, r
)
207 print_regime_manifest(sqtf
, r
['mass regime manifest'])
208 for match
in matches
:
209 if match
[1][0]['file_id'] == spectrum_n
:
210 print_spectrum(sqtf
, mod_name_map
, match
[1])
213 if __name__
== '__main__':