Check sequence db locus name uniqueness
[greylag.git] / greylag-sqt.py
blob84a331e9a2a495947e9638d2f36deeea4bb9b1f4
1 #!/usr/bin/env python
3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
6 '''
8 from __future__ import with_statement
10 __copyright__ = '''
11 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 '''
28 __version__ = "0.0"
31 import contextlib
32 import cPickle
33 import gzip
34 import math
35 import optparse
36 import os.path
37 from pprint import pprint
38 import sys
41 def warn(s):
42 print >> sys.stderr, 'warning:', s
43 def error(s):
44 sys.exit('error: ' + s)
47 def print_header(f, r):
48 print >> f, "H\tSQTGenerator\tgreylag"
49 print >> f, "H\tSQTGeneratorVersion\t%s" % __version__
50 for db in r['databases']:
51 print >> f, "H\tDatabase\t%s" % db
52 for pk, pv in r['parameters'].items():
53 print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
54 # How should these be specified?
55 # What does DTASelect do with these?
56 # H StaticMod C=160.1388
57 # H DiffMod TNA*=+2.0
60 def print_spectrum(f, spectrum, sp_best_matches):
61 scan_low, scan_high, _rest = spectrum['name'].split('.', 2)
62 print >> f, '\t'.join(str(v) for v in
63 ["S", scan_low, scan_high, spectrum['charge'], 0,
64 'honk', spectrum['mass'],
65 round(math.log(spectrum['total_ion_current']), 4),
66 0, spectrum['comparisons']])
68 best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
69 # score -> rank
70 rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
72 best_score = sp_best_matches[0]['score']
73 prev_score, prev_sequence = None, None
74 for match in sp_best_matches:
75 score = match['score']
76 if score == 0:
77 continue # null entry
78 rank = rank_map[score]
79 score_delta = (best_score - score) / best_score
81 # This is a temporary way of stripping duplicates. It only works if
82 # they are adjacent (which will only rarely be false?).
83 if score != prev_score or match['peptide_sequence'] != prev_sequence:
84 # Note: scores, being log probability, are non-positive, but we
85 # flip the sign in the SQT output
86 print >> f, '\t'.join(str(v) for v in
87 ["M", rank, rank,
88 round(match['predicted_parent_mass'], 5),
89 round(score_delta, 4), round(-score, 4), 0,
90 # 1 of 2 ions found--keep DTASelect happy
91 1, 2,
92 "-.%s.-" % match['peptide_sequence'],
93 'U'])
94 prev_score, prev_sequence = score, match['peptide_sequence']
96 print >> f, 'L\t%s\t%s' % (match['sequence_name'],
97 match['peptide_begin'])
99 def main(args=sys.argv[1:]):
100 parser = optparse.OptionParser(usage=
101 "usage: %prog [options] <result-file>",
102 description=__doc__, version=__version__)
103 pa = parser.add_option
104 pa("-d", "--output-directory", dest="output_directory",
105 help="directory where output files are written [default is '.']",
106 metavar="DIR")
107 pa("-v", "--verbose", action="store_true", dest="verbose",
108 help="be verbose")
109 pa("--dump", action="store_true", dest="dump",
110 help="dump the result file (for debugging)")
111 pa("--copyright", action="store_true", dest="copyright",
112 help="print copyright and exit")
113 (options, args) = parser.parse_args(args=args)
115 if options.copyright:
116 print __copyright__
117 sys.exit(0)
119 if len(args) != 1:
120 parser.print_help()
121 sys.exit(1)
123 with contextlib.closing(gzip.open(args[0])) as r_file:
124 r = cPickle.load(r_file)
126 if options.dump:
127 pprint(r)
128 return
130 spectrum_fns = r['spectrum files']
131 assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
133 spectra = r['spectra']
134 best_matches = r['matches']['best_matches']
135 assert len(spectra) == len(best_matches)
137 # order spectra and best_matches by (filename, spectrum name)
138 def filename_specname_order(sm):
139 return r['spectrum files'][sm[0]['file_id']], sm[0]['name']
141 spec_match = zip(spectra, best_matches)
142 spec_match.sort(key=filename_specname_order)
144 for spectrum_n, spectrum_fn in enumerate(spectrum_fns):
145 assert os.path.dirname(spectrum_fn) == ''
146 sqt_fn = os.path.splitext(spectrum_fn)[0] + '.sqt'
147 with open(sqt_fn, 'w') as sqtf:
148 print_header(sqtf, r)
149 for spectrum, best_match in spec_match:
150 if spectrum['file_id'] == spectrum_n:
151 print_spectrum(sqtf, spectrum, best_match)
154 if __name__ == '__main__':
155 main()