Merge branch 'slowbisect'
[greylag.git] / greylag_merge.py
blob1dc62261140b1053891ac7ffbf0e6f393dfd3665
1 #!/usr/bin/env python
3 '''Merge a set of greylag search result files, producing a search result file
4 that summarizes the best matches found.
6 '''
8 from __future__ import with_statement
10 __copyright__ = '''
11 greylag, a collection of programs for MS/MS protein analysis
12 Copyright (C) 2006-2008 Stowers Institute for Medical Research
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
27 Contact: Mike Coleman
28 Stowers Institute for Medical Research
29 1000 East 50th Street
30 Kansas City, Missouri 64110
31 USA
32 '''
35 import contextlib
36 import cPickle as pickle
37 import optparse
38 import os.path
39 from pprint import pprint
40 import sys
42 from greylag import VERSION
45 # gc kills our performance, so disable it. gc only matters for cycles, which
46 # we (hope we) don't create. See the gc module docs.
47 import gc; gc.disable()
50 def warn(s):
51 print >> sys.stderr, 'warning:', s
52 def error(s):
53 sys.exit('error: ' + s)
56 def check_consistency(r0, r1):
57 """Warn on any inconsistencies between the two results that are about to
58 be merged."""
60 if r0['version'] != r1['version']:
61 warn("merging runs searched with different greylag versions!"
62 " (rerunning with same version strongly recommended)")
63 k0 = set(r0.keys())
64 k1 = set(r1.keys())
65 if k0 != k1:
66 warn("mismatched keys: %s" % (k0 ^ k1))
68 # FIX: this restriction goes away later
69 if r0['spectrum files'] != r1['spectrum files']:
70 error('merging across differing sets of ms2 files not yet implemented'
71 ' (%s vs %s)' % (r0['spectrum files'], r1['spectrum files']))
73 varying_keys = set(['matches', 'total comparisons', 'argv'])
74 for k in k0 - varying_keys:
75 if r0[k] != r1.get(k):
76 warn("search context differs: %s (%s vs %s)" % (k, r0[k], r1[k]))
79 def score_equal(s1, s2):
80 return abs(s1-s2) < 1e-6
83 def merge_match(m0, m1, keep):
84 """Merge a particular spectrum match from m1 into m0."""
85 m0[0]['comparisons'] += m1[0]['comparisons']
87 # NB: This merge code, and score_equal above, must do the same merge as
88 # the merge code in cgreylag.cpp:evaluate_peptide!
90 assert keep >= 1
91 matches = m0[1] + m1[1]
92 matches.sort(key=lambda x: x['score'], reverse=True)
94 merged_matches = [matches.pop()]
95 while len(merged_matches) < keep and matches:
96 c0, c1 = merged_matches[-1], matches.pop()
97 if (score_equal(c0['score'], c1['score'])
98 and c0['peptide_sequence'] == c1['peptide_sequence']
99 and c0.get('mass_trace') == c1.get('mass_trace')):
100 continue
101 merged_matches.append(c1)
104 def merge_matches(m0, m1, keep):
105 """Merge the match information in m1 into m0."""
106 for m1k, m1v in m1.iteritems():
107 if m1k not in m0:
108 m0[m1k] = m1v
109 else:
110 merge_match(m0[m1k], m1v, keep)
113 def main(args=sys.argv[1:]):
114 parser = optparse.OptionParser(usage=
115 "usage: %prog [options] [<result-file>...]"
116 " <output-result-file>",
117 description=__doc__, version=VERSION)
118 pa = parser.add_option
119 pa("-f", "--files-on-stdin", action="store_true", dest="files_on_stdin",
120 help="read filenames to be merged from stdin, one per line, instead of"
121 " from the command-line.")
122 pa("-v", "--verbose", action="store_true", dest="verbose",
123 help="be verbose")
124 pa("--copyright", action="store_true", dest="copyright",
125 help="print copyright and exit")
126 (options, args) = parser.parse_args(args=args)
128 if options.copyright:
129 print __copyright__
130 sys.exit(0)
132 if options.files_on_stdin:
133 if len(args) != 1:
134 parser.print_help()
135 sys.exit(1)
136 filenames = [ l.strip() for l in sys.stdin ]
137 if len(filenames) < 1:
138 error("--files-on-stdin given, but stdin was empty")
139 result_fn_0 = filenames[0]
140 result_fn_1_N = filenames[1:]
141 else:
142 if len(args) < 2:
143 parser.print_help()
144 sys.exit(1)
145 result_fn_0 = args[0]
146 result_fn_1_N = args[1:-1]
148 output_fn = args[-1]
150 # spectrum name -> match list/spectrum info
151 matches = {}
152 with contextlib.closing(open(result_fn_0, 'rb')) as r_file:
153 r0 = pickle.load(r_file)
154 total_comparisons = r0['total comparisons']
155 keep = r0['parameters']['best_result_count']
156 matches = r0['matches']
157 if options.verbose:
158 print >> sys.stderr, "loaded", result_fn_0
160 for additional_result_fn in result_fn_1_N:
161 with contextlib.closing(open(additional_result_fn, 'rb')) as r1_file:
162 r1 = pickle.load(r1_file)
163 check_consistency(r0, r1)
164 total_comparisons += r1['total comparisons']
165 keep = max(keep, r1['parameters']['best_result_count'])
166 merge_matches(matches, r1['matches'], keep)
167 if options.verbose:
168 print >> sys.stderr, "merged", additional_result_fn
169 del r1 # free some memory
171 r0['matches'] = matches
172 r0['total comparisons'] = total_comparisons
173 with contextlib.closing(open(output_fn, 'wb')) as output_file:
174 pk = pickle.Pickler(output_file, pickle.HIGHEST_PROTOCOL)
175 pk.fast = 1 # stipulate no circular references
176 pk.dump(r0)
177 if options.verbose:
178 print >> sys.stderr, "dumped", output_fn
181 if __name__ == '__main__':
182 main()