greylag-process: default to 1100 parts (good for our production cluster)
[greylag.git] / greylag_merge.py
blobb557f2caf1ae37737403e7741c53969237d01f9b
1 #!/usr/bin/env greylag-python
3 '''Merge a set of greylag search result files, producing a search result file
4 that summarizes the best matches found.
6 '''
8 from __future__ import with_statement
10 __copyright__ = '''
11 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 '''
28 __version__ = "0.0"
31 import contextlib
32 import cPickle
33 import gzip
34 import optparse
35 import os.path
36 from pprint import pprint
37 import sys
40 def warn(s):
41 print >> sys.stderr, 'warning:', s
42 def error(s):
43 sys.exit('error: ' + s)
46 def check_consistency(r0, r1):
47 """Warn on any inconsistencies between the two results that are about to
48 be merged."""
50 if r0['version'] != r1['version']:
51 warn("merging runs searched with different greylag versions!"
52 " (rerunning with same version strongly recommended)")
53 k0 = set(r0.keys())
54 k1 = set(r1.keys())
55 if k0 != k1:
56 warn("mismatched keys: %s" % (k0 ^ k1))
58 varying_keys = set(['matches', 'total comparisons', 'argv'])
59 for k in k0 - varying_keys:
60 if r0[k] != r1.get(k):
61 warn("search context differs: %s (%s vs %s)" % (k, r0[k], r1[k]))
64 def score_equal(s1, s2):
65 return abs(s1-s2) < 1e-6
68 def merge_match(m0, m1):
69 """Merge a particular spectrum match from m1 into m0."""
70 m0[0]['comparisons'] += m1[0]['comparisons']
72 # NB: This merge code, and score_equal above, must functionally match the
73 # merge code in cgreylag.cpp:evaluate_peptide!
75 keep = max(len(m0), len(m1))
76 assert keep >= 1
77 matches = m0[1] + m1[1]
78 matches.sort(key=lambda x: x['score'], reverse=True)
80 merged_matches = [matches.pop()]
81 while len(merged_matches) < keep and matches:
82 c0, c1 = merged_matches[-1], matches.pop()
83 if (score_equal(c0['score'], c1['score'])
84 and c0['peptide_sequence'] == c1['peptide_sequence']
85 and c0['mass_trace'] == c1['mass_trace']):
86 continue
87 merged_matches.append(c1)
90 def merge_matches(m0, m1):
91 """Merge the match information in m1 into m0."""
92 for m1k, m1v in m1.iteritems():
93 if m1k not in m0:
94 m0[m1k] = m1v
95 else:
96 merge_match(m0[m1k], m1v)
99 def main(args=sys.argv[1:]):
100 parser = optparse.OptionParser(usage=
101 "usage: %prog [options] [<result-file>...]"
102 " <output-result-file>",
103 description=__doc__, version=__version__)
104 pa = parser.add_option
105 pa("-f", "--files-on-stdin", action="store_true", dest="files_on_stdin",
106 help="read filenames to be merged from stdin, one per line, instead of"
107 " from the command-line.")
108 pa("-v", "--verbose", action="store_true", dest="verbose",
109 help="be verbose")
110 pa("--copyright", action="store_true", dest="copyright",
111 help="print copyright and exit")
112 (options, args) = parser.parse_args(args=args)
114 if options.copyright:
115 print __copyright__
116 sys.exit(0)
118 if options.files_on_stdin:
119 if len(args) != 1:
120 parser.print_help()
121 sys.exit(1)
122 filenames = [ l.strip() for l in sys.stdin ]
123 if len(filenames) < 1:
124 error("--files-on-stdin given, but stdin was empty")
125 result_fn_0 = filenames[0]
126 result_fn_1_N = filenames[1:]
127 else:
128 if len(args) < 2:
129 parser.print_help()
130 sys.exit(1)
131 result_fn_0 = args[0]
132 result_fn_1_N = args[1:-1]
134 output_fn = args[-1]
136 # spectrum name -> match list/spectrum info
137 matches = {}
138 with contextlib.closing(gzip.open(result_fn_0)) as r_file:
139 r0 = cPickle.load(r_file)
140 matches = r0['matches']
141 total_comparisons = r0['total comparisons']
142 if options.verbose:
143 print >> sys.stderr, "loaded", result_fn_0
145 for additional_result_fn in result_fn_1_N:
146 with contextlib.closing(gzip.open(additional_result_fn)) as r1_file:
147 r1 = cPickle.load(r1_file)
148 check_consistency(r0, r1)
149 merge_matches(matches, r1['matches'])
150 total_comparisons += r1['total comparisons']
151 if options.verbose:
152 print >> sys.stderr, "merged", additional_result_fn
154 r0['matches'] = matches
155 r0['total comparisons'] = total_comparisons
156 with contextlib.closing(gzip.open(output_fn, 'w')) as output_file:
157 cPickle.dump(r0, output_file, cPickle.HIGHEST_PROTOCOL)
158 if options.verbose:
159 print >> sys.stderr, "dumped", output_fn
162 if __name__ == '__main__':
163 main()