3 '''Merge a set of greylag search result files, producing a search result file
4 that summarizes the best matches found.
8 from __future__
import with_statement
11 greylag, a collection of programs for MS/MS protein analysis
12 Copyright (C) 2006-2008 Stowers Institute for Medical Research
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
28 Stowers Institute for Medical Research
30 Kansas City, Missouri 64110
36 import cPickle
as pickle
39 from pprint
import pprint
42 from greylag
import VERSION
45 # gc kills our performance, so disable it. gc only matters for cycles, which
46 # we (hope we) don't create. See the gc module docs.
47 import gc
; gc
.disable()
51 print >> sys
.stderr
, 'warning:', s
53 sys
.exit('error: ' + s
)
56 def check_consistency(r0
, r1
):
57 """Warn on any inconsistencies between the two results that are about to
60 if r0
['version'] != r1
['version']:
61 warn("merging runs searched with different greylag versions!"
62 " (rerunning with same version strongly recommended)")
66 warn("mismatched keys: %s" % (k0 ^ k1
))
68 # FIX: this restriction goes away later
69 if r0
['spectrum files'] != r1
['spectrum files']:
70 error('merging across differing sets of ms2 files not yet implemented'
71 ' (%s vs %s)' % (r0
['spectrum files'], r1
['spectrum files']))
73 varying_keys
= set(['matches', 'total comparisons', 'argv'])
74 for k
in k0
- varying_keys
:
75 if r0
[k
] != r1
.get(k
):
76 warn("search context differs: %s (%s vs %s)" % (k
, r0
[k
], r1
[k
]))
79 def score_equal(s1
, s2
):
80 return abs(s1
-s2
) < 1e-6
83 def merge_match(m0
, m1
, keep
):
84 """Merge a particular spectrum match from m1 into m0."""
85 m0
[0]['comparisons'] += m1
[0]['comparisons']
87 # NB: This merge code, and score_equal above, must do the same merge as
88 # the merge code in cgreylag.cpp:evaluate_peptide!
91 matches
= m0
[1] + m1
[1]
92 matches
.sort(key
=lambda x
: x
['score'], reverse
=True)
94 merged_matches
= [matches
.pop()]
95 while len(merged_matches
) < keep
and matches
:
96 c0
, c1
= merged_matches
[-1], matches
.pop()
97 if (score_equal(c0
['score'], c1
['score'])
98 and c0
['peptide_sequence'] == c1
['peptide_sequence']
99 and c0
.get('mass_trace') == c1
.get('mass_trace')):
101 merged_matches
.append(c1
)
104 def merge_matches(m0
, m1
, keep
):
105 """Merge the match information in m1 into m0."""
106 for m1k
, m1v
in m1
.iteritems():
110 merge_match(m0
[m1k
], m1v
, keep
)
113 def main(args
=sys
.argv
[1:]):
114 parser
= optparse
.OptionParser(usage
=
115 "usage: %prog [options] [<result-file>...]"
116 " <output-result-file>",
117 description
=__doc__
, version
=VERSION
)
118 pa
= parser
.add_option
119 pa("-f", "--files-on-stdin", action
="store_true", dest
="files_on_stdin",
120 help="read filenames to be merged from stdin, one per line, instead of"
121 " from the command-line.")
122 pa("-v", "--verbose", action
="store_true", dest
="verbose",
124 pa("--copyright", action
="store_true", dest
="copyright",
125 help="print copyright and exit")
126 (options
, args
) = parser
.parse_args(args
=args
)
128 if options
.copyright
:
132 if options
.files_on_stdin
:
136 filenames
= [ l
.strip() for l
in sys
.stdin
]
137 if len(filenames
) < 1:
138 error("--files-on-stdin given, but stdin was empty")
139 result_fn_0
= filenames
[0]
140 result_fn_1_N
= filenames
[1:]
145 result_fn_0
= args
[0]
146 result_fn_1_N
= args
[1:-1]
150 # spectrum name -> match list/spectrum info
152 with contextlib
.closing(open(result_fn_0
, 'rb')) as r_file
:
153 r0
= pickle
.load(r_file
)
154 total_comparisons
= r0
['total comparisons']
155 keep
= r0
['parameters']['best_result_count']
156 matches
= r0
['matches']
158 print >> sys
.stderr
, "loaded", result_fn_0
160 for additional_result_fn
in result_fn_1_N
:
161 with contextlib
.closing(open(additional_result_fn
, 'rb')) as r1_file
:
162 r1
= pickle
.load(r1_file
)
163 check_consistency(r0
, r1
)
164 total_comparisons
+= r1
['total comparisons']
165 keep
= max(keep
, r1
['parameters']['best_result_count'])
166 merge_matches(matches
, r1
['matches'], keep
)
168 print >> sys
.stderr
, "merged", additional_result_fn
169 del r1
# free some memory
171 r0
['matches'] = matches
172 r0
['total comparisons'] = total_comparisons
173 with contextlib
.closing(open(output_fn
, 'wb')) as output_file
:
174 pk
= pickle
.Pickler(output_file
, pickle
.HIGHEST_PROTOCOL
)
175 pk
.fast
= 1 # stipulate no circular references
178 print >> sys
.stderr
, "dumped", output_fn
181 if __name__
== '__main__':