validate: choose the better charge for multiple charge spectra
[greylag.git] / greylag.py
blobf729f002b01ee6931f9f71f75538f31b6e20bbd1
1 # common greylag functions and constants
3 ## greylag, a collection of programs for MS/MS protein analysis
4 ## Copyright (C) 2006-2008 Stowers Institute for Medical Research
5 ##
6 ## This program is free software: you can redistribute it and/or modify
7 ## it under the terms of the GNU General Public License as published by
8 ## the Free Software Foundation, either version 3 of the License, or
9 ## (at your option) any later version.
11 ## This program is distributed in the hope that it will be useful,
12 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ## GNU General Public License for more details.
16 ## You should have received a copy of the GNU General Public License
17 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
19 ## Contact: Mike Coleman
20 ## Stowers Institute for Medical Research
21 ## 1000 East 50th Street
22 ## Kansas City, Missouri 64110
23 ## USA
25 from __future__ import with_statement
28 import logging; from logging import debug, info, warning
29 import math
30 import re
31 import sys
33 import cgreylag
36 VERSION = "0.1.1"
39 # handle to the singleton parameter object shared with the C++ module
40 CP = cgreylag.cvar.parameters_the
43 def set_logging(options):
44 log_level = logging.WARNING
45 if options.quiet:
46 log_level = logging.ERROR
47 if options.verbose:
48 log_level = logging.INFO
49 if options.debug:
50 log_level = logging.DEBUG
51 logfile = None
52 if options.logfile:
53 logfile = options.logfile
54 logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
55 format=('%(asctime)s [%(process)d]'
56 ' %(levelname)s: %(message)s'),
57 filename=logfile)
60 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
61 PROTON_MASS = 1.007276
62 ELECTRON_MASS = 0.000549 # ?
64 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
65 MONOISOTOPIC_ATOMIC_MASS = {
66 'H' : 1.00782503214,
67 'C' : 12.00000000,
68 'N' : 14.00307400529,
69 'O' : 15.994914622115,
70 'P' : 30.9737615120,
71 'S' : 31.9720706912,
74 # most prevalent only (1 in 1000)
75 ISOTOPIC_ATOMIC_MASS = { # prevalence (in %)
76 'C13' : 13.003354837810, # 1.078
77 'N15' : 15.00010889849, # 0.3687
78 'O18' : 17.99916049, # 0.20514
79 'S33' : 32.9714585012, # 0.762
80 'S34' : 33.9678668311, # 4.2928
83 AVERAGE_ATOMIC_MASS = {
84 'H' : 1.007947,
85 'C' : 12.01078,
86 'N' : 14.00672,
87 'O' : 15.99943,
88 'P' : 30.9737612,
89 'S' : 32.0655,
92 # The xtandem average residue masses are about 0.002 amu higher than those
93 # calculated directly from the above average atomic masses. None of the
94 # chemists consulted knew of any reason why, aside from lack of precision in
95 # the average atomic mass estimates. This shouldn't matter very much, as
96 # fragmentation calculations should all be monoisotopic, and we can always
97 # widen the parent tolerance window a bit.
100 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
101 """Return the mass of formula, using the given mass regime (monoisotopic
102 by default).
104 >>> formula_mass('H2O', { 'H':1, 'O':16 })
106 >>> # monoisotopic mass of glycine
107 >>> str(round(formula_mass('C2H3ON'), 4))
108 '57.0215'
111 parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
112 # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
113 return sum(atomic_mass[parts[i]] * int(parts[i+1])
114 for i in range(0, len(parts), 2))
116 # FIX: selenocysteine (U), etc
117 # residue -> formula
118 RESIDUE_FORMULA = {
119 'A' : "C3H5ON",
120 'C' : "C3H5ONS",
121 'D' : "C4H5O3N",
122 'E' : "C5H7O3N",
123 'F' : "C9H9ON",
124 'G' : "C2H3ON",
125 'H' : "C6H7ON3",
126 'I' : "C6H11ON",
127 'K' : "C6H12ON2",
128 'L' : "C6H11ON",
129 'M' : "C5H9ONS",
130 'N' : "C4H6O2N2",
131 'P' : "C5H7ON",
132 'Q' : "C5H8O2N2",
133 'R' : "C6H12ON4",
134 'S' : "C3H5O2N",
135 'T' : "C4H7O2N",
136 'V' : "C5H9ON",
137 'W' : "C11H10ON2",
138 'Y' : "C9H9O2N",
141 RESIDUES = sorted(RESIDUE_FORMULA.keys())
142 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
145 # [0][1] -> 'H' -> fragment mass of H for regime 0
146 MASS_REGIME_ATOMIC_MASSES = []
148 def dump_mass_regime_atomic_masses():
149 return MASS_REGIME_ATOMIC_MASSES
152 def mass_regime_atomic_masses(spec):
153 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
154 names to masses.
156 name, isotopes = spec
157 assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
158 if name == 'MONO':
159 r = MONOISOTOPIC_ATOMIC_MASS.copy()
160 else:
161 r = AVERAGE_ATOMIC_MASS.copy()
162 if isotopes:
163 iname, prevalence = isotopes[0]
164 assert iname == 'N15' and 0 <= prevalence <= 1
165 # this is a simplification, but additional accuracy pointless?
166 if name == 'MONO':
167 r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
168 else:
169 r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
170 return r
173 def initialize_spectrum_parameters(options, GLP, mass_regimes, fixed_mod_map):
174 """Initialize parameters known to the spectrum module.
175 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
178 # This function can be called multiple times to reinitialize for a new
179 # search, though probably everything else (previous matches) ought to be
180 # "forgotten" at that point, too.
182 # This is the size of vectors that are indexed by residues (A-Z) or
183 # special characters ('[]').
184 RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
186 # These are currently monoisotopic. (deuterium pointless?)
187 CP.proton_mass = PROTON_MASS
188 CP.hydrogen_mass = formula_mass("H")
190 regime_manifest = []
192 global MASS_REGIME_ATOMIC_MASSES
193 # clear previous state
194 MASS_REGIME_ATOMIC_MASSES = []
195 CP.parent_mass_regime.clear()
196 CP.fragment_mass_regime.clear()
198 for rn, regime_pair in enumerate(mass_regimes):
199 assert len(regime_pair) == 2 # parent and fragment
200 info('mass regime: %s', regime_pair)
201 MASS_REGIME_ATOMIC_MASSES.append([])
202 for n, regime in enumerate(regime_pair):
203 atmass = mass_regime_atomic_masses(regime)
204 MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
205 creg = cgreylag.mass_regime_parameters()
207 creg.hydroxyl_mass = formula_mass("OH", atmass)
208 creg.water_mass = formula_mass("H2O", atmass)
209 creg.ammonia_mass = formula_mass("NH3", atmass)
211 creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
213 for r in RESIDUES_W_BRACKETS:
214 m = 0
215 if r in RESIDUES:
216 m = (formula_mass(RESIDUE_FORMULA[r], atmass)
217 + GLP["mass_regime_debug_delta"])
218 if n == 1:
219 regime_manifest.append((rn, r, m))
220 rmod = fixed_mod_map.get(r)
221 if rmod:
222 if isinstance(rmod[1], str):
223 if rmod[2]:
224 m += rmod[0] * formula_mass(rmod[1])
225 else:
226 m += rmod[0] * formula_mass(rmod[1], atmass)
227 else:
228 m += rmod[0] * rmod[1]
229 creg.fixed_residue_mass[ord(r)] = m
230 # assuming these are monoisotopic (not regime)
231 creg.fixed_residue_mass[ord('[')] += formula_mass("H")
232 creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
233 if not n:
234 CP.parent_mass_regime.append(creg)
235 else:
236 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
237 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
238 CP.fragment_mass_regime.append(creg)
239 for r in RESIDUES_W_BRACKETS:
240 info('fixed mass %s: %s', r,
241 [ ("%.6f" % CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
242 "%.6f" % CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
243 for rn in range(len(mass_regimes)) ])
244 for r in RESIDUES:
245 for rn in range(len(mass_regimes)):
246 # check for physically impossible/meaningless masses
247 if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
248 raise ValueError('bogus parent mass specification for %s' % r)
249 if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
250 raise ValueError('bogus fragment mass specification for %s' % r)
252 CP.parent_mass_tolerance_1 = GLP["parent_mz_tolerance"]
253 CP.parent_mass_tolerance_max = (GLP["parent_mz_tolerance"]
254 * GLP["charge_limit"])
256 CP.fragment_mass_tolerance = GLP["fragment_mass_tolerance"]
257 CP.intensity_class_count = GLP["intensity_class_count"]
259 CP.minimum_peptide_length = GLP["min_peptide_length"]
261 # CP.ln_factorial[n] == ln(n!)
262 CP.ln_factorial.resize(int(GLP["max_parent_spectrum_mass"]
263 / GLP["fragment_mass_tolerance"] + 100), 0.0)
264 for n in range(2, len(CP.ln_factorial)):
265 CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
267 # FIX or eliminate
268 #CP.estimate_only = bool(options.estimate_only)
269 #CP.show_progress = bool(options.show_progress)
270 CP.estimate_only = False
271 CP.show_progress = False
273 return regime_manifest
276 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
277 if not mod_tree:
278 if 0 < len(conjuncts) <= limit:
279 yield conjuncts
280 return
281 first, rest = mod_tree[0], mod_tree[1:]
282 if isinstance(first, list):
283 for x in enumerate_disjunction(first, limit):
284 for y in enumerate_conjunction(rest, limit, conjuncts + x):
285 yield y
286 else:
287 for y in enumerate_conjunction(rest, limit, conjuncts):
288 yield y
289 for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
290 yield y
292 def enumerate_disjunction(mod_tree, limit=sys.maxint):
293 """Generates the conjuncts for mod_tree that are no longer than limit.
295 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
296 [[], ['a'], ['b'], ['c']]
297 >>> list(enumerate_disjunction([[1,2,3]]))
298 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
299 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
300 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
301 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
302 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
303 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
304 [[]]
307 assert isinstance(mod_tree, list)
308 yield []
309 for b in mod_tree:
310 for s in enumerate_conjunction(b, limit):
311 yield s
313 def get_mod_conjunct_triples(mod_tree, limit, mass_regimes):
314 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
315 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
316 ordered by increasing number of conjuncts, and with duplicates within and
317 across removed. A mass table is also appended to each conjunct.
319 # FIX: is there a more elegant way or place for all this?
320 def enmass(t):
321 def rmass(regime_index, par_frag, sign, delta, is_mono):
322 global MASS_REGIME_ATOMIC_MASSES
323 if isinstance(delta, str):
324 if is_mono:
325 regime_index = 0
326 return (sign * formula_mass(delta,
327 MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
328 else:
329 return sign * delta
331 sign, delta, is_mono = t[:3]
332 return t + (tuple((rmass(r, 0, sign, delta, is_mono),
333 rmass(r, 1, sign, delta, is_mono))
334 for r in range(len(mass_regimes))),)
336 def triple(c):
337 Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
338 Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
339 assert len(Ns) <= 1 and len(Cs) <= 1
340 rest = [ enmass(x) for x in c if x[3] not in '[]' ]
341 rest = tuple(sorted(list(frozenset(rest))))
342 return (Ns, Cs, rest)
344 return sorted(list(frozenset(triple(conjunct)
345 for conjunct
346 in enumerate_disjunction(mod_tree, limit))),
347 key=lambda x: (sum(len(y) for y in x), x))
350 def cleavage_motif_re(motif):
351 """Return (regexp, pos), where regexp is a regular expression that will
352 match a cleavage motif, and pos is the position of the cleavage with
353 respect to the match (or None). (The RE actually matches one character,
354 the rest matching as lookahead, so that re.finditer will find all
355 overlapping matches.)
357 >>> cleavage_motif_re('[KR]|{P}')
358 ('[KR](?=[^P])', 1)
359 >>> cleavage_motif_re('[X]|[X]')
360 ('.(?=.)', 1)
363 cleavage_pos = None
364 re_parts = []
365 motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
366 parts = [ p[0] for p in motif_re.findall(motif) ]
367 if ''.join(parts) != motif:
368 chase_error('invalid cleavage motif pattern')
369 i = 0
370 for part in parts:
371 if part == '|':
372 if cleavage_pos != None:
373 chase_error("invalid cleavage motif pattern" " (multiple '|'s)")
374 cleavage_pos = i
375 continue
376 if part == '[X]':
377 re_parts.append('.')
378 elif part[0] == '[':
379 re_parts.append('[%s]' % part[1:-1])
380 elif part[0] == '{':
381 re_parts.append('[^%s]' % part[1:-1])
382 else:
383 assert False, "unknown cleavage motif syntax"
384 i += 1
385 if len(re_parts) == 0:
386 re_pattern = '.'
387 elif len(re_parts) == 1:
388 re_pattern = re_parts[0]
389 else:
390 re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
391 return (re_pattern, cleavage_pos)
394 def file_sha1(filename):
395 """Return the (hex) SHA1 digest of the given file."""
396 try:
397 import hashlib
398 except:
399 return "no checksum--libs missing"
400 h = hashlib.sha1()
401 with open(filename) as f:
402 h.update(f.read())
403 return h.hexdigest()
406 def read_fasta_files(filenames):
407 """Yield (locusname, defline, sequence, filename) tuples as read from
408 FASTA files (uppercasing sequence)."""
410 loci_seen = set()
412 for filename in filenames:
413 locusname, defline = None, None
414 seqs = []
415 with open(filename) as f:
416 for line in f:
417 line = line.strip()
418 if line[:1] == '>':
419 if defline != None:
420 yield (locusname, defline, ''.join(seqs), filename)
421 elif seqs:
422 chase_error("bad format: line precedes initial defline"
423 " in '%s'", filename)
424 defline = line[1:]
425 locusname_rest = defline.split(None, 1)
426 if not locusname_rest:
427 chase_error("empty locus name not allowed in '%s'",
428 filename)
429 locusname = locusname_rest[0]
430 if locusname in loci_seen:
431 chase_error("locus name '%s' is not unique in the"
432 " search database(s) in '%s'", locusname,
433 filename)
434 loci_seen.add(locusname)
435 seqs = []
436 else:
437 seqs.append(line.upper())
438 if defline:
439 yield (locusname, defline, ''.join(seqs), filename)