validate: add --mass-error-histogram-bin-width option
[greylag.git] / greylag.py
blobe966cde489b24a8041b8fb818952d1b1dce66fd5
1 # common greylag functions and constants
3 ## greylag, a collection of programs for MS/MS protein analysis
4 ## Copyright (C) 2006-2008 Stowers Institute for Medical Research
5 ##
6 ## This program is free software: you can redistribute it and/or modify
7 ## it under the terms of the GNU General Public License as published by
8 ## the Free Software Foundation, either version 3 of the License, or
9 ## (at your option) any later version.
11 ## This program is distributed in the hope that it will be useful,
12 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ## GNU General Public License for more details.
16 ## You should have received a copy of the GNU General Public License
17 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
19 ## Contact: Mike Coleman
20 ## Stowers Institute for Medical Research
21 ## 1000 East 50th Street
22 ## Kansas City, Missouri 64110
23 ## USA
25 from __future__ import with_statement
28 import logging; from logging import debug, info, warning
29 import math
30 import re
31 import sys
33 import cgreylag
36 VERSION = "0.1.2"
39 # prefix added to locus id's for decoy loci
40 DEFAULT_DECOY_PREFIX = "SHUFFLED_"
43 # handle to the singleton parameter object shared with the C++ module
44 CP = cgreylag.cvar.parameters_the
47 def set_logging(options):
48 log_level = logging.WARNING
49 if options.quiet:
50 log_level = logging.ERROR
51 if options.verbose:
52 log_level = logging.INFO
53 if options.debug:
54 log_level = logging.DEBUG
55 logfile = None
56 if options.logfile:
57 logfile = options.logfile
58 logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
59 format=('%(asctime)s [%(process)d]'
60 ' %(levelname)s: %(message)s'),
61 filename=logfile)
64 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
65 PROTON_MASS = 1.007276
66 ELECTRON_MASS = 0.000549 # ?
68 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
69 # (It appears that NIST has started to copyright and require licenses for use
70 # of some of its data, but these values appear not to fall under copyright,
71 # at least for now.)
72 MONOISOTOPIC_ATOMIC_MASS = {
73 'H' : 1.00782503214,
74 'C' : 12.00000000,
75 'N' : 14.00307400529,
76 'O' : 15.994914622115,
77 'P' : 30.9737615120,
78 'S' : 31.9720706912,
81 # most prevalent only (1 in 1000)
82 ISOTOPIC_ATOMIC_MASS = { # prevalence (in %)
83 'C13' : 13.003354837810, # 1.078
84 'N15' : 15.00010889849, # 0.3687
85 'O18' : 17.99916049, # 0.20514
86 'S33' : 32.9714585012, # 0.762
87 'S34' : 33.9678668311, # 4.2928
90 AVERAGE_ATOMIC_MASS = {
91 'H' : 1.007947,
92 'C' : 12.01078,
93 'N' : 14.00672,
94 'O' : 15.99943,
95 'P' : 30.9737612,
96 'S' : 32.0655,
99 # The xtandem average residue masses are about 0.002 amu higher than those
100 # calculated directly from the above average atomic masses. None of the
101 # chemists consulted knew of any reason why, aside from lack of precision in
102 # the average atomic mass estimates. This shouldn't matter very much, as
103 # fragmentation calculations should all be monoisotopic, and we can always
104 # widen the parent tolerance window a bit.
107 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
108 """Return the mass of formula, using the given mass regime (monoisotopic
109 by default).
111 >>> formula_mass('H2O', { 'H':1, 'O':16 })
113 >>> # monoisotopic mass of glycine
114 >>> str(round(formula_mass('C2H3ON'), 4))
115 '57.0215'
118 parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
119 # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
120 return sum(atomic_mass[parts[i]] * int(parts[i+1])
121 for i in range(0, len(parts), 2))
123 # FIX: selenocysteine (U), etc
124 # residue -> formula
125 RESIDUE_FORMULA = {
126 'A' : "C3H5ON",
127 'C' : "C3H5ONS",
128 'D' : "C4H5O3N",
129 'E' : "C5H7O3N",
130 'F' : "C9H9ON",
131 'G' : "C2H3ON",
132 'H' : "C6H7ON3",
133 'I' : "C6H11ON",
134 'K' : "C6H12ON2",
135 'L' : "C6H11ON",
136 'M' : "C5H9ONS",
137 'N' : "C4H6O2N2",
138 'P' : "C5H7ON",
139 'Q' : "C5H8O2N2",
140 'R' : "C6H12ON4",
141 'S' : "C3H5O2N",
142 'T' : "C4H7O2N",
143 'V' : "C5H9ON",
144 'W' : "C11H10ON2",
145 'Y' : "C9H9O2N",
148 RESIDUES = sorted(RESIDUE_FORMULA.keys())
149 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
152 # [0][1] -> 'H' -> fragment mass of H for regime 0
153 MASS_REGIME_ATOMIC_MASSES = []
155 def dump_mass_regime_atomic_masses():
156 return MASS_REGIME_ATOMIC_MASSES
159 def mass_regime_atomic_masses(spec):
160 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
161 names to masses.
163 name, isotopes = spec
164 assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
165 if name == 'MONO':
166 r = MONOISOTOPIC_ATOMIC_MASS.copy()
167 else:
168 r = AVERAGE_ATOMIC_MASS.copy()
169 if isotopes:
170 iname, prevalence = isotopes[0]
171 assert iname == 'N15' and 0 <= prevalence <= 1
172 # this is a simplification, but additional accuracy pointless?
173 if name == 'MONO':
174 r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
175 else:
176 r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
177 return r
180 def initialize_spectrum_parameters(options, GLP, mass_regimes, fixed_mod_map):
181 """Initialize parameters known to the spectrum module.
182 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
185 # This function can be called multiple times to reinitialize for a new
186 # search, though probably everything else (previous matches) ought to be
187 # "forgotten" at that point, too.
189 # This is the size of vectors that are indexed by residues (A-Z) or
190 # special characters ('[]').
191 RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
193 # These are currently monoisotopic. (deuterium pointless?)
194 CP.proton_mass = PROTON_MASS
195 CP.hydrogen_mass = formula_mass("H")
197 regime_manifest = []
199 global MASS_REGIME_ATOMIC_MASSES
200 # clear previous state
201 MASS_REGIME_ATOMIC_MASSES = []
202 CP.parent_mass_regime.clear()
203 CP.fragment_mass_regime.clear()
205 for rn, regime_pair in enumerate(mass_regimes):
206 assert len(regime_pair) == 2 # parent and fragment
207 info('mass regime: %s', regime_pair)
208 MASS_REGIME_ATOMIC_MASSES.append([])
209 for n, regime in enumerate(regime_pair):
210 atmass = mass_regime_atomic_masses(regime)
211 MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
212 creg = cgreylag.mass_regime_parameters()
214 creg.hydroxyl_mass = formula_mass("OH", atmass)
215 creg.water_mass = formula_mass("H2O", atmass)
216 creg.ammonia_mass = formula_mass("NH3", atmass)
218 creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
220 for r in RESIDUES_W_BRACKETS:
221 m = 0
222 if r in RESIDUES:
223 m = (formula_mass(RESIDUE_FORMULA[r], atmass)
224 + GLP["mass_regime_debug_delta"])
225 if n == 1:
226 regime_manifest.append((rn, r, m))
227 rmod = fixed_mod_map.get(r)
228 if rmod:
229 if isinstance(rmod[1], str):
230 if rmod[2]:
231 m += rmod[0] * formula_mass(rmod[1])
232 else:
233 m += rmod[0] * formula_mass(rmod[1], atmass)
234 else:
235 m += rmod[0] * rmod[1]
236 creg.fixed_residue_mass[ord(r)] = m
237 # assuming these are monoisotopic (not regime)
238 creg.fixed_residue_mass[ord('[')] += formula_mass("H")
239 creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
240 if not n:
241 CP.parent_mass_regime.append(creg)
242 else:
243 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
244 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
245 CP.fragment_mass_regime.append(creg)
246 for r in RESIDUES_W_BRACKETS:
247 info('fixed mass %s: %s', r,
248 [ ("%.6f" % CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
249 "%.6f" % CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
250 for rn in range(len(mass_regimes)) ])
251 for r in RESIDUES:
252 for rn in range(len(mass_regimes)):
253 # check for physically impossible/meaningless masses
254 if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
255 raise ValueError('bogus parent mass specification for %s' % r)
256 if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
257 raise ValueError('bogus fragment mass specification for %s' % r)
259 CP.parent_mass_tolerance_1 = GLP["parent_mz_tolerance"]
260 CP.parent_mass_tolerance_max = (GLP["parent_mz_tolerance"]
261 * GLP["charge_limit"])
263 CP.fragment_mass_tolerance = GLP["fragment_mass_tolerance"]
264 CP.intensity_class_count = GLP["intensity_class_count"]
266 CP.minimum_peptide_length = GLP["min_peptide_length"]
268 # CP.ln_factorial[n] == ln(n!)
269 CP.ln_factorial.resize(int(GLP["max_parent_spectrum_mass"]
270 / GLP["fragment_mass_tolerance"] + 100), 0.0)
271 for n in range(2, len(CP.ln_factorial)):
272 CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
274 # FIX or eliminate
275 #CP.estimate_only = bool(options.estimate_only)
276 #CP.show_progress = bool(options.show_progress)
277 CP.estimate_only = False
278 CP.show_progress = False
280 return regime_manifest
283 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
284 if not mod_tree:
285 if 0 < len(conjuncts) <= limit:
286 yield conjuncts
287 return
288 first, rest = mod_tree[0], mod_tree[1:]
289 if isinstance(first, list):
290 for x in enumerate_disjunction(first, limit):
291 for y in enumerate_conjunction(rest, limit, conjuncts + x):
292 yield y
293 else:
294 for y in enumerate_conjunction(rest, limit, conjuncts):
295 yield y
296 for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
297 yield y
299 def enumerate_disjunction(mod_tree, limit=sys.maxint):
300 """Generates the conjuncts for mod_tree that are no longer than limit.
302 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
303 [[], ['a'], ['b'], ['c']]
304 >>> list(enumerate_disjunction([[1,2,3]]))
305 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
306 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
307 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
308 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
309 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
310 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
311 [[]]
314 assert isinstance(mod_tree, list)
315 yield []
316 for b in mod_tree:
317 for s in enumerate_conjunction(b, limit):
318 yield s
320 def get_mod_conjunct_triples(mod_tree, limit, mass_regimes):
321 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
322 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
323 ordered by increasing number of conjuncts, and with duplicates within and
324 across removed. A mass table is also appended to each conjunct.
326 # FIX: is there a more elegant way or place for all this?
327 def enmass(t):
328 def rmass(regime_index, par_frag, sign, delta, is_mono):
329 global MASS_REGIME_ATOMIC_MASSES
330 if isinstance(delta, str):
331 if is_mono:
332 regime_index = 0
333 return (sign * formula_mass(delta,
334 MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
335 else:
336 return sign * delta
338 sign, delta, is_mono = t[:3]
339 return t + (tuple((rmass(r, 0, sign, delta, is_mono),
340 rmass(r, 1, sign, delta, is_mono))
341 for r in range(len(mass_regimes))),)
343 def triple(c):
344 Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
345 Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
346 assert len(Ns) <= 1 and len(Cs) <= 1
347 rest = [ enmass(x) for x in c if x[3] not in '[]' ]
348 rest = tuple(sorted(list(frozenset(rest))))
349 return (Ns, Cs, rest)
351 return sorted(list(frozenset(triple(conjunct)
352 for conjunct
353 in enumerate_disjunction(mod_tree, limit))),
354 key=lambda x: (sum(len(y) for y in x), x))
357 def cleavage_motif_re(motif):
358 """Return (regexp, pos), where regexp is a regular expression that will
359 match a cleavage motif, and pos is the position of the cleavage with
360 respect to the match (or None). (The RE actually matches one character,
361 the rest matching as lookahead, so that re.finditer will find all
362 overlapping matches.)
364 >>> cleavage_motif_re('[KR]|{P}')
365 ('[KR](?=[^P])', 1)
366 >>> cleavage_motif_re('[X]|[X]')
367 ('.(?=.)', 1)
370 cleavage_pos = None
371 re_parts = []
372 motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
373 parts = [ p[0] for p in motif_re.findall(motif) ]
374 if ''.join(parts) != motif:
375 chase_error('invalid cleavage motif pattern')
376 i = 0
377 for part in parts:
378 if part == '|':
379 if cleavage_pos != None:
380 chase_error("invalid cleavage motif pattern" " (multiple '|'s)")
381 cleavage_pos = i
382 continue
383 if part == '[X]':
384 re_parts.append('.')
385 elif part[0] == '[':
386 re_parts.append('[%s]' % part[1:-1])
387 elif part[0] == '{':
388 re_parts.append('[^%s]' % part[1:-1])
389 else:
390 assert False, "unknown cleavage motif syntax"
391 i += 1
392 if len(re_parts) == 0:
393 re_pattern = '.'
394 elif len(re_parts) == 1:
395 re_pattern = re_parts[0]
396 else:
397 re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
398 return (re_pattern, cleavage_pos)
401 def file_sha1(filename):
402 """Return the (hex) SHA1 digest of the given file."""
403 try:
404 import hashlib
405 except:
406 return "no checksum--libs missing"
407 h = hashlib.sha1()
408 with open(filename) as f:
409 h.update(f.read())
410 return h.hexdigest()
413 def read_fasta_file(f):
414 """Yield (locusname, defline, sequence) tuples as read from the given
415 FASTA file (uppercasing sequence)."""
417 locusname, defline = None, None
418 seqs = []
420 for line in f:
421 line = line.strip()
422 if line[:1] == '>':
423 if defline != None:
424 yield (locusname, defline, ''.join(seqs))
425 defline = line[1:]
426 locusname_rest = defline.split(None, 1)
427 locusname = locusname_rest[0] if locusname_rest else ''
428 seqs = []
429 else:
430 if defline == None:
431 chase_error("bad format: line precedes initial defline"
432 " in '%s'", (f.name if hasattr(f, 'name')
433 else 'unknown FASTA file'))
434 seqs.append(line.upper())
435 if defline != None:
436 yield (locusname, defline, ''.join(seqs))
439 def read_fasta_files(filenames):
440 """Yield (locusname, defline, sequence, filename) tuples as read from
441 FASTA files (uppercasing sequence). An error is given if locusname is
442 empty or not unique across all sequences."""
444 loci_seen = set()
446 for filename in filenames:
447 with open(filename) as f:
448 for locusname, defline, sequence in read_fasta_file(f):
449 if not locusname:
450 chase_error("empty locus name not allowed in '%s'",
451 filename)
452 if locusname in loci_seen:
453 chase_error("locus name '%s' is not unique in the search"
454 " database(s) in '%s'", locusname, filename)
455 loci_seen.add(locusname)
457 yield (locusname, defline, sequence, filename)