shuffle: add basic Markov shuffle, other minor cleanups
[greylag.git] / greylag.py
# common greylag functions and constants

## greylag, a collection of programs for MS/MS protein analysis
## Copyright (C) 2006-2008 Stowers Institute for Medical Research
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
##
## Contact: Mike Coleman
##          Stowers Institute for Medical Research
##          1000 East 50th Street
##          Kansas City, Missouri  64110
##          USA


from __future__ import with_statement

import logging; from logging import debug, info, warning
import math
import re
import sys

import cgreylag


VERSION = "0.1.1"
# prefix added to locus id's for decoy loci
DEFAULT_DECOY_PREFIX = "SHUFFLED_"

# handle to the singleton parameter object shared with the C++ module
CP = cgreylag.cvar.parameters_the
def set_logging(options):
    log_level = logging.WARNING
    if options.quiet:
        log_level = logging.ERROR
    if options.verbose:
        log_level = logging.INFO
    if options.debug:
        log_level = logging.DEBUG
    logfile = None
    if options.logfile:
        logfile = options.logfile
    logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
                        format=('%(asctime)s [%(process)d]'
                                ' %(levelname)s: %(message)s'),
                        filename=logfile)
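
# Usage sketch (hypothetical): set_logging expects an object with 'quiet',
# 'verbose', 'debug' and 'logfile' attributes, e.g. as produced by optparse.
# The option definitions below are illustrative only.
#
#   from optparse import OptionParser
#   parser = OptionParser()
#   parser.add_option("-q", "--quiet", action="store_true", default=False)
#   parser.add_option("-v", "--verbose", action="store_true", default=False)
#   parser.add_option("--debug", action="store_true", default=False)
#   parser.add_option("--logfile", default=None)
#   options, args = parser.parse_args()
#   set_logging(options)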
# FIX: is this mono or avg?  (possible error here is ~0.0007 amu)
PROTON_MASS =   1.007276
ELECTRON_MASS = 0.000549                # ?

# Reference values from NIST (http://physics.nist.gov/PhysRefData/)
MONOISOTOPIC_ATOMIC_MASS = {
    'H' :  1.00782503214,
    'C' : 12.00000000,
    'N' : 14.00307400529,
    'O' : 15.994914622115,
    'P' : 30.9737615120,
    'S' : 31.9720706912,
    }

# most prevalent only (1 in 1000)
ISOTOPIC_ATOMIC_MASS = {                # prevalence (in %)
    'C13' : 13.003354837810,            # 1.078
    'N15' : 15.00010889849,             # 0.3687
    'O18' : 17.99916049,                # 0.20514
    'S33' : 32.9714585012,              # 0.762
    'S34' : 33.9678668311,              # 4.2928
    }

AVERAGE_ATOMIC_MASS = {
    'H' :  1.007947,
    'C' : 12.01078,
    'N' : 14.00672,
    'O' : 15.99943,
    'P' : 30.9737612,
    'S' : 32.0655,
    }

# The xtandem average residue masses are about 0.002 amu higher than those
# calculated directly from the above average atomic masses.  None of the
# chemists consulted knew of any reason why, aside from lack of precision in
# the average atomic mass estimates.  This shouldn't matter very much, as
# fragmentation calculations should all be monoisotopic, and we can always
# widen the parent tolerance window a bit.
def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
    """Return the mass of formula, using the given mass regime (monoisotopic
    by default).

    >>> formula_mass('H2O', { 'H':1, 'O':16 })
    18
    >>> # monoisotopic mass of glycine
    >>> str(round(formula_mass('C2H3ON'), 4))
    '57.0215'

    """
    parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
    # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
    return sum(atomic_mass[parts[i]] * int(parts[i+1])
               for i in range(0, len(parts), 2))
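
# Usage sketch (values approximate, for illustration only): formula_mass can
# be pointed at any of the atomic mass tables above.
#
#   formula_mass('H2O')                        # monoisotopic, ~18.0106
#   formula_mass('H2O', AVERAGE_ATOMIC_MASS)   # average, ~18.0153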
# FIX: selenocysteine (U), etc
# residue -> formula
RESIDUE_FORMULA = {
    'A' : "C3H5ON",
    'C' : "C3H5ONS",
    'D' : "C4H5O3N",
    'E' : "C5H7O3N",
    'F' : "C9H9ON",
    'G' : "C2H3ON",
    'H' : "C6H7ON3",
    'I' : "C6H11ON",
    'K' : "C6H12ON2",
    'L' : "C6H11ON",
    'M' : "C5H9ONS",
    'N' : "C4H6O2N2",
    'P' : "C5H7ON",
    'Q' : "C5H8O2N2",
    'R' : "C6H12ON4",
    'S' : "C3H5O2N",
    'T' : "C4H7O2N",
    'V' : "C5H9ON",
    'W' : "C11H10ON2",
    'Y' : "C9H9O2N",
    }

RESIDUES = sorted(RESIDUE_FORMULA.keys())
RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
# [0][1] -> 'H' -> fragment mass of H for regime 0
MASS_REGIME_ATOMIC_MASSES = []


def dump_mass_regime_atomic_masses():
    return MASS_REGIME_ATOMIC_MASSES
def mass_regime_atomic_masses(spec):
    """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
    names to masses.
    """
    name, isotopes = spec
    assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
    if name == 'MONO':
        r = MONOISOTOPIC_ATOMIC_MASS.copy()
    else:
        r = AVERAGE_ATOMIC_MASS.copy()
    if isotopes:
        iname, prevalence = isotopes[0]
        assert iname == 'N15' and 0 <= prevalence <= 1
        # this is a simplification, but additional accuracy pointless?
        if name == 'MONO':
            r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
        else:
            r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
    return r
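
# Usage sketch (values approximate, for illustration only):
#
#   mass_regime_atomic_masses(('MONO', []))['N']               # ~14.0031
#   mass_regime_atomic_masses(('MONO', [('N15', 0.9)]))['N']   # ~15.0001 (15N)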
def initialize_spectrum_parameters(options, GLP, mass_regimes, fixed_mod_map):
    """Initialize parameters known to the spectrum module.
    fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
    """

    # This function can be called multiple times to reinitialize for a new
    # search, though probably everything else (previous matches) ought to be
    # "forgotten" at that point, too.

    # This is the size of vectors that are indexed by residues (A-Z) or
    # special characters ('[]').
    RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1

    # These are currently monoisotopic.  (deuterium pointless?)
    CP.proton_mass = PROTON_MASS
    CP.hydrogen_mass = formula_mass("H")

    regime_manifest = []

    global MASS_REGIME_ATOMIC_MASSES
    # clear previous state
    MASS_REGIME_ATOMIC_MASSES = []
    CP.parent_mass_regime.clear()
    CP.fragment_mass_regime.clear()

    for rn, regime_pair in enumerate(mass_regimes):
        assert len(regime_pair) == 2    # parent and fragment
        info('mass regime: %s', regime_pair)
        MASS_REGIME_ATOMIC_MASSES.append([])
        for n, regime in enumerate(regime_pair):
            atmass = mass_regime_atomic_masses(regime)
            MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
            creg = cgreylag.mass_regime_parameters()

            creg.hydroxyl_mass = formula_mass("OH", atmass)
            creg.water_mass = formula_mass("H2O", atmass)
            creg.ammonia_mass = formula_mass("NH3", atmass)

            creg.fixed_residue_mass.resize(RESIDUE_LIMIT)

            for r in RESIDUES_W_BRACKETS:
                m = 0
                if r in RESIDUES:
                    m = (formula_mass(RESIDUE_FORMULA[r], atmass)
                         + GLP["mass_regime_debug_delta"])
                    if n == 1:
                        regime_manifest.append((rn, r, m))
                rmod = fixed_mod_map.get(r)
                if rmod:
                    if isinstance(rmod[1], str):
                        if rmod[2]:
                            m += rmod[0] * formula_mass(rmod[1])
                        else:
                            m += rmod[0] * formula_mass(rmod[1], atmass)
                    else:
                        m += rmod[0] * rmod[1]
                creg.fixed_residue_mass[ord(r)] = m
            # assuming these are monoisotopic (not regime)
            creg.fixed_residue_mass[ord('[')] += formula_mass("H")
            creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
            if not n:
                CP.parent_mass_regime.append(creg)
            else:
                creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
                creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
                CP.fragment_mass_regime.append(creg)

    for r in RESIDUES_W_BRACKETS:
        info('fixed mass %s: %s', r,
             [ ("%.6f" % CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
                "%.6f" % CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
               for rn in range(len(mass_regimes)) ])

    for r in RESIDUES:
        for rn in range(len(mass_regimes)):
            # check for physically impossible/meaningless masses
            if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
                raise ValueError('bogus parent mass specification for %s' % r)
            if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
                raise ValueError('bogus fragment mass specification for %s' % r)

    CP.parent_mass_tolerance_1 = GLP["parent_mz_tolerance"]
    CP.parent_mass_tolerance_max = (GLP["parent_mz_tolerance"]
                                    * GLP["charge_limit"])

    CP.fragment_mass_tolerance = GLP["fragment_mass_tolerance"]
    CP.intensity_class_count = GLP["intensity_class_count"]

    CP.minimum_peptide_length = GLP["min_peptide_length"]

    # CP.ln_factorial[n] == ln(n!)
    CP.ln_factorial.resize(int(GLP["max_parent_spectrum_mass"]
                               / GLP["fragment_mass_tolerance"] + 100), 0.0)
    for n in range(2, len(CP.ln_factorial)):
        CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)

    # FIX or eliminate
    #CP.estimate_only = bool(options.estimate_only)
    #CP.show_progress = bool(options.show_progress)
    CP.estimate_only = False
    CP.show_progress = False

    return regime_manifest
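
# Usage sketch (hypothetical values, for illustration only): mass_regimes is a
# list of (parent, fragment) regime-spec pairs, fixed_mod_map maps residues to
# (count, delta, is_mono, residue, description) tuples, and GLP must supply
# the keys read above ("parent_mz_tolerance", "fragment_mass_tolerance",
# "mass_regime_debug_delta", etc.).
#
#   mass_regimes = [(('MONO', []), ('MONO', []))]
#   fixed_mod_map = { 'C' : (1, 'C2H3ON', False, 'C', 'carbamidomethyl') }
#   regime_manifest = initialize_spectrum_parameters(options, GLP,
#                                                    mass_regimes, fixed_mod_map)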
def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
    if not mod_tree:
        if 0 < len(conjuncts) <= limit:
            yield conjuncts
        return
    first, rest = mod_tree[0], mod_tree[1:]
    if isinstance(first, list):
        for x in enumerate_disjunction(first, limit):
            for y in enumerate_conjunction(rest, limit, conjuncts + x):
                yield y
    else:
        for y in enumerate_conjunction(rest, limit, conjuncts):
            yield y
        for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
            yield y
def enumerate_disjunction(mod_tree, limit=sys.maxint):
    """Generates the conjuncts for mod_tree that are no longer than limit.

    >>> list(enumerate_disjunction([['a'],['b'],['c']]))
    [[], ['a'], ['b'], ['c']]
    >>> list(enumerate_disjunction([[1,2,3]]))
    [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
    >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
    [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
    >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
    [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
    >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
    [[]]

    """
    assert isinstance(mod_tree, list)
    yield []
    for b in mod_tree:
        for s in enumerate_conjunction(b, limit):
            yield s
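
# Behavior sketch (output worked out by hand, for illustration only): a nested
# list inside a conjunct acts as an inner disjunction, so its alternatives are
# enumerated independently of the other elements of the conjunct.
#
#   list(enumerate_disjunction([[1, [[2],[3]]]]))
#     --> [[], [2], [3], [1], [1, 2], [1, 3]]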
def get_mod_conjunct_triples(mod_tree, limit, mass_regimes):
    """Return a list of triples (N, C, rest), where N (C) is a tuple of at
    most one N (C) conjunct, and rest is a tuple of conjuncts in a canonical
    order, ordered by increasing number of conjuncts, and with duplicates
    within and across removed.  A mass table is also appended to each
    conjunct.
    """
    # FIX: is there a more elegant way or place for all this?
    def enmass(t):
        def rmass(regime_index, par_frag, sign, delta, is_mono):
            global MASS_REGIME_ATOMIC_MASSES
            if isinstance(delta, str):
                if is_mono:
                    regime_index = 0
                return (sign * formula_mass(delta,
                            MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
            else:
                return sign * delta

        sign, delta, is_mono = t[:3]
        return t + (tuple((rmass(r, 0, sign, delta, is_mono),
                           rmass(r, 1, sign, delta, is_mono))
                          for r in range(len(mass_regimes))),)

    def triple(c):
        Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
        Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
        assert len(Ns) <= 1 and len(Cs) <= 1
        rest = [ enmass(x) for x in c if x[3] not in '[]' ]
        rest = tuple(sorted(list(frozenset(rest))))
        return (Ns, Cs, rest)

    return sorted(list(frozenset(triple(conjunct)
                                 for conjunct
                                 in enumerate_disjunction(mod_tree, limit))),
                  key=lambda x: (sum(len(y) for y in x), x))
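
# Usage sketch (hypothetical modifications; tuple shape inferred from the
# fixed_mod_map example above): each potential mod is a (count, delta,
# is_mono, residue, description) tuple, and mod_tree groups them into
# conjuncts.  MASS_REGIME_ATOMIC_MASSES must already have been populated (by
# initialize_spectrum_parameters) for formula deltas to be resolved.
#
#   mod_tree = [ [ (1, 'O', False, 'M', 'oxidation'),
#                  (1, 79.96633, True, 'S', 'phosphorylation') ] ]
#   triples = get_mod_conjunct_triples(mod_tree, 2, mass_regimes)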
def cleavage_motif_re(motif):
    """Return (regexp, pos), where regexp is a regular expression that will
    match a cleavage motif, and pos is the position of the cleavage with
    respect to the match (or None).  (The RE actually matches one character,
    the rest matching as lookahead, so that re.finditer will find all
    overlapping matches.)

    >>> cleavage_motif_re('[KR]|{P}')
    ('[KR](?=[^P])', 1)
    >>> cleavage_motif_re('[X]|[X]')
    ('.(?=.)', 1)

    """
    cleavage_pos = None
    re_parts = []
    motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
    parts = [ p[0] for p in motif_re.findall(motif) ]
    if ''.join(parts) != motif:
        chase_error('invalid cleavage motif pattern')
    i = 0
    for part in parts:
        if part == '|':
            if cleavage_pos != None:
                chase_error("invalid cleavage motif pattern (multiple '|'s)")
            cleavage_pos = i
            continue
        if part == '[X]':
            re_parts.append('.')
        elif part[0] == '[':
            re_parts.append('[%s]' % part[1:-1])
        elif part[0] == '{':
            re_parts.append('[^%s]' % part[1:-1])
        else:
            assert False, "unknown cleavage motif syntax"
        i += 1
    if len(re_parts) == 0:
        re_pattern = '.'
    elif len(re_parts) == 1:
        re_pattern = re_parts[0]
    else:
        re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
    return (re_pattern, cleavage_pos)
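
# Usage sketch (hypothetical sequence, for illustration only): because the
# returned pattern matches a single character with the rest as lookahead,
# re.finditer finds all (possibly overlapping) cleavage sites.
#
#   pattern, pos = cleavage_motif_re('[KR]|{P}')   # tryptic-style motif
#   sites = [ m.start() + pos
#             for m in re.finditer(pattern, "MAKRPLKGFR") ]
#   # sites == [3, 7]: after the K at index 2 and the K at index 6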
def file_sha1(filename):
    """Return the (hex) SHA1 digest of the given file."""
    try:
        import hashlib
    except:
        return "no checksum--libs missing"
    h = hashlib.sha1()
    with open(filename) as f:
        h.update(f.read())
    return h.hexdigest()
def read_fasta_files(filenames):
    """Yield (locusname, defline, sequence, filename) tuples as read from
    FASTA files (uppercasing sequence)."""

    loci_seen = set()

    for filename in filenames:
        locusname, defline = None, None
        seqs = []
        with open(filename) as f:
            for line in f:
                line = line.strip()
                if line[:1] == '>':
                    if defline != None:
                        yield (locusname, defline, ''.join(seqs), filename)
                    elif seqs:
                        chase_error("bad format: line precedes initial defline"
                                    " in '%s'", filename)
                    defline = line[1:]
                    locusname_rest = defline.split(None, 1)
                    if not locusname_rest:
                        chase_error("empty locus name not allowed in '%s'",
                                    filename)
                    locusname = locusname_rest[0]
                    if locusname in loci_seen:
                        chase_error("locus name '%s' is not unique in the"
                                    " search database(s) in '%s'", locusname,
                                    filename)
                    loci_seen.add(locusname)
                    seqs = []
                else:
                    seqs.append(line.upper())
        if defline:
            yield (locusname, defline, ''.join(seqs), filename)
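
# Usage sketch (hypothetical filename, for illustration only):
#
#   for locusname, defline, sequence, fn in read_fasta_files(['db.fasta']):
#       print locusname, len(sequence)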