1 # common greylag functions and constants
3 ## greylag, a collection of programs for MS/MS protein analysis
4 ## Copyright (C) 2006-2008 Stowers Institute for Medical Research
6 ## This program is free software: you can redistribute it and/or modify
7 ## it under the terms of the GNU General Public License as published by
8 ## the Free Software Foundation, either version 3 of the License, or
9 ## (at your option) any later version.
11 ## This program is distributed in the hope that it will be useful,
12 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ## GNU General Public License for more details.
16 ## You should have received a copy of the GNU General Public License
17 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
19 ## Contact: Mike Coleman
20 ## Stowers Institute for Medical Research
21 ## 1000 East 50th Street
22 ## Kansas City, Missouri 64110
25 from __future__
import with_statement
28 import logging
; from logging
import debug
, info
, warning
39 # handle to the singleton parameter object shared with the C++ module
40 CP
= cgreylag
.cvar
.parameters_the
43 def set_logging(options
):
44 log_level
= logging
.WARNING
46 log_level
= logging
.ERROR
48 log_level
= logging
.INFO
50 log_level
= logging
.DEBUG
53 logfile
= options
.logfile
54 logging
.basicConfig(level
=log_level
, datefmt
='%b %e %H:%M:%S',
55 format
=('%(asctime)s [%(process)d]'
56 ' %(levelname)s: %(message)s'),
60 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
61 PROTON_MASS
= 1.007276
62 ELECTRON_MASS
= 0.000549 # ?
64 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
65 MONOISOTOPIC_ATOMIC_MASS
= {
69 'O' : 15.994914622115,
74 # most prevalent only (1 in 1000)
75 ISOTOPIC_ATOMIC_MASS
= { # prevalence (in %)
76 'C13' : 13.003354837810, # 1.078
77 'N15' : 15.00010889849, # 0.3687
78 'O18' : 17.99916049, # 0.20514
79 'S33' : 32.9714585012, # 0.762
80 'S34' : 33.9678668311, # 4.2928
83 AVERAGE_ATOMIC_MASS
= {
92 # The xtandem average residue masses are about 0.002 amu higher than those
93 # calculated directly from the above average atomic masses. None of the
94 # chemists consulted knew of any reason why, aside from lack of precision in
95 # the average atomic mass estimates. This shouldn't matter very much, as
96 # fragmentation calculations should all be monoisotopic, and we can always
97 # widen the parent tolerance window a bit.
100 def formula_mass(formula
, atomic_mass
=MONOISOTOPIC_ATOMIC_MASS
):
101 """Return the mass of formula, using the given mass regime (monoisotopic
104 >>> formula_mass('H2O', { 'H':1, 'O':16 })
106 >>> # monoisotopic mass of glycine
107 >>> str(round(formula_mass('C2H3ON'), 4))
111 parts
= [ p
or '1' for p
in re
.split(r
'([A-Z][a-z]*)', formula
)[1:] ]
112 # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
113 return sum(atomic_mass
[parts
[i
]] * int(parts
[i
+1])
114 for i
in range(0, len(parts
), 2))
116 # FIX: selenocysteine (U), etc
141 RESIDUES
= sorted(RESIDUE_FORMULA
.keys())
142 RESIDUES_W_BRACKETS
= RESIDUES
+ ['[', ']']
145 # [0][1] -> 'H' -> fragment mass of H for regime 0
146 MASS_REGIME_ATOMIC_MASSES
= []
148 def dump_mass_regime_atomic_masses():
149 return MASS_REGIME_ATOMIC_MASSES
152 def mass_regime_atomic_masses(spec
):
153 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
156 name
, isotopes
= spec
157 assert name
in ['MONO', 'AVG'] and len(isotopes
) <= 1
159 r
= MONOISOTOPIC_ATOMIC_MASS
.copy()
161 r
= AVERAGE_ATOMIC_MASS
.copy()
163 iname
, prevalence
= isotopes
[0]
164 assert iname
== 'N15' and 0 <= prevalence
<= 1
165 # this is a simplification, but additional accuracy pointless?
167 r
['N'] = ISOTOPIC_ATOMIC_MASS
['N15']
169 r
['N'] += (ISOTOPIC_ATOMIC_MASS
['N15'] - r
['N']) * prevalence
173 def initialize_spectrum_parameters(options
, GLP
, mass_regimes
, fixed_mod_map
):
174 """Initialize parameters known to the spectrum module.
175 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
178 # This function can be called multiple times to reinitialize for a new
179 # search, though probably everything else (previous matches) ought to be
180 # "forgotten" at that point, too.
182 # This is the size of vectors that are indexed by residues (A-Z) or
183 # special characters ('[]').
184 RESIDUE_LIMIT
= max(ord(c
) for c
in 'Z[]') + 1
186 # These are currently monoisotopic. (deuterium pointless?)
187 CP
.proton_mass
= PROTON_MASS
188 CP
.hydrogen_mass
= formula_mass("H")
192 global MASS_REGIME_ATOMIC_MASSES
193 # clear previous state
194 MASS_REGIME_ATOMIC_MASSES
= []
195 CP
.parent_mass_regime
.clear()
196 CP
.fragment_mass_regime
.clear()
198 for rn
, regime_pair
in enumerate(mass_regimes
):
199 assert len(regime_pair
) == 2 # parent and fragment
200 info('mass regime: %s', regime_pair
)
201 MASS_REGIME_ATOMIC_MASSES
.append([])
202 for n
, regime
in enumerate(regime_pair
):
203 atmass
= mass_regime_atomic_masses(regime
)
204 MASS_REGIME_ATOMIC_MASSES
[-1].append(atmass
)
205 creg
= cgreylag
.mass_regime_parameters()
207 creg
.hydroxyl_mass
= formula_mass("OH", atmass
)
208 creg
.water_mass
= formula_mass("H2O", atmass
)
209 creg
.ammonia_mass
= formula_mass("NH3", atmass
)
211 creg
.fixed_residue_mass
.resize(RESIDUE_LIMIT
)
213 for r
in RESIDUES_W_BRACKETS
:
216 m
= (formula_mass(RESIDUE_FORMULA
[r
], atmass
)
217 + GLP
["mass_regime_debug_delta"])
219 regime_manifest
.append((rn
, r
, m
))
220 rmod
= fixed_mod_map
.get(r
)
222 if isinstance(rmod
[1], str):
224 m
+= rmod
[0] * formula_mass(rmod
[1])
226 m
+= rmod
[0] * formula_mass(rmod
[1], atmass
)
228 m
+= rmod
[0] * rmod
[1]
229 creg
.fixed_residue_mass
[ord(r
)] = m
230 # assuming these are monoisotopic (not regime)
231 creg
.fixed_residue_mass
[ord('[')] += formula_mass("H")
232 creg
.fixed_residue_mass
[ord(']')] += formula_mass("OH")
234 CP
.parent_mass_regime
.append(creg
)
236 creg
.fixed_residue_mass
[ord('[')] -= CP
.hydrogen_mass
237 creg
.fixed_residue_mass
[ord(']')] -= creg
.hydroxyl_mass
238 CP
.fragment_mass_regime
.append(creg
)
239 for r
in RESIDUES_W_BRACKETS
:
240 info('fixed mass %s: %s', r
,
241 [ ("%.6f" % CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)],
242 "%.6f" % CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)])
243 for rn
in range(len(mass_regimes
)) ])
245 for rn
in range(len(mass_regimes
)):
246 # check for physically impossible/meaningless masses
247 if CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
248 raise ValueError('bogus parent mass specification for %s' % r
)
249 if CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
250 raise ValueError('bogus fragment mass specification for %s' % r
)
252 CP
.parent_mass_tolerance_1
= GLP
["parent_mz_tolerance"]
253 CP
.parent_mass_tolerance_max
= (GLP
["parent_mz_tolerance"]
254 * GLP
["charge_limit"])
256 CP
.fragment_mass_tolerance
= GLP
["fragment_mass_tolerance"]
257 CP
.intensity_class_count
= GLP
["intensity_class_count"]
259 CP
.minimum_peptide_length
= GLP
["min_peptide_length"]
261 # CP.ln_factorial[n] == ln(n!)
262 CP
.ln_factorial
.resize(int(GLP
["max_parent_spectrum_mass"]
263 / GLP
["fragment_mass_tolerance"] + 100), 0.0)
264 for n
in range(2, len(CP
.ln_factorial
)):
265 CP
.ln_factorial
[n
] = CP
.ln_factorial
[n
-1] + math
.log(n
)
268 #CP.estimate_only = bool(options.estimate_only)
269 #CP.show_progress = bool(options.show_progress)
270 CP
.estimate_only
= False
271 CP
.show_progress
= False
273 return regime_manifest
276 def enumerate_conjunction(mod_tree
, limit
, conjuncts
=[]):
278 if 0 < len(conjuncts
) <= limit
:
281 first
, rest
= mod_tree
[0], mod_tree
[1:]
282 if isinstance(first
, list):
283 for x
in enumerate_disjunction(first
, limit
):
284 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ x
):
287 for y
in enumerate_conjunction(rest
, limit
, conjuncts
):
289 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ [first
]):
292 def enumerate_disjunction(mod_tree
, limit
=sys
.maxint
):
293 """Generates the conjuncts for mod_tree that are no longer than limit.
295 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
296 [[], ['a'], ['b'], ['c']]
297 >>> list(enumerate_disjunction([[1,2,3]]))
298 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
299 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
300 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
301 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
302 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
303 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
307 assert isinstance(mod_tree
, list)
310 for s
in enumerate_conjunction(b
, limit
):
313 def get_mod_conjunct_triples(mod_tree
, limit
, mass_regimes
):
314 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
315 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
316 ordered by increasing number of conjuncts, and with duplicates within and
317 across removed. A mass table is also appended to each conjunct.
319 # FIX: is there a more elegant way or place for all this?
321 def rmass(regime_index
, par_frag
, sign
, delta
, is_mono
):
322 global MASS_REGIME_ATOMIC_MASSES
323 if isinstance(delta
, str):
326 return (sign
* formula_mass(delta
,
327 MASS_REGIME_ATOMIC_MASSES
[regime_index
][par_frag
]))
331 sign
, delta
, is_mono
= t
[:3]
332 return t
+ (tuple((rmass(r
, 0, sign
, delta
, is_mono
),
333 rmass(r
, 1, sign
, delta
, is_mono
))
334 for r
in range(len(mass_regimes
))),)
337 Ns
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == '['))
338 Cs
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == ']'))
339 assert len(Ns
) <= 1 and len(Cs
) <= 1
340 rest
= [ enmass(x
) for x
in c
if x
[3] not in '[]' ]
341 rest
= tuple(sorted(list(frozenset(rest
))))
342 return (Ns
, Cs
, rest
)
344 return sorted(list(frozenset(triple(conjunct
)
346 in enumerate_disjunction(mod_tree
, limit
))),
347 key
=lambda x
: (sum(len(y
) for y
in x
), x
))
350 def cleavage_motif_re(motif
):
351 """Return (regexp, pos), where regexp is a regular expression that will
352 match a cleavage motif, and pos is the position of the cleavage with
353 respect to the match (or None). (The RE actually matches one character,
354 the rest matching as lookahead, so that re.finditer will find all
355 overlapping matches.)
357 >>> cleavage_motif_re('[KR]|{P}')
359 >>> cleavage_motif_re('[X]|[X]')
365 motif_re
= re
.compile(r
'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
366 parts
= [ p
[0] for p
in motif_re
.findall(motif
) ]
367 if ''.join(parts
) != motif
:
368 chase_error('invalid cleavage motif pattern')
372 if cleavage_pos
!= None:
373 chase_error("invalid cleavage motif pattern" " (multiple '|'s)")
379 re_parts
.append('[%s]' % part
[1:-1])
381 re_parts
.append('[^%s]' % part
[1:-1])
383 assert False, "unknown cleavage motif syntax"
385 if len(re_parts
) == 0:
387 elif len(re_parts
) == 1:
388 re_pattern
= re_parts
[0]
390 re_pattern
= re_parts
[0] + '(?=' + ''.join(re_parts
[1:]) + ')'
391 return (re_pattern
, cleavage_pos
)
394 def file_sha1(filename
):
395 """Return the (hex) SHA1 digest of the given file."""
399 return "no checksum--libs missing"
401 with
open(filename
) as f
:
406 def read_fasta_files(filenames
):
407 """Yield (locusname, defline, sequence, filename) tuples as read from
408 FASTA files (uppercasing sequence)."""
412 for filename
in filenames
:
413 locusname
, defline
= None, None
415 with
open(filename
) as f
:
420 yield (locusname
, defline
, ''.join(seqs
), filename
)
422 chase_error("bad format: line precedes initial defline"
423 " in '%s'", filename
)
425 locusname_rest
= defline
.split(None, 1)
426 if not locusname_rest
:
427 chase_error("empty locus name not allowed in '%s'",
429 locusname
= locusname_rest
[0]
430 if locusname
in loci_seen
:
431 chase_error("locus name '%s' is not unique in the"
432 " search database(s) in '%s'", locusname
,
434 loci_seen
.add(locusname
)
437 seqs
.append(line
.upper())
439 yield (locusname
, defline
, ''.join(seqs
), filename
)