1 # common greylag functions and constants
3 ## greylag, a collection of programs for MS/MS protein analysis
4 ## Copyright (C) 2006-2008 Stowers Institute for Medical Research
6 ## This program is free software: you can redistribute it and/or modify
7 ## it under the terms of the GNU General Public License as published by
8 ## the Free Software Foundation, either version 3 of the License, or
9 ## (at your option) any later version.
11 ## This program is distributed in the hope that it will be useful,
12 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ## GNU General Public License for more details.
16 ## You should have received a copy of the GNU General Public License
17 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
19 ## Contact: Mike Coleman
20 ## Stowers Institute for Medical Research
21 ## 1000 East 50th Street
22 ## Kansas City, Missouri 64110
25 from __future__
import with_statement
28 import logging
; from logging
import debug
, info
, warning
39 # prefix added to locus id's for decoy loci
40 DEFAULT_DECOY_PREFIX
= "SHUFFLED_"
43 # handle to the singleton parameter object shared with the C++ module
44 CP
= cgreylag
.cvar
.parameters_the
47 def set_logging(options
):
48 log_level
= logging
.WARNING
50 log_level
= logging
.ERROR
52 log_level
= logging
.INFO
54 log_level
= logging
.DEBUG
57 logfile
= options
.logfile
58 logging
.basicConfig(level
=log_level
, datefmt
='%b %e %H:%M:%S',
59 format
=('%(asctime)s [%(process)d]'
60 ' %(levelname)s: %(message)s'),
64 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
65 PROTON_MASS
= 1.007276
66 ELECTRON_MASS
= 0.000549 # ?
68 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
69 # (It appears that NIST has started to copyright and require licenses for use
70 # of some of its data, but these values appear not to fall under copyright,
72 MONOISOTOPIC_ATOMIC_MASS
= {
76 'O' : 15.994914622115,
81 # most prevalent only (1 in 1000)
82 ISOTOPIC_ATOMIC_MASS
= { # prevalence (in %)
83 'C13' : 13.003354837810, # 1.078
84 'N15' : 15.00010889849, # 0.3687
85 'O18' : 17.99916049, # 0.20514
86 'S33' : 32.9714585012, # 0.762
87 'S34' : 33.9678668311, # 4.2928
90 AVERAGE_ATOMIC_MASS
= {
99 # The xtandem average residue masses are about 0.002 amu higher than those
100 # calculated directly from the above average atomic masses. None of the
101 # chemists consulted knew of any reason why, aside from lack of precision in
102 # the average atomic mass estimates. This shouldn't matter very much, as
103 # fragmentation calculations should all be monoisotopic, and we can always
104 # widen the parent tolerance window a bit.
107 def formula_mass(formula
, atomic_mass
=MONOISOTOPIC_ATOMIC_MASS
):
108 """Return the mass of formula, using the given mass regime (monoisotopic
111 >>> formula_mass('H2O', { 'H':1, 'O':16 })
113 >>> # monoisotopic mass of glycine
114 >>> str(round(formula_mass('C2H3ON'), 4))
118 parts
= [ p
or '1' for p
in re
.split(r
'([A-Z][a-z]*)', formula
)[1:] ]
119 # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
120 return sum(atomic_mass
[parts
[i
]] * int(parts
[i
+1])
121 for i
in range(0, len(parts
), 2))
123 # FIX: selenocysteine (U), etc
148 RESIDUES
= sorted(RESIDUE_FORMULA
.keys())
149 RESIDUES_W_BRACKETS
= RESIDUES
+ ['[', ']']
152 # [0][1] -> 'H' -> fragment mass of H for regime 0
153 MASS_REGIME_ATOMIC_MASSES
= []
155 def dump_mass_regime_atomic_masses():
156 return MASS_REGIME_ATOMIC_MASSES
159 def mass_regime_atomic_masses(spec
):
160 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
163 name
, isotopes
= spec
164 assert name
in ['MONO', 'AVG'] and len(isotopes
) <= 1
166 r
= MONOISOTOPIC_ATOMIC_MASS
.copy()
168 r
= AVERAGE_ATOMIC_MASS
.copy()
170 iname
, prevalence
= isotopes
[0]
171 assert iname
== 'N15' and 0 <= prevalence
<= 1
172 # this is a simplification, but additional accuracy pointless?
174 r
['N'] = ISOTOPIC_ATOMIC_MASS
['N15']
176 r
['N'] += (ISOTOPIC_ATOMIC_MASS
['N15'] - r
['N']) * prevalence
180 def initialize_spectrum_parameters(options
, GLP
, mass_regimes
, fixed_mod_map
):
181 """Initialize parameters known to the spectrum module.
182 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
185 # This function can be called multiple times to reinitialize for a new
186 # search, though probably everything else (previous matches) ought to be
187 # "forgotten" at that point, too.
189 # This is the size of vectors that are indexed by residues (A-Z) or
190 # special characters ('[]').
191 RESIDUE_LIMIT
= max(ord(c
) for c
in 'Z[]') + 1
193 # These are currently monoisotopic. (deuterium pointless?)
194 CP
.proton_mass
= PROTON_MASS
195 CP
.hydrogen_mass
= formula_mass("H")
199 global MASS_REGIME_ATOMIC_MASSES
200 # clear previous state
201 MASS_REGIME_ATOMIC_MASSES
= []
202 CP
.parent_mass_regime
.clear()
203 CP
.fragment_mass_regime
.clear()
205 for rn
, regime_pair
in enumerate(mass_regimes
):
206 assert len(regime_pair
) == 2 # parent and fragment
207 info('mass regime: %s', regime_pair
)
208 MASS_REGIME_ATOMIC_MASSES
.append([])
209 for n
, regime
in enumerate(regime_pair
):
210 atmass
= mass_regime_atomic_masses(regime
)
211 MASS_REGIME_ATOMIC_MASSES
[-1].append(atmass
)
212 creg
= cgreylag
.mass_regime_parameters()
214 creg
.hydroxyl_mass
= formula_mass("OH", atmass
)
215 creg
.water_mass
= formula_mass("H2O", atmass
)
216 creg
.ammonia_mass
= formula_mass("NH3", atmass
)
218 creg
.fixed_residue_mass
.resize(RESIDUE_LIMIT
)
220 for r
in RESIDUES_W_BRACKETS
:
223 m
= (formula_mass(RESIDUE_FORMULA
[r
], atmass
)
224 + GLP
["mass_regime_debug_delta"])
226 regime_manifest
.append((rn
, r
, m
))
227 rmod
= fixed_mod_map
.get(r
)
229 if isinstance(rmod
[1], str):
231 m
+= rmod
[0] * formula_mass(rmod
[1])
233 m
+= rmod
[0] * formula_mass(rmod
[1], atmass
)
235 m
+= rmod
[0] * rmod
[1]
236 creg
.fixed_residue_mass
[ord(r
)] = m
237 # assuming these are monoisotopic (not regime)
238 creg
.fixed_residue_mass
[ord('[')] += formula_mass("H")
239 creg
.fixed_residue_mass
[ord(']')] += formula_mass("OH")
241 CP
.parent_mass_regime
.append(creg
)
243 creg
.fixed_residue_mass
[ord('[')] -= CP
.hydrogen_mass
244 creg
.fixed_residue_mass
[ord(']')] -= creg
.hydroxyl_mass
245 CP
.fragment_mass_regime
.append(creg
)
246 for r
in RESIDUES_W_BRACKETS
:
247 info('fixed mass %s: %s', r
,
248 [ ("%.6f" % CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)],
249 "%.6f" % CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)])
250 for rn
in range(len(mass_regimes
)) ])
252 for rn
in range(len(mass_regimes
)):
253 # check for physically impossible/meaningless masses
254 if CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
255 raise ValueError('bogus parent mass specification for %s' % r
)
256 if CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
257 raise ValueError('bogus fragment mass specification for %s' % r
)
259 CP
.parent_mass_tolerance_1
= GLP
["parent_mz_tolerance"]
260 CP
.parent_mass_tolerance_max
= (GLP
["parent_mz_tolerance"]
261 * GLP
["charge_limit"])
263 CP
.fragment_mass_tolerance
= GLP
["fragment_mass_tolerance"]
264 CP
.intensity_class_count
= GLP
["intensity_class_count"]
266 CP
.minimum_peptide_length
= GLP
["min_peptide_length"]
268 # CP.ln_factorial[n] == ln(n!)
269 CP
.ln_factorial
.resize(int(GLP
["max_parent_spectrum_mass"]
270 / GLP
["fragment_mass_tolerance"] + 100), 0.0)
271 for n
in range(2, len(CP
.ln_factorial
)):
272 CP
.ln_factorial
[n
] = CP
.ln_factorial
[n
-1] + math
.log(n
)
275 #CP.estimate_only = bool(options.estimate_only)
276 #CP.show_progress = bool(options.show_progress)
277 CP
.estimate_only
= False
278 CP
.show_progress
= False
280 return regime_manifest
283 def enumerate_conjunction(mod_tree
, limit
, conjuncts
=[]):
285 if 0 < len(conjuncts
) <= limit
:
288 first
, rest
= mod_tree
[0], mod_tree
[1:]
289 if isinstance(first
, list):
290 for x
in enumerate_disjunction(first
, limit
):
291 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ x
):
294 for y
in enumerate_conjunction(rest
, limit
, conjuncts
):
296 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ [first
]):
299 def enumerate_disjunction(mod_tree
, limit
=sys
.maxint
):
300 """Generates the conjuncts for mod_tree that are no longer than limit.
302 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
303 [[], ['a'], ['b'], ['c']]
304 >>> list(enumerate_disjunction([[1,2,3]]))
305 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
306 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
307 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
308 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
309 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
310 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
314 assert isinstance(mod_tree
, list)
317 for s
in enumerate_conjunction(b
, limit
):
320 def get_mod_conjunct_triples(mod_tree
, limit
, mass_regimes
):
321 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
322 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
323 ordered by increasing number of conjuncts, and with duplicates within and
324 across removed. A mass table is also appended to each conjunct.
326 # FIX: is there a more elegant way or place for all this?
328 def rmass(regime_index
, par_frag
, sign
, delta
, is_mono
):
329 global MASS_REGIME_ATOMIC_MASSES
330 if isinstance(delta
, str):
333 return (sign
* formula_mass(delta
,
334 MASS_REGIME_ATOMIC_MASSES
[regime_index
][par_frag
]))
338 sign
, delta
, is_mono
= t
[:3]
339 return t
+ (tuple((rmass(r
, 0, sign
, delta
, is_mono
),
340 rmass(r
, 1, sign
, delta
, is_mono
))
341 for r
in range(len(mass_regimes
))),)
344 Ns
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == '['))
345 Cs
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == ']'))
346 assert len(Ns
) <= 1 and len(Cs
) <= 1
347 rest
= [ enmass(x
) for x
in c
if x
[3] not in '[]' ]
348 rest
= tuple(sorted(list(frozenset(rest
))))
349 return (Ns
, Cs
, rest
)
351 return sorted(list(frozenset(triple(conjunct
)
353 in enumerate_disjunction(mod_tree
, limit
))),
354 key
=lambda x
: (sum(len(y
) for y
in x
), x
))
357 def cleavage_motif_re(motif
):
358 """Return (regexp, pos), where regexp is a regular expression that will
359 match a cleavage motif, and pos is the position of the cleavage with
360 respect to the match (or None). (The RE actually matches one character,
361 the rest matching as lookahead, so that re.finditer will find all
362 overlapping matches.)
364 >>> cleavage_motif_re('[KR]|{P}')
366 >>> cleavage_motif_re('[X]|[X]')
372 motif_re
= re
.compile(r
'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
373 parts
= [ p
[0] for p
in motif_re
.findall(motif
) ]
374 if ''.join(parts
) != motif
:
375 chase_error('invalid cleavage motif pattern')
379 if cleavage_pos
!= None:
380 chase_error("invalid cleavage motif pattern" " (multiple '|'s)")
386 re_parts
.append('[%s]' % part
[1:-1])
388 re_parts
.append('[^%s]' % part
[1:-1])
390 assert False, "unknown cleavage motif syntax"
392 if len(re_parts
) == 0:
394 elif len(re_parts
) == 1:
395 re_pattern
= re_parts
[0]
397 re_pattern
= re_parts
[0] + '(?=' + ''.join(re_parts
[1:]) + ')'
398 return (re_pattern
, cleavage_pos
)
401 def file_sha1(filename
):
402 """Return the (hex) SHA1 digest of the given file."""
406 return "no checksum--libs missing"
408 with
open(filename
) as f
:
413 def read_fasta_file(f
):
414 """Yield (locusname, defline, sequence) tuples as read from the given
415 FASTA file (uppercasing sequence)."""
417 locusname
, defline
= None, None
424 yield (locusname
, defline
, ''.join(seqs
))
426 locusname_rest
= defline
.split(None, 1)
427 locusname
= locusname_rest
[0] if locusname_rest
else ''
431 chase_error("bad format: line precedes initial defline"
432 " in '%s'", (f
.name
if hasattr(f
, 'name')
433 else 'unknown FASTA file'))
434 seqs
.append(line
.upper())
436 yield (locusname
, defline
, ''.join(seqs
))
439 def read_fasta_files(filenames
):
440 """Yield (locusname, defline, sequence, filename) tuples as read from
441 FASTA files (uppercasing sequence). An error is given if locusname is
442 empty or not unique across all sequences."""
446 for filename
in filenames
:
447 with
open(filename
) as f
:
448 for locusname
, defline
, sequence
in read_fasta_file(f
):
450 chase_error("empty locus name not allowed in '%s'",
452 if locusname
in loci_seen
:
453 chase_error("locus name '%s' is not unique in the search"
454 " database(s) in '%s'", locusname
, filename
)
455 loci_seen
.add(locusname
)
457 yield (locusname
, defline
, sequence
, filename
)