greylag.py

   1 # common greylag functions and constants
   2
   3 ##  greylag, a collection of programs for MS/MS protein analysis
   4 ##  Copyright (C) 2006-2008  Stowers Institute for Medical Research
   5 ##
   6 ##  This program is free software: you can redistribute it and/or modify
   7 ##  it under the terms of the GNU General Public License as published by
   8 ##  the Free Software Foundation, either version 3 of the License, or
   9 ##  (at your option) any later version.
  10 ##
  11 ##  This program is distributed in the hope that it will be useful,
  12 ##  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 ##  GNU General Public License for more details.
  15 ##
  16 ##  You should have received a copy of the GNU General Public License
  17 ##  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 ##
  19 ##  Contact: Mike Coleman
  20 ##           Stowers Institute for Medical Research
  21 ##           1000 East 50th Street
  22 ##           Kansas City, Missouri  64110
  23 ##           USA
  24
  25 from __future__ import with_statement
  26
  27
  28 import logging; from logging import debug, info, warning
  29 import math
  30 import re
  31 import sys
  32
  33 import cgreylag
  34
  35
  36 VERSION = "0.1.1"
  37
  38
  39 # handle to the singleton parameter object shared with the C++ module
  40 CP = cgreylag.cvar.parameters_the
  41
  42
  43 def set_logging(options):
  44     log_level = logging.WARNING
  45     if options.quiet:
  46         log_level = logging.ERROR
  47     if options.verbose:
  48         log_level = logging.INFO
  49     if options.debug:
  50         log_level = logging.DEBUG
  51     logfile = None
  52     if options.logfile:
  53         logfile = options.logfile
  54     logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
  55                         format=('%(asctime)s [%(process)d]'
  56                                 ' %(levelname)s: %(message)s'),
  57                         filename=logfile)
  58
  59
  60 # FIX: is this mono or avg?  (possible error here is ~0.0007 amu)
  61 PROTON_MASS =   1.007276
  62 ELECTRON_MASS = 0.000549                # ?
  63
  64 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
  65 MONOISOTOPIC_ATOMIC_MASS = {
  66     'H' :  1.00782503214,
  67     'C' : 12.00000000,
  68     'N' : 14.00307400529,
  69     'O' : 15.994914622115,
  70     'P' : 30.9737615120,
  71     'S' : 31.9720706912,
  72     }
  73
  74 # most prevalent only (1 in 1000)
  75 ISOTOPIC_ATOMIC_MASS = {                # prevalence (in %)
  76     'C13' : 13.003354837810,            # 1.078
  77     'N15' : 15.00010889849,             # 0.3687
  78     'O18' : 17.99916049,                # 0.20514
  79     'S33' : 32.9714585012,              # 0.762
  80     'S34' : 33.9678668311,              # 4.2928
  81     }
  82
  83 AVERAGE_ATOMIC_MASS = {
  84     'H' :  1.007947,
  85     'C' : 12.01078,
  86     'N' : 14.00672,
  87     'O' : 15.99943,
  88     'P' : 30.9737612,
  89     'S' : 32.0655,
  90     }
  91
  92 # The xtandem average residue masses are about 0.002 amu higher than those
  93 # calculated directly from the above average atomic masses.  None of the
  94 # chemists consulted knew of any reason why, aside from lack of precision in
  95 # the average atomic mass estimates.  This shouldn't matter very much, as
  96 # fragmentation calculations should all be monoisotopic, and we can always
  97 # widen the parent tolerance window a bit.
  98
  99
 100 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
 101     """Return the mass of formula, using the given mass regime (monoisotopic
 102     by default).
 103
 104     >>> formula_mass('H2O', { 'H':1, 'O':16 })
 105     18
 106     >>> # monoisotopic mass of glycine
 107     >>> str(round(formula_mass('C2H3ON'), 4))
 108     '57.0215'
 109
 110     """
 111     parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
 112     # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
 113     return sum(atomic_mass[parts[i]] * int(parts[i+1])
 114                for i in range(0, len(parts), 2))
 115
 116 # FIX: selenocysteine (U), etc
 117 # residue -> formula
 118 RESIDUE_FORMULA = {
 119     'A' : "C3H5ON",
 120     'C' : "C3H5ONS",
 121     'D' : "C4H5O3N",
 122     'E' : "C5H7O3N",
 123     'F' : "C9H9ON",
 124     'G' : "C2H3ON",
 125     'H' : "C6H7ON3",
 126     'I' : "C6H11ON",
 127     'K' : "C6H12ON2",
 128     'L' : "C6H11ON",
 129     'M' : "C5H9ONS",
 130     'N' : "C4H6O2N2",
 131     'P' : "C5H7ON",
 132     'Q' : "C5H8O2N2",
 133     'R' : "C6H12ON4",
 134     'S' : "C3H5O2N",
 135     'T' : "C4H7O2N",
 136     'V' : "C5H9ON",
 137     'W' : "C11H10ON2",
 138     'Y' : "C9H9O2N",
 139     }
 140
 141 RESIDUES = sorted(RESIDUE_FORMULA.keys())
 142 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
 143
 144
 145 # [0][1] -> 'H' -> fragment mass of H for regime 0
 146 MASS_REGIME_ATOMIC_MASSES = []
 147
 148 def dump_mass_regime_atomic_masses():
 149     return MASS_REGIME_ATOMIC_MASSES
 150
 151
 152 def mass_regime_atomic_masses(spec):
 153     """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
 154     names to masses.
 155     """
 156     name, isotopes = spec
 157     assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
 158     if name == 'MONO':
 159         r = MONOISOTOPIC_ATOMIC_MASS.copy()
 160     else:
 161         r = AVERAGE_ATOMIC_MASS.copy()
 162     if isotopes:
 163         iname, prevalence = isotopes[0]
 164         assert iname == 'N15' and 0 <= prevalence <= 1
 165         # this is a simplification, but additional accuracy pointless?
 166         if name == 'MONO':
 167             r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
 168         else:
 169             r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
 170     return r
 171
 172
 173 def initialize_spectrum_parameters(options, GLP, mass_regimes, fixed_mod_map):
 174     """Initialize parameters known to the spectrum module.
 175     fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
 176     """
 177
 178     # This function can be called multiple times to reinitialize for a new
 179     # search, though probably everything else (previous matches) ought to be
 180     # "forgotten" at that point, too.
 181
 182     # This is the size of vectors that are indexed by residues (A-Z) or
 183     # special characters ('[]').
 184     RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
 185
 186     # These are currently monoisotopic.  (deuterium pointless?)
 187     CP.proton_mass = PROTON_MASS
 188     CP.hydrogen_mass = formula_mass("H")
 189
 190     regime_manifest = []
 191
 192     global MASS_REGIME_ATOMIC_MASSES
 193     # clear previous state
 194     MASS_REGIME_ATOMIC_MASSES = []
 195     CP.parent_mass_regime.clear()
 196     CP.fragment_mass_regime.clear()
 197
 198     for rn, regime_pair in enumerate(mass_regimes):
 199         assert len(regime_pair) == 2    # parent and fragment
 200         info('mass regime: %s', regime_pair)
 201         MASS_REGIME_ATOMIC_MASSES.append([])
 202         for n, regime in enumerate(regime_pair):
 203             atmass = mass_regime_atomic_masses(regime)
 204             MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
 205             creg = cgreylag.mass_regime_parameters()
 206
 207             creg.hydroxyl_mass = formula_mass("OH", atmass)
 208             creg.water_mass = formula_mass("H2O", atmass)
 209             creg.ammonia_mass = formula_mass("NH3", atmass)
 210
 211             creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
 212
 213             for r in RESIDUES_W_BRACKETS:
 214                 m = 0
 215                 if r in RESIDUES:
 216                     m = (formula_mass(RESIDUE_FORMULA[r], atmass)
 217                          + GLP["mass_regime_debug_delta"])
 218                 if n == 1:
 219                     regime_manifest.append((rn, r, m))
 220                 rmod = fixed_mod_map.get(r)
 221                 if rmod:
 222                     if isinstance(rmod[1], str):
 223                         if rmod[2]:
 224                             m += rmod[0] * formula_mass(rmod[1])
 225                         else:
 226                             m += rmod[0] * formula_mass(rmod[1], atmass)
 227                     else:
 228                         m += rmod[0] * rmod[1]
 229                 creg.fixed_residue_mass[ord(r)] = m
 230             # assuming these are monoisotopic (not regime)
 231             creg.fixed_residue_mass[ord('[')] += formula_mass("H")
 232             creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
 233             if not n:
 234                 CP.parent_mass_regime.append(creg)
 235             else:
 236                 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
 237                 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
 238                 CP.fragment_mass_regime.append(creg)
 239     for r in RESIDUES_W_BRACKETS:
 240         info('fixed mass %s: %s', r,
 241              [ ("%.6f" % CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
 242                 "%.6f" % CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
 243                for rn in range(len(mass_regimes)) ])
 244     for r in RESIDUES:
 245         for rn in range(len(mass_regimes)):
 246             # check for physically impossible/meaningless masses
 247             if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 248                 raise ValueError('bogus parent mass specification for %s' % r)
 249             if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 250                 raise ValueError('bogus fragment mass specification for %s' % r)
 251
 252     CP.parent_mass_tolerance_1 = GLP["parent_mz_tolerance"]
 253     CP.parent_mass_tolerance_max = (GLP["parent_mz_tolerance"]
 254                                     * GLP["charge_limit"])
 255
 256     CP.fragment_mass_tolerance = GLP["fragment_mass_tolerance"]
 257     CP.intensity_class_count = GLP["intensity_class_count"]
 258
 259     CP.minimum_peptide_length = GLP["min_peptide_length"]
 260
 261     # CP.ln_factorial[n] == ln(n!)
 262     CP.ln_factorial.resize(int(GLP["max_parent_spectrum_mass"]
 263                                / GLP["fragment_mass_tolerance"] + 100), 0.0)
 264     for n in range(2, len(CP.ln_factorial)):
 265         CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
 266
 267     # FIX or eliminate
 268     #CP.estimate_only = bool(options.estimate_only)
 269     #CP.show_progress = bool(options.show_progress)
 270     CP.estimate_only = False
 271     CP.show_progress = False
 272
 273     return regime_manifest
 274
 275
 276 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
 277     if not mod_tree:
 278         if 0 < len(conjuncts) <= limit:
 279             yield conjuncts
 280         return
 281     first, rest = mod_tree[0], mod_tree[1:]
 282     if isinstance(first, list):
 283         for x in enumerate_disjunction(first, limit):
 284             for y in enumerate_conjunction(rest, limit, conjuncts + x):
 285                 yield y
 286     else:
 287         for y in enumerate_conjunction(rest, limit, conjuncts):
 288             yield y
 289         for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
 290             yield y
 291
 292 def enumerate_disjunction(mod_tree, limit=sys.maxint):
 293     """Generates the conjuncts for mod_tree that are no longer than limit.
 294
 295     >>> list(enumerate_disjunction([['a'],['b'],['c']]))
 296     [[], ['a'], ['b'], ['c']]
 297     >>> list(enumerate_disjunction([[1,2,3]]))
 298     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
 299     >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
 300     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
 301     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
 302     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
 303     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
 304     [[]]
 305
 306     """
 307     assert isinstance(mod_tree, list)
 308     yield []
 309     for b in mod_tree:
 310         for s in enumerate_conjunction(b, limit):
 311             yield s
 312
 313 def get_mod_conjunct_triples(mod_tree, limit, mass_regimes):
 314     """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
 315     (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
 316     ordered by increasing number of conjuncts, and with duplicates within and
 317     across removed.  A mass table is also appended to each conjunct.
 318     """
 319     # FIX: is there a more elegant way or place for all this?
 320     def enmass(t):
 321         def rmass(regime_index, par_frag, sign, delta, is_mono):
 322             global MASS_REGIME_ATOMIC_MASSES
 323             if isinstance(delta, str):
 324                 if is_mono:
 325                     regime_index = 0
 326                 return (sign * formula_mass(delta,
 327                            MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
 328             else:
 329                 return sign * delta
 330
 331         sign, delta, is_mono = t[:3]
 332         return t + (tuple((rmass(r, 0, sign, delta, is_mono),
 333                            rmass(r, 1, sign, delta, is_mono))
 334                           for r in range(len(mass_regimes))),)
 335
 336     def triple(c):
 337         Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
 338         Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
 339         assert len(Ns) <= 1 and len(Cs) <= 1
 340         rest = [ enmass(x) for x in c if x[3] not in '[]' ]
 341         rest = tuple(sorted(list(frozenset(rest))))
 342         return (Ns, Cs, rest)
 343
 344     return sorted(list(frozenset(triple(conjunct)
 345                                  for conjunct
 346                                  in enumerate_disjunction(mod_tree, limit))),
 347                   key=lambda x: (sum(len(y) for y in x), x))
 348
 349
 350 def cleavage_motif_re(motif):
 351     """Return (regexp, pos), where regexp is a regular expression that will
 352     match a cleavage motif, and pos is the position of the cleavage with
 353     respect to the match (or None).  (The RE actually matches one character,
 354     the rest matching as lookahead, so that re.finditer will find all
 355     overlapping matches.)
 356
 357     >>> cleavage_motif_re('[KR]|{P}')
 358     ('[KR](?=[^P])', 1)
 359     >>> cleavage_motif_re('[X]|[X]')
 360     ('.(?=.)', 1)
 361
 362     """
 363     cleavage_pos = None
 364     re_parts = []
 365     motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
 366     parts = [ p[0] for p in motif_re.findall(motif) ]
 367     if ''.join(parts) != motif:
 368         chase_error('invalid cleavage motif pattern')
 369     i = 0
 370     for part in parts:
 371         if part == '|':
 372             if cleavage_pos != None:
 373                 chase_error("invalid cleavage motif pattern" " (multiple '|'s)")
 374             cleavage_pos = i
 375             continue
 376         if part == '[X]':
 377             re_parts.append('.')
 378         elif part[0] == '[':
 379             re_parts.append('[%s]' % part[1:-1])
 380         elif part[0] == '{':
 381             re_parts.append('[^%s]' % part[1:-1])
 382         else:
 383             assert False, "unknown cleavage motif syntax"
 384         i += 1
 385     if len(re_parts) == 0:
 386         re_pattern = '.'
 387     elif len(re_parts) == 1:
 388         re_pattern = re_parts[0]
 389     else:
 390         re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
 391     return (re_pattern, cleavage_pos)
 392
 393
 394 def file_sha1(filename):
 395     """Return the (hex) SHA1 digest of the given file."""
 396     try:
 397         import hashlib
 398     except:
 399         return "no checksum--libs missing"
 400     h = hashlib.sha1()
 401     with open(filename) as f:
 402         h.update(f.read())
 403     return h.hexdigest()
 404
 405
 406 def read_fasta_files(filenames):
 407     """Yield (locusname, defline, sequence, filename) tuples as read from
 408     FASTA files (uppercasing sequence)."""
 409
 410     loci_seen = set()
 411
 412     for filename in filenames:
 413         locusname, defline = None, None
 414         seqs = []
 415         with open(filename) as f:
 416             for line in f:
 417                 line = line.strip()
 418                 if line[:1] == '>':
 419                     if defline != None:
 420                         yield (locusname, defline, ''.join(seqs), filename)
 421                     elif seqs:
 422                         chase_error("bad format: line precedes initial defline"
 423                                     " in '%s'", filename)
 424                     defline = line[1:]
 425                     locusname_rest = defline.split(None, 1)
 426                     if not locusname_rest:
 427                         chase_error("empty locus name not allowed in '%s'",
 428                                     filename)
 429                     locusname = locusname_rest[0]
 430                     if locusname in loci_seen:
 431                         chase_error("locus name '%s' is not unique in the"
 432                                     " search database(s) in '%s'", locusname,
 433                                     filename)
 434                     loci_seen.add(locusname)
 435                     seqs = []
 436                 else:
 437                     seqs.append(line.upper())
 438             if defline:
 439                 yield (locusname, defline, ''.join(seqs), filename)
 440
 441