greylag.py

   1 # common greylag functions and constants
   2
   3 ##  greylag, a collection of programs for MS/MS protein analysis
   4 ##  Copyright (C) 2006-2008  Stowers Institute for Medical Research
   5 ##
   6 ##  This program is free software: you can redistribute it and/or modify
   7 ##  it under the terms of the GNU General Public License as published by
   8 ##  the Free Software Foundation, either version 3 of the License, or
   9 ##  (at your option) any later version.
  10 ##
  11 ##  This program is distributed in the hope that it will be useful,
  12 ##  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 ##  GNU General Public License for more details.
  15 ##
  16 ##  You should have received a copy of the GNU General Public License
  17 ##  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 ##
  19 ##  Contact: Mike Coleman
  20 ##           Stowers Institute for Medical Research
  21 ##           1000 East 50th Street
  22 ##           Kansas City, Missouri  64110
  23 ##           USA
  24
  25 from __future__ import with_statement
  26
  27
  28 import logging; from logging import debug, info, warning
  29 import math
  30 import re
  31 import sys
  32
  33 import cgreylag
  34
  35
  36 VERSION = "0.1.2"
  37
  38
  39 # prefix added to locus id's for decoy loci
  40 DEFAULT_DECOY_PREFIX = "SHUFFLED_"
  41
  42
  43 # handle to the singleton parameter object shared with the C++ module
  44 CP = cgreylag.cvar.parameters_the
  45
  46
  47 def set_logging(options):
  48     log_level = logging.WARNING
  49     if options.quiet:
  50         log_level = logging.ERROR
  51     if options.verbose:
  52         log_level = logging.INFO
  53     if options.debug:
  54         log_level = logging.DEBUG
  55     logfile = None
  56     if options.logfile:
  57         logfile = options.logfile
  58     logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
  59                         format=('%(asctime)s [%(process)d]'
  60                                 ' %(levelname)s: %(message)s'),
  61                         filename=logfile)
  62
  63
  64 # FIX: is this mono or avg?  (possible error here is ~0.0007 amu)
  65 PROTON_MASS =   1.007276
  66 ELECTRON_MASS = 0.000549                # ?
  67
  68 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
  69 # (It appears that NIST has started to copyright and require licenses for use
  70 # of some of its data, but these values appear not to fall under copyright,
  71 # at least for now.)
  72 MONOISOTOPIC_ATOMIC_MASS = {
  73     'H' :  1.00782503214,
  74     'C' : 12.00000000,
  75     'N' : 14.00307400529,
  76     'O' : 15.994914622115,
  77     'P' : 30.9737615120,
  78     'S' : 31.9720706912,
  79     }
  80
  81 # most prevalent only (1 in 1000)
  82 ISOTOPIC_ATOMIC_MASS = {                # prevalence (in %)
  83     'C13' : 13.003354837810,            # 1.078
  84     'N15' : 15.00010889849,             # 0.3687
  85     'O18' : 17.99916049,                # 0.20514
  86     'S33' : 32.9714585012,              # 0.762
  87     'S34' : 33.9678668311,              # 4.2928
  88     }
  89
  90 AVERAGE_ATOMIC_MASS = {
  91     'H' :  1.007947,
  92     'C' : 12.01078,
  93     'N' : 14.00672,
  94     'O' : 15.99943,
  95     'P' : 30.9737612,
  96     'S' : 32.0655,
  97     }
  98
  99 # The xtandem average residue masses are about 0.002 amu higher than those
 100 # calculated directly from the above average atomic masses.  None of the
 101 # chemists consulted knew of any reason why, aside from lack of precision in
 102 # the average atomic mass estimates.  This shouldn't matter very much, as
 103 # fragmentation calculations should all be monoisotopic, and we can always
 104 # widen the parent tolerance window a bit.
 105
 106
 107 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
 108     """Return the mass of formula, using the given mass regime (monoisotopic
 109     by default).
 110
 111     >>> formula_mass('H2O', { 'H':1, 'O':16 })
 112     18
 113     >>> # monoisotopic mass of glycine
 114     >>> str(round(formula_mass('C2H3ON'), 4))
 115     '57.0215'
 116
 117     """
 118     parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
 119     # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
 120     return sum(atomic_mass[parts[i]] * int(parts[i+1])
 121                for i in range(0, len(parts), 2))
 122
 123 # FIX: selenocysteine (U), etc
 124 # residue -> formula
 125 RESIDUE_FORMULA = {
 126     'A' : "C3H5ON",
 127     'C' : "C3H5ONS",
 128     'D' : "C4H5O3N",
 129     'E' : "C5H7O3N",
 130     'F' : "C9H9ON",
 131     'G' : "C2H3ON",
 132     'H' : "C6H7ON3",
 133     'I' : "C6H11ON",
 134     'K' : "C6H12ON2",
 135     'L' : "C6H11ON",
 136     'M' : "C5H9ONS",
 137     'N' : "C4H6O2N2",
 138     'P' : "C5H7ON",
 139     'Q' : "C5H8O2N2",
 140     'R' : "C6H12ON4",
 141     'S' : "C3H5O2N",
 142     'T' : "C4H7O2N",
 143     'V' : "C5H9ON",
 144     'W' : "C11H10ON2",
 145     'Y' : "C9H9O2N",
 146     }
 147
 148 RESIDUES = sorted(RESIDUE_FORMULA.keys())
 149 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
 150
 151
 152 # [0][1] -> 'H' -> fragment mass of H for regime 0
 153 MASS_REGIME_ATOMIC_MASSES = []
 154
 155 def dump_mass_regime_atomic_masses():
 156     return MASS_REGIME_ATOMIC_MASSES
 157
 158
 159 def mass_regime_atomic_masses(spec):
 160     """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
 161     names to masses.
 162     """
 163     name, isotopes = spec
 164     assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
 165     if name == 'MONO':
 166         r = MONOISOTOPIC_ATOMIC_MASS.copy()
 167     else:
 168         r = AVERAGE_ATOMIC_MASS.copy()
 169     if isotopes:
 170         iname, prevalence = isotopes[0]
 171         assert iname == 'N15' and 0 <= prevalence <= 1
 172         # this is a simplification, but additional accuracy pointless?
 173         if name == 'MONO':
 174             r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
 175         else:
 176             r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
 177     return r
 178
 179
 180 def initialize_spectrum_parameters(options, GLP, mass_regimes, fixed_mod_map):
 181     """Initialize parameters known to the spectrum module.
 182     fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
 183     """
 184
 185     # This function can be called multiple times to reinitialize for a new
 186     # search, though probably everything else (previous matches) ought to be
 187     # "forgotten" at that point, too.
 188
 189     # This is the size of vectors that are indexed by residues (A-Z) or
 190     # special characters ('[]').
 191     RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
 192
 193     # These are currently monoisotopic.  (deuterium pointless?)
 194     CP.proton_mass = PROTON_MASS
 195     CP.hydrogen_mass = formula_mass("H")
 196
 197     regime_manifest = []
 198
 199     global MASS_REGIME_ATOMIC_MASSES
 200     # clear previous state
 201     MASS_REGIME_ATOMIC_MASSES = []
 202     CP.parent_mass_regime.clear()
 203     CP.fragment_mass_regime.clear()
 204
 205     for rn, regime_pair in enumerate(mass_regimes):
 206         assert len(regime_pair) == 2    # parent and fragment
 207         info('mass regime: %s', regime_pair)
 208         MASS_REGIME_ATOMIC_MASSES.append([])
 209         for n, regime in enumerate(regime_pair):
 210             atmass = mass_regime_atomic_masses(regime)
 211             MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
 212             creg = cgreylag.mass_regime_parameters()
 213
 214             creg.hydroxyl_mass = formula_mass("OH", atmass)
 215             creg.water_mass = formula_mass("H2O", atmass)
 216             creg.ammonia_mass = formula_mass("NH3", atmass)
 217
 218             creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
 219
 220             for r in RESIDUES_W_BRACKETS:
 221                 m = 0
 222                 if r in RESIDUES:
 223                     m = (formula_mass(RESIDUE_FORMULA[r], atmass)
 224                          + GLP["mass_regime_debug_delta"])
 225                 if n == 1:
 226                     regime_manifest.append((rn, r, m))
 227                 rmod = fixed_mod_map.get(r)
 228                 if rmod:
 229                     if isinstance(rmod[1], str):
 230                         if rmod[2]:
 231                             m += rmod[0] * formula_mass(rmod[1])
 232                         else:
 233                             m += rmod[0] * formula_mass(rmod[1], atmass)
 234                     else:
 235                         m += rmod[0] * rmod[1]
 236                 creg.fixed_residue_mass[ord(r)] = m
 237             # assuming these are monoisotopic (not regime)
 238             creg.fixed_residue_mass[ord('[')] += formula_mass("H")
 239             creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
 240             if not n:
 241                 CP.parent_mass_regime.append(creg)
 242             else:
 243                 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
 244                 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
 245                 CP.fragment_mass_regime.append(creg)
 246     for r in RESIDUES_W_BRACKETS:
 247         info('fixed mass %s: %s', r,
 248              [ ("%.6f" % CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
 249                 "%.6f" % CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
 250                for rn in range(len(mass_regimes)) ])
 251     for r in RESIDUES:
 252         for rn in range(len(mass_regimes)):
 253             # check for physically impossible/meaningless masses
 254             if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 255                 raise ValueError('bogus parent mass specification for %s' % r)
 256             if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 257                 raise ValueError('bogus fragment mass specification for %s' % r)
 258
 259     CP.parent_mass_tolerance_1 = GLP["parent_mz_tolerance"]
 260     CP.parent_mass_tolerance_max = (GLP["parent_mz_tolerance"]
 261                                     * GLP["charge_limit"])
 262
 263     CP.fragment_mass_tolerance = GLP["fragment_mass_tolerance"]
 264     CP.intensity_class_count = GLP["intensity_class_count"]
 265
 266     CP.minimum_peptide_length = GLP["min_peptide_length"]
 267
 268     # CP.ln_factorial[n] == ln(n!)
 269     CP.ln_factorial.resize(int(GLP["max_parent_spectrum_mass"]
 270                                / GLP["fragment_mass_tolerance"] + 100), 0.0)
 271     for n in range(2, len(CP.ln_factorial)):
 272         CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
 273
 274     # FIX or eliminate
 275     #CP.estimate_only = bool(options.estimate_only)
 276     #CP.show_progress = bool(options.show_progress)
 277     CP.estimate_only = False
 278     CP.show_progress = False
 279
 280     return regime_manifest
 281
 282
 283 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
 284     if not mod_tree:
 285         if 0 < len(conjuncts) <= limit:
 286             yield conjuncts
 287         return
 288     first, rest = mod_tree[0], mod_tree[1:]
 289     if isinstance(first, list):
 290         for x in enumerate_disjunction(first, limit):
 291             for y in enumerate_conjunction(rest, limit, conjuncts + x):
 292                 yield y
 293     else:
 294         for y in enumerate_conjunction(rest, limit, conjuncts):
 295             yield y
 296         for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
 297             yield y
 298
 299 def enumerate_disjunction(mod_tree, limit=sys.maxint):
 300     """Generates the conjuncts for mod_tree that are no longer than limit.
 301
 302     >>> list(enumerate_disjunction([['a'],['b'],['c']]))
 303     [[], ['a'], ['b'], ['c']]
 304     >>> list(enumerate_disjunction([[1,2,3]]))
 305     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
 306     >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
 307     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
 308     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
 309     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
 310     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
 311     [[]]
 312
 313     """
 314     assert isinstance(mod_tree, list)
 315     yield []
 316     for b in mod_tree:
 317         for s in enumerate_conjunction(b, limit):
 318             yield s
 319
 320 def get_mod_conjunct_triples(mod_tree, limit, mass_regimes):
 321     """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
 322     (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
 323     ordered by increasing number of conjuncts, and with duplicates within and
 324     across removed.  A mass table is also appended to each conjunct.
 325     """
 326     # FIX: is there a more elegant way or place for all this?
 327     def enmass(t):
 328         def rmass(regime_index, par_frag, sign, delta, is_mono):
 329             global MASS_REGIME_ATOMIC_MASSES
 330             if isinstance(delta, str):
 331                 if is_mono:
 332                     regime_index = 0
 333                 return (sign * formula_mass(delta,
 334                            MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
 335             else:
 336                 return sign * delta
 337
 338         sign, delta, is_mono = t[:3]
 339         return t + (tuple((rmass(r, 0, sign, delta, is_mono),
 340                            rmass(r, 1, sign, delta, is_mono))
 341                           for r in range(len(mass_regimes))),)
 342
 343     def triple(c):
 344         Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
 345         Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
 346         assert len(Ns) <= 1 and len(Cs) <= 1
 347         rest = [ enmass(x) for x in c if x[3] not in '[]' ]
 348         rest = tuple(sorted(list(frozenset(rest))))
 349         return (Ns, Cs, rest)
 350
 351     return sorted(list(frozenset(triple(conjunct)
 352                                  for conjunct
 353                                  in enumerate_disjunction(mod_tree, limit))),
 354                   key=lambda x: (sum(len(y) for y in x), x))
 355
 356
 357 def cleavage_motif_re(motif):
 358     """Return (regexp, pos), where regexp is a regular expression that will
 359     match a cleavage motif, and pos is the position of the cleavage with
 360     respect to the match (or None).  (The RE actually matches one character,
 361     the rest matching as lookahead, so that re.finditer will find all
 362     overlapping matches.)
 363
 364     >>> cleavage_motif_re('[KR]|{P}')
 365     ('[KR](?=[^P])', 1)
 366     >>> cleavage_motif_re('[X]|[X]')
 367     ('.(?=.)', 1)
 368
 369     """
 370     cleavage_pos = None
 371     re_parts = []
 372     motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
 373     parts = [ p[0] for p in motif_re.findall(motif) ]
 374     if ''.join(parts) != motif:
 375         chase_error('invalid cleavage motif pattern')
 376     i = 0
 377     for part in parts:
 378         if part == '|':
 379             if cleavage_pos != None:
 380                 chase_error("invalid cleavage motif pattern" " (multiple '|'s)")
 381             cleavage_pos = i
 382             continue
 383         if part == '[X]':
 384             re_parts.append('.')
 385         elif part[0] == '[':
 386             re_parts.append('[%s]' % part[1:-1])
 387         elif part[0] == '{':
 388             re_parts.append('[^%s]' % part[1:-1])
 389         else:
 390             assert False, "unknown cleavage motif syntax"
 391         i += 1
 392     if len(re_parts) == 0:
 393         re_pattern = '.'
 394     elif len(re_parts) == 1:
 395         re_pattern = re_parts[0]
 396     else:
 397         re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
 398     return (re_pattern, cleavage_pos)
 399
 400
 401 def file_sha1(filename):
 402     """Return the (hex) SHA1 digest of the given file."""
 403     try:
 404         import hashlib
 405     except:
 406         return "no checksum--libs missing"
 407     h = hashlib.sha1()
 408     with open(filename) as f:
 409         h.update(f.read())
 410     return h.hexdigest()
 411
 412
 413 def read_fasta_file(f):
 414     """Yield (locusname, defline, sequence) tuples as read from the given
 415     FASTA file (uppercasing sequence)."""
 416
 417     locusname, defline = None, None
 418     seqs = []
 419
 420     for line in f:
 421         line = line.strip()
 422         if line[:1] == '>':
 423             if defline != None:
 424                 yield (locusname, defline, ''.join(seqs))
 425             defline = line[1:]
 426             locusname_rest = defline.split(None, 1)
 427             locusname = locusname_rest[0] if locusname_rest else ''
 428             seqs = []
 429         else:
 430             if defline == None:
 431                 chase_error("bad format: line precedes initial defline"
 432                             " in '%s'", (f.name if hasattr(f, 'name')
 433                                          else 'unknown FASTA file'))
 434             seqs.append(line.upper())
 435     if defline != None:
 436         yield (locusname, defline, ''.join(seqs))
 437
 438
 439 def read_fasta_files(filenames):
 440     """Yield (locusname, defline, sequence, filename) tuples as read from
 441     FASTA files (uppercasing sequence).  An error is given if locusname is
 442     empty or not unique across all sequences."""
 443
 444     loci_seen = set()
 445
 446     for filename in filenames:
 447         with open(filename) as f:
 448             for locusname, defline, sequence in read_fasta_file(f):
 449                 if not locusname:
 450                     chase_error("empty locus name not allowed in '%s'",
 451                                 filename)
 452                 if locusname in loci_seen:
 453                     chase_error("locus name '%s' is not unique in the search"
 454                                 " database(s) in '%s'", locusname, filename)
 455                 loci_seen.add(locusname)
 456
 457                 yield (locusname, defline, sequence, filename)