greylag_grind.py

   1 #!/usr/bin/env greylag-python
   2
   3 '''This program does the actual work to search mass spectra against a sequence
   4 database.  <job-id> is a unique identifier used as a prefix for work and
   5 output directories.  The <configuration-file> contains program options.  The
   6 spectra are in <ms2-file>s; greylag-index-spectra must already have been run
   7 on them.
   8
   9 This program can operate in standalone mode (useful for testing) or command
  10 mode.  If the --work-slice option is given, that slice will be searched in
  11 standalone mode, then the program will exit.  Otherwise, the program will
  12 process commands until an exit command is received.
  13
  14 '''
  15
  16 from __future__ import with_statement
  17
  18 __copyright__ = '''
  19     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  20
  21     This program is free software; you can redistribute it and/or modify
  22     it under the terms of the GNU General Public License as published by
  23     the Free Software Foundation; either version 2 of the License, or
  24     (at your option) any later version.
  25
  26     This program is distributed in the hope that it will be useful,
  27     but WITHOUT ANY WARRANTY; without even the implied warranty of
  28     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  29     GNU General Public License for more details.
  30
  31     You should have received a copy of the GNU General Public License along
  32     with this program; if not, write to the Free Software Foundation, Inc.,
  33     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  34 '''
  35
  36 ##############################################################################
  37 ##    "Simplicity is prerequisite for reliability" - Edsger W. Dijkstra     ##
  38 ##############################################################################
  39
  40
  41 __version__ = "0.0"
  42
  43
  44 import ConfigParser
  45 import contextlib
  46 import cPickle
  47 import fileinput
  48 import gzip
  49 import logging
  50 from logging import debug, info, warning
  51 import math
  52 import optparse
  53 import os
  54 from pprint import pprint, pformat
  55 import re
  56 from socket import gethostname
  57 import sys
  58
  59 import cgreylag
  60
  61
  62 # Try to drop dead immediately on SIGINT (control-C), instead of normal Python
  63 # KeyboardInterrupt processing, since we may spend long periods of time in
  64 # uninterruptible C++ calls.  Also die immediately on SIGPIPE.
  65 try:
  66     import signal
  67     signal.signal(signal.SIGINT, signal.SIG_DFL)
  68     signal.signal(signal.SIGPIPE, signal.SIG_DFL)
  69 except:
  70     pass
  71
  72
  73 def error(s, *args):
  74     "fatal error"
  75     # if we're unit testing, just throw an exception
  76     if __name__ != "__main__":
  77         raise Exception((s + " (fatal error)") % args)
  78     logging.error(s, *args)
  79     sys.exit(1)
  80
  81
  82 def fileerror(s, *args):
  83     error(s + (", at line %s of file '%s'"
  84                % (fileinput.filelineno(), fileinput.filename())),
  85           *args)
  86
  87
  88 # name -> value map of processed XML input parameters
  89 XTP = {}
  90
  91 # handle to the singleton parameter object shared with the C++ module
  92 CP = cgreylag.cvar.parameters_the
  93
  94 # FIX: is this mono or avg?  (possible error here is ~0.0007 amu)
  95 PROTON_MASS =   1.007276
  96 ELECTRON_MASS = 0.000549                # ?
  97
  98 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
  99 MONOISOTOPIC_ATOMIC_MASS = {
 100     'H' :  1.00782503214,
 101     'C' : 12.00000000,
 102     'N' : 14.00307400529,
 103     'O' : 15.994914622115,
 104     'P' : 30.9737615120,
 105     'S' : 31.9720706912,
 106     }
 107
 108 # most prevalent only (1 in 1000)
 109 ISOTOPIC_ATOMIC_MASS = {                # prevalence (in %)
 110     'C13' : 13.003354837810,            # 1.078
 111     'N15' : 15.00010889849,             # 0.3687
 112     'O18' : 17.99916049,                # 0.20514
 113     'S33' : 32.9714585012,              # 0.762
 114     'S34' : 33.9678668311,              # 4.2928
 115     }
 116
 117 AVERAGE_ATOMIC_MASS = {
 118     'H' :  1.007947,
 119     'C' : 12.01078,
 120     'N' : 14.00672,
 121     'O' : 15.99943,
 122     'P' : 30.9737612,
 123     'S' : 32.0655,
 124     }
 125
 126 # The xtandem average residue masses are about 0.002 amu higher than those
 127 # calculated directly from the above average atomic masses.  None of the
 128 # chemists consulted knew of any reason why, aside from lack of precision in
 129 # the average atomic mass estimates.  This shouldn't matter very much, as
 130 # fragmentation calculations should all be monoisotopic, and we can always
 131 # widen the parent tolerance window a bit.
 132
 133
 134 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
 135     """Return the mass of formula, using the given mass regime (monoisotopic
 136     by default).
 137
 138     >>> formula_mass('H2O', { 'H':1, 'O':16 })
 139     18
 140     >>> # monoisotopic mass of glycine
 141     >>> str(round(formula_mass('C2H3ON'), 4))
 142     '57.0215'
 143
 144     """
 145     parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
 146     # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
 147     return sum(atomic_mass[parts[i]] * int(parts[i+1])
 148                for i in range(0, len(parts), 2))
 149
 150 # FIX: selenocysteine (U), etc
 151 # residue -> formula
 152 RESIDUE_FORMULA = {
 153     'A' : "C3H5ON",
 154     'C' : "C3H5ONS",
 155     'D' : "C4H5O3N",
 156     'E' : "C5H7O3N",
 157     'F' : "C9H9ON",
 158     'G' : "C2H3ON",
 159     'H' : "C6H7ON3",
 160     'I' : "C6H11ON",
 161     'K' : "C6H12ON2",
 162     'L' : "C6H11ON",
 163     'M' : "C5H9ONS",
 164     'N' : "C4H6O2N2",
 165     'P' : "C5H7ON",
 166     'Q' : "C5H8O2N2",
 167     'R' : "C6H12ON4",
 168     'S' : "C3H5O2N",
 169     'T' : "C4H7O2N",
 170     'V' : "C5H9ON",
 171     'W' : "C11H10ON2",
 172     'Y' : "C9H9O2N",
 173     }
 174
 175 RESIDUES = sorted(RESIDUE_FORMULA.keys())
 176 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
 177
 178
 179 # accessed as a global variable (FIX?)
 180 # [0][1] -> 'H' -> fragment mass of H for regime 0
 181 MASS_REGIME_ATOMIC_MASSES = []
 182
 183
 184 def mass_regime_atomic_masses(spec):
 185     """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
 186     names to masses.
 187     """
 188     name, isotopes = spec
 189     assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
 190     if name == 'MONO':
 191         r = MONOISOTOPIC_ATOMIC_MASS.copy()
 192     else:
 193         r = AVERAGE_ATOMIC_MASS.copy()
 194     if isotopes:
 195         iname, prevalence = isotopes[0]
 196         assert iname == 'N15' and 0 <= prevalence <= 1
 197         # this is a simplification, but additional accuracy pointless?
 198         if name == 'MONO':
 199             r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
 200         else:
 201             r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
 202     return r
 203
 204
 205 def initialize_spectrum_parameters(options, mass_regimes, fixed_mod_map):
 206     """Initialize parameters known to the spectrum module.
 207     fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
 208     """
 209
 210     debug('fixed_mod_map: %s', fixed_mod_map)
 211     # This is the size of vectors that are indexed by residues (A-Z) or
 212     # special characters ('[]').
 213     RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
 214
 215     # These are currently monoisotopic.  (deuterium pointless?)
 216     CP.proton_mass = PROTON_MASS
 217     CP.hydrogen_mass = formula_mass("H")
 218
 219     global MASS_REGIME_ATOMIC_MASSES
 220     for regime_pair in mass_regimes:
 221         assert len(regime_pair) == 2    # parent and fragment
 222         info('mass regime: %s', regime_pair)
 223         MASS_REGIME_ATOMIC_MASSES.append([])
 224         for n, regime in enumerate(regime_pair):
 225             atmass = mass_regime_atomic_masses(regime)
 226             MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
 227             creg = cgreylag.mass_regime_parameters()
 228
 229             creg.hydroxyl_mass = formula_mass("OH", atmass)
 230             creg.water_mass = formula_mass("H2O", atmass)
 231             creg.ammonia_mass = formula_mass("NH3", atmass)
 232
 233             creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
 234
 235             for r in RESIDUES_W_BRACKETS:
 236                 m = 0
 237                 if r in RESIDUES:
 238                     m = formula_mass(RESIDUE_FORMULA[r], atmass)
 239                 rmod = fixed_mod_map.get(r)
 240                 if rmod:
 241                     if isinstance(rmod[1], str):
 242                         if rmod[2]:
 243                             m += rmod[0] * formula_mass(rmod[1])
 244                         else:
 245                             m += rmod[0] * formula_mass(rmod[1], atmass)
 246                     else:
 247                         m += rmod[0] * rmod[1]
 248                 creg.fixed_residue_mass[ord(r)] = m
 249             # assuming these are monoisotopic (not regime)
 250             creg.fixed_residue_mass[ord('[')] += formula_mass("H")
 251             creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
 252             if not n:
 253                 CP.parent_mass_regime.append(creg);
 254             else:
 255                 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
 256                 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
 257                 CP.fragment_mass_regime.append(creg);
 258     for r in RESIDUES_W_BRACKETS:
 259         info('fixed_mass %s: %s', r,
 260              [ "%.6f/%.6f"
 261                % (CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
 262                   CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
 263                for rn in range(len(mass_regimes)) ])
 264     for r in RESIDUES:
 265         for rn in range(len(mass_regimes)):
 266             # physically impossible (and results would be garbage)
 267             if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 268                 raise ValueError('bogus parent mass specification for %s' % r)
 269             if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 270                 raise ValueError('bogus parent mass specification for %s' % r)
 271
 272     CP.parent_mass_tolerance_1 = XTP["parent_mz_tolerance"]
 273     CP.parent_mass_tolerance_max = (XTP["parent_mz_tolerance"]
 274                                     * XTP["charge_limit"])
 275
 276     CP.fragment_mass_tolerance = XTP["fragment_mass_tolerance"]
 277     CP.intensity_class_count = XTP["intensity_class_count"]
 278
 279     CP.minimum_peptide_length = XTP["min_peptide_length"]
 280
 281     # CP.ln_factorial[n] == ln(n!)
 282     CP.ln_factorial.resize(int(XTP["max_parent_spectrum_mass"]
 283                                / XTP["fragment_mass_tolerance"] + 100), 0.0)
 284     for n in range(2, len(CP.ln_factorial)):
 285         CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
 286
 287     CP.estimate_only = bool(options.estimate_only)
 288     CP.show_progress = bool(options.show_progress)
 289
 290
 291 def cleavage_motif_re(motif):
 292     """Return (regexp, pos), where regexp is a regular expression that will
 293     match a cleavage motif, and pos is the position of the cleavage with
 294     respect to the match (or None).  (The RE actually matches one character,
 295     the rest matching as lookahead, so that re.finditer will find all
 296     overlapping matches.)
 297
 298     >>> cleavage_motif_re('[KR]|{P}')
 299     ('[KR](?=[^P])', 1)
 300     >>> cleavage_motif_re('[X]|[X]')
 301     ('.(?=.)', 1)
 302
 303     """
 304     cleavage_pos = None
 305     re_parts = []
 306     motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
 307     parts = [ p[0] for p in motif_re.findall(motif) ]
 308     if ''.join(parts) != motif:
 309         raise ValueError('invalid cleavage motif pattern')
 310     i = 0
 311     for part in parts:
 312         if part == '|':
 313             if cleavage_pos != None:
 314                 raise ValueError("invalid cleavage motif pattern"
 315                                  " (multiple '|'s)")
 316             cleavage_pos = i
 317             continue
 318         if part == '[X]':
 319             re_parts.append('.')
 320         elif part[0] == '[':
 321             re_parts.append('[%s]' % part[1:-1])
 322         elif part[0] == '{':
 323             re_parts.append('[^%s]' % part[1:-1])
 324         else:
 325             assert False, "unknown cleavage motif syntax"
 326         i += 1
 327     if len(re_parts) == 0:
 328         re_pattern = '.'
 329     elif len(re_parts) == 1:
 330         re_pattern = re_parts[0]
 331     else:
 332         re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
 333     return (re_pattern, cleavage_pos)
 334
 335
 336 def generate_cleavage_points(cleavage_re, cleavage_pos, sequence):
 337     """Yields the offsets of the cleavages in sequence.  The endpoints are
 338     always included, by convention.
 339
 340     >>> list(generate_cleavage_points(re.compile('[KR](?=[^P])'), 1,
 341     ...                               'ARBCKDEKPF'))
 342     [0, 2, 5, 10]
 343
 344     """
 345     yield 0
 346     for m in cleavage_re.finditer(sequence):
 347         p = m.start() + cleavage_pos
 348         if 0 < p < len(sequence):
 349             yield p
 350     yield len(sequence)
 351
 352
 353 AA_SEQUENCE = re.compile(r'[ARNDCQEGHILKMFPSTWYV]+')
 354
 355 def split_sequence_into_aa_runs(idno, locusname, defline, sequence, filename):
 356     """Returns a tuple (idno, start, locusname, defline, seq, filename) for
 357     each contiguous run of residues in sequence, where 'start' is the position
 358     of 'seq' in 'sequence'.
 359
 360     >>> pprint(split_sequence_into_aa_runs(123, 'ln', 'ln defline',
 361     ...                                    'STSS*DEFABA', 'filename'))
 362     [(123, 0, 'ln', 'ln defline', 'STSS', 'filename'),
 363      (123, 5, 'ln', 'ln defline', 'DEFA', 'filename'),
 364      (123, 10, 'ln', 'ln defline', 'A', 'filename')]
 365
 366     """
 367     return [ (idno, m.start(), locusname, defline, m.group(), filename)
 368              for n, m in enumerate(AA_SEQUENCE.finditer(sequence)) ]
 369
 370
 371 def read_fasta_files(filenames):
 372     """Yield (locusname, defline, sequence, filename) tuples as read from
 373     FASTA files (uppercasing sequence)."""
 374
 375     loci_seen = set()
 376     locusname, defline = None, None
 377     seqs = []
 378
 379     # reset fileinput first, in case it's been called before
 380     try:
 381         fileinput.close()
 382     except RuntimeError:
 383         pass
 384     for line in fileinput.input(filenames):
 385         line = line.strip()
 386         if line[:1] == '>':
 387             if defline != None:
 388                 yield (locusname, defline, ''.join(seqs), fileinput.filename())
 389             elif seqs:
 390                 fileerror("bad format: line precedes initial defline")
 391             defline = line[1:]
 392             locusname_rest = defline.split(None, 1)
 393             if not locusname_rest:
 394                 error("empty locus name not allowed")
 395             locusname = locusname_rest[0]
 396             if locusname in loci_seen:
 397                 error("locus name '%s' is not unique in the search database(s)"
 398                       % locusname)
 399             loci_seen.add(locusname)
 400             seqs = []
 401         else:
 402             seqs.append(line.upper())
 403     if defline:
 404         yield (locusname, defline, ''.join(seqs), fileinput.filename())
 405
 406
 407 def read_spectra_slice(spectrum_fns, offset_indices, slice):
 408     s_l, s_u = slice
 409     assert 0 <= s_l <= s_u <= 1
 410
 411     total_spectra = sum(len(oi) for oi in offset_indices)
 412     sp_l = int(float(s_l) * total_spectra)
 413     sp_u = int(float(s_u) * total_spectra)
 414
 415     seeing_l = 0
 416     spectra = []
 417
 418     for f_no, (fn, oi) in enumerate(zip(spectrum_fns, offset_indices)):
 419         seeing_u = seeing_l + len(oi)
 420         f_l = max(sp_l, seeing_l)
 421         f_u = min(sp_u, seeing_u)
 422
 423         if f_l < f_u:
 424             b_l = oi[f_l - seeing_l]
 425             b_u = oi[f_u - seeing_l] if f_u - seeing_l < len(oi) else -1 # oo
 426
 427             with open(fn) as f:
 428                 fsp = cgreylag.spectrum.read_spectra_from_ms2(f, f_no,
 429                                                               b_l, b_u)
 430                 spectra.extend(fsp)
 431         seeing_l = seeing_u
 432
 433     return spectra
 434
 435
 436 # XML parameter file processing
 437
 438 # FIX: This parsing is way too complex.  How to simplify?
 439
 440 def mass_regime_part(part_specification):
 441     """Parse a single mass regime specification part.
 442
 443     >>> mass_regime_part('MONO')
 444     ('MONO', [])
 445     >>> mass_regime_part('AVG')
 446     ('AVG', [])
 447     >>> mass_regime_part('MONO(N15@87.5%)')
 448     ('MONO', [('N15', 0.875)])
 449
 450     """
 451     ps = [ x.strip() for x in part_specification.partition('(') ]
 452     if ps[0] not in ('MONO', 'AVG'):
 453         raise ValueError("invalid mass regime list specification"
 454                          " (regime id must be 'MONO' or 'AVG')")
 455     if not ps[1]:
 456         return (ps[0], [])
 457     if ps[2][-1] != ')':
 458         raise ValueError("invalid mass regime list specification"
 459                          " (expected ')')")
 460     pps = [ x.strip() for x in ps[2][:-1].split(',') ]
 461     if len(pps) > 1:
 462         raise ValueError("invalid mass regime list specification"
 463                          " (multiple isotopes not yet implemented)")
 464     ppps = [ x.strip() for x in pps[0].partition('@') ]
 465     if not ppps[1]:
 466         raise ValueError("invalid mass regime list specification"
 467                          " (expected '@')")
 468     if ppps[0] not in ('N15',):
 469         raise ValueError("invalid mass regime list specification"
 470                          " (isotope id must currently be 'N15')")
 471     if ppps[2][-1] != '%':
 472         raise ValueError("invalid mass regime list specification"
 473                          " (expected '%')")
 474     prevalence = float(ppps[2][:-1]) / 100
 475     if not (0 <= prevalence <= 1):
 476         raise ValueError("invalid mass regime list specification"
 477                          " (prevalence must be in range 0-100%)")
 478     return (ps[0], [(ppps[0], prevalence)])
 479
 480 def mass_regime_list(mass_regime_list_specification):
 481     """Check and return a list of regime tuples (parent_regime,
 482     fragment_regime), where each regime is a tuple (id, [(isotope_id,
 483     prevalence), ...]). Multiple isotopes (when implemented) would be
 484     comma-separated.
 485
 486     >>> pprint(mass_regime_list('AVG/MONO;MONO;MONO(N15@75%)'))
 487     [[('AVG', []), ('MONO', [])],
 488      [('MONO', []), ('MONO', [])],
 489      [('MONO', [('N15', 0.75)]), ('MONO', [('N15', 0.75)])]]
 490
 491     """
 492     result = []
 493     for regspec in mass_regime_list_specification.split(';'):
 494         halves = [ x.strip() for x in regspec.split('/') ]
 495         if len(halves) > 2:
 496             raise ValueError("invalid mass regime list specification"
 497                              " (too many '/'?)")
 498         pr = [ mass_regime_part(h) for h in halves ]
 499         if len(pr) == 1:
 500             pr = [ pr[0], pr[0] ]
 501         result.append(pr)
 502     debug("mass regime list:\n%s", pformat(result))
 503
 504     # The first fragmentation regime should generally be MONO, so that
 505     # formulaic deltas with '!' do the expected thing.
 506     if result[0][1] != ('MONO', []):
 507         raise ValueError("first fragmentation regime was something other than"
 508                          " 'MONO' with no isotopes--this is almost certainly"
 509                          " not what was intended")
 510     return result
 511
 512
 513 def parse_mod_term(s, is_potential=False):
 514     """Parse a modification term, returning a tuple (sign, mod, fixed_regime,
 515     residues, description).
 516
 517     >>> parse_mod_term('-C2H3ON!@C')
 518     (-1, 'C2H3ON', True, 'C', None)
 519     >>> parse_mod_term('42@STY phosphorylation', is_potential=True)
 520     (1, 42.0, False, 'STY', 'phosphorylation')
 521
 522     """
 523
 524     m = re.match(r'^\s*(-|\+)?(([1-9][0-9.]*)|([A-Z][A-Z0-9]*))(!?)'
 525                  r'@([A-Z]+|\[|\])(\s+([A-Za-z0-9_]+))?\s*$', s)
 526     if not m:
 527         raise ValueError("invalid modification term specification"
 528                          " '%s'" % s)
 529     mg = m.groups()
 530     invalid_residues = set(mg[5]) - set(RESIDUES_W_BRACKETS)
 531     if invalid_residues:
 532         raise ValueError("invalid modification list specification"
 533                          " (invalid residues %s)" % list(invalid_residues))
 534     delta = mg[1]
 535     if mg[2]:
 536         delta = float(mg[1])
 537         if is_potential and abs(delta) < 0.0001:
 538             raise ValueError("invalid modification list specification"
 539                              " (delta '%s' is too small)" % delta)
 540     residues = mg[5]
 541     if not is_potential and len(residues) != 1:
 542         raise ValueError("invalid modification list specification '%s' (only"
 543                          " potential modifications may have multiple residues)"
 544                          % residues)
 545     if len(residues) != len(set(residues)):
 546         raise ValueError("invalid modification list specification"
 547                          " '%s' (duplicate residues prohibited)"
 548                          % residues)
 549     return (mg[0] == '-' and -1 or 1, delta, mg[4] == '!', residues, mg[7])
 550
 551
 552 def fixed_mod_list(specification):
 553     """Check and return a list of modification tuples.
 554
 555     >>> fixed_mod_list('57@C')
 556     [(1, 57.0, False, 'C', None)]
 557     >>> fixed_mod_list('57@C,CH!@N desc')
 558     [(1, 57.0, False, 'C', None), (1, 'CH', True, 'N', 'desc')]
 559     >>> fixed_mod_list('57@C,58@C')
 560     Traceback (most recent call last):
 561         ...
 562     ValueError: invalid modification list specification '['C', 'C']' (duplicate residues prohibited)
 563
 564     """
 565     if not specification:
 566         return []
 567     result = [ parse_mod_term(s) for s in specification.split(',') ]
 568     residues = [ x[3] for x in result ]
 569     # this check is across terms; the one above is within terms
 570     if len(residues) != len(set(residues)):
 571         raise ValueError("invalid modification list specification"
 572                          " '%s' (duplicate residues prohibited)"
 573                          % residues)
 574     debug("fixed_mod_list:\n%s", pformat(result))
 575     return result
 576
 577
 578 def parse_mod_basic_expression(s):
 579     s = s.strip()
 580     if s[0] == '(':
 581         tree, rest =  parse_mod_disjunction(s[1:])
 582         rest = rest.lstrip()
 583         if rest[:1] != ')':
 584             raise ValueError("invalid modification list specification"
 585                              " (expected matching ')')")
 586         return tree, rest[1:]
 587     parts = re.split(r'([;,()])', s, 1)
 588     if len(parts) == 1:
 589         term, rest = s, ''
 590     else:
 591         assert len(parts) == 3
 592         term, rest = parts[0], parts[1]+parts[2]
 593     return parse_mod_term(term, is_potential=True), rest
 594
 595 def parse_mod_conjunction(s):
 596     result = []
 597     while s:
 598         tree, s = parse_mod_basic_expression(s)
 599         result.append(tree)
 600         s = s.lstrip()
 601         if s[:1] != ',':
 602             break
 603         s = s[1:]
 604     return result, s
 605
 606 def parse_mod_disjunction(s):
 607     result = []
 608     while s:
 609         tree, s = parse_mod_conjunction(s)
 610         result.append(tree)
 611         s = s.lstrip()
 612         if s[:1] != ';':
 613             break
 614         s = s[1:]
 615     return result, s
 616
 617 def potential_mod_list(specification):
 618     """Check and return a tree of potential modification tuples.  Nodes at
 619     even (odd) levels are disjunctions (conjunctions).  (The top list is a
 620     disjunction.)
 621
 622     >>> potential_mod_list('')
 623     []
 624     >>> potential_mod_list('PO3H@STY; C2H2O@KST')
 625     [[(1, 'PO3H', False, 'STY', None)], [(1, 'C2H2O', False, 'KST', None)]]
 626     >>> potential_mod_list('PO3H@STY, C2H2O@KST')
 627     [[(1, 'PO3H', False, 'STY', None), (1, 'C2H2O', False, 'KST', None)]]
 628     >>> pprint(potential_mod_list('''(PO3H@STY phosphorylation;
 629     ...                               C2H2O@KST acetylation;
 630     ...                               CH2@AKST methylation),
 631     ...                              O@M oxidation'''))
 632     [[[[(1, 'PO3H', False, 'STY', 'phosphorylation')],
 633        [(1, 'C2H2O', False, 'KST', 'acetylation')],
 634        [(1, 'CH2', False, 'AKST', 'methylation')]],
 635       (1, 'O', False, 'M', 'oxidation')]]
 636
 637     """
 638     if not specification:
 639         return []
 640     tree, remainder = parse_mod_disjunction(specification)
 641     if remainder:
 642         raise ValueError("invalid modification list specification"
 643                          " (unexpected '%s')" % remainder)
 644     debug("potential_mod_list:\n%s", pformat(tree))
 645     return tree
 646
 647
 648 def p_positive(x): return x > 0
 649 def p_negative(x): return x < 0
 650 def p_nonnegative(x): return x >= 0
 651 def p_proportion(x): return 0 <= x <= 1
 652
 653 # Configuration file parameter specification
 654 #
 655 # name -> (type, default, check_fn)
 656 # type may be bool, int, float, str, modlist, or a tuple of values
 657 # default, as a string value, or None if value must be explicitly specified
 658 # check_fn is an optional function that returns True iff the value is valid
 659
 660 PARAMETER_INFO = {
 661     "databases" : (str, None),
 662     "decoy_locus_prefix" : (str, "SHUFFLED_"),
 663     "mass_regimes" : (mass_regime_list, "MONO"),
 664     "pervasive_mods" : (fixed_mod_list, ""),
 665     "potential_mods" : (potential_mod_list, ""),
 666     "potential_mod_limit" : (int, 2, p_nonnegative),
 667     "charge_limit" : (int, 3, p_positive),
 668     "min_peptide_length" : (int, 5, p_positive), # needed?
 669     "min_parent_spectrum_mass" : (float, 0, p_nonnegative),
 670     "max_parent_spectrum_mass" : (float, 10000, p_nonnegative),
 671     "TIC_cutoff_proportion" : (float, 0.98, p_proportion),
 672     "parent_mz_tolerance" : (float, 1.25, p_nonnegative),
 673     "fragment_mass_tolerance" : (float, 0.5, p_nonnegative),
 674     "intensity_class_count" : (int, 3, p_positive),
 675     "intensity_class_ratio" : (float, 2.0, p_positive), # really > 1.0?
 676     "best_result_count" : (int, 5, p_positive),
 677     }
 678
 679
 680 def validate_parameters(parameters, parameter_info=PARAMETER_INFO):
 681     """Verify that parameters are valid, have valid values, and correspond to
 682     currently implemented functionality.  Values are converted, default values
 683     are filled in, and the resulting name/value dict returned.
 684
 685     >>> sorted(validate_parameters({'spectrum, total peaks' : '40'},
 686     ...                            {'spectrum, total peaks'
 687     ...                             : (int, '50', p_positive),
 688     ...                             'output, spectra'
 689     ...                             : (bool, 'no')}).items())
 690     [('output, spectra', False), ('spectrum, total peaks', 40)]
 691
 692     """
 693
 694     pmap = {}
 695     for p_name, p_info in sorted(parameter_info.items()):
 696         type_ = p_info[0]
 697         default = p_info[1]
 698         check_fn = len(p_info) > 2 and p_info[2] or None
 699
 700         v = parameters.get(p_name)
 701         if v == None:
 702             if default != None:
 703                 debug("parameter '%s' defaulting to '%s'", p_name, default)
 704                 v = default
 705             else:
 706                 error("missing required parameter '%s'" % p_name)
 707         if isinstance(type_, tuple):
 708             if not v in type_:
 709                 error("parameter '%s' value '%s' not in %s (feature not"
 710                       " implemented?)" % (p_name, v, type_))
 711         elif type_ == bool:
 712             v = { 'yes' : True, 'no' : False }.get(v)
 713             if v == None:
 714                 error("parameter '%s' requires a value of 'yes' or 'no'")
 715         else:
 716             try:
 717                 v = type_(v)
 718             except ValueError, e:
 719                 error("parameter '%s' has value '%s' with invalid format [%s]"
 720                       % (p_name, v, e))
 721         if check_fn and not check_fn(v):
 722             error("parameter '%s' has invalid value '%s' (or feature not"
 723                   " implemented)" % (p_name, v))
 724         pmap[p_name] = v
 725
 726     unknown_parameters = set(parameters) - set(parameter_info)
 727     if unknown_parameters:
 728         warning("%s unknown parameter(s):\n %s"
 729                 % (len(unknown_parameters),
 730                    pformat(sorted(list(unknown_parameters)))))
 731     return pmap
 732
 733
 734 def pythonize_swig_object(o, only_fields=None, skip_fields=[]):
 735     """Creates a pure Python copy of a SWIG object, so that it can be easily
 736     pickled, or printed (for debugging purposes).  Each SWIG object is
 737     pythonized as a dictionary, for convenience.  If provided, 'only_fields',
 738     limits the copy to the list of fields specified.  Otherwise,
 739     'skip_fields' if given is a list of methods not to include (this helps
 740     avoid infinite recursion).
 741
 742     >>> pprint(pythonize_swig_object(cgreylag.score_stats(1, 1)))
 743     {'best_matches': [[{'mass_trace': [],
 744                         'peptide_begin': [],
 745                         'peptide_sequence': '',
 746                         'predicted_parent_mass': 0.0,
 747                         'score': 0.0,
 748                         'sequence_name': [],
 749                         'spectrum_index': -1}]],
 750      'candidate_spectrum_count': 0,
 751      'combinations_searched': 0}
 752
 753     """
 754
 755     if isinstance(o, str):
 756         return o
 757     try:
 758         len(o)
 759     except TypeError:
 760         pass
 761     else:
 762         return list(pythonize_swig_object(x, only_fields, skip_fields)
 763                     for x in o)
 764     if hasattr(o, '__swig_getmethods__'):
 765         s = {}
 766         for a in o.__swig_getmethods__:
 767             if (only_fields != None and a in only_fields
 768                 or only_fields == None and a not in skip_fields):
 769                 s[a] = pythonize_swig_object(getattr(o, a), only_fields,
 770                                              skip_fields)
 771         return s
 772     return o
 773
 774 # A possible PCA (pyrrolidone carboxyl acid) modification accounts for
 775 # circularization of the peptide N-terminal.  PCA mods are excluded if a
 776 # static N-terminal mod has been specified.  Likewise, choosing a PCA mod will
 777 # exclude choosing a potential N-terminal mod.  (The PCA mod for 'C' is
 778 # dependent on a static mod of C+57 being in effect.)
 779
 780 def get_pca_table(mass_regimes):
 781     """Return a list of tuples (residues, parent delta, fragment delta)
 782     describing the PCA possibilities for each mass regime. Residues is a
 783     string specifying the residues this PCA delta applies to.
 784     """
 785     # FIX: According to Xtandem, C is only a candidate for PCA if
 786     # carboxyamidomethylated (C+57).  Currently we always search it.
 787     return [ [('', 0, 0),
 788               ('E', -1 * CP.parent_mass_regime[r].water_mass,
 789                -1 * CP.fragment_mass_regime[r].water_mass),
 790               ('QC', -1 * CP.parent_mass_regime[r].ammonia_mass,
 791                -1 * CP.fragment_mass_regime[r].ammonia_mass)]
 792              for r in range(len(mass_regimes)) ]
 793
 794
 795 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
 796     if not mod_tree:
 797         if 0 < len(conjuncts) <= limit:
 798             yield conjuncts
 799         return
 800     first, rest = mod_tree[0], mod_tree[1:]
 801     if isinstance(first, list):
 802         for x in enumerate_disjunction(first, limit):
 803             for y in enumerate_conjunction(rest, limit, conjuncts + x):
 804                 yield y
 805     else:
 806         for y in enumerate_conjunction(rest, limit, conjuncts):
 807             yield y
 808         for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
 809             yield y
 810
 811 def enumerate_disjunction(mod_tree, limit=sys.maxint):
 812     """Generates the conjuncts for mod_tree that are no longer than limit.
 813
 814     >>> list(enumerate_disjunction([['a'],['b'],['c']]))
 815     [[], ['a'], ['b'], ['c']]
 816     >>> list(enumerate_disjunction([[1,2,3]]))
 817     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
 818     >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
 819     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
 820     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
 821     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
 822     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
 823     [[]]
 824
 825     """
 826     assert isinstance(mod_tree, list)
 827     yield []
 828     for b in mod_tree:
 829         for s in enumerate_conjunction(b, limit):
 830             yield s
 831
 832 def get_mod_conjunct_triples(mod_tree, limit):
 833     """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
 834     (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
 835     ordered by increasing number of conjuncts, and with duplicates within and
 836     across removed.  A mass table is also appended to each conjunct.
 837     """
 838     # FIX: is there a more elegant way or place for all this?
 839     def enmass(t):
 840         def rmass(regime_index, par_frag, sign, delta, is_mono):
 841             global MASS_REGIME_ATOMIC_MASSES
 842             if isinstance(delta, str):
 843                 if is_mono:
 844                     regime_index = 0
 845                 return (sign * formula_mass(delta,
 846                            MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
 847             else:
 848                 return sign * delta
 849
 850         sign, delta, is_mono = t[:3]
 851         return t + (tuple((rmass(r, 0, sign, delta, is_mono),
 852                            rmass(r, 1, sign, delta, is_mono))
 853                           for r in range(len(XTP["mass_regimes"]))),)
 854
 855     def triple(c):
 856         Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
 857         Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
 858         assert len(Ns) <= 1 and len(Cs) <= 1
 859         rest = [ enmass(x) for x in c if x[3] not in '[]' ]
 860         rest = tuple(sorted(list(frozenset(rest))))
 861         return (Ns, Cs, rest)
 862
 863     return sorted(list(frozenset(triple(conjunct)
 864                                  for conjunct
 865                                  in enumerate_disjunction(mod_tree, limit))),
 866                   key=lambda x: (sum(len(y) for y in x), x))
 867
 868
 869 def gen_delta_bag_counts(i, remainder, bag):
 870     if i < 1:
 871         assert i == 0
 872         bag[0] = remainder
 873         yield tuple(bag)
 874         return
 875     for delta in range(1, remainder-i+1):
 876         bag[i] = delta
 877         for x in gen_delta_bag_counts(i-1, remainder-delta, bag):
 878             yield x
 879
 880 def generate_delta_bag_counts(mod_count, conjunct_length):
 881     """Generate all tuples of positive integers having length conjunct_length
 882     and sum mod_count.  As a special case, () is such a tuple having length 0
 883     and sum 0.
 884
 885     >>> for i in range(6): print i, list(generate_delta_bag_counts(4, i))
 886     ...
 887     0 []
 888     1 [(4,)]
 889     2 [(3, 1), (2, 2), (1, 3)]
 890     3 [(2, 1, 1), (1, 2, 1), (1, 1, 2)]
 891     4 [(1, 1, 1, 1)]
 892     5 []
 893     >>> list(generate_delta_bag_counts(0, 0))
 894     [()]
 895     >>> list(generate_delta_bag_counts(0, 1))
 896     []
 897
 898     """
 899     if conjunct_length == 0:
 900         return [()] if mod_count == 0 else []
 901     if mod_count < conjunct_length:
 902         return []
 903     return gen_delta_bag_counts(conjunct_length - 1, mod_count,
 904                                 [0] * conjunct_length)
 905
 906
 907 def set_context_conjuncts(context, mass_regime_index, N_cj, C_cj, R_cj):
 908     assert len(N_cj) < 1 and len(C_cj) < 1
 909     context.N_delta = 0
 910     if N_cj:
 911         context.N_delta = N_cj[0][5][mass_regime_index][1]
 912     context.C_delta = 0
 913     if C_cj:
 914         context.C_delta = C_cj[0][5][mass_regime_index][1]
 915     context.delta_bag_lookup.clear()
 916     context.delta_bag_lookup.resize(ord('Z')+1)
 917     context.delta_bag_delta.clear()
 918     for n, cj in enumerate(R_cj):
 919         context.delta_bag_delta.append(cj[5][mass_regime_index][1])
 920         for r in cj[3]:
 921             context.delta_bag_lookup[ord(r)] \
 922                 = context.delta_bag_lookup[ord(r)] + (n,)
 923
 924
 925 def search_all(options, context, score_statistics):
 926     """Search sequence database against searchable spectra."""
 927
 928     mod_limit = XTP["potential_mod_limit"]
 929
 930     mod_conjunct_triples = get_mod_conjunct_triples(XTP["potential_mods"],
 931                                                     mod_limit)
 932     info("%s unique potential modification conjuncts",
 933          len(mod_conjunct_triples))
 934     debug("mod_conjunct_triples (unique):\n%s", pformat(mod_conjunct_triples))
 935
 936     mass_regimes = XTP["mass_regimes"]
 937     pca_table = get_pca_table(mass_regimes)
 938     debug("pca_table: %s", pca_table)
 939
 940     total_combinations_searched = 0
 941
 942     for mod_count in range(mod_limit + 1):
 943         context.mod_count = mod_count
 944         for mr_index, (mass_regime, pca_entry) in enumerate(zip(mass_regimes,
 945                                                                 pca_table)):
 946             context.mass_regime_index = mr_index
 947             for pca_res, pca_parent_delta, pca_frag_delta in pca_entry:
 948                 context.pca_residues = pca_res
 949                 context.pca_delta = pca_frag_delta
 950                 for cji, (N_cj, C_cj, R_cj) in enumerate(mod_conjunct_triples):
 951                     if pca_res and N_cj:
 952                         continue    # mutually exclusive, for now
 953                     set_context_conjuncts(context, mr_index, N_cj, C_cj, R_cj)
 954                     debug("mod_count: %s", mod_count)
 955                     debug("cj_triple: N=%s C=%s R=%s", N_cj, C_cj, R_cj)
 956                     for delta_bag in generate_delta_bag_counts(mod_count,
 957                                                                len(R_cj)):
 958                         debug("delta_bag: %s", delta_bag)
 959
 960                         # this clear() avoids an SWIG/STL bug!?
 961                         context.delta_bag_count.clear()
 962                         context.delta_bag_count[:] = delta_bag
 963
 964                         pmrf = CP.parent_mass_regime[mr_index].fixed_residue_mass
 965                         fmrf = CP.fragment_mass_regime[mr_index].fixed_residue_mass
 966                         p_fx = (pmrf[ord('[')]
 967                                 + (N_cj and N_cj[0][5][mr_index][0] or 0)
 968                                 + pmrf[ord(']')]
 969                                 + (C_cj and C_cj[0][5][mr_index][0] or 0)
 970                                 + pca_parent_delta + PROTON_MASS)
 971                         context.parent_fixed_mass = p_fx
 972                         f_N_fx = (fmrf[ord('[')]
 973                                   + (N_cj and N_cj[0][5][mr_index][1] or 0)
 974                                   + pca_frag_delta)
 975                         context.fragment_N_fixed_mass = f_N_fx
 976                         f_C_fx = (fmrf[ord(']')]
 977                                   + (C_cj and C_cj[0][5][mr_index][1] or 0)
 978                                   + CP.fragment_mass_regime[mr_index].water_mass)
 979                         context.fragment_C_fixed_mass = f_C_fx
 980
 981                         info("MC=%s MR=%s PCA=%s CJ=%s DB=%s"
 982                              % (mod_count, mr_index, pca_res, cji, delta_bag))
 983                         debug("p_fx %s f_N_fx %s f_C_fx %s"
 984                               % (p_fx, f_N_fx, f_C_fx))
 985
 986                         score_statistics.combinations_searched = 0
 987                         cgreylag.spectrum.search_runs(context,
 988                                                       score_statistics)
 989                         total_combinations_searched \
 990                             += score_statistics.combinations_searched
 991                         info("  %s candidate spectra examined, this bag",
 992                              score_statistics.combinations_searched)
 993
 994     info('%s candidate spectra examined',
 995          score_statistics.candidate_spectrum_count)
 996
 997
 998 def get_prefix_sequence(begin_pos, run_offset, sequence):
 999     """
1000     >>> get_prefix_sequence(0, 0, 'abcdefghi')
1001     '['
1002     >>> get_prefix_sequence(1, 0, 'abcdefghi')
1003     '[a'
1004     >>> get_prefix_sequence(3, 0, 'abcdefghi')
1005     '[abc'
1006     >>> get_prefix_sequence(4, 0, 'abcdefghi')
1007     'abcd'
1008     >>> get_prefix_sequence(4, 2, 'abcdefghi')
1009     'cdef'
1010
1011     """
1012     prefix_start = run_offset + begin_pos - 4
1013     s = sequence[max(0, prefix_start):prefix_start+4]
1014     if len(s) < 4:
1015         s = '[' + s
1016     return s
1017
1018 def get_suffix_sequence(end_pos, run_offset, sequence):
1019     """
1020     >>> get_suffix_sequence(0, 0, 'abcdefghi')
1021     'abcd'
1022     >>> get_suffix_sequence(4, 0, 'abcdefghi')
1023     'efgh'
1024     >>> get_suffix_sequence(6, 0, 'abcdefghi')
1025     'ghi]'
1026     >>> get_suffix_sequence(8, 0, 'abcdefghi')
1027     'i]'
1028     >>> get_suffix_sequence(4, 2, 'abcdefghi')
1029     'ghi]'
1030
1031     """
1032     suffix_start = run_offset + end_pos
1033     s = sequence[suffix_start:suffix_start+4]
1034     if len(s) < 4:
1035         s = s + ']'
1036     return s
1037
1038
1039 def results_dump(score_statistics, searchable_spectra):
1040     """Return a result dict mapping spectrum names to (spectrum_metadata,
1041     best_matches) pairs.  (Unneeded fields are stripped.)
1042     """
1043
1044     r = {}
1045     spectrum_metadata_fs = set(['name', 'file_id', 'mass', 'charge',
1046                                 'total_ion_current', 'comparisons'])
1047     py_s_spectra = pythonize_swig_object(searchable_spectra,
1048                                          only_fields=spectrum_metadata_fs)
1049     py_matches = pythonize_swig_object(score_statistics.best_matches,
1050                                        skip_fields=['spectrum_index'])
1051     assert len(py_s_spectra) == len(py_matches)
1052
1053     for sp_metadata, sp_matches in zip(py_s_spectra, py_matches):
1054         assert sp_metadata['name'] not in r, "duplicate spectrum name"
1055         r[sp_metadata['name']] = (sp_metadata, sp_matches)
1056
1057     return r
1058
1059
1060 def main(args=sys.argv[1:]):
1061     parser = optparse.OptionParser(usage=
1062                                    "usage: %prog [options]"
1063                                    " <configuration-file> <ms2-file>...",
1064                                    description=__doc__, version=__version__)
1065     pa = parser.add_option
1066     pa("-P", "--parameter", nargs=2, dest="parameters", action="append",
1067        default=[],
1068        help="override a parameter in <parameter-file>, may be used multiple"
1069        " times", metavar="NAME VALUE")
1070     pa("-w", "--work-slice", nargs=2, type="float", dest="work_slice",
1071        help="search a subinterval [L:U) of the work space"
1072        " (where 0 <= L <= U <= 1) in standalone mode", metavar="L U")
1073     pa("--job-id", dest="job_id", default="unknown",
1074        help="used to generate unique output filenames [default 'unknown']")
1075     pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
1076     pa("-p", "--show-progress", action="store_true", dest="show_progress",
1077        help="show running progress")
1078     pa("--estimate", action="store_true", dest="estimate_only",
1079        help="just estimate the time required for the search")
1080     pa("-v", "--verbose", action="store_true", dest="verbose",
1081        help="be verbose")
1082     pa("--copyright", action="store_true", dest="copyright",
1083        help="print copyright and exit")
1084     pa("--debug", action="store_true", dest="debug",
1085        help="output debugging info")
1086     pa("--profile", action="store_true", dest="profile",
1087        help="dump Python profiling output to './greylag.prof.<pid>'")
1088     (options, args) = parser.parse_args(args=args)
1089
1090     if options.copyright:
1091         print __copyright__
1092         sys.exit(0)
1093
1094     if len(args) < 2:
1095         parser.print_help()
1096         sys.exit(1)
1097
1098     configuration_fn = args[0]
1099     spectrum_fns = args[1:]
1100
1101     if (any(True for f in spectrum_fns if not f.endswith('.ms2'))
1102         or (options.work_slice
1103             and not (0 <= options.work_slice[0]
1104                      <= options.work_slice[1] <= 1))):
1105         parser.print_help()
1106         sys.exit(1)
1107
1108     log_level = logging.WARNING
1109     if options.quiet:
1110         log_level = logging.ERROR
1111     if options.verbose:
1112         log_level = logging.INFO
1113     if options.debug:
1114         log_level = logging.DEBUG
1115     logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
1116                         format='%(asctime)s %(levelname)s: %(message)s')
1117     info("starting on %s", gethostname())
1118
1119     # check spectrum basename uniqueness, as corresponding sqt files will be
1120     # in a single directory
1121     base_spectrum_fns = [ os.path.basename(fn) for fn in spectrum_fns ]
1122     if len(base_spectrum_fns) != len(set(base_spectrum_fns)):
1123         error("base spectrum filenames must be unique")
1124
1125     # check -P names for validity
1126     bad_names = (set(n for n,v in options.parameters)
1127                  - set(PARAMETER_INFO.keys()))
1128     if bad_names:
1129         error("bad -P parameter names %s" % list(bad_names))
1130
1131     # read params
1132     cp = ConfigParser.RawConfigParser()
1133     cp.optionxform = str                # be case-sensitive
1134     with open(configuration_fn) as configuration_file:
1135         cp.readfp(configuration_file)
1136     if not cp.has_section('greylag'):
1137         error("%s has no [greylag] section" % configuration_fn)
1138     parameters = dict(cp.items('greylag'))
1139     parameters.update(dict(options.parameters)) # command-line override
1140     global XTP
1141     XTP = validate_parameters(parameters)
1142
1143     fixed_mod_map = dict((r[3], r) for r in XTP["pervasive_mods"])
1144     initialize_spectrum_parameters(options, XTP["mass_regimes"], fixed_mod_map)
1145
1146     # read sequence dbs
1147     databases = XTP["databases"].split()
1148     # [(locusname, defline, seq, filename), ...]
1149     fasta_db = list(read_fasta_files(databases))
1150     # [(idno, offset, locusname, defline, seq, seq_filename), ...]
1151     db = []
1152     for idno, (locusname, defline, sequence, filename) in enumerate(fasta_db):
1153         db.extend(split_sequence_into_aa_runs(idno, locusname, defline,
1154                                               sequence, filename))
1155     db_residue_count = sum(len(dbi[3]) for dbi in db)
1156
1157     info("read %s sequences (%s runs, %s residues)", len(fasta_db), len(db),
1158          db_residue_count)
1159     max_run_length = max(len(r[3]) for r in db)
1160     info("max run length is %s residues", max_run_length)
1161     if max_run_length > 2**31 - 1:
1162         error("runs longer than %s not yet supported", max_run_length)
1163     if not db:
1164         error("no database sequences")
1165
1166     # read spectrum offset indices
1167     spectrum_offset_indices = []
1168     for spfn in spectrum_fns:
1169         idxfn = spfn + '.idx'
1170         with contextlib.closing(gzip.open(idxfn)) as idxf:
1171             idx = cPickle.load(idxf)
1172             # try to verify that index matches spectrum file
1173             if idx['file size'] != os.path.getsize(spfn):
1174                 error("index '%s' needs rebuilding" % idxfn)
1175             spectrum_offset_indices.append(idx['offsets'])
1176
1177     # FIX: assume standalone mode (for now)
1178     assert options.work_slice
1179
1180     # read spectra per work slice
1181     spectra = read_spectra_slice(spectrum_fns, spectrum_offset_indices,
1182                                  options.work_slice)
1183     del spectrum_offset_indices
1184     spectra.sort(key=lambda x: x.mass)
1185
1186     def peak_statistics(spectra):
1187         counts = [ len(sp.peaks) for sp in spectra ]
1188         counts.sort()
1189         n = len(counts)
1190         return (counts[0], counts[int(n*0.25)], counts[int(n*0.5)],
1191                 counts[int(n*0.75)], counts[-1], sum(counts) / float(n))
1192
1193     if spectra:
1194         info("read %s spectra (mass range %s - %s)", len(spectra),
1195              spectra[0].mass, spectra[-1].mass)
1196         info("  peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1197              % peak_statistics(spectra))
1198     else:
1199         warning("no input spectra")
1200
1201     # filter and normalize spectra
1202     for sp in spectra:
1203         sp.filter_peaks(XTP["TIC_cutoff_proportion"],
1204                         CP.parent_mass_tolerance_max)
1205         sp.classify(XTP["intensity_class_count"], XTP["intensity_class_ratio"],
1206                     XTP["fragment_mass_tolerance"])
1207
1208     min_psm = XTP["min_parent_spectrum_mass"]
1209     max_psm = XTP["max_parent_spectrum_mass"]
1210     # FIX: also filter by 1 + 2 + 4 rule?
1211     spectra = [ sp for sp in spectra
1212                 if len(sp.peaks) >= 10 and min_psm <= sp.mass <= max_psm ]
1213
1214     if spectra:
1215         info("after filtering:")
1216         info("     %s spectra (mass range %s - %s)", len(spectra),
1217              spectra[0].mass, spectra[-1].mass)
1218         info("  peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1219              % peak_statistics(spectra))
1220
1221     cgreylag.spectrum.set_searchable_spectra(spectra)
1222     score_statistics = cgreylag.score_stats(len(spectra),
1223                                             XTP["best_result_count"])
1224
1225     if spectra:
1226         del spectra                     # release memory
1227
1228         # FIX!!!
1229         # (cleavage_re, position of cleavage in cleavage_re)
1230         cleavage_motif = "[X]|[X]"
1231         cleavage_pattern, cleavage_pos = cleavage_motif_re(cleavage_motif)
1232         if cleavage_pos == None:
1233             error("cleavage site '%s' is missing '|'",
1234                   XTP["protein, cleavage site"])
1235         cleavage_pattern = re.compile(cleavage_pattern)
1236
1237         context = cgreylag.search_context()
1238         for idno, offset, locusname, defline, seq, seq_filename in db:
1239             cp = []
1240             if cleavage_motif != "[X]|[X]":
1241                 cp = list(generate_cleavage_points(cleavage_pattern,
1242                                                    cleavage_pos, seq))
1243             sr = cgreylag.sequence_run(idno, offset, seq, cp, locusname)
1244             context.sequence_runs.append(sr)
1245         context.maximum_missed_cleavage_sites = 1000000000 # FIX
1246
1247         info("searching")
1248         search_all(options, context, score_statistics)
1249     else:
1250         warning("no spectra after filtering--search skipped")
1251
1252     if options.estimate_only:
1253         print ("%.2f generic CPU hours"
1254                % (score_statistics.candidate_spectrum_count / 300.0e6))
1255         return
1256
1257     info("writing result file")
1258     # FIX: this is only for standalone mode
1259     result_fn = 'grind_%s_%s-%s.gwr' % (options.job_id, options.work_slice[0],
1260                                         options.work_slice[1])
1261     with contextlib.closing(gzip.open(result_fn, 'w')) as result_file:
1262         d = { 'version' : __version__,
1263               'matches' : results_dump(score_statistics,
1264                                        cgreylag.cvar.spectrum_searchable_spectra),
1265               'total comparisons' : score_statistics.candidate_spectrum_count,
1266               'spectrum files' : base_spectrum_fns,
1267               'databases' : databases,
1268               'parameters' : XTP,
1269               'mass regime atomic masses' : MASS_REGIME_ATOMIC_MASSES,
1270               'proton mass' : PROTON_MASS,
1271               'argv' : sys.argv }
1272         cPickle.dump(d, result_file, cPickle.HIGHEST_PROTOCOL)
1273     info("finished, result file written to '%s'", result_fn)
1274
1275
1276 if __name__ == '__main__':
1277 #     try:
1278 #         import psyco
1279 #         psyco.full()
1280 #         ###psyco.bind(formula_mass)
1281 #         warning('psyco enabled')
1282 #     except ImportError:
1283 #         pass
1284
1285     try:
1286         if '--profile' in sys.argv:
1287             import cProfile
1288             import pstats
1289             report_fn = "greylag.prof.%s" % os.getpid()
1290             data_fn = report_fn + ".tmp"
1291             prof = cProfile.run('main()', data_fn)
1292             with open(report_fn, 'w') as report_f:
1293                 try:
1294                     stats = pstats.Stats(data_fn, stream=report_f)
1295                     stats.strip_dirs()
1296                     stats.sort_stats('cumulative')
1297                     stats.print_stats(50)
1298                     stats.sort_stats('time')
1299                     stats.print_stats(50)
1300                     print "# profile report written to '%s'" % report_fn
1301                 finally:
1302                     try:
1303                         os.remove(data_fn)
1304                     except:
1305                         pass
1306         else:
1307             main()
1308     except SystemExit:
1309         raise
1310     except:
1311         logging.exception("unhandled exception")
1312         sys.exit(1)
1313     finally:
1314         logging.shutdown()
1315
1316
1317 # FIXES:
1318 # - need to rigorously check for bad input in any file (fasta, spectrum, xml)