greylag-grind.py

   1 #!/usr/bin/env python
   2
   3 '''This program does the actual work to search mass spectra against a sequence
   4 database.  <job-id> is a unique identifier used as a prefix for work and
   5 output directories.  The <parameter-file> contains program options.  The
   6 spectra are in <ms2-file>s; greylag-index-spectra must already have been run
   7 on them.
   8
   9 This program can operate in standalone mode (useful for testing) or command
  10 mode.  If the --work-slice option is given, that slice will be searched in
  11 standalone mode, then the program will exit.  Otherwise, the program will
  12 process commands until an exit command is received.
  13
  14 '''
  15
  16 from __future__ import with_statement
  17
  18 __copyright__ = '''
  19     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  20
  21     This program is free software; you can redistribute it and/or modify
  22     it under the terms of the GNU General Public License as published by
  23     the Free Software Foundation; either version 2 of the License, or
  24     (at your option) any later version.
  25
  26     This program is distributed in the hope that it will be useful,
  27     but WITHOUT ANY WARRANTY; without even the implied warranty of
  28     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  29     GNU General Public License for more details.
  30
  31     You should have received a copy of the GNU General Public License along
  32     with this program; if not, write to the Free Software Foundation, Inc.,
  33     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  34 '''
  35
  36 ##############################################################################
  37 ##    "Simplicity is prerequisite for reliability" - Edsger W. Dijkstra     ##
  38 ##############################################################################
  39
  40
  41 __version__ = "0.0"
  42
  43
  44 import ConfigParser
  45 from collections import defaultdict
  46 import contextlib
  47 import cPickle
  48 import fileinput
  49 import gzip
  50 import itertools
  51 import logging
  52 from logging import debug, info, warning
  53 import math                             #??
  54 import optparse
  55 import os
  56 from pprint import pprint, pformat
  57 import re
  58 from socket import gethostname
  59 import sys
  60
  61 import cgreylag
  62
  63
  64 # Try to drop dead immediately on SIGINT (control-C), instead of normal Python
  65 # KeyboardInterrupt processing, since we may spend long periods of time in
  66 # uninterruptible C++ calls.  Also die immediately on SIGPIPE.
  67 try:
  68     import signal
  69     signal.signal(signal.SIGINT, signal.SIG_DFL)
  70     signal.signal(signal.SIGPIPE, signal.SIG_DFL)
  71 except:
  72     pass
  73
  74
  75 def error(s, *args):
  76     "fatal error"
  77     logging.error(s, *args)
  78     if __name__ != "__main__":
  79         raise Exception("(fatal error, but unit testing, so not exiting)")
  80     sys.exit(1)
  81
  82
  83 def fileerror(s, *args):
  84     error(s + (", at line %s of file '%s'"
  85                % (fileinput.filelineno(), fileinput.filename())),
  86           *args)
  87
  88
  89 # name -> value map of processed XML input parameters
  90 XTP = {}
  91
  92 # handle to the singleton parameter object shared with the C++ module
  93 CP = cgreylag.cvar.parameters_the
  94
  95 # FIX: is this mono or avg?  (possible error here is ~0.0007 amu)
  96 PROTON_MASS =   1.007276
  97 ELECTRON_MASS = 0.000549                # ?
  98
  99 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
 100 MONOISOTOPIC_ATOMIC_MASS = {
 101     'H' :  1.00782503214,
 102     'C' : 12.00000000,
 103     'N' : 14.00307400529,
 104     'O' : 15.994914622115,
 105     'P' : 30.9737615120,
 106     'S' : 31.9720706912,
 107     }
 108
 109 # most prevalent only (1 in 1000)
 110 ISOTOPIC_ATOMIC_MASS = {                # prevalence (in %)
 111     'C13' : 13.003354837810,            # 1.078
 112     'N15' : 15.00010889849,             # 0.3687
 113     'O18' : 17.99916049,                # 0.20514
 114     'S33' : 32.9714585012,              # 0.762
 115     'S34' : 33.9678668311,              # 4.2928
 116     }
 117
 118 AVERAGE_ATOMIC_MASS = {
 119     'H' :  1.007947,
 120     'C' : 12.01078,
 121     'N' : 14.00672,
 122     'O' : 15.99943,
 123     'P' : 30.9737612,
 124     'S' : 32.0655,
 125     }
 126
 127 # The xtandem average residue masses are about 0.002 amu higher than those
 128 # calculated directly from the above average atomic masses.  None of the
 129 # chemists consulted knew of any reason why, aside from lack of precision in
 130 # the average atomic mass estimates.  This shouldn't matter very much, as
 131 # fragmentation calculations should all be monoisotopic, and we can always
 132 # widen the parent tolerance window a bit.
 133
 134
 135 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
 136     """Return the mass of formula, using the given mass regime (monoisotopic
 137     by default).
 138
 139     >>> formula_mass('H2O', { 'H':1, 'O':16 })
 140     18
 141     >>> # monoisotopic mass of glycine
 142     >>> str(round(formula_mass('C2H3ON'), 4))
 143     '57.0215'
 144
 145     """
 146     parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
 147     # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
 148     return sum(atomic_mass[parts[i]] * int(parts[i+1])
 149                for i in range(0, len(parts), 2))
 150
 151 # FIX: selenocysteine (U), etc
 152 # residue -> formula
 153 RESIDUE_FORMULA = {
 154     'A' : "C3H5ON",
 155     'C' : "C3H5ONS",
 156     'D' : "C4H5O3N",
 157     'E' : "C5H7O3N",
 158     'F' : "C9H9ON",
 159     'G' : "C2H3ON",
 160     'H' : "C6H7ON3",
 161     'I' : "C6H11ON",
 162     'K' : "C6H12ON2",
 163     'L' : "C6H11ON",
 164     'M' : "C5H9ONS",
 165     'N' : "C4H6O2N2",
 166     'P' : "C5H7ON",
 167     'Q' : "C5H8O2N2",
 168     'R' : "C6H12ON4",
 169     'S' : "C3H5O2N",
 170     'T' : "C4H7O2N",
 171     'V' : "C5H9ON",
 172     'W' : "C11H10ON2",
 173     'Y' : "C9H9O2N",
 174     }
 175
 176 RESIDUES = sorted(RESIDUE_FORMULA.keys())
 177 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
 178
 179
 180 # accessed as a global variable (FIX?)
 181 # [0][1] -> 'H' -> fragment mass of H for regime 0
 182 MASS_REGIME_ATOMIC_MASSES = []
 183
 184
 185 def mass_regime_atomic_masses(spec):
 186     """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
 187     names to masses.
 188     """
 189     name, isotopes = spec
 190     assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
 191     if name == 'MONO':
 192         r = MONOISOTOPIC_ATOMIC_MASS.copy()
 193     else:
 194         r = AVERAGE_ATOMIC_MASS.copy()
 195     if isotopes:
 196         iname, prevalence = isotopes[0]
 197         assert iname == 'N15' and 0 <= prevalence <= 1
 198         # this is a simplification, but additional accuracy pointless?
 199         if name == 'MONO':
 200             r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
 201         else:
 202             r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
 203     return r
 204
 205
 206 def initialize_spectrum_parameters(options, mass_regimes, fixed_mod_map):
 207     """Initialize parameters known to the spectrum module.
 208     fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
 209     """
 210
 211     debug('fixed_mod_map: %s', fixed_mod_map)
 212     # This is the size of vectors that are indexed by residues (A-Z) or
 213     # special characters ('[]').
 214     RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
 215
 216     # These are currently monoisotopic.  (deuterium pointless?)
 217     CP.proton_mass = PROTON_MASS
 218     CP.hydrogen_mass = formula_mass("H")
 219
 220     global MASS_REGIME_ATOMIC_MASSES
 221     for regime_pair in mass_regimes:
 222         assert len(regime_pair) == 2    # parent and fragment
 223         info('mass regime: %s', regime_pair)
 224         MASS_REGIME_ATOMIC_MASSES.append([])
 225         for n, regime in enumerate(regime_pair):
 226             atmass = mass_regime_atomic_masses(regime)
 227             MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
 228             creg = cgreylag.mass_regime_parameters()
 229
 230             creg.hydroxyl_mass = formula_mass("OH", atmass)
 231             creg.water_mass = formula_mass("H2O", atmass)
 232             creg.ammonia_mass = formula_mass("NH3", atmass)
 233
 234             creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
 235
 236             for r in RESIDUES_W_BRACKETS:
 237                 m = 0
 238                 if r in RESIDUES:
 239                     m = formula_mass(RESIDUE_FORMULA[r], atmass)
 240                 rmod = fixed_mod_map.get(r)
 241                 if rmod:
 242                     if isinstance(rmod[1], str):
 243                         if rmod[2]:
 244                             m += rmod[0] * formula_mass(rmod[1])
 245                         else:
 246                             m += rmod[0] * formula_mass(rmod[1], atmass)
 247                     else:
 248                         m += rmod[0] * rmod[1]
 249                 creg.fixed_residue_mass[ord(r)] = m
 250             # assuming these are monoisotopic (not regime)
 251             creg.fixed_residue_mass[ord('[')] += formula_mass("H")
 252             creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
 253             if not n:
 254                 CP.parent_mass_regime.append(creg);
 255             else:
 256                 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
 257                 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
 258                 CP.fragment_mass_regime.append(creg);
 259     for r in RESIDUES_W_BRACKETS:
 260         info('fixed_mass %s: %s', r,
 261              [ "%.6f/%.6f"
 262                % (CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
 263                   CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
 264                for rn in range(len(mass_regimes)) ])
 265     for r in RESIDUES:
 266         for rn in range(len(mass_regimes)):
 267             # physically impossible (and results would be garbage)
 268             if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 269                 raise ValueError('bogus parent mass specification for %s' % r)
 270             if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 271                 raise ValueError('bogus parent mass specification for %s' % r)
 272
 273     CP.parent_mass_tolerance_1 = XTP["parent_mz_tolerance"]
 274     CP.parent_mass_tolerance_max = (XTP["parent_mz_tolerance"]
 275                                     * XTP["charge_limit"])
 276
 277     CP.fragment_mass_tolerance = XTP["fragment_mass_tolerance"]
 278     CP.intensity_class_count = XTP["intensity_class_count"]
 279
 280     CP.minimum_peptide_length = XTP["min_peptide_length"]
 281
 282     # CP.ln_factorial[n] == ln(n!)
 283     CP.ln_factorial.resize(int(XTP["max_parent_spectrum_mass"]
 284                                / XTP["fragment_mass_tolerance"] + 100), 0.0)
 285     for n in range(2, len(CP.ln_factorial)):
 286         CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
 287
 288     CP.estimate_only = bool(options.estimate_only)
 289     CP.show_progress = bool(options.show_progress)
 290
 291
 292 def cleavage_motif_re(motif):
 293     """Return (regexp, pos), where regexp is a regular expression that will
 294     match a cleavage motif, and pos is the position of the cleavage with
 295     respect to the match (or None).  (The RE actually matches one character,
 296     the rest matching as lookahead, so that re.finditer will find all
 297     overlapping matches.)
 298
 299     >>> cleavage_motif_re('[KR]|{P}')
 300     ('[KR](?=[^P])', 1)
 301     >>> cleavage_motif_re('[X]|[X]')
 302     ('.(?=.)', 1)
 303
 304     """
 305     cleavage_pos = None
 306     re_parts = []
 307     motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
 308     parts = [ p[0] for p in motif_re.findall(motif) ]
 309     if ''.join(parts) != motif:
 310         raise ValueError('invalid cleavage motif pattern')
 311     i = 0
 312     for part in parts:
 313         if part == '|':
 314             if cleavage_pos != None:
 315                 raise ValueError("invalid cleavage motif pattern"
 316                                  " (multiple '|'s)")
 317             cleavage_pos = i
 318             continue
 319         if part == '[X]':
 320             re_parts.append('.')
 321         elif part[0] == '[':
 322             re_parts.append('[%s]' % part[1:-1])
 323         elif part[0] == '{':
 324             re_parts.append('[^%s]' % part[1:-1])
 325         else:
 326             assert False, "unknown cleavage motif syntax"
 327         i += 1
 328     if len(re_parts) == 0:
 329         re_pattern = '.'
 330     elif len(re_parts) == 1:
 331         re_pattern = re_parts[0]
 332     else:
 333         re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
 334     return (re_pattern, cleavage_pos)
 335
 336
 337 def generate_cleavage_points(cleavage_re, cleavage_pos, sequence):
 338     """Yields the offsets of the cleavages in sequence.  The endpoints are
 339     always included, by convention.
 340
 341     >>> list(generate_cleavage_points(re.compile('[KR](?=[^P])'), 1,
 342     ...                               'ARBCKDEKPF'))
 343     [0, 2, 5, 10]
 344
 345     """
 346     yield 0
 347     for m in cleavage_re.finditer(sequence):
 348         p = m.start() + cleavage_pos
 349         if 0 < p < len(sequence):
 350             yield p
 351     yield len(sequence)
 352
 353
 354 AA_SEQUENCE = re.compile(r'[ARNDCQEGHILKMFPSTWYV]+')
 355
 356 def split_sequence_into_aa_runs(idno, defline, sequence, filename):
 357     """Returns a tuple (idno, start, defline, seq, filename) for each
 358     contiguous run of residues in sequence, where 'start' is the position of
 359     'seq' in 'sequence'.
 360
 361     >>> pprint(split_sequence_into_aa_runs(123, 'defline', 'STSS*DEFABA',
 362     ...                                    'filename'))
 363     [(123, 0, 'defline', 'STSS', 'filename'),
 364      (123, 5, 'defline', 'DEFA', 'filename'),
 365      (123, 10, 'defline', 'A', 'filename')]
 366
 367     """
 368     return [ (idno, m.start(), defline, m.group(), filename)
 369              for n, m in enumerate(AA_SEQUENCE.finditer(sequence)) ]
 370
 371
 372 def read_fasta_files(filenames):
 373     """Yield (defline, sequence, filename) tuples as read from FASTA files
 374     (uppercasing sequence)."""
 375     defline = None
 376     seqs = []
 377     for line in fileinput.input(filenames):
 378         line = line.strip()
 379         if line[:1] == '>':
 380             if defline != None:
 381                 yield (defline, ''.join(seqs), fileinput.filename())
 382             elif seqs:
 383                 fileerror("bad format: line precedes initial defline")
 384             defline = line[1:]
 385             seqs = []
 386         else:
 387             seqs.append(line.upper())
 388     if defline:
 389         yield (defline, ''.join(seqs), fileinput.filename())
 390
 391
 392 def read_spectra_slice(spectrum_fns, offset_indices, slice):
 393     s_l, s_u = slice
 394     assert 0 <= s_l <= s_u <= 1
 395
 396     total_spectra = sum(len(oi) for oi in offset_indices)
 397     sp_l = int(float(s_l) * total_spectra)
 398     sp_u = int(float(s_u) * total_spectra)
 399
 400     seeing_l = 0
 401     spectra = []
 402
 403     for f_no, (fn, oi) in enumerate(zip(spectrum_fns, offset_indices)):
 404         seeing_u = seeing_l + len(oi)
 405         f_l = max(sp_l, seeing_l)
 406         f_u = min(sp_u, seeing_u)
 407
 408         if f_l < f_u:
 409             b_l = oi[f_l - seeing_l]
 410             b_u = oi[f_u - seeing_l] if f_u - seeing_l < len(oi) else -1 # oo
 411
 412             with open(fn) as f:
 413                 fsp = cgreylag.spectrum.read_spectra_from_ms2(f, f_no,
 414                                                               b_l, b_u)
 415                 spectra.extend(fsp)
 416         seeing_l = seeing_u
 417
 418     return spectra
 419
 420
 421 # XML parameter file processing
 422
 423 # FIX: This parsing is way too complex.  How to simplify?
 424
 425 def mass_regime_part(part_specification):
 426     """Parse a single mass regime specification part.
 427
 428     >>> mass_regime_part('MONO')
 429     ('MONO', [])
 430     >>> mass_regime_part('AVG')
 431     ('AVG', [])
 432     >>> mass_regime_part('MONO(N15@87.5%)')
 433     ('MONO', [('N15', 0.875)])
 434
 435     """
 436     ps = [ x.strip() for x in part_specification.partition('(') ]
 437     if ps[0] not in ('MONO', 'AVG'):
 438         raise ValueError("invalid mass regime list specification"
 439                          " (regime id must be 'MONO' or 'AVG')")
 440     if not ps[1]:
 441         return (ps[0], [])
 442     if ps[2][-1] != ')':
 443         raise ValueError("invalid mass regime list specification"
 444                          " (expected ')')")
 445     pps = [ x.strip() for x in ps[2][:-1].split(',') ]
 446     if len(pps) > 1:
 447         raise ValueError("invalid mass regime list specification"
 448                          " (multiple isotopes not yet implemented)")
 449     ppps = [ x.strip() for x in pps[0].partition('@') ]
 450     if not ppps[1]:
 451         raise ValueError("invalid mass regime list specification"
 452                          " (expected '@')")
 453     if ppps[0] not in ('N15',):
 454         raise ValueError("invalid mass regime list specification"
 455                          " (isotope id must currently be 'N15')")
 456     if ppps[2][-1] != '%':
 457         raise ValueError("invalid mass regime list specification"
 458                          " (expected '%')")
 459     prevalence = float(ppps[2][:-1]) / 100
 460     if not (0 <= prevalence <= 1):
 461         raise ValueError("invalid mass regime list specification"
 462                          " (prevalence must be in range 0-100%)")
 463     return (ps[0], [(ppps[0], prevalence)])
 464
 465 def mass_regime_list(mass_regime_list_specification):
 466     """Check and return a list of regime tuples (parent_regime,
 467     fragment_regime), where each regime is a tuple (id, [(isotope_id,
 468     prevalence), ...]). Multiple isotopes (when implemented) would be
 469     comma-separated.
 470
 471     >>> pprint(mass_regime_list('AVG/MONO;MONO;MONO(N15@75%)'))
 472     [[('AVG', []), ('MONO', [])],
 473      [('MONO', []), ('MONO', [])],
 474      [('MONO', [('N15', 0.75)]), ('MONO', [('N15', 0.75)])]]
 475
 476     """
 477     result = []
 478     for regspec in mass_regime_list_specification.split(';'):
 479         halves = [ x.strip() for x in regspec.split('/') ]
 480         if len(halves) > 2:
 481             raise ValueError("invalid mass regime list specification"
 482                              " (too many '/'?)")
 483         pr = [ mass_regime_part(h) for h in halves ]
 484         if len(pr) == 1:
 485             pr = [ pr[0], pr[0] ]
 486         result.append(pr)
 487     debug("mass regime list:\n%s", pformat(result))
 488
 489     # The first fragmentation regime should generally be MONO, so that
 490     # formulaic deltas with '!' do the expected thing.
 491     if result[0][1] != ('MONO', []):
 492         raise ValueError("first fragmentation regime was something other than"
 493                          " 'MONO' with no isotopes--this is almost certainly"
 494                          " not what was intended")
 495     return result
 496
 497
 498 def parse_mod_term(s, is_potential=False):
 499     """Parse a modification term, returning a tuple (sign, mod, fixed_regime,
 500     residues, description).
 501
 502     >>> parse_mod_term('-C2H3ON!@C')
 503     (-1, 'C2H3ON', True, 'C', None)
 504     >>> parse_mod_term('42@STY phosphorylation', is_potential=True)
 505     (1, 42.0, False, 'STY', 'phosphorylation')
 506
 507     """
 508
 509     m = re.match(r'^\s*(-|\+)?(([1-9][0-9.]*)|([A-Z][A-Z0-9]*))(!?)'
 510                  r'@([A-Z]+|\[|\])(\s+([A-Za-z0-9_]+))?\s*$', s)
 511     if not m:
 512         raise ValueError("invalid modification term specification"
 513                          " '%s'" % s)
 514     mg = m.groups()
 515     invalid_residues = set(mg[5]) - set(RESIDUES_W_BRACKETS)
 516     if invalid_residues:
 517         raise ValueError("invalid modification list specification"
 518                          " (invalid residues %s)" % list(invalid_residues))
 519     delta = mg[1]
 520     if mg[2]:
 521         delta = float(mg[1])
 522         if is_potential and abs(delta) < 0.0001:
 523             raise ValueError("invalid modification list specification"
 524                              " (delta '%s' is too small)" % delta)
 525     residues = mg[5]
 526     if not is_potential and len(residues) != 1:
 527         raise ValueError("invalid modification list specification '%s' (only"
 528                          " potential modifications may have multiple residues)"
 529                          % residues)
 530     if len(residues) != len(set(residues)):
 531         raise ValueError("invalid modification list specification"
 532                          " '%s' (duplicate residues prohibited)"
 533                          % residues)
 534     return (mg[0] == '-' and -1 or 1, delta, mg[4] == '!', residues, mg[7])
 535
 536
 537 def fixed_mod_list(specification):
 538     """Check and return a list of modification tuples.
 539
 540     >>> fixed_mod_list('57@C')
 541     [(1, 57.0, False, 'C', None)]
 542     >>> fixed_mod_list('57@C,CH!@N desc')
 543     [(1, 57.0, False, 'C', None), (1, 'CH', True, 'N', 'desc')]
 544     >>> fixed_mod_list('57@C,58@C')
 545     Traceback (most recent call last):
 546         ...
 547     ValueError: invalid modification list specification '['C', 'C']' (duplicate residues prohibited)
 548
 549     """
 550     if not specification:
 551         return []
 552     result = [ parse_mod_term(s) for s in specification.split(',') ]
 553     residues = [ x[3] for x in result ]
 554     # this check is across terms; the one above is within terms
 555     if len(residues) != len(set(residues)):
 556         raise ValueError("invalid modification list specification"
 557                          " '%s' (duplicate residues prohibited)"
 558                          % residues)
 559     debug("fixed_mod_list:\n%s", pformat(result))
 560     return result
 561
 562
 563 def parse_mod_basic_expression(s):
 564     s = s.strip()
 565     if s[0] == '(':
 566         tree, rest =  parse_mod_disjunction(s[1:])
 567         rest = rest.lstrip()
 568         if rest[:1] != ')':
 569             raise ValueError("invalid modification list specification"
 570                              " (expected matching ')')")
 571         return tree, rest[1:]
 572     parts = re.split(r'([;,()])', s, 1)
 573     if len(parts) == 1:
 574         term, rest = s, ''
 575     else:
 576         assert len(parts) == 3
 577         term, rest = parts[0], parts[1]+parts[2]
 578     return parse_mod_term(term, is_potential=True), rest
 579
 580 def parse_mod_conjunction(s):
 581     result = []
 582     while s:
 583         tree, s = parse_mod_basic_expression(s)
 584         result.append(tree)
 585         s = s.lstrip()
 586         if s[:1] != ',':
 587             break
 588         s = s[1:]
 589     return result, s
 590
 591 def parse_mod_disjunction(s):
 592     result = []
 593     while s:
 594         tree, s = parse_mod_conjunction(s)
 595         result.append(tree)
 596         s = s.lstrip()
 597         if s[:1] != ';':
 598             break
 599         s = s[1:]
 600     return result, s
 601
 602 def potential_mod_list(specification):
 603     """Check and return a tree of potential modification tuples.  Nodes at
 604     even (odd) levels are disjunctions (conjunctions).  (The top list is a
 605     disjunction.)
 606
 607     >>> potential_mod_list('')
 608     []
 609     >>> potential_mod_list('PO3H@STY; C2H2O@KST')
 610     [[(1, 'PO3H', False, 'STY', None)], [(1, 'C2H2O', False, 'KST', None)]]
 611     >>> potential_mod_list('PO3H@STY, C2H2O@KST')
 612     [[(1, 'PO3H', False, 'STY', None), (1, 'C2H2O', False, 'KST', None)]]
 613     >>> pprint(potential_mod_list('''(PO3H@STY phosphorylation;
 614     ...                               C2H2O@KST acetylation;
 615     ...                               CH2@AKST methylation),
 616     ...                              O@M oxidation'''))
 617     [[[[(1, 'PO3H', False, 'STY', 'phosphorylation')],
 618        [(1, 'C2H2O', False, 'KST', 'acetylation')],
 619        [(1, 'CH2', False, 'AKST', 'methylation')]],
 620       (1, 'O', False, 'M', 'oxidation')]]
 621
 622     """
 623     if not specification:
 624         return []
 625     tree, remainder = parse_mod_disjunction(specification)
 626     if remainder:
 627         raise ValueError("invalid modification list specification"
 628                          " (unexpected '%s')" % remainder)
 629     debug("potential_mod_list:\n%s", pformat(tree))
 630     return tree
 631
 632
 633 def p_positive(x): return x > 0
 634 def p_negative(x): return x < 0
 635 def p_nonnegative(x): return x >= 0
 636 def p_proportion(x): return 0 <= x <= 1
 637
 638 # Configuration file parameter specification
 639 #
 640 # name -> (type, default, check_fn)
 641 # type may be bool, int, float, str, modlist, or a tuple of values
 642 # default, as a string value, or None if value must be explicitly specified
 643 # check_fn is an optional function that returns True iff the value is valid
 644
 645 PARAMETER_INFO = {
 646     "databases" : (str, None),
 647     "decoy_locus_prefix" : (str, "SHUFFLED_"),
 648     "mass_regimes" : (mass_regime_list, "MONO"),
 649     "pervasive_mods" : (fixed_mod_list, ""),
 650     "potential_mods" : (potential_mod_list, ""),
 651     "potential_mod_limit" : (int, 2, p_nonnegative),
 652     "charge_limit" : (int, 3, p_positive),
 653     "min_peptide_length" : (int, 5, p_positive), # needed?
 654     "min_parent_spectrum_mass" : (float, 0, p_nonnegative),
 655     "max_parent_spectrum_mass" : (float, 10000, p_nonnegative),
 656     "TIC_cutoff_proportion" : (float, 0.98, p_proportion),
 657     "parent_mz_tolerance" : (float, 1.25, p_nonnegative),
 658     "fragment_mass_tolerance" : (float, 0.5, p_nonnegative),
 659     "intensity_class_count" : (int, 3, p_positive),
 660     "intensity_class_ratio" : (float, 2.0, p_positive), # really > 1.0?
 661     "best_result_count" : (int, 5, p_positive),
 662     }
 663
 664
 665 def validate_parameters(parameters, parameter_info=PARAMETER_INFO):
 666     """Verify that parameters are valid, have valid values, and correspond to
 667     currently implemented functionality.  Values are converted, default values
 668     are filled in, and the resulting name/value dict returned.
 669
 670     >>> sorted(validate_parameters({'spectrum, total peaks' : '40'},
 671     ...                            {'spectrum, total peaks'
 672     ...                             : (int, '50', p_positive),
 673     ...                             'output, spectra'
 674     ...                             : (bool, 'no')}).items())
 675     [('output, spectra', False), ('spectrum, total peaks', 40)]
 676
 677     """
 678
 679     pmap = {}
 680     for p_name, p_info in sorted(parameter_info.items()):
 681         type_ = p_info[0]
 682         default = p_info[1]
 683         check_fn = len(p_info) > 2 and p_info[2] or None
 684
 685         v = parameters.get(p_name)
 686         if v == None:
 687             if default != None:
 688                 debug("parameter '%s' defaulting to '%s'", p_name, default)
 689                 v = default
 690             else:
 691                 error("missing required parameter '%s'" % p_name)
 692         if isinstance(type_, tuple):
 693             if not v in type_:
 694                 error("parameter '%s' value '%s' not in %s (feature not"
 695                       " implemented?)" % (p_name, v, type_))
 696         elif type_ == bool:
 697             v = { 'yes' : True, 'no' : False }.get(v)
 698             if v == None:
 699                 error("parameter '%s' requires a value of 'yes' or 'no'")
 700         else:
 701             try:
 702                 v = type_(v)
 703             except ValueError, e:
 704                 error("parameter '%s' has value '%s' with invalid format [%s]"
 705                       % (p_name, v, e))
 706         if check_fn and not check_fn(v):
 707             error("parameter '%s' has invalid value '%s' (or feature not"
 708                   " implemented)" % (p_name, v))
 709         pmap[p_name] = v
 710
 711     unknown_parameters = set(parameters) - set(parameter_info)
 712     if unknown_parameters:
 713         warning("%s unknown parameter(s):\n %s"
 714                 % (len(unknown_parameters),
 715                    pformat(sorted(list(unknown_parameters)))))
 716     return pmap
 717
 718
 719 def generate_mass_bands(band_count, mass_list):
 720     """Yield (n, mass_lb, mass_ub) for each mass band, where n ranges from 1
 721     to band_count.  To generate the bands, the mass list (which is assumed
 722     already sorted by mass) is evenly partitioned into bands with masses in
 723     the range [mass_lb, mass_ub).
 724
 725     >>> list(generate_mass_bands(1, [ float(x) for x in range(100) ]))
 726     [(1, 0.0, 100.0)]
 727     >>> list(generate_mass_bands(4, [ float(x) for x in range(100) ]))
 728     [(1, 0.0, 25.0), (2, 25.0, 50.0), (3, 50.0, 75.0), (4, 75.0, 100.0)]
 729     >>> list(generate_mass_bands(3, [ float(x) for x in range(100) ]))
 730     [(1, 0.0, 34.0), (2, 34.0, 68.0), (3, 68.0, 100.0)]
 731
 732     """
 733     assert band_count > 0 and mass_list
 734     assert sorted(mass_list) == list(mass_list)
 735     band_size = int(math.ceil(float(len(mass_list)) / band_count))
 736     lb = mass_list[0]
 737     for bn in range(1, band_count):
 738         i = min(bn*band_size, len(mass_list)-1)
 739         ub = mass_list[i]
 740         yield bn, lb, ub
 741         lb = ub
 742     # Since the upper bound is exclusive, we add a little slop to make sure
 743     # the last spectrum is included.
 744     yield band_count, lb, round(mass_list[-1]+1)
 745
 746
 747 def pythonize_swig_object(o, only_fields=None, skip_fields=[]):
 748     """Creates a pure Python copy of a SWIG object, so that it can be easily
 749     pickled, or printed (for debugging purposes).  Each SWIG object is
 750     pythonized as a dictionary, for convenience.  If provided, 'only_fields',
 751     limits the copy to the list of fields specified.  Otherwise,
 752     'skip_fields' if given is a list of methods not to include (this helps
 753     avoid infinite recursion).
 754
 755     >>> pprint(pythonize_swig_object(cgreylag.score_stats(3)))
 756     struct({'best_match': [[], [], []], 'best_score': [100.0, 100.0, 100.0], 'candidate_spectrum_count': 0, 'combinations_searched': 0, 'hyperscore_histogram': [[], [], []], 'improved_candidates': 0, 'second_best_score': [100.0, 100.0, 100.0], 'spectra_with_candidates': 0})
 757
 758     """
 759
 760     if isinstance(o, str):
 761         return o
 762     try:
 763         len(o)
 764     except TypeError:
 765         pass
 766     else:
 767         return list(pythonize_swig_object(x, only_fields, skip_fields)
 768                     for x in o)
 769     if hasattr(o, '__swig_getmethods__'):
 770         s = {}
 771         for a in o.__swig_getmethods__:
 772             if (only_fields != None and a in only_fields
 773                 or only_fields == None and a not in skip_fields):
 774                 s[a] = pythonize_swig_object(getattr(o, a), only_fields,
 775                                              skip_fields)
 776         return s
 777     return o
 778
 779 # A possible PCA (pyrrolidone carboxyl acid) modification accounts for
 780 # circularization of the peptide N-terminal.  PCA mods are excluded if a
 781 # static N-terminal mod has been specified.  Likewise, choosing a PCA mod will
 782 # exclude choosing a potential N-terminal mod.  (The PCA mod for 'C' is
 783 # dependent on a static mod of C+57 being in effect.)
 784
 785 def get_pca_table(mass_regimes):
 786     """Return a list of tuples (residues, parent delta, fragment delta)
 787     describing the PCA possibilities for each mass regime. Residues is a
 788     string specifying the residues this PCA delta applies to.
 789     """
 790     # FIX: According to Xtandem, C is only a candidate for PCA if
 791     # carboxyamidomethylated (C+57).  Currently we always search it.
 792     return [ [('', 0, 0),
 793               ('E', -1 * CP.parent_mass_regime[r].water_mass,
 794                -1 * CP.fragment_mass_regime[r].water_mass),
 795               ('QC', -1 * CP.parent_mass_regime[r].ammonia_mass,
 796                -1 * CP.fragment_mass_regime[r].ammonia_mass)]
 797              for r in range(len(mass_regimes)) ]
 798
 799
 800 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
 801     if not mod_tree:
 802         if 0 < len(conjuncts) <= limit:
 803             yield conjuncts
 804         return
 805     first, rest = mod_tree[0], mod_tree[1:]
 806     if isinstance(first, list):
 807         for x in enumerate_disjunction(first, limit):
 808             for y in enumerate_conjunction(rest, limit, conjuncts + x):
 809                 yield y
 810     else:
 811         for y in enumerate_conjunction(rest, limit, conjuncts):
 812             yield y
 813         for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
 814             yield y
 815
 816 def enumerate_disjunction(mod_tree, limit=sys.maxint):
 817     """Generates the conjuncts for mod_tree that are no longer than limit.
 818
 819     >>> list(enumerate_disjunction([['a'],['b'],['c']]))
 820     [[], ['a'], ['b'], ['c']]
 821     >>> list(enumerate_disjunction([[1,2,3]]))
 822     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
 823     >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
 824     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
 825     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
 826     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
 827     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
 828     [[]]
 829
 830     """
 831     assert isinstance(mod_tree, list)
 832     yield []
 833     for b in mod_tree:
 834         for s in enumerate_conjunction(b, limit):
 835             yield s
 836
 837 def get_mod_conjunct_triples(mod_tree, limit):
 838     """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
 839     (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
 840     ordered by increasing number of conjuncts, and with duplicates within and
 841     across removed.  A mass table is also appended to each conjunct.
 842     """
 843     # FIX: is there a more elegant way or place for all this?
 844     def enmass(t):
 845         def rmass(regime_index, par_frag, sign, delta, is_mono):
 846             global MASS_REGIME_ATOMIC_MASSES
 847             if isinstance(delta, str):
 848                 if is_mono:
 849                     regime_index = 0
 850                 return (sign * formula_mass(delta,
 851                            MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
 852             else:
 853                 return sign * delta
 854
 855         sign, delta, is_mono = t[:3]
 856         return t + (tuple((rmass(r, 0, sign, delta, is_mono),
 857                            rmass(r, 1, sign, delta, is_mono))
 858                           for r in range(len(XTP["mass_regimes"]))),)
 859
 860     def triple(c):
 861         Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
 862         Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
 863         assert len(Ns) <= 1 and len(Cs) <= 1
 864         rest = [ enmass(x) for x in c if x[3] not in '[]' ]
 865         rest = tuple(sorted(list(frozenset(rest))))
 866         return (Ns, Cs, rest)
 867
 868     return sorted(list(frozenset(triple(conjunct)
 869                                  for conjunct
 870                                  in enumerate_disjunction(mod_tree, limit))),
 871                   key=lambda x: (sum(len(y) for y in x), x))
 872
 873
 874 def gen_delta_bag_counts(i, remainder, bag):
 875     if i < 1:
 876         assert i == 0
 877         bag[0] = remainder
 878         yield tuple(bag)
 879         return
 880     for delta in range(1, remainder-i+1):
 881         bag[i] = delta
 882         for x in gen_delta_bag_counts(i-1, remainder-delta, bag):
 883             yield x
 884
 885 def generate_delta_bag_counts(mod_count, conjunct_length):
 886     """Generate all tuples of positive integers having length conjunct_length
 887     and sum mod_count.  As a special case, () is such a tuple having length 0
 888     and sum 0.
 889
 890     >>> for i in range(6): print i, list(generate_delta_bag_counts(4, i))
 891     ...
 892     0 []
 893     1 [(4,)]
 894     2 [(3, 1), (2, 2), (1, 3)]
 895     3 [(2, 1, 1), (1, 2, 1), (1, 1, 2)]
 896     4 [(1, 1, 1, 1)]
 897     5 []
 898     >>> list(generate_delta_bag_counts(0, 0))
 899     [()]
 900     >>> list(generate_delta_bag_counts(0, 1))
 901     []
 902
 903     """
 904     if conjunct_length == 0:
 905         return [()] if mod_count == 0 else []
 906     if mod_count < conjunct_length:
 907         return []
 908     return gen_delta_bag_counts(conjunct_length - 1, mod_count,
 909                                 [0] * conjunct_length)
 910
 911
 912 def set_context_conjuncts(context, mass_regime_index, N_cj, C_cj, R_cj):
 913     assert len(N_cj) < 1 and len(C_cj) < 1
 914     context.N_delta = 0
 915     if N_cj:
 916         context.N_delta = N_cj[0][5][mass_regime_index][1]
 917     context.C_delta = 0
 918     if C_cj:
 919         context.C_delta = C_cj[0][5][mass_regime_index][1]
 920     context.delta_bag_lookup.clear()
 921     context.delta_bag_lookup.resize(ord('Z')+1)
 922     context.delta_bag_delta.clear()
 923     for n, cj in enumerate(R_cj):
 924         context.delta_bag_delta.append(cj[5][mass_regime_index][1])
 925         for r in cj[3]:
 926             context.delta_bag_lookup[ord(r)] \
 927                 = context.delta_bag_lookup[ord(r)] + (n,)
 928
 929
 930 def search_all(options, context, score_statistics):
 931     """Search sequence database against searchable spectra."""
 932
 933     mod_limit = XTP["potential_mod_limit"]
 934
 935     mod_conjunct_triples = get_mod_conjunct_triples(XTP["potential_mods"],
 936                                                     mod_limit)
 937     info("%s unique potential modification conjuncts",
 938          len(mod_conjunct_triples))
 939     debug("mod_conjunct_triples (unique):\n%s", pformat(mod_conjunct_triples))
 940
 941     mass_regimes = XTP["mass_regimes"]
 942     pca_table = get_pca_table(mass_regimes)
 943     debug("pca_table: %s", pca_table)
 944
 945     total_combinations_searched = 0
 946
 947     for mod_count in range(mod_limit + 1):
 948         context.mod_count = mod_count
 949         for mr_index, (mass_regime, pca_entry) in enumerate(zip(mass_regimes,
 950                                                                 pca_table)):
 951             context.mass_regime_index = mr_index
 952             for pca_res, pca_parent_delta, pca_frag_delta in pca_entry:
 953                 context.pca_residues = pca_res
 954                 context.pca_delta = pca_frag_delta
 955                 for cji, (N_cj, C_cj, R_cj) in enumerate(mod_conjunct_triples):
 956                     if pca_res and N_cj:
 957                         continue    # mutually exclusive, for now
 958                     set_context_conjuncts(context, mr_index, N_cj, C_cj, R_cj)
 959                     debug("mod_count: %s", mod_count)
 960                     debug("cj_triple: N=%s C=%s R=%s", N_cj, C_cj, R_cj)
 961                     for delta_bag in generate_delta_bag_counts(mod_count,
 962                                                                len(R_cj)):
 963                         debug("delta_bag: %s", delta_bag)
 964
 965                         # this clear() avoids an SWIG/STL bug!?
 966                         context.delta_bag_count.clear()
 967                         context.delta_bag_count[:] = delta_bag
 968
 969                         pmrf = CP.parent_mass_regime[mr_index].fixed_residue_mass
 970                         fmrf = CP.fragment_mass_regime[mr_index].fixed_residue_mass
 971                         p_fx = (pmrf[ord('[')]
 972                                 + (N_cj and N_cj[0][5][mr_index][0] or 0)
 973                                 + pmrf[ord(']')]
 974                                 + (C_cj and C_cj[0][5][mr_index][0] or 0)
 975                                 + pca_parent_delta + PROTON_MASS)
 976                         context.parent_fixed_mass = p_fx
 977                         f_N_fx = (fmrf[ord('[')]
 978                                   + (N_cj and N_cj[0][5][mr_index][1] or 0)
 979                                   + pca_frag_delta)
 980                         context.fragment_N_fixed_mass = f_N_fx
 981                         f_C_fx = (fmrf[ord(']')]
 982                                   + (C_cj and C_cj[0][5][mr_index][1] or 0)
 983                                   + CP.fragment_mass_regime[mr_index].water_mass)
 984                         context.fragment_C_fixed_mass = f_C_fx
 985
 986                         info("MC=%s MR=%s PCA=%s CJ=%s DB=%s"
 987                              % (mod_count, mr_index, pca_res, cji, delta_bag))
 988                         debug("p_fx %s f_N_fx %s f_C_fx %s"
 989                               % (p_fx, f_N_fx, f_C_fx))
 990
 991                         score_statistics.combinations_searched = 0
 992                         cgreylag.spectrum.search_runs(context,
 993                                                       score_statistics)
 994                         total_combinations_searched \
 995                             += score_statistics.combinations_searched
 996                         info("  %s candidate spectra examined, this bag",
 997                              score_statistics.combinations_searched)
 998
 999     info('%s candidate spectra examined',
1000          score_statistics.candidate_spectrum_count)
1001
1002
1003 def get_prefix_sequence(begin_pos, run_offset, sequence):
1004     """
1005     >>> get_prefix_sequence(0, 0, 'abcdefghi')
1006     '['
1007     >>> get_prefix_sequence(1, 0, 'abcdefghi')
1008     '[a'
1009     >>> get_prefix_sequence(3, 0, 'abcdefghi')
1010     '[abc'
1011     >>> get_prefix_sequence(4, 0, 'abcdefghi')
1012     'abcd'
1013     >>> get_prefix_sequence(4, 2, 'abcdefghi')
1014     'cdef'
1015
1016     """
1017     prefix_start = run_offset + begin_pos - 4
1018     s = sequence[max(0, prefix_start):prefix_start+4]
1019     if len(s) < 4:
1020         s = '[' + s
1021     return s
1022
1023 def get_suffix_sequence(end_pos, run_offset, sequence):
1024     """
1025     >>> get_suffix_sequence(0, 0, 'abcdefghi')
1026     'abcd'
1027     >>> get_suffix_sequence(4, 0, 'abcdefghi')
1028     'efgh'
1029     >>> get_suffix_sequence(6, 0, 'abcdefghi')
1030     'ghi]'
1031     >>> get_suffix_sequence(8, 0, 'abcdefghi')
1032     'i]'
1033     >>> get_suffix_sequence(4, 2, 'abcdefghi')
1034     'ghi]'
1035
1036     """
1037     suffix_start = run_offset + end_pos
1038     s = sequence[suffix_start:suffix_start+4]
1039     if len(s) < 4:
1040         s = s + ']'
1041     return s
1042
1043
1044 def clean_string(v):
1045     """Strip and collapse internal whitespace.
1046
1047     >>> clean_string(' one   two ')
1048     'one two'
1049
1050     """
1051     if not v:
1052         return v
1053     return re.sub(r'[\s]+', ' ', v.strip())
1054
1055
1056 def clean_defline(s):
1057     """Return the given string with tabs replaced by spaces and control
1058     and non-ASCII characters removed, then stripped.
1059
1060     >>> tab=chr(9); clean_defline(' one' + tab + ' two three\001four ')
1061     'one  two threefour'
1062
1063     """
1064     return re.sub(r'[^ -~]', '', s.replace('\t', ' ')).strip()
1065
1066
1067 def abbrev_defline(s):
1068     """Return an abbreviated version of the defline--about 80 characters.
1069
1070     >>> abbrev_defline('words ' * 10)
1071     'words words words words words words words words words words '
1072     >>> abbrev_defline('words ' * 20)
1073     'words words words words words words words words words words words words words words...'
1074
1075     """
1076     ab = re.match(r'.{,80}\S{,170}', s).group(0)
1077     if len(ab) < len(s):
1078         ab += '...'
1079     return ab
1080
1081
1082 def zopen(filename, mode='r', compresslevel=None):
1083     """Open a filename as with 'open', but using compression if indicated by
1084     the filename suffix.  The compression level defaults to 1 for .gz files
1085     and 9 for .bz2 files (presumably bzip2 will only be used if size matters
1086     much more than compression time)."""
1087     if filename.endswith('.gz'):
1088         import gzip
1089         return gzip.GzipFile(filename, mode, compresslevel or 1)
1090     elif filename.endswith('.bz2'):
1091         import bz2
1092         return bz2.BZ2File(filename, mode, compresslevel or 9)
1093     else:
1094         return open(filename, mode)
1095
1096
1097 def main(args=sys.argv[1:]):
1098     parser = optparse.OptionParser(usage=
1099                                    "usage: %prog [options] <job-id>"
1100                                    " <parameter-file> <ms2-file>...",
1101                                    description=__doc__, version=__version__)
1102     pa = parser.add_option
1103     pa("-P", "--parameter", nargs=2, dest="parameters", action="append",
1104        default=[],
1105        help="override a parameter in <parameter-file>, may be used multiple"
1106        " times", metavar="NAME VALUE")
1107     pa("-w", "--work-slice", nargs=2, type="float", dest="work_slice",
1108        help="search a subinterval [L:U) of the work space"
1109        " (where 0 <= L <= U <= 1) in standalone mode", metavar="L U")
1110     pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
1111     pa("-p", "--show-progress", action="store_true", dest="show_progress",
1112        help="show running progress")
1113     pa("--estimate", action="store_true", dest="estimate_only",
1114        help="just estimate the time required for the search")
1115     pa("-v", "--verbose", action="store_true", dest="verbose",
1116        help="be verbose")
1117     pa("--copyright", action="store_true", dest="copyright",
1118        help="print copyright and exit")
1119     pa("--debug", action="store_true", dest="debug",
1120        help="output debugging info")
1121     pa("--profile", action="store_true", dest="profile",
1122        help="dump Python profiling output to './greylag.prof.<pid>'")
1123     (options, args) = parser.parse_args(args=args)
1124
1125     if options.copyright:
1126         print __copyright__
1127         sys.exit(0)
1128
1129     if len(args) < 3:
1130         parser.print_help()
1131         sys.exit(1)
1132
1133     job_id = args[0]
1134     parameter_fn = args[1]
1135     spectrum_fns = args[2:]
1136
1137     if (any(True for f in spectrum_fns if not f.endswith('.ms2'))
1138         or (options.work_slice
1139             and not (0 <= options.work_slice[0]
1140                      <= options.work_slice[1] <= 1))):
1141         parser.print_help()
1142         sys.exit(1)
1143
1144     log_level = logging.WARNING
1145     if options.quiet:
1146         log_level = logging.ERROR
1147     if options.verbose:
1148         log_level = logging.INFO
1149     if options.debug:
1150         log_level = logging.DEBUG
1151     logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
1152                         format='%(asctime)s %(levelname)s: %(message)s')
1153     info("starting on %s", gethostname())
1154
1155     # prevent format char problems
1156     if '%' in job_id:
1157         error("<job-id> may not contain '%'")
1158
1159     # check spectrum basename uniqueness, as corresponding sqt files will be
1160     # in a single directory
1161     base_spectrum_fns = [ os.path.basename(fn) for fn in spectrum_fns ]
1162     if len(base_spectrum_fns) != len(set(base_spectrum_fns)):
1163         error("base spectrum filenames must be unique")
1164
1165     # check -P names for validity
1166     bad_names = (set(n for n,v in options.parameters)
1167                  - set(PARAMETER_INFO.keys()))
1168     if bad_names:
1169         error("bad -P parameter names %s" % list(bad_names))
1170
1171     # read params
1172     cp = ConfigParser.RawConfigParser()
1173     cp.optionxform = str                # be case-sensitive
1174     with open(parameter_fn) as parameter_file:
1175         cp.readfp(parameter_file)
1176     if not cp.has_section('greylag'):
1177         error("%s has no [greylag] section" % parameter_fn)
1178     parameters = dict(cp.items('greylag'))
1179     parameters.update(dict(options.parameters)) # command-line override
1180     global XTP
1181     XTP = validate_parameters(parameters)
1182
1183     fixed_mod_map = dict((r[3], r) for r in XTP["pervasive_mods"])
1184     initialize_spectrum_parameters(options, XTP["mass_regimes"], fixed_mod_map)
1185
1186     # read sequence dbs
1187     # [(defline, seq, filename), ...]
1188     databases = XTP["databases"].split()
1189     fasta_db = list(read_fasta_files(databases))
1190     # [(idno, offset, defline, seq, seq_filename), ...]
1191     db = []
1192     for idno, (defline, sequence, filename) in enumerate(fasta_db):
1193         db.extend(split_sequence_into_aa_runs(idno, defline, sequence,
1194                                               filename))
1195     db_residue_count = sum(len(dbi[3]) for dbi in db)
1196
1197     # (idno, offset) -> (defline, run_seq, filename)
1198     db_info = dict(((idno, offset), (defline, seq, filename))
1199                    for idno, offset, defline, seq, filename in db)
1200
1201     info("read %s sequences (%s runs, %s residues)", len(fasta_db), len(db),
1202          db_residue_count)
1203     max_run_length = max(len(r[3]) for r in db)
1204     info("max run length is %s residues", max_run_length)
1205     if max_run_length > 2**31 - 1:
1206         error("runs longer than %s not yet supported", max_run_length)
1207     if not db:
1208         error("no database sequences")
1209
1210     # read spectrum offset indices
1211     spectrum_offset_indices = []
1212     for spfn in spectrum_fns:
1213         idxfn = spfn + '.idx'
1214         with contextlib.closing(gzip.open(idxfn)) as idxf:
1215             idx = cPickle.load(idxf)
1216             # try to verify that index matches spectrum file
1217             if idx['file size'] != os.path.getsize(spfn):
1218                 error("index '%s' needs rebuilding" % idxfn)
1219             spectrum_offset_indices.append(idx['offsets'])
1220
1221     # FIX: assume standalone mode (for now)
1222     assert options.work_slice
1223
1224     # read spectra per work slice
1225     spectra = read_spectra_slice(spectrum_fns, spectrum_offset_indices,
1226                                  options.work_slice)
1227     spectra.sort(key=lambda x: x.mass)
1228
1229     if not spectra:
1230         warning("no input spectra")
1231     else:
1232         info("read %s spectra (mass range %s - %s)", len(spectra),
1233              spectra[0].mass, spectra[-1].mass)
1234
1235     def peak_statistics(spectra):
1236         counts = [ len(sp.peaks) for sp in spectra ]
1237         counts.sort()
1238         n = len(counts)
1239         return (counts[0], counts[int(n*0.25)], counts[int(n*0.5)],
1240                 counts[int(n*0.75)], counts[-1], sum(counts) / float(n))
1241
1242     info("  peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1243          % peak_statistics(spectra))
1244
1245     # filter and normalize spectra
1246     for sp in spectra:
1247         sp.filter_peaks(XTP["TIC_cutoff_proportion"],
1248                         CP.parent_mass_tolerance_max)
1249         sp.classify(XTP["intensity_class_count"], XTP["intensity_class_ratio"],
1250                     XTP["fragment_mass_tolerance"])
1251
1252     min_psm = XTP["min_parent_spectrum_mass"]
1253     max_psm = XTP["max_parent_spectrum_mass"]
1254     # FIX: also filter by 1 + 2 + 4 rule?
1255     spectra = [ sp for sp in spectra
1256                 if len(sp.peaks) >= 10 and min_psm <= sp.mass <= max_psm ]
1257
1258     info("after filtering:")
1259     info("     %s spectra (mass range %s - %s)", len(spectra),
1260          spectra[0].mass, spectra[-1].mass)
1261     info("  peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1262          % peak_statistics(spectra))
1263
1264     cgreylag.spectrum.set_searchable_spectra(spectra)
1265     score_statistics = cgreylag.score_stats(len(spectra),
1266                                             XTP["best_result_count"])
1267
1268     if spectra:
1269         del spectra                     # release memory
1270
1271         # FIX!!!
1272         # (cleavage_re, position of cleavage in cleavage_re)
1273         cleavage_motif = "[X]|[X]"
1274         cleavage_pattern, cleavage_pos = cleavage_motif_re(cleavage_motif)
1275         if cleavage_pos == None:
1276             error("cleavage site '%s' is missing '|'",
1277                   XTP["protein, cleavage site"])
1278         cleavage_pattern = re.compile(cleavage_pattern)
1279
1280         context = cgreylag.search_context()
1281         for idno, offset, defline, seq, seq_filename in db:
1282             cp = []
1283             if cleavage_motif != "[X]|[X]":
1284                 cp = list(generate_cleavage_points(cleavage_pattern,
1285                                                    cleavage_pos, seq))
1286             locus_name = defline.split(None, 1)[0]
1287             sr = cgreylag.sequence_run(idno, offset, seq, cp, locus_name)
1288             context.sequence_runs.append(sr)
1289         context.maximum_missed_cleavage_sites = 1000000000 # FIX
1290
1291         info("searching")
1292         search_all(options, context, score_statistics)
1293     else:
1294         warning("no spectra after filtering--search skipped")
1295
1296     if options.estimate_only:
1297         print ("%.2f generic CPU hours"
1298                % (score_statistics.candidate_spectrum_count / 300.0e6))
1299         return
1300
1301     # FIX!
1302     info("writing result file")
1303     result_fn = 'test.result.gz'
1304     py_score_statistics = pythonize_swig_object(score_statistics)
1305
1306     # only put spectrum info that's actually needed into the result file
1307     spectrum_metadata_fields = set(['name', 'file_id', 'mass', 'charge',
1308                                     'total_ion_current', 'comparisons'])
1309     spectrum_metadata = \
1310         pythonize_swig_object(cgreylag.cvar.spectrum_searchable_spectra,
1311                               only_fields=spectrum_metadata_fields)
1312
1313     with contextlib.closing(zopen(result_fn, 'w')) as result_file:
1314         cPickle.dump({ 'matches' : py_score_statistics,
1315                        'spectra' : spectrum_metadata,
1316                        'spectrum files' : base_spectrum_fns,
1317                        'databases' : databases,
1318                        'parameters' : XTP,
1319                        'mass regime atomic masses' : MASS_REGIME_ATOMIC_MASSES,
1320                        'proton mass' : PROTON_MASS,
1321                        'argv' : sys.argv },
1322                      result_file,
1323                      cPickle.HIGHEST_PROTOCOL)
1324     info("finished, result file written to '%s'", result_fn)
1325
1326
1327 if __name__ == '__main__':
1328 #     try:
1329 #         import psyco
1330 #         psyco.full()
1331 #         ###psyco.bind(formula_mass)
1332 #         warning('psyco enabled')
1333 #     except ImportError:
1334 #         pass
1335
1336     try:
1337         if '--profile' in sys.argv:
1338             import cProfile
1339             import pstats
1340             report_fn = "greylag.prof.%s" % os.getpid()
1341             data_fn = report_fn + ".tmp"
1342             prof = cProfile.run('main()', data_fn)
1343             with open(report_fn, 'w') as report_f:
1344                 try:
1345                     stats = pstats.Stats(data_fn, stream=report_f)
1346                     stats.strip_dirs()
1347                     stats.sort_stats('cumulative')
1348                     stats.print_stats(50)
1349                     stats.sort_stats('time')
1350                     stats.print_stats(50)
1351                     print "# profile report written to '%s'" % report_fn
1352                 finally:
1353                     try:
1354                         os.remove(data_fn)
1355                     except:
1356                         pass
1357         else:
1358             main()
1359     except SystemExit:
1360         raise
1361     except:
1362         logging.exception("unhandled exception")
1363         sys.exit(1)
1364     finally:
1365         logging.shutdown()
1366
1367
1368 # FIXES:
1369 # - need to rigorously check for bad input in any file (fasta, spectrum, xml)