greylag_grind.py

   1 #!/usr/bin/env python2.5
   2
   3 '''This program does the actual work to search mass spectra against a sequence
   4 database.  <job-id> is a unique identifier used as a prefix for work and
   5 output directories.  The <configuration-file> contains program options.  The
   6 spectra are in <ms2-file>s; greylag-index-spectra must already have been run
   7 on them.
   8
   9 This program can operate in standalone mode (useful for testing) or command
  10 mode.  If the --work-slice option is given, that slice will be searched in
  11 standalone mode, then the program will exit.  Otherwise, the program will
  12 process commands until an exit command is received.
  13
  14 '''
  15
  16 from __future__ import with_statement
  17
  18 __copyright__ = '''
  19     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  20
  21     This program is free software; you can redistribute it and/or modify
  22     it under the terms of the GNU General Public License as published by
  23     the Free Software Foundation; either version 2 of the License, or
  24     (at your option) any later version.
  25
  26     This program is distributed in the hope that it will be useful,
  27     but WITHOUT ANY WARRANTY; without even the implied warranty of
  28     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  29     GNU General Public License for more details.
  30
  31     You should have received a copy of the GNU General Public License along
  32     with this program; if not, write to the Free Software Foundation, Inc.,
  33     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  34 '''
  35
  36 ##############################################################################
  37 ##    "Simplicity is prerequisite for reliability" - Edsger W. Dijkstra     ##
  38 ##############################################################################
  39
  40
  41 __version__ = "0.0"
  42
  43
  44 import ConfigParser
  45 import contextlib
  46 import cPickle
  47 import fileinput
  48 import gzip
  49 import logging
  50 from logging import debug, info, warning
  51 import math
  52 import optparse
  53 import os
  54 from pprint import pprint, pformat
  55 import re
  56 from socket import gethostname
  57 import sys
  58
  59 import cgreylag
  60
  61
  62 # Try to drop dead immediately on SIGINT (control-C), instead of normal Python
  63 # KeyboardInterrupt processing, since we may spend long periods of time in
  64 # uninterruptible C++ calls.  Also die immediately on SIGPIPE.
  65 try:
  66     import signal
  67     signal.signal(signal.SIGINT, signal.SIG_DFL)
  68     signal.signal(signal.SIGPIPE, signal.SIG_DFL)
  69 except:
  70     pass
  71
  72
  73 def error(s, *args):
  74     "fatal error"
  75     logging.error(s, *args)
  76     if __name__ != "__main__":
  77         raise Exception("(fatal error, but unit testing, so not exiting)")
  78     sys.exit(1)
  79
  80
  81 def fileerror(s, *args):
  82     error(s + (", at line %s of file '%s'"
  83                % (fileinput.filelineno(), fileinput.filename())),
  84           *args)
  85
  86
  87 # name -> value map of processed XML input parameters
  88 XTP = {}
  89
  90 # handle to the singleton parameter object shared with the C++ module
  91 CP = cgreylag.cvar.parameters_the
  92
  93 # FIX: is this mono or avg?  (possible error here is ~0.0007 amu)
  94 PROTON_MASS =   1.007276
  95 ELECTRON_MASS = 0.000549                # ?
  96
  97 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
  98 MONOISOTOPIC_ATOMIC_MASS = {
  99     'H' :  1.00782503214,
 100     'C' : 12.00000000,
 101     'N' : 14.00307400529,
 102     'O' : 15.994914622115,
 103     'P' : 30.9737615120,
 104     'S' : 31.9720706912,
 105     }
 106
 107 # most prevalent only (1 in 1000)
 108 ISOTOPIC_ATOMIC_MASS = {                # prevalence (in %)
 109     'C13' : 13.003354837810,            # 1.078
 110     'N15' : 15.00010889849,             # 0.3687
 111     'O18' : 17.99916049,                # 0.20514
 112     'S33' : 32.9714585012,              # 0.762
 113     'S34' : 33.9678668311,              # 4.2928
 114     }
 115
 116 AVERAGE_ATOMIC_MASS = {
 117     'H' :  1.007947,
 118     'C' : 12.01078,
 119     'N' : 14.00672,
 120     'O' : 15.99943,
 121     'P' : 30.9737612,
 122     'S' : 32.0655,
 123     }
 124
 125 # The xtandem average residue masses are about 0.002 amu higher than those
 126 # calculated directly from the above average atomic masses.  None of the
 127 # chemists consulted knew of any reason why, aside from lack of precision in
 128 # the average atomic mass estimates.  This shouldn't matter very much, as
 129 # fragmentation calculations should all be monoisotopic, and we can always
 130 # widen the parent tolerance window a bit.
 131
 132
 133 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
 134     """Return the mass of formula, using the given mass regime (monoisotopic
 135     by default).
 136
 137     >>> formula_mass('H2O', { 'H':1, 'O':16 })
 138     18
 139     >>> # monoisotopic mass of glycine
 140     >>> str(round(formula_mass('C2H3ON'), 4))
 141     '57.0215'
 142
 143     """
 144     parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
 145     # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
 146     return sum(atomic_mass[parts[i]] * int(parts[i+1])
 147                for i in range(0, len(parts), 2))
 148
 149 # FIX: selenocysteine (U), etc
 150 # residue -> formula
 151 RESIDUE_FORMULA = {
 152     'A' : "C3H5ON",
 153     'C' : "C3H5ONS",
 154     'D' : "C4H5O3N",
 155     'E' : "C5H7O3N",
 156     'F' : "C9H9ON",
 157     'G' : "C2H3ON",
 158     'H' : "C6H7ON3",
 159     'I' : "C6H11ON",
 160     'K' : "C6H12ON2",
 161     'L' : "C6H11ON",
 162     'M' : "C5H9ONS",
 163     'N' : "C4H6O2N2",
 164     'P' : "C5H7ON",
 165     'Q' : "C5H8O2N2",
 166     'R' : "C6H12ON4",
 167     'S' : "C3H5O2N",
 168     'T' : "C4H7O2N",
 169     'V' : "C5H9ON",
 170     'W' : "C11H10ON2",
 171     'Y' : "C9H9O2N",
 172     }
 173
 174 RESIDUES = sorted(RESIDUE_FORMULA.keys())
 175 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
 176
 177
 178 # accessed as a global variable (FIX?)
 179 # [0][1] -> 'H' -> fragment mass of H for regime 0
 180 MASS_REGIME_ATOMIC_MASSES = []
 181
 182
 183 def mass_regime_atomic_masses(spec):
 184     """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
 185     names to masses.
 186     """
 187     name, isotopes = spec
 188     assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
 189     if name == 'MONO':
 190         r = MONOISOTOPIC_ATOMIC_MASS.copy()
 191     else:
 192         r = AVERAGE_ATOMIC_MASS.copy()
 193     if isotopes:
 194         iname, prevalence = isotopes[0]
 195         assert iname == 'N15' and 0 <= prevalence <= 1
 196         # this is a simplification, but additional accuracy pointless?
 197         if name == 'MONO':
 198             r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
 199         else:
 200             r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
 201     return r
 202
 203
 204 def initialize_spectrum_parameters(options, mass_regimes, fixed_mod_map):
 205     """Initialize parameters known to the spectrum module.
 206     fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
 207     """
 208
 209     debug('fixed_mod_map: %s', fixed_mod_map)
 210     # This is the size of vectors that are indexed by residues (A-Z) or
 211     # special characters ('[]').
 212     RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
 213
 214     # These are currently monoisotopic.  (deuterium pointless?)
 215     CP.proton_mass = PROTON_MASS
 216     CP.hydrogen_mass = formula_mass("H")
 217
 218     global MASS_REGIME_ATOMIC_MASSES
 219     for regime_pair in mass_regimes:
 220         assert len(regime_pair) == 2    # parent and fragment
 221         info('mass regime: %s', regime_pair)
 222         MASS_REGIME_ATOMIC_MASSES.append([])
 223         for n, regime in enumerate(regime_pair):
 224             atmass = mass_regime_atomic_masses(regime)
 225             MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
 226             creg = cgreylag.mass_regime_parameters()
 227
 228             creg.hydroxyl_mass = formula_mass("OH", atmass)
 229             creg.water_mass = formula_mass("H2O", atmass)
 230             creg.ammonia_mass = formula_mass("NH3", atmass)
 231
 232             creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
 233
 234             for r in RESIDUES_W_BRACKETS:
 235                 m = 0
 236                 if r in RESIDUES:
 237                     m = formula_mass(RESIDUE_FORMULA[r], atmass)
 238                 rmod = fixed_mod_map.get(r)
 239                 if rmod:
 240                     if isinstance(rmod[1], str):
 241                         if rmod[2]:
 242                             m += rmod[0] * formula_mass(rmod[1])
 243                         else:
 244                             m += rmod[0] * formula_mass(rmod[1], atmass)
 245                     else:
 246                         m += rmod[0] * rmod[1]
 247                 creg.fixed_residue_mass[ord(r)] = m
 248             # assuming these are monoisotopic (not regime)
 249             creg.fixed_residue_mass[ord('[')] += formula_mass("H")
 250             creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
 251             if not n:
 252                 CP.parent_mass_regime.append(creg);
 253             else:
 254                 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
 255                 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
 256                 CP.fragment_mass_regime.append(creg);
 257     for r in RESIDUES_W_BRACKETS:
 258         info('fixed_mass %s: %s', r,
 259              [ "%.6f/%.6f"
 260                % (CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
 261                   CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
 262                for rn in range(len(mass_regimes)) ])
 263     for r in RESIDUES:
 264         for rn in range(len(mass_regimes)):
 265             # physically impossible (and results would be garbage)
 266             if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 267                 raise ValueError('bogus parent mass specification for %s' % r)
 268             if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 269                 raise ValueError('bogus parent mass specification for %s' % r)
 270
 271     CP.parent_mass_tolerance_1 = XTP["parent_mz_tolerance"]
 272     CP.parent_mass_tolerance_max = (XTP["parent_mz_tolerance"]
 273                                     * XTP["charge_limit"])
 274
 275     CP.fragment_mass_tolerance = XTP["fragment_mass_tolerance"]
 276     CP.intensity_class_count = XTP["intensity_class_count"]
 277
 278     CP.minimum_peptide_length = XTP["min_peptide_length"]
 279
 280     # CP.ln_factorial[n] == ln(n!)
 281     CP.ln_factorial.resize(int(XTP["max_parent_spectrum_mass"]
 282                                / XTP["fragment_mass_tolerance"] + 100), 0.0)
 283     for n in range(2, len(CP.ln_factorial)):
 284         CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
 285
 286     CP.estimate_only = bool(options.estimate_only)
 287     CP.show_progress = bool(options.show_progress)
 288
 289
 290 def cleavage_motif_re(motif):
 291     """Return (regexp, pos), where regexp is a regular expression that will
 292     match a cleavage motif, and pos is the position of the cleavage with
 293     respect to the match (or None).  (The RE actually matches one character,
 294     the rest matching as lookahead, so that re.finditer will find all
 295     overlapping matches.)
 296
 297     >>> cleavage_motif_re('[KR]|{P}')
 298     ('[KR](?=[^P])', 1)
 299     >>> cleavage_motif_re('[X]|[X]')
 300     ('.(?=.)', 1)
 301
 302     """
 303     cleavage_pos = None
 304     re_parts = []
 305     motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
 306     parts = [ p[0] for p in motif_re.findall(motif) ]
 307     if ''.join(parts) != motif:
 308         raise ValueError('invalid cleavage motif pattern')
 309     i = 0
 310     for part in parts:
 311         if part == '|':
 312             if cleavage_pos != None:
 313                 raise ValueError("invalid cleavage motif pattern"
 314                                  " (multiple '|'s)")
 315             cleavage_pos = i
 316             continue
 317         if part == '[X]':
 318             re_parts.append('.')
 319         elif part[0] == '[':
 320             re_parts.append('[%s]' % part[1:-1])
 321         elif part[0] == '{':
 322             re_parts.append('[^%s]' % part[1:-1])
 323         else:
 324             assert False, "unknown cleavage motif syntax"
 325         i += 1
 326     if len(re_parts) == 0:
 327         re_pattern = '.'
 328     elif len(re_parts) == 1:
 329         re_pattern = re_parts[0]
 330     else:
 331         re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
 332     return (re_pattern, cleavage_pos)
 333
 334
 335 def generate_cleavage_points(cleavage_re, cleavage_pos, sequence):
 336     """Yields the offsets of the cleavages in sequence.  The endpoints are
 337     always included, by convention.
 338
 339     >>> list(generate_cleavage_points(re.compile('[KR](?=[^P])'), 1,
 340     ...                               'ARBCKDEKPF'))
 341     [0, 2, 5, 10]
 342
 343     """
 344     yield 0
 345     for m in cleavage_re.finditer(sequence):
 346         p = m.start() + cleavage_pos
 347         if 0 < p < len(sequence):
 348             yield p
 349     yield len(sequence)
 350
 351
 352 AA_SEQUENCE = re.compile(r'[ARNDCQEGHILKMFPSTWYV]+')
 353
 354 def split_sequence_into_aa_runs(idno, locusname, defline, sequence, filename):
 355     """Returns a tuple (idno, start, locusname, defline, seq, filename) for
 356     each contiguous run of residues in sequence, where 'start' is the position
 357     of 'seq' in 'sequence'.
 358
 359     >>> pprint(split_sequence_into_aa_runs(123, 'ln', 'ln defline',
 360     ...                                    'STSS*DEFABA', 'filename'))
 361     [(123, 0, 'ln', 'ln defline', 'STSS', 'filename'),
 362      (123, 5, 'ln', 'ln defline', 'DEFA', 'filename'),
 363      (123, 10, 'ln', 'ln defline', 'A', 'filename')]
 364
 365     """
 366     return [ (idno, m.start(), locusname, defline, m.group(), filename)
 367              for n, m in enumerate(AA_SEQUENCE.finditer(sequence)) ]
 368
 369
 370 def read_fasta_files(filenames):
 371     """Yield (locusname, defline, sequence, filename) tuples as read from
 372     FASTA files (uppercasing sequence)."""
 373
 374     loci_seen = set()
 375     locusname, defline = None, None
 376     seqs = []
 377     for line in fileinput.input(filenames):
 378         line = line.strip()
 379         if line[:1] == '>':
 380             if defline != None:
 381                 yield (locusname, defline, ''.join(seqs), fileinput.filename())
 382             elif seqs:
 383                 fileerror("bad format: line precedes initial defline")
 384             defline = line[1:]
 385             locusname = defline.split(None, 1)[0]
 386             if locusname in loci_seen:
 387                 error("locus name '%s' is not unique in the search database(s)"
 388                       % locusname)
 389             loci_seen.add(locusname)
 390             seqs = []
 391         else:
 392             seqs.append(line.upper())
 393     if defline:
 394         yield (locusname, defline, ''.join(seqs), fileinput.filename())
 395
 396
 397 def read_spectra_slice(spectrum_fns, offset_indices, slice):
 398     s_l, s_u = slice
 399     assert 0 <= s_l <= s_u <= 1
 400
 401     total_spectra = sum(len(oi) for oi in offset_indices)
 402     sp_l = int(float(s_l) * total_spectra)
 403     sp_u = int(float(s_u) * total_spectra)
 404
 405     seeing_l = 0
 406     spectra = []
 407
 408     for f_no, (fn, oi) in enumerate(zip(spectrum_fns, offset_indices)):
 409         seeing_u = seeing_l + len(oi)
 410         f_l = max(sp_l, seeing_l)
 411         f_u = min(sp_u, seeing_u)
 412
 413         if f_l < f_u:
 414             b_l = oi[f_l - seeing_l]
 415             b_u = oi[f_u - seeing_l] if f_u - seeing_l < len(oi) else -1 # oo
 416
 417             with open(fn) as f:
 418                 fsp = cgreylag.spectrum.read_spectra_from_ms2(f, f_no,
 419                                                               b_l, b_u)
 420                 spectra.extend(fsp)
 421         seeing_l = seeing_u
 422
 423     return spectra
 424
 425
 426 # XML parameter file processing
 427
 428 # FIX: This parsing is way too complex.  How to simplify?
 429
 430 def mass_regime_part(part_specification):
 431     """Parse a single mass regime specification part.
 432
 433     >>> mass_regime_part('MONO')
 434     ('MONO', [])
 435     >>> mass_regime_part('AVG')
 436     ('AVG', [])
 437     >>> mass_regime_part('MONO(N15@87.5%)')
 438     ('MONO', [('N15', 0.875)])
 439
 440     """
 441     ps = [ x.strip() for x in part_specification.partition('(') ]
 442     if ps[0] not in ('MONO', 'AVG'):
 443         raise ValueError("invalid mass regime list specification"
 444                          " (regime id must be 'MONO' or 'AVG')")
 445     if not ps[1]:
 446         return (ps[0], [])
 447     if ps[2][-1] != ')':
 448         raise ValueError("invalid mass regime list specification"
 449                          " (expected ')')")
 450     pps = [ x.strip() for x in ps[2][:-1].split(',') ]
 451     if len(pps) > 1:
 452         raise ValueError("invalid mass regime list specification"
 453                          " (multiple isotopes not yet implemented)")
 454     ppps = [ x.strip() for x in pps[0].partition('@') ]
 455     if not ppps[1]:
 456         raise ValueError("invalid mass regime list specification"
 457                          " (expected '@')")
 458     if ppps[0] not in ('N15',):
 459         raise ValueError("invalid mass regime list specification"
 460                          " (isotope id must currently be 'N15')")
 461     if ppps[2][-1] != '%':
 462         raise ValueError("invalid mass regime list specification"
 463                          " (expected '%')")
 464     prevalence = float(ppps[2][:-1]) / 100
 465     if not (0 <= prevalence <= 1):
 466         raise ValueError("invalid mass regime list specification"
 467                          " (prevalence must be in range 0-100%)")
 468     return (ps[0], [(ppps[0], prevalence)])
 469
 470 def mass_regime_list(mass_regime_list_specification):
 471     """Check and return a list of regime tuples (parent_regime,
 472     fragment_regime), where each regime is a tuple (id, [(isotope_id,
 473     prevalence), ...]). Multiple isotopes (when implemented) would be
 474     comma-separated.
 475
 476     >>> pprint(mass_regime_list('AVG/MONO;MONO;MONO(N15@75%)'))
 477     [[('AVG', []), ('MONO', [])],
 478      [('MONO', []), ('MONO', [])],
 479      [('MONO', [('N15', 0.75)]), ('MONO', [('N15', 0.75)])]]
 480
 481     """
 482     result = []
 483     for regspec in mass_regime_list_specification.split(';'):
 484         halves = [ x.strip() for x in regspec.split('/') ]
 485         if len(halves) > 2:
 486             raise ValueError("invalid mass regime list specification"
 487                              " (too many '/'?)")
 488         pr = [ mass_regime_part(h) for h in halves ]
 489         if len(pr) == 1:
 490             pr = [ pr[0], pr[0] ]
 491         result.append(pr)
 492     debug("mass regime list:\n%s", pformat(result))
 493
 494     # The first fragmentation regime should generally be MONO, so that
 495     # formulaic deltas with '!' do the expected thing.
 496     if result[0][1] != ('MONO', []):
 497         raise ValueError("first fragmentation regime was something other than"
 498                          " 'MONO' with no isotopes--this is almost certainly"
 499                          " not what was intended")
 500     return result
 501
 502
 503 def parse_mod_term(s, is_potential=False):
 504     """Parse a modification term, returning a tuple (sign, mod, fixed_regime,
 505     residues, description).
 506
 507     >>> parse_mod_term('-C2H3ON!@C')
 508     (-1, 'C2H3ON', True, 'C', None)
 509     >>> parse_mod_term('42@STY phosphorylation', is_potential=True)
 510     (1, 42.0, False, 'STY', 'phosphorylation')
 511
 512     """
 513
 514     m = re.match(r'^\s*(-|\+)?(([1-9][0-9.]*)|([A-Z][A-Z0-9]*))(!?)'
 515                  r'@([A-Z]+|\[|\])(\s+([A-Za-z0-9_]+))?\s*$', s)
 516     if not m:
 517         raise ValueError("invalid modification term specification"
 518                          " '%s'" % s)
 519     mg = m.groups()
 520     invalid_residues = set(mg[5]) - set(RESIDUES_W_BRACKETS)
 521     if invalid_residues:
 522         raise ValueError("invalid modification list specification"
 523                          " (invalid residues %s)" % list(invalid_residues))
 524     delta = mg[1]
 525     if mg[2]:
 526         delta = float(mg[1])
 527         if is_potential and abs(delta) < 0.0001:
 528             raise ValueError("invalid modification list specification"
 529                              " (delta '%s' is too small)" % delta)
 530     residues = mg[5]
 531     if not is_potential and len(residues) != 1:
 532         raise ValueError("invalid modification list specification '%s' (only"
 533                          " potential modifications may have multiple residues)"
 534                          % residues)
 535     if len(residues) != len(set(residues)):
 536         raise ValueError("invalid modification list specification"
 537                          " '%s' (duplicate residues prohibited)"
 538                          % residues)
 539     return (mg[0] == '-' and -1 or 1, delta, mg[4] == '!', residues, mg[7])
 540
 541
 542 def fixed_mod_list(specification):
 543     """Check and return a list of modification tuples.
 544
 545     >>> fixed_mod_list('57@C')
 546     [(1, 57.0, False, 'C', None)]
 547     >>> fixed_mod_list('57@C,CH!@N desc')
 548     [(1, 57.0, False, 'C', None), (1, 'CH', True, 'N', 'desc')]
 549     >>> fixed_mod_list('57@C,58@C')
 550     Traceback (most recent call last):
 551         ...
 552     ValueError: invalid modification list specification '['C', 'C']' (duplicate residues prohibited)
 553
 554     """
 555     if not specification:
 556         return []
 557     result = [ parse_mod_term(s) for s in specification.split(',') ]
 558     residues = [ x[3] for x in result ]
 559     # this check is across terms; the one above is within terms
 560     if len(residues) != len(set(residues)):
 561         raise ValueError("invalid modification list specification"
 562                          " '%s' (duplicate residues prohibited)"
 563                          % residues)
 564     debug("fixed_mod_list:\n%s", pformat(result))
 565     return result
 566
 567
 568 def parse_mod_basic_expression(s):
 569     s = s.strip()
 570     if s[0] == '(':
 571         tree, rest =  parse_mod_disjunction(s[1:])
 572         rest = rest.lstrip()
 573         if rest[:1] != ')':
 574             raise ValueError("invalid modification list specification"
 575                              " (expected matching ')')")
 576         return tree, rest[1:]
 577     parts = re.split(r'([;,()])', s, 1)
 578     if len(parts) == 1:
 579         term, rest = s, ''
 580     else:
 581         assert len(parts) == 3
 582         term, rest = parts[0], parts[1]+parts[2]
 583     return parse_mod_term(term, is_potential=True), rest
 584
 585 def parse_mod_conjunction(s):
 586     result = []
 587     while s:
 588         tree, s = parse_mod_basic_expression(s)
 589         result.append(tree)
 590         s = s.lstrip()
 591         if s[:1] != ',':
 592             break
 593         s = s[1:]
 594     return result, s
 595
 596 def parse_mod_disjunction(s):
 597     result = []
 598     while s:
 599         tree, s = parse_mod_conjunction(s)
 600         result.append(tree)
 601         s = s.lstrip()
 602         if s[:1] != ';':
 603             break
 604         s = s[1:]
 605     return result, s
 606
 607 def potential_mod_list(specification):
 608     """Check and return a tree of potential modification tuples.  Nodes at
 609     even (odd) levels are disjunctions (conjunctions).  (The top list is a
 610     disjunction.)
 611
 612     >>> potential_mod_list('')
 613     []
 614     >>> potential_mod_list('PO3H@STY; C2H2O@KST')
 615     [[(1, 'PO3H', False, 'STY', None)], [(1, 'C2H2O', False, 'KST', None)]]
 616     >>> potential_mod_list('PO3H@STY, C2H2O@KST')
 617     [[(1, 'PO3H', False, 'STY', None), (1, 'C2H2O', False, 'KST', None)]]
 618     >>> pprint(potential_mod_list('''(PO3H@STY phosphorylation;
 619     ...                               C2H2O@KST acetylation;
 620     ...                               CH2@AKST methylation),
 621     ...                              O@M oxidation'''))
 622     [[[[(1, 'PO3H', False, 'STY', 'phosphorylation')],
 623        [(1, 'C2H2O', False, 'KST', 'acetylation')],
 624        [(1, 'CH2', False, 'AKST', 'methylation')]],
 625       (1, 'O', False, 'M', 'oxidation')]]
 626
 627     """
 628     if not specification:
 629         return []
 630     tree, remainder = parse_mod_disjunction(specification)
 631     if remainder:
 632         raise ValueError("invalid modification list specification"
 633                          " (unexpected '%s')" % remainder)
 634     debug("potential_mod_list:\n%s", pformat(tree))
 635     return tree
 636
 637
 638 def p_positive(x): return x > 0
 639 def p_negative(x): return x < 0
 640 def p_nonnegative(x): return x >= 0
 641 def p_proportion(x): return 0 <= x <= 1
 642
 643 # Configuration file parameter specification
 644 #
 645 # name -> (type, default, check_fn)
 646 # type may be bool, int, float, str, modlist, or a tuple of values
 647 # default, as a string value, or None if value must be explicitly specified
 648 # check_fn is an optional function that returns True iff the value is valid
 649
 650 PARAMETER_INFO = {
 651     "databases" : (str, None),
 652     "decoy_locus_prefix" : (str, "SHUFFLED_"),
 653     "mass_regimes" : (mass_regime_list, "MONO"),
 654     "pervasive_mods" : (fixed_mod_list, ""),
 655     "potential_mods" : (potential_mod_list, ""),
 656     "potential_mod_limit" : (int, 2, p_nonnegative),
 657     "charge_limit" : (int, 3, p_positive),
 658     "min_peptide_length" : (int, 5, p_positive), # needed?
 659     "min_parent_spectrum_mass" : (float, 0, p_nonnegative),
 660     "max_parent_spectrum_mass" : (float, 10000, p_nonnegative),
 661     "TIC_cutoff_proportion" : (float, 0.98, p_proportion),
 662     "parent_mz_tolerance" : (float, 1.25, p_nonnegative),
 663     "fragment_mass_tolerance" : (float, 0.5, p_nonnegative),
 664     "intensity_class_count" : (int, 3, p_positive),
 665     "intensity_class_ratio" : (float, 2.0, p_positive), # really > 1.0?
 666     "best_result_count" : (int, 5, p_positive),
 667     }
 668
 669
 670 def validate_parameters(parameters, parameter_info=PARAMETER_INFO):
 671     """Verify that parameters are valid, have valid values, and correspond to
 672     currently implemented functionality.  Values are converted, default values
 673     are filled in, and the resulting name/value dict returned.
 674
 675     >>> sorted(validate_parameters({'spectrum, total peaks' : '40'},
 676     ...                            {'spectrum, total peaks'
 677     ...                             : (int, '50', p_positive),
 678     ...                             'output, spectra'
 679     ...                             : (bool, 'no')}).items())
 680     [('output, spectra', False), ('spectrum, total peaks', 40)]
 681
 682     """
 683
 684     pmap = {}
 685     for p_name, p_info in sorted(parameter_info.items()):
 686         type_ = p_info[0]
 687         default = p_info[1]
 688         check_fn = len(p_info) > 2 and p_info[2] or None
 689
 690         v = parameters.get(p_name)
 691         if v == None:
 692             if default != None:
 693                 debug("parameter '%s' defaulting to '%s'", p_name, default)
 694                 v = default
 695             else:
 696                 error("missing required parameter '%s'" % p_name)
 697         if isinstance(type_, tuple):
 698             if not v in type_:
 699                 error("parameter '%s' value '%s' not in %s (feature not"
 700                       " implemented?)" % (p_name, v, type_))
 701         elif type_ == bool:
 702             v = { 'yes' : True, 'no' : False }.get(v)
 703             if v == None:
 704                 error("parameter '%s' requires a value of 'yes' or 'no'")
 705         else:
 706             try:
 707                 v = type_(v)
 708             except ValueError, e:
 709                 error("parameter '%s' has value '%s' with invalid format [%s]"
 710                       % (p_name, v, e))
 711         if check_fn and not check_fn(v):
 712             error("parameter '%s' has invalid value '%s' (or feature not"
 713                   " implemented)" % (p_name, v))
 714         pmap[p_name] = v
 715
 716     unknown_parameters = set(parameters) - set(parameter_info)
 717     if unknown_parameters:
 718         warning("%s unknown parameter(s):\n %s"
 719                 % (len(unknown_parameters),
 720                    pformat(sorted(list(unknown_parameters)))))
 721     return pmap
 722
 723
 724 def pythonize_swig_object(o, only_fields=None, skip_fields=[]):
 725     """Creates a pure Python copy of a SWIG object, so that it can be easily
 726     pickled, or printed (for debugging purposes).  Each SWIG object is
 727     pythonized as a dictionary, for convenience.  If provided, 'only_fields',
 728     limits the copy to the list of fields specified.  Otherwise,
 729     'skip_fields' if given is a list of methods not to include (this helps
 730     avoid infinite recursion).
 731
 732     >>> pprint(pythonize_swig_object(cgreylag.score_stats(1, 1)))
 733     {'best_matches': [[{'mass_trace': [],
 734                         'peptide_begin': [],
 735                         'peptide_sequence': '',
 736                         'predicted_parent_mass': 0.0,
 737                         'score': 0.0,
 738                         'sequence_name': [],
 739                         'spectrum_index': -1}]],
 740      'candidate_spectrum_count': 0,
 741      'combinations_searched': 0}
 742
 743     """
 744
 745     if isinstance(o, str):
 746         return o
 747     try:
 748         len(o)
 749     except TypeError:
 750         pass
 751     else:
 752         return list(pythonize_swig_object(x, only_fields, skip_fields)
 753                     for x in o)
 754     if hasattr(o, '__swig_getmethods__'):
 755         s = {}
 756         for a in o.__swig_getmethods__:
 757             if (only_fields != None and a in only_fields
 758                 or only_fields == None and a not in skip_fields):
 759                 s[a] = pythonize_swig_object(getattr(o, a), only_fields,
 760                                              skip_fields)
 761         return s
 762     return o
 763
 764 # A possible PCA (pyrrolidone carboxyl acid) modification accounts for
 765 # circularization of the peptide N-terminal.  PCA mods are excluded if a
 766 # static N-terminal mod has been specified.  Likewise, choosing a PCA mod will
 767 # exclude choosing a potential N-terminal mod.  (The PCA mod for 'C' is
 768 # dependent on a static mod of C+57 being in effect.)
 769
 770 def get_pca_table(mass_regimes):
 771     """Return a list of tuples (residues, parent delta, fragment delta)
 772     describing the PCA possibilities for each mass regime. Residues is a
 773     string specifying the residues this PCA delta applies to.
 774     """
 775     # FIX: According to Xtandem, C is only a candidate for PCA if
 776     # carboxyamidomethylated (C+57).  Currently we always search it.
 777     return [ [('', 0, 0),
 778               ('E', -1 * CP.parent_mass_regime[r].water_mass,
 779                -1 * CP.fragment_mass_regime[r].water_mass),
 780               ('QC', -1 * CP.parent_mass_regime[r].ammonia_mass,
 781                -1 * CP.fragment_mass_regime[r].ammonia_mass)]
 782              for r in range(len(mass_regimes)) ]
 783
 784
 785 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
 786     if not mod_tree:
 787         if 0 < len(conjuncts) <= limit:
 788             yield conjuncts
 789         return
 790     first, rest = mod_tree[0], mod_tree[1:]
 791     if isinstance(first, list):
 792         for x in enumerate_disjunction(first, limit):
 793             for y in enumerate_conjunction(rest, limit, conjuncts + x):
 794                 yield y
 795     else:
 796         for y in enumerate_conjunction(rest, limit, conjuncts):
 797             yield y
 798         for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
 799             yield y
 800
 801 def enumerate_disjunction(mod_tree, limit=sys.maxint):
 802     """Generates the conjuncts for mod_tree that are no longer than limit.
 803
 804     >>> list(enumerate_disjunction([['a'],['b'],['c']]))
 805     [[], ['a'], ['b'], ['c']]
 806     >>> list(enumerate_disjunction([[1,2,3]]))
 807     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
 808     >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
 809     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
 810     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
 811     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
 812     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
 813     [[]]
 814
 815     """
 816     assert isinstance(mod_tree, list)
 817     yield []
 818     for b in mod_tree:
 819         for s in enumerate_conjunction(b, limit):
 820             yield s
 821
 822 def get_mod_conjunct_triples(mod_tree, limit):
 823     """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
 824     (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
 825     ordered by increasing number of conjuncts, and with duplicates within and
 826     across removed.  A mass table is also appended to each conjunct.
 827     """
 828     # FIX: is there a more elegant way or place for all this?
 829     def enmass(t):
 830         def rmass(regime_index, par_frag, sign, delta, is_mono):
 831             global MASS_REGIME_ATOMIC_MASSES
 832             if isinstance(delta, str):
 833                 if is_mono:
 834                     regime_index = 0
 835                 return (sign * formula_mass(delta,
 836                            MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
 837             else:
 838                 return sign * delta
 839
 840         sign, delta, is_mono = t[:3]
 841         return t + (tuple((rmass(r, 0, sign, delta, is_mono),
 842                            rmass(r, 1, sign, delta, is_mono))
 843                           for r in range(len(XTP["mass_regimes"]))),)
 844
 845     def triple(c):
 846         Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
 847         Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
 848         assert len(Ns) <= 1 and len(Cs) <= 1
 849         rest = [ enmass(x) for x in c if x[3] not in '[]' ]
 850         rest = tuple(sorted(list(frozenset(rest))))
 851         return (Ns, Cs, rest)
 852
 853     return sorted(list(frozenset(triple(conjunct)
 854                                  for conjunct
 855                                  in enumerate_disjunction(mod_tree, limit))),
 856                   key=lambda x: (sum(len(y) for y in x), x))
 857
 858
 859 def gen_delta_bag_counts(i, remainder, bag):
 860     if i < 1:
 861         assert i == 0
 862         bag[0] = remainder
 863         yield tuple(bag)
 864         return
 865     for delta in range(1, remainder-i+1):
 866         bag[i] = delta
 867         for x in gen_delta_bag_counts(i-1, remainder-delta, bag):
 868             yield x
 869
 870 def generate_delta_bag_counts(mod_count, conjunct_length):
 871     """Generate all tuples of positive integers having length conjunct_length
 872     and sum mod_count.  As a special case, () is such a tuple having length 0
 873     and sum 0.
 874
 875     >>> for i in range(6): print i, list(generate_delta_bag_counts(4, i))
 876     ...
 877     0 []
 878     1 [(4,)]
 879     2 [(3, 1), (2, 2), (1, 3)]
 880     3 [(2, 1, 1), (1, 2, 1), (1, 1, 2)]
 881     4 [(1, 1, 1, 1)]
 882     5 []
 883     >>> list(generate_delta_bag_counts(0, 0))
 884     [()]
 885     >>> list(generate_delta_bag_counts(0, 1))
 886     []
 887
 888     """
 889     if conjunct_length == 0:
 890         return [()] if mod_count == 0 else []
 891     if mod_count < conjunct_length:
 892         return []
 893     return gen_delta_bag_counts(conjunct_length - 1, mod_count,
 894                                 [0] * conjunct_length)
 895
 896
 897 def set_context_conjuncts(context, mass_regime_index, N_cj, C_cj, R_cj):
 898     assert len(N_cj) < 1 and len(C_cj) < 1
 899     context.N_delta = 0
 900     if N_cj:
 901         context.N_delta = N_cj[0][5][mass_regime_index][1]
 902     context.C_delta = 0
 903     if C_cj:
 904         context.C_delta = C_cj[0][5][mass_regime_index][1]
 905     context.delta_bag_lookup.clear()
 906     context.delta_bag_lookup.resize(ord('Z')+1)
 907     context.delta_bag_delta.clear()
 908     for n, cj in enumerate(R_cj):
 909         context.delta_bag_delta.append(cj[5][mass_regime_index][1])
 910         for r in cj[3]:
 911             context.delta_bag_lookup[ord(r)] \
 912                 = context.delta_bag_lookup[ord(r)] + (n,)
 913
 914
 915 def search_all(options, context, score_statistics):
 916     """Search sequence database against searchable spectra."""
 917
 918     mod_limit = XTP["potential_mod_limit"]
 919
 920     mod_conjunct_triples = get_mod_conjunct_triples(XTP["potential_mods"],
 921                                                     mod_limit)
 922     info("%s unique potential modification conjuncts",
 923          len(mod_conjunct_triples))
 924     debug("mod_conjunct_triples (unique):\n%s", pformat(mod_conjunct_triples))
 925
 926     mass_regimes = XTP["mass_regimes"]
 927     pca_table = get_pca_table(mass_regimes)
 928     debug("pca_table: %s", pca_table)
 929
 930     total_combinations_searched = 0
 931
 932     for mod_count in range(mod_limit + 1):
 933         context.mod_count = mod_count
 934         for mr_index, (mass_regime, pca_entry) in enumerate(zip(mass_regimes,
 935                                                                 pca_table)):
 936             context.mass_regime_index = mr_index
 937             for pca_res, pca_parent_delta, pca_frag_delta in pca_entry:
 938                 context.pca_residues = pca_res
 939                 context.pca_delta = pca_frag_delta
 940                 for cji, (N_cj, C_cj, R_cj) in enumerate(mod_conjunct_triples):
 941                     if pca_res and N_cj:
 942                         continue    # mutually exclusive, for now
 943                     set_context_conjuncts(context, mr_index, N_cj, C_cj, R_cj)
 944                     debug("mod_count: %s", mod_count)
 945                     debug("cj_triple: N=%s C=%s R=%s", N_cj, C_cj, R_cj)
 946                     for delta_bag in generate_delta_bag_counts(mod_count,
 947                                                                len(R_cj)):
 948                         debug("delta_bag: %s", delta_bag)
 949
 950                         # this clear() avoids an SWIG/STL bug!?
 951                         context.delta_bag_count.clear()
 952                         context.delta_bag_count[:] = delta_bag
 953
 954                         pmrf = CP.parent_mass_regime[mr_index].fixed_residue_mass
 955                         fmrf = CP.fragment_mass_regime[mr_index].fixed_residue_mass
 956                         p_fx = (pmrf[ord('[')]
 957                                 + (N_cj and N_cj[0][5][mr_index][0] or 0)
 958                                 + pmrf[ord(']')]
 959                                 + (C_cj and C_cj[0][5][mr_index][0] or 0)
 960                                 + pca_parent_delta + PROTON_MASS)
 961                         context.parent_fixed_mass = p_fx
 962                         f_N_fx = (fmrf[ord('[')]
 963                                   + (N_cj and N_cj[0][5][mr_index][1] or 0)
 964                                   + pca_frag_delta)
 965                         context.fragment_N_fixed_mass = f_N_fx
 966                         f_C_fx = (fmrf[ord(']')]
 967                                   + (C_cj and C_cj[0][5][mr_index][1] or 0)
 968                                   + CP.fragment_mass_regime[mr_index].water_mass)
 969                         context.fragment_C_fixed_mass = f_C_fx
 970
 971                         info("MC=%s MR=%s PCA=%s CJ=%s DB=%s"
 972                              % (mod_count, mr_index, pca_res, cji, delta_bag))
 973                         debug("p_fx %s f_N_fx %s f_C_fx %s"
 974                               % (p_fx, f_N_fx, f_C_fx))
 975
 976                         score_statistics.combinations_searched = 0
 977                         cgreylag.spectrum.search_runs(context,
 978                                                       score_statistics)
 979                         total_combinations_searched \
 980                             += score_statistics.combinations_searched
 981                         info("  %s candidate spectra examined, this bag",
 982                              score_statistics.combinations_searched)
 983
 984     info('%s candidate spectra examined',
 985          score_statistics.candidate_spectrum_count)
 986
 987
 988 def get_prefix_sequence(begin_pos, run_offset, sequence):
 989     """
 990     >>> get_prefix_sequence(0, 0, 'abcdefghi')
 991     '['
 992     >>> get_prefix_sequence(1, 0, 'abcdefghi')
 993     '[a'
 994     >>> get_prefix_sequence(3, 0, 'abcdefghi')
 995     '[abc'
 996     >>> get_prefix_sequence(4, 0, 'abcdefghi')
 997     'abcd'
 998     >>> get_prefix_sequence(4, 2, 'abcdefghi')
 999     'cdef'
1000
1001     """
1002     prefix_start = run_offset + begin_pos - 4
1003     s = sequence[max(0, prefix_start):prefix_start+4]
1004     if len(s) < 4:
1005         s = '[' + s
1006     return s
1007
1008 def get_suffix_sequence(end_pos, run_offset, sequence):
1009     """
1010     >>> get_suffix_sequence(0, 0, 'abcdefghi')
1011     'abcd'
1012     >>> get_suffix_sequence(4, 0, 'abcdefghi')
1013     'efgh'
1014     >>> get_suffix_sequence(6, 0, 'abcdefghi')
1015     'ghi]'
1016     >>> get_suffix_sequence(8, 0, 'abcdefghi')
1017     'i]'
1018     >>> get_suffix_sequence(4, 2, 'abcdefghi')
1019     'ghi]'
1020
1021     """
1022     suffix_start = run_offset + end_pos
1023     s = sequence[suffix_start:suffix_start+4]
1024     if len(s) < 4:
1025         s = s + ']'
1026     return s
1027
1028
1029 def results_dump(score_statistics, searchable_spectra):
1030     """Return a result dict mapping spectrum names to (spectrum_metadata,
1031     best_matches) pairs.  (Unneeded fields are stripped.)
1032     """
1033
1034     r = {}
1035     spectrum_metadata_fs = set(['name', 'file_id', 'mass', 'charge',
1036                                 'total_ion_current', 'comparisons'])
1037     py_s_spectra = pythonize_swig_object(searchable_spectra,
1038                                          only_fields=spectrum_metadata_fs)
1039     py_matches = pythonize_swig_object(score_statistics.best_matches,
1040                                        skip_fields=['spectrum_index'])
1041     assert len(py_s_spectra) == len(py_matches)
1042
1043     for sp_metadata, sp_matches in zip(py_s_spectra, py_matches):
1044         assert sp_metadata['name'] not in r, "duplicate spectrum name"
1045         r[sp_metadata['name']] = (sp_metadata, sp_matches)
1046
1047     return r
1048
1049
1050 def main(args=sys.argv[1:]):
1051     parser = optparse.OptionParser(usage=
1052                                    "usage: %prog [options] <job-id>"
1053                                    " <configuration-file> <ms2-file>...",
1054                                    description=__doc__, version=__version__)
1055     pa = parser.add_option
1056     pa("-P", "--parameter", nargs=2, dest="parameters", action="append",
1057        default=[],
1058        help="override a parameter in <parameter-file>, may be used multiple"
1059        " times", metavar="NAME VALUE")
1060     pa("-w", "--work-slice", nargs=2, type="float", dest="work_slice",
1061        help="search a subinterval [L:U) of the work space"
1062        " (where 0 <= L <= U <= 1) in standalone mode", metavar="L U")
1063     pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
1064     pa("-p", "--show-progress", action="store_true", dest="show_progress",
1065        help="show running progress")
1066     pa("--estimate", action="store_true", dest="estimate_only",
1067        help="just estimate the time required for the search")
1068     pa("-v", "--verbose", action="store_true", dest="verbose",
1069        help="be verbose")
1070     pa("--copyright", action="store_true", dest="copyright",
1071        help="print copyright and exit")
1072     pa("--debug", action="store_true", dest="debug",
1073        help="output debugging info")
1074     pa("--profile", action="store_true", dest="profile",
1075        help="dump Python profiling output to './greylag.prof.<pid>'")
1076     (options, args) = parser.parse_args(args=args)
1077
1078     if options.copyright:
1079         print __copyright__
1080         sys.exit(0)
1081
1082     if len(args) < 3:
1083         parser.print_help()
1084         sys.exit(1)
1085
1086     job_id = args[0]
1087     configuration_fn = args[1]
1088     spectrum_fns = args[2:]
1089
1090     if (any(True for f in spectrum_fns if not f.endswith('.ms2'))
1091         or (options.work_slice
1092             and not (0 <= options.work_slice[0]
1093                      <= options.work_slice[1] <= 1))):
1094         parser.print_help()
1095         sys.exit(1)
1096
1097     log_level = logging.WARNING
1098     if options.quiet:
1099         log_level = logging.ERROR
1100     if options.verbose:
1101         log_level = logging.INFO
1102     if options.debug:
1103         log_level = logging.DEBUG
1104     logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
1105                         format='%(asctime)s %(levelname)s: %(message)s')
1106     info("starting on %s", gethostname())
1107
1108     # prevent format char problems
1109     if '%' in job_id:
1110         error("<job-id> may not contain '%'")
1111
1112     # check spectrum basename uniqueness, as corresponding sqt files will be
1113     # in a single directory
1114     base_spectrum_fns = [ os.path.basename(fn) for fn in spectrum_fns ]
1115     if len(base_spectrum_fns) != len(set(base_spectrum_fns)):
1116         error("base spectrum filenames must be unique")
1117
1118     # check -P names for validity
1119     bad_names = (set(n for n,v in options.parameters)
1120                  - set(PARAMETER_INFO.keys()))
1121     if bad_names:
1122         error("bad -P parameter names %s" % list(bad_names))
1123
1124     # read params
1125     cp = ConfigParser.RawConfigParser()
1126     cp.optionxform = str                # be case-sensitive
1127     with open(configuration_fn) as configuration_file:
1128         cp.readfp(configuration_file)
1129     if not cp.has_section('greylag'):
1130         error("%s has no [greylag] section" % configuration_fn)
1131     parameters = dict(cp.items('greylag'))
1132     parameters.update(dict(options.parameters)) # command-line override
1133     global XTP
1134     XTP = validate_parameters(parameters)
1135
1136     fixed_mod_map = dict((r[3], r) for r in XTP["pervasive_mods"])
1137     initialize_spectrum_parameters(options, XTP["mass_regimes"], fixed_mod_map)
1138
1139     # read sequence dbs
1140     databases = XTP["databases"].split()
1141     # [(locusname, defline, seq, filename), ...]
1142     fasta_db = list(read_fasta_files(databases))
1143     # [(idno, offset, locusname, defline, seq, seq_filename), ...]
1144     db = []
1145     for idno, (locusname, defline, sequence, filename) in enumerate(fasta_db):
1146         db.extend(split_sequence_into_aa_runs(idno, locusname, defline,
1147                                               sequence, filename))
1148     db_residue_count = sum(len(dbi[3]) for dbi in db)
1149
1150     info("read %s sequences (%s runs, %s residues)", len(fasta_db), len(db),
1151          db_residue_count)
1152     max_run_length = max(len(r[3]) for r in db)
1153     info("max run length is %s residues", max_run_length)
1154     if max_run_length > 2**31 - 1:
1155         error("runs longer than %s not yet supported", max_run_length)
1156     if not db:
1157         error("no database sequences")
1158
1159     # read spectrum offset indices
1160     spectrum_offset_indices = []
1161     for spfn in spectrum_fns:
1162         idxfn = spfn + '.idx'
1163         with contextlib.closing(gzip.open(idxfn)) as idxf:
1164             idx = cPickle.load(idxf)
1165             # try to verify that index matches spectrum file
1166             if idx['file size'] != os.path.getsize(spfn):
1167                 error("index '%s' needs rebuilding" % idxfn)
1168             spectrum_offset_indices.append(idx['offsets'])
1169
1170     # FIX: assume standalone mode (for now)
1171     assert options.work_slice
1172
1173     # read spectra per work slice
1174     spectra = read_spectra_slice(spectrum_fns, spectrum_offset_indices,
1175                                  options.work_slice)
1176     del spectrum_offset_indices
1177     spectra.sort(key=lambda x: x.mass)
1178
1179     def peak_statistics(spectra):
1180         counts = [ len(sp.peaks) for sp in spectra ]
1181         counts.sort()
1182         n = len(counts)
1183         return (counts[0], counts[int(n*0.25)], counts[int(n*0.5)],
1184                 counts[int(n*0.75)], counts[-1], sum(counts) / float(n))
1185
1186     if spectra:
1187         info("read %s spectra (mass range %s - %s)", len(spectra),
1188              spectra[0].mass, spectra[-1].mass)
1189         info("  peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1190              % peak_statistics(spectra))
1191     else:
1192         warning("no input spectra")
1193
1194     # filter and normalize spectra
1195     for sp in spectra:
1196         sp.filter_peaks(XTP["TIC_cutoff_proportion"],
1197                         CP.parent_mass_tolerance_max)
1198         sp.classify(XTP["intensity_class_count"], XTP["intensity_class_ratio"],
1199                     XTP["fragment_mass_tolerance"])
1200
1201     min_psm = XTP["min_parent_spectrum_mass"]
1202     max_psm = XTP["max_parent_spectrum_mass"]
1203     # FIX: also filter by 1 + 2 + 4 rule?
1204     spectra = [ sp for sp in spectra
1205                 if len(sp.peaks) >= 10 and min_psm <= sp.mass <= max_psm ]
1206
1207     if spectra:
1208         info("after filtering:")
1209         info("     %s spectra (mass range %s - %s)", len(spectra),
1210              spectra[0].mass, spectra[-1].mass)
1211         info("  peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1212              % peak_statistics(spectra))
1213
1214     cgreylag.spectrum.set_searchable_spectra(spectra)
1215     score_statistics = cgreylag.score_stats(len(spectra),
1216                                             XTP["best_result_count"])
1217
1218     if spectra:
1219         del spectra                     # release memory
1220
1221         # FIX!!!
1222         # (cleavage_re, position of cleavage in cleavage_re)
1223         cleavage_motif = "[X]|[X]"
1224         cleavage_pattern, cleavage_pos = cleavage_motif_re(cleavage_motif)
1225         if cleavage_pos == None:
1226             error("cleavage site '%s' is missing '|'",
1227                   XTP["protein, cleavage site"])
1228         cleavage_pattern = re.compile(cleavage_pattern)
1229
1230         context = cgreylag.search_context()
1231         for idno, offset, locusname, defline, seq, seq_filename in db:
1232             cp = []
1233             if cleavage_motif != "[X]|[X]":
1234                 cp = list(generate_cleavage_points(cleavage_pattern,
1235                                                    cleavage_pos, seq))
1236             sr = cgreylag.sequence_run(idno, offset, seq, cp, locusname)
1237             context.sequence_runs.append(sr)
1238         context.maximum_missed_cleavage_sites = 1000000000 # FIX
1239
1240         info("searching")
1241         search_all(options, context, score_statistics)
1242     else:
1243         warning("no spectra after filtering--search skipped")
1244
1245     if options.estimate_only:
1246         print ("%.2f generic CPU hours"
1247                % (score_statistics.candidate_spectrum_count / 300.0e6))
1248         return
1249
1250     info("writing result file")
1251     # FIX: this is only for standalone mode
1252     result_fn = 'grind_%s-%s.gwr' % options.work_slice
1253     with contextlib.closing(gzip.open(result_fn, 'w')) as result_file:
1254         d = { 'version' : __version__,
1255               'matches' : results_dump(score_statistics,
1256                                        cgreylag.cvar.spectrum_searchable_spectra),
1257               'total comparisons' : score_statistics.candidate_spectrum_count,
1258               'spectrum files' : base_spectrum_fns,
1259               'databases' : databases,
1260               'parameters' : XTP,
1261               'mass regime atomic masses' : MASS_REGIME_ATOMIC_MASSES,
1262               'proton mass' : PROTON_MASS,
1263               'argv' : sys.argv }
1264         cPickle.dump(d, result_file, cPickle.HIGHEST_PROTOCOL)
1265     info("finished, result file written to '%s'", result_fn)
1266
1267
1268 if __name__ == '__main__':
1269 #     try:
1270 #         import psyco
1271 #         psyco.full()
1272 #         ###psyco.bind(formula_mass)
1273 #         warning('psyco enabled')
1274 #     except ImportError:
1275 #         pass
1276
1277     try:
1278         if '--profile' in sys.argv:
1279             import cProfile
1280             import pstats
1281             report_fn = "greylag.prof.%s" % os.getpid()
1282             data_fn = report_fn + ".tmp"
1283             prof = cProfile.run('main()', data_fn)
1284             with open(report_fn, 'w') as report_f:
1285                 try:
1286                     stats = pstats.Stats(data_fn, stream=report_f)
1287                     stats.strip_dirs()
1288                     stats.sort_stats('cumulative')
1289                     stats.print_stats(50)
1290                     stats.sort_stats('time')
1291                     stats.print_stats(50)
1292                     print "# profile report written to '%s'" % report_fn
1293                 finally:
1294                     try:
1295                         os.remove(data_fn)
1296                     except:
1297                         pass
1298         else:
1299             main()
1300     except SystemExit:
1301         raise
1302     except:
1303         logging.exception("unhandled exception")
1304         sys.exit(1)
1305     finally:
1306         logging.shutdown()
1307
1308
1309 # FIXES:
1310 # - need to rigorously check for bad input in any file (fasta, spectrum, xml)