Cleanups: dead code removal/etc.
[greylag.git] / greylag_grind.py
blob4c44952792542d8237bda16bccf2a7d614828b63
1 #!/usr/bin/env python2.5
3 '''This program does the actual work to search mass spectra against a sequence
4 database. <job-id> is a unique identifier used as a prefix for work and
5 output directories. The <configuration-file> contains program options. The
6 spectra are in <ms2-file>s; greylag-index-spectra must already have been run
7 on them.
9 This program can operate in standalone mode (useful for testing) or command
10 mode. If the --work-slice option is given, that slice will be searched in
11 standalone mode, then the program will exit. Otherwise, the program will
12 process commands until an exit command is received.
14 '''
16 from __future__ import with_statement
18 __copyright__ = '''
19 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2 of the License, or
24 (at your option) any later version.
26 This program is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 GNU General Public License for more details.
31 You should have received a copy of the GNU General Public License along
32 with this program; if not, write to the Free Software Foundation, Inc.,
33 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
34 '''
36 ##############################################################################
37 ## "Simplicity is prerequisite for reliability" - Edsger W. Dijkstra ##
38 ##############################################################################
41 __version__ = "0.0"
44 import ConfigParser
45 import contextlib
46 import cPickle
47 import fileinput
48 import gzip
49 import logging
50 from logging import debug, info, warning
51 import math
52 import optparse
53 import os
54 from pprint import pprint, pformat
55 import re
56 from socket import gethostname
57 import sys
59 import cgreylag
62 # Try to drop dead immediately on SIGINT (control-C), instead of normal Python
63 # KeyboardInterrupt processing, since we may spend long periods of time in
64 # uninterruptible C++ calls. Also die immediately on SIGPIPE.
65 try:
66 import signal
67 signal.signal(signal.SIGINT, signal.SIG_DFL)
68 signal.signal(signal.SIGPIPE, signal.SIG_DFL)
69 except:
70 pass
73 def error(s, *args):
74 "fatal error"
75 logging.error(s, *args)
76 if __name__ != "__main__":
77 raise Exception("(fatal error, but unit testing, so not exiting)")
78 sys.exit(1)
81 def fileerror(s, *args):
82 error(s + (", at line %s of file '%s'"
83 % (fileinput.filelineno(), fileinput.filename())),
84 *args)
87 # name -> value map of processed XML input parameters
88 XTP = {}
90 # handle to the singleton parameter object shared with the C++ module
91 CP = cgreylag.cvar.parameters_the
93 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
94 PROTON_MASS = 1.007276
95 ELECTRON_MASS = 0.000549 # ?
97 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
98 MONOISOTOPIC_ATOMIC_MASS = {
99 'H' : 1.00782503214,
100 'C' : 12.00000000,
101 'N' : 14.00307400529,
102 'O' : 15.994914622115,
103 'P' : 30.9737615120,
104 'S' : 31.9720706912,
107 # most prevalent only (1 in 1000)
108 ISOTOPIC_ATOMIC_MASS = { # prevalence (in %)
109 'C13' : 13.003354837810, # 1.078
110 'N15' : 15.00010889849, # 0.3687
111 'O18' : 17.99916049, # 0.20514
112 'S33' : 32.9714585012, # 0.762
113 'S34' : 33.9678668311, # 4.2928
116 AVERAGE_ATOMIC_MASS = {
117 'H' : 1.007947,
118 'C' : 12.01078,
119 'N' : 14.00672,
120 'O' : 15.99943,
121 'P' : 30.9737612,
122 'S' : 32.0655,
125 # The xtandem average residue masses are about 0.002 amu higher than those
126 # calculated directly from the above average atomic masses. None of the
127 # chemists consulted knew of any reason why, aside from lack of precision in
128 # the average atomic mass estimates. This shouldn't matter very much, as
129 # fragmentation calculations should all be monoisotopic, and we can always
130 # widen the parent tolerance window a bit.
133 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
134 """Return the mass of formula, using the given mass regime (monoisotopic
135 by default).
137 >>> formula_mass('H2O', { 'H':1, 'O':16 })
139 >>> # monoisotopic mass of glycine
140 >>> str(round(formula_mass('C2H3ON'), 4))
141 '57.0215'
144 parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
145 # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
146 return sum(atomic_mass[parts[i]] * int(parts[i+1])
147 for i in range(0, len(parts), 2))
149 # FIX: selenocysteine (U), etc
150 # residue -> formula
151 RESIDUE_FORMULA = {
152 'A' : "C3H5ON",
153 'C' : "C3H5ONS",
154 'D' : "C4H5O3N",
155 'E' : "C5H7O3N",
156 'F' : "C9H9ON",
157 'G' : "C2H3ON",
158 'H' : "C6H7ON3",
159 'I' : "C6H11ON",
160 'K' : "C6H12ON2",
161 'L' : "C6H11ON",
162 'M' : "C5H9ONS",
163 'N' : "C4H6O2N2",
164 'P' : "C5H7ON",
165 'Q' : "C5H8O2N2",
166 'R' : "C6H12ON4",
167 'S' : "C3H5O2N",
168 'T' : "C4H7O2N",
169 'V' : "C5H9ON",
170 'W' : "C11H10ON2",
171 'Y' : "C9H9O2N",
174 RESIDUES = sorted(RESIDUE_FORMULA.keys())
175 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
178 # accessed as a global variable (FIX?)
179 # [0][1] -> 'H' -> fragment mass of H for regime 0
180 MASS_REGIME_ATOMIC_MASSES = []
183 def mass_regime_atomic_masses(spec):
184 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
185 names to masses.
187 name, isotopes = spec
188 assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
189 if name == 'MONO':
190 r = MONOISOTOPIC_ATOMIC_MASS.copy()
191 else:
192 r = AVERAGE_ATOMIC_MASS.copy()
193 if isotopes:
194 iname, prevalence = isotopes[0]
195 assert iname == 'N15' and 0 <= prevalence <= 1
196 # this is a simplification, but additional accuracy pointless?
197 if name == 'MONO':
198 r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
199 else:
200 r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
201 return r
204 def initialize_spectrum_parameters(options, mass_regimes, fixed_mod_map):
205 """Initialize parameters known to the spectrum module.
206 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
209 debug('fixed_mod_map: %s', fixed_mod_map)
210 # This is the size of vectors that are indexed by residues (A-Z) or
211 # special characters ('[]').
212 RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
214 # These are currently monoisotopic. (deuterium pointless?)
215 CP.proton_mass = PROTON_MASS
216 CP.hydrogen_mass = formula_mass("H")
218 global MASS_REGIME_ATOMIC_MASSES
219 for regime_pair in mass_regimes:
220 assert len(regime_pair) == 2 # parent and fragment
221 info('mass regime: %s', regime_pair)
222 MASS_REGIME_ATOMIC_MASSES.append([])
223 for n, regime in enumerate(regime_pair):
224 atmass = mass_regime_atomic_masses(regime)
225 MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
226 creg = cgreylag.mass_regime_parameters()
228 creg.hydroxyl_mass = formula_mass("OH", atmass)
229 creg.water_mass = formula_mass("H2O", atmass)
230 creg.ammonia_mass = formula_mass("NH3", atmass)
232 creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
234 for r in RESIDUES_W_BRACKETS:
235 m = 0
236 if r in RESIDUES:
237 m = formula_mass(RESIDUE_FORMULA[r], atmass)
238 rmod = fixed_mod_map.get(r)
239 if rmod:
240 if isinstance(rmod[1], str):
241 if rmod[2]:
242 m += rmod[0] * formula_mass(rmod[1])
243 else:
244 m += rmod[0] * formula_mass(rmod[1], atmass)
245 else:
246 m += rmod[0] * rmod[1]
247 creg.fixed_residue_mass[ord(r)] = m
248 # assuming these are monoisotopic (not regime)
249 creg.fixed_residue_mass[ord('[')] += formula_mass("H")
250 creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
251 if not n:
252 CP.parent_mass_regime.append(creg);
253 else:
254 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
255 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
256 CP.fragment_mass_regime.append(creg);
257 for r in RESIDUES_W_BRACKETS:
258 info('fixed_mass %s: %s', r,
259 [ "%.6f/%.6f"
260 % (CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
261 CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
262 for rn in range(len(mass_regimes)) ])
263 for r in RESIDUES:
264 for rn in range(len(mass_regimes)):
265 # physically impossible (and results would be garbage)
266 if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
267 raise ValueError('bogus parent mass specification for %s' % r)
268 if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
269 raise ValueError('bogus parent mass specification for %s' % r)
271 CP.parent_mass_tolerance_1 = XTP["parent_mz_tolerance"]
272 CP.parent_mass_tolerance_max = (XTP["parent_mz_tolerance"]
273 * XTP["charge_limit"])
275 CP.fragment_mass_tolerance = XTP["fragment_mass_tolerance"]
276 CP.intensity_class_count = XTP["intensity_class_count"]
278 CP.minimum_peptide_length = XTP["min_peptide_length"]
280 # CP.ln_factorial[n] == ln(n!)
281 CP.ln_factorial.resize(int(XTP["max_parent_spectrum_mass"]
282 / XTP["fragment_mass_tolerance"] + 100), 0.0)
283 for n in range(2, len(CP.ln_factorial)):
284 CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
286 CP.estimate_only = bool(options.estimate_only)
287 CP.show_progress = bool(options.show_progress)
290 def cleavage_motif_re(motif):
291 """Return (regexp, pos), where regexp is a regular expression that will
292 match a cleavage motif, and pos is the position of the cleavage with
293 respect to the match (or None). (The RE actually matches one character,
294 the rest matching as lookahead, so that re.finditer will find all
295 overlapping matches.)
297 >>> cleavage_motif_re('[KR]|{P}')
298 ('[KR](?=[^P])', 1)
299 >>> cleavage_motif_re('[X]|[X]')
300 ('.(?=.)', 1)
303 cleavage_pos = None
304 re_parts = []
305 motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
306 parts = [ p[0] for p in motif_re.findall(motif) ]
307 if ''.join(parts) != motif:
308 raise ValueError('invalid cleavage motif pattern')
309 i = 0
310 for part in parts:
311 if part == '|':
312 if cleavage_pos != None:
313 raise ValueError("invalid cleavage motif pattern"
314 " (multiple '|'s)")
315 cleavage_pos = i
316 continue
317 if part == '[X]':
318 re_parts.append('.')
319 elif part[0] == '[':
320 re_parts.append('[%s]' % part[1:-1])
321 elif part[0] == '{':
322 re_parts.append('[^%s]' % part[1:-1])
323 else:
324 assert False, "unknown cleavage motif syntax"
325 i += 1
326 if len(re_parts) == 0:
327 re_pattern = '.'
328 elif len(re_parts) == 1:
329 re_pattern = re_parts[0]
330 else:
331 re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
332 return (re_pattern, cleavage_pos)
335 def generate_cleavage_points(cleavage_re, cleavage_pos, sequence):
336 """Yields the offsets of the cleavages in sequence. The endpoints are
337 always included, by convention.
339 >>> list(generate_cleavage_points(re.compile('[KR](?=[^P])'), 1,
340 ... 'ARBCKDEKPF'))
341 [0, 2, 5, 10]
344 yield 0
345 for m in cleavage_re.finditer(sequence):
346 p = m.start() + cleavage_pos
347 if 0 < p < len(sequence):
348 yield p
349 yield len(sequence)
352 AA_SEQUENCE = re.compile(r'[ARNDCQEGHILKMFPSTWYV]+')
354 def split_sequence_into_aa_runs(idno, locusname, defline, sequence, filename):
355 """Returns a tuple (idno, start, locusname, defline, seq, filename) for
356 each contiguous run of residues in sequence, where 'start' is the position
357 of 'seq' in 'sequence'.
359 >>> pprint(split_sequence_into_aa_runs(123, 'ln', 'ln defline',
360 ... 'STSS*DEFABA', 'filename'))
361 [(123, 0, 'ln', 'ln defline', 'STSS', 'filename'),
362 (123, 5, 'ln', 'ln defline', 'DEFA', 'filename'),
363 (123, 10, 'ln', 'ln defline', 'A', 'filename')]
366 return [ (idno, m.start(), locusname, defline, m.group(), filename)
367 for n, m in enumerate(AA_SEQUENCE.finditer(sequence)) ]
370 def read_fasta_files(filenames):
371 """Yield (locusname, defline, sequence, filename) tuples as read from
372 FASTA files (uppercasing sequence)."""
374 loci_seen = set()
375 locusname, defline = None, None
376 seqs = []
377 for line in fileinput.input(filenames):
378 line = line.strip()
379 if line[:1] == '>':
380 if defline != None:
381 yield (locusname, defline, ''.join(seqs), fileinput.filename())
382 elif seqs:
383 fileerror("bad format: line precedes initial defline")
384 defline = line[1:]
385 locusname = defline.split(None, 1)[0]
386 if locusname in loci_seen:
387 error("locus name '%s' is not unique in the search database(s)"
388 % locusname)
389 loci_seen.add(locusname)
390 seqs = []
391 else:
392 seqs.append(line.upper())
393 if defline:
394 yield (locusname, defline, ''.join(seqs), fileinput.filename())
397 def read_spectra_slice(spectrum_fns, offset_indices, slice):
398 s_l, s_u = slice
399 assert 0 <= s_l <= s_u <= 1
401 total_spectra = sum(len(oi) for oi in offset_indices)
402 sp_l = int(float(s_l) * total_spectra)
403 sp_u = int(float(s_u) * total_spectra)
405 seeing_l = 0
406 spectra = []
408 for f_no, (fn, oi) in enumerate(zip(spectrum_fns, offset_indices)):
409 seeing_u = seeing_l + len(oi)
410 f_l = max(sp_l, seeing_l)
411 f_u = min(sp_u, seeing_u)
413 if f_l < f_u:
414 b_l = oi[f_l - seeing_l]
415 b_u = oi[f_u - seeing_l] if f_u - seeing_l < len(oi) else -1 # oo
417 with open(fn) as f:
418 fsp = cgreylag.spectrum.read_spectra_from_ms2(f, f_no,
419 b_l, b_u)
420 spectra.extend(fsp)
421 seeing_l = seeing_u
423 return spectra
426 # XML parameter file processing
428 # FIX: This parsing is way too complex. How to simplify?
430 def mass_regime_part(part_specification):
431 """Parse a single mass regime specification part.
433 >>> mass_regime_part('MONO')
434 ('MONO', [])
435 >>> mass_regime_part('AVG')
436 ('AVG', [])
437 >>> mass_regime_part('MONO(N15@87.5%)')
438 ('MONO', [('N15', 0.875)])
441 ps = [ x.strip() for x in part_specification.partition('(') ]
442 if ps[0] not in ('MONO', 'AVG'):
443 raise ValueError("invalid mass regime list specification"
444 " (regime id must be 'MONO' or 'AVG')")
445 if not ps[1]:
446 return (ps[0], [])
447 if ps[2][-1] != ')':
448 raise ValueError("invalid mass regime list specification"
449 " (expected ')')")
450 pps = [ x.strip() for x in ps[2][:-1].split(',') ]
451 if len(pps) > 1:
452 raise ValueError("invalid mass regime list specification"
453 " (multiple isotopes not yet implemented)")
454 ppps = [ x.strip() for x in pps[0].partition('@') ]
455 if not ppps[1]:
456 raise ValueError("invalid mass regime list specification"
457 " (expected '@')")
458 if ppps[0] not in ('N15',):
459 raise ValueError("invalid mass regime list specification"
460 " (isotope id must currently be 'N15')")
461 if ppps[2][-1] != '%':
462 raise ValueError("invalid mass regime list specification"
463 " (expected '%')")
464 prevalence = float(ppps[2][:-1]) / 100
465 if not (0 <= prevalence <= 1):
466 raise ValueError("invalid mass regime list specification"
467 " (prevalence must be in range 0-100%)")
468 return (ps[0], [(ppps[0], prevalence)])
470 def mass_regime_list(mass_regime_list_specification):
471 """Check and return a list of regime tuples (parent_regime,
472 fragment_regime), where each regime is a tuple (id, [(isotope_id,
473 prevalence), ...]). Multiple isotopes (when implemented) would be
474 comma-separated.
476 >>> pprint(mass_regime_list('AVG/MONO;MONO;MONO(N15@75%)'))
477 [[('AVG', []), ('MONO', [])],
478 [('MONO', []), ('MONO', [])],
479 [('MONO', [('N15', 0.75)]), ('MONO', [('N15', 0.75)])]]
482 result = []
483 for regspec in mass_regime_list_specification.split(';'):
484 halves = [ x.strip() for x in regspec.split('/') ]
485 if len(halves) > 2:
486 raise ValueError("invalid mass regime list specification"
487 " (too many '/'?)")
488 pr = [ mass_regime_part(h) for h in halves ]
489 if len(pr) == 1:
490 pr = [ pr[0], pr[0] ]
491 result.append(pr)
492 debug("mass regime list:\n%s", pformat(result))
494 # The first fragmentation regime should generally be MONO, so that
495 # formulaic deltas with '!' do the expected thing.
496 if result[0][1] != ('MONO', []):
497 raise ValueError("first fragmentation regime was something other than"
498 " 'MONO' with no isotopes--this is almost certainly"
499 " not what was intended")
500 return result
503 def parse_mod_term(s, is_potential=False):
504 """Parse a modification term, returning a tuple (sign, mod, fixed_regime,
505 residues, description).
507 >>> parse_mod_term('-C2H3ON!@C')
508 (-1, 'C2H3ON', True, 'C', None)
509 >>> parse_mod_term('42@STY phosphorylation', is_potential=True)
510 (1, 42.0, False, 'STY', 'phosphorylation')
514 m = re.match(r'^\s*(-|\+)?(([1-9][0-9.]*)|([A-Z][A-Z0-9]*))(!?)'
515 r'@([A-Z]+|\[|\])(\s+([A-Za-z0-9_]+))?\s*$', s)
516 if not m:
517 raise ValueError("invalid modification term specification"
518 " '%s'" % s)
519 mg = m.groups()
520 invalid_residues = set(mg[5]) - set(RESIDUES_W_BRACKETS)
521 if invalid_residues:
522 raise ValueError("invalid modification list specification"
523 " (invalid residues %s)" % list(invalid_residues))
524 delta = mg[1]
525 if mg[2]:
526 delta = float(mg[1])
527 if is_potential and abs(delta) < 0.0001:
528 raise ValueError("invalid modification list specification"
529 " (delta '%s' is too small)" % delta)
530 residues = mg[5]
531 if not is_potential and len(residues) != 1:
532 raise ValueError("invalid modification list specification '%s' (only"
533 " potential modifications may have multiple residues)"
534 % residues)
535 if len(residues) != len(set(residues)):
536 raise ValueError("invalid modification list specification"
537 " '%s' (duplicate residues prohibited)"
538 % residues)
539 return (mg[0] == '-' and -1 or 1, delta, mg[4] == '!', residues, mg[7])
542 def fixed_mod_list(specification):
543 """Check and return a list of modification tuples.
545 >>> fixed_mod_list('57@C')
546 [(1, 57.0, False, 'C', None)]
547 >>> fixed_mod_list('57@C,CH!@N desc')
548 [(1, 57.0, False, 'C', None), (1, 'CH', True, 'N', 'desc')]
549 >>> fixed_mod_list('57@C,58@C')
550 Traceback (most recent call last):
552 ValueError: invalid modification list specification '['C', 'C']' (duplicate residues prohibited)
555 if not specification:
556 return []
557 result = [ parse_mod_term(s) for s in specification.split(',') ]
558 residues = [ x[3] for x in result ]
559 # this check is across terms; the one above is within terms
560 if len(residues) != len(set(residues)):
561 raise ValueError("invalid modification list specification"
562 " '%s' (duplicate residues prohibited)"
563 % residues)
564 debug("fixed_mod_list:\n%s", pformat(result))
565 return result
568 def parse_mod_basic_expression(s):
569 s = s.strip()
570 if s[0] == '(':
571 tree, rest = parse_mod_disjunction(s[1:])
572 rest = rest.lstrip()
573 if rest[:1] != ')':
574 raise ValueError("invalid modification list specification"
575 " (expected matching ')')")
576 return tree, rest[1:]
577 parts = re.split(r'([;,()])', s, 1)
578 if len(parts) == 1:
579 term, rest = s, ''
580 else:
581 assert len(parts) == 3
582 term, rest = parts[0], parts[1]+parts[2]
583 return parse_mod_term(term, is_potential=True), rest
585 def parse_mod_conjunction(s):
586 result = []
587 while s:
588 tree, s = parse_mod_basic_expression(s)
589 result.append(tree)
590 s = s.lstrip()
591 if s[:1] != ',':
592 break
593 s = s[1:]
594 return result, s
596 def parse_mod_disjunction(s):
597 result = []
598 while s:
599 tree, s = parse_mod_conjunction(s)
600 result.append(tree)
601 s = s.lstrip()
602 if s[:1] != ';':
603 break
604 s = s[1:]
605 return result, s
607 def potential_mod_list(specification):
608 """Check and return a tree of potential modification tuples. Nodes at
609 even (odd) levels are disjunctions (conjunctions). (The top list is a
610 disjunction.)
612 >>> potential_mod_list('')
614 >>> potential_mod_list('PO3H@STY; C2H2O@KST')
615 [[(1, 'PO3H', False, 'STY', None)], [(1, 'C2H2O', False, 'KST', None)]]
616 >>> potential_mod_list('PO3H@STY, C2H2O@KST')
617 [[(1, 'PO3H', False, 'STY', None), (1, 'C2H2O', False, 'KST', None)]]
618 >>> pprint(potential_mod_list('''(PO3H@STY phosphorylation;
619 ... C2H2O@KST acetylation;
620 ... CH2@AKST methylation),
621 ... O@M oxidation'''))
622 [[[[(1, 'PO3H', False, 'STY', 'phosphorylation')],
623 [(1, 'C2H2O', False, 'KST', 'acetylation')],
624 [(1, 'CH2', False, 'AKST', 'methylation')]],
625 (1, 'O', False, 'M', 'oxidation')]]
628 if not specification:
629 return []
630 tree, remainder = parse_mod_disjunction(specification)
631 if remainder:
632 raise ValueError("invalid modification list specification"
633 " (unexpected '%s')" % remainder)
634 debug("potential_mod_list:\n%s", pformat(tree))
635 return tree
638 def p_positive(x): return x > 0
639 def p_negative(x): return x < 0
640 def p_nonnegative(x): return x >= 0
641 def p_proportion(x): return 0 <= x <= 1
643 # Configuration file parameter specification
645 # name -> (type, default, check_fn)
646 # type may be bool, int, float, str, modlist, or a tuple of values
647 # default, as a string value, or None if value must be explicitly specified
648 # check_fn is an optional function that returns True iff the value is valid
650 PARAMETER_INFO = {
651 "databases" : (str, None),
652 "decoy_locus_prefix" : (str, "SHUFFLED_"),
653 "mass_regimes" : (mass_regime_list, "MONO"),
654 "pervasive_mods" : (fixed_mod_list, ""),
655 "potential_mods" : (potential_mod_list, ""),
656 "potential_mod_limit" : (int, 2, p_nonnegative),
657 "charge_limit" : (int, 3, p_positive),
658 "min_peptide_length" : (int, 5, p_positive), # needed?
659 "min_parent_spectrum_mass" : (float, 0, p_nonnegative),
660 "max_parent_spectrum_mass" : (float, 10000, p_nonnegative),
661 "TIC_cutoff_proportion" : (float, 0.98, p_proportion),
662 "parent_mz_tolerance" : (float, 1.25, p_nonnegative),
663 "fragment_mass_tolerance" : (float, 0.5, p_nonnegative),
664 "intensity_class_count" : (int, 3, p_positive),
665 "intensity_class_ratio" : (float, 2.0, p_positive), # really > 1.0?
666 "best_result_count" : (int, 5, p_positive),
670 def validate_parameters(parameters, parameter_info=PARAMETER_INFO):
671 """Verify that parameters are valid, have valid values, and correspond to
672 currently implemented functionality. Values are converted, default values
673 are filled in, and the resulting name/value dict returned.
675 >>> sorted(validate_parameters({'spectrum, total peaks' : '40'},
676 ... {'spectrum, total peaks'
677 ... : (int, '50', p_positive),
678 ... 'output, spectra'
679 ... : (bool, 'no')}).items())
680 [('output, spectra', False), ('spectrum, total peaks', 40)]
684 pmap = {}
685 for p_name, p_info in sorted(parameter_info.items()):
686 type_ = p_info[0]
687 default = p_info[1]
688 check_fn = len(p_info) > 2 and p_info[2] or None
690 v = parameters.get(p_name)
691 if v == None:
692 if default != None:
693 debug("parameter '%s' defaulting to '%s'", p_name, default)
694 v = default
695 else:
696 error("missing required parameter '%s'" % p_name)
697 if isinstance(type_, tuple):
698 if not v in type_:
699 error("parameter '%s' value '%s' not in %s (feature not"
700 " implemented?)" % (p_name, v, type_))
701 elif type_ == bool:
702 v = { 'yes' : True, 'no' : False }.get(v)
703 if v == None:
704 error("parameter '%s' requires a value of 'yes' or 'no'")
705 else:
706 try:
707 v = type_(v)
708 except ValueError, e:
709 error("parameter '%s' has value '%s' with invalid format [%s]"
710 % (p_name, v, e))
711 if check_fn and not check_fn(v):
712 error("parameter '%s' has invalid value '%s' (or feature not"
713 " implemented)" % (p_name, v))
714 pmap[p_name] = v
716 unknown_parameters = set(parameters) - set(parameter_info)
717 if unknown_parameters:
718 warning("%s unknown parameter(s):\n %s"
719 % (len(unknown_parameters),
720 pformat(sorted(list(unknown_parameters)))))
721 return pmap
724 def pythonize_swig_object(o, only_fields=None, skip_fields=[]):
725 """Creates a pure Python copy of a SWIG object, so that it can be easily
726 pickled, or printed (for debugging purposes). Each SWIG object is
727 pythonized as a dictionary, for convenience. If provided, 'only_fields',
728 limits the copy to the list of fields specified. Otherwise,
729 'skip_fields' if given is a list of methods not to include (this helps
730 avoid infinite recursion).
732 >>> pprint(pythonize_swig_object(cgreylag.score_stats(1, 1)))
733 {'best_matches': [[{'mass_trace': [],
734 'peptide_begin': [],
735 'peptide_sequence': '',
736 'predicted_parent_mass': 0.0,
737 'score': 0.0,
738 'sequence_name': [],
739 'spectrum_index': -1}]],
740 'candidate_spectrum_count': 0,
741 'combinations_searched': 0}
745 if isinstance(o, str):
746 return o
747 try:
748 len(o)
749 except TypeError:
750 pass
751 else:
752 return list(pythonize_swig_object(x, only_fields, skip_fields)
753 for x in o)
754 if hasattr(o, '__swig_getmethods__'):
755 s = {}
756 for a in o.__swig_getmethods__:
757 if (only_fields != None and a in only_fields
758 or only_fields == None and a not in skip_fields):
759 s[a] = pythonize_swig_object(getattr(o, a), only_fields,
760 skip_fields)
761 return s
762 return o
764 # A possible PCA (pyrrolidone carboxyl acid) modification accounts for
765 # circularization of the peptide N-terminal. PCA mods are excluded if a
766 # static N-terminal mod has been specified. Likewise, choosing a PCA mod will
767 # exclude choosing a potential N-terminal mod. (The PCA mod for 'C' is
768 # dependent on a static mod of C+57 being in effect.)
770 def get_pca_table(mass_regimes):
771 """Return a list of tuples (residues, parent delta, fragment delta)
772 describing the PCA possibilities for each mass regime. Residues is a
773 string specifying the residues this PCA delta applies to.
775 # FIX: According to Xtandem, C is only a candidate for PCA if
776 # carboxyamidomethylated (C+57). Currently we always search it.
777 return [ [('', 0, 0),
778 ('E', -1 * CP.parent_mass_regime[r].water_mass,
779 -1 * CP.fragment_mass_regime[r].water_mass),
780 ('QC', -1 * CP.parent_mass_regime[r].ammonia_mass,
781 -1 * CP.fragment_mass_regime[r].ammonia_mass)]
782 for r in range(len(mass_regimes)) ]
785 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
786 if not mod_tree:
787 if 0 < len(conjuncts) <= limit:
788 yield conjuncts
789 return
790 first, rest = mod_tree[0], mod_tree[1:]
791 if isinstance(first, list):
792 for x in enumerate_disjunction(first, limit):
793 for y in enumerate_conjunction(rest, limit, conjuncts + x):
794 yield y
795 else:
796 for y in enumerate_conjunction(rest, limit, conjuncts):
797 yield y
798 for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
799 yield y
801 def enumerate_disjunction(mod_tree, limit=sys.maxint):
802 """Generates the conjuncts for mod_tree that are no longer than limit.
804 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
805 [[], ['a'], ['b'], ['c']]
806 >>> list(enumerate_disjunction([[1,2,3]]))
807 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
808 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
809 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
810 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
811 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
812 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
813 [[]]
816 assert isinstance(mod_tree, list)
817 yield []
818 for b in mod_tree:
819 for s in enumerate_conjunction(b, limit):
820 yield s
822 def get_mod_conjunct_triples(mod_tree, limit):
823 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
824 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
825 ordered by increasing number of conjuncts, and with duplicates within and
826 across removed. A mass table is also appended to each conjunct.
828 # FIX: is there a more elegant way or place for all this?
829 def enmass(t):
830 def rmass(regime_index, par_frag, sign, delta, is_mono):
831 global MASS_REGIME_ATOMIC_MASSES
832 if isinstance(delta, str):
833 if is_mono:
834 regime_index = 0
835 return (sign * formula_mass(delta,
836 MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
837 else:
838 return sign * delta
840 sign, delta, is_mono = t[:3]
841 return t + (tuple((rmass(r, 0, sign, delta, is_mono),
842 rmass(r, 1, sign, delta, is_mono))
843 for r in range(len(XTP["mass_regimes"]))),)
845 def triple(c):
846 Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
847 Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
848 assert len(Ns) <= 1 and len(Cs) <= 1
849 rest = [ enmass(x) for x in c if x[3] not in '[]' ]
850 rest = tuple(sorted(list(frozenset(rest))))
851 return (Ns, Cs, rest)
853 return sorted(list(frozenset(triple(conjunct)
854 for conjunct
855 in enumerate_disjunction(mod_tree, limit))),
856 key=lambda x: (sum(len(y) for y in x), x))
859 def gen_delta_bag_counts(i, remainder, bag):
860 if i < 1:
861 assert i == 0
862 bag[0] = remainder
863 yield tuple(bag)
864 return
865 for delta in range(1, remainder-i+1):
866 bag[i] = delta
867 for x in gen_delta_bag_counts(i-1, remainder-delta, bag):
868 yield x
870 def generate_delta_bag_counts(mod_count, conjunct_length):
871 """Generate all tuples of positive integers having length conjunct_length
872 and sum mod_count. As a special case, () is such a tuple having length 0
873 and sum 0.
875 >>> for i in range(6): print i, list(generate_delta_bag_counts(4, i))
877 0 []
878 1 [(4,)]
879 2 [(3, 1), (2, 2), (1, 3)]
880 3 [(2, 1, 1), (1, 2, 1), (1, 1, 2)]
881 4 [(1, 1, 1, 1)]
882 5 []
883 >>> list(generate_delta_bag_counts(0, 0))
884 [()]
885 >>> list(generate_delta_bag_counts(0, 1))
889 if conjunct_length == 0:
890 return [()] if mod_count == 0 else []
891 if mod_count < conjunct_length:
892 return []
893 return gen_delta_bag_counts(conjunct_length - 1, mod_count,
894 [0] * conjunct_length)
897 def set_context_conjuncts(context, mass_regime_index, N_cj, C_cj, R_cj):
898 assert len(N_cj) < 1 and len(C_cj) < 1
899 context.N_delta = 0
900 if N_cj:
901 context.N_delta = N_cj[0][5][mass_regime_index][1]
902 context.C_delta = 0
903 if C_cj:
904 context.C_delta = C_cj[0][5][mass_regime_index][1]
905 context.delta_bag_lookup.clear()
906 context.delta_bag_lookup.resize(ord('Z')+1)
907 context.delta_bag_delta.clear()
908 for n, cj in enumerate(R_cj):
909 context.delta_bag_delta.append(cj[5][mass_regime_index][1])
910 for r in cj[3]:
911 context.delta_bag_lookup[ord(r)] \
912 = context.delta_bag_lookup[ord(r)] + (n,)
915 def search_all(options, context, score_statistics):
916 """Search sequence database against searchable spectra."""
918 mod_limit = XTP["potential_mod_limit"]
920 mod_conjunct_triples = get_mod_conjunct_triples(XTP["potential_mods"],
921 mod_limit)
922 info("%s unique potential modification conjuncts",
923 len(mod_conjunct_triples))
924 debug("mod_conjunct_triples (unique):\n%s", pformat(mod_conjunct_triples))
926 mass_regimes = XTP["mass_regimes"]
927 pca_table = get_pca_table(mass_regimes)
928 debug("pca_table: %s", pca_table)
930 total_combinations_searched = 0
932 for mod_count in range(mod_limit + 1):
933 context.mod_count = mod_count
934 for mr_index, (mass_regime, pca_entry) in enumerate(zip(mass_regimes,
935 pca_table)):
936 context.mass_regime_index = mr_index
937 for pca_res, pca_parent_delta, pca_frag_delta in pca_entry:
938 context.pca_residues = pca_res
939 context.pca_delta = pca_frag_delta
940 for cji, (N_cj, C_cj, R_cj) in enumerate(mod_conjunct_triples):
941 if pca_res and N_cj:
942 continue # mutually exclusive, for now
943 set_context_conjuncts(context, mr_index, N_cj, C_cj, R_cj)
944 debug("mod_count: %s", mod_count)
945 debug("cj_triple: N=%s C=%s R=%s", N_cj, C_cj, R_cj)
946 for delta_bag in generate_delta_bag_counts(mod_count,
947 len(R_cj)):
948 debug("delta_bag: %s", delta_bag)
950 # this clear() avoids an SWIG/STL bug!?
951 context.delta_bag_count.clear()
952 context.delta_bag_count[:] = delta_bag
954 pmrf = CP.parent_mass_regime[mr_index].fixed_residue_mass
955 fmrf = CP.fragment_mass_regime[mr_index].fixed_residue_mass
956 p_fx = (pmrf[ord('[')]
957 + (N_cj and N_cj[0][5][mr_index][0] or 0)
958 + pmrf[ord(']')]
959 + (C_cj and C_cj[0][5][mr_index][0] or 0)
960 + pca_parent_delta + PROTON_MASS)
961 context.parent_fixed_mass = p_fx
962 f_N_fx = (fmrf[ord('[')]
963 + (N_cj and N_cj[0][5][mr_index][1] or 0)
964 + pca_frag_delta)
965 context.fragment_N_fixed_mass = f_N_fx
966 f_C_fx = (fmrf[ord(']')]
967 + (C_cj and C_cj[0][5][mr_index][1] or 0)
968 + CP.fragment_mass_regime[mr_index].water_mass)
969 context.fragment_C_fixed_mass = f_C_fx
971 info("MC=%s MR=%s PCA=%s CJ=%s DB=%s"
972 % (mod_count, mr_index, pca_res, cji, delta_bag))
973 debug("p_fx %s f_N_fx %s f_C_fx %s"
974 % (p_fx, f_N_fx, f_C_fx))
976 score_statistics.combinations_searched = 0
977 cgreylag.spectrum.search_runs(context,
978 score_statistics)
979 total_combinations_searched \
980 += score_statistics.combinations_searched
981 info(" %s candidate spectra examined, this bag",
982 score_statistics.combinations_searched)
984 info('%s candidate spectra examined',
985 score_statistics.candidate_spectrum_count)
988 def get_prefix_sequence(begin_pos, run_offset, sequence):
990 >>> get_prefix_sequence(0, 0, 'abcdefghi')
992 >>> get_prefix_sequence(1, 0, 'abcdefghi')
993 '[a'
994 >>> get_prefix_sequence(3, 0, 'abcdefghi')
995 '[abc'
996 >>> get_prefix_sequence(4, 0, 'abcdefghi')
997 'abcd'
998 >>> get_prefix_sequence(4, 2, 'abcdefghi')
999 'cdef'
1002 prefix_start = run_offset + begin_pos - 4
1003 s = sequence[max(0, prefix_start):prefix_start+4]
1004 if len(s) < 4:
1005 s = '[' + s
1006 return s
1008 def get_suffix_sequence(end_pos, run_offset, sequence):
1010 >>> get_suffix_sequence(0, 0, 'abcdefghi')
1011 'abcd'
1012 >>> get_suffix_sequence(4, 0, 'abcdefghi')
1013 'efgh'
1014 >>> get_suffix_sequence(6, 0, 'abcdefghi')
1015 'ghi]'
1016 >>> get_suffix_sequence(8, 0, 'abcdefghi')
1017 'i]'
1018 >>> get_suffix_sequence(4, 2, 'abcdefghi')
1019 'ghi]'
1022 suffix_start = run_offset + end_pos
1023 s = sequence[suffix_start:suffix_start+4]
1024 if len(s) < 4:
1025 s = s + ']'
1026 return s
1029 def results_dump(score_statistics, searchable_spectra):
1030 """Return a result dict mapping spectrum names to (spectrum_metadata,
1031 best_matches) pairs. (Unneeded fields are stripped.)
1034 r = {}
1035 spectrum_metadata_fs = set(['name', 'file_id', 'mass', 'charge',
1036 'total_ion_current', 'comparisons'])
1037 py_s_spectra = pythonize_swig_object(searchable_spectra,
1038 only_fields=spectrum_metadata_fs)
1039 py_matches = pythonize_swig_object(score_statistics.best_matches,
1040 skip_fields=['spectrum_index'])
1041 assert len(py_s_spectra) == len(py_matches)
1043 for sp_metadata, sp_matches in zip(py_s_spectra, py_matches):
1044 assert sp_metadata['name'] not in r, "duplicate spectrum name"
1045 r[sp_metadata['name']] = (sp_metadata, sp_matches)
1047 return r
1050 def main(args=sys.argv[1:]):
1051 parser = optparse.OptionParser(usage=
1052 "usage: %prog [options] <job-id>"
1053 " <configuration-file> <ms2-file>...",
1054 description=__doc__, version=__version__)
1055 pa = parser.add_option
1056 pa("-P", "--parameter", nargs=2, dest="parameters", action="append",
1057 default=[],
1058 help="override a parameter in <parameter-file>, may be used multiple"
1059 " times", metavar="NAME VALUE")
1060 pa("-w", "--work-slice", nargs=2, type="float", dest="work_slice",
1061 help="search a subinterval [L:U) of the work space"
1062 " (where 0 <= L <= U <= 1) in standalone mode", metavar="L U")
1063 pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
1064 pa("-p", "--show-progress", action="store_true", dest="show_progress",
1065 help="show running progress")
1066 pa("--estimate", action="store_true", dest="estimate_only",
1067 help="just estimate the time required for the search")
1068 pa("-v", "--verbose", action="store_true", dest="verbose",
1069 help="be verbose")
1070 pa("--copyright", action="store_true", dest="copyright",
1071 help="print copyright and exit")
1072 pa("--debug", action="store_true", dest="debug",
1073 help="output debugging info")
1074 pa("--profile", action="store_true", dest="profile",
1075 help="dump Python profiling output to './greylag.prof.<pid>'")
1076 (options, args) = parser.parse_args(args=args)
1078 if options.copyright:
1079 print __copyright__
1080 sys.exit(0)
1082 if len(args) < 3:
1083 parser.print_help()
1084 sys.exit(1)
1086 job_id = args[0]
1087 configuration_fn = args[1]
1088 spectrum_fns = args[2:]
1090 if (any(True for f in spectrum_fns if not f.endswith('.ms2'))
1091 or (options.work_slice
1092 and not (0 <= options.work_slice[0]
1093 <= options.work_slice[1] <= 1))):
1094 parser.print_help()
1095 sys.exit(1)
1097 log_level = logging.WARNING
1098 if options.quiet:
1099 log_level = logging.ERROR
1100 if options.verbose:
1101 log_level = logging.INFO
1102 if options.debug:
1103 log_level = logging.DEBUG
1104 logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
1105 format='%(asctime)s %(levelname)s: %(message)s')
1106 info("starting on %s", gethostname())
1108 # prevent format char problems
1109 if '%' in job_id:
1110 error("<job-id> may not contain '%'")
1112 # check spectrum basename uniqueness, as corresponding sqt files will be
1113 # in a single directory
1114 base_spectrum_fns = [ os.path.basename(fn) for fn in spectrum_fns ]
1115 if len(base_spectrum_fns) != len(set(base_spectrum_fns)):
1116 error("base spectrum filenames must be unique")
1118 # check -P names for validity
1119 bad_names = (set(n for n,v in options.parameters)
1120 - set(PARAMETER_INFO.keys()))
1121 if bad_names:
1122 error("bad -P parameter names %s" % list(bad_names))
1124 # read params
1125 cp = ConfigParser.RawConfigParser()
1126 cp.optionxform = str # be case-sensitive
1127 with open(configuration_fn) as configuration_file:
1128 cp.readfp(configuration_file)
1129 if not cp.has_section('greylag'):
1130 error("%s has no [greylag] section" % configuration_fn)
1131 parameters = dict(cp.items('greylag'))
1132 parameters.update(dict(options.parameters)) # command-line override
1133 global XTP
1134 XTP = validate_parameters(parameters)
1136 fixed_mod_map = dict((r[3], r) for r in XTP["pervasive_mods"])
1137 initialize_spectrum_parameters(options, XTP["mass_regimes"], fixed_mod_map)
1139 # read sequence dbs
1140 databases = XTP["databases"].split()
1141 # [(locusname, defline, seq, filename), ...]
1142 fasta_db = list(read_fasta_files(databases))
1143 # [(idno, offset, locusname, defline, seq, seq_filename), ...]
1144 db = []
1145 for idno, (locusname, defline, sequence, filename) in enumerate(fasta_db):
1146 db.extend(split_sequence_into_aa_runs(idno, locusname, defline,
1147 sequence, filename))
1148 db_residue_count = sum(len(dbi[3]) for dbi in db)
1150 info("read %s sequences (%s runs, %s residues)", len(fasta_db), len(db),
1151 db_residue_count)
1152 max_run_length = max(len(r[3]) for r in db)
1153 info("max run length is %s residues", max_run_length)
1154 if max_run_length > 2**31 - 1:
1155 error("runs longer than %s not yet supported", max_run_length)
1156 if not db:
1157 error("no database sequences")
1159 # read spectrum offset indices
1160 spectrum_offset_indices = []
1161 for spfn in spectrum_fns:
1162 idxfn = spfn + '.idx'
1163 with contextlib.closing(gzip.open(idxfn)) as idxf:
1164 idx = cPickle.load(idxf)
1165 # try to verify that index matches spectrum file
1166 if idx['file size'] != os.path.getsize(spfn):
1167 error("index '%s' needs rebuilding" % idxfn)
1168 spectrum_offset_indices.append(idx['offsets'])
1170 # FIX: assume standalone mode (for now)
1171 assert options.work_slice
1173 # read spectra per work slice
1174 spectra = read_spectra_slice(spectrum_fns, spectrum_offset_indices,
1175 options.work_slice)
1176 del spectrum_offset_indices
1177 spectra.sort(key=lambda x: x.mass)
1179 def peak_statistics(spectra):
1180 counts = [ len(sp.peaks) for sp in spectra ]
1181 counts.sort()
1182 n = len(counts)
1183 return (counts[0], counts[int(n*0.25)], counts[int(n*0.5)],
1184 counts[int(n*0.75)], counts[-1], sum(counts) / float(n))
1186 if spectra:
1187 info("read %s spectra (mass range %s - %s)", len(spectra),
1188 spectra[0].mass, spectra[-1].mass)
1189 info(" peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1190 % peak_statistics(spectra))
1191 else:
1192 warning("no input spectra")
1194 # filter and normalize spectra
1195 for sp in spectra:
1196 sp.filter_peaks(XTP["TIC_cutoff_proportion"],
1197 CP.parent_mass_tolerance_max)
1198 sp.classify(XTP["intensity_class_count"], XTP["intensity_class_ratio"],
1199 XTP["fragment_mass_tolerance"])
1201 min_psm = XTP["min_parent_spectrum_mass"]
1202 max_psm = XTP["max_parent_spectrum_mass"]
1203 # FIX: also filter by 1 + 2 + 4 rule?
1204 spectra = [ sp for sp in spectra
1205 if len(sp.peaks) >= 10 and min_psm <= sp.mass <= max_psm ]
1207 if spectra:
1208 info("after filtering:")
1209 info(" %s spectra (mass range %s - %s)", len(spectra),
1210 spectra[0].mass, spectra[-1].mass)
1211 info(" peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1212 % peak_statistics(spectra))
1214 cgreylag.spectrum.set_searchable_spectra(spectra)
1215 score_statistics = cgreylag.score_stats(len(spectra),
1216 XTP["best_result_count"])
1218 if spectra:
1219 del spectra # release memory
1221 # FIX!!!
1222 # (cleavage_re, position of cleavage in cleavage_re)
1223 cleavage_motif = "[X]|[X]"
1224 cleavage_pattern, cleavage_pos = cleavage_motif_re(cleavage_motif)
1225 if cleavage_pos == None:
1226 error("cleavage site '%s' is missing '|'",
1227 XTP["protein, cleavage site"])
1228 cleavage_pattern = re.compile(cleavage_pattern)
1230 context = cgreylag.search_context()
1231 for idno, offset, locusname, defline, seq, seq_filename in db:
1232 cp = []
1233 if cleavage_motif != "[X]|[X]":
1234 cp = list(generate_cleavage_points(cleavage_pattern,
1235 cleavage_pos, seq))
1236 sr = cgreylag.sequence_run(idno, offset, seq, cp, locusname)
1237 context.sequence_runs.append(sr)
1238 context.maximum_missed_cleavage_sites = 1000000000 # FIX
1240 info("searching")
1241 search_all(options, context, score_statistics)
1242 else:
1243 warning("no spectra after filtering--search skipped")
1245 if options.estimate_only:
1246 print ("%.2f generic CPU hours"
1247 % (score_statistics.candidate_spectrum_count / 300.0e6))
1248 return
1250 info("writing result file")
1251 # FIX: this is only for standalone mode
1252 result_fn = 'grind_%s-%s.gwr' % options.work_slice
1253 with contextlib.closing(gzip.open(result_fn, 'w')) as result_file:
1254 d = { 'version' : __version__,
1255 'matches' : results_dump(score_statistics,
1256 cgreylag.cvar.spectrum_searchable_spectra),
1257 'total comparisons' : score_statistics.candidate_spectrum_count,
1258 'spectrum files' : base_spectrum_fns,
1259 'databases' : databases,
1260 'parameters' : XTP,
1261 'mass regime atomic masses' : MASS_REGIME_ATOMIC_MASSES,
1262 'proton mass' : PROTON_MASS,
1263 'argv' : sys.argv }
1264 cPickle.dump(d, result_file, cPickle.HIGHEST_PROTOCOL)
1265 info("finished, result file written to '%s'", result_fn)
1268 if __name__ == '__main__':
1269 # try:
1270 # import psyco
1271 # psyco.full()
1272 # ###psyco.bind(formula_mass)
1273 # warning('psyco enabled')
1274 # except ImportError:
1275 # pass
1277 try:
1278 if '--profile' in sys.argv:
1279 import cProfile
1280 import pstats
1281 report_fn = "greylag.prof.%s" % os.getpid()
1282 data_fn = report_fn + ".tmp"
1283 prof = cProfile.run('main()', data_fn)
1284 with open(report_fn, 'w') as report_f:
1285 try:
1286 stats = pstats.Stats(data_fn, stream=report_f)
1287 stats.strip_dirs()
1288 stats.sort_stats('cumulative')
1289 stats.print_stats(50)
1290 stats.sort_stats('time')
1291 stats.print_stats(50)
1292 print "# profile report written to '%s'" % report_fn
1293 finally:
1294 try:
1295 os.remove(data_fn)
1296 except:
1297 pass
1298 else:
1299 main()
1300 except SystemExit:
1301 raise
1302 except:
1303 logging.exception("unhandled exception")
1304 sys.exit(1)
1305 finally:
1306 logging.shutdown()
1309 # FIXES:
1310 # - need to rigorously check for bad input in any file (fasta, spectrum, xml)