Update greylag-solo and tests, plus bug fixes.
[greylag.git] / greylag_grind.py
blob0d65cb5a5481a5764de7feed65c3ac443716f8fe
1 #!/usr/bin/env greylag-python
3 '''This program does the actual work to search mass spectra against a sequence
4 database. <job-id> is a unique identifier used as a prefix for work and
5 output directories. The <configuration-file> contains program options. The
6 spectra are in <ms2-file>s; greylag-index-spectra must already have been run
7 on them.
9 This program can operate in standalone mode (useful for testing) or command
10 mode. If the --work-slice option is given, that slice will be searched in
11 standalone mode, then the program will exit. Otherwise, the program will
12 process commands until an exit command is received.
14 '''
16 from __future__ import with_statement
18 __copyright__ = '''
19 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2 of the License, or
24 (at your option) any later version.
26 This program is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 GNU General Public License for more details.
31 You should have received a copy of the GNU General Public License along
32 with this program; if not, write to the Free Software Foundation, Inc.,
33 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
34 '''
36 ##############################################################################
37 ## "Simplicity is prerequisite for reliability" - Edsger W. Dijkstra ##
38 ##############################################################################
41 __version__ = "0.0"
44 import ConfigParser
45 import contextlib
46 import cPickle
47 import fileinput
48 import gzip
49 import logging
50 from logging import debug, info, warning
51 import math
52 import optparse
53 import os
54 from pprint import pprint, pformat
55 import re
56 from socket import gethostname
57 import sys
59 import cgreylag
62 # Try to drop dead immediately on SIGINT (control-C), instead of normal Python
63 # KeyboardInterrupt processing, since we may spend long periods of time in
64 # uninterruptible C++ calls. Also die immediately on SIGPIPE.
65 try:
66 import signal
67 signal.signal(signal.SIGINT, signal.SIG_DFL)
68 signal.signal(signal.SIGPIPE, signal.SIG_DFL)
69 except:
70 pass
73 def error(s, *args):
74 "fatal error"
75 # if we're unit testing, just throw an exception
76 if __name__ != "__main__":
77 raise Exception((s + " (fatal error)") % args)
78 logging.error(s, *args)
79 sys.exit(1)
82 def fileerror(s, *args):
83 error(s + (", at line %s of file '%s'"
84 % (fileinput.filelineno(), fileinput.filename())),
85 *args)
88 # name -> value map of processed XML input parameters
89 XTP = {}
91 # handle to the singleton parameter object shared with the C++ module
92 CP = cgreylag.cvar.parameters_the
94 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
95 PROTON_MASS = 1.007276
96 ELECTRON_MASS = 0.000549 # ?
98 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
99 MONOISOTOPIC_ATOMIC_MASS = {
100 'H' : 1.00782503214,
101 'C' : 12.00000000,
102 'N' : 14.00307400529,
103 'O' : 15.994914622115,
104 'P' : 30.9737615120,
105 'S' : 31.9720706912,
108 # most prevalent only (1 in 1000)
109 ISOTOPIC_ATOMIC_MASS = { # prevalence (in %)
110 'C13' : 13.003354837810, # 1.078
111 'N15' : 15.00010889849, # 0.3687
112 'O18' : 17.99916049, # 0.20514
113 'S33' : 32.9714585012, # 0.762
114 'S34' : 33.9678668311, # 4.2928
117 AVERAGE_ATOMIC_MASS = {
118 'H' : 1.007947,
119 'C' : 12.01078,
120 'N' : 14.00672,
121 'O' : 15.99943,
122 'P' : 30.9737612,
123 'S' : 32.0655,
126 # The xtandem average residue masses are about 0.002 amu higher than those
127 # calculated directly from the above average atomic masses. None of the
128 # chemists consulted knew of any reason why, aside from lack of precision in
129 # the average atomic mass estimates. This shouldn't matter very much, as
130 # fragmentation calculations should all be monoisotopic, and we can always
131 # widen the parent tolerance window a bit.
134 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
135 """Return the mass of formula, using the given mass regime (monoisotopic
136 by default).
138 >>> formula_mass('H2O', { 'H':1, 'O':16 })
140 >>> # monoisotopic mass of glycine
141 >>> str(round(formula_mass('C2H3ON'), 4))
142 '57.0215'
145 parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
146 # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
147 return sum(atomic_mass[parts[i]] * int(parts[i+1])
148 for i in range(0, len(parts), 2))
150 # FIX: selenocysteine (U), etc
151 # residue -> formula
152 RESIDUE_FORMULA = {
153 'A' : "C3H5ON",
154 'C' : "C3H5ONS",
155 'D' : "C4H5O3N",
156 'E' : "C5H7O3N",
157 'F' : "C9H9ON",
158 'G' : "C2H3ON",
159 'H' : "C6H7ON3",
160 'I' : "C6H11ON",
161 'K' : "C6H12ON2",
162 'L' : "C6H11ON",
163 'M' : "C5H9ONS",
164 'N' : "C4H6O2N2",
165 'P' : "C5H7ON",
166 'Q' : "C5H8O2N2",
167 'R' : "C6H12ON4",
168 'S' : "C3H5O2N",
169 'T' : "C4H7O2N",
170 'V' : "C5H9ON",
171 'W' : "C11H10ON2",
172 'Y' : "C9H9O2N",
175 RESIDUES = sorted(RESIDUE_FORMULA.keys())
176 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
179 # accessed as a global variable (FIX?)
180 # [0][1] -> 'H' -> fragment mass of H for regime 0
181 MASS_REGIME_ATOMIC_MASSES = []
184 def mass_regime_atomic_masses(spec):
185 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
186 names to masses.
188 name, isotopes = spec
189 assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
190 if name == 'MONO':
191 r = MONOISOTOPIC_ATOMIC_MASS.copy()
192 else:
193 r = AVERAGE_ATOMIC_MASS.copy()
194 if isotopes:
195 iname, prevalence = isotopes[0]
196 assert iname == 'N15' and 0 <= prevalence <= 1
197 # this is a simplification, but additional accuracy pointless?
198 if name == 'MONO':
199 r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
200 else:
201 r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
202 return r
205 def initialize_spectrum_parameters(options, mass_regimes, fixed_mod_map):
206 """Initialize parameters known to the spectrum module.
207 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
210 debug('fixed_mod_map: %s', fixed_mod_map)
211 # This is the size of vectors that are indexed by residues (A-Z) or
212 # special characters ('[]').
213 RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
215 # These are currently monoisotopic. (deuterium pointless?)
216 CP.proton_mass = PROTON_MASS
217 CP.hydrogen_mass = formula_mass("H")
219 global MASS_REGIME_ATOMIC_MASSES
220 for regime_pair in mass_regimes:
221 assert len(regime_pair) == 2 # parent and fragment
222 info('mass regime: %s', regime_pair)
223 MASS_REGIME_ATOMIC_MASSES.append([])
224 for n, regime in enumerate(regime_pair):
225 atmass = mass_regime_atomic_masses(regime)
226 MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
227 creg = cgreylag.mass_regime_parameters()
229 creg.hydroxyl_mass = formula_mass("OH", atmass)
230 creg.water_mass = formula_mass("H2O", atmass)
231 creg.ammonia_mass = formula_mass("NH3", atmass)
233 creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
235 for r in RESIDUES_W_BRACKETS:
236 m = 0
237 if r in RESIDUES:
238 m = formula_mass(RESIDUE_FORMULA[r], atmass)
239 rmod = fixed_mod_map.get(r)
240 if rmod:
241 if isinstance(rmod[1], str):
242 if rmod[2]:
243 m += rmod[0] * formula_mass(rmod[1])
244 else:
245 m += rmod[0] * formula_mass(rmod[1], atmass)
246 else:
247 m += rmod[0] * rmod[1]
248 creg.fixed_residue_mass[ord(r)] = m
249 # assuming these are monoisotopic (not regime)
250 creg.fixed_residue_mass[ord('[')] += formula_mass("H")
251 creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
252 if not n:
253 CP.parent_mass_regime.append(creg);
254 else:
255 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
256 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
257 CP.fragment_mass_regime.append(creg);
258 for r in RESIDUES_W_BRACKETS:
259 info('fixed_mass %s: %s', r,
260 [ "%.6f/%.6f"
261 % (CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
262 CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
263 for rn in range(len(mass_regimes)) ])
264 for r in RESIDUES:
265 for rn in range(len(mass_regimes)):
266 # physically impossible (and results would be garbage)
267 if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
268 raise ValueError('bogus parent mass specification for %s' % r)
269 if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
270 raise ValueError('bogus parent mass specification for %s' % r)
272 CP.parent_mass_tolerance_1 = XTP["parent_mz_tolerance"]
273 CP.parent_mass_tolerance_max = (XTP["parent_mz_tolerance"]
274 * XTP["charge_limit"])
276 CP.fragment_mass_tolerance = XTP["fragment_mass_tolerance"]
277 CP.intensity_class_count = XTP["intensity_class_count"]
279 CP.minimum_peptide_length = XTP["min_peptide_length"]
281 # CP.ln_factorial[n] == ln(n!)
282 CP.ln_factorial.resize(int(XTP["max_parent_spectrum_mass"]
283 / XTP["fragment_mass_tolerance"] + 100), 0.0)
284 for n in range(2, len(CP.ln_factorial)):
285 CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
287 CP.estimate_only = bool(options.estimate_only)
288 CP.show_progress = bool(options.show_progress)
291 def cleavage_motif_re(motif):
292 """Return (regexp, pos), where regexp is a regular expression that will
293 match a cleavage motif, and pos is the position of the cleavage with
294 respect to the match (or None). (The RE actually matches one character,
295 the rest matching as lookahead, so that re.finditer will find all
296 overlapping matches.)
298 >>> cleavage_motif_re('[KR]|{P}')
299 ('[KR](?=[^P])', 1)
300 >>> cleavage_motif_re('[X]|[X]')
301 ('.(?=.)', 1)
304 cleavage_pos = None
305 re_parts = []
306 motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
307 parts = [ p[0] for p in motif_re.findall(motif) ]
308 if ''.join(parts) != motif:
309 raise ValueError('invalid cleavage motif pattern')
310 i = 0
311 for part in parts:
312 if part == '|':
313 if cleavage_pos != None:
314 raise ValueError("invalid cleavage motif pattern"
315 " (multiple '|'s)")
316 cleavage_pos = i
317 continue
318 if part == '[X]':
319 re_parts.append('.')
320 elif part[0] == '[':
321 re_parts.append('[%s]' % part[1:-1])
322 elif part[0] == '{':
323 re_parts.append('[^%s]' % part[1:-1])
324 else:
325 assert False, "unknown cleavage motif syntax"
326 i += 1
327 if len(re_parts) == 0:
328 re_pattern = '.'
329 elif len(re_parts) == 1:
330 re_pattern = re_parts[0]
331 else:
332 re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
333 return (re_pattern, cleavage_pos)
336 def generate_cleavage_points(cleavage_re, cleavage_pos, sequence):
337 """Yields the offsets of the cleavages in sequence. The endpoints are
338 always included, by convention.
340 >>> list(generate_cleavage_points(re.compile('[KR](?=[^P])'), 1,
341 ... 'ARBCKDEKPF'))
342 [0, 2, 5, 10]
345 yield 0
346 for m in cleavage_re.finditer(sequence):
347 p = m.start() + cleavage_pos
348 if 0 < p < len(sequence):
349 yield p
350 yield len(sequence)
353 AA_SEQUENCE = re.compile(r'[ARNDCQEGHILKMFPSTWYV]+')
355 def split_sequence_into_aa_runs(idno, locusname, defline, sequence, filename):
356 """Returns a tuple (idno, start, locusname, defline, seq, filename) for
357 each contiguous run of residues in sequence, where 'start' is the position
358 of 'seq' in 'sequence'.
360 >>> pprint(split_sequence_into_aa_runs(123, 'ln', 'ln defline',
361 ... 'STSS*DEFABA', 'filename'))
362 [(123, 0, 'ln', 'ln defline', 'STSS', 'filename'),
363 (123, 5, 'ln', 'ln defline', 'DEFA', 'filename'),
364 (123, 10, 'ln', 'ln defline', 'A', 'filename')]
367 return [ (idno, m.start(), locusname, defline, m.group(), filename)
368 for n, m in enumerate(AA_SEQUENCE.finditer(sequence)) ]
371 def read_fasta_files(filenames):
372 """Yield (locusname, defline, sequence, filename) tuples as read from
373 FASTA files (uppercasing sequence)."""
375 loci_seen = set()
376 locusname, defline = None, None
377 seqs = []
379 # reset fileinput first, in case it's been called before
380 try:
381 fileinput.close()
382 except RuntimeError:
383 pass
384 for line in fileinput.input(filenames):
385 line = line.strip()
386 if line[:1] == '>':
387 if defline != None:
388 yield (locusname, defline, ''.join(seqs), fileinput.filename())
389 elif seqs:
390 fileerror("bad format: line precedes initial defline")
391 defline = line[1:]
392 locusname_rest = defline.split(None, 1)
393 if not locusname_rest:
394 error("empty locus name not allowed")
395 locusname = locusname_rest[0]
396 if locusname in loci_seen:
397 error("locus name '%s' is not unique in the search database(s)"
398 % locusname)
399 loci_seen.add(locusname)
400 seqs = []
401 else:
402 seqs.append(line.upper())
403 if defline:
404 yield (locusname, defline, ''.join(seqs), fileinput.filename())
407 def read_spectra_slice(spectrum_fns, offset_indices, slice):
408 s_l, s_u = slice
409 assert 0 <= s_l <= s_u <= 1
411 total_spectra = sum(len(oi) for oi in offset_indices)
412 sp_l = int(float(s_l) * total_spectra)
413 sp_u = int(float(s_u) * total_spectra)
415 seeing_l = 0
416 spectra = []
418 for f_no, (fn, oi) in enumerate(zip(spectrum_fns, offset_indices)):
419 seeing_u = seeing_l + len(oi)
420 f_l = max(sp_l, seeing_l)
421 f_u = min(sp_u, seeing_u)
423 if f_l < f_u:
424 b_l = oi[f_l - seeing_l]
425 b_u = oi[f_u - seeing_l] if f_u - seeing_l < len(oi) else -1 # oo
427 with open(fn) as f:
428 fsp = cgreylag.spectrum.read_spectra_from_ms2(f, f_no,
429 b_l, b_u)
430 spectra.extend(fsp)
431 seeing_l = seeing_u
433 return spectra
436 # XML parameter file processing
438 # FIX: This parsing is way too complex. How to simplify?
440 def mass_regime_part(part_specification):
441 """Parse a single mass regime specification part.
443 >>> mass_regime_part('MONO')
444 ('MONO', [])
445 >>> mass_regime_part('AVG')
446 ('AVG', [])
447 >>> mass_regime_part('MONO(N15@87.5%)')
448 ('MONO', [('N15', 0.875)])
451 ps = [ x.strip() for x in part_specification.partition('(') ]
452 if ps[0] not in ('MONO', 'AVG'):
453 raise ValueError("invalid mass regime list specification"
454 " (regime id must be 'MONO' or 'AVG')")
455 if not ps[1]:
456 return (ps[0], [])
457 if ps[2][-1] != ')':
458 raise ValueError("invalid mass regime list specification"
459 " (expected ')')")
460 pps = [ x.strip() for x in ps[2][:-1].split(',') ]
461 if len(pps) > 1:
462 raise ValueError("invalid mass regime list specification"
463 " (multiple isotopes not yet implemented)")
464 ppps = [ x.strip() for x in pps[0].partition('@') ]
465 if not ppps[1]:
466 raise ValueError("invalid mass regime list specification"
467 " (expected '@')")
468 if ppps[0] not in ('N15',):
469 raise ValueError("invalid mass regime list specification"
470 " (isotope id must currently be 'N15')")
471 if ppps[2][-1] != '%':
472 raise ValueError("invalid mass regime list specification"
473 " (expected '%')")
474 prevalence = float(ppps[2][:-1]) / 100
475 if not (0 <= prevalence <= 1):
476 raise ValueError("invalid mass regime list specification"
477 " (prevalence must be in range 0-100%)")
478 return (ps[0], [(ppps[0], prevalence)])
480 def mass_regime_list(mass_regime_list_specification):
481 """Check and return a list of regime tuples (parent_regime,
482 fragment_regime), where each regime is a tuple (id, [(isotope_id,
483 prevalence), ...]). Multiple isotopes (when implemented) would be
484 comma-separated.
486 >>> pprint(mass_regime_list('AVG/MONO;MONO;MONO(N15@75%)'))
487 [[('AVG', []), ('MONO', [])],
488 [('MONO', []), ('MONO', [])],
489 [('MONO', [('N15', 0.75)]), ('MONO', [('N15', 0.75)])]]
492 result = []
493 for regspec in mass_regime_list_specification.split(';'):
494 halves = [ x.strip() for x in regspec.split('/') ]
495 if len(halves) > 2:
496 raise ValueError("invalid mass regime list specification"
497 " (too many '/'?)")
498 pr = [ mass_regime_part(h) for h in halves ]
499 if len(pr) == 1:
500 pr = [ pr[0], pr[0] ]
501 result.append(pr)
502 debug("mass regime list:\n%s", pformat(result))
504 # The first fragmentation regime should generally be MONO, so that
505 # formulaic deltas with '!' do the expected thing.
506 if result[0][1] != ('MONO', []):
507 raise ValueError("first fragmentation regime was something other than"
508 " 'MONO' with no isotopes--this is almost certainly"
509 " not what was intended")
510 return result
513 def parse_mod_term(s, is_potential=False):
514 """Parse a modification term, returning a tuple (sign, mod, fixed_regime,
515 residues, description).
517 >>> parse_mod_term('-C2H3ON!@C')
518 (-1, 'C2H3ON', True, 'C', None)
519 >>> parse_mod_term('42@STY phosphorylation', is_potential=True)
520 (1, 42.0, False, 'STY', 'phosphorylation')
524 m = re.match(r'^\s*(-|\+)?(([1-9][0-9.]*)|([A-Z][A-Z0-9]*))(!?)'
525 r'@([A-Z]+|\[|\])(\s+([A-Za-z0-9_]+))?\s*$', s)
526 if not m:
527 raise ValueError("invalid modification term specification"
528 " '%s'" % s)
529 mg = m.groups()
530 invalid_residues = set(mg[5]) - set(RESIDUES_W_BRACKETS)
531 if invalid_residues:
532 raise ValueError("invalid modification list specification"
533 " (invalid residues %s)" % list(invalid_residues))
534 delta = mg[1]
535 if mg[2]:
536 delta = float(mg[1])
537 if is_potential and abs(delta) < 0.0001:
538 raise ValueError("invalid modification list specification"
539 " (delta '%s' is too small)" % delta)
540 residues = mg[5]
541 if not is_potential and len(residues) != 1:
542 raise ValueError("invalid modification list specification '%s' (only"
543 " potential modifications may have multiple residues)"
544 % residues)
545 if len(residues) != len(set(residues)):
546 raise ValueError("invalid modification list specification"
547 " '%s' (duplicate residues prohibited)"
548 % residues)
549 return (mg[0] == '-' and -1 or 1, delta, mg[4] == '!', residues, mg[7])
552 def fixed_mod_list(specification):
553 """Check and return a list of modification tuples.
555 >>> fixed_mod_list('57@C')
556 [(1, 57.0, False, 'C', None)]
557 >>> fixed_mod_list('57@C,CH!@N desc')
558 [(1, 57.0, False, 'C', None), (1, 'CH', True, 'N', 'desc')]
559 >>> fixed_mod_list('57@C,58@C')
560 Traceback (most recent call last):
562 ValueError: invalid modification list specification '['C', 'C']' (duplicate residues prohibited)
565 if not specification:
566 return []
567 result = [ parse_mod_term(s) for s in specification.split(',') ]
568 residues = [ x[3] for x in result ]
569 # this check is across terms; the one above is within terms
570 if len(residues) != len(set(residues)):
571 raise ValueError("invalid modification list specification"
572 " '%s' (duplicate residues prohibited)"
573 % residues)
574 debug("fixed_mod_list:\n%s", pformat(result))
575 return result
578 def parse_mod_basic_expression(s):
579 s = s.strip()
580 if s[0] == '(':
581 tree, rest = parse_mod_disjunction(s[1:])
582 rest = rest.lstrip()
583 if rest[:1] != ')':
584 raise ValueError("invalid modification list specification"
585 " (expected matching ')')")
586 return tree, rest[1:]
587 parts = re.split(r'([;,()])', s, 1)
588 if len(parts) == 1:
589 term, rest = s, ''
590 else:
591 assert len(parts) == 3
592 term, rest = parts[0], parts[1]+parts[2]
593 return parse_mod_term(term, is_potential=True), rest
595 def parse_mod_conjunction(s):
596 result = []
597 while s:
598 tree, s = parse_mod_basic_expression(s)
599 result.append(tree)
600 s = s.lstrip()
601 if s[:1] != ',':
602 break
603 s = s[1:]
604 return result, s
606 def parse_mod_disjunction(s):
607 result = []
608 while s:
609 tree, s = parse_mod_conjunction(s)
610 result.append(tree)
611 s = s.lstrip()
612 if s[:1] != ';':
613 break
614 s = s[1:]
615 return result, s
617 def potential_mod_list(specification):
618 """Check and return a tree of potential modification tuples. Nodes at
619 even (odd) levels are disjunctions (conjunctions). (The top list is a
620 disjunction.)
622 >>> potential_mod_list('')
624 >>> potential_mod_list('PO3H@STY; C2H2O@KST')
625 [[(1, 'PO3H', False, 'STY', None)], [(1, 'C2H2O', False, 'KST', None)]]
626 >>> potential_mod_list('PO3H@STY, C2H2O@KST')
627 [[(1, 'PO3H', False, 'STY', None), (1, 'C2H2O', False, 'KST', None)]]
628 >>> pprint(potential_mod_list('''(PO3H@STY phosphorylation;
629 ... C2H2O@KST acetylation;
630 ... CH2@AKST methylation),
631 ... O@M oxidation'''))
632 [[[[(1, 'PO3H', False, 'STY', 'phosphorylation')],
633 [(1, 'C2H2O', False, 'KST', 'acetylation')],
634 [(1, 'CH2', False, 'AKST', 'methylation')]],
635 (1, 'O', False, 'M', 'oxidation')]]
638 if not specification:
639 return []
640 tree, remainder = parse_mod_disjunction(specification)
641 if remainder:
642 raise ValueError("invalid modification list specification"
643 " (unexpected '%s')" % remainder)
644 debug("potential_mod_list:\n%s", pformat(tree))
645 return tree
648 def p_positive(x): return x > 0
649 def p_negative(x): return x < 0
650 def p_nonnegative(x): return x >= 0
651 def p_proportion(x): return 0 <= x <= 1
653 # Configuration file parameter specification
655 # name -> (type, default, check_fn)
656 # type may be bool, int, float, str, modlist, or a tuple of values
657 # default, as a string value, or None if value must be explicitly specified
658 # check_fn is an optional function that returns True iff the value is valid
660 PARAMETER_INFO = {
661 "databases" : (str, None),
662 "decoy_locus_prefix" : (str, "SHUFFLED_"),
663 "mass_regimes" : (mass_regime_list, "MONO"),
664 "pervasive_mods" : (fixed_mod_list, ""),
665 "potential_mods" : (potential_mod_list, ""),
666 "potential_mod_limit" : (int, 2, p_nonnegative),
667 "charge_limit" : (int, 3, p_positive),
668 "min_peptide_length" : (int, 5, p_positive), # needed?
669 "min_parent_spectrum_mass" : (float, 0, p_nonnegative),
670 "max_parent_spectrum_mass" : (float, 10000, p_nonnegative),
671 "TIC_cutoff_proportion" : (float, 0.98, p_proportion),
672 "parent_mz_tolerance" : (float, 1.25, p_nonnegative),
673 "fragment_mass_tolerance" : (float, 0.5, p_nonnegative),
674 "intensity_class_count" : (int, 3, p_positive),
675 "intensity_class_ratio" : (float, 2.0, p_positive), # really > 1.0?
676 "best_result_count" : (int, 5, p_positive),
680 def validate_parameters(parameters, parameter_info=PARAMETER_INFO):
681 """Verify that parameters are valid, have valid values, and correspond to
682 currently implemented functionality. Values are converted, default values
683 are filled in, and the resulting name/value dict returned.
685 >>> sorted(validate_parameters({'spectrum, total peaks' : '40'},
686 ... {'spectrum, total peaks'
687 ... : (int, '50', p_positive),
688 ... 'output, spectra'
689 ... : (bool, 'no')}).items())
690 [('output, spectra', False), ('spectrum, total peaks', 40)]
694 pmap = {}
695 for p_name, p_info in sorted(parameter_info.items()):
696 type_ = p_info[0]
697 default = p_info[1]
698 check_fn = len(p_info) > 2 and p_info[2] or None
700 v = parameters.get(p_name)
701 if v == None:
702 if default != None:
703 debug("parameter '%s' defaulting to '%s'", p_name, default)
704 v = default
705 else:
706 error("missing required parameter '%s'" % p_name)
707 if isinstance(type_, tuple):
708 if not v in type_:
709 error("parameter '%s' value '%s' not in %s (feature not"
710 " implemented?)" % (p_name, v, type_))
711 elif type_ == bool:
712 v = { 'yes' : True, 'no' : False }.get(v)
713 if v == None:
714 error("parameter '%s' requires a value of 'yes' or 'no'")
715 else:
716 try:
717 v = type_(v)
718 except ValueError, e:
719 error("parameter '%s' has value '%s' with invalid format [%s]"
720 % (p_name, v, e))
721 if check_fn and not check_fn(v):
722 error("parameter '%s' has invalid value '%s' (or feature not"
723 " implemented)" % (p_name, v))
724 pmap[p_name] = v
726 unknown_parameters = set(parameters) - set(parameter_info)
727 if unknown_parameters:
728 warning("%s unknown parameter(s):\n %s"
729 % (len(unknown_parameters),
730 pformat(sorted(list(unknown_parameters)))))
731 return pmap
734 def pythonize_swig_object(o, only_fields=None, skip_fields=[]):
735 """Creates a pure Python copy of a SWIG object, so that it can be easily
736 pickled, or printed (for debugging purposes). Each SWIG object is
737 pythonized as a dictionary, for convenience. If provided, 'only_fields',
738 limits the copy to the list of fields specified. Otherwise,
739 'skip_fields' if given is a list of methods not to include (this helps
740 avoid infinite recursion).
742 >>> pprint(pythonize_swig_object(cgreylag.score_stats(1, 1)))
743 {'best_matches': [[{'mass_trace': [],
744 'peptide_begin': [],
745 'peptide_sequence': '',
746 'predicted_parent_mass': 0.0,
747 'score': 0.0,
748 'sequence_name': [],
749 'spectrum_index': -1}]],
750 'candidate_spectrum_count': 0,
751 'combinations_searched': 0}
755 if isinstance(o, str):
756 return o
757 try:
758 len(o)
759 except TypeError:
760 pass
761 else:
762 return list(pythonize_swig_object(x, only_fields, skip_fields)
763 for x in o)
764 if hasattr(o, '__swig_getmethods__'):
765 s = {}
766 for a in o.__swig_getmethods__:
767 if (only_fields != None and a in only_fields
768 or only_fields == None and a not in skip_fields):
769 s[a] = pythonize_swig_object(getattr(o, a), only_fields,
770 skip_fields)
771 return s
772 return o
774 # A possible PCA (pyrrolidone carboxyl acid) modification accounts for
775 # circularization of the peptide N-terminal. PCA mods are excluded if a
776 # static N-terminal mod has been specified. Likewise, choosing a PCA mod will
777 # exclude choosing a potential N-terminal mod. (The PCA mod for 'C' is
778 # dependent on a static mod of C+57 being in effect.)
780 def get_pca_table(mass_regimes):
781 """Return a list of tuples (residues, parent delta, fragment delta)
782 describing the PCA possibilities for each mass regime. Residues is a
783 string specifying the residues this PCA delta applies to.
785 # FIX: According to Xtandem, C is only a candidate for PCA if
786 # carboxyamidomethylated (C+57). Currently we always search it.
787 return [ [('', 0, 0),
788 ('E', -1 * CP.parent_mass_regime[r].water_mass,
789 -1 * CP.fragment_mass_regime[r].water_mass),
790 ('QC', -1 * CP.parent_mass_regime[r].ammonia_mass,
791 -1 * CP.fragment_mass_regime[r].ammonia_mass)]
792 for r in range(len(mass_regimes)) ]
795 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
796 if not mod_tree:
797 if 0 < len(conjuncts) <= limit:
798 yield conjuncts
799 return
800 first, rest = mod_tree[0], mod_tree[1:]
801 if isinstance(first, list):
802 for x in enumerate_disjunction(first, limit):
803 for y in enumerate_conjunction(rest, limit, conjuncts + x):
804 yield y
805 else:
806 for y in enumerate_conjunction(rest, limit, conjuncts):
807 yield y
808 for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
809 yield y
811 def enumerate_disjunction(mod_tree, limit=sys.maxint):
812 """Generates the conjuncts for mod_tree that are no longer than limit.
814 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
815 [[], ['a'], ['b'], ['c']]
816 >>> list(enumerate_disjunction([[1,2,3]]))
817 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
818 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
819 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
820 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
821 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
822 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
823 [[]]
826 assert isinstance(mod_tree, list)
827 yield []
828 for b in mod_tree:
829 for s in enumerate_conjunction(b, limit):
830 yield s
832 def get_mod_conjunct_triples(mod_tree, limit):
833 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
834 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
835 ordered by increasing number of conjuncts, and with duplicates within and
836 across removed. A mass table is also appended to each conjunct.
838 # FIX: is there a more elegant way or place for all this?
839 def enmass(t):
840 def rmass(regime_index, par_frag, sign, delta, is_mono):
841 global MASS_REGIME_ATOMIC_MASSES
842 if isinstance(delta, str):
843 if is_mono:
844 regime_index = 0
845 return (sign * formula_mass(delta,
846 MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
847 else:
848 return sign * delta
850 sign, delta, is_mono = t[:3]
851 return t + (tuple((rmass(r, 0, sign, delta, is_mono),
852 rmass(r, 1, sign, delta, is_mono))
853 for r in range(len(XTP["mass_regimes"]))),)
855 def triple(c):
856 Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
857 Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
858 assert len(Ns) <= 1 and len(Cs) <= 1
859 rest = [ enmass(x) for x in c if x[3] not in '[]' ]
860 rest = tuple(sorted(list(frozenset(rest))))
861 return (Ns, Cs, rest)
863 return sorted(list(frozenset(triple(conjunct)
864 for conjunct
865 in enumerate_disjunction(mod_tree, limit))),
866 key=lambda x: (sum(len(y) for y in x), x))
869 def gen_delta_bag_counts(i, remainder, bag):
870 if i < 1:
871 assert i == 0
872 bag[0] = remainder
873 yield tuple(bag)
874 return
875 for delta in range(1, remainder-i+1):
876 bag[i] = delta
877 for x in gen_delta_bag_counts(i-1, remainder-delta, bag):
878 yield x
880 def generate_delta_bag_counts(mod_count, conjunct_length):
881 """Generate all tuples of positive integers having length conjunct_length
882 and sum mod_count. As a special case, () is such a tuple having length 0
883 and sum 0.
885 >>> for i in range(6): print i, list(generate_delta_bag_counts(4, i))
887 0 []
888 1 [(4,)]
889 2 [(3, 1), (2, 2), (1, 3)]
890 3 [(2, 1, 1), (1, 2, 1), (1, 1, 2)]
891 4 [(1, 1, 1, 1)]
892 5 []
893 >>> list(generate_delta_bag_counts(0, 0))
894 [()]
895 >>> list(generate_delta_bag_counts(0, 1))
899 if conjunct_length == 0:
900 return [()] if mod_count == 0 else []
901 if mod_count < conjunct_length:
902 return []
903 return gen_delta_bag_counts(conjunct_length - 1, mod_count,
904 [0] * conjunct_length)
907 def set_context_conjuncts(context, mass_regime_index, N_cj, C_cj, R_cj):
908 assert len(N_cj) < 1 and len(C_cj) < 1
909 context.N_delta = 0
910 if N_cj:
911 context.N_delta = N_cj[0][5][mass_regime_index][1]
912 context.C_delta = 0
913 if C_cj:
914 context.C_delta = C_cj[0][5][mass_regime_index][1]
915 context.delta_bag_lookup.clear()
916 context.delta_bag_lookup.resize(ord('Z')+1)
917 context.delta_bag_delta.clear()
918 for n, cj in enumerate(R_cj):
919 context.delta_bag_delta.append(cj[5][mass_regime_index][1])
920 for r in cj[3]:
921 context.delta_bag_lookup[ord(r)] \
922 = context.delta_bag_lookup[ord(r)] + (n,)
925 def search_all(options, context, score_statistics):
926 """Search sequence database against searchable spectra."""
928 mod_limit = XTP["potential_mod_limit"]
930 mod_conjunct_triples = get_mod_conjunct_triples(XTP["potential_mods"],
931 mod_limit)
932 info("%s unique potential modification conjuncts",
933 len(mod_conjunct_triples))
934 debug("mod_conjunct_triples (unique):\n%s", pformat(mod_conjunct_triples))
936 mass_regimes = XTP["mass_regimes"]
937 pca_table = get_pca_table(mass_regimes)
938 debug("pca_table: %s", pca_table)
940 total_combinations_searched = 0
942 for mod_count in range(mod_limit + 1):
943 context.mod_count = mod_count
944 for mr_index, (mass_regime, pca_entry) in enumerate(zip(mass_regimes,
945 pca_table)):
946 context.mass_regime_index = mr_index
947 for pca_res, pca_parent_delta, pca_frag_delta in pca_entry:
948 context.pca_residues = pca_res
949 context.pca_delta = pca_frag_delta
950 for cji, (N_cj, C_cj, R_cj) in enumerate(mod_conjunct_triples):
951 if pca_res and N_cj:
952 continue # mutually exclusive, for now
953 set_context_conjuncts(context, mr_index, N_cj, C_cj, R_cj)
954 debug("mod_count: %s", mod_count)
955 debug("cj_triple: N=%s C=%s R=%s", N_cj, C_cj, R_cj)
956 for delta_bag in generate_delta_bag_counts(mod_count,
957 len(R_cj)):
958 debug("delta_bag: %s", delta_bag)
960 # this clear() avoids an SWIG/STL bug!?
961 context.delta_bag_count.clear()
962 context.delta_bag_count[:] = delta_bag
964 pmrf = CP.parent_mass_regime[mr_index].fixed_residue_mass
965 fmrf = CP.fragment_mass_regime[mr_index].fixed_residue_mass
966 p_fx = (pmrf[ord('[')]
967 + (N_cj and N_cj[0][5][mr_index][0] or 0)
968 + pmrf[ord(']')]
969 + (C_cj and C_cj[0][5][mr_index][0] or 0)
970 + pca_parent_delta + PROTON_MASS)
971 context.parent_fixed_mass = p_fx
972 f_N_fx = (fmrf[ord('[')]
973 + (N_cj and N_cj[0][5][mr_index][1] or 0)
974 + pca_frag_delta)
975 context.fragment_N_fixed_mass = f_N_fx
976 f_C_fx = (fmrf[ord(']')]
977 + (C_cj and C_cj[0][5][mr_index][1] or 0)
978 + CP.fragment_mass_regime[mr_index].water_mass)
979 context.fragment_C_fixed_mass = f_C_fx
981 info("MC=%s MR=%s PCA=%s CJ=%s DB=%s"
982 % (mod_count, mr_index, pca_res, cji, delta_bag))
983 debug("p_fx %s f_N_fx %s f_C_fx %s"
984 % (p_fx, f_N_fx, f_C_fx))
986 score_statistics.combinations_searched = 0
987 cgreylag.spectrum.search_runs(context,
988 score_statistics)
989 total_combinations_searched \
990 += score_statistics.combinations_searched
991 info(" %s candidate spectra examined, this bag",
992 score_statistics.combinations_searched)
994 info('%s candidate spectra examined',
995 score_statistics.candidate_spectrum_count)
998 def get_prefix_sequence(begin_pos, run_offset, sequence):
1000 >>> get_prefix_sequence(0, 0, 'abcdefghi')
1002 >>> get_prefix_sequence(1, 0, 'abcdefghi')
1003 '[a'
1004 >>> get_prefix_sequence(3, 0, 'abcdefghi')
1005 '[abc'
1006 >>> get_prefix_sequence(4, 0, 'abcdefghi')
1007 'abcd'
1008 >>> get_prefix_sequence(4, 2, 'abcdefghi')
1009 'cdef'
1012 prefix_start = run_offset + begin_pos - 4
1013 s = sequence[max(0, prefix_start):prefix_start+4]
1014 if len(s) < 4:
1015 s = '[' + s
1016 return s
1018 def get_suffix_sequence(end_pos, run_offset, sequence):
1020 >>> get_suffix_sequence(0, 0, 'abcdefghi')
1021 'abcd'
1022 >>> get_suffix_sequence(4, 0, 'abcdefghi')
1023 'efgh'
1024 >>> get_suffix_sequence(6, 0, 'abcdefghi')
1025 'ghi]'
1026 >>> get_suffix_sequence(8, 0, 'abcdefghi')
1027 'i]'
1028 >>> get_suffix_sequence(4, 2, 'abcdefghi')
1029 'ghi]'
1032 suffix_start = run_offset + end_pos
1033 s = sequence[suffix_start:suffix_start+4]
1034 if len(s) < 4:
1035 s = s + ']'
1036 return s
1039 def results_dump(score_statistics, searchable_spectra):
1040 """Return a result dict mapping spectrum names to (spectrum_metadata,
1041 best_matches) pairs. (Unneeded fields are stripped.)
1044 r = {}
1045 spectrum_metadata_fs = set(['name', 'file_id', 'mass', 'charge',
1046 'total_ion_current', 'comparisons'])
1047 py_s_spectra = pythonize_swig_object(searchable_spectra,
1048 only_fields=spectrum_metadata_fs)
1049 py_matches = pythonize_swig_object(score_statistics.best_matches,
1050 skip_fields=['spectrum_index'])
1051 assert len(py_s_spectra) == len(py_matches)
1053 for sp_metadata, sp_matches in zip(py_s_spectra, py_matches):
1054 assert sp_metadata['name'] not in r, "duplicate spectrum name"
1055 r[sp_metadata['name']] = (sp_metadata, sp_matches)
1057 return r
1060 def main(args=sys.argv[1:]):
1061 parser = optparse.OptionParser(usage=
1062 "usage: %prog [options]"
1063 " <configuration-file> <ms2-file>...",
1064 description=__doc__, version=__version__)
1065 pa = parser.add_option
1066 pa("-P", "--parameter", nargs=2, dest="parameters", action="append",
1067 default=[],
1068 help="override a parameter in <parameter-file>, may be used multiple"
1069 " times", metavar="NAME VALUE")
1070 pa("-w", "--work-slice", nargs=2, type="float", dest="work_slice",
1071 help="search a subinterval [L:U) of the work space"
1072 " (where 0 <= L <= U <= 1) in standalone mode", metavar="L U")
1073 pa("--job-id", dest="job_id", default="unknown",
1074 help="used to generate unique output filenames [default 'unknown']")
1075 pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
1076 pa("-p", "--show-progress", action="store_true", dest="show_progress",
1077 help="show running progress")
1078 pa("--estimate", action="store_true", dest="estimate_only",
1079 help="just estimate the time required for the search")
1080 pa("-v", "--verbose", action="store_true", dest="verbose",
1081 help="be verbose")
1082 pa("--copyright", action="store_true", dest="copyright",
1083 help="print copyright and exit")
1084 pa("--debug", action="store_true", dest="debug",
1085 help="output debugging info")
1086 pa("--profile", action="store_true", dest="profile",
1087 help="dump Python profiling output to './greylag.prof.<pid>'")
1088 (options, args) = parser.parse_args(args=args)
1090 if options.copyright:
1091 print __copyright__
1092 sys.exit(0)
1094 if len(args) < 2:
1095 parser.print_help()
1096 sys.exit(1)
1098 configuration_fn = args[0]
1099 spectrum_fns = args[1:]
1101 if (any(True for f in spectrum_fns if not f.endswith('.ms2'))
1102 or (options.work_slice
1103 and not (0 <= options.work_slice[0]
1104 <= options.work_slice[1] <= 1))):
1105 parser.print_help()
1106 sys.exit(1)
1108 log_level = logging.WARNING
1109 if options.quiet:
1110 log_level = logging.ERROR
1111 if options.verbose:
1112 log_level = logging.INFO
1113 if options.debug:
1114 log_level = logging.DEBUG
1115 logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
1116 format='%(asctime)s %(levelname)s: %(message)s')
1117 info("starting on %s", gethostname())
1119 # check spectrum basename uniqueness, as corresponding sqt files will be
1120 # in a single directory
1121 base_spectrum_fns = [ os.path.basename(fn) for fn in spectrum_fns ]
1122 if len(base_spectrum_fns) != len(set(base_spectrum_fns)):
1123 error("base spectrum filenames must be unique")
1125 # check -P names for validity
1126 bad_names = (set(n for n,v in options.parameters)
1127 - set(PARAMETER_INFO.keys()))
1128 if bad_names:
1129 error("bad -P parameter names %s" % list(bad_names))
1131 # read params
1132 cp = ConfigParser.RawConfigParser()
1133 cp.optionxform = str # be case-sensitive
1134 with open(configuration_fn) as configuration_file:
1135 cp.readfp(configuration_file)
1136 if not cp.has_section('greylag'):
1137 error("%s has no [greylag] section" % configuration_fn)
1138 parameters = dict(cp.items('greylag'))
1139 parameters.update(dict(options.parameters)) # command-line override
1140 global XTP
1141 XTP = validate_parameters(parameters)
1143 fixed_mod_map = dict((r[3], r) for r in XTP["pervasive_mods"])
1144 initialize_spectrum_parameters(options, XTP["mass_regimes"], fixed_mod_map)
1146 # read sequence dbs
1147 databases = XTP["databases"].split()
1148 # [(locusname, defline, seq, filename), ...]
1149 fasta_db = list(read_fasta_files(databases))
1150 # [(idno, offset, locusname, defline, seq, seq_filename), ...]
1151 db = []
1152 for idno, (locusname, defline, sequence, filename) in enumerate(fasta_db):
1153 db.extend(split_sequence_into_aa_runs(idno, locusname, defline,
1154 sequence, filename))
1155 db_residue_count = sum(len(dbi[3]) for dbi in db)
1157 info("read %s sequences (%s runs, %s residues)", len(fasta_db), len(db),
1158 db_residue_count)
1159 max_run_length = max(len(r[3]) for r in db)
1160 info("max run length is %s residues", max_run_length)
1161 if max_run_length > 2**31 - 1:
1162 error("runs longer than %s not yet supported", max_run_length)
1163 if not db:
1164 error("no database sequences")
1166 # read spectrum offset indices
1167 spectrum_offset_indices = []
1168 for spfn in spectrum_fns:
1169 idxfn = spfn + '.idx'
1170 with contextlib.closing(gzip.open(idxfn)) as idxf:
1171 idx = cPickle.load(idxf)
1172 # try to verify that index matches spectrum file
1173 if idx['file size'] != os.path.getsize(spfn):
1174 error("index '%s' needs rebuilding" % idxfn)
1175 spectrum_offset_indices.append(idx['offsets'])
1177 # FIX: assume standalone mode (for now)
1178 assert options.work_slice
1180 # read spectra per work slice
1181 spectra = read_spectra_slice(spectrum_fns, spectrum_offset_indices,
1182 options.work_slice)
1183 del spectrum_offset_indices
1184 spectra.sort(key=lambda x: x.mass)
1186 def peak_statistics(spectra):
1187 counts = [ len(sp.peaks) for sp in spectra ]
1188 counts.sort()
1189 n = len(counts)
1190 return (counts[0], counts[int(n*0.25)], counts[int(n*0.5)],
1191 counts[int(n*0.75)], counts[-1], sum(counts) / float(n))
1193 if spectra:
1194 info("read %s spectra (mass range %s - %s)", len(spectra),
1195 spectra[0].mass, spectra[-1].mass)
1196 info(" peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1197 % peak_statistics(spectra))
1198 else:
1199 warning("no input spectra")
1201 # filter and normalize spectra
1202 for sp in spectra:
1203 sp.filter_peaks(XTP["TIC_cutoff_proportion"],
1204 CP.parent_mass_tolerance_max)
1205 sp.classify(XTP["intensity_class_count"], XTP["intensity_class_ratio"],
1206 XTP["fragment_mass_tolerance"])
1208 min_psm = XTP["min_parent_spectrum_mass"]
1209 max_psm = XTP["max_parent_spectrum_mass"]
1210 # FIX: also filter by 1 + 2 + 4 rule?
1211 spectra = [ sp for sp in spectra
1212 if len(sp.peaks) >= 10 and min_psm <= sp.mass <= max_psm ]
1214 if spectra:
1215 info("after filtering:")
1216 info(" %s spectra (mass range %s - %s)", len(spectra),
1217 spectra[0].mass, spectra[-1].mass)
1218 info(" peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1219 % peak_statistics(spectra))
1221 cgreylag.spectrum.set_searchable_spectra(spectra)
1222 score_statistics = cgreylag.score_stats(len(spectra),
1223 XTP["best_result_count"])
1225 if spectra:
1226 del spectra # release memory
1228 # FIX!!!
1229 # (cleavage_re, position of cleavage in cleavage_re)
1230 cleavage_motif = "[X]|[X]"
1231 cleavage_pattern, cleavage_pos = cleavage_motif_re(cleavage_motif)
1232 if cleavage_pos == None:
1233 error("cleavage site '%s' is missing '|'",
1234 XTP["protein, cleavage site"])
1235 cleavage_pattern = re.compile(cleavage_pattern)
1237 context = cgreylag.search_context()
1238 for idno, offset, locusname, defline, seq, seq_filename in db:
1239 cp = []
1240 if cleavage_motif != "[X]|[X]":
1241 cp = list(generate_cleavage_points(cleavage_pattern,
1242 cleavage_pos, seq))
1243 sr = cgreylag.sequence_run(idno, offset, seq, cp, locusname)
1244 context.sequence_runs.append(sr)
1245 context.maximum_missed_cleavage_sites = 1000000000 # FIX
1247 info("searching")
1248 search_all(options, context, score_statistics)
1249 else:
1250 warning("no spectra after filtering--search skipped")
1252 if options.estimate_only:
1253 print ("%.2f generic CPU hours"
1254 % (score_statistics.candidate_spectrum_count / 300.0e6))
1255 return
1257 info("writing result file")
1258 # FIX: this is only for standalone mode
1259 result_fn = 'grind_%s_%s-%s.gwr' % (options.job_id, options.work_slice[0],
1260 options.work_slice[1])
1261 with contextlib.closing(gzip.open(result_fn, 'w')) as result_file:
1262 d = { 'version' : __version__,
1263 'matches' : results_dump(score_statistics,
1264 cgreylag.cvar.spectrum_searchable_spectra),
1265 'total comparisons' : score_statistics.candidate_spectrum_count,
1266 'spectrum files' : base_spectrum_fns,
1267 'databases' : databases,
1268 'parameters' : XTP,
1269 'mass regime atomic masses' : MASS_REGIME_ATOMIC_MASSES,
1270 'proton mass' : PROTON_MASS,
1271 'argv' : sys.argv }
1272 cPickle.dump(d, result_file, cPickle.HIGHEST_PROTOCOL)
1273 info("finished, result file written to '%s'", result_fn)
1276 if __name__ == '__main__':
1277 # try:
1278 # import psyco
1279 # psyco.full()
1280 # ###psyco.bind(formula_mass)
1281 # warning('psyco enabled')
1282 # except ImportError:
1283 # pass
1285 try:
1286 if '--profile' in sys.argv:
1287 import cProfile
1288 import pstats
1289 report_fn = "greylag.prof.%s" % os.getpid()
1290 data_fn = report_fn + ".tmp"
1291 prof = cProfile.run('main()', data_fn)
1292 with open(report_fn, 'w') as report_f:
1293 try:
1294 stats = pstats.Stats(data_fn, stream=report_f)
1295 stats.strip_dirs()
1296 stats.sort_stats('cumulative')
1297 stats.print_stats(50)
1298 stats.sort_stats('time')
1299 stats.print_stats(50)
1300 print "# profile report written to '%s'" % report_fn
1301 finally:
1302 try:
1303 os.remove(data_fn)
1304 except:
1305 pass
1306 else:
1307 main()
1308 except SystemExit:
1309 raise
1310 except:
1311 logging.exception("unhandled exception")
1312 sys.exit(1)
1313 finally:
1314 logging.shutdown()
1317 # FIXES:
1318 # - need to rigorously check for bad input in any file (fasta, spectrum, xml)