# Check sequence db locus name uniqueness
# [greylag.git] / greylag-grind.py
# blob c89f5ac279dfd3458146065acc176d15219aaf08
1 #!/usr/bin/env python
3 '''This program does the actual work to search mass spectra against a sequence
4 database. <job-id> is a unique identifier used as a prefix for work and
5 output directories. The <parameter-file> contains program options. The
6 spectra are in <ms2-file>s; greylag-index-spectra must already have been run
7 on them.
9 This program can operate in standalone mode (useful for testing) or command
10 mode. If the --work-slice option is given, that slice will be searched in
11 standalone mode, then the program will exit. Otherwise, the program will
12 process commands until an exit command is received.
14 '''
16 from __future__ import with_statement
18 __copyright__ = '''
19 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2 of the License, or
24 (at your option) any later version.
26 This program is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 GNU General Public License for more details.
31 You should have received a copy of the GNU General Public License along
32 with this program; if not, write to the Free Software Foundation, Inc.,
33 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
34 '''
36 ##############################################################################
37 ## "Simplicity is prerequisite for reliability" - Edsger W. Dijkstra ##
38 ##############################################################################
41 __version__ = "0.0"
44 import ConfigParser
45 from collections import defaultdict
46 import contextlib
47 import cPickle
48 import fileinput
49 import gzip
50 import itertools
51 import logging
52 from logging import debug, info, warning
53 import math #??
54 import optparse
55 import os
56 from pprint import pprint, pformat
57 import re
58 from socket import gethostname
59 import sys
61 import cgreylag
# Try to drop dead immediately on SIGINT (control-C), instead of normal Python
# KeyboardInterrupt processing, since we may spend long periods of time in
# uninterruptible C++ calls.  Also die immediately on SIGPIPE.
try:
    import signal
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
except:
    # best-effort: the signal module or these signals (e.g., SIGPIPE on
    # non-POSIX platforms) may be unavailable, which is acceptable here
    pass
def error(s, *args):
    """Report a fatal error and terminate the program.

    When this module is imported (e.g., under a test harness) an Exception is
    raised instead of exiting, so the harness survives.
    """
    logging.error(s, *args)
    if __name__ == "__main__":
        sys.exit(1)
    raise Exception("(fatal error, but unit testing, so not exiting)")
def fileerror(s, *args):
    """Signal a fatal error, annotated with the current fileinput position."""
    where = (", at line %s of file '%s'"
             % (fileinput.filelineno(), fileinput.filename()))
    error(s + where, *args)
# name -> value map of processed XML input parameters
XTP = {}

# handle to the singleton parameter object shared with the C++ module
CP = cgreylag.cvar.parameters_the

# FIX: is this mono or avg?  (possible error here is ~0.0007 amu)
PROTON_MASS = 1.007276                  # amu
ELECTRON_MASS = 0.000549                # amu ?
# Reference values from NIST (http://physics.nist.gov/PhysRefData/)
MONOISOTOPIC_ATOMIC_MASS = {
    'H' : 1.00782503214,
    'C' : 12.00000000,
    'N' : 14.00307400529,
    'O' : 15.994914622115,
    'P' : 30.9737615120,
    'S' : 31.9720706912,
    }

# most prevalent only (1 in 1000)
ISOTOPIC_ATOMIC_MASS = {                # prevalence (in %)
    'C13' : 13.003354837810,            # 1.078
    'N15' : 15.00010889849,             # 0.3687
    'O18' : 17.99916049,                # 0.20514
    'S33' : 32.9714585012,              # 0.762
    'S34' : 33.9678668311,              # 4.2928
    }

AVERAGE_ATOMIC_MASS = {
    'H' : 1.007947,
    'C' : 12.01078,
    'N' : 14.00672,
    'O' : 15.99943,
    'P' : 30.9737612,
    'S' : 32.0655,
    }

# The xtandem average residue masses are about 0.002 amu higher than those
# calculated directly from the above average atomic masses.  None of the
# chemists consulted knew of any reason why, aside from lack of precision in
# the average atomic mass estimates.  This shouldn't matter very much, as
# fragmentation calculations should all be monoisotopic, and we can always
# widen the parent tolerance window a bit.


def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
    """Return the mass of formula, using the given mass regime (monoisotopic
    by default).

    >>> formula_mass('H2O', { 'H':1, 'O':16 })
    18
    >>> # monoisotopic mass of glycine
    >>> str(round(formula_mass('C2H3ON'), 4))
    '57.0215'

    """
    # split 'C2H3ON' into ['C', '2', 'H', '3', 'O', '1', 'N', '1'],
    # defaulting an omitted count to '1'
    parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
    return sum(atomic_mass[parts[i]] * int(parts[i+1])
               for i in range(0, len(parts), 2))
# FIX: selenocysteine (U), etc
# residue -> chemical formula of the residue (amino acid minus water)
RESIDUE_FORMULA = {
    'A' : "C3H5ON",
    'C' : "C3H5ONS",
    'D' : "C4H5O3N",
    'E' : "C5H7O3N",
    'F' : "C9H9ON",
    'G' : "C2H3ON",
    'H' : "C6H7ON3",
    'I' : "C6H11ON",
    'K' : "C6H12ON2",
    'L' : "C6H11ON",
    'M' : "C5H9ONS",
    'N' : "C4H6O2N2",
    'P' : "C5H7ON",
    'Q' : "C5H8O2N2",
    'R' : "C6H12ON4",
    'S' : "C3H5O2N",
    'T' : "C4H7O2N",
    'V' : "C5H9ON",
    'W' : "C11H10ON2",
    'Y' : "C9H9O2N",
    }

# the 20 standard residues, in sorted order
RESIDUES = sorted(RESIDUE_FORMULA.keys())
# residues plus the peptide N- and C-terminus pseudo-residues '[' and ']'
RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
# Per-regime atom-name -> mass maps, filled in by
# initialize_spectrum_parameters; accessed as a global variable (FIX?)
# e.g., [0][1] -> 'H' -> fragment mass of H for regime 0
MASS_REGIME_ATOMIC_MASSES = []
def mass_regime_atomic_masses(spec):
    """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
    names to masses.
    """
    name, isotopes = spec
    assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
    if name == 'MONO':
        r = MONOISOTOPIC_ATOMIC_MASS.copy()
    else:
        r = AVERAGE_ATOMIC_MASS.copy()
    if isotopes:
        iname, prevalence = isotopes[0]
        assert iname == 'N15' and 0 <= prevalence <= 1
        # this is a simplification, but additional accuracy pointless?
        if name == 'MONO':
            # for the monoisotopic regime, all N is simply taken to be N15
            # (prevalence is ignored)
            r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
        else:
            # shift N's average mass toward N15 in proportion to prevalence
            r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
    return r
def initialize_spectrum_parameters(options, mass_regimes, fixed_mod_map):
    """Initialize parameters known to the spectrum module.
    fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
    """
    debug('fixed_mod_map: %s', fixed_mod_map)
    # This is the size of vectors that are indexed by residues (A-Z) or
    # special characters ('[]').
    RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1

    # These are currently monoisotopic.  (deuterium pointless?)
    CP.proton_mass = PROTON_MASS
    CP.hydrogen_mass = formula_mass("H")

    global MASS_REGIME_ATOMIC_MASSES
    for regime_pair in mass_regimes:
        assert len(regime_pair) == 2    # parent and fragment
        info('mass regime: %s', regime_pair)
        MASS_REGIME_ATOMIC_MASSES.append([])
        for n, regime in enumerate(regime_pair):
            atmass = mass_regime_atomic_masses(regime)
            MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
            creg = cgreylag.mass_regime_parameters()

            creg.hydroxyl_mass = formula_mass("OH", atmass)
            creg.water_mass = formula_mass("H2O", atmass)
            creg.ammonia_mass = formula_mass("NH3", atmass)

            creg.fixed_residue_mass.resize(RESIDUE_LIMIT)

            for r in RESIDUES_W_BRACKETS:
                m = 0
                if r in RESIDUES:
                    m = formula_mass(RESIDUE_FORMULA[r], atmass)
                rmod = fixed_mod_map.get(r)
                if rmod:
                    if isinstance(rmod[1], str):
                        if rmod[2]:
                            # '!' mods are always monoisotopic
                            m += rmod[0] * formula_mass(rmod[1])
                        else:
                            m += rmod[0] * formula_mass(rmod[1], atmass)
                    else:
                        # numeric (already-massed) modification delta
                        m += rmod[0] * rmod[1]
                creg.fixed_residue_mass[ord(r)] = m
            # assuming these are monoisotopic (not regime)
            creg.fixed_residue_mass[ord('[')] += formula_mass("H")
            creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
            if not n:
                # n == 0 is the parent regime of the pair
                CP.parent_mass_regime.append(creg)
            else:
                # fragment regime: terminal H/OH are accounted elsewhere
                creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
                creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
                CP.fragment_mass_regime.append(creg)
    for r in RESIDUES_W_BRACKETS:
        info('fixed_mass %s: %s', r,
             [ "%.6f/%.6f"
               % (CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
                  CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
               for rn in range(len(mass_regimes)) ])
    for r in RESIDUES:
        for rn in range(len(mass_regimes)):
            # physically impossible (and results would be garbage)
            if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
                raise ValueError('bogus parent mass specification for %s' % r)
            # FIXED: this message previously said 'parent' though it checks
            # the fragment regime
            if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
                raise ValueError('bogus fragment mass specification for %s'
                                 % r)

    CP.parent_mass_tolerance_1 = XTP["parent_mz_tolerance"]
    CP.parent_mass_tolerance_max = (XTP["parent_mz_tolerance"]
                                    * XTP["charge_limit"])

    CP.fragment_mass_tolerance = XTP["fragment_mass_tolerance"]
    CP.intensity_class_count = XTP["intensity_class_count"]

    CP.minimum_peptide_length = XTP["min_peptide_length"]

    # CP.ln_factorial[n] == ln(n!)
    CP.ln_factorial.resize(int(XTP["max_parent_spectrum_mass"]
                               / XTP["fragment_mass_tolerance"] + 100), 0.0)
    for n in range(2, len(CP.ln_factorial)):
        CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)

    CP.estimate_only = bool(options.estimate_only)
    CP.show_progress = bool(options.show_progress)
def cleavage_motif_re(motif):
    """Return (regexp, pos), where regexp is a regular expression that will
    match a cleavage motif, and pos is the position of the cleavage with
    respect to the match (or None).  (The RE actually matches one character,
    the rest matching as lookahead, so that re.finditer will find all
    overlapping matches.)

    >>> cleavage_motif_re('[KR]|{P}')
    ('[KR](?=[^P])', 1)
    >>> cleavage_motif_re('[X]|[X]')
    ('.(?=.)', 1)

    """
    cleavage_pos = None
    re_parts = []
    # the motif grammar: '|' (cleavage point), '[...]' (residue class, with
    # 'X' meaning any residue), or '{...}' (negated residue class)
    motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
    parts = [ p[0] for p in motif_re.findall(motif) ]
    if ''.join(parts) != motif:
        raise ValueError('invalid cleavage motif pattern')
    i = 0
    for part in parts:
        if part == '|':
            if cleavage_pos is not None:
                raise ValueError("invalid cleavage motif pattern"
                                 " (multiple '|'s)")
            cleavage_pos = i
            continue
        if part == '[X]':
            re_parts.append('.')
        elif part[0] == '[':
            re_parts.append('[%s]' % part[1:-1])
        elif part[0] == '{':
            re_parts.append('[^%s]' % part[1:-1])
        else:
            assert False, "unknown cleavage motif syntax"
        i += 1
    # match the first position, with the remainder as lookahead, so that
    # overlapping motif occurrences are all found
    if len(re_parts) == 0:
        re_pattern = '.'
    elif len(re_parts) == 1:
        re_pattern = re_parts[0]
    else:
        re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
    return (re_pattern, cleavage_pos)
def generate_cleavage_points(cleavage_re, cleavage_pos, sequence):
    """Yields the offsets of the cleavages in sequence.  The endpoints are
    always included, by convention.

    >>> list(generate_cleavage_points(re.compile('[KR](?=[^P])'), 1,
    ...                               'ARBCKDEKPF'))
    [0, 2, 5, 10]

    """
    yield 0
    for m in cleavage_re.finditer(sequence):
        p = m.start() + cleavage_pos
        # skip cleavages that coincide with the (already-yielded) endpoints
        if 0 < p < len(sequence):
            yield p
    yield len(sequence)
# a maximal run of standard amino acid residues
AA_SEQUENCE = re.compile(r'[ARNDCQEGHILKMFPSTWYV]+')

def split_sequence_into_aa_runs(idno, defline, sequence, filename):
    """Returns a list of tuples (idno, start, defline, seq, filename), one for
    each contiguous run of residues in sequence, where 'start' is the position
    of 'seq' in 'sequence'.

    >>> pprint(split_sequence_into_aa_runs(123, 'defline', 'STSS*DEFABA',
    ...                                    'filename'))
    [(123, 0, 'defline', 'STSS', 'filename'),
     (123, 5, 'defline', 'DEFA', 'filename'),
     (123, 10, 'defline', 'A', 'filename')]

    """
    # (previously used enumerate with an unused index)
    return [ (idno, m.start(), defline, m.group(), filename)
             for m in AA_SEQUENCE.finditer(sequence) ]
def read_fasta_files(filenames):
    """Yield (defline, sequence, filename) tuples as read from FASTA files
    (uppercasing sequence).  A record with an empty defline (a bare '>') is
    still yielded, with defline ''.
    """
    defline = None
    seqs = []
    for line in fileinput.input(filenames):
        line = line.strip()
        if line[:1] == '>':
            if defline is not None:
                yield (defline, ''.join(seqs), fileinput.filename())
            elif seqs:
                fileerror("bad format: line precedes initial defline")
            defline = line[1:]
            seqs = []
        else:
            seqs.append(line.upper())
    # FIXED: this was 'if defline:', which silently dropped the final record
    # whenever its defline was the empty string
    if defline is not None:
        yield (defline, ''.join(seqs), fileinput.filename())
def read_spectra_slice(spectrum_fns, offset_indices, slice):
    """Return a list of spectra read from the given ms2 files, limited to the
    fraction of the total spectra selected by slice, a pair (lower, upper)
    with 0 <= lower <= upper <= 1.  offset_indices gives, for each file, the
    byte offset of each of its spectra.
    """
    s_l, s_u = slice
    assert 0 <= s_l <= s_u <= 1

    # convert the fractional slice bounds into global spectrum counts
    total_spectra = sum(len(oi) for oi in offset_indices)
    sp_l = int(float(s_l) * total_spectra)
    sp_u = int(float(s_u) * total_spectra)

    seeing_l = 0                        # spectra seen prior to current file
    spectra = []

    for f_no, (fn, oi) in enumerate(zip(spectrum_fns, offset_indices)):
        seeing_u = seeing_l + len(oi)
        # intersect the requested range [sp_l, sp_u) with this file's range
        f_l = max(sp_l, seeing_l)
        f_u = min(sp_u, seeing_u)

        if f_l < f_u:
            # translate spectrum counts to byte offsets within this file;
            # -1 means "to end of file"
            b_l = oi[f_l - seeing_l]
            b_u = oi[f_u - seeing_l] if f_u - seeing_l < len(oi) else -1 # oo

            with open(fn) as f:
                fsp = cgreylag.spectrum.read_spectra_from_ms2(f, f_no,
                                                              b_l, b_u)
                spectra.extend(fsp)
        seeing_l = seeing_u

    return spectra
421 # XML parameter file processing
423 # FIX: This parsing is way too complex. How to simplify?
def mass_regime_part(part_specification):
    """Parse a single mass regime specification part.

    >>> mass_regime_part('MONO')
    ('MONO', [])
    >>> mass_regime_part('AVG')
    ('AVG', [])
    >>> mass_regime_part('MONO(N15@87.5%)')
    ('MONO', [('N15', 0.875)])

    """
    ps = [ x.strip() for x in part_specification.partition('(') ]
    if ps[0] not in ('MONO', 'AVG'):
        raise ValueError("invalid mass regime list specification"
                         " (regime id must be 'MONO' or 'AVG')")
    if not ps[1]:
        # no '(' -- a bare regime id with no isotope spec
        return (ps[0], [])
    if ps[2][-1] != ')':
        raise ValueError("invalid mass regime list specification"
                         " (expected ')')")
    pps = [ x.strip() for x in ps[2][:-1].split(',') ]
    if len(pps) > 1:
        raise ValueError("invalid mass regime list specification"
                         " (multiple isotopes not yet implemented)")
    ppps = [ x.strip() for x in pps[0].partition('@') ]
    if not ppps[1]:
        raise ValueError("invalid mass regime list specification"
                         " (expected '@')")
    if ppps[0] not in ('N15',):
        raise ValueError("invalid mass regime list specification"
                         " (isotope id must currently be 'N15')")
    if ppps[2][-1] != '%':
        raise ValueError("invalid mass regime list specification"
                         " (expected '%')")
    prevalence = float(ppps[2][:-1]) / 100
    if not (0 <= prevalence <= 1):
        raise ValueError("invalid mass regime list specification"
                         " (prevalence must be in range 0-100%)")
    return (ps[0], [(ppps[0], prevalence)])
def mass_regime_list(mass_regime_list_specification):
    """Check and return a list of regime tuples (parent_regime,
    fragment_regime), where each regime is a tuple (id, [(isotope_id,
    prevalence), ...]).  Multiple isotopes (when implemented) would be
    comma-separated.

    >>> pprint(mass_regime_list('AVG/MONO;MONO;MONO(N15@75%)'))
    [[('AVG', []), ('MONO', [])],
     [('MONO', []), ('MONO', [])],
     [('MONO', [('N15', 0.75)]), ('MONO', [('N15', 0.75)])]]

    """
    result = []
    for regspec in mass_regime_list_specification.split(';'):
        halves = [ x.strip() for x in regspec.split('/') ]
        if len(halves) > 2:
            raise ValueError("invalid mass regime list specification"
                             " (too many '/'?)")
        pr = [ mass_regime_part(h) for h in halves ]
        if len(pr) == 1:
            # a single regime serves as both parent and fragment regime
            pr = [ pr[0], pr[0] ]
        result.append(pr)
    debug("mass regime list:\n%s", pformat(result))

    # The first fragmentation regime should generally be MONO, so that
    # formulaic deltas with '!' do the expected thing.
    if result[0][1] != ('MONO', []):
        raise ValueError("first fragmentation regime was something other than"
                         " 'MONO' with no isotopes--this is almost certainly"
                         " not what was intended")
    return result
def parse_mod_term(s, is_potential=False):
    """Parse a modification term, returning a tuple (sign, mod, fixed_regime,
    residues, description).

    >>> parse_mod_term('-C2H3ON!@C')
    (-1, 'C2H3ON', True, 'C', None)
    >>> parse_mod_term('42@STY phosphorylation', is_potential=True)
    (1, 42.0, False, 'STY', 'phosphorylation')

    """
    # groups: 0 sign, 1 delta, 2 numeric delta, 3 formulaic delta, 4 '!',
    # 5 residues, 6 (spaced) description, 7 description
    m = re.match(r'^\s*(-|\+)?(([1-9][0-9.]*)|([A-Z][A-Z0-9]*))(!?)'
                 r'@([A-Z]+|\[|\])(\s+([A-Za-z0-9_]+))?\s*$', s)
    if not m:
        raise ValueError("invalid modification term specification"
                         " '%s'" % s)
    mg = m.groups()
    invalid_residues = set(mg[5]) - set(RESIDUES_W_BRACKETS)
    if invalid_residues:
        raise ValueError("invalid modification list specification"
                         " (invalid residues %s)" % list(invalid_residues))
    delta = mg[1]
    if mg[2]:
        # numeric delta: convert, and reject (potential) deltas too small to
        # be meaningful
        delta = float(mg[1])
        if is_potential and abs(delta) < 0.0001:
            raise ValueError("invalid modification list specification"
                             " (delta '%s' is too small)" % delta)
    residues = mg[5]
    if not is_potential and len(residues) != 1:
        raise ValueError("invalid modification list specification '%s' (only"
                         " potential modifications may have multiple residues)"
                         % residues)
    if len(residues) != len(set(residues)):
        raise ValueError("invalid modification list specification"
                         " '%s' (duplicate residues prohibited)"
                         % residues)
    return (mg[0] == '-' and -1 or 1, delta, mg[4] == '!', residues, mg[7])
def fixed_mod_list(specification):
    """Check and return a list of modification tuples.

    >>> fixed_mod_list('57@C')
    [(1, 57.0, False, 'C', None)]
    >>> fixed_mod_list('57@C,CH!@N desc')
    [(1, 57.0, False, 'C', None), (1, 'CH', True, 'N', 'desc')]
    >>> fixed_mod_list('57@C,58@C')
    Traceback (most recent call last):
        ...
    ValueError: invalid modification list specification '['C', 'C']' (duplicate residues prohibited)

    """
    if not specification:
        return []
    result = [ parse_mod_term(s) for s in specification.split(',') ]
    residues = [ x[3] for x in result ]
    # this check is across terms; the one above is within terms
    if len(residues) != len(set(residues)):
        raise ValueError("invalid modification list specification"
                         " '%s' (duplicate residues prohibited)"
                         % residues)
    debug("fixed_mod_list:\n%s", pformat(result))
    return result
def parse_mod_basic_expression(s):
    """Parse one basic expression of a potential-mod specification: either a
    parenthesized disjunction or a single modification term.  Return
    (tree, remaining_input)."""
    s = s.strip()
    if s.startswith('('):
        tree, rest = parse_mod_disjunction(s[1:])
        rest = rest.lstrip()
        if not rest.startswith(')'):
            raise ValueError("invalid modification list specification"
                             " (expected matching ')')")
        return tree, rest[1:]
    # otherwise, everything up to the next delimiter is one term
    pieces = re.split(r'([;,()])', s, 1)
    if len(pieces) == 1:
        term, remainder = s, ''
    else:
        assert len(pieces) == 3
        term, remainder = pieces[0], pieces[1] + pieces[2]
    return parse_mod_term(term, is_potential=True), remainder
def parse_mod_conjunction(s):
    """Parse a comma-separated series of basic expressions from s, returning
    (list of trees, remaining input)."""
    trees = []
    while s:
        tree, s = parse_mod_basic_expression(s)
        trees.append(tree)
        s = s.lstrip()
        if not s.startswith(','):
            break
        s = s[1:]
    return trees, s
def parse_mod_disjunction(s):
    """Parse a semicolon-separated series of conjunctions from s, returning
    (list of conjunction trees, remaining input)."""
    alternatives = []
    while s:
        tree, s = parse_mod_conjunction(s)
        alternatives.append(tree)
        s = s.lstrip()
        if not s.startswith(';'):
            break
        s = s[1:]
    return alternatives, s
def potential_mod_list(specification):
    """Check and return a tree of potential modification tuples.  Nodes at
    even (odd) levels are disjunctions (conjunctions).  (The top list is a
    disjunction.)

    >>> potential_mod_list('')
    []
    >>> potential_mod_list('PO3H@STY; C2H2O@KST')
    [[(1, 'PO3H', False, 'STY', None)], [(1, 'C2H2O', False, 'KST', None)]]
    >>> potential_mod_list('PO3H@STY, C2H2O@KST')
    [[(1, 'PO3H', False, 'STY', None), (1, 'C2H2O', False, 'KST', None)]]
    >>> pprint(potential_mod_list('''(PO3H@STY phosphorylation;
    ...                               C2H2O@KST acetylation;
    ...                               CH2@AKST methylation),
    ...                              O@M oxidation'''))
    [[[[(1, 'PO3H', False, 'STY', 'phosphorylation')],
       [(1, 'C2H2O', False, 'KST', 'acetylation')],
       [(1, 'CH2', False, 'AKST', 'methylation')]],
      (1, 'O', False, 'M', 'oxidation')]]

    """
    if not specification:
        return []
    tree, remainder = parse_mod_disjunction(specification)
    if remainder:
        raise ValueError("invalid modification list specification"
                         " (unexpected '%s')" % remainder)
    debug("potential_mod_list:\n%s", pformat(tree))
    return tree
def p_positive(x):
    """True iff x is strictly positive."""
    return x > 0

def p_negative(x):
    """True iff x is strictly negative."""
    return x < 0

def p_nonnegative(x):
    """True iff x is zero or positive."""
    return x >= 0

def p_proportion(x):
    """True iff x is a valid proportion (between 0 and 1, inclusive)."""
    return 0 <= x <= 1
# Configuration file parameter specification
#
# name -> (type, default, check_fn)
# type may be bool, int, float, str, modlist, or a tuple of values
# default, as a string value, or None if value must be explicitly specified
# check_fn is an optional function that returns True iff the value is valid

PARAMETER_INFO = {
    "databases" : (str, None),
    "decoy_locus_prefix" : (str, "SHUFFLED_"),
    "mass_regimes" : (mass_regime_list, "MONO"),
    "pervasive_mods" : (fixed_mod_list, ""),
    "potential_mods" : (potential_mod_list, ""),
    "potential_mod_limit" : (int, 2, p_nonnegative),
    "charge_limit" : (int, 3, p_positive),
    "min_peptide_length" : (int, 5, p_positive), # needed?
    "min_parent_spectrum_mass" : (float, 0, p_nonnegative),
    "max_parent_spectrum_mass" : (float, 10000, p_nonnegative),
    "TIC_cutoff_proportion" : (float, 0.98, p_proportion),
    "parent_mz_tolerance" : (float, 1.25, p_nonnegative),
    "fragment_mass_tolerance" : (float, 0.5, p_nonnegative),
    "intensity_class_count" : (int, 3, p_positive),
    "intensity_class_ratio" : (float, 2.0, p_positive), # really > 1.0?
    "best_result_count" : (int, 5, p_positive),
    }
665 def validate_parameters(parameters, parameter_info=PARAMETER_INFO):
666 """Verify that parameters are valid, have valid values, and correspond to
667 currently implemented functionality. Values are converted, default values
668 are filled in, and the resulting name/value dict returned.
670 >>> sorted(validate_parameters({'spectrum, total peaks' : '40'},
671 ... {'spectrum, total peaks'
672 ... : (int, '50', p_positive),
673 ... 'output, spectra'
674 ... : (bool, 'no')}).items())
675 [('output, spectra', False), ('spectrum, total peaks', 40)]
679 pmap = {}
680 for p_name, p_info in sorted(parameter_info.items()):
681 type_ = p_info[0]
682 default = p_info[1]
683 check_fn = len(p_info) > 2 and p_info[2] or None
685 v = parameters.get(p_name)
686 if v == None:
687 if default != None:
688 debug("parameter '%s' defaulting to '%s'", p_name, default)
689 v = default
690 else:
691 error("missing required parameter '%s'" % p_name)
692 if isinstance(type_, tuple):
693 if not v in type_:
694 error("parameter '%s' value '%s' not in %s (feature not"
695 " implemented?)" % (p_name, v, type_))
696 elif type_ == bool:
697 v = { 'yes' : True, 'no' : False }.get(v)
698 if v == None:
699 error("parameter '%s' requires a value of 'yes' or 'no'")
700 else:
701 try:
702 v = type_(v)
703 except ValueError, e:
704 error("parameter '%s' has value '%s' with invalid format [%s]"
705 % (p_name, v, e))
706 if check_fn and not check_fn(v):
707 error("parameter '%s' has invalid value '%s' (or feature not"
708 " implemented)" % (p_name, v))
709 pmap[p_name] = v
711 unknown_parameters = set(parameters) - set(parameter_info)
712 if unknown_parameters:
713 warning("%s unknown parameter(s):\n %s"
714 % (len(unknown_parameters),
715 pformat(sorted(list(unknown_parameters)))))
716 return pmap
def generate_mass_bands(band_count, mass_list):
    """Yield (n, mass_lb, mass_ub) for each mass band, where n ranges from 1
    to band_count.  To generate the bands, the mass list (which is assumed
    already sorted by mass) is evenly partitioned into bands with masses in
    the range [mass_lb, mass_ub).

    >>> list(generate_mass_bands(1, [ float(x) for x in range(100) ]))
    [(1, 0.0, 100.0)]
    >>> list(generate_mass_bands(4, [ float(x) for x in range(100) ]))
    [(1, 0.0, 25.0), (2, 25.0, 50.0), (3, 50.0, 75.0), (4, 75.0, 100.0)]
    >>> list(generate_mass_bands(3, [ float(x) for x in range(100) ]))
    [(1, 0.0, 34.0), (2, 34.0, 68.0), (3, 68.0, 100.0)]

    """
    assert band_count > 0 and mass_list
    assert sorted(mass_list) == list(mass_list)
    band_size = int(math.ceil(float(len(mass_list)) / band_count))
    lb = mass_list[0]
    for bn in range(1, band_count):
        i = min(bn*band_size, len(mass_list)-1)
        ub = mass_list[i]
        yield bn, lb, ub
        lb = ub
    # Since the upper bound is exclusive, we add a little slop to make sure
    # the last spectrum is included.
    yield band_count, lb, round(mass_list[-1]+1)
def pythonize_swig_object(o, only_fields=None, skip_fields=()):
    """Creates a pure Python copy of a SWIG object, so that it can be easily
    pickled, or printed (for debugging purposes).  Each SWIG object is
    pythonized as a dictionary, for convenience.  If provided, 'only_fields',
    limits the copy to the list of fields specified.  Otherwise,
    'skip_fields' if given is a list of methods not to include (this helps
    avoid infinite recursion).  Non-SWIG, non-sequence values are returned
    unchanged.

    >>> pprint(pythonize_swig_object(cgreylag.score_stats(3)))
    struct({'best_match': [[], [], []], 'best_score': [100.0, 100.0, 100.0], 'candidate_spectrum_count': 0, 'combinations_searched': 0, 'hyperscore_histogram': [[], [], []], 'improved_candidates': 0, 'second_best_score': [100.0, 100.0, 100.0], 'spectra_with_candidates': 0})

    """
    # FIXED: skip_fields previously defaulted to a mutable [] (shared across
    # calls); an immutable () is equivalent here since it is only read
    if isinstance(o, str):
        return o
    try:
        len(o)
    except TypeError:
        pass
    else:
        # a sequence: pythonize each element
        return list(pythonize_swig_object(x, only_fields, skip_fields)
                    for x in o)
    if hasattr(o, '__swig_getmethods__'):
        s = {}
        for a in o.__swig_getmethods__:
            if (only_fields != None and a in only_fields
                or only_fields == None and a not in skip_fields):
                s[a] = pythonize_swig_object(getattr(o, a), only_fields,
                                             skip_fields)
        return s
    return o
# A possible PCA (pyrrolidone carboxyl acid) modification accounts for
# circularization of the peptide N-terminal.  PCA mods are excluded if a
# static N-terminal mod has been specified.  Likewise, choosing a PCA mod will
# exclude choosing a potential N-terminal mod.  (The PCA mod for 'C' is
# dependent on a static mod of C+57 being in effect.)

def get_pca_table(mass_regimes):
    """Return a list of tuples (residues, parent delta, fragment delta)
    describing the PCA possibilities for each mass regime.  Residues is a
    string specifying the residues this PCA delta applies to.
    """
    # FIX: According to Xtandem, C is only a candidate for PCA if
    # carboxyamidomethylated (C+57).  Currently we always search it.
    # Entries: no PCA; E loses water; Q and C lose ammonia.
    return [ [('', 0, 0),
              ('E', -1 * CP.parent_mass_regime[r].water_mass,
               -1 * CP.fragment_mass_regime[r].water_mass),
              ('QC', -1 * CP.parent_mass_regime[r].ammonia_mass,
               -1 * CP.fragment_mass_regime[r].ammonia_mass)]
             for r in range(len(mass_regimes)) ]
def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
    """Yield each conjunct list derivable from mod_tree whose length is
    between 1 and limit, extending the partial result 'conjuncts'."""
    if not mod_tree:
        if 0 < len(conjuncts) <= limit:
            yield conjuncts
        return
    head, tail = mod_tree[0], mod_tree[1:]
    if isinstance(head, list):
        # a nested sub-disjunction: expand it, then continue with the tail
        for choice in enumerate_disjunction(head, limit):
            for result in enumerate_conjunction(tail, limit,
                                                conjuncts + choice):
                yield result
    else:
        # a plain term: enumerate both without and with it
        for result in enumerate_conjunction(tail, limit, conjuncts):
            yield result
        for result in enumerate_conjunction(tail, limit, conjuncts + [head]):
            yield result
def enumerate_disjunction(mod_tree, limit=float('inf')):
    """Generates the conjuncts for mod_tree that are no longer than limit
    (by default, unlimited; previously sys.maxint, which is Python-2-only).

    >>> list(enumerate_disjunction([['a'],['b'],['c']]))
    [[], ['a'], ['b'], ['c']]
    >>> list(enumerate_disjunction([[1,2,3]]))
    [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
    >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
    [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
    >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
    [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
    >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
    [[]]

    """
    assert isinstance(mod_tree, list)
    # the empty conjunct is always possible
    yield []
    for b in mod_tree:
        for s in enumerate_conjunction(b, limit):
            yield s
def get_mod_conjunct_triples(mod_tree, limit):
    """Return a sorted list of triples (N, C, rest), where N (C) is a tuple
    of at most one N (C) conjunct, and rest is a tuple of conjuncts in a
    canonical order, ordered by increasing number of conjuncts, and with
    duplicates within and across removed.  A mass table is also appended to
    each conjunct.
    """
    # FIX: is there a more elegant way or place for all this?
    def enmass(t):
        # append to mod tuple t a per-regime ((parent delta, fragment delta),
        # ...) mass table
        def rmass(regime_index, par_frag, sign, delta, is_mono):
            global MASS_REGIME_ATOMIC_MASSES
            if isinstance(delta, str):
                # formulaic delta; '!' ties it to the first (mono) regime
                if is_mono:
                    regime_index = 0
                return (sign * formula_mass(delta,
                           MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
            else:
                # numeric delta, used as-is
                return sign * delta

        sign, delta, is_mono = t[:3]
        return t + (tuple((rmass(r, 0, sign, delta, is_mono),
                           rmass(r, 1, sign, delta, is_mono))
                          for r in range(len(XTP["mass_regimes"]))),)

    def triple(c):
        # partition conjunct c by target: N-terminal '[', C-terminal ']',
        # and ordinary residues
        Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
        Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
        assert len(Ns) <= 1 and len(Cs) <= 1
        rest = [ enmass(x) for x in c if x[3] not in '[]' ]
        rest = tuple(sorted(list(frozenset(rest))))
        return (Ns, Cs, rest)

    return sorted(list(frozenset(triple(conjunct)
                                 for conjunct
                                 in enumerate_disjunction(mod_tree, limit))),
                  key=lambda x: (sum(len(y) for y in x), x))
def gen_delta_bag_counts(i, remainder, bag):
    """Recursively fill bag[0..i] with positive counts summing to remainder,
    yielding each completed bag as a tuple.  (bag is mutated in place; the
    yielded tuples are snapshots.)"""
    if i < 1:
        assert i == 0
        bag[0] = remainder
        yield tuple(bag)
        return
    # leave at least 1 for each of the i remaining positions
    for count in range(1, remainder - i + 1):
        bag[i] = count
        for filled in gen_delta_bag_counts(i - 1, remainder - count, bag):
            yield filled
def generate_delta_bag_counts(mod_count, conjunct_length):
    """Generate all tuples of positive integers having length conjunct_length
    and sum mod_count.  As a special case, () is such a tuple having length 0
    and sum 0.

    >>> for i in range(6): print i, list(generate_delta_bag_counts(4, i))
    ...
    0 []
    1 [(4,)]
    2 [(3, 1), (2, 2), (1, 3)]
    3 [(2, 1, 1), (1, 2, 1), (1, 1, 2)]
    4 [(1, 1, 1, 1)]
    5 []
    >>> list(generate_delta_bag_counts(0, 0))
    [()]
    >>> list(generate_delta_bag_counts(0, 1))
    []

    """
    if conjunct_length == 0:
        return [()] if mod_count == 0 else []
    if mod_count < conjunct_length:
        # each of the conjunct_length slots needs at least one mod
        return []
    return gen_delta_bag_counts(conjunct_length - 1, mod_count,
                                [0] * conjunct_length)
def set_context_conjuncts(context, mass_regime_index, N_cj, C_cj, R_cj):
    """Load the modification conjuncts (N-terminal, C-terminal, and residue)
    for the given mass regime into the C++ search context.  Each conjunct's
    element 5 is its per-regime (parent delta, fragment delta) mass table;
    the fragment delta (index 1) is what the context needs.
    """
    # FIXED: this assert previously read '< 1', which contradicts the
    # invariant established in get_mod_conjunct_triples (at most ONE N and C
    # conjunct) and made the N_cj/C_cj branches below unreachable
    assert len(N_cj) <= 1 and len(C_cj) <= 1
    context.N_delta = 0
    if N_cj:
        context.N_delta = N_cj[0][5][mass_regime_index][1]
    context.C_delta = 0
    if C_cj:
        context.C_delta = C_cj[0][5][mass_regime_index][1]
    context.delta_bag_lookup.clear()
    context.delta_bag_lookup.resize(ord('Z')+1)
    context.delta_bag_delta.clear()
    for n, cj in enumerate(R_cj):
        context.delta_bag_delta.append(cj[5][mass_regime_index][1])
        # each residue of this conjunct maps (by ordinal) to bag index n
        for r in cj[3]:
            context.delta_bag_lookup[ord(r)] \
                = context.delta_bag_lookup[ord(r)] + (n,)
def search_all(options, context, score_statistics):
    """Search sequence database against searchable spectra.

    Iterates the cross product of: modification count (0..limit), mass
    regime, PCA possibility (presumably pyrrolidone-carboxylic-acid-style
    N-terminal modification -- confirm against get_pca_table), and
    modification conjunct triple; then, within that, every delta-bag
    assignment of mod_count mods to the residue conjuncts.  For each
    combination the SWIG search context is configured and the actual
    search is delegated to cgreylag.spectrum.search_runs (C++).  Scores
    accumulate in score_statistics.
    """
    mod_limit = XTP["potential_mod_limit"]

    mod_conjunct_triples = get_mod_conjunct_triples(XTP["potential_mods"],
                                                    mod_limit)
    info("%s unique potential modification conjuncts",
         len(mod_conjunct_triples))
    debug("mod_conjunct_triples (unique):\n%s", pformat(mod_conjunct_triples))

    mass_regimes = XTP["mass_regimes"]
    pca_table = get_pca_table(mass_regimes)
    debug("pca_table: %s", pca_table)

    total_combinations_searched = 0

    for mod_count in range(mod_limit + 1):
        context.mod_count = mod_count
        for mr_index, (mass_regime, pca_entry) in enumerate(zip(mass_regimes,
                                                                pca_table)):
            context.mass_regime_index = mr_index
            for pca_res, pca_parent_delta, pca_frag_delta in pca_entry:
                context.pca_residues = pca_res
                context.pca_delta = pca_frag_delta
                for cji, (N_cj, C_cj, R_cj) in enumerate(mod_conjunct_triples):
                    if pca_res and N_cj:
                        continue    # mutually exclusive, for now
                    set_context_conjuncts(context, mr_index, N_cj, C_cj, R_cj)
                    debug("mod_count: %s", mod_count)
                    debug("cj_triple: N=%s C=%s R=%s", N_cj, C_cj, R_cj)
                    for delta_bag in generate_delta_bag_counts(mod_count,
                                                               len(R_cj)):
                        debug("delta_bag: %s", delta_bag)

                        # this clear() avoids an SWIG/STL bug!?
                        context.delta_bag_count.clear()
                        context.delta_bag_count[:] = delta_bag

                        # fixed-mass contributions for this combination:
                        # terminal residue masses ('[' = N-term, ']' =
                        # C-term), any terminal conjunct deltas (parent
                        # delta at index 0, fragment delta at index 1),
                        # the PCA deltas, plus a proton for the charge
                        pmrf = CP.parent_mass_regime[mr_index].fixed_residue_mass
                        fmrf = CP.fragment_mass_regime[mr_index].fixed_residue_mass
                        p_fx = (pmrf[ord('[')]
                                + (N_cj and N_cj[0][5][mr_index][0] or 0)
                                + pmrf[ord(']')]
                                + (C_cj and C_cj[0][5][mr_index][0] or 0)
                                + pca_parent_delta + PROTON_MASS)
                        context.parent_fixed_mass = p_fx
                        f_N_fx = (fmrf[ord('[')]
                                  + (N_cj and N_cj[0][5][mr_index][1] or 0)
                                  + pca_frag_delta)
                        context.fragment_N_fixed_mass = f_N_fx
                        f_C_fx = (fmrf[ord(']')]
                                  + (C_cj and C_cj[0][5][mr_index][1] or 0)
                                  + CP.fragment_mass_regime[mr_index].water_mass)
                        context.fragment_C_fixed_mass = f_C_fx

                        info("MC=%s MR=%s PCA=%s CJ=%s DB=%s"
                             % (mod_count, mr_index, pca_res, cji, delta_bag))
                        debug("p_fx %s f_N_fx %s f_C_fx %s"
                              % (p_fx, f_N_fx, f_C_fx))

                        # the C++ side increments combinations_searched;
                        # reset per bag so the per-bag info line is right
                        score_statistics.combinations_searched = 0
                        cgreylag.spectrum.search_runs(context,
                                                      score_statistics)
                        total_combinations_searched \
                            += score_statistics.combinations_searched
                        info("  %s candidate spectra examined, this bag",
                             score_statistics.combinations_searched)

    info('%s candidate spectra examined',
         score_statistics.candidate_spectrum_count)
def get_prefix_sequence(begin_pos, run_offset, sequence):
    """Return the up-to-four residues immediately preceding position
    begin_pos of a run that starts at run_offset within sequence,
    prefixed with '[' if the start of the sequence is reached.

    >>> get_prefix_sequence(0, 0, 'abcdefghi')
    '['
    >>> get_prefix_sequence(1, 0, 'abcdefghi')
    '[a'
    >>> get_prefix_sequence(3, 0, 'abcdefghi')
    '[abc'
    >>> get_prefix_sequence(4, 0, 'abcdefghi')
    'abcd'
    >>> get_prefix_sequence(4, 2, 'abcdefghi')
    'cdef'
    """
    end = run_offset + begin_pos
    prefix = sequence[max(0, end - 4):end]
    if len(prefix) == 4:
        return prefix
    return '[' + prefix
def get_suffix_sequence(end_pos, run_offset, sequence):
    """Return the up-to-four residues starting at position end_pos of a
    run that starts at run_offset within sequence, suffixed with ']' if
    the end of the sequence is reached.

    >>> get_suffix_sequence(0, 0, 'abcdefghi')
    'abcd'
    >>> get_suffix_sequence(4, 0, 'abcdefghi')
    'efgh'
    >>> get_suffix_sequence(6, 0, 'abcdefghi')
    'ghi]'
    >>> get_suffix_sequence(8, 0, 'abcdefghi')
    'i]'
    >>> get_suffix_sequence(4, 2, 'abcdefghi')
    'ghi]'
    """
    start = run_offset + end_pos
    suffix = sequence[start:start + 4]
    return suffix if len(suffix) == 4 else suffix + ']'
def clean_string(v):
    """Return v stripped, with each internal run of whitespace collapsed
    to a single space.  Falsy values (None, '') are returned unchanged.

    >>> clean_string('  one   two  ')
    'one two'
    """
    if not v:
        return v
    stripped = v.strip()
    return re.sub(r'[\s]+', ' ', stripped)
def clean_defline(s):
    """Return the defline s with tabs converted to spaces, control and
    non-ASCII characters deleted, and surrounding whitespace stripped.
    Only printable ASCII (space through '~') survives.
    """
    detabbed = s.replace('\t', ' ')
    return re.sub(r'[^ -~]', '', detabbed).strip()
def abbrev_defline(s):
    """Return s abbreviated to about 80 characters, with '...' appended
    if anything was dropped.  A token straddling the 80-character cutoff
    is kept whole (up to 170 further non-space characters) rather than
    split mid-word.

    >>> abbrev_defline('words ' * 10)
    'words words words words words words words words words words '
    """
    head = re.match(r'.{,80}\S{,170}', s).group(0)
    if len(head) == len(s):
        return head
    return head + '...'
1082 def zopen(filename, mode='r', compresslevel=None):
1083 """Open a filename as with 'open', but using compression if indicated by
1084 the filename suffix. The compression level defaults to 1 for .gz files
1085 and 9 for .bz2 files (presumably bzip2 will only be used if size matters
1086 much more than compression time)."""
1087 if filename.endswith('.gz'):
1088 import gzip
1089 return gzip.GzipFile(filename, mode, compresslevel or 1)
1090 elif filename.endswith('.bz2'):
1091 import bz2
1092 return bz2.BZ2File(filename, mode, compresslevel or 9)
1093 else:
1094 return open(filename, mode)
def main(args=sys.argv[1:]):
    """Top-level driver: parse options, read the parameter file and
    sequence databases, load the spectra for the requested work slice,
    filter them, run the search, and pickle the results to a file.

    Note: the args default is evaluated once, at import time, which is
    fine for a one-shot script entry point.
    """
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <job-id>"
                                   " <parameter-file> <ms2-file>...",
                                   description=__doc__, version=__version__)
    pa = parser.add_option
    pa("-P", "--parameter", nargs=2, dest="parameters", action="append",
       default=[],
       help="override a parameter in <parameter-file>, may be used multiple"
       " times", metavar="NAME VALUE")
    pa("-w", "--work-slice", nargs=2, type="float", dest="work_slice",
       help="search a subinterval [L:U) of the work space"
       " (where 0 <= L <= U <= 1) in standalone mode", metavar="L U")
    pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
    pa("-p", "--show-progress", action="store_true", dest="show_progress",
       help="show running progress")
    pa("--estimate", action="store_true", dest="estimate_only",
       help="just estimate the time required for the search")
    pa("-v", "--verbose", action="store_true", dest="verbose",
       help="be verbose")
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    pa("--debug", action="store_true", dest="debug",
       help="output debugging info")
    pa("--profile", action="store_true", dest="profile",
       help="dump Python profiling output to './greylag.prof.<pid>'")
    (options, args) = parser.parse_args(args=args)

    if options.copyright:
        print __copyright__
        sys.exit(0)

    # need at least <job-id> <parameter-file> and one <ms2-file>
    if len(args) < 3:
        parser.print_help()
        sys.exit(1)

    job_id = args[0]
    parameter_fn = args[1]
    spectrum_fns = args[2:]

    # spectrum files must be .ms2, and the work slice (if given) must be a
    # valid subinterval of [0, 1]
    if (any(True for f in spectrum_fns if not f.endswith('.ms2'))
        or (options.work_slice
            and not (0 <= options.work_slice[0]
                     <= options.work_slice[1] <= 1))):
        parser.print_help()
        sys.exit(1)

    # later flags win: --debug beats --verbose beats --quiet
    log_level = logging.WARNING
    if options.quiet:
        log_level = logging.ERROR
    if options.verbose:
        log_level = logging.INFO
    if options.debug:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
                        format='%(asctime)s %(levelname)s: %(message)s')
    info("starting on %s", gethostname())

    # prevent format char problems
    if '%' in job_id:
        error("<job-id> may not contain '%'")

    # check spectrum basename uniqueness, as corresponding sqt files will be
    # in a single directory
    base_spectrum_fns = [ os.path.basename(fn) for fn in spectrum_fns ]
    if len(base_spectrum_fns) != len(set(base_spectrum_fns)):
        error("base spectrum filenames must be unique")

    # check -P names for validity
    bad_names = (set(n for n,v in options.parameters)
                 - set(PARAMETER_INFO.keys()))
    if bad_names:
        error("bad -P parameter names %s" % list(bad_names))

    # read params
    cp = ConfigParser.RawConfigParser()
    cp.optionxform = str                # be case-sensitive
    with open(parameter_fn) as parameter_file:
        cp.readfp(parameter_file)
    if not cp.has_section('greylag'):
        error("%s has no [greylag] section" % parameter_fn)
    parameters = dict(cp.items('greylag'))
    parameters.update(dict(options.parameters)) # command-line override
    global XTP
    XTP = validate_parameters(parameters)

    # map modified residue -> its pervasive (fixed) mod tuple
    fixed_mod_map = dict((r[3], r) for r in XTP["pervasive_mods"])
    initialize_spectrum_parameters(options, XTP["mass_regimes"], fixed_mod_map)

    # read sequence dbs
    # [(defline, seq, filename), ...]
    databases = XTP["databases"].split()
    fasta_db = list(read_fasta_files(databases))
    # [(idno, offset, defline, seq, seq_filename), ...]
    db = []
    for idno, (defline, sequence, filename) in enumerate(fasta_db):
        db.extend(split_sequence_into_aa_runs(idno, defline, sequence,
                                              filename))
    db_residue_count = sum(len(dbi[3]) for dbi in db)

    # (idno, offset) -> (defline, run_seq, filename)
    db_info = dict(((idno, offset), (defline, seq, filename))
                   for idno, offset, defline, seq, filename in db)

    info("read %s sequences (%s runs, %s residues)", len(fasta_db), len(db),
         db_residue_count)
    max_run_length = max(len(r[3]) for r in db)
    info("max run length is %s residues", max_run_length)
    # run offsets must fit in a C++ 32-bit signed int
    # NOTE(review): the message interpolates max_run_length, not the
    # 2**31-1 limit it presumably means to report -- confirm intent
    if max_run_length > 2**31 - 1:
        error("runs longer than %s not yet supported", max_run_length)
    if not db:
        error("no database sequences")

    # read spectrum offset indices
    spectrum_offset_indices = []
    for spfn in spectrum_fns:
        idxfn = spfn + '.idx'
        with contextlib.closing(gzip.open(idxfn)) as idxf:
            idx = cPickle.load(idxf)
            # try to verify that index matches spectrum file
            if idx['file size'] != os.path.getsize(spfn):
                error("index '%s' needs rebuilding" % idxfn)
            spectrum_offset_indices.append(idx['offsets'])

    # FIX: assume standalone mode (for now)
    assert options.work_slice

    # read spectra per work slice
    spectra = read_spectra_slice(spectrum_fns, spectrum_offset_indices,
                                 options.work_slice)
    spectra.sort(key=lambda x: x.mass)

    if not spectra:
        warning("no input spectra")
    else:
        info("read %s spectra (mass range %s - %s)", len(spectra),
             spectra[0].mass, spectra[-1].mass)

    def peak_statistics(spectra):
        # (min, quartiles, max, mean) of per-spectrum peak counts;
        # assumes spectra is non-empty (counts[0] would raise otherwise)
        counts = [ len(sp.peaks) for sp in spectra ]
        counts.sort()
        n = len(counts)
        return (counts[0], counts[int(n*0.25)], counts[int(n*0.5)],
                counts[int(n*0.75)], counts[-1], sum(counts) / float(n))

    info("  peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
         % peak_statistics(spectra))

    # filter and normalize spectra
    for sp in spectra:
        sp.filter_peaks(XTP["TIC_cutoff_proportion"],
                        CP.parent_mass_tolerance_max)
        sp.classify(XTP["intensity_class_count"], XTP["intensity_class_ratio"],
                    XTP["fragment_mass_tolerance"])

    min_psm = XTP["min_parent_spectrum_mass"]
    max_psm = XTP["max_parent_spectrum_mass"]
    # FIX: also filter by 1 + 2 + 4 rule?
    spectra = [ sp for sp in spectra
                if len(sp.peaks) >= 10 and min_psm <= sp.mass <= max_psm ]

    info("after filtering:")
    # NOTE(review): if filtering removed every spectrum, spectra[0] below
    # raises IndexError before the 'if spectra:' guard is reached -- confirm
    info("  %s spectra (mass range %s - %s)", len(spectra),
         spectra[0].mass, spectra[-1].mass)
    info("  peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
         % peak_statistics(spectra))

    cgreylag.spectrum.set_searchable_spectra(spectra)
    score_statistics = cgreylag.score_stats(len(spectra),
                                            XTP["best_result_count"])

    if spectra:
        del spectra                     # release memory

        # FIX!!!
        # (cleavage_re, position of cleavage in cleavage_re)
        cleavage_motif = "[X]|[X]"
        cleavage_pattern, cleavage_pos = cleavage_motif_re(cleavage_motif)
        if cleavage_pos == None:
            error("cleavage site '%s' is missing '|'",
                  XTP["protein, cleavage site"])
        cleavage_pattern = re.compile(cleavage_pattern)

        context = cgreylag.search_context()
        loci_seen = set()
        for idno, offset, defline, seq, seq_filename in db:
            # NOTE(review): 'cp' here shadows the ConfigParser above
            cp = []
            if cleavage_motif != "[X]|[X]":
                cp = list(generate_cleavage_points(cleavage_pattern,
                                                   cleavage_pos, seq))
            # locus name is the first whitespace-delimited token of defline
            locus_name = defline.split(None, 1)[0]
            if locus_name in loci_seen:
                error("locus name '%s' is not unique in the search database(s)"
                      % locus_name)
            loci_seen.add(locus_name)
            sr = cgreylag.sequence_run(idno, offset, seq, cp, locus_name)
            context.sequence_runs.append(sr)
        context.maximum_missed_cleavage_sites = 1000000000 # FIX

        info("searching")
        search_all(options, context, score_statistics)
    else:
        warning("no spectra after filtering--search skipped")

    if options.estimate_only:
        # rough scaling constant: ~300M candidate spectra per CPU hour
        print ("%.2f generic CPU hours"
               % (score_statistics.candidate_spectrum_count / 300.0e6))
        return

    # FIX!
    info("writing result file")
    result_fn = 'test.result.gz'
    py_score_statistics = pythonize_swig_object(score_statistics)

    # only put spectrum info that's actually needed into the result file
    spectrum_metadata_fields = set(['name', 'file_id', 'mass', 'charge',
                                    'total_ion_current', 'comparisons'])
    spectrum_metadata = \
        pythonize_swig_object(cgreylag.cvar.spectrum_searchable_spectra,
                              only_fields=spectrum_metadata_fields)

    with contextlib.closing(zopen(result_fn, 'w')) as result_file:
        cPickle.dump({ 'matches' : py_score_statistics,
                       'spectra' : spectrum_metadata,
                       'spectrum files' : base_spectrum_fns,
                       'databases' : databases,
                       'parameters' : XTP,
                       'mass regime atomic masses' : MASS_REGIME_ATOMIC_MASSES,
                       'proton mass' : PROTON_MASS,
                       'argv' : sys.argv },
                     result_file,
                     cPickle.HIGHEST_PROTOCOL)
    info("finished, result file written to '%s'", result_fn)
if __name__ == '__main__':
    # optional psyco (Python 2 JIT) support, currently disabled
#     try:
#         import psyco
#         psyco.full()
#         ###psyco.bind(formula_mass)
#         warning('psyco enabled')
#     except ImportError:
#         pass

    try:
        if '--profile' in sys.argv:
            # profiled run: dump raw stats to a temp file, then format the
            # top-50 entries (by cumulative and by self time) into a report
            import cProfile
            import pstats
            report_fn = "greylag.prof.%s" % os.getpid()
            data_fn = report_fn + ".tmp"
            # NOTE(review): cProfile.run returns None, so 'prof' is unused
            prof = cProfile.run('main()', data_fn)
            with open(report_fn, 'w') as report_f:
                try:
                    stats = pstats.Stats(data_fn, stream=report_f)
                    stats.strip_dirs()
                    stats.sort_stats('cumulative')
                    stats.print_stats(50)
                    stats.sort_stats('time')
                    stats.print_stats(50)
                    print "# profile report written to '%s'" % report_fn
                finally:
                    # best-effort cleanup of the raw stats file
                    try:
                        os.remove(data_fn)
                    except:
                        pass
        else:
            main()
    except SystemExit:
        raise
    except:
        # catch-all boundary: log the traceback, exit nonzero
        logging.exception("unhandled exception")
        sys.exit(1)
    finally:
        logging.shutdown()
1373 # FIXES:
1374 # - need to rigorously check for bad input in any file (fasta, spectrum, xml)