1 #!/usr/bin/env greylag-python
3 '''This program does the actual work to search mass spectra against a sequence
4 database. <job-id> is a unique identifier used as a prefix for work and
5 output directories. The <configuration-file> contains program options. The
6 spectra are in <ms2-file>s; greylag-index-spectra must already have been run
9 This program can operate in standalone mode (useful for testing) or command
10 mode. If the --work-slice option is given, that slice will be searched in
11 standalone mode, then the program will exit. Otherwise, the program will
12 process commands until an exit command is received.
16 from __future__
import with_statement
19 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2 of the License, or
24 (at your option) any later version.
26 This program is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 GNU General Public License for more details.
31 You should have received a copy of the GNU General Public License along
32 with this program; if not, write to the Free Software Foundation, Inc.,
33 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
36 ##############################################################################
37 ## "Simplicity is prerequisite for reliability" - Edsger W. Dijkstra ##
38 ##############################################################################
50 from logging
import debug
, info
, warning
54 from pprint
import pprint
, pformat
56 from socket
import gethostname
62 # Try to drop dead immediately on SIGINT (control-C), instead of normal Python
63 # KeyboardInterrupt processing, since we may spend long periods of time in
64 # uninterruptible C++ calls. Also die immediately on SIGPIPE.
67 signal
.signal(signal
.SIGINT
, signal
.SIG_DFL
)
68 signal
.signal(signal
.SIGPIPE
, signal
.SIG_DFL
)
75 # if we're unit testing, just throw an exception
76 if __name__
!= "__main__":
77 raise Exception((s
+ " (fatal error)") % args
)
78 logging
.error(s
, *args
)
82 def fileerror(s
, *args
):
83 error(s
+ (", at line %s of file '%s'"
84 % (fileinput
.filelineno(), fileinput
.filename())),
88 # name -> value map of processed XML input parameters
91 # handle to the singleton parameter object shared with the C++ module
92 CP
= cgreylag
.cvar
.parameters_the
94 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
95 PROTON_MASS
= 1.007276
96 ELECTRON_MASS
= 0.000549 # ?
98 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
99 MONOISOTOPIC_ATOMIC_MASS
= {
102 'N' : 14.00307400529,
103 'O' : 15.994914622115,
108 # most prevalent only (1 in 1000)
109 ISOTOPIC_ATOMIC_MASS
= { # prevalence (in %)
110 'C13' : 13.003354837810, # 1.078
111 'N15' : 15.00010889849, # 0.3687
112 'O18' : 17.99916049, # 0.20514
113 'S33' : 32.9714585012, # 0.762
114 'S34' : 33.9678668311, # 4.2928
117 AVERAGE_ATOMIC_MASS
= {
126 # The xtandem average residue masses are about 0.002 amu higher than those
127 # calculated directly from the above average atomic masses. None of the
128 # chemists consulted knew of any reason why, aside from lack of precision in
129 # the average atomic mass estimates. This shouldn't matter very much, as
130 # fragmentation calculations should all be monoisotopic, and we can always
131 # widen the parent tolerance window a bit.
134 def formula_mass(formula
, atomic_mass
=MONOISOTOPIC_ATOMIC_MASS
):
135 """Return the mass of formula, using the given mass regime (monoisotopic
138 >>> formula_mass('H2O', { 'H':1, 'O':16 })
140 >>> # monoisotopic mass of glycine
141 >>> str(round(formula_mass('C2H3ON'), 4))
145 parts
= [ p
or '1' for p
in re
.split(r
'([A-Z][a-z]*)', formula
)[1:] ]
146 # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
147 return sum(atomic_mass
[parts
[i
]] * int(parts
[i
+1])
148 for i
in range(0, len(parts
), 2))
150 # FIX: selenocysteine (U), etc
175 RESIDUES
= sorted(RESIDUE_FORMULA
.keys())
176 RESIDUES_W_BRACKETS
= RESIDUES
+ ['[', ']']
179 # accessed as a global variable (FIX?)
180 # [0][1] -> 'H' -> fragment mass of H for regime 0
181 MASS_REGIME_ATOMIC_MASSES
= []
184 def mass_regime_atomic_masses(spec
):
185 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
188 name
, isotopes
= spec
189 assert name
in ['MONO', 'AVG'] and len(isotopes
) <= 1
191 r
= MONOISOTOPIC_ATOMIC_MASS
.copy()
193 r
= AVERAGE_ATOMIC_MASS
.copy()
195 iname
, prevalence
= isotopes
[0]
196 assert iname
== 'N15' and 0 <= prevalence
<= 1
197 # this is a simplification, but additional accuracy pointless?
199 r
['N'] = ISOTOPIC_ATOMIC_MASS
['N15']
201 r
['N'] += (ISOTOPIC_ATOMIC_MASS
['N15'] - r
['N']) * prevalence
205 def initialize_spectrum_parameters(options
, mass_regimes
, fixed_mod_map
):
206 """Initialize parameters known to the spectrum module.
207 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
210 debug('fixed_mod_map: %s', fixed_mod_map
)
211 # This is the size of vectors that are indexed by residues (A-Z) or
212 # special characters ('[]').
213 RESIDUE_LIMIT
= max(ord(c
) for c
in 'Z[]') + 1
215 # These are currently monoisotopic. (deuterium pointless?)
216 CP
.proton_mass
= PROTON_MASS
217 CP
.hydrogen_mass
= formula_mass("H")
219 global MASS_REGIME_ATOMIC_MASSES
220 for regime_pair
in mass_regimes
:
221 assert len(regime_pair
) == 2 # parent and fragment
222 info('mass regime: %s', regime_pair
)
223 MASS_REGIME_ATOMIC_MASSES
.append([])
224 for n
, regime
in enumerate(regime_pair
):
225 atmass
= mass_regime_atomic_masses(regime
)
226 MASS_REGIME_ATOMIC_MASSES
[-1].append(atmass
)
227 creg
= cgreylag
.mass_regime_parameters()
229 creg
.hydroxyl_mass
= formula_mass("OH", atmass
)
230 creg
.water_mass
= formula_mass("H2O", atmass
)
231 creg
.ammonia_mass
= formula_mass("NH3", atmass
)
233 creg
.fixed_residue_mass
.resize(RESIDUE_LIMIT
)
235 for r
in RESIDUES_W_BRACKETS
:
238 m
= formula_mass(RESIDUE_FORMULA
[r
], atmass
)
239 rmod
= fixed_mod_map
.get(r
)
241 if isinstance(rmod
[1], str):
243 m
+= rmod
[0] * formula_mass(rmod
[1])
245 m
+= rmod
[0] * formula_mass(rmod
[1], atmass
)
247 m
+= rmod
[0] * rmod
[1]
248 creg
.fixed_residue_mass
[ord(r
)] = m
249 # assuming these are monoisotopic (not regime)
250 creg
.fixed_residue_mass
[ord('[')] += formula_mass("H")
251 creg
.fixed_residue_mass
[ord(']')] += formula_mass("OH")
253 CP
.parent_mass_regime
.append(creg
);
255 creg
.fixed_residue_mass
[ord('[')] -= CP
.hydrogen_mass
256 creg
.fixed_residue_mass
[ord(']')] -= creg
.hydroxyl_mass
257 CP
.fragment_mass_regime
.append(creg
);
258 for r
in RESIDUES_W_BRACKETS
:
259 info('fixed_mass %s: %s', r
,
261 % (CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)],
262 CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)])
263 for rn
in range(len(mass_regimes
)) ])
265 for rn
in range(len(mass_regimes
)):
266 # physically impossible (and results would be garbage)
267 if CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
268 raise ValueError('bogus parent mass specification for %s' % r
)
269 if CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
270 raise ValueError('bogus parent mass specification for %s' % r
)
272 CP
.parent_mass_tolerance_1
= XTP
["parent_mz_tolerance"]
273 CP
.parent_mass_tolerance_max
= (XTP
["parent_mz_tolerance"]
274 * XTP
["charge_limit"])
276 CP
.fragment_mass_tolerance
= XTP
["fragment_mass_tolerance"]
277 CP
.intensity_class_count
= XTP
["intensity_class_count"]
279 CP
.minimum_peptide_length
= XTP
["min_peptide_length"]
281 # CP.ln_factorial[n] == ln(n!)
282 CP
.ln_factorial
.resize(int(XTP
["max_parent_spectrum_mass"]
283 / XTP
["fragment_mass_tolerance"] + 100), 0.0)
284 for n
in range(2, len(CP
.ln_factorial
)):
285 CP
.ln_factorial
[n
] = CP
.ln_factorial
[n
-1] + math
.log(n
)
287 CP
.estimate_only
= bool(options
.estimate_only
)
288 CP
.show_progress
= bool(options
.show_progress
)
291 def cleavage_motif_re(motif
):
292 """Return (regexp, pos), where regexp is a regular expression that will
293 match a cleavage motif, and pos is the position of the cleavage with
294 respect to the match (or None). (The RE actually matches one character,
295 the rest matching as lookahead, so that re.finditer will find all
296 overlapping matches.)
298 >>> cleavage_motif_re('[KR]|{P}')
300 >>> cleavage_motif_re('[X]|[X]')
306 motif_re
= re
.compile(r
'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
307 parts
= [ p
[0] for p
in motif_re
.findall(motif
) ]
308 if ''.join(parts
) != motif
:
309 raise ValueError('invalid cleavage motif pattern')
313 if cleavage_pos
!= None:
314 raise ValueError("invalid cleavage motif pattern"
321 re_parts
.append('[%s]' % part
[1:-1])
323 re_parts
.append('[^%s]' % part
[1:-1])
325 assert False, "unknown cleavage motif syntax"
327 if len(re_parts
) == 0:
329 elif len(re_parts
) == 1:
330 re_pattern
= re_parts
[0]
332 re_pattern
= re_parts
[0] + '(?=' + ''.join(re_parts
[1:]) + ')'
333 return (re_pattern
, cleavage_pos
)
336 def generate_cleavage_points(cleavage_re
, cleavage_pos
, sequence
):
337 """Yields the offsets of the cleavages in sequence. The endpoints are
338 always included, by convention.
340 >>> list(generate_cleavage_points(re.compile('[KR](?=[^P])'), 1,
346 for m
in cleavage_re
.finditer(sequence
):
347 p
= m
.start() + cleavage_pos
348 if 0 < p
< len(sequence
):
353 AA_SEQUENCE
= re
.compile(r
'[ARNDCQEGHILKMFPSTWYV]+')
355 def split_sequence_into_aa_runs(idno
, locusname
, defline
, sequence
, filename
):
356 """Returns a tuple (idno, start, locusname, defline, seq, filename) for
357 each contiguous run of residues in sequence, where 'start' is the position
358 of 'seq' in 'sequence'.
360 >>> pprint(split_sequence_into_aa_runs(123, 'ln', 'ln defline',
361 ... 'STSS*DEFABA', 'filename'))
362 [(123, 0, 'ln', 'ln defline', 'STSS', 'filename'),
363 (123, 5, 'ln', 'ln defline', 'DEFA', 'filename'),
364 (123, 10, 'ln', 'ln defline', 'A', 'filename')]
367 return [ (idno
, m
.start(), locusname
, defline
, m
.group(), filename
)
368 for n
, m
in enumerate(AA_SEQUENCE
.finditer(sequence
)) ]
371 def read_fasta_files(filenames
):
372 """Yield (locusname, defline, sequence, filename) tuples as read from
373 FASTA files (uppercasing sequence)."""
376 locusname
, defline
= None, None
379 # reset fileinput first, in case it's been called before
384 for line
in fileinput
.input(filenames
):
388 yield (locusname
, defline
, ''.join(seqs
), fileinput
.filename())
390 fileerror("bad format: line precedes initial defline")
392 locusname_rest
= defline
.split(None, 1)
393 if not locusname_rest
:
394 error("empty locus name not allowed")
395 locusname
= locusname_rest
[0]
396 if locusname
in loci_seen
:
397 error("locus name '%s' is not unique in the search database(s)"
399 loci_seen
.add(locusname
)
402 seqs
.append(line
.upper())
404 yield (locusname
, defline
, ''.join(seqs
), fileinput
.filename())
407 def read_spectra_slice(spectrum_fns
, offset_indices
, slice):
409 assert 0 <= s_l
<= s_u
<= 1
411 total_spectra
= sum(len(oi
) for oi
in offset_indices
)
412 sp_l
= int(float(s_l
) * total_spectra
)
413 sp_u
= int(float(s_u
) * total_spectra
)
418 for f_no
, (fn
, oi
) in enumerate(zip(spectrum_fns
, offset_indices
)):
419 seeing_u
= seeing_l
+ len(oi
)
420 f_l
= max(sp_l
, seeing_l
)
421 f_u
= min(sp_u
, seeing_u
)
424 b_l
= oi
[f_l
- seeing_l
]
425 b_u
= oi
[f_u
- seeing_l
] if f_u
- seeing_l
< len(oi
) else -1 # oo
428 fsp
= cgreylag
.spectrum
.read_spectra_from_ms2(f
, f_no
,
436 # XML parameter file processing
438 # FIX: This parsing is way too complex. How to simplify?
440 def mass_regime_part(part_specification
):
441 """Parse a single mass regime specification part.
443 >>> mass_regime_part('MONO')
445 >>> mass_regime_part('AVG')
447 >>> mass_regime_part('MONO(N15@87.5%)')
448 ('MONO', [('N15', 0.875)])
451 ps
= [ x
.strip() for x
in part_specification
.partition('(') ]
452 if ps
[0] not in ('MONO', 'AVG'):
453 raise ValueError("invalid mass regime list specification"
454 " (regime id must be 'MONO' or 'AVG')")
458 raise ValueError("invalid mass regime list specification"
460 pps
= [ x
.strip() for x
in ps
[2][:-1].split(',') ]
462 raise ValueError("invalid mass regime list specification"
463 " (multiple isotopes not yet implemented)")
464 ppps
= [ x
.strip() for x
in pps
[0].partition('@') ]
466 raise ValueError("invalid mass regime list specification"
468 if ppps
[0] not in ('N15',):
469 raise ValueError("invalid mass regime list specification"
470 " (isotope id must currently be 'N15')")
471 if ppps
[2][-1] != '%':
472 raise ValueError("invalid mass regime list specification"
474 prevalence
= float(ppps
[2][:-1]) / 100
475 if not (0 <= prevalence
<= 1):
476 raise ValueError("invalid mass regime list specification"
477 " (prevalence must be in range 0-100%)")
478 return (ps
[0], [(ppps
[0], prevalence
)])
480 def mass_regime_list(mass_regime_list_specification
):
481 """Check and return a list of regime tuples (parent_regime,
482 fragment_regime), where each regime is a tuple (id, [(isotope_id,
483 prevalence), ...]). Multiple isotopes (when implemented) would be
486 >>> pprint(mass_regime_list('AVG/MONO;MONO;MONO(N15@75%)'))
487 [[('AVG', []), ('MONO', [])],
488 [('MONO', []), ('MONO', [])],
489 [('MONO', [('N15', 0.75)]), ('MONO', [('N15', 0.75)])]]
493 for regspec
in mass_regime_list_specification
.split(';'):
494 halves
= [ x
.strip() for x
in regspec
.split('/') ]
496 raise ValueError("invalid mass regime list specification"
498 pr
= [ mass_regime_part(h
) for h
in halves
]
500 pr
= [ pr
[0], pr
[0] ]
502 debug("mass regime list:\n%s", pformat(result
))
504 # The first fragmentation regime should generally be MONO, so that
505 # formulaic deltas with '!' do the expected thing.
506 if result
[0][1] != ('MONO', []):
507 raise ValueError("first fragmentation regime was something other than"
508 " 'MONO' with no isotopes--this is almost certainly"
509 " not what was intended")
513 def parse_mod_term(s
, is_potential
=False):
514 """Parse a modification term, returning a tuple (sign, mod, fixed_regime,
515 residues, description).
517 >>> parse_mod_term('-C2H3ON!@C')
518 (-1, 'C2H3ON', True, 'C', None)
519 >>> parse_mod_term('42@STY phosphorylation', is_potential=True)
520 (1, 42.0, False, 'STY', 'phosphorylation')
524 m
= re
.match(r
'^\s*(-|\+)?(([1-9][0-9.]*)|([A-Z][A-Z0-9]*))(!?)'
525 r
'@([A-Z]+|\[|\])(\s+([A-Za-z0-9_]+))?\s*$', s
)
527 raise ValueError("invalid modification term specification"
530 invalid_residues
= set(mg
[5]) - set(RESIDUES_W_BRACKETS
)
532 raise ValueError("invalid modification list specification"
533 " (invalid residues %s)" % list(invalid_residues
))
537 if is_potential
and abs(delta
) < 0.0001:
538 raise ValueError("invalid modification list specification"
539 " (delta '%s' is too small)" % delta
)
541 if not is_potential
and len(residues
) != 1:
542 raise ValueError("invalid modification list specification '%s' (only"
543 " potential modifications may have multiple residues)"
545 if len(residues
) != len(set(residues
)):
546 raise ValueError("invalid modification list specification"
547 " '%s' (duplicate residues prohibited)"
549 return (mg
[0] == '-' and -1 or 1, delta
, mg
[4] == '!', residues
, mg
[7])
552 def fixed_mod_list(specification
):
553 """Check and return a list of modification tuples.
555 >>> fixed_mod_list('57@C')
556 [(1, 57.0, False, 'C', None)]
557 >>> fixed_mod_list('57@C,CH!@N desc')
558 [(1, 57.0, False, 'C', None), (1, 'CH', True, 'N', 'desc')]
559 >>> fixed_mod_list('57@C,58@C')
560 Traceback (most recent call last):
562 ValueError: invalid modification list specification '['C', 'C']' (duplicate residues prohibited)
565 if not specification
:
567 result
= [ parse_mod_term(s
) for s
in specification
.split(',') ]
568 residues
= [ x
[3] for x
in result
]
569 # this check is across terms; the one above is within terms
570 if len(residues
) != len(set(residues
)):
571 raise ValueError("invalid modification list specification"
572 " '%s' (duplicate residues prohibited)"
574 debug("fixed_mod_list:\n%s", pformat(result
))
578 def parse_mod_basic_expression(s
):
581 tree
, rest
= parse_mod_disjunction(s
[1:])
584 raise ValueError("invalid modification list specification"
585 " (expected matching ')')")
586 return tree
, rest
[1:]
587 parts
= re
.split(r
'([;,()])', s
, 1)
591 assert len(parts
) == 3
592 term
, rest
= parts
[0], parts
[1]+parts
[2]
593 return parse_mod_term(term
, is_potential
=True), rest
595 def parse_mod_conjunction(s
):
598 tree
, s
= parse_mod_basic_expression(s
)
606 def parse_mod_disjunction(s
):
609 tree
, s
= parse_mod_conjunction(s
)
617 def potential_mod_list(specification
):
618 """Check and return a tree of potential modification tuples. Nodes at
619 even (odd) levels are disjunctions (conjunctions). (The top list is a
622 >>> potential_mod_list('')
624 >>> potential_mod_list('PO3H@STY; C2H2O@KST')
625 [[(1, 'PO3H', False, 'STY', None)], [(1, 'C2H2O', False, 'KST', None)]]
626 >>> potential_mod_list('PO3H@STY, C2H2O@KST')
627 [[(1, 'PO3H', False, 'STY', None), (1, 'C2H2O', False, 'KST', None)]]
628 >>> pprint(potential_mod_list('''(PO3H@STY phosphorylation;
629 ... C2H2O@KST acetylation;
630 ... CH2@AKST methylation),
631 ... O@M oxidation'''))
632 [[[[(1, 'PO3H', False, 'STY', 'phosphorylation')],
633 [(1, 'C2H2O', False, 'KST', 'acetylation')],
634 [(1, 'CH2', False, 'AKST', 'methylation')]],
635 (1, 'O', False, 'M', 'oxidation')]]
638 if not specification
:
640 tree
, remainder
= parse_mod_disjunction(specification
)
642 raise ValueError("invalid modification list specification"
643 " (unexpected '%s')" % remainder
)
644 debug("potential_mod_list:\n%s", pformat(tree
))
648 def p_positive(x
): return x
> 0
649 def p_negative(x
): return x
< 0
650 def p_nonnegative(x
): return x
>= 0
651 def p_proportion(x
): return 0 <= x
<= 1
653 # Configuration file parameter specification
655 # name -> (type, default, check_fn)
656 # type may be bool, int, float, str, modlist, or a tuple of values
657 # default, as a string value, or None if value must be explicitly specified
658 # check_fn is an optional function that returns True iff the value is valid
661 "databases" : (str, None),
662 "decoy_locus_prefix" : (str, "SHUFFLED_"),
663 "mass_regimes" : (mass_regime_list
, "MONO"),
664 "pervasive_mods" : (fixed_mod_list
, ""),
665 "potential_mods" : (potential_mod_list
, ""),
666 "potential_mod_limit" : (int, 2, p_nonnegative
),
667 "charge_limit" : (int, 3, p_positive
),
668 "min_peptide_length" : (int, 5, p_positive
), # needed?
669 "min_parent_spectrum_mass" : (float, 0, p_nonnegative
),
670 "max_parent_spectrum_mass" : (float, 10000, p_nonnegative
),
671 "TIC_cutoff_proportion" : (float, 0.98, p_proportion
),
672 "parent_mz_tolerance" : (float, 1.25, p_nonnegative
),
673 "fragment_mass_tolerance" : (float, 0.5, p_nonnegative
),
674 "intensity_class_count" : (int, 3, p_positive
),
675 "intensity_class_ratio" : (float, 2.0, p_positive
), # really > 1.0?
676 "best_result_count" : (int, 5, p_positive
),
680 def validate_parameters(parameters
, parameter_info
=PARAMETER_INFO
):
681 """Verify that parameters are valid, have valid values, and correspond to
682 currently implemented functionality. Values are converted, default values
683 are filled in, and the resulting name/value dict returned.
685 >>> sorted(validate_parameters({'spectrum, total peaks' : '40'},
686 ... {'spectrum, total peaks'
687 ... : (int, '50', p_positive),
688 ... 'output, spectra'
689 ... : (bool, 'no')}).items())
690 [('output, spectra', False), ('spectrum, total peaks', 40)]
695 for p_name
, p_info
in sorted(parameter_info
.items()):
698 check_fn
= len(p_info
) > 2 and p_info
[2] or None
700 v
= parameters
.get(p_name
)
703 debug("parameter '%s' defaulting to '%s'", p_name
, default
)
706 error("missing required parameter '%s'" % p_name
)
707 if isinstance(type_
, tuple):
709 error("parameter '%s' value '%s' not in %s (feature not"
710 " implemented?)" % (p_name
, v
, type_
))
712 v
= { 'yes' : True, 'no' : False }.get(v
)
714 error("parameter '%s' requires a value of 'yes' or 'no'")
718 except ValueError, e
:
719 error("parameter '%s' has value '%s' with invalid format [%s]"
721 if check_fn
and not check_fn(v
):
722 error("parameter '%s' has invalid value '%s' (or feature not"
723 " implemented)" % (p_name
, v
))
726 unknown_parameters
= set(parameters
) - set(parameter_info
)
727 if unknown_parameters
:
728 warning("%s unknown parameter(s):\n %s"
729 % (len(unknown_parameters
),
730 pformat(sorted(list(unknown_parameters
)))))
734 def pythonize_swig_object(o
, only_fields
=None, skip_fields
=[]):
735 """Creates a pure Python copy of a SWIG object, so that it can be easily
736 pickled, or printed (for debugging purposes). Each SWIG object is
737 pythonized as a dictionary, for convenience. If provided, 'only_fields',
738 limits the copy to the list of fields specified. Otherwise,
739 'skip_fields' if given is a list of methods not to include (this helps
740 avoid infinite recursion).
742 >>> pprint(pythonize_swig_object(cgreylag.score_stats(1, 1)))
743 {'best_matches': [[{'mass_trace': [],
745 'peptide_sequence': '',
746 'predicted_parent_mass': 0.0,
749 'spectrum_index': -1}]],
750 'candidate_spectrum_count': 0,
751 'combinations_searched': 0}
755 if isinstance(o
, str):
762 return list(pythonize_swig_object(x
, only_fields
, skip_fields
)
764 if hasattr(o
, '__swig_getmethods__'):
766 for a
in o
.__swig
_getmethods
__:
767 if (only_fields
!= None and a
in only_fields
768 or only_fields
== None and a
not in skip_fields
):
769 s
[a
] = pythonize_swig_object(getattr(o
, a
), only_fields
,
774 # A possible PCA (pyrrolidone carboxyl acid) modification accounts for
775 # circularization of the peptide N-terminal. PCA mods are excluded if a
776 # static N-terminal mod has been specified. Likewise, choosing a PCA mod will
777 # exclude choosing a potential N-terminal mod. (The PCA mod for 'C' is
778 # dependent on a static mod of C+57 being in effect.)
780 def get_pca_table(mass_regimes
):
781 """Return a list of tuples (residues, parent delta, fragment delta)
782 describing the PCA possibilities for each mass regime. Residues is a
783 string specifying the residues this PCA delta applies to.
785 # FIX: According to Xtandem, C is only a candidate for PCA if
786 # carboxyamidomethylated (C+57). Currently we always search it.
787 return [ [('', 0, 0),
788 ('E', -1 * CP
.parent_mass_regime
[r
].water_mass
,
789 -1 * CP
.fragment_mass_regime
[r
].water_mass
),
790 ('QC', -1 * CP
.parent_mass_regime
[r
].ammonia_mass
,
791 -1 * CP
.fragment_mass_regime
[r
].ammonia_mass
)]
792 for r
in range(len(mass_regimes
)) ]
795 def enumerate_conjunction(mod_tree
, limit
, conjuncts
=[]):
797 if 0 < len(conjuncts
) <= limit
:
800 first
, rest
= mod_tree
[0], mod_tree
[1:]
801 if isinstance(first
, list):
802 for x
in enumerate_disjunction(first
, limit
):
803 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ x
):
806 for y
in enumerate_conjunction(rest
, limit
, conjuncts
):
808 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ [first
]):
811 def enumerate_disjunction(mod_tree
, limit
=sys
.maxint
):
812 """Generates the conjuncts for mod_tree that are no longer than limit.
814 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
815 [[], ['a'], ['b'], ['c']]
816 >>> list(enumerate_disjunction([[1,2,3]]))
817 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
818 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
819 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
820 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
821 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
822 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
826 assert isinstance(mod_tree
, list)
829 for s
in enumerate_conjunction(b
, limit
):
832 def get_mod_conjunct_triples(mod_tree
, limit
):
833 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
834 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
835 ordered by increasing number of conjuncts, and with duplicates within and
836 across removed. A mass table is also appended to each conjunct.
838 # FIX: is there a more elegant way or place for all this?
840 def rmass(regime_index
, par_frag
, sign
, delta
, is_mono
):
841 global MASS_REGIME_ATOMIC_MASSES
842 if isinstance(delta
, str):
845 return (sign
* formula_mass(delta
,
846 MASS_REGIME_ATOMIC_MASSES
[regime_index
][par_frag
]))
850 sign
, delta
, is_mono
= t
[:3]
851 return t
+ (tuple((rmass(r
, 0, sign
, delta
, is_mono
),
852 rmass(r
, 1, sign
, delta
, is_mono
))
853 for r
in range(len(XTP
["mass_regimes"]))),)
856 Ns
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == '['))
857 Cs
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == ']'))
858 assert len(Ns
) <= 1 and len(Cs
) <= 1
859 rest
= [ enmass(x
) for x
in c
if x
[3] not in '[]' ]
860 rest
= tuple(sorted(list(frozenset(rest
))))
861 return (Ns
, Cs
, rest
)
863 return sorted(list(frozenset(triple(conjunct
)
865 in enumerate_disjunction(mod_tree
, limit
))),
866 key
=lambda x
: (sum(len(y
) for y
in x
), x
))
869 def gen_delta_bag_counts(i
, remainder
, bag
):
875 for delta
in range(1, remainder
-i
+1):
877 for x
in gen_delta_bag_counts(i
-1, remainder
-delta
, bag
):
880 def generate_delta_bag_counts(mod_count
, conjunct_length
):
881 """Generate all tuples of positive integers having length conjunct_length
882 and sum mod_count. As a special case, () is such a tuple having length 0
885 >>> for i in range(6): print i, list(generate_delta_bag_counts(4, i))
889 2 [(3, 1), (2, 2), (1, 3)]
890 3 [(2, 1, 1), (1, 2, 1), (1, 1, 2)]
893 >>> list(generate_delta_bag_counts(0, 0))
895 >>> list(generate_delta_bag_counts(0, 1))
899 if conjunct_length
== 0:
900 return [()] if mod_count
== 0 else []
901 if mod_count
< conjunct_length
:
903 return gen_delta_bag_counts(conjunct_length
- 1, mod_count
,
904 [0] * conjunct_length
)
907 def set_context_conjuncts(context
, mass_regime_index
, N_cj
, C_cj
, R_cj
):
908 assert len(N_cj
) < 1 and len(C_cj
) < 1
911 context
.N_delta
= N_cj
[0][5][mass_regime_index
][1]
914 context
.C_delta
= C_cj
[0][5][mass_regime_index
][1]
915 context
.delta_bag_lookup
.clear()
916 context
.delta_bag_lookup
.resize(ord('Z')+1)
917 context
.delta_bag_delta
.clear()
918 for n
, cj
in enumerate(R_cj
):
919 context
.delta_bag_delta
.append(cj
[5][mass_regime_index
][1])
921 context
.delta_bag_lookup
[ord(r
)] \
922 = context
.delta_bag_lookup
[ord(r
)] + (n
,)
925 def search_all(options
, context
, score_statistics
):
926 """Search sequence database against searchable spectra."""
928 mod_limit
= XTP
["potential_mod_limit"]
930 mod_conjunct_triples
= get_mod_conjunct_triples(XTP
["potential_mods"],
932 info("%s unique potential modification conjuncts",
933 len(mod_conjunct_triples
))
934 debug("mod_conjunct_triples (unique):\n%s", pformat(mod_conjunct_triples
))
936 mass_regimes
= XTP
["mass_regimes"]
937 pca_table
= get_pca_table(mass_regimes
)
938 debug("pca_table: %s", pca_table
)
940 total_combinations_searched
= 0
942 for mod_count
in range(mod_limit
+ 1):
943 context
.mod_count
= mod_count
944 for mr_index
, (mass_regime
, pca_entry
) in enumerate(zip(mass_regimes
,
946 context
.mass_regime_index
= mr_index
947 for pca_res
, pca_parent_delta
, pca_frag_delta
in pca_entry
:
948 context
.pca_residues
= pca_res
949 context
.pca_delta
= pca_frag_delta
950 for cji
, (N_cj
, C_cj
, R_cj
) in enumerate(mod_conjunct_triples
):
952 continue # mutually exclusive, for now
953 set_context_conjuncts(context
, mr_index
, N_cj
, C_cj
, R_cj
)
954 debug("mod_count: %s", mod_count
)
955 debug("cj_triple: N=%s C=%s R=%s", N_cj
, C_cj
, R_cj
)
956 for delta_bag
in generate_delta_bag_counts(mod_count
,
958 debug("delta_bag: %s", delta_bag
)
960 # this clear() avoids an SWIG/STL bug!?
961 context
.delta_bag_count
.clear()
962 context
.delta_bag_count
[:] = delta_bag
964 pmrf
= CP
.parent_mass_regime
[mr_index
].fixed_residue_mass
965 fmrf
= CP
.fragment_mass_regime
[mr_index
].fixed_residue_mass
966 p_fx
= (pmrf
[ord('[')]
967 + (N_cj
and N_cj
[0][5][mr_index
][0] or 0)
969 + (C_cj
and C_cj
[0][5][mr_index
][0] or 0)
970 + pca_parent_delta
+ PROTON_MASS
)
971 context
.parent_fixed_mass
= p_fx
972 f_N_fx
= (fmrf
[ord('[')]
973 + (N_cj
and N_cj
[0][5][mr_index
][1] or 0)
975 context
.fragment_N_fixed_mass
= f_N_fx
976 f_C_fx
= (fmrf
[ord(']')]
977 + (C_cj
and C_cj
[0][5][mr_index
][1] or 0)
978 + CP
.fragment_mass_regime
[mr_index
].water_mass
)
979 context
.fragment_C_fixed_mass
= f_C_fx
981 info("MC=%s MR=%s PCA=%s CJ=%s DB=%s"
982 % (mod_count
, mr_index
, pca_res
, cji
, delta_bag
))
983 debug("p_fx %s f_N_fx %s f_C_fx %s"
984 % (p_fx
, f_N_fx
, f_C_fx
))
986 score_statistics
.combinations_searched
= 0
987 cgreylag
.spectrum
.search_runs(context
,
989 total_combinations_searched \
990 += score_statistics
.combinations_searched
991 info(" %s candidate spectra examined, this bag",
992 score_statistics
.combinations_searched
)
994 info('%s candidate spectra examined',
995 score_statistics
.candidate_spectrum_count
)
998 def get_prefix_sequence(begin_pos
, run_offset
, sequence
):
1000 >>> get_prefix_sequence(0, 0, 'abcdefghi')
1002 >>> get_prefix_sequence(1, 0, 'abcdefghi')
1004 >>> get_prefix_sequence(3, 0, 'abcdefghi')
1006 >>> get_prefix_sequence(4, 0, 'abcdefghi')
1008 >>> get_prefix_sequence(4, 2, 'abcdefghi')
1012 prefix_start
= run_offset
+ begin_pos
- 4
1013 s
= sequence
[max(0, prefix_start
):prefix_start
+4]
1018 def get_suffix_sequence(end_pos
, run_offset
, sequence
):
1020 >>> get_suffix_sequence(0, 0, 'abcdefghi')
1022 >>> get_suffix_sequence(4, 0, 'abcdefghi')
1024 >>> get_suffix_sequence(6, 0, 'abcdefghi')
1026 >>> get_suffix_sequence(8, 0, 'abcdefghi')
1028 >>> get_suffix_sequence(4, 2, 'abcdefghi')
1032 suffix_start
= run_offset
+ end_pos
1033 s
= sequence
[suffix_start
:suffix_start
+4]
1039 def results_dump(score_statistics
, searchable_spectra
):
1040 """Return a result dict mapping spectrum names to (spectrum_metadata,
1041 best_matches) pairs. (Unneeded fields are stripped.)
1045 spectrum_metadata_fs
= set(['name', 'file_id', 'mass', 'charge',
1046 'total_ion_current', 'comparisons'])
1047 py_s_spectra
= pythonize_swig_object(searchable_spectra
,
1048 only_fields
=spectrum_metadata_fs
)
1049 py_matches
= pythonize_swig_object(score_statistics
.best_matches
,
1050 skip_fields
=['spectrum_index'])
1051 assert len(py_s_spectra
) == len(py_matches
)
1053 for sp_metadata
, sp_matches
in zip(py_s_spectra
, py_matches
):
1054 assert sp_metadata
['name'] not in r
, "duplicate spectrum name"
1055 r
[sp_metadata
['name']] = (sp_metadata
, sp_matches
)
1060 def main(args
=sys
.argv
[1:]):
1061 parser
= optparse
.OptionParser(usage
=
1062 "usage: %prog [options]"
1063 " <configuration-file> <ms2-file>...",
1064 description
=__doc__
, version
=__version__
)
1065 pa
= parser
.add_option
1066 pa("-P", "--parameter", nargs
=2, dest
="parameters", action
="append",
1068 help="override a parameter in <parameter-file>, may be used multiple"
1069 " times", metavar
="NAME VALUE")
1070 pa("-w", "--work-slice", nargs
=2, type="float", dest
="work_slice",
1071 help="search a subinterval [L:U) of the work space"
1072 " (where 0 <= L <= U <= 1) in standalone mode", metavar
="L U")
1073 pa("--job-id", dest
="job_id", default
="unknown",
1074 help="used to generate unique output filenames [default 'unknown']")
1075 pa("-q", "--quiet", action
="store_true", dest
="quiet", help="no warnings")
1076 pa("-p", "--show-progress", action
="store_true", dest
="show_progress",
1077 help="show running progress")
1078 pa("--estimate", action
="store_true", dest
="estimate_only",
1079 help="just estimate the time required for the search")
1080 pa("-v", "--verbose", action
="store_true", dest
="verbose",
1082 pa("--copyright", action
="store_true", dest
="copyright",
1083 help="print copyright and exit")
1084 pa("--debug", action
="store_true", dest
="debug",
1085 help="output debugging info")
1086 pa("--profile", action
="store_true", dest
="profile",
1087 help="dump Python profiling output to './greylag.prof.<pid>'")
1088 (options
, args
) = parser
.parse_args(args
=args
)
1090 if options
.copyright
:
1098 configuration_fn
= args
[0]
1099 spectrum_fns
= args
[1:]
1101 if (any(True for f
in spectrum_fns
if not f
.endswith('.ms2'))
1102 or (options
.work_slice
1103 and not (0 <= options
.work_slice
[0]
1104 <= options
.work_slice
[1] <= 1))):
1108 log_level
= logging
.WARNING
1110 log_level
= logging
.ERROR
1112 log_level
= logging
.INFO
1114 log_level
= logging
.DEBUG
1115 logging
.basicConfig(level
=log_level
, datefmt
='%b %e %H:%M:%S',
1116 format
='%(asctime)s %(levelname)s: %(message)s')
1117 info("starting on %s", gethostname())
1119 # check spectrum basename uniqueness, as corresponding sqt files will be
1120 # in a single directory
1121 base_spectrum_fns
= [ os
.path
.basename(fn
) for fn
in spectrum_fns
]
1122 if len(base_spectrum_fns
) != len(set(base_spectrum_fns
)):
1123 error("base spectrum filenames must be unique")
1125 # check -P names for validity
1126 bad_names
= (set(n
for n
,v
in options
.parameters
)
1127 - set(PARAMETER_INFO
.keys()))
1129 error("bad -P parameter names %s" % list(bad_names
))
1132 cp
= ConfigParser
.RawConfigParser()
1133 cp
.optionxform
= str # be case-sensitive
1134 with
open(configuration_fn
) as configuration_file
:
1135 cp
.readfp(configuration_file
)
1136 if not cp
.has_section('greylag'):
1137 error("%s has no [greylag] section" % configuration_fn
)
1138 parameters
= dict(cp
.items('greylag'))
1139 parameters
.update(dict(options
.parameters
)) # command-line override
1141 XTP
= validate_parameters(parameters
)
1143 fixed_mod_map
= dict((r
[3], r
) for r
in XTP
["pervasive_mods"])
1144 initialize_spectrum_parameters(options
, XTP
["mass_regimes"], fixed_mod_map
)
1147 databases
= XTP
["databases"].split()
1148 # [(locusname, defline, seq, filename), ...]
1149 fasta_db
= list(read_fasta_files(databases
))
1150 # [(idno, offset, locusname, defline, seq, seq_filename), ...]
1152 for idno
, (locusname
, defline
, sequence
, filename
) in enumerate(fasta_db
):
1153 db
.extend(split_sequence_into_aa_runs(idno
, locusname
, defline
,
1154 sequence
, filename
))
1155 db_residue_count
= sum(len(dbi
[3]) for dbi
in db
)
1157 info("read %s sequences (%s runs, %s residues)", len(fasta_db
), len(db
),
1159 max_run_length
= max(len(r
[3]) for r
in db
)
1160 info("max run length is %s residues", max_run_length
)
1161 if max_run_length
> 2**31 - 1:
1162 error("runs longer than %s not yet supported", max_run_length
)
1164 error("no database sequences")
1166 # read spectrum offset indices
1167 spectrum_offset_indices
= []
1168 for spfn
in spectrum_fns
:
1169 idxfn
= spfn
+ '.idx'
1170 with contextlib
.closing(gzip
.open(idxfn
)) as idxf
:
1171 idx
= cPickle
.load(idxf
)
1172 # try to verify that index matches spectrum file
1173 if idx
['file size'] != os
.path
.getsize(spfn
):
1174 error("index '%s' needs rebuilding" % idxfn
)
1175 spectrum_offset_indices
.append(idx
['offsets'])
1177 # FIX: assume standalone mode (for now)
1178 assert options
.work_slice
1180 # read spectra per work slice
1181 spectra
= read_spectra_slice(spectrum_fns
, spectrum_offset_indices
,
1183 del spectrum_offset_indices
1184 spectra
.sort(key
=lambda x
: x
.mass
)
1186 def peak_statistics(spectra
):
1187 counts
= [ len(sp
.peaks
) for sp
in spectra
]
1190 return (counts
[0], counts
[int(n
*0.25)], counts
[int(n
*0.5)],
1191 counts
[int(n
*0.75)], counts
[-1], sum(counts
) / float(n
))
1194 info("read %s spectra (mass range %s - %s)", len(spectra
),
1195 spectra
[0].mass
, spectra
[-1].mass
)
1196 info(" peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1197 % peak_statistics(spectra
))
1199 warning("no input spectra")
1201 # filter and normalize spectra
1203 sp
.filter_peaks(XTP
["TIC_cutoff_proportion"],
1204 CP
.parent_mass_tolerance_max
)
1205 sp
.classify(XTP
["intensity_class_count"], XTP
["intensity_class_ratio"],
1206 XTP
["fragment_mass_tolerance"])
1208 min_psm
= XTP
["min_parent_spectrum_mass"]
1209 max_psm
= XTP
["max_parent_spectrum_mass"]
1210 # FIX: also filter by 1 + 2 + 4 rule?
1211 spectra
= [ sp
for sp
in spectra
1212 if len(sp
.peaks
) >= 10 and min_psm
<= sp
.mass
<= max_psm
]
1215 info("after filtering:")
1216 info(" %s spectra (mass range %s - %s)", len(spectra
),
1217 spectra
[0].mass
, spectra
[-1].mass
)
1218 info(" peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
1219 % peak_statistics(spectra
))
1221 cgreylag
.spectrum
.set_searchable_spectra(spectra
)
1222 score_statistics
= cgreylag
.score_stats(len(spectra
),
1223 XTP
["best_result_count"])
1226 del spectra
# release memory
1229 # (cleavage_re, position of cleavage in cleavage_re)
1230 cleavage_motif
= "[X]|[X]"
1231 cleavage_pattern
, cleavage_pos
= cleavage_motif_re(cleavage_motif
)
1232 if cleavage_pos
== None:
1233 error("cleavage site '%s' is missing '|'",
1234 XTP
["protein, cleavage site"])
1235 cleavage_pattern
= re
.compile(cleavage_pattern
)
1237 context
= cgreylag
.search_context()
1238 for idno
, offset
, locusname
, defline
, seq
, seq_filename
in db
:
1240 if cleavage_motif
!= "[X]|[X]":
1241 cp
= list(generate_cleavage_points(cleavage_pattern
,
1243 sr
= cgreylag
.sequence_run(idno
, offset
, seq
, cp
, locusname
)
1244 context
.sequence_runs
.append(sr
)
1245 context
.maximum_missed_cleavage_sites
= 1000000000 # FIX
1248 search_all(options
, context
, score_statistics
)
1250 warning("no spectra after filtering--search skipped")
1252 if options
.estimate_only
:
1253 print ("%.2f generic CPU hours"
1254 % (score_statistics
.candidate_spectrum_count
/ 300.0e6
))
1257 info("writing result file")
1258 # FIX: this is only for standalone mode
1259 result_fn
= 'grind_%s_%s-%s.gwr' % (options
.job_id
, options
.work_slice
[0],
1260 options
.work_slice
[1])
1261 with contextlib
.closing(gzip
.open(result_fn
, 'w')) as result_file
:
1262 d
= { 'version' : __version__
,
1263 'matches' : results_dump(score_statistics
,
1264 cgreylag
.cvar
.spectrum_searchable_spectra
),
1265 'total comparisons' : score_statistics
.candidate_spectrum_count
,
1266 'spectrum files' : base_spectrum_fns
,
1267 'databases' : databases
,
1269 'mass regime atomic masses' : MASS_REGIME_ATOMIC_MASSES
,
1270 'proton mass' : PROTON_MASS
,
1272 cPickle
.dump(d
, result_file
, cPickle
.HIGHEST_PROTOCOL
)
1273 info("finished, result file written to '%s'", result_fn
)
1276 if __name__
== '__main__':
1280 # ###psyco.bind(formula_mass)
1281 # warning('psyco enabled')
1282 # except ImportError:
1286 if '--profile' in sys
.argv
:
1289 report_fn
= "greylag.prof.%s" % os
.getpid()
1290 data_fn
= report_fn
+ ".tmp"
1291 prof
= cProfile
.run('main()', data_fn
)
1292 with
open(report_fn
, 'w') as report_f
:
1294 stats
= pstats
.Stats(data_fn
, stream
=report_f
)
1296 stats
.sort_stats('cumulative')
1297 stats
.print_stats(50)
1298 stats
.sort_stats('time')
1299 stats
.print_stats(50)
1300 print "# profile report written to '%s'" % report_fn
1311 logging
.exception("unhandled exception")
1318 # - need to rigorously check for bad input in any file (fasta, spectrum, xml)