3 '''This program does the actual work to search mass spectra against a sequence
4 database. <job-id> is a unique identifier used as a prefix for work and
5 output directories. The <parameter-file> contains program options. The
6 spectra are in <ms2-file>s; greylag-index-spectra must already have been run
9 This program can operate in standalone mode (useful for testing) or command
10 mode. If the --work-slice option is given, that slice will be searched in
11 standalone mode, then the program will exit. Otherwise, the program will
12 process commands until an exit command is received.
16 from __future__
import with_statement
19 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2 of the License, or
24 (at your option) any later version.
26 This program is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 GNU General Public License for more details.
31 You should have received a copy of the GNU General Public License along
32 with this program; if not, write to the Free Software Foundation, Inc.,
33 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
36 ##############################################################################
37 ## "Simplicity is prerequisite for reliability" - Edsger W. Dijkstra ##
38 ##############################################################################
45 from collections
import defaultdict
52 from logging
import debug
, info
, warning
56 from pprint
import pprint
, pformat
58 from socket
import gethostname
# Try to drop dead immediately on SIGINT (control-C), instead of normal Python
# KeyboardInterrupt processing, since we may spend long periods of time in
# uninterruptible C++ calls. Also die immediately on SIGPIPE.
signal.signal(signal.SIGINT, signal.SIG_DFL)
# NOTE(review): SIGPIPE is POSIX-only -- this line would raise on Windows;
# presumably not a supported platform here, but confirm.
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
77 logging
.error(s
, *args
)
78 if __name__
!= "__main__":
79 raise Exception("(fatal error, but unit testing, so not exiting)")
83 def fileerror(s
, *args
):
84 error(s
+ (", at line %s of file '%s'"
85 % (fileinput
.filelineno(), fileinput
.filename())),
89 # name -> value map of processed XML input parameters
92 # handle to the singleton parameter object shared with the C++ module
93 CP
= cgreylag
.cvar
.parameters_the
95 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
96 PROTON_MASS
= 1.007276
97 ELECTRON_MASS
= 0.000549 # ?
99 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
100 MONOISOTOPIC_ATOMIC_MASS
= {
103 'N' : 14.00307400529,
104 'O' : 15.994914622115,
109 # most prevalent only (1 in 1000)
110 ISOTOPIC_ATOMIC_MASS
= { # prevalence (in %)
111 'C13' : 13.003354837810, # 1.078
112 'N15' : 15.00010889849, # 0.3687
113 'O18' : 17.99916049, # 0.20514
114 'S33' : 32.9714585012, # 0.762
115 'S34' : 33.9678668311, # 4.2928
118 AVERAGE_ATOMIC_MASS
= {
127 # The xtandem average residue masses are about 0.002 amu higher than those
128 # calculated directly from the above average atomic masses. None of the
129 # chemists consulted knew of any reason why, aside from lack of precision in
130 # the average atomic mass estimates. This shouldn't matter very much, as
131 # fragmentation calculations should all be monoisotopic, and we can always
132 # widen the parent tolerance window a bit.
def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
    """Return the mass of formula, using the given atom -> mass map.

    >>> formula_mass('H2O', { 'H':1, 'O':16 })
    18
    """
    # re.split with a capturing group yields an alternating list like
    # ['', 'C', '2', 'H', '3', ...]; drop the leading '' and default any
    # missing count to '1' (so glycine 'C2H3ON' gives
    # ['C', '2', 'H', '3', 'O', '1', 'N', '1']).
    tokens = [ t or '1' for t in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
    total = 0
    for element, count in zip(tokens[0::2], tokens[1::2]):
        total += atomic_mass[element] * int(count)
    return total
# FIX: selenocysteine (U), etc
# Sorted one-letter codes of the residues we know about (the keys of
# RESIDUE_FORMULA).
RESIDUES = sorted(RESIDUE_FORMULA.keys())
# '[' and ']' denote the peptide N- and C-terminus, respectively.
RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']


# accessed as a global variable (FIX?)
# [0][1] -> 'H' -> fragment mass of H for regime 0
# Populated by initialize_spectrum_parameters: one [parent, fragment] pair of
# atom -> mass maps appended per mass regime.
MASS_REGIME_ATOMIC_MASSES = []
185 def mass_regime_atomic_masses(spec
):
186 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
189 name
, isotopes
= spec
190 assert name
in ['MONO', 'AVG'] and len(isotopes
) <= 1
192 r
= MONOISOTOPIC_ATOMIC_MASS
.copy()
194 r
= AVERAGE_ATOMIC_MASS
.copy()
196 iname
, prevalence
= isotopes
[0]
197 assert iname
== 'N15' and 0 <= prevalence
<= 1
198 # this is a simplification, but additional accuracy pointless?
200 r
['N'] = ISOTOPIC_ATOMIC_MASS
['N15']
202 r
['N'] += (ISOTOPIC_ATOMIC_MASS
['N15'] - r
['N']) * prevalence
206 def initialize_spectrum_parameters(options
, mass_regimes
, fixed_mod_map
):
207 """Initialize parameters known to the spectrum module.
208 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
211 debug('fixed_mod_map: %s', fixed_mod_map
)
212 # This is the size of vectors that are indexed by residues (A-Z) or
213 # special characters ('[]').
214 RESIDUE_LIMIT
= max(ord(c
) for c
in 'Z[]') + 1
216 # These are currently monoisotopic. (deuterium pointless?)
217 CP
.proton_mass
= PROTON_MASS
218 CP
.hydrogen_mass
= formula_mass("H")
220 global MASS_REGIME_ATOMIC_MASSES
221 for regime_pair
in mass_regimes
:
222 assert len(regime_pair
) == 2 # parent and fragment
223 info('mass regime: %s', regime_pair
)
224 MASS_REGIME_ATOMIC_MASSES
.append([])
225 for n
, regime
in enumerate(regime_pair
):
226 atmass
= mass_regime_atomic_masses(regime
)
227 MASS_REGIME_ATOMIC_MASSES
[-1].append(atmass
)
228 creg
= cgreylag
.mass_regime_parameters()
230 creg
.hydroxyl_mass
= formula_mass("OH", atmass
)
231 creg
.water_mass
= formula_mass("H2O", atmass
)
232 creg
.ammonia_mass
= formula_mass("NH3", atmass
)
234 creg
.fixed_residue_mass
.resize(RESIDUE_LIMIT
)
236 for r
in RESIDUES_W_BRACKETS
:
239 m
= formula_mass(RESIDUE_FORMULA
[r
], atmass
)
240 rmod
= fixed_mod_map
.get(r
)
242 if isinstance(rmod
[1], str):
244 m
+= rmod
[0] * formula_mass(rmod
[1])
246 m
+= rmod
[0] * formula_mass(rmod
[1], atmass
)
248 m
+= rmod
[0] * rmod
[1]
249 creg
.fixed_residue_mass
[ord(r
)] = m
250 # assuming these are monoisotopic (not regime)
251 creg
.fixed_residue_mass
[ord('[')] += formula_mass("H")
252 creg
.fixed_residue_mass
[ord(']')] += formula_mass("OH")
254 CP
.parent_mass_regime
.append(creg
);
256 creg
.fixed_residue_mass
[ord('[')] -= CP
.hydrogen_mass
257 creg
.fixed_residue_mass
[ord(']')] -= creg
.hydroxyl_mass
258 CP
.fragment_mass_regime
.append(creg
);
259 for r
in RESIDUES_W_BRACKETS
:
260 info('fixed_mass %s: %s', r
,
262 % (CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)],
263 CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)])
264 for rn
in range(len(mass_regimes
)) ])
266 for rn
in range(len(mass_regimes
)):
267 # physically impossible (and results would be garbage)
268 if CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
269 raise ValueError('bogus parent mass specification for %s' % r
)
270 if CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
271 raise ValueError('bogus parent mass specification for %s' % r
)
273 CP
.parent_mass_tolerance_1
= XTP
["parent_mz_tolerance"]
274 CP
.parent_mass_tolerance_max
= (XTP
["parent_mz_tolerance"]
275 * XTP
["charge_limit"])
277 CP
.fragment_mass_tolerance
= XTP
["fragment_mass_tolerance"]
278 CP
.intensity_class_count
= XTP
["intensity_class_count"]
280 CP
.minimum_peptide_length
= XTP
["min_peptide_length"]
282 # CP.ln_factorial[n] == ln(n!)
283 CP
.ln_factorial
.resize(int(XTP
["max_parent_spectrum_mass"]
284 / XTP
["fragment_mass_tolerance"] + 100), 0.0)
285 for n
in range(2, len(CP
.ln_factorial
)):
286 CP
.ln_factorial
[n
] = CP
.ln_factorial
[n
-1] + math
.log(n
)
288 CP
.estimate_only
= bool(options
.estimate_only
)
289 CP
.show_progress
= bool(options
.show_progress
)
292 def cleavage_motif_re(motif
):
293 """Return (regexp, pos), where regexp is a regular expression that will
294 match a cleavage motif, and pos is the position of the cleavage with
295 respect to the match (or None). (The RE actually matches one character,
296 the rest matching as lookahead, so that re.finditer will find all
297 overlapping matches.)
299 >>> cleavage_motif_re('[KR]|{P}')
301 >>> cleavage_motif_re('[X]|[X]')
307 motif_re
= re
.compile(r
'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
308 parts
= [ p
[0] for p
in motif_re
.findall(motif
) ]
309 if ''.join(parts
) != motif
:
310 raise ValueError('invalid cleavage motif pattern')
314 if cleavage_pos
!= None:
315 raise ValueError("invalid cleavage motif pattern"
322 re_parts
.append('[%s]' % part
[1:-1])
324 re_parts
.append('[^%s]' % part
[1:-1])
326 assert False, "unknown cleavage motif syntax"
328 if len(re_parts
) == 0:
330 elif len(re_parts
) == 1:
331 re_pattern
= re_parts
[0]
333 re_pattern
= re_parts
[0] + '(?=' + ''.join(re_parts
[1:]) + ')'
334 return (re_pattern
, cleavage_pos
)
337 def generate_cleavage_points(cleavage_re
, cleavage_pos
, sequence
):
338 """Yields the offsets of the cleavages in sequence. The endpoints are
339 always included, by convention.
341 >>> list(generate_cleavage_points(re.compile('[KR](?=[^P])'), 1,
347 for m
in cleavage_re
.finditer(sequence
):
348 p
= m
.start() + cleavage_pos
349 if 0 < p
< len(sequence
):
# Matches a maximal run of the 20 standard amino acid one-letter codes.
AA_SEQUENCE = re.compile(r'[ARNDCQEGHILKMFPSTWYV]+')

def split_sequence_into_aa_runs(idno, defline, sequence, filename):
    """Returns a tuple (idno, start, defline, seq, filename) for each
    contiguous run of residues in sequence, where 'start' is the position of
    the run within sequence.

    >>> pprint(split_sequence_into_aa_runs(123, 'defline', 'STSS*DEFABA',
    ...                                    'filename'))
    [(123, 0, 'defline', 'STSS', 'filename'),
     (123, 5, 'defline', 'DEFA', 'filename'),
     (123, 10, 'defline', 'A', 'filename')]
    """
    # (the enumerate index previously generated here was never used)
    return [ (idno, m.start(), defline, m.group(), filename)
             for m in AA_SEQUENCE.finditer(sequence) ]
372 def read_fasta_files(filenames
):
373 """Yield (defline, sequence, filename) tuples as read from FASTA files
374 (uppercasing sequence)."""
377 for line
in fileinput
.input(filenames
):
381 yield (defline
, ''.join(seqs
), fileinput
.filename())
383 fileerror("bad format: line precedes initial defline")
387 seqs
.append(line
.upper())
389 yield (defline
, ''.join(seqs
), fileinput
.filename())
392 def read_spectra_slice(spectrum_fns
, offset_indices
, slice):
394 assert 0 <= s_l
<= s_u
<= 1
396 total_spectra
= sum(len(oi
) for oi
in offset_indices
)
397 sp_l
= int(float(s_l
) * total_spectra
)
398 sp_u
= int(float(s_u
) * total_spectra
)
403 for f_no
, (fn
, oi
) in enumerate(zip(spectrum_fns
, offset_indices
)):
404 seeing_u
= seeing_l
+ len(oi
)
405 f_l
= max(sp_l
, seeing_l
)
406 f_u
= min(sp_u
, seeing_u
)
409 b_l
= oi
[f_l
- seeing_l
]
410 b_u
= oi
[f_u
- seeing_l
] if f_u
- seeing_l
< len(oi
) else -1 # oo
413 fsp
= cgreylag
.spectrum
.read_spectra_from_ms2(f
, f_no
,
421 # XML parameter file processing
423 # FIX: This parsing is way too complex. How to simplify?
425 def mass_regime_part(part_specification
):
426 """Parse a single mass regime specification part.
428 >>> mass_regime_part('MONO')
430 >>> mass_regime_part('AVG')
432 >>> mass_regime_part('MONO(N15@87.5%)')
433 ('MONO', [('N15', 0.875)])
436 ps
= [ x
.strip() for x
in part_specification
.partition('(') ]
437 if ps
[0] not in ('MONO', 'AVG'):
438 raise ValueError("invalid mass regime list specification"
439 " (regime id must be 'MONO' or 'AVG')")
443 raise ValueError("invalid mass regime list specification"
445 pps
= [ x
.strip() for x
in ps
[2][:-1].split(',') ]
447 raise ValueError("invalid mass regime list specification"
448 " (multiple isotopes not yet implemented)")
449 ppps
= [ x
.strip() for x
in pps
[0].partition('@') ]
451 raise ValueError("invalid mass regime list specification"
453 if ppps
[0] not in ('N15',):
454 raise ValueError("invalid mass regime list specification"
455 " (isotope id must currently be 'N15')")
456 if ppps
[2][-1] != '%':
457 raise ValueError("invalid mass regime list specification"
459 prevalence
= float(ppps
[2][:-1]) / 100
460 if not (0 <= prevalence
<= 1):
461 raise ValueError("invalid mass regime list specification"
462 " (prevalence must be in range 0-100%)")
463 return (ps
[0], [(ppps
[0], prevalence
)])
465 def mass_regime_list(mass_regime_list_specification
):
466 """Check and return a list of regime tuples (parent_regime,
467 fragment_regime), where each regime is a tuple (id, [(isotope_id,
468 prevalence), ...]). Multiple isotopes (when implemented) would be
471 >>> pprint(mass_regime_list('AVG/MONO;MONO;MONO(N15@75%)'))
472 [[('AVG', []), ('MONO', [])],
473 [('MONO', []), ('MONO', [])],
474 [('MONO', [('N15', 0.75)]), ('MONO', [('N15', 0.75)])]]
478 for regspec
in mass_regime_list_specification
.split(';'):
479 halves
= [ x
.strip() for x
in regspec
.split('/') ]
481 raise ValueError("invalid mass regime list specification"
483 pr
= [ mass_regime_part(h
) for h
in halves
]
485 pr
= [ pr
[0], pr
[0] ]
487 debug("mass regime list:\n%s", pformat(result
))
489 # The first fragmentation regime should generally be MONO, so that
490 # formulaic deltas with '!' do the expected thing.
491 if result
[0][1] != ('MONO', []):
492 raise ValueError("first fragmentation regime was something other than"
493 " 'MONO' with no isotopes--this is almost certainly"
494 " not what was intended")
498 def parse_mod_term(s
, is_potential
=False):
499 """Parse a modification term, returning a tuple (sign, mod, fixed_regime,
500 residues, description).
502 >>> parse_mod_term('-C2H3ON!@C')
503 (-1, 'C2H3ON', True, 'C', None)
504 >>> parse_mod_term('42@STY phosphorylation', is_potential=True)
505 (1, 42.0, False, 'STY', 'phosphorylation')
509 m
= re
.match(r
'^\s*(-|\+)?(([1-9][0-9.]*)|([A-Z][A-Z0-9]*))(!?)'
510 r
'@([A-Z]+|\[|\])(\s+([A-Za-z0-9_]+))?\s*$', s
)
512 raise ValueError("invalid modification term specification"
515 invalid_residues
= set(mg
[5]) - set(RESIDUES_W_BRACKETS
)
517 raise ValueError("invalid modification list specification"
518 " (invalid residues %s)" % list(invalid_residues
))
522 if is_potential
and abs(delta
) < 0.0001:
523 raise ValueError("invalid modification list specification"
524 " (delta '%s' is too small)" % delta
)
526 if not is_potential
and len(residues
) != 1:
527 raise ValueError("invalid modification list specification '%s' (only"
528 " potential modifications may have multiple residues)"
530 if len(residues
) != len(set(residues
)):
531 raise ValueError("invalid modification list specification"
532 " '%s' (duplicate residues prohibited)"
534 return (mg
[0] == '-' and -1 or 1, delta
, mg
[4] == '!', residues
, mg
[7])
537 def fixed_mod_list(specification
):
538 """Check and return a list of modification tuples.
540 >>> fixed_mod_list('57@C')
541 [(1, 57.0, False, 'C', None)]
542 >>> fixed_mod_list('57@C,CH!@N desc')
543 [(1, 57.0, False, 'C', None), (1, 'CH', True, 'N', 'desc')]
544 >>> fixed_mod_list('57@C,58@C')
545 Traceback (most recent call last):
547 ValueError: invalid modification list specification '['C', 'C']' (duplicate residues prohibited)
550 if not specification
:
552 result
= [ parse_mod_term(s
) for s
in specification
.split(',') ]
553 residues
= [ x
[3] for x
in result
]
554 # this check is across terms; the one above is within terms
555 if len(residues
) != len(set(residues
)):
556 raise ValueError("invalid modification list specification"
557 " '%s' (duplicate residues prohibited)"
559 debug("fixed_mod_list:\n%s", pformat(result
))
563 def parse_mod_basic_expression(s
):
566 tree
, rest
= parse_mod_disjunction(s
[1:])
569 raise ValueError("invalid modification list specification"
570 " (expected matching ')')")
571 return tree
, rest
[1:]
572 parts
= re
.split(r
'([;,()])', s
, 1)
576 assert len(parts
) == 3
577 term
, rest
= parts
[0], parts
[1]+parts
[2]
578 return parse_mod_term(term
, is_potential
=True), rest
580 def parse_mod_conjunction(s
):
583 tree
, s
= parse_mod_basic_expression(s
)
591 def parse_mod_disjunction(s
):
594 tree
, s
= parse_mod_conjunction(s
)
602 def potential_mod_list(specification
):
603 """Check and return a tree of potential modification tuples. Nodes at
604 even (odd) levels are disjunctions (conjunctions). (The top list is a
607 >>> potential_mod_list('')
609 >>> potential_mod_list('PO3H@STY; C2H2O@KST')
610 [[(1, 'PO3H', False, 'STY', None)], [(1, 'C2H2O', False, 'KST', None)]]
611 >>> potential_mod_list('PO3H@STY, C2H2O@KST')
612 [[(1, 'PO3H', False, 'STY', None), (1, 'C2H2O', False, 'KST', None)]]
613 >>> pprint(potential_mod_list('''(PO3H@STY phosphorylation;
614 ... C2H2O@KST acetylation;
615 ... CH2@AKST methylation),
616 ... O@M oxidation'''))
617 [[[[(1, 'PO3H', False, 'STY', 'phosphorylation')],
618 [(1, 'C2H2O', False, 'KST', 'acetylation')],
619 [(1, 'CH2', False, 'AKST', 'methylation')]],
620 (1, 'O', False, 'M', 'oxidation')]]
623 if not specification
:
625 tree
, remainder
= parse_mod_disjunction(specification
)
627 raise ValueError("invalid modification list specification"
628 " (unexpected '%s')" % remainder
)
629 debug("potential_mod_list:\n%s", pformat(tree
))
def p_positive(x):
    """True iff x is strictly greater than zero."""
    return 0 < x
def p_negative(x):
    """True iff x is strictly less than zero."""
    return 0 > x
def p_nonnegative(x):
    """True iff x is zero or greater."""
    return 0 <= x
def p_proportion(x):
    """True iff x lies in the closed interval [0, 1]."""
    return 0 <= x and x <= 1
638 # Configuration file parameter specification
640 # name -> (type, default, check_fn)
641 # type may be bool, int, float, str, modlist, or a tuple of values
642 # default, as a string value, or None if value must be explicitly specified
643 # check_fn is an optional function that returns True iff the value is valid
646 "databases" : (str, None),
647 "decoy_locus_prefix" : (str, "SHUFFLED_"),
648 "mass_regimes" : (mass_regime_list
, "MONO"),
649 "pervasive_mods" : (fixed_mod_list
, ""),
650 "potential_mods" : (potential_mod_list
, ""),
651 "potential_mod_limit" : (int, 2, p_nonnegative
),
652 "charge_limit" : (int, 3, p_positive
),
653 "min_peptide_length" : (int, 5, p_positive
), # needed?
654 "min_parent_spectrum_mass" : (float, 0, p_nonnegative
),
655 "max_parent_spectrum_mass" : (float, 10000, p_nonnegative
),
656 "TIC_cutoff_proportion" : (float, 0.98, p_proportion
),
657 "parent_mz_tolerance" : (float, 1.25, p_nonnegative
),
658 "fragment_mass_tolerance" : (float, 0.5, p_nonnegative
),
659 "intensity_class_count" : (int, 3, p_positive
),
660 "intensity_class_ratio" : (float, 2.0, p_positive
), # really > 1.0?
661 "best_result_count" : (int, 5, p_positive
),
665 def validate_parameters(parameters
, parameter_info
=PARAMETER_INFO
):
666 """Verify that parameters are valid, have valid values, and correspond to
667 currently implemented functionality. Values are converted, default values
668 are filled in, and the resulting name/value dict returned.
670 >>> sorted(validate_parameters({'spectrum, total peaks' : '40'},
671 ... {'spectrum, total peaks'
672 ... : (int, '50', p_positive),
673 ... 'output, spectra'
674 ... : (bool, 'no')}).items())
675 [('output, spectra', False), ('spectrum, total peaks', 40)]
680 for p_name
, p_info
in sorted(parameter_info
.items()):
683 check_fn
= len(p_info
) > 2 and p_info
[2] or None
685 v
= parameters
.get(p_name
)
688 debug("parameter '%s' defaulting to '%s'", p_name
, default
)
691 error("missing required parameter '%s'" % p_name
)
692 if isinstance(type_
, tuple):
694 error("parameter '%s' value '%s' not in %s (feature not"
695 " implemented?)" % (p_name
, v
, type_
))
697 v
= { 'yes' : True, 'no' : False }.get(v
)
699 error("parameter '%s' requires a value of 'yes' or 'no'")
703 except ValueError, e
:
704 error("parameter '%s' has value '%s' with invalid format [%s]"
706 if check_fn
and not check_fn(v
):
707 error("parameter '%s' has invalid value '%s' (or feature not"
708 " implemented)" % (p_name
, v
))
711 unknown_parameters
= set(parameters
) - set(parameter_info
)
712 if unknown_parameters
:
713 warning("%s unknown parameter(s):\n %s"
714 % (len(unknown_parameters
),
715 pformat(sorted(list(unknown_parameters
)))))
719 def generate_mass_bands(band_count
, mass_list
):
720 """Yield (n, mass_lb, mass_ub) for each mass band, where n ranges from 1
721 to band_count. To generate the bands, the mass list (which is assumed
722 already sorted by mass) is evenly partitioned into bands with masses in
723 the range [mass_lb, mass_ub).
725 >>> list(generate_mass_bands(1, [ float(x) for x in range(100) ]))
727 >>> list(generate_mass_bands(4, [ float(x) for x in range(100) ]))
728 [(1, 0.0, 25.0), (2, 25.0, 50.0), (3, 50.0, 75.0), (4, 75.0, 100.0)]
729 >>> list(generate_mass_bands(3, [ float(x) for x in range(100) ]))
730 [(1, 0.0, 34.0), (2, 34.0, 68.0), (3, 68.0, 100.0)]
733 assert band_count
> 0 and mass_list
734 assert sorted(mass_list
) == list(mass_list
)
735 band_size
= int(math
.ceil(float(len(mass_list
)) / band_count
))
737 for bn
in range(1, band_count
):
738 i
= min(bn
*band_size
, len(mass_list
)-1)
742 # Since the upper bound is exclusive, we add a little slop to make sure
743 # the last spectrum is included.
744 yield band_count
, lb
, round(mass_list
[-1]+1)
747 def pythonize_swig_object(o
, only_fields
=None, skip_fields
=[]):
748 """Creates a pure Python copy of a SWIG object, so that it can be easily
749 pickled, or printed (for debugging purposes). Each SWIG object is
750 pythonized as a dictionary, for convenience. If provided, 'only_fields',
751 limits the copy to the list of fields specified. Otherwise,
752 'skip_fields' if given is a list of methods not to include (this helps
753 avoid infinite recursion).
755 >>> pprint(pythonize_swig_object(cgreylag.score_stats(3)))
756 struct({'best_match': [[], [], []], 'best_score': [100.0, 100.0, 100.0], 'candidate_spectrum_count': 0, 'combinations_searched': 0, 'hyperscore_histogram': [[], [], []], 'improved_candidates': 0, 'second_best_score': [100.0, 100.0, 100.0], 'spectra_with_candidates': 0})
760 if isinstance(o
, str):
767 return list(pythonize_swig_object(x
, only_fields
, skip_fields
)
769 if hasattr(o
, '__swig_getmethods__'):
771 for a
in o
.__swig
_getmethods
__:
772 if (only_fields
!= None and a
in only_fields
773 or only_fields
== None and a
not in skip_fields
):
774 s
[a
] = pythonize_swig_object(getattr(o
, a
), only_fields
,
779 # A possible PCA (pyrrolidone carboxyl acid) modification accounts for
780 # circularization of the peptide N-terminal. PCA mods are excluded if a
781 # static N-terminal mod has been specified. Likewise, choosing a PCA mod will
782 # exclude choosing a potential N-terminal mod. (The PCA mod for 'C' is
783 # dependent on a static mod of C+57 being in effect.)
def get_pca_table(mass_regimes):
    """Return a list of tuples (residues, parent delta, fragment delta)
    describing the PCA possibilities for each mass regime. Residues is a
    string specifying the residues this PCA delta applies to.
    """
    # FIX: According to Xtandem, C is only a candidate for PCA if
    # carboxyamidomethylated (C+57). Currently we always search it.
    table = []
    for r in range(len(mass_regimes)):
        parent = CP.parent_mass_regime[r]
        fragment = CP.fragment_mass_regime[r]
        # pyroglutamate from E loses water; from Q (or carboxyamidomethyl-C)
        # loses ammonia
        table.append([('', 0, 0),
                      ('E', -1 * parent.water_mass, -1 * fragment.water_mass),
                      ('QC', -1 * parent.ammonia_mass,
                       -1 * fragment.ammonia_mass)])
    return table
800 def enumerate_conjunction(mod_tree
, limit
, conjuncts
=[]):
802 if 0 < len(conjuncts
) <= limit
:
805 first
, rest
= mod_tree
[0], mod_tree
[1:]
806 if isinstance(first
, list):
807 for x
in enumerate_disjunction(first
, limit
):
808 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ x
):
811 for y
in enumerate_conjunction(rest
, limit
, conjuncts
):
813 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ [first
]):
816 def enumerate_disjunction(mod_tree
, limit
=sys
.maxint
):
817 """Generates the conjuncts for mod_tree that are no longer than limit.
819 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
820 [[], ['a'], ['b'], ['c']]
821 >>> list(enumerate_disjunction([[1,2,3]]))
822 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
823 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
824 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
825 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
826 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
827 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
831 assert isinstance(mod_tree
, list)
834 for s
in enumerate_conjunction(b
, limit
):
837 def get_mod_conjunct_triples(mod_tree
, limit
):
838 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
839 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
840 ordered by increasing number of conjuncts, and with duplicates within and
841 across removed. A mass table is also appended to each conjunct.
843 # FIX: is there a more elegant way or place for all this?
845 def rmass(regime_index
, par_frag
, sign
, delta
, is_mono
):
846 global MASS_REGIME_ATOMIC_MASSES
847 if isinstance(delta
, str):
850 return (sign
* formula_mass(delta
,
851 MASS_REGIME_ATOMIC_MASSES
[regime_index
][par_frag
]))
855 sign
, delta
, is_mono
= t
[:3]
856 return t
+ (tuple((rmass(r
, 0, sign
, delta
, is_mono
),
857 rmass(r
, 1, sign
, delta
, is_mono
))
858 for r
in range(len(XTP
["mass_regimes"]))),)
861 Ns
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == '['))
862 Cs
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == ']'))
863 assert len(Ns
) <= 1 and len(Cs
) <= 1
864 rest
= [ enmass(x
) for x
in c
if x
[3] not in '[]' ]
865 rest
= tuple(sorted(list(frozenset(rest
))))
866 return (Ns
, Cs
, rest
)
868 return sorted(list(frozenset(triple(conjunct
)
870 in enumerate_disjunction(mod_tree
, limit
))),
871 key
=lambda x
: (sum(len(y
) for y
in x
), x
))
874 def gen_delta_bag_counts(i
, remainder
, bag
):
880 for delta
in range(1, remainder
-i
+1):
882 for x
in gen_delta_bag_counts(i
-1, remainder
-delta
, bag
):
885 def generate_delta_bag_counts(mod_count
, conjunct_length
):
886 """Generate all tuples of positive integers having length conjunct_length
887 and sum mod_count. As a special case, () is such a tuple having length 0
890 >>> for i in range(6): print i, list(generate_delta_bag_counts(4, i))
894 2 [(3, 1), (2, 2), (1, 3)]
895 3 [(2, 1, 1), (1, 2, 1), (1, 1, 2)]
898 >>> list(generate_delta_bag_counts(0, 0))
900 >>> list(generate_delta_bag_counts(0, 1))
904 if conjunct_length
== 0:
905 return [()] if mod_count
== 0 else []
906 if mod_count
< conjunct_length
:
908 return gen_delta_bag_counts(conjunct_length
- 1, mod_count
,
909 [0] * conjunct_length
)
912 def set_context_conjuncts(context
, mass_regime_index
, N_cj
, C_cj
, R_cj
):
913 assert len(N_cj
) < 1 and len(C_cj
) < 1
916 context
.N_delta
= N_cj
[0][5][mass_regime_index
][1]
919 context
.C_delta
= C_cj
[0][5][mass_regime_index
][1]
920 context
.delta_bag_lookup
.clear()
921 context
.delta_bag_lookup
.resize(ord('Z')+1)
922 context
.delta_bag_delta
.clear()
923 for n
, cj
in enumerate(R_cj
):
924 context
.delta_bag_delta
.append(cj
[5][mass_regime_index
][1])
926 context
.delta_bag_lookup
[ord(r
)] \
927 = context
.delta_bag_lookup
[ord(r
)] + (n
,)
930 def search_all(options
, context
, score_statistics
):
931 """Search sequence database against searchable spectra."""
933 mod_limit
= XTP
["potential_mod_limit"]
935 mod_conjunct_triples
= get_mod_conjunct_triples(XTP
["potential_mods"],
937 info("%s unique potential modification conjuncts",
938 len(mod_conjunct_triples
))
939 debug("mod_conjunct_triples (unique):\n%s", pformat(mod_conjunct_triples
))
941 mass_regimes
= XTP
["mass_regimes"]
942 pca_table
= get_pca_table(mass_regimes
)
943 debug("pca_table: %s", pca_table
)
945 total_combinations_searched
= 0
947 for mod_count
in range(mod_limit
+ 1):
948 context
.mod_count
= mod_count
949 for mr_index
, (mass_regime
, pca_entry
) in enumerate(zip(mass_regimes
,
951 context
.mass_regime_index
= mr_index
952 for pca_res
, pca_parent_delta
, pca_frag_delta
in pca_entry
:
953 context
.pca_residues
= pca_res
954 context
.pca_delta
= pca_frag_delta
955 for cji
, (N_cj
, C_cj
, R_cj
) in enumerate(mod_conjunct_triples
):
957 continue # mutually exclusive, for now
958 set_context_conjuncts(context
, mr_index
, N_cj
, C_cj
, R_cj
)
959 debug("mod_count: %s", mod_count
)
960 debug("cj_triple: N=%s C=%s R=%s", N_cj
, C_cj
, R_cj
)
961 for delta_bag
in generate_delta_bag_counts(mod_count
,
963 debug("delta_bag: %s", delta_bag
)
965 # this clear() avoids an SWIG/STL bug!?
966 context
.delta_bag_count
.clear()
967 context
.delta_bag_count
[:] = delta_bag
969 pmrf
= CP
.parent_mass_regime
[mr_index
].fixed_residue_mass
970 fmrf
= CP
.fragment_mass_regime
[mr_index
].fixed_residue_mass
971 p_fx
= (pmrf
[ord('[')]
972 + (N_cj
and N_cj
[0][5][mr_index
][0] or 0)
974 + (C_cj
and C_cj
[0][5][mr_index
][0] or 0)
975 + pca_parent_delta
+ PROTON_MASS
)
976 context
.parent_fixed_mass
= p_fx
977 f_N_fx
= (fmrf
[ord('[')]
978 + (N_cj
and N_cj
[0][5][mr_index
][1] or 0)
980 context
.fragment_N_fixed_mass
= f_N_fx
981 f_C_fx
= (fmrf
[ord(']')]
982 + (C_cj
and C_cj
[0][5][mr_index
][1] or 0)
983 + CP
.fragment_mass_regime
[mr_index
].water_mass
)
984 context
.fragment_C_fixed_mass
= f_C_fx
986 info("MC=%s MR=%s PCA=%s CJ=%s DB=%s"
987 % (mod_count
, mr_index
, pca_res
, cji
, delta_bag
))
988 debug("p_fx %s f_N_fx %s f_C_fx %s"
989 % (p_fx
, f_N_fx
, f_C_fx
))
991 score_statistics
.combinations_searched
= 0
992 cgreylag
.spectrum
.search_runs(context
,
994 total_combinations_searched \
995 += score_statistics
.combinations_searched
996 info(" %s candidate spectra examined, this bag",
997 score_statistics
.combinations_searched
)
999 info('%s candidate spectra examined',
1000 score_statistics
.candidate_spectrum_count
)
1003 def get_prefix_sequence(begin_pos
, run_offset
, sequence
):
1005 >>> get_prefix_sequence(0, 0, 'abcdefghi')
1007 >>> get_prefix_sequence(1, 0, 'abcdefghi')
1009 >>> get_prefix_sequence(3, 0, 'abcdefghi')
1011 >>> get_prefix_sequence(4, 0, 'abcdefghi')
1013 >>> get_prefix_sequence(4, 2, 'abcdefghi')
1017 prefix_start
= run_offset
+ begin_pos
- 4
1018 s
= sequence
[max(0, prefix_start
):prefix_start
+4]
1023 def get_suffix_sequence(end_pos
, run_offset
, sequence
):
1025 >>> get_suffix_sequence(0, 0, 'abcdefghi')
1027 >>> get_suffix_sequence(4, 0, 'abcdefghi')
1029 >>> get_suffix_sequence(6, 0, 'abcdefghi')
1031 >>> get_suffix_sequence(8, 0, 'abcdefghi')
1033 >>> get_suffix_sequence(4, 2, 'abcdefghi')
1037 suffix_start
= run_offset
+ end_pos
1038 s
= sequence
[suffix_start
:suffix_start
+4]
def clean_string(v):
    """Strip and collapse internal whitespace.

    >>> clean_string(' one  two ')
    'one two'
    """
    # collapsing first turns any leading/trailing whitespace into single
    # spaces, which the final strip then removes
    return re.sub(r'\s+', ' ', v).strip()
def clean_defline(s):
    """Return the given string with tabs replaced by spaces and control
    and non-ASCII characters removed, then stripped.
    """
    tabless = s.replace('\t', ' ')
    # keep only printable ASCII (0x20 ' ' through 0x7e '~')
    printable = ''.join(c for c in tabless if ' ' <= c <= '~')
    return printable.strip()
1067 def abbrev_defline(s
):
1068 """Return an abbreviated version of the defline--about 80 characters.
1070 >>> abbrev_defline('words ' * 10)
1071 'words words words words words words words words words words '
1072 >>> abbrev_defline('words ' * 20)
1073 'words words words words words words words words words words words words words words...'
1076 ab
= re
.match(r
'.{,80}\S{,170}', s
).group(0)
1077 if len(ab
) < len(s
):
def zopen(filename, mode='r', compresslevel=None):
    """Open a filename as with 'open', but using compression if indicated by
    the filename suffix. The compression level defaults to 1 for .gz files
    and 9 for .bz2 files (presumably bzip2 will only be used if size matters
    much more than compression time)."""
    if filename.endswith('.gz'):
        import gzip
        return gzip.GzipFile(filename, mode, compresslevel or 1)
    elif filename.endswith('.bz2'):
        import bz2
        # BUG FIX: compresslevel must be passed by keyword--BZ2File's third
        # positional parameter is 'buffering' (Python 2), so passing the
        # level positionally silently set the wrong parameter
        return bz2.BZ2File(filename, mode, compresslevel=compresslevel or 9)
    else:
        return open(filename, mode)
def main(args=sys.argv[1:]):
    """Command-line entry point: parse options, read the parameter file and
    FASTA databases, read/filter/normalize the spectra, run the search, and
    pickle the results to a compressed file.

    NOTE(review): a number of lines of this function (conditional bodies,
    some continuation arguments) are missing from this excerpt; each gap is
    marked '[elided ...]' below.  Only comments have been added--no code
    was changed.
    """
    # NOTE(review): args=sys.argv[1:] is evaluated once at import time;
    # presumably intentional, but confirm main() is never called after
    # sys.argv is mutated.
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <job-id>"
                                   " <parameter-file> <ms2-file>...",
                                   description=__doc__, version=__version__)
    pa = parser.add_option              # shorthand for the calls below
    pa("-P", "--parameter", nargs=2, dest="parameters", action="append",
       # [elided line(s)]
       help="override a parameter in <parameter-file>, may be used multiple"
       " times", metavar="NAME VALUE")
    pa("-w", "--work-slice", nargs=2, type="float", dest="work_slice",
       help="search a subinterval [L:U) of the work space"
       " (where 0 <= L <= U <= 1) in standalone mode", metavar="L U")
    pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
    pa("-p", "--show-progress", action="store_true", dest="show_progress",
       help="show running progress")
    pa("--estimate", action="store_true", dest="estimate_only",
       help="just estimate the time required for the search")
    pa("-v", "--verbose", action="store_true", dest="verbose",
       # [elided: help text / closing of this call]
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    pa("--debug", action="store_true", dest="debug",
       help="output debugging info")
    pa("--profile", action="store_true", dest="profile",
       help="dump Python profiling output to './greylag.prof.<pid>'")
    (options, args) = parser.parse_args(args=args)

    if options.copyright:
        # [elided: print copyright text and exit]

    # args[0] is presumably the <job-id>; argument-count validation is in
    # an elided block above -- TODO confirm
    parameter_fn = args[1]
    spectrum_fns = args[2:]

    # every spectrum file must be .ms2, and any work slice must be a valid
    # subinterval of [0, 1]
    if (any(True for f in spectrum_fns if not f.endswith('.ms2'))
        or (options.work_slice
            and not (0 <= options.work_slice[0]
                     <= options.work_slice[1] <= 1))):
        # [elided: usage error handling]

    # NOTE(review): in the original these four assignments are each guarded
    # by (elided) quiet/verbose/debug option checks; as excerpted they would
    # simply override each other
    log_level = logging.WARNING
    log_level = logging.ERROR
    log_level = logging.INFO
    log_level = logging.DEBUG
    logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
                        format='%(asctime)s %(levelname)s: %(message)s')
    info("starting on %s", gethostname())

    # prevent format char problems
    # [elided: the guard condition, presumably "if '%' in job_id:"]
    error("<job-id> may not contain '%'")

    # check spectrum basename uniqueness, as corresponding sqt files will be
    # in a single directory
    base_spectrum_fns = [ os.path.basename(fn) for fn in spectrum_fns ]
    if len(base_spectrum_fns) != len(set(base_spectrum_fns)):
        error("base spectrum filenames must be unique")

    # check -P names for validity
    bad_names = (set(n for n,v in options.parameters)
                 - set(PARAMETER_INFO.keys()))
    # [elided: guard, presumably "if bad_names:"]
    error("bad -P parameter names %s" % list(bad_names))

    # read the [greylag] section of the parameter file, case-sensitively,
    # then apply any -P command-line overrides on top
    cp = ConfigParser.RawConfigParser()
    cp.optionxform = str # be case-sensitive
    with open(parameter_fn) as parameter_file:
        cp.readfp(parameter_file)
    if not cp.has_section('greylag'):
        error("%s has no [greylag] section" % parameter_fn)
    parameters = dict(cp.items('greylag'))
    parameters.update(dict(options.parameters)) # command-line override

    XTP = validate_parameters(parameters)

    # residue (element 3 of the mod tuple) -> its fixed-modification tuple
    fixed_mod_map = dict((r[3], r) for r in XTP["pervasive_mods"])
    initialize_spectrum_parameters(options, XTP["mass_regimes"], fixed_mod_map)

    # [(defline, seq, filename), ...]
    databases = XTP["databases"].split()
    fasta_db = list(read_fasta_files(databases))
    # [(idno, offset, defline, seq, seq_filename), ...]
    # [elided: initialization, presumably "db = []"]
    for idno, (defline, sequence, filename) in enumerate(fasta_db):
        db.extend(split_sequence_into_aa_runs(idno, defline, sequence,
        # [elided: remaining argument(s) and call close]
    db_residue_count = sum(len(dbi[3]) for dbi in db)

    # (idno, offset) -> (defline, run_seq, filename)
    db_info = dict(((idno, offset), (defline, seq, filename))
                   for idno, offset, defline, seq, filename in db)

    info("read %s sequences (%s runs, %s residues)", len(fasta_db), len(db),
    # [elided: final argument, presumably db_residue_count]
    max_run_length = max(len(r[3]) for r in db)
    info("max run length is %s residues", max_run_length)
    if max_run_length > 2**31 - 1:
        error("runs longer than %s not yet supported", max_run_length)
    # [elided: guard, presumably "if not db:"]
    error("no database sequences")

    # read spectrum offset indices
    spectrum_offset_indices = []
    for spfn in spectrum_fns:
        idxfn = spfn + '.idx'
        with contextlib.closing(gzip.open(idxfn)) as idxf:
            idx = cPickle.load(idxf)
            # try to verify that index matches spectrum file
            if idx['file size'] != os.path.getsize(spfn):
                error("index '%s' needs rebuilding" % idxfn)
            spectrum_offset_indices.append(idx['offsets'])

    # FIX: assume standalone mode (for now)
    assert options.work_slice

    # read spectra per work slice
    spectra = read_spectra_slice(spectrum_fns, spectrum_offset_indices,
    # [elided: remaining argument(s), presumably options.work_slice]
    spectra.sort(key=lambda x: x.mass)

    # [elided: guard, presumably "if not spectra:" ... else:]
    warning("no input spectra")
    info("read %s spectra (mass range %s - %s)", len(spectra),
         spectra[0].mass, spectra[-1].mass)

    def peak_statistics(spectra):
        # summarize per-spectrum peak counts: min, quartiles, max, mean
        counts = [ len(sp.peaks) for sp in spectra ]
        # [elided: presumably "counts.sort()" and "n = len(counts)"]
        return (counts[0], counts[int(n*0.25)], counts[int(n*0.5)],
                counts[int(n*0.75)], counts[-1], sum(counts) / float(n))

    info(" peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
         % peak_statistics(spectra))

    # filter and normalize spectra
    # [elided: loop header, presumably "for sp in spectra:"]
    sp.filter_peaks(XTP["TIC_cutoff_proportion"],
                    CP.parent_mass_tolerance_max)
    sp.classify(XTP["intensity_class_count"], XTP["intensity_class_ratio"],
                XTP["fragment_mass_tolerance"])

    min_psm = XTP["min_parent_spectrum_mass"]
    max_psm = XTP["max_parent_spectrum_mass"]
    # FIX: also filter by 1 + 2 + 4 rule?
    # keep only spectra with enough peaks and a parent mass in range
    spectra = [ sp for sp in spectra
                if len(sp.peaks) >= 10 and min_psm <= sp.mass <= max_psm ]

    info("after filtering:")
    info(" %s spectra (mass range %s - %s)", len(spectra),
         spectra[0].mass, spectra[-1].mass)
    info(" peak stats: %s..%s..%s..%s..%s (mean=%.2f)"
         % peak_statistics(spectra))

    # hand the spectra to the C++ search engine and free the Python copies
    cgreylag.spectrum.set_searchable_spectra(spectra)
    score_statistics = cgreylag.score_stats(len(spectra),
                                            XTP["best_result_count"])
    del spectra # release memory

    # (cleavage_re, position of cleavage in cleavage_re)
    # NOTE(review): cleavage_motif is hard-coded to the "cleave anywhere"
    # motif here rather than taken from XTP -- TODO confirm intended
    cleavage_motif = "[X]|[X]"
    cleavage_pattern, cleavage_pos = cleavage_motif_re(cleavage_motif)
    if cleavage_pos == None:
        error("cleavage site '%s' is missing '|'",
              XTP["protein, cleavage site"])
    cleavage_pattern = re.compile(cleavage_pattern)

    context = cgreylag.search_context()
    for idno, offset, defline, seq, seq_filename in db:
        if cleavage_motif != "[X]|[X]":
            # NOTE(review): 'cp' is reused here for cleavage points,
            # shadowing the ConfigParser bound above
            cp = list(generate_cleavage_points(cleavage_pattern,
            # [elided: remaining argument(s) and call close]
        locus_name = defline.split(None, 1)[0]
        sr = cgreylag.sequence_run(idno, offset, seq, cp, locus_name)
        context.sequence_runs.append(sr)
    context.maximum_missed_cleavage_sites = 1000000000 # FIX

    # [elided: guard, presumably "if spectra remain:" ... else:]
    search_all(options, context, score_statistics)
    warning("no spectra after filtering--search skipped")

    if options.estimate_only:
        # 300.0e6 candidate-spectrum comparisons per CPU hour is the
        # calibration constant used for the estimate
        print ("%.2f generic CPU hours"
               % (score_statistics.candidate_spectrum_count / 300.0e6))
        # [elided: presumably a return here]

    info("writing result file")
    result_fn = 'test.result.gz'
    py_score_statistics = pythonize_swig_object(score_statistics)

    # only put spectrum info that's actually needed into the result file
    spectrum_metadata_fields = set(['name', 'file_id', 'mass', 'charge',
                                    'total_ion_current', 'comparisons'])
    spectrum_metadata = \
        pythonize_swig_object(cgreylag.cvar.spectrum_searchable_spectra,
                              only_fields=spectrum_metadata_fields)

    with contextlib.closing(zopen(result_fn, 'w')) as result_file:
        cPickle.dump({ 'matches' : py_score_statistics,
                       'spectra' : spectrum_metadata,
                       'spectrum files' : base_spectrum_fns,
                       'databases' : databases,
                       # [elided: additional dict entries]
                       'mass regime atomic masses' : MASS_REGIME_ATOMIC_MASSES,
                       'proton mass' : PROTON_MASS,
                       'argv' : sys.argv },
                     # [elided: file argument, presumably result_file]
                     cPickle.HIGHEST_PROTOCOL)
    info("finished, result file written to '%s'", result_fn)
if __name__ == '__main__':
    # NOTE(review): the surrounding try/except (psyco import attempt and the
    # top-level exception handler) is partially elided from this excerpt;
    # gaps are marked below.  Only comments have been added.
    # ###psyco.bind(formula_mass)
    # warning('psyco enabled')
    # except ImportError:

    if '--profile' in sys.argv:
        # profile the whole run, then write a two-way-sorted report
        # (by cumulative time and by own time) to a per-pid file
        report_fn = "greylag.prof.%s" % os.getpid()
        data_fn = report_fn + ".tmp"
        prof = cProfile.run('main()', data_fn)
        with open(report_fn, 'w') as report_f:
            stats = pstats.Stats(data_fn, stream=report_f)
            stats.sort_stats('cumulative')
            stats.print_stats(50)
            stats.sort_stats('time')
            stats.print_stats(50)
            # NOTE(review): Python 2 print statement (file predates py3)
            print "# profile report written to '%s'" % report_fn
    # [elided: presumably "else: main()" and the enclosing try/except]
    logging.exception("unhandled exception")
1369 # - need to rigorously check for bad input in any file (fasta, spectrum, xml)