greylag_sqt.py

   1 #!/usr/bin/env python
   2
   3 '''Generate a set of sqt files from the specified greylag search result
   4 file.  The sqt files will be created in the current directory, with names
   5 corresponding to the searched ms2 files.
   6 '''
   7
   8 from __future__ import with_statement
   9
  10 __copyright__ = '''
  11     greylag, a collection of programs for MS/MS protein analysis
  12     Copyright (C) 2006-2008  Stowers Institute for Medical Research
  13
  14     This program is free software: you can redistribute it and/or modify
  15     it under the terms of the GNU General Public License as published by
  16     the Free Software Foundation, either version 3 of the License, or
  17     (at your option) any later version.
  18
  19     This program is distributed in the hope that it will be useful,
  20     but WITHOUT ANY WARRANTY; without even the implied warranty of
  21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22     GNU General Public License for more details.
  23
  24     You should have received a copy of the GNU General Public License
  25     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  26
  27     Contact: Mike Coleman
  28              Stowers Institute for Medical Research
  29              1000 East 50th Street
  30              Kansas City, Missouri  64110
  31              USA
  32 '''
  33
  34
  35 import cPickle as pickle
  36 import math
  37 import optparse
  38 import os.path
  39 from pprint import pprint
  40 import sys
  41
  42 import greylag
  43
  44
  45 # gc kills our performance, so disable it.  gc only matters for cycles, which
  46 # we (hope we) don't create.  See the gc module docs.
  47 import gc; gc.disable()
  48
  49
  50 def warn(s):
  51     print >> sys.stderr, 'warning:', s
  52 def error(s):
  53     sys.exit('error: ' + s)
  54
  55 # errors are fatal
  56 greylag.chase_error = error
  57
  58
  59 def generate_leaves(tree):
  60     """Given a tree made of lists, yield the leaves.
  61
  62     >>> list(generate_leaves([(1,2,3)]))
  63     [(1, 2, 3)]
  64     >>> list(generate_leaves([[[(1,2,3)]], (4,5)]))
  65     [(1, 2, 3), (4, 5)]
  66
  67     """
  68     if not isinstance(tree, list):
  69         yield tree
  70         return
  71     for b in tree:
  72         for l in generate_leaves(b):
  73             yield l
  74
  75
  76 # Example legacy headers:
  77 # H       Database        /foo/bar/db.fasta
  78 # H       PrecursorMasses AVG
  79 # H       FragmentMasses  MONO
  80 # H       StaticMod       C=160.1388
  81 # H       DiffMod M*=+16.0
  82 # H       DiffMod STY#=+80.0
  83
  84 def print_legacy_headers(f, r):
  85     """Output legacy H lines that readers of traditional SQT files may
  86     expect.  Warns if multiple databases, or multiple or isotopic mass regimes
  87     are present.
  88     """
  89
  90     if len(r['databases']) > 1:
  91         warn("multiple databases present")
  92     for db in r['databases']:
  93         print >> f, "H\tDatabase\t%s" % db
  94
  95     precursor_masses, fragment_masses = 'MONO', 'MONO'
  96     mass_regimes = r['parameters']['mass_regimes']
  97     if len(mass_regimes) > 1:
  98         warn("multiple mass regimes present--legacy SQT headers will be based"
  99              "on just the first")
 100     parent_regime, fragment_regime = mass_regimes[0]
 101     if parent_regime[1] or fragment_regime[1]:
 102         warn("isotopic mass regimes present--ignored for legacy SQT headers")
 103     assert parent_regime[0] in ('MONO', 'AVG')
 104     assert fragment_regime[0] in ('MONO', 'AVG')
 105     print >> f, "H\tPrecursorMasses\t%s" % parent_regime[0]
 106     print >> f, "H\tFragmentMasses\t%s" % fragment_regime[0]
 107
 108     # FIX: merge this with mass calculation code in
 109     #      greylag.initialize_spectrum_parameters?
 110     # FIX: currently assuming MONO only (what would be better?)
 111     static = dict((res, greylag.formula_mass(greylag.RESIDUE_FORMULA[res]))
 112                   for res in greylag.RESIDUES)
 113     for m in r['parameters']['pervasive_mods']:
 114         if isinstance(m[1], float):
 115             static[m[3]] += m[0] * m[1]
 116         else:
 117             static[m[3]] += m[0] * greylag.formula_mass(m[1])
 118     # all residues get a StaticMod because our figures are more exact than the
 119     # "default" ones
 120     for res in static:
 121         print >> f, "H\tStaticMod\t%s=%s" % (res, static[res])
 122
 123     diffs = set(generate_leaves(r['parameters']['potential_mods']))
 124     for m in diffs:
 125         symbol = m[5]
 126         if not symbol:
 127             warn("unmarked potential mod present--omitting")
 128             continue
 129         if isinstance(m[1], float):
 130             delta = m[0] * m[1]
 131         else:
 132             delta = m[0] * greylag.formula_mass(m[1])
 133         print >> f, "H\tDiffMod\t%s%s=%+f" % (m[3], m[5], delta)
 134
 135
 136 def print_header(f, r):
 137     """Output H lines."""
 138     print >> f, "H\tSQTGenerator\tgreylag"
 139     print >> f, "H\tSQTGeneratorVersion\t%s" % greylag.VERSION
 140
 141     print_legacy_headers(f, r)
 142
 143     for pk, pv in sorted(r['parameters'].items()):
 144         print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
 145
 146
 147 def print_regime_manifest(f, regime_manifest):
 148     """Output R lines."""
 149     for regime_no, residue, mass in regime_manifest:
 150         print >> f, "R\t%s\t%s\t%s" % (regime_no, residue, mass)
 151
 152
 153 def generate_marked_sequence(marks, peptide_sequence):
 154     """Yield the characters in a marked version of peptide_sequence.  If there
 155     is both a terminal mark and a mark on a terminal residue, the latter is
 156     omitted.
 157
 158     >>> ''.join(generate_marked_sequence({}, 'ASDF'))
 159     'ASDF'
 160     >>> ''.join(generate_marked_sequence({ 'N' : '^', 1 : '*' }, 'ASDF'))
 161     'A^S*DF'
 162     >>> ''.join(generate_marked_sequence({ 'N' : '^', 0 : '*' }, 'ASDF'))
 163     'A^SDF'
 164
 165     """
 166
 167     for n, c in enumerate(peptide_sequence):
 168         yield c
 169         if n == 0 and 'N' in marks:
 170             yield marks.get('N')
 171         elif n == len(peptide_sequence)-1 and 'C' in marks:
 172             yield marks.get('C')
 173         else:
 174             yield marks.get(n, '')
 175
 176
 177 def print_spectrum(f, modification_conjucts, result, enhanced=False):
 178     """Print the lines (S/M/L/A*) associated with the given spectrum."""
 179     spectrum, sp_best_matches = result
 180
 181     scan_low, scan_high, charge = spectrum['name'].rsplit('.', 2)
 182     print >> f, '\t'.join(str(v) for v in
 183                           ["S", scan_low, scan_high, charge, 0, 'honk',
 184                            spectrum['mass'],
 185                            round(math.log(spectrum['total_ion_current']), 4),
 186                            0, spectrum['comparisons']])
 187
 188     best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
 189     if not best_scores:
 190         return
 191
 192     # score -> rank
 193     rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
 194
 195     best_score = best_scores[0]
 196     for match in sp_best_matches:
 197         score = match['score']
 198         if score == 0:
 199             continue                    # null entry
 200         rank = rank_map[score]
 201         score_delta = (best_score - score) / best_score
 202
 203         mass_regime_index = match.get('mass_regime_index', 0)
 204         conjunct_index = match.get('conjunct_index', 0)
 205
 206         am_lines = []                   # residue mods
 207         marks = {}
 208         for mt in match.get('mass_trace', []):
 209             conjunct_item = modification_conjucts[conjunct_index][2][mt['conjunct_item_index']]
 210             mark = conjunct_item[5]
 211             if mark:
 212                 assert mt['position'] not in marks, "not yet implemented"
 213                 marks[mt['position']] = mark
 214             if enhanced:
 215                 name = conjunct_item[4] or ''
 216                 delta = conjunct_item[6][mass_regime_index][1]
 217                 am_lines.append(["AM", mt['position'], round(delta, 8), name])
 218
 219         at_lines = []                   # terminal mods
 220         N_conjunct_item = modification_conjucts[conjunct_index][0]
 221         if N_conjunct_item:
 222             assert len(N_conjunct_item) == 1
 223             mark = N_conjunct_item[0][5]
 224             if mark:
 225                 marks['N'] = mark
 226             if enhanced:
 227                 name = conjunct_item[4] or ''
 228                 delta = conjunct_item[6][mass_regime_index][1]
 229                 at_lines.append(["AT", 'N', round(delta, 8), name])
 230         C_conjunct_item = modification_conjucts[conjunct_index][1]
 231         if C_conjunct_item:
 232             assert len(C_conjunct_item) == 1
 233             mark = C_conjunct_item[0][5]
 234             if mark:
 235                 marks['C'] = mark
 236             if enhanced:
 237                 name = conjunct_item[4] or ''
 238                 delta = conjunct_item[6][mass_regime_index][1]
 239                 at_lines.append(["AT", 'C', round(delta, 8), name])
 240
 241         marked_sequence = match['peptide_sequence']
 242         if marks:
 243             marked_sequence = ''.join(generate_marked_sequence(marks,
 244                                                                marked_sequence))
 245
 246         # FIX: also need M lines for fixed mods, terminal mods, isotope mods
 247         # (N15), etc.
 248         # Note: scores, being log probability, are non-positive, but we
 249         # flip the sign in the SQT output
 250         print >> f, '\t'.join(str(v) for v in
 251                               ["M", rank, rank,
 252                                round(match['predicted_parent_mass'], 5),
 253                                round(score_delta, 4), round(-score, 4), 0,
 254                                # FIX: 1 of 2 ions found--keep DTASelect happy
 255                                1, 2,
 256                                '.'.join((match['N_peptide_flank'],
 257                                          marked_sequence,
 258                                          match['C_peptide_flank'])),
 259                                'U'])
 260
 261         if enhanced:
 262             if mass_regime_index != 0:
 263                 print >> f, "AR\t%s" % mass_regime_index
 264             if match.get('pca_delta', 0) != 0:
 265                 print >> f, "APCA\t%s" % match['pca_delta']
 266             for am_line in sorted(am_lines):
 267                 print >> f, '\t'.join(str(v) for v in am_line)
 268             for at_line in at_lines:
 269                 print >> f, '\t'.join(str(v) for v in at_line)
 270
 271         for sequence_name, peptide_begin in match['loci']:
 272             print >> f, 'L\t%s\t%s' % (sequence_name, peptide_begin)
 273
 274
 275 def dump_result_file(result_file):
 276     """Pretty-print result file contents."""
 277     try:
 278         r = None
 279         while True:
 280             r = pickle.load(result_file)
 281             pprint(r)
 282     except EOFError, e:
 283         if r != 'complete':
 284             error("result file truncated or invalid")
 285
 286
 287 def generate_results(result_file):
 288     while True:
 289         try:
 290             r = pickle.load(result_file)
 291         except EOFError:
 292             error("result file truncated or invalid")
 293         if r == 'complete':
 294             return
 295         yield r
 296
 297
 298 def main(args=sys.argv[1:]):
 299     parser = optparse.OptionParser(usage=
 300                                    "usage: %prog [options] <result-file>",
 301                                    description=__doc__, version=greylag.VERSION)
 302     pa = parser.add_option
 303     pa("-d", "--output-directory", dest="output_directory",
 304        help="directory where output files are written [default is '.']",
 305        metavar="DIR")
 306     pa("-e", "--enhanced-output", action="store_true", dest="enhanced_output",
 307        help="include extra DTASelect-incompatible information in output")
 308     pa("-v", "--verbose", action="store_true", dest="verbose",
 309        help="be verbose")
 310     pa("--dump", action="store_true", dest="dump",
 311        help="just dump the result file to stdout (for debugging)")
 312     pa("--copyright", action="store_true", dest="copyright",
 313        help="print copyright and exit")
 314     (options, args) = parser.parse_args(args=args)
 315
 316     if options.copyright:
 317         print __copyright__
 318         sys.exit(0)
 319
 320     if len(args) != 1:
 321         parser.print_help()
 322         sys.exit(1)
 323
 324     result_file = open(args[0], 'rb')
 325
 326     if options.dump:
 327         dump_result_file(result_file)
 328         return
 329
 330     header = pickle.load(result_file)
 331
 332     spectrum_fns = header['spectrum files']
 333     assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
 334     assert all(os.path.dirname(fn) == '' for fn in spectrum_fns)
 335
 336     basenames = [ os.path.splitext(spectrum_fn)[0]
 337                   for spectrum_fn in spectrum_fns ]
 338
 339     # spectrum file index -> sqt file
 340     spectrum_filemap = dict((n, open(fn+'.sqt', 'w'))
 341                             for n, fn in enumerate(basenames))
 342
 343     for sqt_file in spectrum_filemap.values():
 344         print_header(sqt_file, header)
 345         print_regime_manifest(sqt_file, header['mass regime manifest'])
 346
 347     for result in generate_results(result_file):
 348         if result:
 349             print_spectrum(spectrum_filemap[result[0]['file_id']],
 350                            header['modification conjuncts'], result,
 351                            options.enhanced_output)
 352
 353
 354 if __name__ == '__main__':
 355     main()