greylag_sqt.py

   1 #!/usr/bin/env python
   2
   3 '''Generate a set of sqt files from the specified greylag search result
   4 file.  The sqt files will be created in the current directory, with names
   5 corresponding to the searched ms2 files.
   6 '''
   7
   8 from __future__ import with_statement
   9
  10 __copyright__ = '''
  11     greylag, a collection of programs for MS/MS protein analysis
  12     Copyright (C) 2006-2008  Stowers Institute for Medical Research
  13
  14     This program is free software: you can redistribute it and/or modify
  15     it under the terms of the GNU General Public License as published by
  16     the Free Software Foundation, either version 3 of the License, or
  17     (at your option) any later version.
  18
  19     This program is distributed in the hope that it will be useful,
  20     but WITHOUT ANY WARRANTY; without even the implied warranty of
  21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22     GNU General Public License for more details.
  23
  24     You should have received a copy of the GNU General Public License
  25     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  26
  27     Contact: Mike Coleman
  28              Stowers Institute for Medical Research
  29              1000 East 50th Street
  30              Kansas City, Missouri  64110
  31              USA
  32 '''
  33
  34
  35 import cPickle as pickle
  36 import math
  37 import optparse
  38 import os.path
  39 from pprint import pprint
  40 import sys
  41
  42 import greylag
  43
  44
  45 # gc kills our performance, so disable it.  gc only matters for cycles, which
  46 # we (hope we) don't create.  See the gc module docs.
  47 import gc; gc.disable()
  48
  49
  50 def warn(s):
  51     print >> sys.stderr, 'warning:', s
  52 def error(s):
  53     sys.exit('error: ' + s)
  54
  55
  56 def generate_leaves(tree):
  57     """Given a tree made of lists, yield the leaves.
  58
  59     >>> list(generate_leaves([(1,2,3)]))
  60     [(1, 2, 3)]
  61     >>> list(generate_leaves([[[(1,2,3)]], (4,5)]))
  62     [(1, 2, 3), (4, 5)]
  63
  64     """
  65     if not isinstance(tree, list):
  66         yield tree
  67         return
  68     for b in tree:
  69         for l in generate_leaves(b):
  70             yield l
  71
  72
  73 # Example legacy headers:
  74 # H       Database        /foo/bar/db.fasta
  75 # H       PrecursorMasses AVG
  76 # H       FragmentMasses  MONO
  77 # H       StaticMod       C=160.1388
  78 # H       DiffMod M*=+16.0
  79 # H       DiffMod STY#=+80.0
  80
  81 def print_legacy_headers(f, r):
  82     """Output legacy H lines that readers of traditional SQT files may
  83     expect.  Warns if multiple databases, or multiple or isotopic mass regimes
  84     are present.
  85     """
  86
  87     if len(r['databases']) > 1:
  88         warn("multiple databases present")
  89     for db in r['databases']:
  90         print >> f, "H\tDatabase\t%s" % db
  91
  92     precursor_masses, fragment_masses = 'MONO', 'MONO'
  93     mass_regimes = r['parameters']['mass_regimes']
  94     if len(mass_regimes) > 1:
  95         warn("multiple mass regimes present--legacy SQT headers will be based"
  96              "on just the first")
  97     parent_regime, fragment_regime = mass_regimes[0]
  98     if parent_regime[1] or fragment_regime[1]:
  99         warn("isotopic mass regimes present--ignored for legacy SQT headers")
 100     assert parent_regime[0] in ('MONO', 'AVG')
 101     assert fragment_regime[0] in ('MONO', 'AVG')
 102     print >> f, "H\tPrecursorMasses\t%s" % parent_regime[0]
 103     print >> f, "H\tFragmentMasses\t%s" % fragment_regime[0]
 104
 105     # FIX: merge this with mass calculation code in
 106     #      greylag.initialize_spectrum_parameters?
 107     # FIX: currently assuming MONO only (what would be better?)
 108     static = dict((res, greylag.formula_mass(greylag.RESIDUE_FORMULA[res]))
 109                   for res in greylag.RESIDUES)
 110     for m in r['parameters']['pervasive_mods']:
 111         if isinstance(m[1], float):
 112             static[m[3]] += m[0] * m[1]
 113         else:
 114             static[m[3]] += m[0] * greylag.formula_mass(m[1])
 115     # all residues get a StaticMod because our figures are more exact than the
 116     # "default" ones
 117     for res in static:
 118         print >> f, "H\tStaticMod\t%s=%s" % (res, static[res])
 119
 120     diffs = set(generate_leaves(r['parameters']['potential_mods']))
 121     for m in diffs:
 122         symbol = m[5]
 123         if not symbol:
 124             warn("unmarked potential mod present--omitting")
 125             continue
 126         if isinstance(m[1], float):
 127             delta = m[0] * m[1]
 128         else:
 129             delta = m[0] * greylag.formula_mass(m[1])
 130         print >> f, "H\tDiffMod\t%s%s=%+f" % (m[3], m[5], delta)
 131
 132
 133 def print_header(f, r):
 134     """Output H lines."""
 135     print >> f, "H\tSQTGenerator\tgreylag"
 136     print >> f, "H\tSQTGeneratorVersion\t%s" % greylag.VERSION
 137
 138     print_legacy_headers(f, r)
 139
 140     for pk, pv in sorted(r['parameters'].items()):
 141         print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
 142
 143
 144 def print_regime_manifest(f, regime_manifest):
 145     """Output R lines."""
 146     for regime_no, residue, mass in regime_manifest:
 147         print >> f, "R\t%s\t%s\t%s" % (regime_no, residue, mass)
 148
 149
 150 def generate_marked_sequence(marks, peptide_sequence):
 151     """Yield the characters in a marked version of peptide_sequence.  If there
 152     is both a terminal mark and a mark on a terminal residue, the latter is
 153     omitted.
 154
 155     >>> ''.join(generate_marked_sequence({}, 'ASDF'))
 156     'ASDF'
 157     >>> ''.join(generate_marked_sequence({ 'N' : '^', 1 : '*' }, 'ASDF'))
 158     'A^S*DF'
 159     >>> ''.join(generate_marked_sequence({ 'N' : '^', 0 : '*' }, 'ASDF'))
 160     'A^SDF'
 161
 162     """
 163
 164     for n, c in enumerate(peptide_sequence):
 165         yield c
 166         if n == 0 and 'N' in marks:
 167             yield marks.get('N')
 168         elif n == len(peptide_sequence)-1 and 'C' in marks:
 169             yield marks.get('C')
 170         else:
 171             yield marks.get(n, '')
 172
 173
 174 def print_spectrum(f, modification_conjucts, result, enhanced=False):
 175     """Print the lines (S/M/L/A*) associated with the given spectrum."""
 176     spectrum, sp_best_matches = result
 177
 178     scan_low, scan_high, charge = spectrum['name'].rsplit('.', 2)
 179     print >> f, '\t'.join(str(v) for v in
 180                           ["S", scan_low, scan_high, charge, 0, 'honk',
 181                            spectrum['mass'],
 182                            round(math.log(spectrum['total_ion_current']), 4),
 183                            0, spectrum['comparisons']])
 184
 185     best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
 186     if not best_scores:
 187         return
 188
 189     # score -> rank
 190     rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
 191
 192     best_score = best_scores[0]
 193     for match in sp_best_matches:
 194         score = match['score']
 195         if score == 0:
 196             continue                    # null entry
 197         rank = rank_map[score]
 198         score_delta = (best_score - score) / best_score
 199
 200         mass_regime_index = match.get('mass_regime_index', 0)
 201         conjunct_index = match.get('conjunct_index', 0)
 202
 203         am_lines = []                   # residue mods
 204         marks = {}
 205         for mt in match.get('mass_trace', []):
 206             conjunct_item = modification_conjucts[conjunct_index][2][mt['conjunct_item_index']]
 207             mark = conjunct_item[5]
 208             if mark:
 209                 assert mt['position'] not in marks, "not yet implemented"
 210                 marks[mt['position']] = mark
 211             if enhanced:
 212                 name = conjunct_item[4] or ''
 213                 delta = conjunct_item[6][mass_regime_index][1]
 214                 am_lines.append(["AM", mt['position'], round(delta, 8), name])
 215
 216         at_lines = []                   # terminal mods
 217         N_conjunct_item = modification_conjucts[conjunct_index][0]
 218         if N_conjunct_item:
 219             assert len(N_conjunct_item) == 1
 220             mark = N_conjunct_item[0][5]
 221             if mark:
 222                 marks['N'] = mark
 223             if enhanced:
 224                 name = conjunct_item[4] or ''
 225                 delta = conjunct_item[6][mass_regime_index][1]
 226                 at_lines.append(["AT", 'N', round(delta, 8), name])
 227         C_conjunct_item = modification_conjucts[conjunct_index][1]
 228         if C_conjunct_item:
 229             assert len(C_conjunct_item) == 1
 230             mark = C_conjunct_item[0][5]
 231             if mark:
 232                 marks['C'] = mark
 233             if enhanced:
 234                 name = conjunct_item[4] or ''
 235                 delta = conjunct_item[6][mass_regime_index][1]
 236                 at_lines.append(["AT", 'C', round(delta, 8), name])
 237
 238         marked_sequence = match['peptide_sequence']
 239         if marks:
 240             marked_sequence = ''.join(generate_marked_sequence(marks,
 241                                                                marked_sequence))
 242
 243         # FIX: also need M lines for fixed mods, terminal mods, isotope mods
 244         # (N15), etc.
 245         # Note: scores, being log probability, are non-positive, but we
 246         # flip the sign in the SQT output
 247         print >> f, '\t'.join(str(v) for v in
 248                               ["M", rank, rank,
 249                                round(match['predicted_parent_mass'], 5),
 250                                round(score_delta, 4), round(-score, 4), 0,
 251                                # FIX: 1 of 2 ions found--keep DTASelect happy
 252                                1, 2,
 253                                '.'.join((match['N_peptide_flank'],
 254                                          marked_sequence,
 255                                          match['C_peptide_flank'])),
 256                                'U'])
 257
 258         if enhanced:
 259             if mass_regime_index != 0:
 260                 print >> f, "AR\t%s" % mass_regime_index
 261             if match.get('pca_delta', 0) != 0:
 262                 print >> f, "APCA\t%s" % match['pca_delta']
 263             for am_line in sorted(am_lines):
 264                 print >> f, '\t'.join(str(v) for v in am_line)
 265             for at_line in at_lines:
 266                 print >> f, '\t'.join(str(v) for v in at_line)
 267
 268         assert len(match['sequence_name']) == len(match['peptide_begin'])
 269         for sn, pb in zip(match['sequence_name'], match['peptide_begin']):
 270             print >> f, 'L\t%s\t%s' % (sn, pb)
 271
 272
 273 def dump_result_file(result_file):
 274     """Pretty-print result file contents."""
 275     try:
 276         r = None
 277         while True:
 278             r = pickle.load(result_file)
 279             pprint(r)
 280     except EOFError, e:
 281         if r != 'complete':
 282             error("result file truncated or invalid")
 283
 284
 285 def generate_results(result_file):
 286     while True:
 287         try:
 288             r = pickle.load(result_file)
 289         except EOFError:
 290             error("result file truncated or invalid")
 291         if r == 'complete':
 292             return
 293         yield r
 294
 295
 296 def main(args=sys.argv[1:]):
 297     parser = optparse.OptionParser(usage=
 298                                    "usage: %prog [options] <result-file>",
 299                                    description=__doc__, version=greylag.VERSION)
 300     pa = parser.add_option
 301     pa("-d", "--output-directory", dest="output_directory",
 302        help="directory where output files are written [default is '.']",
 303        metavar="DIR")
 304     pa("-e", "--enhanced-output", action="store_true", dest="enhanced_output",
 305        help="include extra DTASelect-incompatible information in output")
 306     pa("-v", "--verbose", action="store_true", dest="verbose",
 307        help="be verbose")
 308     pa("--dump", action="store_true", dest="dump",
 309        help="just dump the result file to stdout (for debugging)")
 310     pa("--copyright", action="store_true", dest="copyright",
 311        help="print copyright and exit")
 312     (options, args) = parser.parse_args(args=args)
 313
 314     if options.copyright:
 315         print __copyright__
 316         sys.exit(0)
 317
 318     if len(args) != 1:
 319         parser.print_help()
 320         sys.exit(1)
 321
 322     result_file = open(args[0], 'rb')
 323
 324     if options.dump:
 325         dump_result_file(result_file)
 326         return
 327
 328     header = pickle.load(result_file)
 329
 330     spectrum_fns = header['spectrum files']
 331     assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
 332     assert all(os.path.dirname(fn) == '' for fn in spectrum_fns)
 333
 334     basenames = [ os.path.splitext(spectrum_fn)[0]
 335                   for spectrum_fn in spectrum_fns ]
 336
 337     # spectrum file index -> sqt file
 338     spectrum_filemap = dict((n, open(fn+'.sqt', 'w'))
 339                             for n, fn in enumerate(basenames))
 340
 341     for sqt_file in spectrum_filemap.values():
 342         print_header(sqt_file, header)
 343         print_regime_manifest(sqt_file, header['mass regime manifest'])
 344
 345     for result in generate_results(result_file):
 346         if result:
 347             print_spectrum(spectrum_filemap[result[0]['file_id']],
 348                            header['modification conjuncts'], result,
 349                            options.enhanced_output)
 350
 351
 352 if __name__ == '__main__':
 353     main()