greylag_sqt.py

   1 #!/usr/bin/env python
   2
   3 '''Generate a set of sqt files from the specified greylag search result
   4 file.  The sqt files will be created in the current directory, with names
   5 corresponding to the searched ms2 files.
   6 '''
   7
   8 from __future__ import with_statement
   9
  10 __copyright__ = '''
  11     greylag, a collection of programs for MS/MS protein analysis
  12     Copyright (C) 2006-2008  Stowers Institute for Medical Research
  13
  14     This program is free software: you can redistribute it and/or modify
  15     it under the terms of the GNU General Public License as published by
  16     the Free Software Foundation, either version 3 of the License, or
  17     (at your option) any later version.
  18
  19     This program is distributed in the hope that it will be useful,
  20     but WITHOUT ANY WARRANTY; without even the implied warranty of
  21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22     GNU General Public License for more details.
  23
  24     You should have received a copy of the GNU General Public License
  25     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  26
  27     Contact: Mike Coleman
  28              Stowers Institute for Medical Research
  29              1000 East 50th Street
  30              Kansas City, Missouri  64110
  31              USA
  32 '''
  33
  34
  35 import cPickle as pickle
  36 import math
  37 import optparse
  38 import os.path
  39 from pprint import pprint
  40 import sys
  41
  42 import greylag
  43
  44
  45 # gc kills our performance, so disable it.  gc only matters for cycles, which
  46 # we (hope we) don't create.  See the gc module docs.
  47 import gc; gc.disable()
  48
  49
  50 def warn(s):
  51     print >> sys.stderr, 'warning:', s
  52 def error(s):
  53     sys.exit('error: ' + s)
  54
  55
  56 def generate_leaves(tree):
  57     """Given a tree made of lists, yield the leaves.
  58
  59     >>> list(generate_leaves([(1,2,3)]))
  60     [(1, 2, 3)]
  61     >>> list(generate_leaves([[[(1,2,3)]], (4,5)]))
  62     [(1, 2, 3), (4, 5)]
  63
  64     """
  65     if not isinstance(tree, list):
  66         yield tree
  67         return
  68     for b in tree:
  69         for l in generate_leaves(b):
  70             yield l
  71
  72
  73 # Example legacy headers:
  74 # H       Database        /foo/bar/db.fasta
  75 # H       PrecursorMasses AVG
  76 # H       FragmentMasses  MONO
  77 # H       StaticMod       C=160.1388
  78 # H       DiffMod M*=+16.0
  79 # H       DiffMod STY#=+80.0
  80
  81 def print_legacy_headers(f, r):
  82     """Output legacy H lines that readers of traditional SQT files may
  83     expect.  Warns if multiple databases, or multiple or isotopic mass regimes
  84     are present.
  85     """
  86
  87     if len(r['databases']) > 1:
  88         warn("multiple databases present")
  89     for db in r['databases']:
  90         print >> f, "H\tDatabase\t%s" % db
  91
  92     precursor_masses, fragment_masses = 'MONO', 'MONO'
  93     mass_regimes = r['parameters']['mass_regimes']
  94     if len(mass_regimes) > 1:
  95         warn("multiple mass regimes present--legacy SQT headers will be based"
  96              "on just the first")
  97     parent_regime, fragment_regime = mass_regimes[0]
  98     if parent_regime[1] or fragment_regime[1]:
  99         warn("isotopic mass regimes present--ignored for legacy SQT headers")
 100     assert parent_regime[0] in ('MONO', 'AVG')
 101     assert fragment_regime[0] in ('MONO', 'AVG')
 102     print >> f, "H\tPrecursorMasses\t%s" % parent_regime[0]
 103     print >> f, "H\tFragmentMasses\t%s" % fragment_regime[0]
 104
 105     # FIX: merge this with mass calculation code in
 106     #      greylag.initialize_spectrum_parameters?
 107     # FIX: currently assuming MONO only (what would be better?)
 108     static = dict((res, greylag.formula_mass(greylag.RESIDUE_FORMULA[res]))
 109                   for res in greylag.RESIDUES)
 110     for m in r['parameters']['pervasive_mods']:
 111         if isinstance(m[1], float):
 112             static[m[3]] += m[0] * m[1]
 113         else:
 114             static[m[3]] += m[0] * greylag.formula_mass(m[1])
 115     # all residues get a StaticMod because our figures are more exact than the
 116     # "default" ones
 117     for res in static:
 118         print >> f, "H\tStaticMod\t%s=%s" % (res, static[res])
 119
 120     diffs = set(generate_leaves(r['parameters']['potential_mods']))
 121     for m in diffs:
 122         symbol = m[5]
 123         if not symbol:
 124             warn("unmarked potential mod present--omitting")
 125             continue
 126         if isinstance(m[1], float):
 127             delta = m[0] * m[1]
 128         else:
 129             delta = m[0] * greylag.formula_mass(m[1])
 130         print >> f, "H\tDiffMod\t%s%s=%+f" % (m[3], m[5], delta)
 131
 132
 133 def print_header(f, r):
 134     """Output H lines."""
 135     print >> f, "H\tSQTGenerator\tgreylag"
 136     print >> f, "H\tSQTGeneratorVersion\t%s" % greylag.VERSION
 137
 138     print_legacy_headers(f, r)
 139
 140     for pk, pv in sorted(r['parameters'].items()):
 141         print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
 142
 143
 144 def print_regime_manifest(f, regime_manifest):
 145     """Output R lines."""
 146     for regime_no, residue, mass in regime_manifest:
 147         print >> f, "R\t%s\t%s\t%s" % (regime_no, residue, mass)
 148
 149
 150 def generate_marked_sequence(marks, peptide_sequence):
 151     """Yield the characters in a marked version of peptide_sequence.  If there
 152     is both a terminal mark and a mark on a terminal residue, the latter is
 153     omitted.
 154
 155     >>> ''.join(generate_marked_sequence({}, 'ASDF'))
 156     'ASDF'
 157     >>> ''.join(generate_marked_sequence({ 'N' : '^', 1 : '*' }, 'ASDF'))
 158     'A^S*DF'
 159     >>> ''.join(generate_marked_sequence({ 'N' : '^', 0 : '*' }, 'ASDF'))
 160     'A^SDF'
 161
 162     """
 163
 164     for n, c in enumerate(peptide_sequence):
 165         yield c
 166         if n == 0 and 'N' in marks:
 167             yield marks.get('N')
 168         elif n == len(peptide_sequence)-1 and 'C' in marks:
 169             yield marks.get('C')
 170         else:
 171             yield marks.get(n, '')
 172
 173
 174 def print_spectrum(f, modification_conjucts, result, enhanced=False):
 175     """Print the lines (S/M/L/A*) associated with the given spectrum."""
 176     spectrum, sp_best_matches = result
 177
 178     scan_low, scan_high, charge = spectrum['name'].rsplit('.', 2)
 179     print >> f, '\t'.join(str(v) for v in
 180                           ["S", scan_low, scan_high, charge, 0, 'honk',
 181                            spectrum['mass'],
 182                            round(math.log(spectrum['total_ion_current']), 4),
 183                            0, spectrum['comparisons']])
 184
 185     best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
 186     if not best_scores:
 187         return
 188
 189     # score -> rank
 190     rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
 191
 192     best_score = best_scores[0]
 193     for match in sp_best_matches:
 194         score = match['score']
 195         if score == 0:
 196             continue                    # null entry
 197         rank = rank_map[score]
 198         score_delta = (best_score - score) / best_score
 199
 200         mass_regime_index = match.get('mass_regime_index', 0)
 201         conjunct_index = match.get('conjunct_index', 0)
 202
 203         am_lines = []                   # residue mods
 204         marks = {}
 205         for mt in match.get('mass_trace', []):
 206             conjunct_item = modification_conjucts[conjunct_index][2][mt['conjunct_item_index']]
 207             mark = conjunct_item[5]
 208             if mark:
 209                 assert mt['position'] not in marks, "not yet implemented"
 210                 marks[mt['position']] = mark
 211             if enhanced:
 212                 name = conjunct_item[4] or ''
 213                 delta = conjunct_item[6][mass_regime_index][1]
 214                 am_lines.append(["AM", mt['position'], round(delta, 8), name])
 215
 216         at_lines = []                   # terminal mods
 217         N_conjunct_item = modification_conjucts[conjunct_index][0]
 218         if N_conjunct_item:
 219             assert len(N_conjunct_item) == 1
 220             mark = N_conjunct_item[0][5]
 221             if mark:
 222                 marks['N'] = mark
 223             if enhanced:
 224                 name = conjunct_item[4] or ''
 225                 delta = conjunct_item[6][mass_regime_index][1]
 226                 at_lines.append(["AT", 'N', round(delta, 8), name])
 227         C_conjunct_item = modification_conjucts[conjunct_index][1]
 228         if C_conjunct_item:
 229             assert len(C_conjunct_item) == 1
 230             mark = C_conjunct_item[0][5]
 231             if mark:
 232                 marks['C'] = mark
 233             if enhanced:
 234                 name = conjunct_item[4] or ''
 235                 delta = conjunct_item[6][mass_regime_index][1]
 236                 at_lines.append(["AT", 'C', round(delta, 8), name])
 237
 238         marked_sequence = match['peptide_sequence']
 239         if marks:
 240             marked_sequence = ''.join(generate_marked_sequence(marks,
 241                                                                marked_sequence))
 242
 243         # FIX: also need M lines for fixed mods, terminal mods, isotope mods
 244         # (N15), etc.
 245         # Note: scores, being log probability, are non-positive, but we
 246         # flip the sign in the SQT output
 247         print >> f, '\t'.join(str(v) for v in
 248                               ["M", rank, rank,
 249                                round(match['predicted_parent_mass'], 5),
 250                                round(score_delta, 4), round(-score, 4), 0,
 251                                # FIX: 1 of 2 ions found--keep DTASelect happy
 252                                1, 2,
 253                                '.'.join((match['N_peptide_flank'],
 254                                          marked_sequence,
 255                                          match['C_peptide_flank'])),
 256                                'U'])
 257
 258         if enhanced:
 259             if mass_regime_index != 0:
 260                 print >> f, "AR\t%s" % mass_regime_index
 261             if match.get('pca_delta', 0) != 0:
 262                 print >> f, "APCA\t%s" % match['pca_delta']
 263             for am_line in sorted(am_lines):
 264                 print >> f, '\t'.join(str(v) for v in am_line)
 265             for at_line in at_lines:
 266                 print >> f, '\t'.join(str(v) for v in at_line)
 267
 268         for sequence_name, peptide_begin in match['loci']:
 269             print >> f, 'L\t%s\t%s' % (sequence_name, peptide_begin)
 270
 271
 272 def dump_result_file(result_file):
 273     """Pretty-print result file contents."""
 274     try:
 275         r = None
 276         while True:
 277             r = pickle.load(result_file)
 278             pprint(r)
 279     except EOFError, e:
 280         if r != 'complete':
 281             error("result file truncated or invalid")
 282
 283
 284 def generate_results(result_file):
 285     while True:
 286         try:
 287             r = pickle.load(result_file)
 288         except EOFError:
 289             error("result file truncated or invalid")
 290         if r == 'complete':
 291             return
 292         yield r
 293
 294
 295 def main(args=sys.argv[1:]):
 296     parser = optparse.OptionParser(usage=
 297                                    "usage: %prog [options] <result-file>",
 298                                    description=__doc__, version=greylag.VERSION)
 299     pa = parser.add_option
 300     pa("-d", "--output-directory", dest="output_directory",
 301        help="directory where output files are written [default is '.']",
 302        metavar="DIR")
 303     pa("-e", "--enhanced-output", action="store_true", dest="enhanced_output",
 304        help="include extra DTASelect-incompatible information in output")
 305     pa("-v", "--verbose", action="store_true", dest="verbose",
 306        help="be verbose")
 307     pa("--dump", action="store_true", dest="dump",
 308        help="just dump the result file to stdout (for debugging)")
 309     pa("--copyright", action="store_true", dest="copyright",
 310        help="print copyright and exit")
 311     (options, args) = parser.parse_args(args=args)
 312
 313     if options.copyright:
 314         print __copyright__
 315         sys.exit(0)
 316
 317     if len(args) != 1:
 318         parser.print_help()
 319         sys.exit(1)
 320
 321     result_file = open(args[0], 'rb')
 322
 323     if options.dump:
 324         dump_result_file(result_file)
 325         return
 326
 327     header = pickle.load(result_file)
 328
 329     spectrum_fns = header['spectrum files']
 330     assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
 331     assert all(os.path.dirname(fn) == '' for fn in spectrum_fns)
 332
 333     basenames = [ os.path.splitext(spectrum_fn)[0]
 334                   for spectrum_fn in spectrum_fns ]
 335
 336     # spectrum file index -> sqt file
 337     spectrum_filemap = dict((n, open(fn+'.sqt', 'w'))
 338                             for n, fn in enumerate(basenames))
 339
 340     for sqt_file in spectrum_filemap.values():
 341         print_header(sqt_file, header)
 342         print_regime_manifest(sqt_file, header['mass regime manifest'])
 343
 344     for result in generate_results(result_file):
 345         if result:
 346             print_spectrum(spectrum_filemap[result[0]['file_id']],
 347                            header['modification conjuncts'], result,
 348                            options.enhanced_output)
 349
 350
 351 if __name__ == '__main__':
 352     main()