greylag_sqt.py

   1 #!/usr/bin/env greylag-python
   2
   3 '''Generate a set of sqt files from the specified greylag search result
   4 file.  The sqt files will be created in the current directory, with names
   5 corresponding to the searched ms2 files.
   6 '''
   7
   8 from __future__ import with_statement
   9
  10 __copyright__ = '''
  11     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  12
  13     This program is free software; you can redistribute it and/or modify
  14     it under the terms of the GNU General Public License as published by
  15     the Free Software Foundation; either version 2 of the License, or
  16     (at your option) any later version.
  17
  18     This program is distributed in the hope that it will be useful,
  19     but WITHOUT ANY WARRANTY; without even the implied warranty of
  20     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21     GNU General Public License for more details.
  22
  23     You should have received a copy of the GNU General Public License along
  24     with this program; if not, write to the Free Software Foundation, Inc.,
  25     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  26 '''
  27
  28 __version__ = "0.0"
  29
  30
  31 from collections import defaultdict
  32 import contextlib
  33 import cPickle
  34 import gzip
  35 import math
  36 import optparse
  37 import os.path
  38 from pprint import pprint
  39 import sys
  40
  41
  42 def warn(s):
  43     print >> sys.stderr, 'warning:', s
  44 def error(s):
  45     sys.exit('error: ' + s)
  46
  47
  48 def print_header(f, r):
  49     """Output H lines."""
  50     print >> f, "H\tSQTGenerator\tgreylag"
  51     print >> f, "H\tSQTGeneratorVersion\t%s" % __version__
  52     for db in r['databases']:
  53         print >> f, "H\tDatabase\t%s" % db
  54     for pk, pv in sorted(r['parameters'].items()):
  55         print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
  56     # FIX: are these necessary?
  57     # H       StaticMod       C=160.1388
  58     # H       DiffMod TNA*=+2.0
  59
  60
  61 def print_regime_manifest(f, regime_manifest):
  62     """Output R lines."""
  63     for regime_no, residue, mass in regime_manifest:
  64         print >> f, "R\t%s\t%s\t%s" % (regime_no, residue, mass)
  65
  66
  67 def generate_marked_sequence(match_name_map, mass_trace, peptide_sequence):
  68     """Yield the characters in a marked version of peptide_sequence.
  69
  70     >>> ''.join(generate_marked_sequence({}, [], 'ASDF'))
  71     'ASDF'
  72     >>> ''.join(generate_marked_sequence({('S', 80) : ('phosphorylation', '*')},
  73     ...                                  [{'position' : 1, 'delta' : 80}],
  74     ...                                  'ASDF'))
  75     'AS*DF'
  76
  77     """
  78
  79     # FIX: handle [ ]
  80     trace = sorted(mass_trace,
  81                    key=lambda x: (x['position'], x['delta']), reverse=True)
  82     for n, r in enumerate(peptide_sequence):
  83         yield r
  84         while trace and trace[-1]['position'] == n:
  85             yield match_name_map[(r, trace[-1]['delta'])][1]
  86             trace.pop()
  87
  88
  89 def print_spectrum(f, mod_name_map, sp_matches):
  90     """Print the lines (S/M/L/A*) associated with the given spectrum."""
  91     spectrum, sp_best_matches = sp_matches
  92
  93     scan_low, scan_high, _rest = spectrum['name'].split('.', 2)
  94     print >> f, '\t'.join(str(v) for v in
  95                           ["S", scan_low, scan_high, spectrum['charge'], 0,
  96                            'honk', spectrum['mass'],
  97                            round(math.log(spectrum['total_ion_current']), 4),
  98                            0, spectrum['comparisons']])
  99
 100     best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
 101     # score -> rank
 102     rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
 103
 104     best_score = best_scores[0]
 105     for match in sp_best_matches:
 106         score = match['score']
 107         if score == 0:
 108             continue                    # null entry
 109         rank = rank_map[score]
 110         score_delta = (best_score - score) / best_score
 111
 112         match_name_map = mod_name_map[(match['mass_regime_index'],
 113                                        match['conjunct_index'])]
 114         marked_sequence \
 115             = ''.join(generate_marked_sequence(match_name_map,
 116                                                match['mass_trace'],
 117                                                match['peptide_sequence']))
 118
 119         # FIX: also need M lines for fixed mods, terminal mods, isotope mods
 120         # (N15), etc.
 121         # Note: scores, being log probability, are non-positive, but we
 122         # flip the sign in the SQT output
 123         print >> f, '\t'.join(str(v) for v in
 124                               ["M", rank, rank,
 125                                round(match['predicted_parent_mass'], 5),
 126                                round(score_delta, 4), round(-score, 4), 0,
 127                                # 1 of 2 ions found--keep DTASelect happy
 128                                1, 2,
 129                                "-.%s.-" % marked_sequence, 'U'])
 130         if match['mass_regime_index'] != 0:
 131             print >> f, "AR\t%s" % match['mass_regime_index']
 132         if match['pca_delta'] != 0:
 133             print >> f, "APCA\t%s" % match['pca_delta']
 134         for mt in match['mass_trace']:
 135             name = (match_name_map[(match['peptide_sequence'][mt['position']],
 136                                     mt['delta'])][0])
 137             fs = ["AM", mt['position'], round(mt['delta'], 5),
 138                   name if name else '']
 139             print >> f, '\t'.join(str(v) for v in fs)
 140
 141         assert len(match['sequence_name']) == len(match['peptide_begin'])
 142         for sn, pb in zip(match['sequence_name'], match['peptide_begin']):
 143             print >> f, 'L\t%s\t%s' % (sn, pb)
 144
 145
 146 def main(args=sys.argv[1:]):
 147     parser = optparse.OptionParser(usage=
 148                                    "usage: %prog [options] <result-file>",
 149                                    description=__doc__, version=__version__)
 150     pa = parser.add_option
 151     pa("-d", "--output-directory", dest="output_directory",
 152        help="directory where output files are written [default is '.']",
 153        metavar="DIR")
 154     pa("-v", "--verbose", action="store_true", dest="verbose",
 155        help="be verbose")
 156     pa("--dump", action="store_true", dest="dump",
 157        help="dump the result file (for debugging)")
 158     pa("--copyright", action="store_true", dest="copyright",
 159        help="print copyright and exit")
 160     (options, args) = parser.parse_args(args=args)
 161
 162     if options.copyright:
 163         print __copyright__
 164         sys.exit(0)
 165
 166     if len(args) != 1:
 167         parser.print_help()
 168         sys.exit(1)
 169
 170     with contextlib.closing(gzip.open(args[0])) as r_file:
 171         r = cPickle.load(r_file)
 172
 173     if options.dump:
 174         pprint(r)
 175         return
 176
 177     spectrum_fns = r['spectrum files']
 178     assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
 179
 180     matches = r['matches'].items()
 181
 182     # order matches by (spectrum filename, spectrum name)
 183     def filename_specname_order(m):
 184         sp = m[1][0]
 185         return sp['file_id'], sp['name']
 186
 187     matches.sort(key=filename_specname_order)
 188
 189
 190     # (regime index, conjunct index) -> (residue, delta)
 191     #                                                -> (mod name, mod marker)
 192     mod_name_map = defaultdict(dict)
 193     for regime in range(len(r['mass regime atomic masses'])):
 194         for cj_n, (N_cj, C_cj, R_cj) in enumerate(r['modification conjuncts']):
 195             assert not N_cj and not C_cj, "not yet implemented"
 196             for cj in R_cj:
 197                 for res in cj[3]:
 198                     mod_name_map[(regime, cj_n)][(res, cj[6][regime][1])] \
 199                         = (cj[4], cj[5])
 200     #pprint(mod_name_map.items())
 201
 202     for spectrum_n, spectrum_fn in enumerate(spectrum_fns):
 203         assert os.path.dirname(spectrum_fn) == ''
 204         sqt_fn = os.path.splitext(spectrum_fn)[0] + '.sqt'
 205         with open(sqt_fn, 'w') as sqtf:
 206             print_header(sqtf, r)
 207             print_regime_manifest(sqtf, r['mass regime manifest'])
 208             for match in matches:
 209                 if match[1][0]['file_id'] == spectrum_n:
 210                     print_spectrum(sqtf, mod_name_map, match[1])
 211
 212
 213 if __name__ == '__main__':
 214     main()