greylag-sqt.py

   1 #!/usr/bin/env python
   2
   3 '''Generate a set of sqt files from the specified greylag search result
   4 file.  The sqt files will be created in the current directory, with names
   5 corresponding to the searched ms2 files.
   6 '''
   7
   8 from __future__ import with_statement
   9
  10 __copyright__ = '''
  11     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  12
  13     This program is free software; you can redistribute it and/or modify
  14     it under the terms of the GNU General Public License as published by
  15     the Free Software Foundation; either version 2 of the License, or
  16     (at your option) any later version.
  17
  18     This program is distributed in the hope that it will be useful,
  19     but WITHOUT ANY WARRANTY; without even the implied warranty of
  20     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21     GNU General Public License for more details.
  22
  23     You should have received a copy of the GNU General Public License along
  24     with this program; if not, write to the Free Software Foundation, Inc.,
  25     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  26 '''
  27
  28 __version__ = "0.0"
  29
  30
  31 import contextlib
  32 import cPickle
  33 import gzip
  34 import math
  35 import optparse
  36 import os.path
  37 from pprint import pprint
  38 import sys
  39
  40
  41 def warn(s):
  42     print >> sys.stderr, 'warning:', s
  43 def error(s):
  44     sys.exit('error: ' + s)
  45
  46
  47 def print_header(f, r):
  48     print >> f, "H\tSQTGenerator\tgreylag"
  49     print >> f, "H\tSQTGeneratorVersion\t%s" % __version__
  50     for db in r['databases']:
  51         print >> f, "H\tDatabase\t%s" % db
  52     for pk, pv in r['parameters'].items():
  53         print >> f, "H\tParameter\t%s\t%s" % (pk, pv)
  54     # How should these be specified?
  55     # What does DTASelect do with these?
  56     # H       StaticMod       C=160.1388
  57     # H       DiffMod TNA*=+2.0
  58
  59
  60 def print_spectrum(f, spectrum, sp_best_matches):
  61     scan_low, scan_high, _rest = spectrum['name'].split('.', 2)
  62     print >> f, '\t'.join(str(v) for v in
  63                           ["S", scan_low, scan_high, spectrum['charge'], 0,
  64                            'honk', spectrum['mass'],
  65                            round(math.log(spectrum['total_ion_current']), 4),
  66                            0, spectrum['comparisons']])
  67
  68     best_scores = sorted(list(set(m['score'] for m in sp_best_matches)))
  69     # score -> rank
  70     rank_map = dict(zip(best_scores, range(1,len(best_scores)+1)))
  71
  72     best_score = sp_best_matches[0]['score']
  73     prev_score, prev_sequence = None, None
  74     for match in sp_best_matches:
  75         score = match['score']
  76         if score == 0:
  77             continue                    # null entry
  78         rank = rank_map[score]
  79         score_delta = (best_score - score) / best_score
  80
  81         # This is a temporary way of stripping duplicates.  It only works if
  82         # they are adjacent (which will only rarely be false?).
  83         if score != prev_score or match['peptide_sequence'] != prev_sequence:
  84             # Note: scores, being log probability, are non-positive, but we
  85             # flip the sign in the SQT output
  86             print >> f, '\t'.join(str(v) for v in
  87                                   ["M", rank, rank,
  88                                    round(match['predicted_parent_mass'], 5),
  89                                    round(score_delta, 4), round(-score, 4), 0,
  90                                    # 1 of 2 ions found--keep DTASelect happy
  91                                    1, 2,
  92                                    "-.%s.-" % match['peptide_sequence'],
  93                                    'U'])
  94         prev_score, prev_sequence = score, match['peptide_sequence']
  95
  96         print >> f, 'L\t%s\t%s' % (match['sequence_name'],
  97                                    match['peptide_begin'])
  98
  99 def main(args=sys.argv[1:]):
 100     parser = optparse.OptionParser(usage=
 101                                    "usage: %prog [options] <result-file>",
 102                                    description=__doc__, version=__version__)
 103     pa = parser.add_option
 104     pa("-d", "--output-directory", dest="output_directory",
 105        help="directory where output files are written [default is '.']",
 106        metavar="DIR")
 107     pa("-v", "--verbose", action="store_true", dest="verbose",
 108        help="be verbose")
 109     pa("--dump", action="store_true", dest="dump",
 110        help="dump the result file (for debugging)")
 111     pa("--copyright", action="store_true", dest="copyright",
 112        help="print copyright and exit")
 113     (options, args) = parser.parse_args(args=args)
 114
 115     if options.copyright:
 116         print __copyright__
 117         sys.exit(0)
 118
 119     if len(args) != 1:
 120         parser.print_help()
 121         sys.exit(1)
 122
 123     with contextlib.closing(gzip.open(args[0])) as r_file:
 124         r = cPickle.load(r_file)
 125
 126     if options.dump:
 127         pprint(r)
 128         return
 129
 130     spectrum_fns = r['spectrum files']
 131     assert len(spectrum_fns) == len(set(spectrum_fns)) # check uniqueness
 132
 133     spectra = r['spectra']
 134     best_matches = r['matches']['best_matches']
 135     assert len(spectra) == len(best_matches)
 136
 137     # order spectra and best_matches by (filename, spectrum name)
 138     def filename_specname_order(sm):
 139         return r['spectrum files'][sm[0]['file_id']], sm[0]['name']
 140
 141     spec_match = zip(spectra, best_matches)
 142     spec_match.sort(key=filename_specname_order)
 143
 144     for spectrum_n, spectrum_fn in enumerate(spectrum_fns):
 145         assert os.path.dirname(spectrum_fn) == ''
 146         sqt_fn = os.path.splitext(spectrum_fn)[0] + '.sqt'
 147         with open(sqt_fn, 'w') as sqtf:
 148             print_header(sqtf, r)
 149             for spectrum, best_match in spec_match:
 150                 if spectrum['file_id'] == spectrum_n:
 151                     print_spectrum(sqtf, spectrum, best_match)
 152
 153
 154 if __name__ == '__main__':
 155     main()