sqt_filter.py

   1 #!/usr/bin/env greylag-python
   2
   3 """
   4 Filter a set of sqt files according to FPR and other criteria, optionally
   5 rewriting them with the validation marks set to 'N' for filtered-out spectra.
   6
   7 """
   8
   9 from __future__ import with_statement
  10
  11 __copyright__ = '''
  12     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  13
  14     This program is free software; you can redistribute it and/or modify
  15     it under the terms of the GNU General Public License as published by
  16     the Free Software Foundation; either version 2 of the License, or
  17     (at your option) any later version.
  18
  19     This program is distributed in the hope that it will be useful,
  20     but WITHOUT ANY WARRANTY; without even the implied warranty of
  21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22     GNU General Public License for more details.
  23
  24     You should have received a copy of the GNU General Public License along
  25     with this program; if not, write to the Free Software Foundation, Inc.,
  26     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  27 '''
  28
  29 __version__ = "0.0"
  30
  31
  32 from collections import defaultdict
  33 import fileinput
  34 import optparse
  35 from pprint import pprint
  36 import sys
  37
  38
  39 def warn(s):
  40     print >> sys.stderr, 'warning:', s
  41 def error(s):
  42     sys.exit('error: ' + s)
  43 def fileerror(s, *args):
  44     error(s + (", at line %s of file '%s'"
  45                % (fileinput.filelineno(), fileinput.filename())),
  46           *args)
  47
  48 def inplace_warning():
  49     warn("!!!\nan error occurred while modifying .sqt files in-place--it may"
  50          " be necessary to recover some or all of the .sqt files from the"
  51          " corresponding .sqt.bak files.\n!!!")
  52
  53
  54 def reset_marks(options, sqt_fns):
  55     """Rewrite all evaluation marks to 'U', in-place."""
  56
  57     try:
  58         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
  59             if line.startswith("M\t"):
  60                 fs = line.split("\t")
  61                 if len(fs) >= 11:
  62                     fs[10] = 'U' + fs[10][1:]
  63                 line = '\t'.join(fs)
  64             sys.stdout.write(line)
  65     except:
  66         inplace_warning()
  67         raise
  68
  69
  70 def mark(options, thresholds, sp_scores, sqt_fns):
  71     """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
  72     score and delta thresholds."""
  73
  74     spectrum_no = -1
  75     mark_spectrum = False
  76
  77     try:
  78         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
  79             if line.startswith("S\t"):
  80                 spectrum_no += 1
  81                 charge, score, delta = sp_scores[spectrum_no]
  82                 mark_spectrum = (charge in thresholds
  83                                  and score != None
  84                                  and (score < thresholds[charge][0]
  85                                       or delta < thresholds[charge][1]))
  86             elif line.startswith("M\t") and mark_spectrum:
  87                 fs = line.split("\t")
  88                 if len(fs) >= 11:
  89                     fs[10] = 'N' + fs[10][1:]
  90                 line = '\t'.join(fs)
  91             sys.stdout.write(line)
  92     except:
  93         inplace_warning()
  94         raise
  95
  96
  97 def read_sqt_info(decoy_prefix, sqt_fns):
  98     """Return a pair, the first a dict mapping each charge to a list of
  99     (score, delta, state), where state is 'real' or 'decoy', for all the
 100     spectra in sqt_fns, and the second a list of all (score, delta).
 101     """
 102
 103     # charge -> [ (score, delta, state), ... ]
 104     #   where state is either 'real' or 'decoy'
 105     z_scores = defaultdict(list)
 106
 107     # [ (charge, score, delta), ... ]
 108     sp_scores = []
 109
 110     current_charge = None
 111     current_score = None
 112     current_delta = 0
 113     current_state = set()
 114
 115     for line in fileinput.input(sqt_fns):
 116         fs = line.split('\t')
 117         if fs[0] == 'S':
 118             if current_score != None and len(current_state) == 1:
 119                 z_scores[current_charge].append((current_score, current_delta,
 120                                                  current_state.pop()))
 121             if current_charge != None:
 122                 sp_scores.append((current_charge, current_score, current_delta))
 123             current_charge = int(fs[3])
 124             current_score = None
 125             current_delta = 0
 126             current_state = set()
 127         elif fs[0] == 'M':
 128             delta, score = float(fs[4]), float(fs[5])
 129             if delta == 0:
 130                 current_score = score
 131             elif current_delta == 0:
 132                 current_delta = delta
 133         elif fs[0] == 'L':
 134             if current_delta == 0:
 135                 if fs[1].startswith(decoy_prefix):
 136                     current_state.add('decoy')
 137                 else:
 138                     current_state.add('real')
 139     # handle final spectrum, as above
 140     if current_score != None and len(current_state) == 1:
 141         z_scores[current_charge].append((current_score, current_delta,
 142                                          current_state.pop()))
 143     if current_charge != None:
 144         sp_scores.append((current_charge, current_score, current_delta))
 145
 146     return (z_scores, sp_scores)
 147
 148
 149 def specificity(positives, negatives):
 150     return (float(positives - negatives)
 151             / (positives + negatives))
 152
 153
 154 def calculate_inner_threshold(specificity_goal, charge, spinfo):
 155     spinfo = sorted(spinfo, key=lambda x: x[0])
 156
 157     real_count = sum(1 for x in spinfo if x[-1] == 'real')
 158     decoy_count = len(spinfo) - real_count
 159
 160     if real_count == 0:
 161         return (None, real_count, decoy_count) # give up
 162
 163     current_threshold = -1e100      # allow all spectra
 164     for n, sp in enumerate(spinfo):
 165         specificity_est = specificity(real_count, decoy_count)
 166         if specificity_est >= specificity_goal:
 167             break
 168         if sp[-1] == 'real':
 169             real_count -= 1
 170         else:
 171             decoy_count -= 1
 172         # set threshold just high enough to exclude this spectrum
 173         current_threshold = sp[0] + 1e-6
 174     else:
 175         current_threshold = spinfo[-1][0] + 1e-6 # couldn't meet goal
 176
 177     return (current_threshold, real_count, decoy_count)
 178
 179
 180 def calculate_combined_thresholds(options, z_scores):
 181     """Find best score/delta thresholds for each charge."""
 182
 183     specificity_goal = 1 - options.fpr
 184
 185     # Rather than search every possible value of delta, we're only going to
 186     # "sample" at this granularity.  This cuts search time dramatically (and
 187     # making it O(n) instead of O(n**2).  Extra precision wouldn't really be
 188     # useful in any case.
 189     SEARCH_GRANULARITY = 0.001
 190
 191     # charge -> (score, delta, passing_reals, passing_decoys)
 192     thresholds = {}
 193
 194     for charge, spinfo in z_scores.iteritems():
 195         spinfo0 = sorted(spinfo, key=lambda x: x[1], reverse=True)
 196
 197         last_value = None
 198
 199         while spinfo0:
 200             this_value = spinfo0[-1][1] # current delta
 201             if (last_value == None
 202                 or abs(this_value - last_value) >= SEARCH_GRANULARITY):
 203
 204                 # "inner" is score
 205                 r = calculate_inner_threshold(specificity_goal, charge,
 206                                               spinfo0)
 207                 if r[0] != None:
 208                     if options.debug:
 209                         print '#', charge, r[0], this_value, r[1], r[2]
 210                     if (charge not in thresholds
 211                         or r[1] > thresholds[charge][2]):
 212                         thresholds[charge] = (r[0], this_value, r[1], r[2])
 213
 214                 last_value = this_value
 215             spinfo0.pop()
 216
 217         if options.verbose and charge in thresholds:
 218             print ("%+d: score %s, delta %s -> %s real ids (fdr %.4f)"
 219                    % (charge, thresholds[charge][0], thresholds[charge][1],
 220                       thresholds[charge][2],
 221                       1 - specificity(thresholds[charge][2],
 222                                       thresholds[charge][3])))
 223
 224     return thresholds
 225
 226
 227 def main(args=sys.argv[1:]):
 228     parser = optparse.OptionParser(usage=
 229                                    "usage: %prog [options] <sqt-file>...",
 230                                    description=__doc__, version=__version__)
 231     pa = parser.add_option
 232     pa("--decoy-prefix", dest="decoy_prefix", default="SHUFFLED_",
 233        help='prefix given to locus name of decoy (e.g., shuffled) database'
 234        ' sequences [default="SHUFFLED_"]', metavar="PREFIX")
 235     pa("--fpr", dest="fpr", type="float", default="0.02",
 236        help="false positive rate [default=0.02]", metavar="PROPORTION")
 237     pa("-v", "--verbose", action="store_true", dest="verbose",
 238        help="be verbose")
 239     pa("--debug", action="store_true", dest="debug",
 240        help="show debug output")
 241     pa("-m", "--mark", action="store_true", dest="mark",
 242        help="rewrite the input files, changing some validation marks to 'N',"
 243        " according to filtering")
 244     pa("--reset-marks", action="store_true", dest="reset_marks",
 245        help="rewrite the input files, changing all validation marks to 'U'")
 246     pa("--copyright", action="store_true", dest="copyright",
 247        help="print copyright and exit")
 248     (options, args) = parser.parse_args(args=args)
 249
 250     if options.copyright:
 251         print __copyright__
 252         sys.exit(0)
 253
 254     if len(args) < 1:
 255         parser.print_help()
 256         sys.exit(1)
 257
 258     if not (0.0 <= options.fpr <= 1.0):
 259         error("--fpr must be within range [0.0, 1.0]")
 260     if options.mark and options.reset_marks:
 261         error("only one of --mark and --reset-marks may be specified")
 262
 263     if options.reset_marks:
 264         reset_marks(options, args)
 265         return
 266
 267     z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix, args)
 268
 269     thresholds = calculate_combined_thresholds(options, z_scores)
 270
 271     if options.debug:
 272         pprint(thresholds)
 273
 274     if options.mark:
 275         mark(options, thresholds, spectrum_scores, args)
 276
 277
 278 if __name__ == '__main__':
 279     main()