sqt_filter.py

   1 #!/usr/bin/env greylag-python
   2
   3 """
   4 Filter a set of sqt files according to FDR and other criteria, optionally
   5 rewriting them with the validation marks set to 'N' for filtered-out spectra.
   6
   7 """
   8
   9 from __future__ import with_statement
  10
  11 __copyright__ = '''
  12     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  13
  14     This program is free software; you can redistribute it and/or modify
  15     it under the terms of the GNU General Public License as published by
  16     the Free Software Foundation; either version 2 of the License, or
  17     (at your option) any later version.
  18
  19     This program is distributed in the hope that it will be useful,
  20     but WITHOUT ANY WARRANTY; without even the implied warranty of
  21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22     GNU General Public License for more details.
  23
  24     You should have received a copy of the GNU General Public License along
  25     with this program; if not, write to the Free Software Foundation, Inc.,
  26     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  27 '''
  28
  29 __version__ = "0.0"
  30
  31
  32 from collections import defaultdict
  33 import fileinput
  34 import optparse
  35 from pprint import pprint
  36 import sys
  37
  38
  39 def warn(s):
  40     print >> sys.stderr, 'warning:', s
  41 def error(s):
  42     sys.exit('error: ' + s)
  43 def fileerror(s, *args):
  44     error(s + (", at line %s of file '%s'"
  45                % (fileinput.filelineno(), fileinput.filename())),
  46           *args)
  47
  48 def inplace_warning():
  49     warn("!!!\nan error occurred while modifying .sqt files in-place--it may"
  50          " be necessary to recover some or all of the .sqt files from the"
  51          " corresponding .sqt.bak files.\n!!!")
  52
  53
  54 def reset_marks(options, sqt_fns):
  55     """Rewrite all evaluation marks to 'U', in-place."""
  56
  57     try:
  58         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
  59             if line.startswith("M\t"):
  60                 fs = line.split("\t")
  61                 if len(fs) >= 11:
  62                     fs[10] = 'U' + fs[10][1:]
  63                 line = '\t'.join(fs)
  64             sys.stdout.write(line)
  65     except:
  66         inplace_warning()
  67         raise
  68
  69
  70 def mark(options, thresholds, sp_scores, sqt_fns):
  71     """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
  72     score and delta thresholds."""
  73
  74     spectrum_no = -1
  75     mark_spectrum = False
  76
  77     try:
  78         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
  79             if line.startswith("S\t"):
  80                 spectrum_no += 1
  81                 charge, score, delta = sp_scores[spectrum_no]
  82                 mark_spectrum = (charge in thresholds
  83                                  and score != None
  84                                  and (score < thresholds[charge][0]
  85                                       or delta < thresholds[charge][1]))
  86             elif line.startswith("M\t") and mark_spectrum:
  87                 fs = line.split("\t")
  88                 if len(fs) >= 11:
  89                     fs[10] = 'N' + fs[10][1:]
  90                 line = '\t'.join(fs)
  91             sys.stdout.write(line)
  92     except:
  93         inplace_warning()
  94         raise
  95
  96
  97 def read_sqt_info(decoy_prefix, is_laf, sqt_fns):
  98     """Return a pair, the first a dict mapping each charge to a list of
  99     (score, delta, state), where state is 'real' or 'decoy', for all the
 100     spectra in sqt_fns, and the second a list of (charge, score, delta), one
 101     for each spectrum.
 102     """
 103
 104     # charge -> [ (score, delta, state), ... ]
 105     #   where state is either 'real' or 'decoy'
 106     z_scores = defaultdict(list)
 107
 108     # [ (charge, score, delta), ... ]
 109     sp_scores = []
 110
 111     current_charge = None
 112     current_score = None
 113     current_delta = 0
 114     current_sp_rank = None
 115     current_peptide_length = None
 116     current_state = set()
 117
 118     for line in fileinput.input(sqt_fns):
 119         fs = line.split('\t')
 120         if fs[0] == 'S':
 121             if (current_charge != None and
 122                 (not is_laf or (current_sp_rank <= 10))):
 123                 if current_score != None and len(current_state) == 1:
 124                     z_scores[current_charge].append((current_score,
 125                                                      current_delta,
 126                                                      current_state.pop()))
 127                 if current_charge != None:
 128                     sp_scores.append((current_charge, current_score,
 129                                       current_delta))
 130             current_charge = int(fs[3])
 131             current_score = None
 132             current_delta = 0
 133             current_sp_rank = None
 134             current_peptide_length = None
 135             current_state = set()
 136         elif fs[0] == 'M':
 137             delta, score, sp_rank, pep_length = (float(fs[4]), float(fs[5]),
 138                                                  int(fs[2]), len(fs[9])-4)
 139             if delta == 0:
 140                 current_score = score
 141             elif current_delta == 0:
 142                 current_delta = delta
 143             if current_sp_rank == None:
 144                 current_sp_rank = sp_rank
 145             if current_peptide_length == None:
 146                 current_peptide_length = pep_length
 147         elif fs[0] == 'L':
 148             if current_delta == 0:
 149                 if fs[1].startswith(decoy_prefix):
 150                     current_state.add('decoy')
 151                 else:
 152                     current_state.add('real')
 153     # handle final spectrum, as above
 154     if (current_charge != None and
 155         (not is_laf or (current_sp_rank <= 10))):
 156         if current_score != None and len(current_state) == 1:
 157             z_scores[current_charge].append((current_score, current_delta,
 158                                              current_state.pop()))
 159         if current_charge != None:
 160             sp_scores.append((current_charge, current_score, current_delta))
 161
 162     return (z_scores, sp_scores)
 163
 164
 165 def specificity(positives, negatives):
 166     return (float(positives - negatives)
 167             / (positives + negatives))
 168
 169
 170 def calculate_inner_threshold(specificity_goal, charge, spinfo):
 171     spinfo = sorted(spinfo, key=lambda x: x[0])
 172
 173     real_count = sum(1 for x in spinfo if x[-1] == 'real')
 174     decoy_count = len(spinfo) - real_count
 175
 176     if real_count == 0:
 177         return (None, real_count, decoy_count) # give up
 178
 179     current_threshold = -1e100      # allow all spectra
 180     for n, sp in enumerate(spinfo):
 181         specificity_est = specificity(real_count, decoy_count)
 182         if specificity_est >= specificity_goal:
 183             break
 184         if sp[-1] == 'real':
 185             real_count -= 1
 186         else:
 187             decoy_count -= 1
 188         # set threshold just high enough to exclude this spectrum
 189         current_threshold = sp[0] + 1e-6
 190     else:
 191         current_threshold = spinfo[-1][0] + 1e-6 # couldn't meet goal
 192
 193     return (current_threshold, real_count, decoy_count)
 194
 195
 196 def calculate_combined_thresholds(options, z_scores):
 197     """Find best score/delta thresholds for each charge."""
 198
 199     specificity_goal = 1 - options.fdr
 200
 201     # Rather than search every possible value of delta, we're only going to
 202     # "sample" at this granularity.  This cuts search time dramatically (and
 203     # making it O(n) instead of O(n**2).  Extra precision wouldn't really be
 204     # useful in any case.
 205     SEARCH_GRANULARITY = 0.001
 206
 207     # charge -> (score, delta, passing_reals, passing_decoys)
 208     thresholds = {}
 209
 210     for charge, spinfo in z_scores.iteritems():
 211         spinfo0 = sorted(spinfo, key=lambda x: x[1], reverse=True)
 212
 213         last_value = None
 214
 215         while spinfo0:
 216             this_value = spinfo0[-1][1] # current delta
 217             if (last_value == None
 218                 or abs(this_value - last_value) >= SEARCH_GRANULARITY):
 219
 220                 # "inner" is score
 221                 r = calculate_inner_threshold(specificity_goal, charge,
 222                                               spinfo0)
 223                 if r[0] != None:
 224                     if options.debug:
 225                         print '#', charge, r[0], this_value, r[1], r[2]
 226                     if (charge not in thresholds
 227                         or r[1] > thresholds[charge][2]):
 228                         thresholds[charge] = (r[0], this_value, r[1], r[2])
 229
 230                 last_value = this_value
 231             spinfo0.pop()
 232
 233         if charge in thresholds:
 234             if options.verbose:
 235                 print ("%+d: score %s, delta %s -> %s real ids (fdr %.4f)"
 236                        % (charge, thresholds[charge][0], thresholds[charge][1],
 237                           thresholds[charge][2],
 238                           1 - specificity(thresholds[charge][2],
 239                                           thresholds[charge][3])))
 240         else:
 241             warn("could not calculate thresholds for %+d" % charge)
 242
 243     return thresholds
 244
 245
 246 def main(args=sys.argv[1:]):
 247     parser = optparse.OptionParser(usage=
 248                                    "usage: %prog [options] <sqt-file>...",
 249                                    description=__doc__, version=__version__)
 250     pa = parser.add_option
 251     pa("--decoy-prefix", dest="decoy_prefix", default="SHUFFLED_",
 252        help='prefix given to locus name of decoy (e.g., shuffled) database'
 253        ' sequences [default="SHUFFLED_"]', metavar="PREFIX")
 254     pa("--fdr", dest="fdr", type="float", default="0.02",
 255        help="false discovery rate [default=0.02]", metavar="PROPORTION")
 256     pa("-v", "--verbose", action="store_true", dest="verbose",
 257        help="be verbose")
 258     pa("--laf", action="store_true", dest="laf",
 259        help="drop peptides of length < 7 or Sp rank > 10")
 260     pa("--debug", action="store_true", dest="debug",
 261        help="show debug output")
 262     pa("-m", "--mark", action="store_true", dest="mark",
 263        help="rewrite the input files, changing some validation marks to 'N',"
 264        " according to filtering")
 265     pa("--reset-marks", action="store_true", dest="reset_marks",
 266        help="rewrite the input files, changing all validation marks to 'U'")
 267     pa("--copyright", action="store_true", dest="copyright",
 268        help="print copyright and exit")
 269     (options, args) = parser.parse_args(args=args)
 270
 271     if options.copyright:
 272         print __copyright__
 273         sys.exit(0)
 274
 275     if len(args) < 1:
 276         parser.print_help()
 277         sys.exit(1)
 278
 279     if not (0.0 <= options.fdr <= 1.0):
 280         error("--fdr must be within range [0.0, 1.0]")
 281     if options.mark and options.reset_marks:
 282         error("only one of --mark and --reset-marks may be specified")
 283
 284     if options.reset_marks:
 285         reset_marks(options, args)
 286         return
 287
 288     z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix,
 289                                               options.laf, args)
 290
 291     thresholds = calculate_combined_thresholds(options, z_scores)
 292
 293     if options.debug:
 294         pprint(thresholds)
 295
 296         for charge, score, delta in spectrum_scores:
 297             print '#S', charge, score, delta
 298
 299     if options.mark:
 300         mark(options, thresholds, spectrum_scores, args)
 301
 302
 303 if __name__ == '__main__':
 304     main()