greylag_validate.py

   1 #!/usr/bin/env greylag-python
   2
   3 """
   4 Given a set of sqt files, determine a 'valid' set of identifications that
   5 satisfy the specified FDR (false discovery rate).  Optionally rewrite the sqt
   6 files with validation marks set to 'N' for filtered-out spectra.
   7
   8 """
   9
  10 from __future__ import with_statement
  11
  12 __copyright__ = '''
  13     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  14
  15     This program is free software; you can redistribute it and/or modify
  16     it under the terms of the GNU General Public License as published by
  17     the Free Software Foundation; either version 2 of the License, or
  18     (at your option) any later version.
  19
  20     This program is distributed in the hope that it will be useful,
  21     but WITHOUT ANY WARRANTY; without even the implied warranty of
  22     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23     GNU General Public License for more details.
  24
  25     You should have received a copy of the GNU General Public License along
  26     with this program; if not, write to the Free Software Foundation, Inc.,
  27     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  28 '''
  29
  30 __version__ = "0.0"
  31
  32
  33 from collections import defaultdict
  34 import fileinput
  35 import optparse
  36 from pprint import pprint
  37 import string
  38 import sys
  39
  40
  41 def warn(s):
  42     print >> sys.stderr, 'warning:', s
  43 def error(s):
  44     sys.exit('error: ' + s)
  45 def fileerror(s, *args):
  46     error(s + (", at line %s of file '%s'"
  47                % (fileinput.filelineno(), fileinput.filename())),
  48           *args)
  49
  50 def inplace_warning():
  51     warn("!!!\nan error occurred while modifying .sqt files in-place--it may"
  52          " be necessary to recover some or all of the .sqt files from the"
  53          " corresponding .sqt.bak files.\n!!!")
  54
  55
  56 def reset_marks(options, sqt_fns):
  57     """Rewrite all evaluation marks to 'U', in-place."""
  58
  59     try:
  60         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
  61             if line.startswith("M\t"):
  62                 fs = line.split("\t")
  63                 if len(fs) >= 11:
  64                     fs[10] = 'U' + fs[10][1:]
  65                 line = '\t'.join(fs)
  66             sys.stdout.write(line)
  67     except:
  68         inplace_warning()
  69         raise
  70
  71
  72 def mark(options, thresholds, sp_scores, sqt_fns):
  73     """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
  74     score and delta thresholds."""
  75
  76     spectrum_no = -1
  77     mark_spectrum = False
  78
  79     try:
  80         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
  81             if line.startswith("S\t"):
  82                 spectrum_no += 1
  83                 charge, score, delta = sp_scores[spectrum_no]
  84                 mark_spectrum = (charge in thresholds
  85                                  and score != None
  86                                  and (score < thresholds[charge][0]
  87                                       or delta < thresholds[charge][1]))
  88             elif line.startswith("M\t") and mark_spectrum:
  89                 fs = line.split("\t")
  90                 if len(fs) >= 11:
  91                     fs[10] = 'N' + fs[10][1:]
  92                 line = '\t'.join(fs)
  93             sys.stdout.write(line)
  94     except:
  95         inplace_warning()
  96         raise
  97
  98
  99 nulltrans = string.maketrans('','')
 100 non_aa_chars = string.digits + string.punctuation
 101
 102 def stripmods(s):
 103     """Given a peptide, return just the unmodified peptide seen (upcased)."""
 104     # assuming no control chars present
 105     return s.upper().translate(nulltrans, non_aa_chars)
 106
 107
 108 def read_sqt_info(decoy_prefix, minimum_trypticity, sqt_fns):
 109     """Return a pair, the first a dict mapping each charge to a list of
 110     (score, delta, state), where state is 'real' or 'decoy', for all the
 111     spectra in sqt_fns, and the second a list of (charge, score, delta), one
 112     for each spectrum.  Spectra lacking minimum_trypticity are dropped.
 113     """
 114
 115     # charge -> [ (score, delta, state), ... ]
 116     #   where state is either 'real' or 'decoy'
 117     z_scores = defaultdict(list)
 118
 119     # [ (charge, score, delta), ... ]
 120     sp_scores = []
 121
 122     current_charge = None
 123     current_score = None
 124     current_delta = 0
 125     current_peptide_trypticity = None   # or one of [ 0, 1, 2 ]
 126     current_state = set()               # 'real', 'decoy' or both
 127     highest_rank_seen = 0
 128     best_peptide_seen = None
 129
 130     for line in fileinput.input(sqt_fns):
 131         fs = line.split('\t')
 132         if fs[0] == 'S':
 133             if (current_charge != None and
 134                 current_peptide_trypticity >= minimum_trypticity):
 135                 if current_score != None:
 136                     if len(current_state) == 1:
 137                         z_scores[current_charge].append((current_score,
 138                                                          current_delta,
 139                                                          current_state.pop()))
 140                     sp_scores.append((current_charge, current_score,
 141                                       current_delta))
 142             current_charge = int(fs[3])
 143             current_score = None
 144             current_delta = 0
 145             current_peptide_trypticity = None
 146             current_state = set()
 147             highest_rank_seen = 0
 148             best_peptide_seen = None
 149         elif fs[0] == 'M':
 150             rank = int(fs[1])
 151             delta = float(fs[4])
 152             score = float(fs[5])
 153             sp_rank = int(fs[2])
 154             peptide = fs[9].strip()     # e.g., A.B@CD*.-
 155             mass = float(fs[3])
 156
 157             if rank > highest_rank_seen + 1:
 158                 # ignore aux top SpRank hits, because delta confounds
 159                 continue
 160             highest_rank_seen = rank
 161
 162             assert peptide[1] == '.' and peptide[-2] == '.'
 163             peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
 164             peptide = peptide[2:-2]     # 'B@CD*'
 165             peptide_stripped = stripmods(peptide) # 'BCD'
 166
 167             if best_peptide_seen == None:
 168                 best_peptide_seen = peptide_stripped
 169
 170             if delta == 0:
 171                 assert current_delta == 0
 172                 current_score = score
 173             elif (current_delta == 0 or best_peptide_seen == None
 174                   or best_peptide_seen == peptide_stripped):
 175                 current_delta = delta
 176                 if best_peptide_seen != peptide_stripped:
 177                     # once we've seen a non-equivalent peptide, don't set
 178                     # delta if we see further equivalent peptides
 179                     best_peptide_seen = '####'
 180             if current_peptide_trypticity == None:
 181                 current_peptide_trypticity = 0
 182                 if (peptide_flanks[0] in ('K', 'R')
 183                     and peptide_stripped[0] != 'P'):
 184                     current_peptide_trypticity += 1
 185                 if (peptide_stripped[-1] in ('K', 'R')
 186                     and peptide_flanks[1] != 'P'):
 187                     current_peptide_trypticity += 1
 188         elif fs[0] == 'L':
 189             if current_delta == 0:
 190                 if fs[1].startswith(decoy_prefix):
 191                     current_state.add('decoy')
 192                 else:
 193                     current_state.add('real')
 194
 195     # handle final spectrum, as above
 196     if (current_charge != None and
 197         current_peptide_trypticity >= minimum_trypticity):
 198         if current_score != None:
 199             if len(current_state) == 1:
 200                 z_scores[current_charge].append((current_score,
 201                                                  current_delta,
 202                                                  current_state.pop()))
 203             sp_scores.append((current_charge, current_score,
 204                               current_delta))
 205
 206     return (z_scores, sp_scores)
 207
 208
 209 def specificity(positives, negatives):
 210     return float(positives - negatives) / positives
 211
 212
 213 def calculate_inner_threshold(fdr, charge, spinfo):
 214     specificity_goal = 1 - fdr
 215
 216     spinfo = sorted(spinfo, key=lambda x: x[0])
 217
 218     epsilon = -1e-9
 219
 220     real_count = sum(1 for x in spinfo if x[-1] == 'real')
 221     decoy_count = len(spinfo) - real_count
 222
 223     current_threshold = -1e100      # allow all spectra
 224     for n, sp in enumerate(spinfo):
 225         if real_count == 0:
 226             return (None, real_count, decoy_count) # give up
 227
 228         specificity_est = specificity(real_count, decoy_count)
 229         if specificity_est >= specificity_goal:
 230             break
 231         if sp[-1] == 'real':
 232             real_count -= 1
 233         else:
 234             decoy_count -= 1
 235         # set threshold just high enough to exclude this spectrum
 236         current_threshold = sp[0] + epsilon
 237     else:
 238         current_threshold = spinfo[-1][0] + epsilon # couldn't meet goal
 239
 240     return (current_threshold, real_count, decoy_count)
 241
 242
 243 def calculate_combined_thresholds(options, z_scores):
 244     """Find best score/delta thresholds for each charge."""
 245
 246     specificity_goal = 1 - options.fdr
 247
 248     # Rather than search every possible value of delta, we're only going to
 249     # "sample" at this granularity.  This cuts search time dramatically (and
 250     # makes it O(n) instead of O(n**2).  Extra precision wouldn't really be
 251     # useful in any case.
 252     SEARCH_GRANULARITY = 0.001
 253
 254     # charge -> (score, delta, passing_reals, passing_decoys)
 255     thresholds = {}
 256
 257     for charge, spinfo in z_scores.iteritems():
 258         spinfo0 = sorted(spinfo, key=lambda x: x[1], reverse=True)
 259
 260         last_value = None
 261
 262         while spinfo0:
 263             this_value = spinfo0[-1][1] # current delta
 264             if (last_value == None
 265                 or abs(this_value - last_value) >= SEARCH_GRANULARITY):
 266
 267                 # "inner" is score
 268                 r = calculate_inner_threshold(options.fdr, charge, spinfo0)
 269                 if r[0] != None:
 270                     if options.debug:
 271                         print '#', charge, r[0], this_value, r[1], r[2]
 272                     if (charge not in thresholds
 273                         or r[1] > thresholds[charge][2]):
 274                         thresholds[charge] = (r[0], this_value, r[1], r[2])
 275
 276                 last_value = this_value
 277             spinfo0.pop()
 278
 279         if charge in thresholds:
 280             print ("%+d: score %s, delta %s -> %s real ids, %s decoys"
 281                    " (fdr %.4f)"
 282                    % (charge, thresholds[charge][0], thresholds[charge][1],
 283                       thresholds[charge][2], thresholds[charge][3],
 284                       1 - specificity(thresholds[charge][2],
 285                                       thresholds[charge][3])))
 286         else:
 287             warn("could not calculate thresholds for %+d" % charge)
 288
 289     return thresholds
 290
 291
 292 def main(args=sys.argv[1:]):
 293     parser = optparse.OptionParser(usage=
 294                                    "usage: %prog [options] <sqt-file>...",
 295                                    description=__doc__, version=__version__)
 296     pa = parser.add_option
 297     DEFAULT_DECOY_PREFIX = "SHUFFLED_"
 298     pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX,
 299        help='prefix given to locus name of decoy (e.g., shuffled) database'
 300        ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX")
 301     DEFAULT_FDR = 0.02
 302     pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
 303        help="false discovery rate [default=%s]" % DEFAULT_FDR,
 304        metavar="PROPORTION")
 305     pa("-v", "--verbose", action="store_true", dest="verbose",
 306        help="be verbose")
 307     pa("--list-peptides", action="store_true", dest="list_peptides",
 308        help="print information about passing peptides")
 309     pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
 310        default='0', dest="minimum_trypticity",
 311        help="drop peptides with too few tryptic ends"
 312        " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
 313     #pa("--graph", dest="graph",
 314     #   help='create distribution graphs, using the specified prefix',
 315     #   metavar="PATH PREFIX")
 316     pa("--debug", action="store_true", dest="debug",
 317        help="show debug output")
 318     pa("-m", "--mark", action="store_true", dest="mark",
 319        help="rewrite the input files, changing some validation marks to 'N',"
 320        " according to filtering")
 321     pa("--reset-marks", action="store_true", dest="reset_marks",
 322        help="rewrite the input files, changing all validation marks to 'U'")
 323     pa("--copyright", action="store_true", dest="copyright",
 324        help="print copyright and exit")
 325     (options, args) = parser.parse_args(args=args)
 326
 327     if options.copyright:
 328         print __copyright__
 329         sys.exit(0)
 330
 331     if len(args) < 1:
 332         parser.print_help()
 333         sys.exit(1)
 334
 335     if not (0.0 <= options.fdr <= 1.0):
 336         error("--fdr must be within range [0.0, 1.0]")
 337     if options.mark and options.reset_marks:
 338         error("only one of --mark and --reset-marks may be specified")
 339
 340     if options.reset_marks:
 341         reset_marks(options, args)
 342         return
 343
 344     z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix,
 345                                               int(options.minimum_trypticity),
 346                                               args)
 347
 348     if options.debug:
 349         print ('%s passing spectra, of which %s are not decoys'
 350                % (len(spectrum_scores), sum(len(z_scores[x])
 351                                             for x in z_scores)))
 352
 353     thresholds = calculate_combined_thresholds(options, z_scores)
 354
 355     if options.debug:
 356         pprint(thresholds)
 357
 358         for charge, score, delta in spectrum_scores:
 359             print '#S', charge, score, delta
 360
 361     if options.mark:
 362         mark(options, thresholds, spectrum_scores, args)
 363
 364
 365 if __name__ == '__main__':
 366     main()