greylag_validate.py

   1 #!/usr/bin/env python
   2
   3 """
   4 Given a set of sqt files, determine a 'valid' set of identifications that
   5 satisfy the specified FDR (false discovery rate).  Optionally rewrite the sqt
   6 files with validation marks set to 'N' for filtered-out spectra.
   7
   8 Note that a peptide end with an invalid flanking residue (currently any of
   9 BJOUXZ) is not considered tryptic if it's an N-terminal end and is considered
  10 tryptic if it's a C-terminal end and otherwise qualifies.  (Flanking '-' is
  11 always considered tryptic.)
  12
  13 """
  14
  15 from __future__ import with_statement
  16
  17 __copyright__ = '''
  18     greylag, a collection of programs for MS/MS protein analysis
  19     Copyright (C) 2006-2008  Stowers Institute for Medical Research
  20
  21     This program is free software: you can redistribute it and/or modify
  22     it under the terms of the GNU General Public License as published by
  23     the Free Software Foundation, either version 3 of the License, or
  24     (at your option) any later version.
  25
  26     This program is distributed in the hope that it will be useful,
  27     but WITHOUT ANY WARRANTY; without even the implied warranty of
  28     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  29     GNU General Public License for more details.
  30
  31     You should have received a copy of the GNU General Public License
  32     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  33
  34     Contact: Mike Coleman
  35              Stowers Institute for Medical Research
  36              1000 East 50th Street
  37              Kansas City, Missouri  64110
  38              USA
  39 '''
  40
  41
  42 from collections import defaultdict
  43 import fileinput
  44 import optparse
  45 from pprint import pprint
  46 import string
  47 import sys
  48
  49
  50 # allow this script to be run even if not "installed"
  51 try:
  52     from greylag import VERSION
  53 except:
  54     VERSION = 'unknown'
  55
  56
  57 def warn(s):
  58     print >> sys.stderr, 'warning:', s
  59 def error(s):
  60     sys.exit('error: ' + s)
  61 def fileerror(s, *args):
  62     error(s + (", at line %s of file '%s'"
  63                % (fileinput.filelineno(), fileinput.filename())),
  64           *args)
  65
  66 def inplace_warning():
  67     warn("\n!!!\nan error occurred while modifying .sqt files in-place--it may"
  68          " be necessary to recover some or all of the .sqt files from the"
  69          " corresponding .sqt.bak files.\n!!!\n")
  70
  71
  72 def reset_marks(options, sqt_fns):
  73     """Rewrite all evaluation marks to 'U', in-place."""
  74
  75     try:
  76         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
  77             if line.startswith("M\t"):
  78                 fs = line.split("\t")
  79                 if len(fs) >= 11:
  80                     fs[10] = 'U' + fs[10][1:]
  81                 line = '\t'.join(fs)
  82             sys.stdout.write(line)
  83     except:
  84         inplace_warning()
  85         raise
  86
  87
  88 def mark(options, thresholds, sp_scores, sqt_fns):
  89     """Rewrite evaluation marks to 'N', in-place, for spectra not meeting
  90     score and delta thresholds."""
  91
  92     spectrum_no = -1
  93     mark_spectrum = False               # mark this spectrum 'N'
  94
  95     try:
  96         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
  97             if line.startswith("S\t"):
  98                 spectrum_no += 1
  99                 sps = sp_scores[spectrum_no]
 100                 if sps:
 101                     charge, score, delta = sp_scores[spectrum_no]
 102                     mark_spectrum = (charge in thresholds
 103                                      and score != None
 104                                      and (score < thresholds[charge][0]
 105                                           or delta < thresholds[charge][1]))
 106                 else:
 107                     mark_spectrum = True
 108             elif line.startswith("M\t") and mark_spectrum:
 109                 fs = line.split("\t")
 110                 if len(fs) >= 11:
 111                     fs[10] = 'N' + fs[10][1:]
 112                 line = '\t'.join(fs)
 113             sys.stdout.write(line)
 114     except:
 115         inplace_warning()
 116         raise
 117
 118
 119 def kill(options, thresholds, sp_scores, sqt_fns):
 120     """Remove, in-place, spectra not meeting score and delta thresholds."""
 121
 122     spectrum_no = -1
 123     remove_spectrum = False             # remove this spectrum
 124
 125     try:
 126         for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'):
 127             if fileinput.isfirstline():
 128                 remove_spectrum = False
 129             if line.startswith("S\t"):
 130                 spectrum_no += 1
 131                 sps = sp_scores[spectrum_no]
 132                 if sps:
 133                     charge, score, delta = sp_scores[spectrum_no]
 134                     remove_spectrum = (charge in thresholds
 135                                        and score != None
 136                                        and (score < thresholds[charge][0]
 137                                             or delta < thresholds[charge][1]))
 138                 else:
 139                     remove_spectrum = True
 140             if not remove_spectrum:
 141                 sys.stdout.write(line)
 142     except:
 143         inplace_warning()
 144         raise
 145
 146
 147 nulltrans = string.maketrans('','')
 148 non_aa_chars = string.digits + string.punctuation
 149
 150 def stripmods(s):
 151     """Given a peptide, return just the unmodified peptide seen (upcased)."""
 152     # assuming no control chars present
 153     return s.upper().translate(nulltrans, non_aa_chars)
 154
 155
 156 def read_sqt_info(decoy_prefix, minimum_trypticity, sqt_fns):
 157     """Return a pair, the first a dict mapping each charge to a list of
 158     (score, delta, state), where state is 'real' or 'decoy', for all the
 159     spectra in sqt_fns, and the second a list of (charge, score, delta), one
 160     for each spectrum.  Spectra lacking minimum_trypticity are dropped.
 161     """
 162
 163     # charge -> [ (score, delta, state, spectrum_name, peptide), ... ]
 164     #   where state is either 'real' or 'decoy'
 165     #     and peptide is the peptide as first seen, with mods, no flanks
 166     z_scores = defaultdict(list)
 167
 168     # information about spectra, in file order--None iff spectrum was filtered
 169     # out
 170     # [ (charge, score, delta) or None, ... ]
 171     sp_scores = []
 172
 173     current_charge = None
 174     current_score = None
 175     current_delta = 0
 176     current_peptide_trypticity = None   # or one of [ 0, 1, 2 ]
 177     current_state = set()               # 'real', 'decoy' or both
 178     highest_rank_seen = 0
 179     best_peptide_seen = None
 180     current_peptide = None
 181     spectrum_name = None
 182
 183     for line in fileinput.input(sqt_fns):
 184         fs = line.split('\t')
 185         if fs[0] == 'S':
 186             sps = None
 187             if (current_charge != None and
 188                 current_peptide_trypticity >= minimum_trypticity):
 189                 if current_score != None:
 190                     if len(current_state) == 1:
 191                         z_scores[current_charge].append((current_score,
 192                                                          current_delta,
 193                                                          current_state.pop(),
 194                                                          spectrum_name,
 195                                                          current_peptide))
 196                     sps = (current_charge, current_score, current_delta)
 197             if current_charge != None:
 198                 sp_scores.append(sps)
 199             current_charge = int(fs[3])
 200             current_score = None
 201             current_delta = 0
 202             current_peptide_trypticity = None
 203             current_state = set()
 204             highest_rank_seen = 0
 205             best_peptide_seen = None
 206             current_peptide = None
 207             spectrum_name = '.'.join((fileinput.filename(),
 208                                       fs[1], fs[2], fs[3]))
 209         elif fs[0] == 'M':
 210             rank = int(fs[1])
 211             delta = float(fs[4])
 212             score = float(fs[5])
 213             sp_rank = int(fs[2])
 214             peptide = fs[9].strip()     # e.g., A.B@CD*.-
 215             mass = float(fs[3])
 216
 217             if rank > highest_rank_seen + 1:
 218                 # ignore aux top SpRank hits, because delta confounds
 219                 continue
 220             highest_rank_seen = rank
 221
 222             assert peptide[1] == '.' and peptide[-2] == '.'
 223             peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
 224             peptide = peptide[2:-2]     # 'B@CD*'
 225             peptide_stripped = stripmods(peptide) # 'BCD'
 226
 227             if best_peptide_seen == None:
 228                 best_peptide_seen = peptide_stripped
 229             if current_peptide == None:
 230                 current_peptide = peptide
 231
 232             # MyriMatch gives delta=0 when score=0, even if there are better
 233             # scores!
 234             if score > 0:
 235                 if delta == 0:
 236                     assert current_delta == 0
 237                     current_score = score
 238                 elif (current_delta == 0 or best_peptide_seen == None
 239                       or best_peptide_seen == peptide_stripped):
 240                     current_delta = delta
 241                     if best_peptide_seen != peptide_stripped:
 242                         # once we've seen a non-equivalent peptide, don't set
 243                         # delta if we see further equivalent peptides
 244                         best_peptide_seen = '####'
 245             if current_peptide_trypticity == None:
 246                 current_peptide_trypticity = 0
 247                 if (peptide_flanks[0] in ('K', 'R')
 248                     and peptide_stripped[0] != 'P'
 249                     or peptide_flanks[0] == '-'):
 250                     current_peptide_trypticity += 1
 251                 if (peptide_stripped[-1] in ('K', 'R')
 252                     and peptide_flanks[1] != 'P'
 253                     or peptide_flanks[1] == '-'):
 254                     current_peptide_trypticity += 1
 255         elif fs[0] == 'L':
 256             if current_delta == 0:
 257                 if fs[1].startswith(decoy_prefix):
 258                     current_state.add('decoy')
 259                 else:
 260                     current_state.add('real')
 261
 262     # handle final spectrum, as above
 263     # FIX: factor out this common code (using a generator?)
 264     sps = None
 265     if (current_charge != None and
 266         current_peptide_trypticity >= minimum_trypticity):
 267         if current_score != None:
 268             if len(current_state) == 1:
 269                 z_scores[current_charge].append((current_score,
 270                                                  current_delta,
 271                                                  current_state.pop(),
 272                                                  spectrum_name,
 273                                                  current_peptide))
 274             sps = (current_charge, current_score, current_delta)
 275     if current_charge != None:
 276         sp_scores.append(sps)
 277     return (z_scores, sp_scores)
 278
 279
 280 def PPV(reals, decoys):
 281     """Returns the estimated Positive Predictive Value (== 1 - FDR), given
 282     counts of reals and decoys."""
 283
 284     # We know that the decoys are false positives, and we estimate that an
 285     # equal number of the "reals" are actually false, too.
 286     false_positives = 2*decoys
 287     true_positives = reals - decoys
 288
 289     return float(true_positives) / (true_positives + false_positives)
 290
 291
 292 def calculate_inner_threshold(fdr, charge, spinfo):
 293     ppv_goal = 1 - fdr
 294
 295     spinfo = sorted(spinfo, key=lambda x: x[0])
 296
 297     epsilon = +1e-6
 298
 299     real_count = sum(1 for x in spinfo if x[2] == 'real')
 300     decoy_count = len(spinfo) - real_count
 301
 302     current_threshold = -1e100      # allow all spectra
 303     for n, sp in enumerate(spinfo):
 304         if real_count == 0:
 305             return (None, real_count, decoy_count) # give up
 306
 307         ppv_est = PPV(real_count, decoy_count)
 308         if ppv_est >= ppv_goal:
 309             break
 310         if sp[2] == 'real':
 311             real_count -= 1
 312         else:
 313             decoy_count -= 1
 314         # set threshold just high enough to exclude this spectrum
 315         current_threshold = sp[0] + epsilon
 316     else:
 317         current_threshold = spinfo[-1][0] + epsilon # couldn't meet goal
 318
 319     return (current_threshold, real_count, decoy_count)
 320
 321
 322 def calculate_combined_thresholds(options, z_scores):
 323     """Find best score/delta thresholds for each charge."""
 324
 325     ppv_goal = 1 - options.fdr
 326
 327     # Rather than search every possible value of delta, we're only going to
 328     # "sample" at this granularity.  This cuts search time dramatically (and
 329     # makes it O(n) instead of O(n**2).  Extra precision wouldn't really be
 330     # useful in any case.
 331     SEARCH_GRANULARITY = 0.001
 332
 333     # charge -> (score, delta, passing_reals, passing_decoys)
 334     thresholds = {}
 335
 336     total_reals, total_decoys = 0, 0
 337
 338     for charge, spinfo in z_scores.iteritems():
 339         spinfo0 = sorted(spinfo, key=lambda x: x[1], reverse=True)
 340
 341         last_value = None
 342
 343         while spinfo0:
 344             this_value = spinfo0[-1][1] # current delta
 345             if (last_value == None
 346                 or abs(this_value - last_value) >= SEARCH_GRANULARITY):
 347
 348                 # "inner" is score
 349                 r = calculate_inner_threshold(options.fdr, charge, spinfo0)
 350                 if r[0] != None:
 351                     if options.debug:
 352                         print '#', charge, r[0], this_value, r[1], r[2]
 353                     if (charge not in thresholds
 354                         or r[1] > thresholds[charge][2]):
 355                         thresholds[charge] = (r[0], this_value, r[1], r[2])
 356
 357                 last_value = this_value
 358             spinfo0.pop()
 359
 360         if charge in thresholds:
 361             score_threshold, delta_threshold = thresholds[charge][0:2]
 362
 363             if options.list_peptides:
 364                 for (score, delta, state,
 365                      spectrum_name, peptide) in z_scores[charge]:
 366                     if score >= score_threshold and delta >= delta_threshold:
 367                         print "%+d %s %s %s" % (charge, spectrum_name, state,
 368                                                 peptide)
 369             else:
 370                 reals, decoys = thresholds[charge][2], thresholds[charge][3]
 371                 total_reals += reals
 372                 total_decoys += decoys
 373                 print ("%+d: score %s, delta %s -> %s real ids, %s decoys"
 374                        " (fdr %.4f)"
 375                        % (charge, score_threshold, delta_threshold,
 376                           reals, decoys,
 377                           1 - PPV(thresholds[charge][2],
 378                                   thresholds[charge][3])))
 379         else:
 380             warn("could not calculate thresholds for %+d" % charge)
 381
 382     if not options.list_peptides:
 383         print "# total: %s real ids, %s decoys" % (total_reals, total_decoys)
 384
 385     return thresholds
 386
 387
 388 def main(args=sys.argv[1:]):
 389     parser = optparse.OptionParser(usage=
 390                                    "usage: %prog [options] <sqt-file>...",
 391                                    description=__doc__, version=VERSION)
 392     pa = parser.add_option
 393     DEFAULT_DECOY_PREFIX = "SHUFFLED_"
 394     pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX,
 395        help='prefix given to locus name of decoy (e.g., shuffled) database'
 396        ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX")
 397     DEFAULT_FDR = 0.01
 398     pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
 399        help="false discovery rate [default=%s] (Note that this is the"
 400        " final resulting FDR after the decoys have been removed" % DEFAULT_FDR,
 401        metavar="PROPORTION")
 402     pa("-v", "--verbose", action="store_true", dest="verbose",
 403        help="be verbose")
 404     pa("--list-peptides", action="store_true", dest="list_peptides",
 405        help="print information about passing peptides")
 406     pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
 407        default='0', dest="minimum_trypticity",
 408        help="drop peptides with too few tryptic ends"
 409        " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
 410     #pa("--graph", dest="graph",
 411     #   help='create distribution graphs, using the specified prefix',
 412     #   metavar="PATH PREFIX")
 413     pa("--debug", action="store_true", dest="debug",
 414        help="show debug output")
 415     pa("-m", "--mark", action="store_true", dest="mark",
 416        help="rewrite the input files, changing some validation marks to 'N',"
 417        " according to filtering")
 418     pa("--reset-marks", action="store_true", dest="reset_marks",
 419        help="rewrite the input files, changing all validation marks to 'U'")
 420     pa("-k", "--kill", action="store_true", dest="kill",
 421        help="rewrite the input files, removing spectra that don't pass"
 422        " validation, according to filtering")
 423     pa("--copyright", action="store_true", dest="copyright",
 424        help="print copyright and exit")
 425     (options, args) = parser.parse_args(args=args)
 426
 427     if options.copyright:
 428         print __copyright__
 429         sys.exit(0)
 430
 431     if len(args) < 1:
 432         parser.print_help()
 433         sys.exit(1)
 434
 435     if not (0.0 <= options.fdr < 0.5):
 436         error("--fdr must be within range [0.0, 0.5)")
 437     if (sum(1 for x in (options.mark, options.reset_marks, options.kill) if x)
 438         > 1):
 439         error("only one of --mark, --reset-marks, --kill may be specified")
 440
 441     if options.reset_marks:
 442         reset_marks(options, args)
 443         return
 444
 445     # This is to translate the "effective" FDR, which is the rate after decoys
 446     # have been stripped out of the results (which is what users really care
 447     # about), into our internal FDR, which includes decoys.
 448     options.fdr *= 2
 449
 450     z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix,
 451                                               int(options.minimum_trypticity),
 452                                               args)
 453
 454     if options.debug:
 455         print ('%s passing spectra, of which %s are not decoys'
 456                % (len(spectrum_scores), sum(len(z_scores[x])
 457                                             for x in z_scores)))
 458
 459     thresholds = calculate_combined_thresholds(options, z_scores)
 460
 461     if options.debug:
 462         pprint(thresholds)
 463
 464         for charge, score, delta in spectrum_scores:
 465             print '#S', charge, score, delta
 466
 467     if options.kill:
 468         kill(options, thresholds, spectrum_scores, args)
 469
 470     if options.mark:
 471         mark(options, thresholds, spectrum_scores, args)
 472
 473
 474 if __name__ == '__main__':
 475     main()