validate: add --no-saturation flag, back to binary search of FDR, bug fixes
[greylag.git] / greylag_validate.py
blob8c7fc8b74eaf2b876e97c7be5adab46875969446
1 #!/usr/bin/env python
3 """
4 Given a set of sqt files, determine a 'valid' set of identifications that
5 satisfy the specified FDR (false discovery rate) and other specified criteria.
6 Optionally rewrite the sqt files with invalid spectra marked 'N' or deleted
7 altogether.
9 Note that a peptide end with an invalid flanking residue (currently any of
10 BJOUXZ) is not considered tryptic if it's an N-terminal end and is considered
11 tryptic if it's a C-terminal end and otherwise qualifies. (Flanking '-' is
12 always considered tryptic.)
14 """
16 from __future__ import with_statement
18 __copyright__ = '''
19 greylag, a collection of programs for MS/MS protein analysis
20 Copyright (C) 2006-2008 Stowers Institute for Medical Research
22 This program is free software: you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation, either version 3 of the License, or
25 (at your option) any later version.
27 This program is distributed in the hope that it will be useful,
28 but WITHOUT ANY WARRANTY; without even the implied warranty of
29 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 GNU General Public License for more details.
32 You should have received a copy of the GNU General Public License
33 along with this program. If not, see <http://www.gnu.org/licenses/>.
35 Contact: Mike Coleman
36 Stowers Institute for Medical Research
37 1000 East 50th Street
38 Kansas City, Missouri 64110
39 USA
40 '''
43 from collections import defaultdict
44 import itertools
45 import optparse
46 import os
47 from pprint import pprint
48 import re
49 import string
50 import sys
51 import time
# allow this script to be run even if not "installed"
try:
    from greylag import VERSION
except ImportError:
    # greylag package not importable; fall back to a placeholder version
    VERSION = 'unknown'
def warn(s):
    """Print a warning message to stderr."""
    sys.stderr.write('warning: %s\n' % s)
def error(s):
    """Abort the program with an error message (exit status via SystemExit)."""
    raise SystemExit('error: ' + s)
def generate_spectra_from_files(sqt_filenames):
    """Yield tuples (filename, H+_lines, S_line, [(M_line, mod_lines, L_lines),
    ...]) for the given SQT files.  H+_lines, for example, is a list
    containing H lines and possibly other kinds of lines up to the next S
    line.  (mod_lines are non-L lines following an M-line.)
    """
    for fn in sqt_filenames:
        with open(fn) as sqtf:
            # spectra are labeled with the filename minus its extension
            base = os.path.splitext(fn)[0]
            header_lines = []
            current_S = None
            current_MLs = []
            for raw in sqtf:
                if raw.startswith('L\t'):
                    # locus line: attach to the most recent M entry
                    current_MLs[-1][2].append(raw)
                elif raw.startswith('M\t'):
                    # start a new match entry: (M line, mod lines, L lines)
                    current_MLs.append((raw, [], []))
                elif raw.startswith('S\t'):
                    # new spectrum: flush the previous one, if any
                    if current_S is not None:
                        yield (base, header_lines, current_S, current_MLs)
                        current_MLs = []
                    current_S = raw
                elif current_S is None:
                    # header region: everything up to the first S line
                    header_lines.append(raw)
                else:
                    # some other kind of line following an M line
                    current_MLs[-1][1].append(raw)
            if current_S is not None:
                yield (base, header_lines, current_S, current_MLs)
def write_spectra_to_files(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', write them to the implied files.  If any of
    those files already exist, they are renamed with a '.bak' suffix.  Temp
    files are written first, then renamed at the end.
    """
    # unique-ish suffix so a concurrent/previous run's temp files aren't hit
    tmp_suffix = '.tmp.%s' % time.time()

    # filename -> open temp file handle for that filename
    filemap = {}

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        sqtf = filemap.get(sqt_filename)
        if sqtf is None:
            # first spectrum for this file: open temp file, write headers once
            sqtf = filemap[sqt_filename] = open(sqt_filename + tmp_suffix, 'w')
            for h in H_lines:
                sqtf.write(h)
        sqtf.write(S_line)
        for m_line, mod_lines, l_lines in ML_lines:
            sqtf.write(m_line)
            for m in mod_lines:
                sqtf.write(m)
            for l in l_lines:
                sqtf.write(l)

    for fd in filemap.values():
        fd.close()
    # atomically (per-file) move temp files into place, backing up originals
    for fn in filemap:
        if os.path.exists(fn):
            os.rename(fn, fn + '.bak')
        os.rename(fn + tmp_suffix, fn)
def reset_marks(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) reset to U.
    """

    def _remark(m_line):
        # replace the final non-whitespace char (the validation mark) with U
        return re.sub(r'\S(\s*)$', r'U\1', m_line, count=1)

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        remarked = [ (_remark(m), mods, loci)
                     for m, mods, loci in ML_lines ]
        yield (sqt_filename, H_lines, S_line, remarked)
def set_marks(options, valid_spectrum_info, spectra, kill=False):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) set to 'N', for spectra not listed in
    valid_spectrum_info.  If 'kill', omit marked spectra altogether.
    """

    def _mark_invalid(m_line):
        # replace the final non-whitespace char (the validation mark) with N
        return re.sub(r'\S(\s*)$', r'N\1', m_line, count=1)

    passing = frozenset(si[0] for si in valid_spectrum_info)

    for number, (sqt_filename, H_lines, S_line, ML_lines) in enumerate(spectra):
        invalid = number not in passing
        if invalid and kill:
            continue                    # drop the spectrum entirely
        if invalid:
            ML_lines = [ (_mark_invalid(m), mods, loci)
                         for m, mods, loci in ML_lines ]
        yield (sqt_filename, H_lines, S_line, ML_lines)
# Helpers for stripping modification characters from peptide strings, using
# the Python 2 str.translate(table, deletechars) API.
nulltrans = string.maketrans('','')
# characters that cannot be amino-acid codes: digits and punctuation
# (e.g. modification marks like '@', '*', '#')
non_aa_chars = string.digits + string.punctuation
def strip_mods(s):
    """Given a peptide, return just the unmodified peptide seen."""
    # Drop anything that cannot be an amino-acid code: digits and punctuation
    # (modification marks).  Control characters are assumed absent.
    non_aa = string.digits + string.punctuation
    return ''.join(c for c in s if c not in non_aa)
def get_spectrum_info(decoy_prefix, minimum_trypticity, spectra):
    """Return a list of tuples

        (spectrum_no, spectrum_name, charge, score, delta, state,
         peptide, stripped_peptide, actual_mass, mass_delta, loci)

    one per surviving spectrum, given a generator of tuples 'spectra' of the
    form generated by 'generate_spectra_from_files'.  'state' is 'real',
    'decoy', or 'both', according to whether the loci seen for the top-peptide
    M lines are all non-decoy, all decoy, or a mixture; 'loci' is a frozenset
    of those locus names.  Spectra lacking minimum_trypticity, spectra whose
    top hit has non-positive score, and spectra for which no delta could be
    determined are dropped.
    """

    ## charge -> [ (score, delta, state), ... ]
    ## where state is either 'real' or 'decoy'
    #z_scores = defaultdict(list)

    ## information about spectra, in file order--None iff spectrum was filtered
    ## out
    ## [ (charge, score, delta) or None, ... ]
    #sp_scores = []

    # [ (spectrum_no, charge, score, delta, state, stripped_peptide), ... ]
    spectrum_info = []

    for spectrum_no, (sqt_filename,
                      H_lines, S_line, ML_lines) in enumerate(spectra):
        # S line fields: [1:4] are the spectrum id fields, [3] the charge,
        # [6] the observed (actual) mass
        S_fields = S_line.split('\t')
        current_charge = int(S_fields[3])
        actual_mass = float(S_fields[6])
        spectrum_name = '.'.join([sqt_filename] + S_fields[1:4])

        current_score = None
        current_delta = None
        current_peptide_trypticity = None
        current_state = set()
        current_loci = set()
        highest_rank_seen = 0
        best_peptide_seen = None        # (unused)
        current_peptide = None

        for M_line, mod_lines, L_lines in ML_lines:
            M_fields = M_line.split('\t')

            rank = int(M_fields[1])
            predicted_mass = float(M_fields[3])
            score = float(M_fields[5])
            peptide = M_fields[9].strip().upper() # e.g., A.B@CD*.-
            #delta = float(M_fields[4])
            #sp_rank = int(M_fields[2])
            assert score >= 0 and rank >= 1

            if rank > highest_rank_seen + 1:
                # ignore aux top SpRank hits, because delta confounds
                break
            highest_rank_seen = rank

            if current_score == None:
                if score <= 0:
                    break           # spectrum is total crap
                current_score = score

            # Don't trust delta from input file.
            delta = (current_score - score) / current_score

            # peptide is flank.sequence.flank, e.g. 'A.B@CD*.-'
            assert peptide[1] == '.' and peptide[-2] == '.'
            peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
            peptide = peptide[2:-2]                    # 'B@CD*'

            if current_peptide == None:
                # first (top) M line: remember its peptide and score its
                # trypticity (0, 1, or 2 tryptic ends)
                current_peptide = peptide
                current_peptide_trypticity = 0
                # NB: K*, R* not tryptic!
                if (peptide_flanks[0] in ('K', 'R') and peptide[0] != 'P'
                    or peptide_flanks[0] == '-'):
                    current_peptide_trypticity += 1
                if (peptide[-1] in ('K', 'R') and peptide_flanks[1] != 'P'
                    or peptide_flanks[1] == '-'):
                    current_peptide_trypticity += 1

            # Choose delta from first M line such that peptide differs from
            # peptide in initial (top) M line.  We consider differently
            # modified versions of the same peptide to be different (at least
            # for now).
            if peptide != current_peptide:
                current_delta = delta
                break

            # accumulate loci (and real/decoy state) for M lines that match
            # the top peptide
            for L_line in L_lines:
                locus = L_line.split('\t')[1]
                current_loci.add(locus)
                if locus.startswith(decoy_prefix):
                    current_state.add('decoy')
                else:
                    current_state.add('real')

        if (current_score != None and current_delta != None
            and current_peptide_trypticity >= minimum_trypticity):

            if len(current_state) == 2:
                current_state = 'both'
            else:
                assert len(current_state) == 1
                current_state = current_state.pop()

            spectrum_info.append((spectrum_no, spectrum_name, current_charge,
                                  current_score, current_delta, current_state,
                                  current_peptide,
                                  strip_mods(current_peptide),
                                  actual_mass, actual_mass-predicted_mass,
                                  frozenset(current_loci)))

    return spectrum_info
290 # General algorithm:
291 # - repeat, adjusting fdr upwards (or downwards?) to meet user goal
292 # repeat until set of valid spectra converges:
293 # - do pr filter of spectra
294 # - if not first time, and no spectra filtered out, break
295 # - separately for each charge, choose valid spectra to maximize real ids subject to FDR
296 # - saturate (locally or completely)
def PPV(reals, decoys):
    """Returns the estimated Positive Predictive Value (== 1 - FDR), given
    counts of reals and decoys."""

    # Every decoy hit is a known false positive, and we estimate that an
    # equal number of the "real" hits are false as well.
    true_positives = reals - decoys
    false_positives = 2 * decoys

    if true_positives <= 0:
        return 0
    return true_positives / float(true_positives + false_positives)
def calculate_inner_threshold(fdr, charge, spinfo):
    """Return (threshold, reals, decoys): the lowest score threshold, and the
    real/decoy counts at or above it, that achieves an estimated PPV of at
    least 1-fdr, or (None, reals, decoys) if that is impossible.  spinfo is a
    list of spectrum tuples with score at index 3 and state at index 5."""
    ppv_goal = 1 - fdr

    by_score = sorted(spinfo, key=lambda x: x[3])
    epsilon = +1e-6

    reals = sum(1 for x in by_score if x[5] == 'real')
    decoys = sum(1 for x in by_score if x[5] == 'decoy')

    threshold = -1e100                  # i.e., allow all spectra
    for sp in by_score:
        if reals == 0:
            return (None, reals, decoys) # give up
        # PPV estimate: decoys are false positives, plus an estimated equal
        # number of false "reals"
        true_pos = reals - decoys
        if true_pos <= 0:
            ppv_est = 0
        else:
            ppv_est = float(true_pos) / (true_pos + 2*decoys)
        if ppv_est >= ppv_goal:
            break
        # discard this (lowest-scoring) spectrum and raise the threshold just
        # enough to exclude it
        if sp[5] == 'real':
            reals -= 1
        elif sp[5] == 'decoy':
            decoys -= 1
        threshold = sp[3] + epsilon
    else:
        threshold = by_score[-1][3] + epsilon # couldn't meet goal

    return (threshold, reals, decoys)
def calculate_combined_thresholds(fdr, options, spectrum_info):
    """Find best score/delta thresholds for each charge.  Returns a dict
    mapping charge -> (score_threshold, delta_threshold, passing_reals,
    passing_decoys), keeping for each charge the sampled delta threshold that
    maximizes the count of passing real ids.  (Sorts spectrum_info in place
    by charge.)"""

    ppv_goal = 1 - fdr                  # (recomputed in the inner search)

    # Rather than search every possible value of delta, we're only going to
    # "sample" at this granularity.  This cuts search time dramatically (and
    # makes it O(n) instead of O(n**2)).  Extra precision wouldn't really be
    # useful in any case.
    SEARCH_GRANULARITY = 0.001

    # charge -> (score, delta, passing_reals, passing_decoys)
    thresholds = {}

    by_charge = lambda x: x[2]
    spectrum_info.sort(key=by_charge)
    for charge, spinfo in itertools.groupby(spectrum_info, key=by_charge):
        # sorted by delta, descending; the candidate delta threshold is the
        # smallest remaining delta, and candidates below it are popped off
        spinfo0 = sorted(spinfo, key=lambda x: x[4], reverse=True)

        last_value = None

        while spinfo0:
            this_value = spinfo0[-1][4] # current delta
            if (last_value == None
                or abs(this_value - last_value) >= SEARCH_GRANULARITY):

                # "inner" is score
                r = calculate_inner_threshold(fdr, charge, spinfo0)
                threshold, real_count, decoy_count = r
                if threshold != None:
                    if options.debug:
                        print '#', charge, threshold, this_value, \
                            real_count, decoy_count
                    # keep this (score, delta) pair if it passes more reals
                    # than the best seen so far for this charge
                    if (charge not in thresholds
                        or real_count > thresholds[charge][2]):
                        thresholds[charge] = (threshold, this_value,
                                              real_count, decoy_count)
                last_value = this_value
            spinfo0.pop()

    return thresholds
def flatten(gen_of_gens):
    """Yield each item of each sub-iterable of gen_of_gens, in order."""
    return itertools.chain.from_iterable(gen_of_gens)
def redundancy_filter(options, spectrum_info):
    """Return (sorted) the spectra belonging to at least one locus having
    options.minimum_spectra_per_locus spectra or more.  With a minimum below
    2, this is just a copy of spectrum_info."""
    min_count = options.minimum_spectra_per_locus
    if min_count < 2:
        return spectrum_info[:]

    # si = (spectrum_no, spectrum_name, charge, score, delta, state,
    #       peptide, stripped_peptide, actual_mass, mass_delta, loci)

    # one (locus, si) pair for each locus of each spectrum, grouped by locus
    pairs = sorted((locus, si)
                   for si in spectrum_info
                   for locus in si[10])

    keep = set()
    for locus, group in itertools.groupby(pairs, key=lambda p: p[0]):
        group_sis = [ si for _, si in group ]
        if len(group_sis) >= min_count:
            keep.update(group_sis)
    return sorted(keep)
def saturate_by_prefix_suffix(spectrum_info, saturation_spectrum_info):
    """Return (sorted) the saturation spectra whose stripped peptide shares a
    16-residue prefix with some passing stripped peptide."""
    PREFIX_LENGTH = 16                  # empirically seems good
    passing_prefixes = set(si[7][:PREFIX_LENGTH] for si in spectrum_info)
    expanded = [ ssi for ssi in saturation_spectrum_info
                 if ssi[7][:PREFIX_LENGTH] in passing_prefixes ]
    expanded.sort()
    print ("saturation by prefix (w/%s): %s -> %s"
           % (len(saturation_spectrum_info), len(spectrum_info),
              len(expanded)))
    return expanded
def saturate(spectrum_info, saturation_spectrum_info):
    """Expand spectrum_info to all saturation spectra whose stripped peptide
    exactly matches a passing one, then saturate further by peptide prefix."""
    passing_peptides = set(si[7] for si in spectrum_info)
    # FIX: does this actually need to be sorted?
    expanded = sorted(ssi for ssi in saturation_spectrum_info
                      if ssi[7] in passing_peptides)
    print ("saturation (w/%s): %s -> %s"
           % (len(saturation_spectrum_info), len(spectrum_info),
              len(expanded)))
    return saturate_by_prefix_suffix(expanded, saturation_spectrum_info)
def debug_fdr(si):
    """Print the total, real, and decoy counts for the spectrum tuples in si."""
    real_total = sum(1 for x in si if x[5] == 'real')
    decoy_total = sum(1 for x in si if x[5] == 'decoy')
    print ("%s %s %s" % (len(si), real_total, decoy_total))
def try_fdr(fdr_guess, options, remaining_spectrum_info, saturation_spectrum_info=None):
    """Filter spectra using fdr_guess as the per-charge threshold FDR,
    alternating threshold filtering and locus-redundancy filtering until the
    passing set converges; then (unless --no-saturation) saturate against
    saturation_spectrum_info.  Returns (fdr_result, thresholds, total_reals,
    total_decoys, passing_spectra), where fdr_result is the overall estimated
    FDR of the final passing set."""
    print "try_fdr:", fdr_guess
    debug_fdr(remaining_spectrum_info)

    remaining_spectrum_info_1 = None
    while True:
        thresholds = calculate_combined_thresholds(fdr_guess, options,
                                                   remaining_spectrum_info)
        # keep only spectra passing their charge's score and delta thresholds
        remaining_spectrum_info = [ si for si in remaining_spectrum_info
                                    if (si[2] in thresholds
                                        and si[3] >= thresholds[si[2]][0]
                                        and si[4] >= thresholds[si[2]][1]) ]
        debug_fdr(remaining_spectrum_info)
        if remaining_spectrum_info == remaining_spectrum_info_1:
            break                       # converged
        remaining_spectrum_info_1 = redundancy_filter(options,
                                                      remaining_spectrum_info)
        debug_fdr(remaining_spectrum_info_1)
        if len(remaining_spectrum_info_1) == len(remaining_spectrum_info):
            break                       # redundancy filter removed nothing
        remaining_spectrum_info = remaining_spectrum_info_1

    if not options.no_saturation and saturation_spectrum_info:
        remaining_spectrum_info_1 = saturate(remaining_spectrum_info_1,
                                             saturation_spectrum_info)
        debug_fdr(remaining_spectrum_info_1)

    total_reals = sum(1 for si in remaining_spectrum_info_1
                      if si[5] == 'real')
    total_decoys = sum(1 for si in remaining_spectrum_info_1
                       if si[5] == 'decoy')
    #total_reals = sum(thresholds[charge][2] for charge in thresholds)
    #total_decoys = sum(thresholds[charge][3] for charge in thresholds)
    fdr_result = 1 - PPV(total_reals, total_decoys)

    return (fdr_result, thresholds, total_reals, total_decoys,
            remaining_spectrum_info_1)
476 # FIX: at various points here we may have a goal of a minimum number of real
477 # ids, based on goal FDR or whatever. We can pass this inward to do
478 # branch-and-bound, which may save time.
480 # FIX: this is now somewhat slower than before--any way to speed it up?
def search_adjusting_fdr(options, spectrum_info_0):
    """Repeatedly calculate thresholds while adjusting FDR, looking for an FDR
    guess that results in an appropriate goal FDR.  This might be needed
    because the thresholds found for per-charge FDRs will be saturated or have
    quantization errors, resulting in overly conservative thresholds.  Returns
    a list of items from spectrum_info that are considered "valid".
    """

    # FIX: Does this generate enough additional real ids to be worth the
    # computational cost?
    #FDR_INFLATION_FACTOR = 1.1
    FDR_INFLATION_FACTOR = 3
    assert FDR_INFLATION_FACTOR > 1

    spectrum_info = redundancy_filter(options, spectrum_info_0)

    low_fdr = options.fdr
    # FIX: spectrum_info_0 has already been filtered for tryptic status
    # Should we use an unfiltered set instead?
    low_results = try_fdr(low_fdr, options, spectrum_info, spectrum_info_0)

    if options.adjust_fdr:
        # Phase 1 (inflation): grow high_fdr geometrically until its
        # resulting FDR meets or exceeds the goal, bracketing the answer.
        high_fdr = min(1.0, options.fdr * FDR_INFLATION_FACTOR)
        high_results = try_fdr(high_fdr, options, spectrum_info,
                               spectrum_info_0)
        initial_total_reals = low_results[2]

        for i in range(32):
            if options.debug:
                print ("adjust infl fdr %s %s -> %s"
                       % (low_fdr, high_fdr, high_results[0]))
            if high_results[0] >= options.fdr:
                break
            low_fdr, low_results = high_fdr, high_results
            high_fdr = min(1.0, high_fdr * FDR_INFLATION_FACTOR)
            if high_fdr == 1.0:
                break
            high_results = try_fdr(high_fdr, options, spectrum_info,
                                   spectrum_info_0)
        else:
            warn("FDR adjustment inflation failed")

        # Phase 2 (bisection): narrow [low_fdr, high_fdr] until it is within
        # a small fraction of the goal FDR; low stays at or below the goal,
        # high above it.
        CONVERGENCE_FACTOR = 0.01
        assert CONVERGENCE_FACTOR > 0
        for i in range(32):
            assert high_fdr >= low_fdr
            if (high_fdr - low_fdr) <= CONVERGENCE_FACTOR*options.fdr:
                break

            # - seems to be slower than the naive method?
            # Bias is empirically determined.  It's supposed to speed
            # convergence, but should not affect the results.
            # FIX: better method possible?
            #GUESS_BIAS = 1.15
            #prop = ((options.fdr - low_results[0])
            #        / (high_results[0] - low_results[0]))
            #prop = min(1, max(0, prop))
            #guess_fdr = low_fdr + prop**GUESS_BIAS * (high_fdr - low_fdr)

            ####

            guess_fdr = (high_fdr + low_fdr) / 2.0
            guess_results = try_fdr(guess_fdr, options, spectrum_info,
                                    spectrum_info_0)
            if options.debug:
                print ("adjust fdr %s %s %s -> %s"
                       % (low_fdr, guess_fdr, high_fdr, guess_results[0]))
            if guess_results[0] > options.fdr:
                high_fdr, high_results = guess_fdr, guess_results
            else:
                low_fdr, low_results = guess_fdr, guess_results
        else:
            warn("FDR adjustment convergence failed")

    (fdr_result, thresholds, total_reals,
     total_decoys, valid_spectrum_info) = low_results

    if options.output_peptides:
        valid_spectrum_info.sort(key=lambda x: x[1])
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            print >> options.output_peptides, \
                ("%+d %s %s %s %.8f %.8f"
                 % (charge, spectrum_name, state, peptide, actual_mass,
                    mass_delta))

    # Reported rates are halved to convert the internal (decoy-inclusive)
    # FDR back to the user-facing rate; see the doubling in main().
    for charge in sorted(thresholds.keys()):
        score_threshold, delta_threshold = thresholds[charge][0:2]
        reals, decoys = thresholds[charge][2:4]
        print ("%+d: score %f, delta %.6f -> %s real ids, %s decoys"
               " (fdr %.4f)"
               % (charge, score_threshold, delta_threshold,
                  reals, decoys, 0.5 * (1 - PPV(reals, decoys))))
    print ("# total: %s real ids, %s decoys (fdr %.4f)"
           % (total_reals, total_decoys, fdr_result / 2.0))
    if options.adjust_fdr:
        print ("# (FDR adjustment found %s more real ids)"
               % (total_reals - initial_total_reals))

    return valid_spectrum_info
590 def main(args=sys.argv[1:]):
591 parser = optparse.OptionParser(usage=
592 "usage: %prog [options] <sqt-file>...",
593 description=__doc__, version=VERSION)
594 pa = parser.add_option
595 DEFAULT_DECOY_PREFIX = "SHUFFLED_"
596 pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX,
597 help='prefix given to locus name of decoy (e.g., shuffled) database'
598 ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX")
599 DEFAULT_FDR = 0.01
600 pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
601 help="false discovery rate [default=%s] (Note that this is the"
602 " final resulting FDR after the decoys have been removed" % DEFAULT_FDR,
603 metavar="PROPORTION")
604 pa("--adjust-fdr", action="store_true", dest="adjust_fdr",
605 help="try to adjust internal FDR to achieve a final FDR closer to"
606 " requested FDR (this takes longer, but may find more read ids in some"
607 " cases)")
608 pa("--no-saturation", action="store_true", dest="no_saturation",
609 help="skip peptide saturation")
610 pa("-v", "--verbose", action="store_true", dest="verbose",
611 help="be verbose")
612 pa("--output-peptides", dest="output_peptides",
613 help="output information about passing peptides to specified file"
614 " ('-' for stdout)", metavar="FILENAME")
615 pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
616 default='0', dest="minimum_trypticity",
617 help="drop peptides with too few tryptic ends"
618 " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
619 DEFAULT_MINIMUM_SPECTRA_PER_LOCUS = 1
620 pa("-p", "--minimum-spectra-per-locus", dest="minimum_spectra_per_locus",
621 type="int", default=DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
622 help="only loci with at least this many spectra are included, and"
623 " spectra not associated with such a locus are excluded"
624 " [default=%s]" % DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
625 metavar="COUNT")
626 #pa("--graph", dest="graph",
627 # help='create distribution graphs, using the specified prefix',
628 # metavar="PATH PREFIX")
629 pa("--debug", action="store_true", dest="debug",
630 help="show debug output")
631 pa("--mark", action="store_true", dest="mark",
632 help="rewrite the input files, changing some validation marks to 'N',"
633 " according to filtering")
634 pa("--reset-marks", action="store_true", dest="reset_marks",
635 help="rewrite the input files, changing all validation marks to 'U'")
636 pa("--kill", action="store_true", dest="kill",
637 help="rewrite the input files, removing spectra that don't pass"
638 " validation, according to filtering")
639 pa("--copyright", action="store_true", dest="copyright",
640 help="print copyright and exit")
641 (options, args) = parser.parse_args(args=args)
643 if options.copyright:
644 print __copyright__
645 sys.exit(0)
647 if len(args) < 1 or options.minimum_spectra_per_locus < 0:
648 parser.print_help()
649 sys.exit(1)
651 if not (0.0 <= options.fdr < 0.5):
652 error("--fdr must be within range [0.0, 0.5)")
653 if (sum(1 for x in (options.mark, options.reset_marks, options.kill) if x)
654 > 1):
655 error("only one of --mark, --reset-marks, --kill may be specified")
657 if options.reset_marks:
658 write_spectra_to_files(reset_marks(generate_spectra_from_files(args)))
659 return
661 if options.output_peptides:
662 if options.output_peptides == '-':
663 options.output_peptides = sys.stdout
664 else:
665 options.output_peptides = open(options.output_peptides, 'w')
667 # This is to translate the "effective" FDR, which is the rate after decoys
668 # have been stripped out of the results (which is what users really care
669 # about), into our internal FDR, which includes decoys.
670 options.fdr *= 2
672 spectrum_info = get_spectrum_info(options.decoy_prefix,
673 int(options.minimum_trypticity),
674 generate_spectra_from_files(args))
676 if options.debug:
677 print ('%s passing spectra, of which %s are not decoys'
678 % (len(spectrum_info),
679 sum(1 for si in spectrum_info if si[5] == 'real')))
681 # valid_spectrum_info is the list of spectrum_info elements that have been
682 # chosen as "valid"
683 valid_spectrum_info = search_adjusting_fdr(options, spectrum_info)
685 if options.debug:
686 valid_spectrum_info.sort()
687 for (spectrum_no, spectrum_name, charge, score, delta, state,
688 peptide, stripped_peptide, actual_mass,
689 mass_delta, loci) in valid_spectrum_info:
690 print '#S', charge, score, delta
692 if options.mark or options.kill:
693 # Note that we are intentionally reading the input files again here,
694 # to avoid having to keep them all in memory.
695 write_spectra_to_files(set_marks(options, valid_spectrum_info,
696 generate_spectra_from_files(args),
697 options.kill))
# script entry point
if __name__ == '__main__':
    main()