#!/usr/bin/env python

"""
Given a set of sqt files, determine a 'valid' set of identifications that
satisfy the specified FDR (false discovery rate) and other specified criteria.
Optionally rewrite the sqt files with invalid spectra marked 'N' or deleted
altogether.

Note that a peptide end with an invalid flanking residue (currently any of
BJOUXZ) is not considered tryptic if it's an N-terminal end and is considered
tryptic if it's a C-terminal end and otherwise qualifies.  (Flanking '-' is
always considered tryptic.)

"""

from __future__ import with_statement

__copyright__ = '''
greylag, a collection of programs for MS/MS protein analysis
Copyright (C) 2006-2008  Stowers Institute for Medical Research

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Contact: Mike Coleman
         Stowers Institute for Medical Research
         1000 East 50th Street
         Kansas City, Missouri  64110
         USA
'''


from collections import defaultdict
import itertools
import logging; from logging import debug, info, warning
import optparse
import os
from pprint import pprint
import re
import string
import sys
import time


from greylag import VERSION, set_logging


def error(s, *args):
    "fatal error"
    # if we're unit testing, just throw an exception
    if __name__ != "__main__":
        raise Exception((s + " (fatal error)") % args)
    logging.error(s, *args)
    sys.exit(1)
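
# For orientation, generate_spectra_from_files (below) walks tab-delimited SQT
# text whose line types look roughly like this (a schematic sketch; only the
# fields this script actually reads are named, the rest are elided):
#
#   H  <header lines, passed through unchanged>
#   S  <low scan> <high scan> <charge> ... ... <experimental mass> ...
#   M  <rank> <Sp rank> <calculated mass> ... <score> ... ... ... <X.PEPTIDE.Y> <mark>
#   L  <locus name> ...
#
# Each S line starts a spectrum, each M line is one candidate match (its last
# field is the validation mark), and each L line names a locus containing the
# matched peptide.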


def generate_spectra_from_files(sqt_filenames):
    """Yield tuples (filename, H+_lines, S_line, [(M_line, mod_lines, L_lines),
    ...]) for the given SQT files.  H+_lines, for example, is a list
    containing H lines and possibly other kinds of lines up to the next S
    line.  (mod_lines are non-L lines following an M-line.)
    """
    for sqt_filename in sqt_filenames:
        with open(sqt_filename) as sqtf:
            H_lines = []
            S_line, ML_lines = None, []
            for line in sqtf:
                if line.startswith('L\t'):
                    ML_lines[-1][2].append(line)
                elif line.startswith('M\t'):
                    ML_lines.append((line, [], []))
                elif line.startswith('S\t'):
                    if S_line:
                        yield (sqt_filename, H_lines, S_line, ML_lines)
                        S_line, ML_lines = None, []
                    S_line = line
                else:
                    if not S_line:
                        # read headers up to first S line
                        H_lines.append(line)
                    else:
                        # other kind of line following M line
                        ML_lines[-1][1].append(line)
            if S_line:
                yield (sqt_filename, H_lines, S_line, ML_lines)


def write_spectra_to_files(spectra, filenames):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', write them to the implied files.  If any of
    filenames already exist, they are renamed with a '.bak' suffix.  Temp
    files are written first, then renamed at the end.  (The filenames
    parameter is necessary because spectra may not contain any entries for a
    given filename, thus necessitating its removal.)
    """

    tmp_suffix = '.tmp.%s' % time.time()

    # filename -> file
    filemap = {}

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        sqtf = filemap.get(sqt_filename)
        if sqtf is None:
            sqtf = filemap[sqt_filename] = open(sqt_filename + tmp_suffix, 'w')
            for h in H_lines:
                sqtf.write(h)
        sqtf.write(S_line)
        for m_line, mod_lines, l_lines in ML_lines:
            sqtf.write(m_line)
            for m in mod_lines:
                sqtf.write(m)
            for l in l_lines:
                sqtf.write(l)

    for fd in filemap.values():
        fd.close()

    assert set(filemap.keys()) <= set(filenames)
    for fn in filenames:
        if os.path.exists(fn):
            os.rename(fn, fn + '.bak')
        if os.path.exists(fn + tmp_suffix):
            os.rename(fn + tmp_suffix, fn)


def reset_marks(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) reset to U.

    >>> list(reset_marks([('fn', [], 'S...', [('M... Y', [], [])])]))
    [('fn', [], 'S...', [('M... U', [], [])])]

    """

    def reset_mark(m_line):
        return re.sub(r'(\s+)\S(\s*)$', r'\1U\2', m_line, 1)

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        ML_lines_1 = [ (reset_mark(M_line), mod_lines, L_lines)
                       for M_line, mod_lines, L_lines in ML_lines ]
        yield (sqt_filename, H_lines, S_line, ML_lines_1)


def set_marks(valid_spectrum_info, spectra, kill=False):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) set to 'N', for spectra not listed in
    valid_spectrum_info.  If 'kill', omit marked spectra altogether.

    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])]))
    [('fn', [], 'S 000...', [('M... U', [], [])]), ('fn', [], 'S 111...', [('M... N', [], [])])]
    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])],
    ...                kill=True))
    [('fn', [], 'S 000...', [('M... U', [], [])])]

    """

    def set_mark(m_line):
        return re.sub(r'\S(\s*)$', r'N\1', m_line, 1)

    passing_spectrum_numbers = frozenset(si[0] for si in valid_spectrum_info)

    for spectrum_no, (sqt_filename, H_lines, S_line, ML_lines) \
            in enumerate(spectra):
        mark_spectrum = spectrum_no not in passing_spectrum_numbers
        if mark_spectrum and not kill:
            ML_lines = [ (set_mark(M_line), mod_lines, L_lines)
                         for M_line, mod_lines, L_lines in ML_lines ]
        if not (kill and mark_spectrum):
            yield (sqt_filename, H_lines, S_line, ML_lines)


nulltrans = string.maketrans('', '')
non_aa_chars = string.digits + string.punctuation

def strip_mods(s):
    """Given a peptide, return just the unmodified peptide seen.

    >>> strip_mods('G*REY@LAG')
    'GREYLAG'
    >>> strip_mods('GREYLAG')
    'GREYLAG'

    """
    # assuming no control chars present
    return s.translate(nulltrans, non_aa_chars)


def get_spectrum_info(options, spectra):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield an info tuple for each spectrum that
    passes the criteria specified in options (e.g., options.minimum_trypticity
    and options.maximum_sp_rank).
    """

    for spectrum_no, (sqt_filename,
                      H_lines, S_line, ML_lines) in enumerate(spectra):
        S_fields = S_line.split('\t')
        current_charge = int(S_fields[3])
        actual_mass = float(S_fields[6])
        spectrum_name = '.'.join([os.path.splitext(sqt_filename)[0]]
                                 + S_fields[1:4])

        current_score = None
        current_delta = None
        current_peptide_trypticity = None
        current_sp_rank = None
        current_peptide_mass = None
        current_state = set()
        current_loci = set()
        highest_rank_seen = 0
        best_peptide_seen = None
        current_peptide = None

        for M_line, mod_lines, L_lines in ML_lines:
            M_fields = M_line.split('\t')

            rank = int(M_fields[1])
            sp_rank = int(M_fields[2])
            peptide_mass = float(M_fields[3])
            score = float(M_fields[5])
            peptide = M_fields[9].strip().upper() # e.g., A.B@CD*.-
            assert score >= 0 and rank >= 1 and sp_rank >= 1

            if rank > highest_rank_seen + 1:
                # ignore aux top SpRank hits, because delta confounds
                break
            highest_rank_seen = rank

            if current_score is None:
                if score <= 0:
                    break           # spectrum is total crap
                current_score = score

            # Don't trust delta from input file.
            delta = (current_score - score) / current_score

            assert peptide[1] == '.' and peptide[-2] == '.'
            peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
            peptide = peptide[2:-2]                    # 'B@CD*'

            if current_peptide is None:
                current_peptide = peptide
                current_peptide_mass = peptide_mass
                current_peptide_trypticity = 0
                # NB: K*, R* not tryptic!
                if (peptide_flanks[0] in ('K', 'R') and peptide[0] != 'P'
                    or peptide_flanks[0] == '-'):
                    current_peptide_trypticity += 1
                if (peptide[-1] in ('K', 'R') and peptide_flanks[1] != 'P'
                    or peptide_flanks[1] == '-'):
                    current_peptide_trypticity += 1
                current_sp_rank = sp_rank

            # Choose delta from the first M line whose peptide differs from
            # the peptide in the initial (top) M line.  We consider differently
            # modified versions of the same peptide to be different (at least
            # for now).  But, if two peptides differ only in their choice of
            # I (isoleucine) versus L (leucine), consider them identical and
            # keep looking for a delta.
            if peptide.replace('I', 'L') != current_peptide.replace('I', 'L'):
                current_delta = delta
                break

            for L_line in L_lines:
                locus = L_line.split('\t')[1]
                current_loci.add(locus)
                if locus.startswith(options.decoy_prefix):
                    current_state.add('decoy')
                else:
                    current_state.add('real')

        if (current_score is not None and current_delta is not None
            and current_peptide_trypticity >= options.minimum_trypticity
            and current_sp_rank <= options.maximum_sp_rank):

            if len(current_state) == 2:
                current_state = 'both'
            else:
                assert len(current_state) == 1
                current_state = current_state.pop()

            yield (spectrum_no, spectrum_name, current_charge, current_score,
                   current_delta, current_state, current_peptide,
                   strip_mods(current_peptide), actual_mass,
                   actual_mass - current_peptide_mass, frozenset(current_loci))
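
# The info tuples yielded above (and consumed by everything below) have this
# layout, derived from the yield statement in get_spectrum_info:
#
#   si[0]  spectrum_no        si[6]  peptide (with mods)
#   si[1]  spectrum_name      si[7]  stripped peptide
#   si[2]  charge             si[8]  actual (experimental) mass
#   si[3]  score              si[9]  actual mass - peptide mass
#   si[4]  delta              si[10] frozenset of loci
#   si[5]  state ('real', 'decoy', or 'both')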


# FIX: can this be more succinct?
def remove_charge_aliases(spectrum_generator):
    """Given a generator of spectra like get_spectrum_info, for spectra that
    received multiple id's at different charges, yield only the best id.
    (Probably unnecessary unless the precursor tolerance is wide.)
    """

    sp_info_0 = None
    for sp_info_1 in spectrum_generator:
        assert sp_info_1[2] < 10, "single-digit charge"
        if not sp_info_0:
            sp_info_0 = sp_info_1
            continue
        if sp_info_0[1][:-1] != sp_info_1[1][:-1]:
            yield sp_info_0
            sp_info_0 = sp_info_1
            continue
        # if at least one has a sufficiently small mass delta, choose the
        # one with the smaller delta
        if abs(sp_info_0[9]) < abs(sp_info_1[9]) < 10:
            continue
        if abs(sp_info_1[9]) < abs(sp_info_0[9]) < 10:
            sp_info_0 = sp_info_1
            continue
        # otherwise choose the better-scoring one
        if sp_info_0[3] < sp_info_1[3]:
            sp_info_0 = sp_info_1

    if sp_info_0:
        yield sp_info_0
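
# A hypothetical illustration of the alias rule above: if the same spectrum is
# identified at charge +2 with |mass delta| 0.3 and at charge +3 with
# |mass delta| 2.8, both deltas are under 10, so the +2 id is kept for its
# smaller delta.  If either |mass delta| were 10 or more, the comparison would
# fall through to the score test instead.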


# General algorithm:
# - repeat, adjusting fdr upwards (or downwards?) to meet user goal
#   repeat until set of valid spectra converges:
#   - do pr filter of spectra
#   - if not first time, and no spectra filtered out, break
#   - separately for each charge, choose valid spectra to maximize real ids
#     subject to FDR
#   - saturate (locally or completely)


def PPV(reals, decoys):
    """Returns the estimated Positive Predictive Value (== 1 - FDR), given
    counts of reals and decoys.

    >>> PPV(100, 100)
    0.0
    >>> PPV(99, 100)
    0.0
    >>> PPV(100, 0)
    1.0
    >>> str(PPV(95, 5))
    '0.9'

    """

    # We know that the decoys are false positives, and we estimate that an
    # equal number of the "reals" are actually false, too.
    false_positives = 2*decoys
    true_positives = reals - decoys

    if true_positives <= 0:
        return 0.0

    return float(true_positives) / (true_positives + false_positives)
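
# Spelled out, the estimate above reduces to
#
#   PPV = (reals - decoys) / (reals + decoys)
#   FDR = 1 - PPV = 2*decoys / (reals + decoys)
#
# so, for example, 95 reals and 5 decoys give PPV 0.9 and an estimated FDR of
# 0.1.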


def calculate_inner_threshold(fdr, spinfo):
    ppv_goal = 1 - fdr

    spinfo = sorted(spinfo, key=lambda x: x[3])

    epsilon = +1e-6

    real_count = sum(1 for x in spinfo if x[5] == 'real')
    decoy_count = sum(1 for x in spinfo if x[5] == 'decoy')

    current_threshold = -1e100      # allow all spectra
    for n, sp in enumerate(spinfo):
        if real_count == 0:
            return (None, real_count, decoy_count) # give up

        ppv_est = PPV(real_count, decoy_count)
        if ppv_est >= ppv_goal:
            break
        if sp[5] == 'real':
            real_count -= 1
        elif sp[5] == 'decoy':
            decoy_count -= 1
        # set threshold just high enough to exclude this spectrum
        current_threshold = sp[3] + epsilon
    else:
        current_threshold = spinfo[-1][3] + epsilon # couldn't meet goal

    return (current_threshold, real_count, decoy_count)
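
# A small worked example of the walk above (counts hypothetical): with
# fdr=0.1 (ppv_goal 0.9) and, say, 95 reals and 10 decoys, the initial
# estimate PPV(95, 10) = 85/105 is below goal, so the lowest-scoring spectra
# are discarded one at a time (decrementing whichever count applies) until
# PPV(real_count, decoy_count) reaches 0.9; the returned score threshold sits
# just (epsilon) above the score of the last spectrum discarded.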


def calculate_combined_thresholds(fdr, spectrum_info):
    """Find best score/delta thresholds for each charge."""

    ppv_goal = 1 - fdr

    # Rather than search every possible value of delta, we're only going to
    # "sample" at this granularity.  This cuts search time dramatically (and
    # makes it O(n) instead of O(n**2)).  Extra precision wouldn't really be
    # useful in any case.
    SEARCH_GRANULARITY = 0.001

    # charge -> (score, delta, passing_reals, passing_decoys)
    thresholds = {}

    by_charge = lambda x: x[2]
    spectrum_info.sort(key=by_charge)
    for charge, spinfo in itertools.groupby(spectrum_info, key=by_charge):
        spinfo0 = sorted(spinfo, key=lambda x: x[4], reverse=True)

        last_value = None

        while spinfo0:
            this_value = spinfo0[-1][4] # current delta
            if (last_value is None
                or abs(this_value - last_value) >= SEARCH_GRANULARITY):

                # "inner" is score
                r = calculate_inner_threshold(fdr, spinfo0)
                threshold, real_count, decoy_count = r
                if threshold is not None:
                    debug('# %s %s %s %s %s', charge, threshold, this_value,
                          real_count, decoy_count)
                    if (charge not in thresholds
                        or real_count > thresholds[charge][2]):
                        thresholds[charge] = (threshold, this_value,
                                              real_count, decoy_count)
                last_value = this_value
            spinfo0.pop()

    return thresholds
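
# The mapping returned above has one entry per precursor charge, e.g.
# (hypothetical values):
#
#   thresholds = {2: (1.9172, 0.081, 3046, 21),
#                 3: (2.5013, 0.094, 1852, 14)}
#
# try_fdr (below) then keeps a spectrum only if its charge has an entry and
# its score and delta are both at least that entry's first two values.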


def flatten(gen_of_gens):
    """Yield items from a generator of generators.

    >>> list(flatten([(1,2,3), (4, (5, 'b'), 6)]))
    [1, 2, 3, 4, (5, 'b'), 6]

    """

    for gen in gen_of_gens:
        for item in gen:
            yield item


def redundancy_filter(options, spectrum_info):
    if options.minimum_spectra_per_locus < 2:
        return [ si for si in spectrum_info ]

    # si = (spectrum_no, spectrum_name, charge, score, delta, state,
    #       peptide, stripped_peptide, actual_mass, mass_delta, loci)

    Rs = sorted(flatten(((locus, si) for locus in si[10])
                        for si in spectrum_info))

    result = set()
    for locus, locus_Rs in itertools.groupby(Rs, key=lambda x: x[0]):
        list_locus_si = [ si for locus, si in locus_Rs ]
        if len(list_locus_si) < options.minimum_spectra_per_locus:
            continue
        result |= set(list_locus_si)
    return sorted(result)
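
# In other words (hypothetical example): with --minimum-spectra-per-locus=2,
# a spectrum whose only locus is matched by just that one spectrum is dropped,
# while spectra sharing a locus with at least one other spectrum survive.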


def saturate_by_prefix_suffix(spectrum_info, saturation_spectrum_info):
    PREFIX_LENGTH = 16              # empirically seems good
    peptide_prefixes = set(si[7][:PREFIX_LENGTH] for si in spectrum_info)
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7][:PREFIX_LENGTH] in peptide_prefixes)

    info("saturation by prefix (w/%s): %s -> %s",
         len(saturation_spectrum_info), len(spectrum_info), len(sat_si))
    return sat_si


def saturate(spectrum_info, saturation_spectrum_info):
    peptides = set(si[7] for si in spectrum_info)
    # FIX: does this actually need to be sorted?
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7] in peptides)
    info("saturation (w/%s): %s -> %s", len(saturation_spectrum_info),
         len(spectrum_info), len(sat_si))
    return saturate_by_prefix_suffix(sat_si, saturation_spectrum_info)
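
# Saturation, as implemented above, widens the accepted set: any spectrum from
# the larger saturation pool whose stripped peptide (or, in the second pass,
# its 16-residue prefix) already appears among the accepted spectra is pulled
# back in, even if it did not itself pass the thresholds.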


def fdr_stats(spectrum_info):
    reals = sum(1 for si in spectrum_info if si[5] == 'real')
    decoys = sum(1 for si in spectrum_info if si[5] == 'decoy')
    fdr = 1 - PPV(reals, decoys)
    return fdr, reals, decoys


def debug_fdr(options, msg, si):
    if options.verbose:
        fdr, reals, decoys = fdr_stats(si)
        info('%s %s %s %s', msg, fdr, reals, decoys)


def try_fdr(fdr_guess, options, remaining_spectrum_info,
            saturation_spectrum_info=None):
    info("trying FDR = %s", fdr_guess)
    debug_fdr(options, "initial", remaining_spectrum_info)

    remaining_spectrum_info_1 = None
    while True:
        thresholds = calculate_combined_thresholds(fdr_guess,
                                                   remaining_spectrum_info)
        remaining_spectrum_info = [ si for si in remaining_spectrum_info
                                    if (si[2] in thresholds
                                        and si[3] >= thresholds[si[2]][0]
                                        and si[4] >= thresholds[si[2]][1]) ]
        debug_fdr(options, "after thresholding", remaining_spectrum_info)
        if remaining_spectrum_info == remaining_spectrum_info_1:
            break
        remaining_spectrum_info_1 = redundancy_filter(options,
                                                      remaining_spectrum_info)
        debug_fdr(options, "after redundancy", remaining_spectrum_info_1)
        if len(remaining_spectrum_info_1) == len(remaining_spectrum_info):
            break
        remaining_spectrum_info = remaining_spectrum_info_1

    fdr_result, total_reals, total_decoys = fdr_stats(remaining_spectrum_info_1)
    if not options.no_saturation and saturation_spectrum_info:
        remaining_spectrum_info_1 = saturate(remaining_spectrum_info_1,
                                             saturation_spectrum_info)
        debug_fdr(options, "after saturation", remaining_spectrum_info_1)
        (fdr_result,
         total_reals, total_decoys) = fdr_stats(remaining_spectrum_info_1)

    return (fdr_result, thresholds, total_reals, total_decoys,
            remaining_spectrum_info_1)


# FIX: at various points here we may have a goal of a minimum number of real
# ids, based on goal FDR or whatever.  We can pass this inward to do
# branch-and-bound, which may save time.

# FIX: this is now somewhat slower than before--any way to speed it up?

def search_adjusting_fdr(options, spectrum_info_0):
    """Repeatedly calculate thresholds while adjusting FDR, looking for an FDR
    guess that results in an appropriate goal FDR.  This might be needed
    because the thresholds found for per-charge FDRs will be saturated or have
    quantization errors, resulting in overly conservative thresholds.  Returns
    a list of items from spectrum_info that are considered valid.
    """

    FDR_INFLATION_FACTOR = 3
    assert FDR_INFLATION_FACTOR > 1

    spectrum_info = redundancy_filter(options, spectrum_info_0)

    low_fdr = options.fdr
    # FIX: spectrum_info_0 has already been filtered for tryptic status.
    # Should we use an unfiltered set instead?
    low_results = try_fdr(low_fdr, options, spectrum_info, spectrum_info_0)

    if not options.no_adjust_fdr:
        high_fdr = min(1.0, options.fdr * FDR_INFLATION_FACTOR)
        high_results = try_fdr(high_fdr, options, spectrum_info,
                               spectrum_info_0)
        initial_total_reals = low_results[2]

        for i in range(32):
            debug("adjust infl fdr %s %s -> %s", low_fdr, high_fdr,
                  high_results[0])
            if high_results[0] >= options.fdr:
                break
            low_fdr, low_results = high_fdr, high_results
            high_fdr = min(1.0, high_fdr * FDR_INFLATION_FACTOR)
            if high_fdr == 1.0:
                break
            high_results = try_fdr(high_fdr, options, spectrum_info,
                                   spectrum_info_0)
        else:
            warning("FDR adjustment inflation failed")

        # bisection: l(ow) and h(igh) bracket the goal FDR; g(uess) is the
        # midpoint tried next

        CONVERGENCE_FACTOR = 0.01
        assert CONVERGENCE_FACTOR > 0
        for i in range(32):
            assert high_fdr >= low_fdr
            if (high_fdr - low_fdr) <= CONVERGENCE_FACTOR*options.fdr:
                break

            # - seems to be slower than the naive method?
            # Bias is empirically determined.  It's supposed to speed
            # convergence, but should not affect the results.
            # FIX: better method possible?
            #GUESS_BIAS = 1.15
            #prop = ((options.fdr - low_results[0])
            #        / (high_results[0] - low_results[0]))
            #prop = min(1, max(0, prop))
            #guess_fdr = low_fdr + prop**GUESS_BIAS * (high_fdr - low_fdr)

            guess_fdr = (high_fdr + low_fdr) / 2.0

            guess_results = try_fdr(guess_fdr, options, spectrum_info,
                                    spectrum_info_0)
            debug("adjust fdr %s %s %s -> %s", low_fdr, guess_fdr, high_fdr,
                  guess_results[0])
            if guess_results[0] > options.fdr:
                high_fdr, high_results = guess_fdr, guess_results
            else:
                low_fdr, low_results = guess_fdr, guess_results
        else:
            warning("FDR adjustment convergence failed")

    if not options.no_adjust_fdr:
        info("FDR adjustment found %s extra real ids",
             low_results[2] - initial_total_reals)

    return low_results
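
# The adjustment above works in two phases: the internal FDR guess is first
# inflated by FDR_INFLATION_FACTOR until the resulting measured FDR overshoots
# the requested goal (or the guess reaches 1.0), and the bracket between the
# last under- and over-shooting guesses is then bisected until its width is at
# most CONVERGENCE_FACTOR times the goal FDR.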


def search_with_possible_saturation(options, spectrum_info):

    for saturate in (True, False):
        if saturate and options.no_saturation:
            continue
        results = search_adjusting_fdr(options, spectrum_info)
        (fdr_result, thresholds,
         total_reals, total_decoys, valid_spectrum_info) = results
        if fdr_result <= options.fdr:
            break
        if saturate:
            info('could not achieve FDR with saturation, retrying without')
            options.no_saturation = True

    if options.output_peptides:
        valid_spectrum_info.sort(key=lambda x: x[1])
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            print >> options.output_peptides, \
                ("%+d %s %s %s %.8f %.8f"
                 % (charge, spectrum_name, state, peptide, actual_mass,
                    mass_delta))

    for charge in sorted(thresholds.keys()):
        score_threshold, delta_threshold = thresholds[charge][0:2]
        reals, decoys = thresholds[charge][2:4]
        info("%+d: score %f, delta %.6f -> %s real ids, %s decoys (fdr %.4f)",
             charge, score_threshold, delta_threshold, reals, decoys,
             0.5 * (1 - PPV(reals, decoys)))

    if not options.quiet:
        print ("%s real ids, %s decoys (FDR = %.4f) [p%s]"
               % (total_reals, total_decoys, fdr_result / 2.0,
                  options.minimum_spectra_per_locus))

    return valid_spectrum_info


def print_mass_error_histogram(valid_spectrum_info):
    hist = defaultdict(int)

    for (spectrum_no, spectrum_name, charge, score, delta, state, peptide,
         stripped_peptide, actual_mass, mass_delta,
         loci) in valid_spectrum_info:
        hist[round(mass_delta)] += 1

    pairs_most_first = sorted(((bin, hist[bin]) for bin in hist),
                              key=lambda x: x[1], reverse=True)

    total = sum(count for bin, count in pairs_most_first)

    cumulative_total = 0
    for bin, count in pairs_most_first:
        cumulative_total += count
        print '# %+d\t%s\t%.3f' % (bin, count,
                                   (float(cumulative_total) / total))


def main(args=sys.argv[1:]):
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <sqt-file>...",
                                   description=__doc__, version=VERSION)
    pa = parser.add_option
    DEFAULT_DECOY_PREFIX = "SHUFFLED_"
    pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX,
       help='prefix given to locus name of decoy (e.g., shuffled) database'
       ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX")
    DEFAULT_FDR = 0.01
    pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
       help="false discovery rate [default=%s] (Note that this is the"
       " final resulting FDR after the decoys have been removed.)" % DEFAULT_FDR,
       metavar="PROPORTION")
    pa("--no-adjust-fdr", action="store_true", dest="no_adjust_fdr",
       help="skip adjustment of internal FDR to achieve a final FDR closer to"
       " requested FDR (which takes longer, but may find more real ids in"
       " some cases)")
    pa("--no-saturation", action="store_true", dest="no_saturation",
       help="skip peptide saturation")
    pa("-v", "--verbose", action="store_true", dest="verbose",
       help="be verbose")
    pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
    pa("-l", "--logfile", dest="logfile",
       help="log to FILE instead of stderr", metavar="FILE")
    pa("--output-peptides", dest="output_peptides",
       help="output information about passing peptides to specified file"
       " ('-' for stdout)", metavar="FILENAME")
    pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
       default='0', dest="minimum_trypticity",
       help="drop peptides with too few tryptic ends"
       " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
    DEFAULT_MINIMUM_SPECTRA_PER_LOCUS = 1
    pa("-p", "--minimum-spectra-per-locus", dest="minimum_spectra_per_locus",
       type="int", default=DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
       help="only loci with at least this many spectra are included, and"
       " spectra not associated with such a locus are excluded"
       " [default=%s]" % DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
       metavar="COUNT")
    DEFAULT_MAXIMUM_SP_RANK = 1000000
    pa("--maximum-sp-rank", dest="maximum_sp_rank", type="int",
       default=DEFAULT_MAXIMUM_SP_RANK,
       help="drop peptides with greater Sp rank (the secondary rank in sqt"
       " files, for compatibility with other search programs)"
       " [default=%s]" % DEFAULT_MAXIMUM_SP_RANK, metavar="RANK")
    #pa("--graph", dest="graph",
    #   help='create distribution graphs, using the specified prefix',
    #   metavar="PATH PREFIX")
    pa("-M", "--mass-error-histogram", action="store_true",
       dest="mass_error_histogram",
       help="print a histogram of mass errors (1-Da-wide bins)")
    pa("--debug", action="store_true", dest="debug",
       help="show debug output")
    pa("--mark", action="store_true", dest="mark",
       help="rewrite the input files, changing some validation marks to 'N',"
       " according to filtering")
    pa("--reset-marks", action="store_true", dest="reset_marks",
       help="rewrite the input files, changing all validation marks to 'U'")
    pa("--kill", action="store_true", dest="kill",
       help="rewrite the input files, removing spectra that don't pass"
       " validation, according to filtering")
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    (options, args) = parser.parse_args(args=args)

    if options.copyright:
        print __copyright__
        sys.exit(0)

    if (len(args) < 1 or options.minimum_spectra_per_locus < 0
        or options.maximum_sp_rank < 1):
        parser.print_help()
        sys.exit(1)

    if not (0.0 <= options.fdr < 0.5):
        error("--fdr must be within range [0.0, 0.5)")
    if (sum(1 for x in (options.mark, options.reset_marks, options.kill) if x)
        > 1):
        error("only one of --mark, --reset-marks, --kill may be specified")

    options.minimum_trypticity = int(options.minimum_trypticity)

    set_logging(options)

    if options.reset_marks:
        write_spectra_to_files(reset_marks(generate_spectra_from_files(args)),
                               args)
        return

    if options.output_peptides:
        if options.output_peptides == '-':
            options.output_peptides = sys.stdout
        else:
            options.output_peptides = open(options.output_peptides, 'w')

    # This is to translate the "effective" FDR, which is the rate after decoys
    # have been stripped out of the results (which is what users really care
    # about), into our internal FDR, which includes decoys.
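    # For example, the default --fdr of 0.01 becomes an internal working FDR
    # of 0.02.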
    options.fdr *= 2

    info("reading spectra")
    spectrum_info = list(remove_charge_aliases(get_spectrum_info(options,
                                    generate_spectra_from_files(args))))

    # valid_spectrum_info is the list of spectrum_info elements that have been
    # chosen as "valid"
    valid_spectrum_info = search_with_possible_saturation(options, spectrum_info)

    if options.mass_error_histogram:
        print_mass_error_histogram(valid_spectrum_info)

    if options.debug:
        valid_spectrum_info.sort()
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            debug('#S %s %s %s', charge, score, delta)

    if options.mark or options.kill:
        # Note that we are intentionally reading the input files again here,
        # to avoid having to keep them all in memory.
        info("rewriting spectrum files")
        write_spectra_to_files(set_marks(valid_spectrum_info,
                                         generate_spectra_from_files(args),
                                         options.kill),
                               args)
        info("finished rewriting")


if __name__ == '__main__':
    main()