#!/usr/bin/env python

"""
Given a set of sqt files, determine a 'valid' set of identifications that
satisfy the specified FDR (false discovery rate) and other specified criteria.
Optionally rewrite the sqt files with invalid spectra marked 'N' or deleted
altogether.

Note that a peptide end with an invalid flanking residue (currently any of
BJOUXZ) is not considered tryptic if it's an N-terminal end, but is considered
tryptic if it's a C-terminal end and otherwise qualifies.  (Flanking '-' is
always considered tryptic.)

"""

from __future__ import with_statement

__copyright__ = '''
greylag, a collection of programs for MS/MS protein analysis
Copyright (C) 2006-2008  Stowers Institute for Medical Research

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Contact: Mike Coleman
         Stowers Institute for Medical Research
         1000 East 50th Street
         Kansas City, Missouri  64110
         USA
'''


from collections import defaultdict
import itertools
import logging; from logging import debug, info, warning
import optparse
import os
from pprint import pprint
import re
import string
import sys
import time

import greylag


def error(s, *args):
    "fatal error"
    # if we're unit testing, just throw an exception
    if __name__ != "__main__":
        raise Exception((s + " (fatal error)") % args)
    logging.error(s, *args)
    sys.exit(1)

# errors are fatal
greylag.chase_error = error


def generate_spectra_from_files(sqt_filenames):
    """Yield tuples (filename, H+_lines, S_line, [(M_line, mod_lines, L_lines),
    ...]) for the given SQT files.  H+_lines, for example, is a list
    containing H lines and possibly other kinds of lines up to the next S
    line.  (mod_lines are non-L lines following an M-line.)
    """
    for sqt_filename in sqt_filenames:
        with open(sqt_filename) as sqtf:
            H_lines = []
            S_line, ML_lines = None, []
            for line in sqtf:
                if line.startswith('L\t'):
                    ML_lines[-1][2].append(line)
                elif line.startswith('M\t'):
                    ML_lines.append((line, [], []))
                elif line.startswith('S\t'):
                    if S_line:
                        yield (sqt_filename, H_lines, S_line, ML_lines)
                        S_line, ML_lines = None, []
                    S_line = line
                else:
                    if not S_line:
                        # read headers up to first S line
                        H_lines.append(line)
                    else:
                        # other kind of line following M line
                        ML_lines[-1][1].append(line)
            if S_line:
                yield (sqt_filename, H_lines, S_line, ML_lines)


def write_spectra_to_files(spectra, filenames):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', write them to the implied files.  If any of
    filenames already exist, they are renamed with a '.bak' suffix.  Temp
    files are written first, then renamed at the end.  (The filenames
    parameter is necessary because spectra may not contain any entries for a
    given filename, thus necessitating its removal.)
    """

    tmp_suffix = '.tmp.%s' % time.time()

    # filename -> file
    filemap = {}

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        sqtf = filemap.get(sqt_filename)
        if sqtf == None:
            sqtf = filemap[sqt_filename] = open(sqt_filename + tmp_suffix, 'w')
            for h in H_lines:
                sqtf.write(h)
        sqtf.write(S_line)
        for m_line, mod_lines, l_lines in ML_lines:
            sqtf.write(m_line)
            for m in mod_lines:
                sqtf.write(m)
            for l in l_lines:
                sqtf.write(l)

    for fd in filemap.values():
        fd.close()

    assert set(filemap.keys()) <= set(filenames)
    for fn in filenames:
        if os.path.exists(fn):
            os.rename(fn, fn + '.bak')
        if os.path.exists(fn + tmp_suffix):
            os.rename(fn + tmp_suffix, fn)
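
# Illustrative sketch (an assumption, mirroring the way main() composes these
# generators): copying a set of SQT files through unchanged could be written
# as
#
#     filenames = ['a.sqt', 'b.sqt']          # hypothetical inputs
#     write_spectra_to_files(generate_spectra_from_files(filenames),
#                            filenames)
#
# after which the originals remain available as 'a.sqt.bak' and 'b.sqt.bak'.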


def reset_marks(spectra):
    """Given a generator of tuples of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) reset to U.

    >>> list(reset_marks([('fn', [], 'S...', [('M... Y', [], [])])]))
    [('fn', [], 'S...', [('M... U', [], [])])]

    """

    def reset_mark(m_line):
        return re.sub(r'(\s+)\S(\s*)$', r'\1U\2', m_line, 1)

    for sqt_filename, H_lines, S_line, ML_lines in spectra:
        ML_lines_1 = [ (reset_mark(M_line), mod_lines, L_lines)
                       for M_line, mod_lines, L_lines in ML_lines ]
        yield (sqt_filename, H_lines, S_line, ML_lines_1)


def set_marks(valid_spectrum_info, spectra, kill=False):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield them with the M line marks
    (destructively) set to 'N', for spectra not listed in
    valid_spectrum_info.  If 'kill', omit marked spectra altogether.

    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])]))
    [('fn', [], 'S 000...', [('M... U', [], [])]), ('fn', [], 'S 111...', [('M... N', [], [])])]
    >>> list(set_marks([(0,)], [('fn', [], 'S 000...', [('M... U', [], [])]),
    ...                         ('fn', [], 'S 111...', [('M... U', [], [])])],
    ...                kill=True))
    [('fn', [], 'S 000...', [('M... U', [], [])])]

    """

    def set_mark(m_line):
        return re.sub(r'\S(\s*)$', r'N\1', m_line, 1)

    passing_spectrum_numbers = frozenset(si[0] for si in valid_spectrum_info)

    for spectrum_no, (sqt_filename, H_lines, S_line, ML_lines) \
            in enumerate(spectra):
        mark_spectrum = spectrum_no not in passing_spectrum_numbers
        if mark_spectrum and not kill:
            ML_lines = [ (set_mark(M_line), mod_lines, L_lines)
                         for M_line, mod_lines, L_lines in ML_lines ]
        if not (kill and mark_spectrum):
            yield (sqt_filename, H_lines, S_line, ML_lines)


nulltrans = string.maketrans('','')
non_aa_chars = string.digits + string.punctuation

def strip_mods(s):
    """Given a peptide, return just the unmodified peptide seen.

    >>> strip_mods('G*REY@LAG')
    'GREYLAG'
    >>> strip_mods('GREYLAG')
    'GREYLAG'

    """
    # assuming no control chars present
    return s.translate(nulltrans, non_aa_chars)


def get_spectrum_info(options, spectra):
    """Given a generator of tuples 'spectra' of the form generated by
    'generate_spectra_from_files', yield an info tuple for each spectrum that
    passes the criteria specified in options (e.g., options.minimum_trypticity
    and options.maximum_sp_rank).
    """

    for spectrum_no, (sqt_filename,
                      H_lines, S_line, ML_lines) in enumerate(spectra):
        S_fields = S_line.split('\t')
        current_charge = int(S_fields[3])
        actual_mass = float(S_fields[6])
        spectrum_name = '.'.join([os.path.splitext(sqt_filename)[0]]
                                 + S_fields[1:4])

        current_score = None
        current_delta = None
        current_peptide_trypticity = None
        current_sp_rank = None
        current_peptide_mass = None
        current_state = set()
        current_loci = set()
        highest_rank_seen = 0
        best_peptide_seen = None
        current_peptide = None

        for M_line, mod_lines, L_lines in ML_lines:
            M_fields = M_line.split('\t')

            rank = int(M_fields[1])
            sp_rank = int(M_fields[2])
            peptide_mass = float(M_fields[3])
            score = float(M_fields[5])
            peptide = M_fields[9].strip().upper() # e.g., A.B@CD*.-
            assert score >= 0 and rank >= 1 and sp_rank >= 1

            if rank > highest_rank_seen + 1:
                # ignore aux top SpRank hits, because delta confounds
                break
            highest_rank_seen = rank

            if current_score == None:
                if score <= 0:
                    break       # spectrum is total crap
                current_score = score

            # Don't trust delta from input file.
            delta = (current_score - score) / current_score

            assert peptide[1] == '.' and peptide[-2] == '.'
            peptide_flanks = (peptide[0], peptide[-1]) # ('A', '-')
            peptide = peptide[2:-2]                    # 'B@CD*'

            if current_peptide == None:
                current_peptide = peptide
                current_peptide_mass = peptide_mass
                current_peptide_trypticity = 0
                # NB: K*, R* not tryptic!
                if (peptide_flanks[0] in ('K', 'R') and peptide[0] != 'P'
                    or peptide_flanks[0] == '-'):
                    current_peptide_trypticity += 1
                if (peptide[-1] in ('K', 'R') and peptide_flanks[1] != 'P'
                    or peptide_flanks[1] == '-'):
                    current_peptide_trypticity += 1
                current_sp_rank = sp_rank

            # Choose delta from first M line such that peptide differs from
            # peptide in initial (top) M line.  We consider differently
            # modified versions of the same peptide to be different (at least
            # for now).  But, if there are two peptides that differ only in
            # their choice of I (isoleucine) or L (leucine), consider them
            # identical and keep looking for a delta.
            if peptide.replace('I', 'L') != current_peptide.replace('I', 'L'):
                current_delta = delta
                break

            for L_line in L_lines:
                locus = L_line.split('\t')[1]
                current_loci.add(locus)
                if locus.startswith(options.decoy_prefix):
                    current_state.add('decoy')
                else:
                    current_state.add('real')

        if (current_score != None and current_delta != None
            and current_peptide_trypticity >= options.minimum_trypticity
            and current_sp_rank <= options.maximum_sp_rank):

            if len(current_state) == 2:
                current_state = 'both'
            else:
                assert len(current_state) == 1
                current_state = current_state.pop()

            yield (spectrum_no, spectrum_name, current_charge, current_score,
                   current_delta, current_state, current_peptide,
                   strip_mods(current_peptide), actual_mass,
                   actual_mass-current_peptide_mass, frozenset(current_loci))
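
# For reference (a restatement of the yield above, not original commentary),
# the fields of each spectrum info tuple are indexed by the code below as
#
#     si[0] spectrum_no    si[4] delta     si[8]  actual_mass
#     si[1] spectrum_name  si[5] state     si[9]  mass_delta
#     si[2] charge         si[6] peptide   si[10] loci
#     si[3] score          si[7] stripped_peptide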


# FIX: can this be more succinct?
def remove_charge_aliases(spectrum_generator):
    """Given a generator of spectra like get_spectrum_info, for spectra that
    received multiple id's at different charges, yield only the best id.
    (Probably unnecessary unless the precursor tolerance is wide.)
    """

    sp_info_0 = None
    for sp_info_1 in spectrum_generator:
        assert sp_info_1[2] < 10, "single-digit charge"
        if not sp_info_0:
            sp_info_0 = sp_info_1
            continue
        if sp_info_0[1][:-1] != sp_info_1[1][:-1]:
            yield sp_info_0
            sp_info_0 = sp_info_1
            continue
        # if at least one has a sufficiently small mass delta, choose the one
        # with the smaller delta
        if abs(sp_info_0[9]) < abs(sp_info_1[9]) < 10:
            continue
        if abs(sp_info_1[9]) < abs(sp_info_0[9]) < 10:
            sp_info_0 = sp_info_1
            continue
        # otherwise choose the better-scoring one
        if sp_info_0[3] < sp_info_1[3]:
            sp_info_0 = sp_info_1

    if sp_info_0:
        yield sp_info_0


# General algorithm:
# - repeat, adjusting fdr upwards (or downwards?) to meet user goal
#   repeat until set of valid spectra converges:
#   - do pr filter of spectra
#     - if not first time, and no spectra filtered out, break
#   - separately for each charge, choose valid spectra to maximize real ids subject to FDR
#   - saturate (locally or completely)


def PPV(reals, decoys):
    """Returns the estimated Positive Predictive Value (== 1 - FDR), given
    counts of reals and decoys.

    >>> PPV(100, 100)
    0.0
    >>> PPV(99, 100)
    0.0
    >>> PPV(100, 0)
    1.0
    >>> str(PPV(95, 5))
    '0.9'

    """

    # We know that the decoys are false positives, and we estimate that an
    # equal number of the "reals" are actually false, too.
    false_positives = 2*decoys
    true_positives = reals - decoys

    if true_positives <= 0:
        return 0.0
    return float(true_positives) / (true_positives + false_positives)
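
# Worked example (restating the doctest above): with 95 reals and 5 decoys we
# assume 5 of the reals are also false, giving 90 true positives and 10 false
# positives, so PPV = 90 / (90 + 10) = 0.9 and the estimated FDR is
# 1 - PPV = 0.1.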


def calculate_inner_threshold(fdr, spinfo):
    ppv_goal = 1 - fdr

    spinfo = sorted(spinfo, key=lambda x: x[3])

    epsilon = +1e-6

    real_count = sum(1 for x in spinfo if x[5] == 'real')
    decoy_count = sum(1 for x in spinfo if x[5] == 'decoy')

    current_threshold = -1e100      # allow all spectra
    for n, sp in enumerate(spinfo):
        if real_count == 0:
            return (None, real_count, decoy_count) # give up

        ppv_est = PPV(real_count, decoy_count)
        if ppv_est >= ppv_goal:
            break
        if sp[5] == 'real':
            real_count -= 1
        elif sp[5] == 'decoy':
            decoy_count -= 1
        # set threshold just high enough to exclude this spectrum
        current_threshold = sp[3] + epsilon
    else:
        current_threshold = spinfo[-1][3] + epsilon # couldn't meet goal

    return (current_threshold, real_count, decoy_count)
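
# Explanatory note (not original commentary): the scan above considers spectra
# worst-score first; each spectrum dropped from the real/decoy counts pushes
# the threshold just above its score, and the loop stops as soon as the
# estimated PPV of the remaining spectra reaches the goal (or gives up if no
# reals remain).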


def calculate_combined_thresholds(fdr, spectrum_info):
    """Find best score/delta thresholds for each charge."""

    ppv_goal = 1 - fdr

    # Rather than search every possible value of delta, we're only going to
    # "sample" at this granularity.  This cuts search time dramatically (and
    # makes it O(n) instead of O(n**2)).  Extra precision wouldn't really be
    # useful in any case.
    SEARCH_GRANULARITY = 0.001

    # charge -> (score, delta, passing_reals, passing_decoys)
    thresholds = {}

    by_charge = lambda x: x[2]
    spectrum_info.sort(key=by_charge)
    for charge, spinfo in itertools.groupby(spectrum_info, key=by_charge):
        spinfo0 = sorted(spinfo, key=lambda x: x[4], reverse=True)

        last_value = None

        while spinfo0:
            this_value = spinfo0[-1][4] # current delta
            if (last_value == None
                or abs(this_value - last_value) >= SEARCH_GRANULARITY):

                # "inner" is score
                r = calculate_inner_threshold(fdr, spinfo0)
                threshold, real_count, decoy_count = r
                if threshold != None:
                    debug('# %s %s %s %s %s', charge, threshold, this_value,
                          real_count, decoy_count)
                    if (charge not in thresholds
                        or real_count > thresholds[charge][2]):
                        thresholds[charge] = (threshold, this_value,
                                              real_count, decoy_count)
                last_value = this_value
            spinfo0.pop()

    return thresholds
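
# For example (hypothetical values), the mapping returned above might look
# like
#
#     {2: (7.25, 0.12, 1400, 14),
#      3: (9.00, 0.15,  600,  6)}
#
# that is, charge -> (score threshold, delta threshold, passing reals,
# passing decoys), as noted in the comment above.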


def flatten(gen_of_gens):
    """Yield items from a generator of generators.

    >>> list(flatten([(1,2,3), (4, (5, 'b'), 6)]))
    [1, 2, 3, 4, (5, 'b'), 6]

    """

    for gen in gen_of_gens:
        for item in gen:
            yield item


def redundancy_filter(options, spectrum_info):
    if options.minimum_spectra_per_locus < 2:
        return [ si for si in spectrum_info ]

    # si = (spectrum_no, spectrum_name, charge, score, delta, state,
    #       peptide, stripped_peptide, actual_mass, mass_delta, loci)

    Rs = sorted(flatten(((locus, si) for locus in si[10])
                        for si in spectrum_info))

    result = set()
    for locus, locus_Rs in itertools.groupby(Rs, key=lambda x: x[0]):
        list_locus_si = [ si for locus, si in locus_Rs ]
        if len(list_locus_si) < options.minimum_spectra_per_locus:
            continue
        result |= set(list_locus_si)
    return sorted(result)


def saturate_by_prefix_suffix(spectrum_info, saturation_spectrum_info):
    PREFIX_LENGTH = 16              # empirically seems good
    peptide_prefixes = set(si[7][:PREFIX_LENGTH] for si in spectrum_info)
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7][:PREFIX_LENGTH] in peptide_prefixes)
    info("saturation by prefix (w/%s): %s -> %s",
         len(saturation_spectrum_info), len(spectrum_info), len(sat_si))
    return sat_si


def saturate(spectrum_info, saturation_spectrum_info):
    peptides = set(si[7] for si in spectrum_info)
    # FIX: does this actually need to be sorted?
    sat_si = sorted(ssi for ssi in saturation_spectrum_info
                    if ssi[7] in peptides)
    info("saturation (w/%s): %s -> %s", len(saturation_spectrum_info),
         len(spectrum_info), len(sat_si))
    return saturate_by_prefix_suffix(sat_si, saturation_spectrum_info)


def fdr_stats(spectrum_info):
    reals = sum(1 for si in spectrum_info if si[5] == 'real')
    decoys = sum(1 for si in spectrum_info if si[5] == 'decoy')
    fdr = 1 - PPV(reals, decoys)
    return fdr, reals, decoys

def debug_fdr(options, msg, si):
    if options.verbose:
        fdr, reals, decoys = fdr_stats(si)
        info('%s %s %s %s', msg, fdr, reals, decoys)

def try_fdr(fdr_guess, options, remaining_spectrum_info,
            saturation_spectrum_info=None):
    info("trying FDR = %s", fdr_guess)
    debug_fdr(options, "initial", remaining_spectrum_info)

    remaining_spectrum_info_1 = None
    while True:
        thresholds = calculate_combined_thresholds(fdr_guess,
                                                   remaining_spectrum_info)
        remaining_spectrum_info = [ si for si in remaining_spectrum_info
                                    if (si[2] in thresholds
                                        and si[3] >= thresholds[si[2]][0]
                                        and si[4] >= thresholds[si[2]][1]) ]
        debug_fdr(options, "after thresholding", remaining_spectrum_info)
        if remaining_spectrum_info == remaining_spectrum_info_1:
            break
        remaining_spectrum_info_1 = redundancy_filter(options,
                                                      remaining_spectrum_info)
        debug_fdr(options, "after redundancy", remaining_spectrum_info_1)
        if len(remaining_spectrum_info_1) == len(remaining_spectrum_info):
            break
        remaining_spectrum_info = remaining_spectrum_info_1

    fdr_result, total_reals, total_decoys = fdr_stats(remaining_spectrum_info_1)
    if not options.no_saturation and saturation_spectrum_info:
        remaining_spectrum_info_1 = saturate(remaining_spectrum_info_1,
                                             saturation_spectrum_info)
        debug_fdr(options, "after saturation", remaining_spectrum_info_1)
        (fdr_result,
         total_reals, total_decoys) = fdr_stats(remaining_spectrum_info_1)

    return (fdr_result, thresholds, total_reals, total_decoys,
            remaining_spectrum_info_1)


# FIX: at various points here we may have a goal of a minimum number of real
# ids, based on goal FDR or whatever.  We can pass this inward to do
# branch-and-bound, which may save time.

# FIX: this is now somewhat slower than before--any way to speed it up?

def search_adjusting_fdr(options, spectrum_info_0):
    """Repeatedly calculate thresholds while adjusting FDR, looking for an FDR
    guess that results in an appropriate goal FDR.  This might be needed
    because the thresholds found for per-charge FDRs will be saturated or have
    quantization errors, resulting in overly conservative thresholds.  Returns
    a list of items from spectrum_info that are considered valid.
    """

    FDR_INFLATION_FACTOR = 3
    assert FDR_INFLATION_FACTOR > 1

    spectrum_info = redundancy_filter(options, spectrum_info_0)

    low_fdr = options.fdr
    # FIX: spectrum_info_0 has already been filtered for tryptic status
    # Should we use an unfiltered set instead?
    low_results = try_fdr(low_fdr, options, spectrum_info, spectrum_info_0)

    if not options.no_adjust_fdr:
        high_fdr = min(1.0, options.fdr * FDR_INFLATION_FACTOR)
        high_results = try_fdr(high_fdr, options, spectrum_info,
                               spectrum_info_0)
        initial_total_reals = low_results[2]

        for i in range(32):
            debug("adjust infl fdr %s %s -> %s", low_fdr, high_fdr,
                  high_results[0])
            if high_results[0] >= options.fdr:
                break
            low_fdr, low_results = high_fdr, high_results
            high_fdr = min(1.0, high_fdr * FDR_INFLATION_FACTOR)
            if high_fdr == 1.0:
                break
            high_results = try_fdr(high_fdr, options, spectrum_info,
                                   spectrum_info_0)
        else:
            warning("FDR adjustment inflation failed")

        # l        g        h
        # l  g  h

        CONVERGENCE_FACTOR = 0.01
        assert CONVERGENCE_FACTOR > 0
        for i in range(32):
            assert high_fdr >= low_fdr
            if (high_fdr - low_fdr) <= CONVERGENCE_FACTOR*options.fdr:
                break

            # - seems to be slower than the naive method?
            # Bias is empirically determined.  It's supposed to speed
            # convergence, but should not affect the results.
            # FIX: better method possible?
            #GUESS_BIAS = 1.15
            #prop = ((options.fdr - low_results[0])
            #        / (high_results[0] - low_results[0]))
            #prop = min(1, max(0, prop))
            #guess_fdr = low_fdr + prop**GUESS_BIAS * (high_fdr - low_fdr)

            guess_fdr = (high_fdr + low_fdr) / 2.0
            guess_results = try_fdr(guess_fdr, options, spectrum_info,
                                    spectrum_info_0)
            debug("adjust fdr %s %s %s -> %s", low_fdr, guess_fdr, high_fdr,
                  guess_results[0])
            if guess_results[0] > options.fdr:
                high_fdr, high_results = guess_fdr, guess_results
            else:
                low_fdr, low_results = guess_fdr, guess_results
        else:
            warning("FDR adjustment convergence failed")

    if not options.no_adjust_fdr:
        info("FDR adjustment found %s extra real ids",
             low_results[2] - initial_total_reals)

    return low_results
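
# Explanatory note (not original commentary): the adjustment above amounts to
# bracketing plus bisection on the FDR guess.  The guess is inflated by
# FDR_INFLATION_FACTOR until the resulting FDR reaches the goal, and the
# bracket [low_fdr, high_fdr] is then bisected until its width is at most
# CONVERGENCE_FACTOR times the goal FDR.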


def search_with_possible_saturation(options, spectrum_info):

    for saturate in (True, False):
        if saturate and options.no_saturation:
            continue
        results = search_adjusting_fdr(options, spectrum_info)
        (fdr_result, thresholds,
         total_reals, total_decoys, valid_spectrum_info) = results
        if fdr_result <= options.fdr:
            break
        if saturate:
            info('could not achieve FDR with saturation, retrying without')
            options.no_saturation = True

    if options.output_peptides:
        valid_spectrum_info.sort(key=lambda x: x[1])
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            print >> options.output_peptides, \
                ("%+d %s %s %s %.8f %.8f"
                 % (charge, spectrum_name, state, peptide, actual_mass,
                    mass_delta))

    for charge in sorted(thresholds.keys()):
        score_threshold, delta_threshold = thresholds[charge][0:2]
        reals, decoys = thresholds[charge][2:4]
        info("%+d: score %f, delta %.6f -> %s real ids, %s decoys (fdr %.4f)",
             charge, score_threshold, delta_threshold, reals, decoys,
             0.5 * (1 - PPV(reals, decoys)))

    if not options.quiet:
        print ("%s real ids, %s decoys (FDR = %.4f) [p%s]"
               % (total_reals, total_decoys, fdr_result / 2.0,
                  options.minimum_spectra_per_locus))

    return valid_spectrum_info


def print_mass_error_histogram(valid_spectrum_info):
    hist = defaultdict(int)

    for (spectrum_no, spectrum_name, charge, score, delta, state, peptide,
         stripped_peptide, actual_mass, mass_delta,
         loci) in valid_spectrum_info:
        hist[round(mass_delta)] += 1

    pairs_most_first = sorted(((bin, hist[bin]) for bin in hist),
                              key=lambda x: x[1], reverse=True)

    total = sum(count for bin, count in pairs_most_first)

    cumulative_total = 0
    for bin, count in pairs_most_first:
        cumulative_total += count
        print '# %+d\t%s\t%.3f' % (bin, count,
                                   (float(cumulative_total) / total))


def main(args=sys.argv[1:]):
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <sqt-file>...",
                                   description=__doc__,
                                   version=greylag.VERSION)
    pa = parser.add_option
    pa("--decoy-prefix", dest="decoy_prefix",
       default=greylag.DEFAULT_DECOY_PREFIX,
       help='prefix given to locus name of decoy (e.g., shuffled) database'
       ' sequences [default=%r]' % greylag.DEFAULT_DECOY_PREFIX,
       metavar="PREFIX")
    DEFAULT_FDR = 0.01
    pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR,
       help="false discovery rate [default=%s] (Note that this is the"
       " final resulting FDR after the decoys have been removed.)" % DEFAULT_FDR,
       metavar="PROPORTION")
    pa("--no-adjust-fdr", action="store_true", dest="no_adjust_fdr",
       help="skip adjustment of internal FDR to achieve a final FDR closer to"
       " requested FDR (which takes longer, but may find more real ids in"
       " some cases)")
    pa("--no-saturation", action="store_true", dest="no_saturation",
       help="skip peptide saturation")
    pa("-v", "--verbose", action="store_true", dest="verbose",
       help="be verbose")
    pa("-q", "--quiet", action="store_true", dest="quiet", help="no warnings")
    pa("-l", "--logfile", dest="logfile",
       help="log to FILE instead of stderr", metavar="FILE")
    pa("--output-peptides", dest="output_peptides",
       help="output information about passing peptides to specified file"
       " ('-' for stdout)", metavar="FILENAME")
    pa("-t", "--minimum-trypticity", type="choice", choices=('0', '1', '2'),
       default='0', dest="minimum_trypticity",
       help="drop peptides with too few tryptic ends"
       " (2=fully tryptic, 1=semi-tryptic, 0=any)", metavar="TRYPTICITY")
    DEFAULT_MINIMUM_SPECTRA_PER_LOCUS = 1
    pa("-p", "--minimum-spectra-per-locus", dest="minimum_spectra_per_locus",
       type="int", default=DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
       help="only loci with at least this many spectra are included, and"
       " spectra not associated with such a locus are excluded"
       " [default=%s]" % DEFAULT_MINIMUM_SPECTRA_PER_LOCUS,
       metavar="COUNT")
    DEFAULT_MAXIMUM_SP_RANK = 1000000
    pa("--maximum-sp-rank", dest="maximum_sp_rank", type="int",
       default=DEFAULT_MAXIMUM_SP_RANK,
       help="drop peptides with greater Sp rank (the secondary rank in sqt"
       " files, for compatibility with other search programs)"
       " [default=%s]" % DEFAULT_MAXIMUM_SP_RANK, metavar="RANK")
    #pa("--graph", dest="graph",
    #   help='create distribution graphs, using the specified prefix',
    #   metavar="PATH PREFIX")
    pa("-M", "--mass-error-histogram", action="store_true",
       dest="mass_error_histogram",
       help="print a histogram of mass errors (1-Da-wide bins)")
    pa("--debug", action="store_true", dest="debug",
       help="show debug output")
    pa("--mark", action="store_true", dest="mark",
       help="rewrite the input files, changing some validation marks to 'N',"
       " according to filtering")
    pa("--reset-marks", action="store_true", dest="reset_marks",
       help="rewrite the input files, changing all validation marks to 'U'")
    pa("--kill", action="store_true", dest="kill",
       help="rewrite the input files, removing spectra that don't pass"
       " validation, according to filtering")
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    (options, args) = parser.parse_args(args=args)

    if options.copyright:
        print __copyright__
        sys.exit(0)
    if (len(args) < 1 or options.minimum_spectra_per_locus < 0
        or options.maximum_sp_rank < 1):
        parser.print_help()
        sys.exit(1)

    if not (0.0 <= options.fdr < 0.5):
        error("--fdr must be within range [0.0, 0.5)")
    if (sum(1 for x in (options.mark, options.reset_marks, options.kill) if x)
        > 1):
        error("only one of --mark, --reset-marks, --kill may be specified")

    options.minimum_trypticity = int(options.minimum_trypticity)

    greylag.set_logging(options)

    if options.reset_marks:
        write_spectra_to_files(reset_marks(generate_spectra_from_files(args)),
                               args)
        return

    if options.output_peptides:
        if options.output_peptides == '-':
            options.output_peptides = sys.stdout
        else:
            options.output_peptides = open(options.output_peptides, 'w')

    # This is to translate the "effective" FDR, which is the rate after decoys
    # have been stripped out of the results (which is what users really care
    # about), into our internal FDR, which includes decoys.
    options.fdr *= 2

    info("reading spectra")
    spectrum_info = list(remove_charge_aliases(get_spectrum_info(options,
                                     generate_spectra_from_files(args))))

    # valid_spectrum_info is the list of spectrum_info elements that have been
    # chosen as "valid"
    valid_spectrum_info = search_with_possible_saturation(options,
                                                          spectrum_info)

    if options.mass_error_histogram:
        print_mass_error_histogram(valid_spectrum_info)

    if options.debug:
        valid_spectrum_info.sort()
        for (spectrum_no, spectrum_name, charge, score, delta, state,
             peptide, stripped_peptide, actual_mass,
             mass_delta, loci) in valid_spectrum_info:
            debug('#S %s %s %s', charge, score, delta)

    if options.mark or options.kill:
        # Note that we are intentionally reading the input files again here,
        # to avoid having to keep them all in memory.
        info("rewriting spectrum files")
        write_spectra_to_files(set_marks(valid_spectrum_info,
                                         generate_spectra_from_files(args),
                                         options.kill),
                               args)
        info("finished rewriting")


if __name__ == '__main__':
    main()
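
# Illustrative invocation (a sketch; 'sample1.sqt' and 'sample2.sqt' are
# hypothetical input files):
#
#     greylag_validate.py --fdr 0.01 -t 2 --mark sample1.sqt sample2.sqt
#
# This would rewrite the input files in place (keeping '.bak' backups),
# marking spectra that do not pass a 1% FDR at full trypticity with 'N'.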