From 2137a83cefcb333bd9bba38330237bdf9100069b Mon Sep 17 00:00:00 2001 From: Michael Coleman Date: Mon, 26 Nov 2007 23:45:58 -0600 Subject: [PATCH] Merged sqt-filter into greylag-validate. This version just does the basic two-dimensional score/delta FDR climb. (no graphs) --- Makefile | 8 +- greylag_validate.py | 129 +++++++++++++++------- sqt_filter.py | 304 ---------------------------------------------------- 3 files changed, 97 insertions(+), 344 deletions(-) delete mode 100755 sqt_filter.py diff --git a/Makefile b/Makefile index 1597fe5..62d26b6 100644 --- a/Makefile +++ b/Makefile @@ -13,12 +13,12 @@ # MARCH = pentium3 # MARCH = pentium4 -MARCH = prescott +# MARCH = prescott # MARCH = opteron -# MARCH = nocona +MARCH = nocona -DEST = /n/site/inst/Linux-i686/bioinfo/greylag/ +DEST = /n/site/inst/Linux-x86_64/bioinfo/greylag/ PYTHONVER=2.5 PYTHONFLAGS = $(shell python$(PYTHONVER)-config --include) @@ -89,7 +89,7 @@ install:: install -p greylag_merge.py $(DEST)/greylag-merge install -p greylag_sqt.py $(DEST)/greylag-sqt install -p greylag_solo.py $(DEST)/greylag-solo - install -p sqt_filter.py $(DEST)/sqt-filter + install -p greylag_validate.py $(DEST)/greylag-validate @echo "# remember to link 'greylag-python' to your python $(PYTHONVER)" clean:: diff --git a/greylag_validate.py b/greylag_validate.py index 76acb88..d5d1227 100755 --- a/greylag_validate.py +++ b/greylag_validate.py @@ -1,8 +1,9 @@ #!/usr/bin/env greylag-python """ -Filter a set of sqt files according to FDR and other criteria, optionally -rewriting them with the validation marks set to 'N' for filtered-out spectra. +Given a set of sqt files, determine a 'valid' set of identifications that +satisfy the specified FDR (false discovery rate). Optionally rewrite the sqt +files with validation marks set to 'N' for filtered-out spectra. """ @@ -33,6 +34,7 @@ from collections import defaultdict import fileinput import optparse from pprint import pprint +import string import sys @@ -94,6 +96,15 @@ def mark(options, thresholds, sp_scores, sqt_fns): raise +nulltrans = string.maketrans('','') +non_aa_chars = string.digits + string.punctuation + +def stripmods(s): + """Given a peptide, return just the unmodified peptide seen (upcased).""" + # assuming no control chars present + return s.upper().translate(nulltrans, non_aa_chars) + + def read_sqt_info(decoy_prefix, is_laf, sqt_fns): """Return a pair, the first a dict mapping each charge to a list of (score, delta, state), where state is 'real' or 'decoy', for all the @@ -114,14 +125,20 @@ def read_sqt_info(decoy_prefix, is_laf, sqt_fns): current_sp_rank = None current_peptide_length = None current_peptide_trypticity = None # or one of [ 0, 1, 2 ] - current_state = set() + current_state = set() # 'real', 'decoy' or both + highest_rank_seen = 0 + best_peptide_seen = None + + def passes_laf(length, sp_rank, trypticity): + return length >= 7 and sp_rank <= 10 and trypticity == 2 for line in fileinput.input(sqt_fns): fs = line.split('\t') if fs[0] == 'S': if (current_charge != None and - (not is_laf or (current_sp_rank <= 10 - and current_peptide_trypticity == 2))): + (not is_laf or passes_laf(current_peptide_length, + current_sp_rank, + current_peptide_trypticity))): if current_score != None and len(current_state) == 1: z_scores[current_charge].append((current_score, current_delta, @@ -136,19 +153,41 @@ def read_sqt_info(decoy_prefix, is_laf, sqt_fns): current_peptide_length = None current_peptide_trypticity = None current_state = set() + highest_rank_seen = 0 + best_peptide_seen = None elif fs[0] == 'M': - delta, score, sp_rank, peptide = (float(fs[4]), float(fs[5]), - int(fs[2]), fs[9].strip()) - pep_length = len(peptide) - 4 + rank, delta, score, sp_rank, peptide, mass = (int(fs[1]), + float(fs[4]), + float(fs[5]), + int(fs[2]), + fs[9].strip(), + float(fs[3])) + assert peptide[1] == '.' and peptide[-2] == '.' + peptide = peptide[2:-2] + + if rank > highest_rank_seen + 1: + # ignore aux top SpRank hits, because delta confounds + continue + highest_rank_seen = rank + + peptide_stripped = stripmods(peptide) + if best_peptide_seen == None: + best_peptide_seen = peptide_stripped if delta == 0: + assert current_delta == 0 current_score = score - elif current_delta == 0: + elif (current_delta == 0 or best_peptide_seen == None + or best_peptide_seen == peptide_stripped): current_delta = delta + if best_peptide_seen != peptide_stripped: + # once we've seen a non-equivalent peptide, don't set + # delta if we see further equivalent peptides + best_peptide_seen = '####' if current_sp_rank == None: current_sp_rank = sp_rank if current_peptide_length == None: - current_peptide_length = pep_length + current_peptide_length = len(peptide) if current_peptide_trypticity == None: current_peptide_trypticity = 0 if (peptide[0] in ('K', 'R') @@ -163,35 +202,42 @@ def read_sqt_info(decoy_prefix, is_laf, sqt_fns): current_state.add('decoy') else: current_state.add('real') + # handle final spectrum, as above if (current_charge != None and - (not is_laf or (current_sp_rank <= 10 - and current_peptide_trypticity == 2))): + (not is_laf or passes_laf(current_peptide_length, + current_sp_rank, + current_peptide_trypticity))): if current_score != None and len(current_state) == 1: - z_scores[current_charge].append((current_score, current_delta, + z_scores[current_charge].append((current_score, + current_delta, current_state.pop())) if current_charge != None: - sp_scores.append((current_charge, current_score, current_delta)) + sp_scores.append((current_charge, current_score, + current_delta)) return (z_scores, sp_scores) def specificity(positives, negatives): - return (float(positives - negatives) - / (positives + negatives)) + return float(positives - negatives) / positives + +def calculate_inner_threshold(fdr, charge, spinfo): + specificity_goal = 1 - fdr -def calculate_inner_threshold(specificity_goal, charge, spinfo): spinfo = sorted(spinfo, key=lambda x: x[0]) + epsilon = -1e-9 + real_count = sum(1 for x in spinfo if x[-1] == 'real') decoy_count = len(spinfo) - real_count - if real_count == 0: - return (None, real_count, decoy_count) # give up - current_threshold = -1e100 # allow all spectra for n, sp in enumerate(spinfo): + if real_count == 0: + return (None, real_count, decoy_count) # give up + specificity_est = specificity(real_count, decoy_count) if specificity_est >= specificity_goal: break @@ -200,9 +246,9 @@ def calculate_inner_threshold(specificity_goal, charge, spinfo): else: decoy_count -= 1 # set threshold just high enough to exclude this spectrum - current_threshold = sp[0] + 1e-6 + current_threshold = sp[0] + epsilon else: - current_threshold = spinfo[-1][0] + 1e-6 # couldn't meet goal + current_threshold = spinfo[-1][0] + epsilon # couldn't meet goal return (current_threshold, real_count, decoy_count) @@ -214,7 +260,7 @@ def calculate_combined_thresholds(options, z_scores): # Rather than search every possible value of delta, we're only going to # "sample" at this granularity. This cuts search time dramatically (and - # making it O(n) instead of O(n**2). Extra precision wouldn't really be + # makes it O(n) instead of O(n**2). Extra precision wouldn't really be # useful in any case. SEARCH_GRANULARITY = 0.001 @@ -232,8 +278,7 @@ def calculate_combined_thresholds(options, z_scores): or abs(this_value - last_value) >= SEARCH_GRANULARITY): # "inner" is score - r = calculate_inner_threshold(specificity_goal, charge, - spinfo0) + r = calculate_inner_threshold(options.fdr, charge, spinfo0) if r[0] != None: if options.debug: print '#', charge, r[0], this_value, r[1], r[2] @@ -245,12 +290,12 @@ def calculate_combined_thresholds(options, z_scores): spinfo0.pop() if charge in thresholds: - if options.verbose: - print ("%+d: score %s, delta %s -> %s real ids (fdr %.4f)" - % (charge, thresholds[charge][0], thresholds[charge][1], - thresholds[charge][2], - 1 - specificity(thresholds[charge][2], - thresholds[charge][3]))) + print ("%+d: score %s, delta %s -> %s real ids, %s decoys" + " (fdr %.4f)" + % (charge, thresholds[charge][0], thresholds[charge][1], + thresholds[charge][2], thresholds[charge][3], + 1 - specificity(thresholds[charge][2], + thresholds[charge][3]))) else: warn("could not calculate thresholds for %+d" % charge) @@ -262,15 +307,22 @@ def main(args=sys.argv[1:]): "usage: %prog [options] ...", description=__doc__, version=__version__) pa = parser.add_option - pa("--decoy-prefix", dest="decoy_prefix", default="SHUFFLED_", + DEFAULT_DECOY_PREFIX = "SHUFFLED_" + pa("--decoy-prefix", dest="decoy_prefix", default=DEFAULT_DECOY_PREFIX, help='prefix given to locus name of decoy (e.g., shuffled) database' - ' sequences [default="SHUFFLED_"]', metavar="PREFIX") - pa("--fdr", dest="fdr", type="float", default="0.02", - help="false discovery rate [default=0.02]", metavar="PROPORTION") + ' sequences [default=%r]' % DEFAULT_DECOY_PREFIX, metavar="PREFIX") + DEFAULT_FDR = 0.02 + pa("--fdr", dest="fdr", type="float", default=DEFAULT_FDR, + help="false discovery rate [default=%s]" % DEFAULT_FDR, + metavar="PROPORTION") pa("-v", "--verbose", action="store_true", dest="verbose", help="be verbose") pa("--laf", action="store_true", dest="laf", - help="drop peptides of length < 7 or Sp rank > 10") + help="drop peptides that have length < 7, Sp rank > 10," + " or that are not fully tryptic") + #pa("--graph", dest="graph", + # help='create distribution graphs, using the specified prefix', + # metavar="PATH PREFIX") pa("--debug", action="store_true", dest="debug", help="show debug output") pa("-m", "--mark", action="store_true", dest="mark", @@ -302,6 +354,11 @@ def main(args=sys.argv[1:]): z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix, options.laf, args) + if options.debug: + print ('%s passing spectra, of which %s are not decoys' + % (len(spectrum_scores), sum(len(z_scores[x]) + for x in z_scores))) + thresholds = calculate_combined_thresholds(options, z_scores) if options.debug: diff --git a/sqt_filter.py b/sqt_filter.py deleted file mode 100755 index 49733df..0000000 --- a/sqt_filter.py +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env greylag-python - -""" -Filter a set of sqt files according to FDR and other criteria, optionally -rewriting them with the validation marks set to 'N' for filtered-out spectra. - -""" - -from __future__ import with_statement - -__copyright__ = ''' - greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -''' - -__version__ = "0.0" - - -from collections import defaultdict -import fileinput -import optparse -from pprint import pprint -import sys - - -def warn(s): - print >> sys.stderr, 'warning:', s -def error(s): - sys.exit('error: ' + s) -def fileerror(s, *args): - error(s + (", at line %s of file '%s'" - % (fileinput.filelineno(), fileinput.filename())), - *args) - -def inplace_warning(): - warn("!!!\nan error occurred while modifying .sqt files in-place--it may" - " be necessary to recover some or all of the .sqt files from the" - " corresponding .sqt.bak files.\n!!!") - - -def reset_marks(options, sqt_fns): - """Rewrite all evaluation marks to 'U', in-place.""" - - try: - for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'): - if line.startswith("M\t"): - fs = line.split("\t") - if len(fs) >= 11: - fs[10] = 'U' + fs[10][1:] - line = '\t'.join(fs) - sys.stdout.write(line) - except: - inplace_warning() - raise - - -def mark(options, thresholds, sp_scores, sqt_fns): - """Rewrite evaluation marks to 'N', in-place, for spectra not meeting - score and delta thresholds.""" - - spectrum_no = -1 - mark_spectrum = False - - try: - for line in fileinput.input(sqt_fns, inplace=1, backup='.bak'): - if line.startswith("S\t"): - spectrum_no += 1 - charge, score, delta = sp_scores[spectrum_no] - mark_spectrum = (charge in thresholds - and score != None - and (score < thresholds[charge][0] - or delta < thresholds[charge][1])) - elif line.startswith("M\t") and mark_spectrum: - fs = line.split("\t") - if len(fs) >= 11: - fs[10] = 'N' + fs[10][1:] - line = '\t'.join(fs) - sys.stdout.write(line) - except: - inplace_warning() - raise - - -def read_sqt_info(decoy_prefix, is_laf, sqt_fns): - """Return a pair, the first a dict mapping each charge to a list of - (score, delta, state), where state is 'real' or 'decoy', for all the - spectra in sqt_fns, and the second a list of (charge, score, delta), one - for each spectrum. - """ - - # charge -> [ (score, delta, state), ... ] - # where state is either 'real' or 'decoy' - z_scores = defaultdict(list) - - # [ (charge, score, delta), ... ] - sp_scores = [] - - current_charge = None - current_score = None - current_delta = 0 - current_sp_rank = None - current_peptide_length = None - current_state = set() - - for line in fileinput.input(sqt_fns): - fs = line.split('\t') - if fs[0] == 'S': - if (current_charge != None and - (not is_laf or (current_sp_rank <= 10))): - if current_score != None and len(current_state) == 1: - z_scores[current_charge].append((current_score, - current_delta, - current_state.pop())) - if current_charge != None: - sp_scores.append((current_charge, current_score, - current_delta)) - current_charge = int(fs[3]) - current_score = None - current_delta = 0 - current_sp_rank = None - current_peptide_length = None - current_state = set() - elif fs[0] == 'M': - delta, score, sp_rank, pep_length = (float(fs[4]), float(fs[5]), - int(fs[2]), len(fs[9])-4) - if delta == 0: - current_score = score - elif current_delta == 0: - current_delta = delta - if current_sp_rank == None: - current_sp_rank = sp_rank - if current_peptide_length == None: - current_peptide_length = pep_length - elif fs[0] == 'L': - if current_delta == 0: - if fs[1].startswith(decoy_prefix): - current_state.add('decoy') - else: - current_state.add('real') - # handle final spectrum, as above - if (current_charge != None and - (not is_laf or (current_sp_rank <= 10))): - if current_score != None and len(current_state) == 1: - z_scores[current_charge].append((current_score, current_delta, - current_state.pop())) - if current_charge != None: - sp_scores.append((current_charge, current_score, current_delta)) - - return (z_scores, sp_scores) - - -def specificity(positives, negatives): - return (float(positives - negatives) - / (positives + negatives)) - - -def calculate_inner_threshold(specificity_goal, charge, spinfo): - spinfo = sorted(spinfo, key=lambda x: x[0]) - - real_count = sum(1 for x in spinfo if x[-1] == 'real') - decoy_count = len(spinfo) - real_count - - if real_count == 0: - return (None, real_count, decoy_count) # give up - - current_threshold = -1e100 # allow all spectra - for n, sp in enumerate(spinfo): - specificity_est = specificity(real_count, decoy_count) - if specificity_est >= specificity_goal: - break - if sp[-1] == 'real': - real_count -= 1 - else: - decoy_count -= 1 - # set threshold just high enough to exclude this spectrum - current_threshold = sp[0] + 1e-6 - else: - current_threshold = spinfo[-1][0] + 1e-6 # couldn't meet goal - - return (current_threshold, real_count, decoy_count) - - -def calculate_combined_thresholds(options, z_scores): - """Find best score/delta thresholds for each charge.""" - - specificity_goal = 1 - options.fdr - - # Rather than search every possible value of delta, we're only going to - # "sample" at this granularity. This cuts search time dramatically (and - # making it O(n) instead of O(n**2). Extra precision wouldn't really be - # useful in any case. - SEARCH_GRANULARITY = 0.001 - - # charge -> (score, delta, passing_reals, passing_decoys) - thresholds = {} - - for charge, spinfo in z_scores.iteritems(): - spinfo0 = sorted(spinfo, key=lambda x: x[1], reverse=True) - - last_value = None - - while spinfo0: - this_value = spinfo0[-1][1] # current delta - if (last_value == None - or abs(this_value - last_value) >= SEARCH_GRANULARITY): - - # "inner" is score - r = calculate_inner_threshold(specificity_goal, charge, - spinfo0) - if r[0] != None: - if options.debug: - print '#', charge, r[0], this_value, r[1], r[2] - if (charge not in thresholds - or r[1] > thresholds[charge][2]): - thresholds[charge] = (r[0], this_value, r[1], r[2]) - - last_value = this_value - spinfo0.pop() - - if charge in thresholds: - if options.verbose: - print ("%+d: score %s, delta %s -> %s real ids (fdr %.4f)" - % (charge, thresholds[charge][0], thresholds[charge][1], - thresholds[charge][2], - 1 - specificity(thresholds[charge][2], - thresholds[charge][3]))) - else: - warn("could not calculate thresholds for %+d" % charge) - - return thresholds - - -def main(args=sys.argv[1:]): - parser = optparse.OptionParser(usage= - "usage: %prog [options] ...", - description=__doc__, version=__version__) - pa = parser.add_option - pa("--decoy-prefix", dest="decoy_prefix", default="SHUFFLED_", - help='prefix given to locus name of decoy (e.g., shuffled) database' - ' sequences [default="SHUFFLED_"]', metavar="PREFIX") - pa("--fdr", dest="fdr", type="float", default="0.02", - help="false discovery rate [default=0.02]", metavar="PROPORTION") - pa("-v", "--verbose", action="store_true", dest="verbose", - help="be verbose") - pa("--laf", action="store_true", dest="laf", - help="drop peptides of length < 7 or Sp rank > 10") - pa("--debug", action="store_true", dest="debug", - help="show debug output") - pa("-m", "--mark", action="store_true", dest="mark", - help="rewrite the input files, changing some validation marks to 'N'," - " according to filtering") - pa("--reset-marks", action="store_true", dest="reset_marks", - help="rewrite the input files, changing all validation marks to 'U'") - pa("--copyright", action="store_true", dest="copyright", - help="print copyright and exit") - (options, args) = parser.parse_args(args=args) - - if options.copyright: - print __copyright__ - sys.exit(0) - - if len(args) < 1: - parser.print_help() - sys.exit(1) - - if not (0.0 <= options.fdr <= 1.0): - error("--fdr must be within range [0.0, 1.0]") - if options.mark and options.reset_marks: - error("only one of --mark and --reset-marks may be specified") - - if options.reset_marks: - reset_marks(options, args) - return - - z_scores, spectrum_scores = read_sqt_info(options.decoy_prefix, - options.laf, args) - - thresholds = calculate_combined_thresholds(options, z_scores) - - if options.debug: - pprint(thresholds) - - for charge, score, delta in spectrum_scores: - print '#S', charge, score, delta - - if options.mark: - mark(options, thresholds, spectrum_scores, args) - - -if __name__ == '__main__': - main() -- 2.11.4.GIT