cgreylag.cpp

   1 // C++ module for greylag
   2
   3 //     greylag, a collection of programs for MS/MS protein analysis
   4 //     Copyright (C) 2006-2008  Stowers Institute for Medical Research
   5 //
   6 //     This program is free software: you can redistribute it and/or modify
   7 //     it under the terms of the GNU General Public License as published by
   8 //     the Free Software Foundation, either version 3 of the License, or
   9 //     (at your option) any later version.
  10 //
  11 //     This program is distributed in the hope that it will be useful,
  12 //     but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 //     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 //     GNU General Public License for more details.
  15 //
  16 //     You should have received a copy of the GNU General Public License
  17 //     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 //
  19 //     Contact: Mike Coleman
  20 //              Stowers Institute for Medical Research
  21 //              1000 East 50th Street
  22 //              Kansas City, Missouri  64110
  23 //              USA
  24
  25
  26 #include "cgreylag.hpp"
  27
  28
  29 #include <cerrno>
  30 #include <cfloat>
  31 #include <climits>
  32 #include <cstdlib>
  33 #include <cmath>
  34 #include <iostream>
  35 #include <numeric>
  36 #include <set>
  37 #include <stdexcept>
  38
  39
  40
  41 #ifdef __GNU__
  42 // Use faster, non-threadsafe versions, since we don't use threads.  This is
  43 // maybe 10% faster?
  44 #define fgetsX fgets_unlocked
  45 #define fputsX fputs_unlocked
  46 #define fprintfX __builtin_fprintf_unlocked
  47 #define ferrorX ferror_unlocked
  48 #define getcX getc_unlocked
  49 #else
  50 #define fgetsX std::fgets
  51 #define fputsX std::fputs
  52 #define fprintfX std::fprintf
  53 #define ferrorX std::ferror
  54 #define getcX std::getc
  55 #endif
  56
  57
  58 parameters parameters::the;
  59 const parameters &CP = parameters::the;
  60
  61
  62 char *
  63 peak::__repr__() const {
  64   static char temp[128];
  65   sprintf(temp, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
  66           mz, intensity, intensity_class);
  67   return &temp[0];
  68 }
  69
  70 const int FIRST_SPECTRUM_ID = 1;
  71 int spectrum::next_id = FIRST_SPECTRUM_ID;
  72 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
  73
  74 std::vector<spectrum> spectrum::searchable_spectra;
  75 // parent mass -> searchable_spectra index
  76 std::multimap<double,
  77               std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
  78
  79 typedef
  80   std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
  81   spmi_c_it;                    // ugh
  82
  83
  84 char *
  85 spectrum::__repr__() const {
  86   static char temp[1024];
  87   sprintf(temp,
  88           "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
  89           id, physical_id, name.c_str(), mass, charge, peaks.size());
  90   return &temp[0];
  91 }
  92
  93
  94 // Return ln of n_C_k.
  95 static inline double
  96 ln_combination(unsigned int n, unsigned int k) {
  97   // Occasionally happens due to problems in scoring function. (FIX)
  98   if (n < k)
  99     return 0;
 100
 101   assert(0 <= k and k <= n and n < CP.ln_factorial.size());
 102   return CP.ln_factorial[n] - CP.ln_factorial[k] - CP.ln_factorial[n-k];
 103 }
 104
 105
 106 // FIX: Currently we're reading spectrum files in Python, which is fast enough?
 107 // // This is an error exit for read_spectra*.
 108 // static inline void
 109 // io_error(FILE *f, const char *message="") {
 110 //   if (ferrorX(f))
 111 //     message = "I/O error while reading (or writing) spectrum file";
 112 //   throw std::ios_base::failure(message);
 113 // }
 114
 115
 116 // // true iff s consists entirely of whitespace
 117 // static inline bool
 118 // at_eol(const char *s) {
 119 //   return s[strspn(s, " \t\r\n\f\v")] == 0;
 120 // }
 121
 122
 123 // // Read spectra from file in ms2 format, tagging them with file_id.
 124
 125 // // Multiply charged spectra (e.g., +2/+3) are split into separate spectra
 126 // // having the same physical id.  Note that depending on
 127 // // offset_begin/offset_end, we may end up with the first charge and not the
 128 // // second, or vice versa.
 129
 130 // // The peak list is initially sorted by mz.  Throws an exception on invalid
 131 // // input.  Error checking is stringent in this function.
 132
 133 // // This is pretty hideous.  Is there a simpler way to read, reasonably
 134 // // efficiently, catching any input error?
 135
 136 // std::vector<spectrum>
 137 // spectrum::read_spectra_from_ms2(FILE *f, const int file_id) {
 138 //   std::vector<spectrum> spectra;
 139
 140 //   const int bufsiz = 1024;
 141 //   char buf[bufsiz];
 142 //   char *endp;
 143
 144 //   char *result = fgetsX(buf, bufsiz, f);
 145 //   while (true) {
 146 //     std::vector<std::string> names;
 147 //     std::vector<double> masses;
 148 //     std::vector<int> charges;
 149
 150 //     // read headers
 151 //     while (true) {
 152 //       if (ferrorX(f))
 153 //      io_error(f);
 154 //       if (not result or buf[0] != ':')
 155 //      break;
 156 //       names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
 157 //       if (not fgetsX(buf, bufsiz, f))
 158 //      io_error(f, "bad ms2 format: mass/charge line expected");
 159 //       errno = 0;
 160 //       double mass = std::strtod(buf, &endp); // need double accuracy here
 161 //       masses.push_back(mass);
 162 //       if (errno or endp == buf or mass <= 0)
 163 //      io_error(f, "bad ms2 format: bad mass");
 164 //       const char *endp0 = endp;
 165 //       int charge = std::strtol(endp0, &endp, 10);
 166 //       charges.push_back(charge);
 167 //       if (errno or endp == endp0 or charge <= 0)
 168 //      io_error(f, "bad ms2 format: bad charge");
 169 //       if (not at_eol(endp))
 170 //      io_error(f, "bad ms2 format: junk at end of mass/charge line");
 171 //       result = fgetsX(buf, bufsiz, f);
 172 //     }
 173 //     if (not result) {
 174 //       if (not names.empty())
 175 //      io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
 176 //       break;
 177 //     }
 178 //     // read peaks
 179 //     std::vector<peak> peaks;
 180 //     while (true) {
 181 //       peak p;
 182 //       errno = 0;
 183 //       p.mz = strtod(buf, &endp);
 184 //       if (errno or endp == buf or p.mz <= 0)
 185 //      io_error(f, "bad ms2 format: bad peak mz");
 186 //       const char *endp0 = endp;
 187 //       p.intensity = strtod(endp0, &endp);
 188 //       if (errno or endp == endp0 or p.intensity <= 0)
 189 //      io_error(f, "bad ms2 format: bad peak intensity");
 190 //       if (not at_eol(endp))
 191 //      io_error(f, "bad ms2 format: junk at end of peak line");
 192 //       peaks.push_back(p);
 193
 194 //       result = fgetsX(buf, bufsiz, f);
 195 //       if (ferrorX(f))
 196 //      io_error(f);
 197 //       if (not result or buf[0] == ':')
 198 //      break;
 199 //     }
 200
 201 //     // add spectra to vector
 202 //     if (names.empty() and not peaks.empty())
 203 //       io_error(f, "bad ms2 format: missing header lines?");
 204
 205 //     std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 206
 207 //     for (std::vector<double>::size_type i=0; i<names.size(); i++) {
 208 //       spectrum sp(masses[i], charges[i]);
 209 //       sp.peaks = peaks;
 210 //       sp.name = names[i];
 211 //       sp.file_id = file_id;
 212 //       sp.physical_id = next_physical_id;
 213 //       spectra.push_back(sp);
 214 //     }
 215 //     if (file_id != -1)
 216 //       spectrum::next_physical_id += 1;
 217 //   }
 218 //   return spectra;
 219 // }
 220
 221
 222 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
 223 // to remove those too large to be fragment products.  (Peaks are assumed to
 224 // be ordered by increasing mz.)
 225 void
 226 spectrum::filter_peaks(double TIC_cutoff_proportion,
 227                        double parent_mass_tolerance_max) {
 228   if (not (0 <= TIC_cutoff_proportion and TIC_cutoff_proportion <= 1)
 229       or parent_mass_tolerance_max < 0)
 230     throw std::invalid_argument("invalid argument value");
 231
 232   // remove peaks too large to be fragment products
 233   // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
 234   const double peak_mz_limit = mass + parent_mass_tolerance_max;
 235
 236   std::vector<peak>::iterator pi;
 237   for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= peak_mz_limit; pi++);
 238   peaks.erase(pi, peaks.end());
 239
 240   // FIX: note mzLowerBound..mzUpperBound here
 241
 242   // now filter by TIC
 243   std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
 244
 245   total_ion_current = 0;
 246   for (pi=peaks.begin(); pi != peaks.end(); pi++)
 247     total_ion_current += pi->intensity;
 248
 249   const double i_limit = total_ion_current * TIC_cutoff_proportion;
 250   double accumulated_intensity = 0;
 251   for (pi=peaks.begin();
 252        pi != peaks.end() and accumulated_intensity <= i_limit; pi++)
 253     accumulated_intensity += pi->intensity;
 254
 255   peaks.erase(pi, peaks.end());
 256
 257   // restore order
 258   std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 259 }
 260
 261
 262 // Classify peaks and update class_counts and peak bin counts.
 263 // FIX: hoist some of this up into Python?  use (1,2,4) as parameter
 264 void
 265 spectrum::classify(int intensity_class_count, double intensity_class_ratio,
 266                    double fragment_mass_tolerance) {
 267   if (intensity_class_count < 1 or intensity_class_ratio <= 1.0)
 268     throw std::invalid_argument("invalid argument value");
 269
 270   intensity_class_counts.clear();
 271   intensity_class_counts.resize(intensity_class_count);
 272
 273   // e.g., 7 = 1 + 2 + 4
 274   // FIX: does this fail for non-integer ratios?
 275   int min_count = int((pow(intensity_class_ratio, intensity_class_count) - 1)
 276                       / (intensity_class_ratio - 1));
 277
 278   std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
 279   std::vector<peak>::iterator pi=peaks.begin();
 280   for (int i_class=0; i_class<intensity_class_count; i_class++) {
 281     int peaks_this_class = int((pow(intensity_class_ratio, i_class) * peaks.size()
 282                                 / min_count) + 0.5);
 283     for (int cp=0; cp < peaks_this_class and pi != peaks.end(); cp++, pi++) {
 284       pi->intensity_class = i_class;
 285       intensity_class_counts[i_class]++;
 286     }
 287   }
 288   std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 289
 290   if (not peaks.empty()) {
 291     min_peak_mz = peaks.front().mz - CP.fragment_mass_tolerance;
 292     max_peak_mz = peaks.back().mz + CP.fragment_mass_tolerance;
 293     total_peak_bins = (int((max_peak_mz - min_peak_mz)
 294                            / (2 * fragment_mass_tolerance) + 0.5));
 295     // This is the MyriMatch estimate.  Wouldn't it be better to simply count
 296     // how many are empty?
 297     empty_peak_bins = std::max<int>(total_peak_bins - peaks.size(), 0);
 298   }
 299 }
 300
 301
 302 // Update the *_cache fields from the peaks field.
 303 void
 304 spectrum::update_peak_cache() {
 305   clear_peak_cache();
 306
 307   const unsigned int peak_count = peaks.size();
 308   peak_mz_cache = new double[peak_count+1];
 309   peak_intensity_class_cache = new int[peak_count+1];
 310
 311   for (unsigned int i=0; i<peak_count; i++) {
 312     peak_mz_cache[i] = peaks[i].mz;
 313     peak_intensity_class_cache[i] = peaks[i].intensity_class;
 314   }
 315
 316   // negative value is terminator
 317   peak_mz_cache[peak_count] = -1;
 318   peak_intensity_class_cache[peak_count] = -1;
 319 }
 320
 321
 322 // Store the list of spectra that search_peptide will search against, and also
 323 // build spectrum_mass_index.
 324 void
 325 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
 326   for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
 327     spectra[i].clear_peak_cache();
 328   searchable_spectra = spectra;
 329
 330   spectrum_mass_index.clear();
 331   for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++) {
 332     spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
 333     searchable_spectra[i].update_peak_cache();
 334   }
 335 }
 336
 337
 338 // Calculate peaks for a synthesized mass (not mz) ladder.
 339 static inline void
 340 get_synthetic_Y_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
 341                             const double *mass_list,
 342                             const double fragment_C_fixed_mass) {
 343   double m = fragment_C_fixed_mass;
 344
 345   for (unsigned int i=ladder_size; i>0; i--) {
 346     m += mass_list[i];
 347     mass_ladder[ladder_size-i] = m;
 348   }
 349 }
 350
 351
 352 // Calculate peaks for a synthesized mass (not mz) ladder.
 353 static inline void
 354 get_synthetic_B_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
 355                             const double *mass_list,
 356                             const double fragment_N_fixed_mass) {
 357   double m = fragment_N_fixed_mass;
 358
 359   for (unsigned int i=0; i<ladder_size; i++) {
 360     m += mass_list[i];
 361     mass_ladder[i] = m;
 362   }
 363 }
 364
 365
 366 // Generate synthetic spectra for a set of charges.  Only the charge and peaks
 367 // of these spectra are initialized.
 368 static inline void
 369 synthetic_spectra(double synth_sp_mz[/* max_fragment_charge+1 */]
 370                                     [spectrum::MAX_THEORETICAL_FRAGMENTS],
 371                   const double *mass_list, const unsigned int mass_list_size,
 372                   const double fragment_N_fixed_mass,
 373                   const double fragment_C_fixed_mass,
 374                   const int max_fragment_charge) {
 375   assert(mass_list_size >= 1);
 376   const unsigned int ladder_size = mass_list_size - 1;
 377   double Y_mass_ladder[ladder_size];
 378   double B_mass_ladder[ladder_size];
 379
 380   for (int charge=1; charge<=max_fragment_charge; charge++) {
 381     get_synthetic_Y_mass_ladder(Y_mass_ladder, ladder_size, mass_list,
 382                                 fragment_C_fixed_mass);
 383     get_synthetic_B_mass_ladder(B_mass_ladder, ladder_size, mass_list,
 384                                 fragment_N_fixed_mass);
 385
 386     unsigned int y_i=0, b_i=0;
 387     double *sp_mz_p = synth_sp_mz[charge];
 388
 389     // merge the two (already sorted) mass lists, converting them to mz values
 390     // in the process
 391     // FIX: try algorithm::merge?
 392     while (y_i < ladder_size and b_i < ladder_size)
 393       if (Y_mass_ladder[y_i] < B_mass_ladder[b_i])
 394         *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
 395       else
 396         *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
 397     while (y_i < ladder_size)
 398       *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
 399     while (b_i < ladder_size)
 400       *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
 401     *sp_mz_p = -1;              // invalid mz as terminator
 402   }
 403 }
 404
 405
 406 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
 407 // theoretical spectrum (y) generated from a peptide candidate.
 408 //
 409 // FIX: This code seems complicated.  Is the MyriMatch idea of using maps (or
 410 // better yet, sets) for peak lists simpler?  As fast or faster?
 411 //
 412 // This is the innermost loop, so it's worthwhile to optimize this some.
 413 // FIX const declaration
 414 static inline double
 415 score_spectrum(const spectrum &x,
 416                const double synth_mzs[spectrum::MAX_THEORETICAL_FRAGMENTS]) {
 417   // FIX this
 418   unsigned int y_peak_count=0;
 419   for (const double *p=synth_mzs; *p >= 0; p++, y_peak_count++);
 420
 421   double peak_best_delta[y_peak_count];
 422   int peak_best_class[y_peak_count];
 423   for (unsigned int i=0; i<y_peak_count; i++) {
 424     peak_best_delta[i] = CP.fragment_mass_tolerance;
 425     peak_best_class[i] = -1;
 426   }
 427
 428   // Iterate over all closest pairs of peaks, collecting info on the class of
 429   // the best (meaning closest mz) real peak matching each theoretical peak
 430   // (for real/theoretical peak pairs that are no farther apart than
 431   // fragment_mass_tolerance).
 432
 433   const unsigned int x_peak_count=x.peaks.size();
 434   assert(x_peak_count > 0 and y_peak_count > 0);
 435
 436   unsigned int x_index=0, y_index=0;
 437   while (true) {
 438     const double delta = synth_mzs[y_index] - x.peak_mz_cache[x_index];
 439     if (std::abs(delta) <= peak_best_delta[y_index]) {
 440       peak_best_class[y_index] = x.peak_intensity_class_cache[x_index];
 441       peak_best_delta[y_index] = std::abs(delta);
 442     }
 443     if (delta > 0) {
 444       if (++x_index >= x_peak_count)
 445         break;
 446     } else {
 447       if (++y_index >= y_peak_count)
 448         break;
 449     }
 450   }
 451
 452   assert(CP.intensity_class_count < INT_MAX);
 453   const unsigned int intensity_class_count = CP.intensity_class_count;
 454   unsigned int peak_hit_histogram[intensity_class_count];
 455   for (unsigned int i=0; i<intensity_class_count; i++)
 456     peak_hit_histogram[i] = 0;
 457   for (unsigned int i=0; i<y_peak_count; i++) {
 458     const int bc = peak_best_class[i];
 459     if (bc >= 0) {
 460       assert(bc < static_cast<int>(intensity_class_count));
 461       peak_hit_histogram[bc] += 1;
 462     }
 463   }
 464
 465   // How many theoretical peaks overlap the real peak range?
 466   int valid_theoretical_peaks = 0;
 467   for (unsigned int i=0; i<y_peak_count; i++)
 468     if (x.min_peak_mz <= synth_mzs[i] and synth_mzs[i] <= x.max_peak_mz)
 469       valid_theoretical_peaks++;
 470
 471   unsigned int total_peak_hits = 0;
 472   for (unsigned int i=0; i<intensity_class_count; i++)
 473     total_peak_hits += peak_hit_histogram[i];
 474
 475   // How many valid theoretical peaks were misses?
 476   const int peak_misses = valid_theoretical_peaks - total_peak_hits;
 477   assert(peak_misses >= 0);
 478
 479   double score = ln_combination(x.empty_peak_bins, peak_misses);
 480   for (unsigned int i=0; i<intensity_class_count; i++)
 481     score += ln_combination(x.intensity_class_counts[i],
 482                             peak_hit_histogram[i]);
 483   score -= ln_combination(x.total_peak_bins, valid_theoretical_peaks);
 484
 485   return score;
 486 }
 487
 488
 489 // Return the score of the specified spectrum against a group of synthetic
 490 // fragment spectra.
 491
 492 // For +1 and +2 precursors, this does what MyriMatch does.  For +3 (and
 493 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
 494 // may not make much sense.  (Wouldn't it be better to take the max of +1b/+2y
 495 // or +2b/+1y?)
 496 // FIX: implement MM smart +3 model, and/or come up with something better.
 497 // FIX const decl
 498 static inline double
 499 score_spectrum_over_charges(const double
 500                             synth_sp_mz[/* max_fragment_charge+1 */]
 501                             [spectrum::MAX_THEORETICAL_FRAGMENTS],
 502                             const spectrum &sp,
 503                             const int max_fragment_charge) {
 504   double best_score = score_spectrum(sp, synth_sp_mz[1]);
 505
 506   const int charge_limit = std::max<int>(1, sp.charge-1);
 507   for (int charge=2; charge<=charge_limit; charge++) {
 508     assert(charge <= max_fragment_charge);
 509     best_score += score_spectrum(sp, synth_sp_mz[charge]);
 510   }
 511   return best_score;
 512 }
 513
 514
 515 struct mass_trace_list {
 516   mass_trace_item item;
 517   const mass_trace_list *next;
 518
 519   mass_trace_list(const mass_trace_list *p=0) : next(p) { }
 520 };
 521
 522
 523 // fuzzy comparison might not be necessary, but it's safer
 524 static inline bool
 525 score_equal(double s1, double s2) { return std::abs(s1-s2) < 1e-6; }
 526
 527
 528 // Search for matches of this particular peptide modification variation
 529 // against the spectra.  Updates score_stats and returns the number of
 530 // candidate spectra found.
 531 static inline void
 532 evaluate_peptide(const search_context &context, match &m,
 533                  const mass_trace_list *mtlp, const double *mass_list,
 534                  const std::vector<std::vector<spectrum>::size_type>
 535                      &candidate_spectra,
 536                  score_stats &stats) {
 537   int max_candidate_charge = 0;
 538   typedef std::vector<std::vector<spectrum>::size_type>::const_iterator c_it;
 539   for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++)
 540     max_candidate_charge
 541       = std::max<int>(max_candidate_charge,
 542                       spectrum::searchable_spectra[*it].charge);
 543   assert(max_candidate_charge >= 1);
 544
 545   const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
 546   assert(max_fragment_charge <= spectrum::MAX_SUPPORTED_CHARGE);
 547
 548   const unsigned int peptide_size = m.peptide_sequence.size();
 549   // FIX: "2" assumes just two ion series (e.g., B and Y)
 550   assert(peptide_size <= 2*(spectrum::MAX_THEORETICAL_FRAGMENTS-1));
 551
 552   // This array is preallocated to avoid the overhead of allocation and
 553   // deallocation within the inner loop.
 554   // FIX
 555   // charge -> mz array
 556   static double synth_sp_mz[spectrum::MAX_SUPPORTED_CHARGE+1]
 557                            [spectrum::MAX_THEORETICAL_FRAGMENTS];
 558
 559   synthetic_spectra(synth_sp_mz, mass_list, peptide_size,
 560                     context.fragment_N_fixed_mass,
 561                     context.fragment_C_fixed_mass, max_fragment_charge);
 562
 563   for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++) {
 564     m.spectrum_index = *it;
 565     spectrum &sp = spectrum::searchable_spectra[m.spectrum_index];
 566
 567     sp.comparisons += 1;
 568     stats.candidate_spectrum_count += 1;
 569     if (CP.estimate_only)
 570       continue;
 571
 572     //std::cerr << "sp " << m.spectrum_index << std::endl;
 573     m.score = score_spectrum_over_charges(synth_sp_mz, sp, max_fragment_charge);
 574     //std::cerr << "score " << m.score << std::endl;
 575
 576     std::vector<match> &sp_best_matches = stats.best_matches[m.spectrum_index];
 577     // Is this score good enough to be in the best score list?  (Better scores
 578     // are more negative.)
 579     if (m.score > sp_best_matches.back().score)
 580       continue;
 581
 582     // We're saving this match, so remember the mass trace info, too.
 583     m.mass_trace.clear();
 584     for (const mass_trace_list *p=mtlp; p; p=p->next)
 585       m.mass_trace.push_back(p->item);
 586
 587     // If this is duplicate, just append to the existing match and continue
 588     for (std::vector<match>::reverse_iterator rit
 589            = sp_best_matches.rbegin(); rit != sp_best_matches.rend(); rit++)
 590       if (score_equal(rit->score, m.score)
 591           and rit->peptide_sequence == m.peptide_sequence
 592           and rit->mass_regime_index == m.mass_regime_index
 593           and rit->conjunct_index == m.conjunct_index
 594           and rit->mass_trace == m.mass_trace) {
 595         rit->peptide_begin.push_back(m.peptide_begin[0]);
 596         rit->sequence_name.push_back(m.sequence_name[0]);
 597         continue;
 598       }
 599
 600     // Otherwise, insert this match in the correct position
 601     assert(sp_best_matches.size() >= 1);
 602     std::vector<match>::reverse_iterator rit = sp_best_matches.rbegin();
 603     for (;rit+1 != sp_best_matches.rend() and (rit+1)->score > m.score; rit++)
 604       *rit = *(rit+1);
 605     *rit = m;
 606   }
 607 }
 608
 609
 610 // IDEA FIX: Use a cost parameter to define the iterative search front.
 611
 612
 613 // Choose one possible residue modification position.  Once they're all
 614 // chosen, then evaluate.
 615 static inline void
 616 choose_residue_mod(const search_context &context, match &m,
 617                    const mass_trace_list *mtlp, double *mass_list,
 618                    const std::vector<std::vector<spectrum>::size_type>
 619                        &candidate_spectra,
 620                    score_stats &stats,
 621                    std::vector<int> &db_remaining,
 622                    const unsigned int remaining_positions_to_choose,
 623                    const unsigned int next_position_to_consider) {
 624   assert(remaining_positions_to_choose
 625          <= m.peptide_sequence.size() - next_position_to_consider);
 626
 627   if (remaining_positions_to_choose == 0) {
 628     stats.evaluation_count += 1;
 629     evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra, stats);
 630   } else {
 631     mass_trace_list mtl(mtlp);
 632
 633     // consider all of the positions where we could next add a mod
 634     for (unsigned int i=next_position_to_consider;
 635          i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
 636       mtl.item.position = i;
 637       const double save_pos_mass=mass_list[i];
 638       const char pos_res = m.peptide_sequence[i];
 639
 640       // consider the possibilities for this position
 641       for (std::vector<int>::const_iterator
 642              it=context.delta_bag_lookup[pos_res].begin();
 643            it != context.delta_bag_lookup[pos_res].end(); it++) {
 644         const int db_index = *it;
 645         if (db_remaining[db_index] < 1)
 646           continue;
 647         db_remaining[db_index] -= 1;
 648         mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
 649         mtl.item.conjunct_item_index = db_index;
 650         choose_residue_mod(context, m, &mtl, mass_list, candidate_spectra,
 651                            stats, db_remaining,
 652                            remaining_positions_to_choose-1, i+1);
 653         db_remaining[db_index] += 1;
 654       }
 655       mass_list[i] = save_pos_mass;
 656     }
 657   }
 658 }
 659
 660
 661 static inline int
 662 search_peptide(const search_context &context, const double &p_mass, match &m,
 663                const std::string &run_sequence, const int &begin_index,
 664                const unsigned int &peptide_size, score_stats &stats,
 665                std::vector<int> &db_remaining) {
 666   // We'd like to consider only the spectra whose masses are within the
 667   // tolerance range of theoretical mass of the peptide (p_mass).
 668   // Unfortunately, the tolerance range of each spectrum depends on its
 669   // charge, since the tolerance is specified in m/z units.  Furthermore,
 670   // when deciding how to adjust begin/end to generate the next peptide,
 671   // we need to be conservative, as a high-charge spectrum may come into
 672   // range, even though only +1 spectra are currently in-range.  So, we
 673   // first screen using the maximum tolerance range, then check the actual
 674   // spectra below.
 675
 676   const double sp_mass_lb = p_mass - CP.parent_mass_tolerance_max;
 677   const double sp_mass_ub = p_mass + CP.parent_mass_tolerance_max;
 678
 679   const spmi_c_it candidate_spectra_info_begin
 680     = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
 681   if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
 682     // spectrum masses all too low to match peptide (peptide too long)
 683     return -1;
 684   }
 685
 686   const spmi_c_it candidate_spectra_info_end
 687     = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
 688   if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
 689     // spectrum masses all too high to match peptide (peptide too short)
 690     return 1;
 691   }
 692   if (candidate_spectra_info_begin == candidate_spectra_info_end) {
 693     // no spectrum with close-enough parent mass
 694     return 0;
 695   }
 696
 697   // Now generate the list of candidate spectra that are actually
 698   // in-range, considering the charge of each spectrum.
 699   std::vector<std::vector<spectrum>::size_type> candidate_spectra_0;
 700   for (spmi_c_it it=candidate_spectra_info_begin;
 701        it != candidate_spectra_info_end; it++) {
 702     const spectrum &csp = spectrum::searchable_spectra[it->second];
 703     if (std::abs(csp.mass - p_mass) <= (csp.charge
 704                                         * CP.parent_mass_tolerance_1))
 705       candidate_spectra_0.push_back(it->second);
 706   }
 707   if (candidate_spectra_0.empty())
 708     return 0;
 709
 710   m.predicted_parent_mass = p_mass;
 711   m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
 712
 713   double mass_list[peptide_size];
 714   for (unsigned int i=0; i<peptide_size; i++)
 715     mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
 716                     .fixed_residue_mass[m.peptide_sequence[i]]);
 717
 718   choose_residue_mod(context, m, NULL, mass_list, candidate_spectra_0,
 719                      stats, db_remaining, context.mod_count, 0);
 720   return 0;
 721 }
 722
 723
 724 // Search a sequence run for matches according to the context against the
 725 // spectra.  Updates score_stats and the number of candidate spectra found.
 726
 727 // FIX: examine carefully for signed/unsigned problems
 728
 729 static inline void
 730 search_run(const search_context &context, const sequence_run &sequence_run,
 731            score_stats &stats) {
 732   const unsigned int min_peptide_length
 733     = std::max<unsigned int>(CP.minimum_peptide_length, context.mod_count);
 734   const std::vector<double> &fixed_parent_mass
 735     = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
 736
 737   const std::string &run_sequence = sequence_run.sequence;
 738
 739   // This match will be passed inward and used to record information that we
 740   // need to remember about a match when we finally see one.  At that point, a
 741   // copy of this match will be saved.
 742   // FIX: can this be done better?
 743   match m;
 744   m.sequence_name.push_back(sequence_run.name);
 745   m.peptide_begin.push_back(0);
 746   int &peptide_begin = m.peptide_begin[0];
 747   m.mass_regime_index = context.mass_regime_index;
 748   m.conjunct_index = context.conjunct_index;
 749   m.pca_delta = context.pca_delta;
 750
 751   assert(context.delta_bag_delta.size() == context.delta_bag_count.size());
 752   // counts remaining as mod positions are chosen
 753   std::vector<int> db_remaining = context.delta_bag_count;
 754
 755   // the rightmost 'end' seen when all spectra masses were too high
 756   // (0 means none yet encountered.)
 757   unsigned int next_end = 0;
 758
 759   // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
 760   double p_mass = context.parent_fixed_mass;
 761   int p_begin=0, p_end=0;
 762
 763   const std::vector<int> *cleavage_points = &sequence_run.cleavage_points;
 764   std::vector<int>::size_type cleavage_points_size = cleavage_points->size();
 765   // "fake" cleavage point vector used for nonspecific cleavage (v[i]==i)
 766   static std::vector<int> nonspecific_cleavage_points;
 767   if (context.nonspecific_cleavage) {
 768     cleavage_points_size = run_sequence.size() + 1;
 769     for (unsigned int i=nonspecific_cleavage_points.size();
 770          i<cleavage_points_size; i++)
 771       nonspecific_cleavage_points.push_back(i);
 772     cleavage_points = &nonspecific_cleavage_points;
 773   }
 774   assert(cleavage_points_size >= 2);    // endpoints must be present
 775
 776   for (unsigned int begin=0; begin<cleavage_points_size-1; begin++) {
 777     const int begin_index = (*cleavage_points)[begin];
 778     peptide_begin = begin_index + sequence_run.sequence_offset;
 779
 780     // If pca_residues specified, require one of them to be the peptide
 781     // N-terminal residue
 782     switch (context.pca_residues.size()) {
 783     case 0:
 784       break;
 785     case 2:
 786       if (context.pca_residues[1] == run_sequence[begin_index])
 787         break;
 788     case 1:
 789       if (context.pca_residues[0] == run_sequence[begin_index])
 790         break;
 791       continue;
 792     default:
 793       assert(false);
 794     }
 795
 796     assert(begin_index >= p_begin and begin_index >= 0);
 797     for (int i=p_begin; i<begin_index; i++)
 798       p_mass -= fixed_parent_mass[run_sequence[i]];
 799     p_begin = begin_index;
 800
 801     unsigned int end = std::max<unsigned int>(begin + 1, next_end);
 802     for (; end<cleavage_points_size; end++) {
 803       if (not context.nonspecific_cleavage) {
 804         const int missed_cleavage_count = end - begin - 1;
 805         if (missed_cleavage_count > context.maximum_missed_cleavage_sites)
 806           break;
 807       }
 808
 809       const int end_index = (*cleavage_points)[end];
 810       assert(end_index > begin_index);
 811       const unsigned int peptide_size = end_index - begin_index;
 812       if (peptide_size < min_peptide_length)
 813         continue;
 814
 815       assert(end_index >= 0);
 816       if (end_index >= p_end)
 817         for (int i=p_end; i<end_index; i++)
 818           p_mass += fixed_parent_mass[run_sequence[i]];
 819       else
 820         for (int i=end_index; i<p_end; i++)
 821           p_mass -= fixed_parent_mass[run_sequence[i]];
 822       p_end = end_index;
 823
 824 //       std::cerr << "peptide: "
 825 //              << std::string(run_sequence, begin_index, peptide_size)
 826 //              << " p_mass: " << p_mass << std::endl;
 827
 828       int status = search_peptide(context, p_mass, m, run_sequence,
 829                                   begin_index, peptide_size, stats,
 830                                   db_remaining);
 831       if (status == -1)         // peptide was too long
 832         break;
 833       if (status == 1) {        // peptide was too short
 834         // so next start with a peptide at least this long
 835         if (end == cleavage_points_size - 1)
 836           return;
 837         next_end = end;
 838       }
 839     }
 840   }
 841 }
 842
 843
 844 // Search all sequence runs for matches according to the context against the
 845 // spectra.  Updates score_stats and the number of candidate spectra found.
 846 void
 847 spectrum::search_runs(const search_context &context, score_stats &stats) {
 848   const int no_runs = context.sequence_runs.size();
 849   for (int i=0; i<no_runs; i++) {
 850     search_run(context, context.sequence_runs[i], stats);
 851
 852     if (CP.show_progress)
 853       std::cerr << i+1 << " of " << no_runs << " sequences, "
 854                 << stats.candidate_spectrum_count << " candidates\r"
 855                 << std::flush;
 856   }
 857
 858   if (CP.show_progress)
 859     std::cerr << std::endl;
 860 }