cgreylag.cpp

   1 // C++ module for greylag
   2
   3 //     Copyright (C) 2006-2007, Stowers Institute for Medical Research
   4 //
   5 //     This program is free software; you can redistribute it and/or modify
   6 //     it under the terms of the GNU General Public License as published by
   7 //     the Free Software Foundation; either version 2 of the License, or
   8 //     (at your option) any later version.
   9 //
  10 //     This program is distributed in the hope that it will be useful,
  11 //     but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 //     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 //     GNU General Public License for more details.
  14 //
  15 //     You should have received a copy of the GNU General Public License along
  16 //     with this program; if not, write to the Free Software Foundation, Inc.,
  17 //     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18
  19
  20 #include "cgreylag.hpp"
  21
  22
  23 #include <cerrno>
  24 #include <cfloat>
  25 #include <climits>
  26 #include <cstdlib>
  27 #include <cmath>
  28 #include <iostream>
  29 #include <numeric>
  30 #include <set>
  31 #include <stdexcept>
  32
  33
  34
  35 #ifdef __GNU__
  36 #define NOTHROW __attribute__ ((nothrow))
  37 // Use faster, non-threadsafe versions, since we don't use threads.  This is
  38 // maybe 10% faster?
  39 #define fgetsX fgets_unlocked
  40 #define fputsX fputs_unlocked
  41 #define fprintfX __builtin_fprintf_unlocked
  42 #define ferrorX ferror_unlocked
  43 #define getcX getc_unlocked
  44 #else
  45 #define NOTHROW
  46 #define fgetsX std::fgets
  47 #define fputsX std::fputs
  48 #define fprintfX std::fprintf
  49 #define ferrorX std::ferror
  50 #define getcX std::getc
  51 #endif
  52
  53
  54 parameters parameters::the;
  55 const parameters &CP = parameters::the;
  56
  57
  58 char *
  59 peak::__repr__() const {
  60   static char temp[128];
  61   sprintf(temp, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
  62           mz, intensity, intensity_class);
  63   return &temp[0];
  64 }
  65
  66 const int FIRST_SPECTRUM_ID = 1;
  67 int spectrum::next_id = FIRST_SPECTRUM_ID;
  68 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
  69
  70 std::vector<spectrum> spectrum::searchable_spectra;
  71 // parent mass -> searchable_spectra index
  72 std::multimap<double,
  73               std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
  74
  75 typedef
  76   std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
  77   spmi_c_it;                    // ugh
  78
  79
  80 char *
  81 spectrum::__repr__() const {
  82   static char temp[1024];
  83   sprintf(temp,
  84           "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
  85           id, physical_id, name.c_str(), mass, charge, peaks.size());
  86   return &temp[0];
  87 }
  88
  89
  90 // FIX: does this actually help inlining?
  91 // Return ln of n_C_k.
  92 static inline double
  93 ln_combination_(unsigned int n, unsigned int k) NOTHROW {
  94   // Occasionally happens due to problems in scoring function. (FIX)
  95   if (n < k)
  96     return 0;
  97
  98   assert(0 <= k and k <= n and n < CP.ln_factorial.size());
  99   return CP.ln_factorial[n] - CP.ln_factorial[k] - CP.ln_factorial[n-k];
 100 }
 101 double
 102 ln_combination(unsigned int n, unsigned int k) {
 103   return ln_combination(n, k);
 104 }
 105
 106
 107
 108 // This is an error exit for read_spectra*.
 109 static inline void
 110 io_error(FILE *f, const char *message="") {
 111   if (ferrorX(f))
 112     message = "I/O error while reading (or writing) spectrum file";
 113   throw std::ios_base::failure(message);
 114 }
 115
 116
 117 // true iff s consists entirely of whitespace
 118 static bool
 119 at_eol(const char *s) {
 120   return s[strspn(s, " \t\r\n\f\v")] == 0;
 121 }
 122
 123
 124 // Check whether we've past the end offset, throwing exception on I/O error.
 125 static inline bool
 126 check_past_end(FILE *f, const long offset_end) {
 127   if (offset_end == -1)
 128     return false;
 129   long pos = std::ftell(f);
 130   if (pos == -1)
 131     io_error(f);
 132   return pos >= offset_end;
 133 }
 134
 135
 136 // Read spectra from file in ms2 format, tagging them with file_id.  Before
 137 // reading, seek to absolute position offset_begin.  If offset_end != -1, any
 138 // spectra that begin after position offset_end in the file are not read.
 139
 140 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
 141 // having the same physical id.  Note that depending on
 142 // offset_begin/offset_end, we may end up with the first charge and not the
 143 // second, or vice versa.
 144
 145 // The peak list is initially sorted by mz.  Throws an exception on invalid
 146 // input.  Error checking is the most stringent in this function.  Other
 147 // readers can be a little looser since all ms2 files eventually get read here
 148 // anyway.
 149
 150 // This is pretty hideous.  Is there a simpler way to read, reasonably
 151 // efficiently, catching any input error?
 152
 153 std::vector<spectrum>
 154 spectrum::read_spectra_from_ms2(FILE *f, const int file_id,
 155                                 const long offset_begin,
 156                                 const long offset_end) {
 157   std::vector<spectrum> spectra;
 158
 159   const int bufsiz = 1024;
 160   char buf[bufsiz];
 161   char *endp;
 162
 163   // first seek to offset_begin and synchronize at next "\n:"
 164   if (offset_begin > 0) {
 165     if (std::fseek(f, offset_begin-1, SEEK_SET) == -1)
 166       io_error(f);
 167     int c = getcX(f);
 168     while (c != '\n' and c != EOF)
 169       c = getcX(f);
 170     if (ferrorX(f))
 171       io_error(f);
 172     if (c == '\n') {
 173       do {
 174         c = getcX(f);
 175       } while (c != ':' and c != EOF);
 176       if (ferrorX(f))
 177         io_error(f);
 178       if (c == ':')
 179         std::ungetc(c, f);
 180     }
 181   }
 182   // at this point we expect to read the first ':' of a header, or EOF
 183
 184   bool past_end = check_past_end(f, offset_end);
 185   char *result = fgetsX(buf, bufsiz, f);
 186   while (true) {
 187     std::vector<std::string> names;
 188     std::vector<double> masses;
 189     std::vector<int> charges;
 190
 191     // read headers
 192     while (true) {
 193       if (ferrorX(f))
 194         io_error(f);
 195       if (not result or buf[0] != ':')
 196         break;
 197       if (not past_end)
 198         names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
 199       if (not fgetsX(buf, bufsiz, f))
 200         io_error(f, "bad ms2 format: mass/charge line expected");
 201       errno = 0;
 202       double mass = std::strtod(buf, &endp); // need double accuracy here
 203       if (not past_end)
 204         masses.push_back(mass);
 205       if (errno or endp == buf or mass <= 0)
 206         io_error(f, "bad ms2 format: bad mass");
 207       const char *endp0 = endp;
 208       int charge = std::strtol(endp0, &endp, 10);
 209       if (not past_end)
 210         charges.push_back(charge);
 211       if (errno or endp == endp0 or charge <= 0)
 212         io_error(f, "bad ms2 format: bad charge");
 213       if (not at_eol(endp))
 214         io_error(f, "bad ms2 format: junk at end of mass/charge line");
 215       past_end = past_end or check_past_end(f, offset_end);
 216       result = fgetsX(buf, bufsiz, f);
 217     }
 218     if (names.empty() and past_end)
 219       break;
 220     if (not result) {
 221       if (not names.empty())
 222         io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
 223       break;
 224     }
 225     // read peaks
 226     std::vector<peak> peaks;
 227     while (true) {
 228       peak p;
 229       errno = 0;
 230       p.mz = strtod(buf, &endp);
 231       if (errno or endp == buf or p.mz <= 0)
 232         io_error(f, "bad ms2 format: bad peak mz");
 233       const char *endp0 = endp;
 234       p.intensity = strtod(endp0, &endp);
 235       if (errno or endp == endp0 or p.intensity <= 0)
 236         io_error(f, "bad ms2 format: bad peak intensity");
 237       if (not at_eol(endp))
 238         io_error(f, "bad ms2 format: junk at end of peak line");
 239       peaks.push_back(p);
 240
 241       past_end = past_end or check_past_end(f, offset_end);
 242       result = fgetsX(buf, bufsiz, f);
 243       if (ferrorX(f))
 244         io_error(f);
 245       if (not result or buf[0] == ':')
 246         break;
 247     }
 248
 249     // add spectra to vector
 250     if (names.empty() and not peaks.empty())
 251       io_error(f, "bad ms2 format: missing header lines?");
 252
 253     std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 254
 255     for (std::vector<double>::size_type i=0; i<names.size(); i++) {
 256       spectrum sp(masses[i], charges[i]);
 257       sp.peaks = peaks;
 258       sp.name = names[i];
 259       sp.file_id = file_id;
 260       sp.physical_id = next_physical_id;
 261       spectra.push_back(sp);
 262     }
 263     if (file_id != -1)
 264       spectrum::next_physical_id += 1;
 265   }
 266   return spectra;
 267 }
 268
 269
 270 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
 271 // to remove those too large to be fragment products.  (Peaks are assumed to
 272 // be ordered by increasing mz.)
 273 void
 274 spectrum::filter_peaks(double TIC_cutoff_proportion,
 275                        double parent_mass_tolerance_max) {
 276   if (not (0 <= TIC_cutoff_proportion and TIC_cutoff_proportion <= 1)
 277       or parent_mass_tolerance_max < 0)
 278     throw std::invalid_argument("invalid argument value");
 279
 280   // remove peaks too large to be fragment products
 281   // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
 282   const double peak_mz_limit = mass + parent_mass_tolerance_max;
 283
 284   std::vector<peak>::iterator pi;
 285   for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= peak_mz_limit; pi++);
 286   peaks.erase(pi, peaks.end());
 287
 288   // FIX: note mzLowerBound..mzUpperBound here
 289
 290   // now filter by TIC
 291   std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
 292
 293   total_ion_current = 0;
 294   for (pi=peaks.begin(); pi != peaks.end(); pi++)
 295     total_ion_current += pi->intensity;
 296
 297   const double i_limit = total_ion_current * TIC_cutoff_proportion;
 298   double accumulated_intensity = 0;
 299   for (pi=peaks.begin();
 300        pi != peaks.end() and accumulated_intensity <= i_limit; pi++)
 301     accumulated_intensity += pi->intensity;
 302
 303   peaks.erase(pi, peaks.end());
 304
 305   // restore order
 306   std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 307 }
 308
 309
 310 // Classify peaks and update class_counts and peak bin counts.
 311 // FIX: hoist some of this up into Python?  use (1,2,4) as parameter
 312 void
 313 spectrum::classify(int intensity_class_count, double intensity_class_ratio,
 314                    double fragment_mass_tolerance) {
 315   if (intensity_class_count < 1 or intensity_class_ratio <= 1.0)
 316     throw std::invalid_argument("invalid argument value");
 317
 318   intensity_class_counts.clear();
 319   intensity_class_counts.resize(intensity_class_count);
 320
 321   // e.g., 7 = 1 + 2 + 4
 322   // FIX: does this fail for non-integer ratios?
 323   int min_count = int((pow(intensity_class_ratio, intensity_class_count) - 1)
 324                       / (intensity_class_ratio - 1));
 325
 326   std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
 327   std::vector<peak>::iterator pi=peaks.begin();
 328   for (int i_class=0; i_class<intensity_class_count; i_class++) {
 329     int peaks_this_class = int((pow(intensity_class_ratio, i_class) * peaks.size()
 330                                 / min_count) + 0.5);
 331     for (int cp=0; cp < peaks_this_class and pi != peaks.end(); cp++, pi++) {
 332       pi->intensity_class = i_class;
 333       intensity_class_counts[i_class]++;
 334     }
 335   }
 336   std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 337
 338   if (not peaks.empty()) {
 339     min_peak_mz = peaks.front().mz - CP.fragment_mass_tolerance;
 340     max_peak_mz = peaks.back().mz + CP.fragment_mass_tolerance;
 341     total_peak_bins = (int((max_peak_mz - min_peak_mz)
 342                            / (2 * fragment_mass_tolerance) + 0.5));
 343     // This is the MyriMatch estimate.  Wouldn't it be better to simply count
 344     // how many are empty?
 345     empty_peak_bins = std::max<int>(total_peak_bins - peaks.size(), 0);
 346   }
 347 }
 348
 349
 350 // Update the *_cache fields from the peaks field.
 351 void
 352 spectrum::update_peak_cache() {
 353   clear_peak_cache();
 354
 355   const unsigned int peak_count = peaks.size();
 356   peak_mz_cache = new double[peak_count+1];
 357   peak_intensity_class_cache = new int[peak_count+1];
 358
 359   for (unsigned int i=0; i<peak_count; i++) {
 360     peak_mz_cache[i] = peaks[i].mz;
 361     peak_intensity_class_cache[i] = peaks[i].intensity_class;
 362   }
 363
 364   // negative value is terminator
 365   peak_mz_cache[peak_count] = -1;
 366   peak_intensity_class_cache[peak_count] = -1;
 367 }
 368
 369
 370 // Store the list of spectra that search_peptide will search against, and also
 371 // build spectrum_mass_index.
 372 void
 373 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
 374   searchable_spectra = spectra;
 375   for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++) {
 376     spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
 377     searchable_spectra[i].update_peak_cache();
 378   }
 379 }
 380
 381
 382 // Calculate peaks for a synthesized mass (not mz) ladder.
 383 static inline void
 384 get_synthetic_Y_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
 385                             const double *mass_list,
 386                             const double fragment_C_fixed_mass) NOTHROW {
 387   double m = fragment_C_fixed_mass;
 388
 389   for (unsigned int i=ladder_size; i>0; i--) {
 390     m += mass_list[i];
 391     mass_ladder[ladder_size-i] = m;
 392   }
 393 }
 394
 395
 396 // Calculate peaks for a synthesized mass (not mz) ladder.
 397 static inline void
 398 get_synthetic_B_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
 399                             const double *mass_list,
 400                             const double fragment_N_fixed_mass) NOTHROW {
 401   double m = fragment_N_fixed_mass;
 402
 403   for (unsigned int i=0; i<ladder_size; i++) {
 404     m += mass_list[i];
 405     mass_ladder[i] = m;
 406   }
 407 }
 408
 409
 410 // Generate synthetic spectra for a set of charges.  Only the charge and peaks
 411 // of these spectra are initialized.
 412 static inline void
 413 synthetic_spectra(double synth_sp_mz[/* max_fragment_charge+1 */]
 414                                     [spectrum::MAX_THEORETICAL_FRAGMENTS],
 415                   const double *mass_list, const unsigned int mass_list_size,
 416                   const double fragment_N_fixed_mass,
 417                   const double fragment_C_fixed_mass,
 418                   const int max_fragment_charge) NOTHROW {
 419   assert(mass_list_size >= 1);
 420   const unsigned int ladder_size = mass_list_size - 1;
 421   double Y_mass_ladder[ladder_size];
 422   double B_mass_ladder[ladder_size];
 423
 424   for (int charge=1; charge<=max_fragment_charge; charge++) {
 425     get_synthetic_Y_mass_ladder(Y_mass_ladder, ladder_size, mass_list,
 426                                 fragment_C_fixed_mass);
 427     get_synthetic_B_mass_ladder(B_mass_ladder, ladder_size, mass_list,
 428                                 fragment_N_fixed_mass);
 429
 430     unsigned int y_i=0, b_i=0;
 431     double *sp_mz_p = synth_sp_mz[charge];
 432
 433     // merge the two (already sorted) mass lists, converting them to mz values
 434     // in the process
 435     // FIX: try algorithm::merge?
 436     while (y_i < ladder_size and b_i < ladder_size)
 437       if (Y_mass_ladder[y_i] < B_mass_ladder[b_i])
 438         *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
 439       else
 440         *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
 441     while (y_i < ladder_size)
 442       *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
 443     while (b_i < ladder_size)
 444       *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
 445     *sp_mz_p = -1;              // invalid mz as terminator
 446   }
 447 }
 448
 449
 450 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
 451 // theoretical spectrum (y) generated from a peptide candidate.
 452 //
 453 // FIX: This code seems complicated.  Is the MyriMatch idea of using maps (or
 454 // better yet, sets) for peak lists simpler?  As fast or faster?
 455 //
 456 // This is the innermost loop, so it's worthwhile to optimize this some.
 457 // FIX const declaration
 458 static inline double
 459 score_spectrum(const spectrum &x,
 460                const double synth_mzs[spectrum::MAX_THEORETICAL_FRAGMENTS])
 461   NOTHROW {
 462   // FIX this
 463   unsigned int y_peak_count=0;
 464   for (const double *p=synth_mzs; *p >= 0; p++, y_peak_count++);
 465
 466   double peak_best_delta[y_peak_count];
 467   int peak_best_class[y_peak_count];
 468   for (unsigned int i=0; i<y_peak_count; i++) {
 469     peak_best_delta[i] = CP.fragment_mass_tolerance;
 470     peak_best_class[i] = -1;
 471   }
 472
 473   // Iterate over all closest pairs of peaks, collecting info on the class of
 474   // the best (meaning closest mz) real peak matching each theoretical peak
 475   // (for real/theoretical peak pairs that are no farther apart than
 476   // fragment_mass_tolerance).
 477
 478   const unsigned int x_peak_count=x.peaks.size();
 479   assert(x_peak_count > 0 and y_peak_count > 0);
 480
 481   unsigned int x_index=0, y_index=0;
 482   while (true) {
 483     const double delta = synth_mzs[y_index] - x.peak_mz_cache[x_index];
 484     if (std::abs(delta) <= peak_best_delta[y_index]) {
 485       peak_best_class[y_index] = x.peak_intensity_class_cache[x_index];
 486       peak_best_delta[y_index] = std::abs(delta);
 487     }
 488     if (delta > 0) {
 489       if (++x_index >= x_peak_count)
 490         break;
 491     } else {
 492       if (++y_index >= y_peak_count)
 493         break;
 494     }
 495   }
 496
 497   assert(CP.intensity_class_count < INT_MAX);
 498   const unsigned int intensity_class_count = CP.intensity_class_count;
 499   unsigned int peak_hit_histogram[intensity_class_count];
 500   for (unsigned int i=0; i<intensity_class_count; i++)
 501     peak_hit_histogram[i] = 0;
 502   for (unsigned int i=0; i<y_peak_count; i++) {
 503     const int bc = peak_best_class[i];
 504     if (bc >= 0) {
 505       assert(bc < static_cast<int>(intensity_class_count));
 506       peak_hit_histogram[bc] += 1;
 507     }
 508   }
 509
 510   // How many theoretical peaks overlap the real peak range?
 511   int valid_theoretical_peaks = 0;
 512   for (unsigned int i=0; i<y_peak_count; i++)
 513     if (x.min_peak_mz <= synth_mzs[i] and synth_mzs[i] <= x.max_peak_mz)
 514       valid_theoretical_peaks++;
 515
 516   unsigned int total_peak_hits = 0;
 517   for (unsigned int i=0; i<intensity_class_count; i++)
 518     total_peak_hits += peak_hit_histogram[i];
 519
 520   // How many valid theoretical peaks were misses?
 521   const int peak_misses = valid_theoretical_peaks - total_peak_hits;
 522   assert(peak_misses >= 0);
 523
 524   double score = ln_combination_(x.empty_peak_bins, peak_misses);
 525   for (unsigned int i=0; i<intensity_class_count; i++)
 526     score += ln_combination_(x.intensity_class_counts[i],
 527                              peak_hit_histogram[i]);
 528   score -= ln_combination_(x.total_peak_bins, valid_theoretical_peaks);
 529
 530   return score;
 531 }
 532
 533
 534 // Return the score of the specified spectrum against a group of synthetic
 535 // fragment spectra.
 536
 537 // For +1 and +2 precursors, this does what MyriMatch does.  For +3 (and
 538 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
 539 // may not make much sense.  (Wouldn't it be better to take the max of +1b/+2y
 540 // or +2b/+1y?)
 541 // FIX: implement MM smart +3 model, and/or come up with something better.
 542 // FIX const decl
 543 static inline double
 544 score_spectrum_over_charges(const double
 545                             synth_sp_mz[/* max_fragment_charge+1 */]
 546                             [spectrum::MAX_THEORETICAL_FRAGMENTS],
 547                             const spectrum &sp,
 548                             const int max_fragment_charge) NOTHROW {
 549   double best_score = score_spectrum(sp, synth_sp_mz[1]);
 550
 551   const int charge_limit = std::max<int>(1, sp.charge-1);
 552   for (int charge=2; charge<=charge_limit; charge++) {
 553     assert(charge <= max_fragment_charge);
 554     best_score += score_spectrum(sp, synth_sp_mz[charge]);
 555   }
 556   return best_score;
 557 }
 558
 559
 560 struct mass_trace_list {
 561   mass_trace_item item;
 562   const mass_trace_list *next;
 563
 564   mass_trace_list(const mass_trace_list *p=0) : next(p) { }
 565 };
 566
 567
 568 // fuzzy comparison might not be necessary, but it's safer
 569 static inline bool
 570 score_equal(double s1, double s2) { return std::abs(s1-s2) < 1e-6; }
 571
 572
 573 // Search for matches of this particular peptide modification variation
 574 // against the spectra.  Updates score_stats and returns the number of
 575 // candidate spectra found.
 576 static inline void
 577 evaluate_peptide(const search_context &context, match &m,
 578                  const mass_trace_list *mtlp, const double *mass_list,
 579                  const std::vector<std::vector<spectrum>::size_type>
 580                      &candidate_spectra,
 581                  score_stats &stats) NOTHROW {
 582   int max_candidate_charge = 0;
 583   typedef std::vector<std::vector<spectrum>::size_type>::const_iterator c_it;
 584   for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++)
 585     max_candidate_charge
 586       = std::max<int>(max_candidate_charge,
 587                       spectrum::searchable_spectra[*it].charge);
 588   assert(max_candidate_charge >= 1);
 589
 590   const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
 591   assert(max_fragment_charge <= spectrum::MAX_SUPPORTED_CHARGE);
 592
 593   const unsigned int peptide_size = m.peptide_sequence.size();
 594   // FIX: "2" assumes just two ion series (e.g., B and Y)
 595   assert(peptide_size <= 2*(spectrum::MAX_THEORETICAL_FRAGMENTS-1));
 596
 597   // This array is preallocated to avoid the overhead of allocation and
 598   // deallocation within the inner loop.
 599   // FIX
 600   // charge -> mz array
 601   static double synth_sp_mz[spectrum::MAX_SUPPORTED_CHARGE+1]
 602                            [spectrum::MAX_THEORETICAL_FRAGMENTS];
 603
 604   synthetic_spectra(synth_sp_mz, mass_list, peptide_size,
 605                     context.fragment_N_fixed_mass,
 606                     context.fragment_C_fixed_mass, max_fragment_charge);
 607
 608   for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++) {
 609     m.spectrum_index = *it;
 610     spectrum &sp = spectrum::searchable_spectra[m.spectrum_index];
 611
 612     sp.comparisons += 1;
 613     stats.candidate_spectrum_count += 1;
 614     if (CP.estimate_only)
 615       continue;
 616
 617     //std::cerr << "sp " << m.spectrum_index << std::endl;
 618     m.score = score_spectrum_over_charges(synth_sp_mz, sp, max_fragment_charge);
 619     //std::cerr << "score " << m.score << std::endl;
 620
 621     std::vector<match> &sp_best_matches = stats.best_matches[m.spectrum_index];
 622     // Is this score good enough to be in the best score list?  (Better scores
 623     // are more negative.)
 624     if (m.score > sp_best_matches.back().score)
 625       continue;
 626
 627     // We're saving this match, so remember the mass trace info, too.
 628     m.mass_trace.clear();
 629     for (const mass_trace_list *p=mtlp; p; p=p->next)
 630       m.mass_trace.push_back(p->item);
 631
 632     // If this is duplicate, just append to the existing match and return
 633     for (std::vector<match>::reverse_iterator rit
 634            = sp_best_matches.rbegin(); rit != sp_best_matches.rend(); rit++)
 635       if (score_equal(rit->score, m.score)
 636           and rit->peptide_sequence == m.peptide_sequence
 637           and rit->mass_regime_index == m.mass_regime_index
 638           and rit->conjunct_index == m.conjunct_index
 639           and rit->mass_trace == m.mass_trace) {
 640         rit->peptide_begin.push_back(m.peptide_begin[0]);
 641         rit->sequence_name.push_back(m.sequence_name[0]);
 642         return;
 643       }
 644
 645     // Otherwise, insert this match in the correct position
 646     assert(sp_best_matches.size() >= 1);
 647     std::vector<match>::reverse_iterator rit = sp_best_matches.rbegin();
 648     for (;rit+1 != sp_best_matches.rend() and (rit+1)->score > m.score; rit++)
 649       *rit = *(rit+1);
 650     *rit = m;
 651   }
 652 }
 653
 654
 655 // IDEA FIX: Use a cost parameter to define the iterative search front.
 656
 657
 658 // Choose one possible residue modification position.  Once they're all
 659 // chosen, then evaluate.
 660 static inline void
 661 choose_residue_mod(const search_context &context, match &m,
 662                    const mass_trace_list *mtlp, double *mass_list,
 663                    const std::vector<std::vector<spectrum>::size_type>
 664                        &candidate_spectra,
 665                    score_stats &stats,
 666                    std::vector<int> &db_remaining,
 667                    const unsigned int remaining_positions_to_choose,
 668                    const unsigned int next_position_to_consider) NOTHROW {
 669   assert(remaining_positions_to_choose
 670          <= m.peptide_sequence.size() - next_position_to_consider);
 671
 672   if (remaining_positions_to_choose == 0) {
 673     stats.combinations_searched += 1;
 674     evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra, stats);
 675   } else {
 676     mass_trace_list mtl(mtlp);
 677
 678     // consider all of the positions where we could next add a mod
 679     for (unsigned int i=next_position_to_consider;
 680          i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
 681       mtl.item.position = i;
 682       const double save_pos_mass=mass_list[i];
 683       const char pos_res = m.peptide_sequence[i];
 684
 685       // consider the possibilities for this position
 686       for (std::vector<int>::const_iterator
 687              it=context.delta_bag_lookup[pos_res].begin();
 688            it != context.delta_bag_lookup[pos_res].end(); it++) {
 689         const int db_index = *it;
 690         if (db_remaining[db_index] < 1)
 691           continue;
 692         db_remaining[db_index] -= 1;
 693         mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
 694         mtl.item.delta = context.delta_bag_delta[db_index];
 695         choose_residue_mod(context, m, &mtl, mass_list, candidate_spectra,
 696                            stats, db_remaining,
 697                            remaining_positions_to_choose-1, i+1);
 698         db_remaining[db_index] += 1;
 699       }
 700       mass_list[i] = save_pos_mass;
 701     }
 702   }
 703 }
 704
 705
 706 static inline int
 707 search_peptide(const search_context &context, const double &p_mass, match &m,
 708                const std::string &run_sequence, const int &begin_index,
 709                const unsigned int &peptide_size, score_stats &stats,
 710                std::vector<int> &db_remaining) {
 711   // We'd like to consider only the spectra whose masses are within the
 712   // tolerance range of theoretical mass of the peptide (p_mass).
 713   // Unfortunately, the tolerance range of each spectrum depends on its
 714   // charge, since the tolerance is specified in m/z units.  Furthermore,
 715   // when deciding how to adjust begin/end to generate the next peptide,
 716   // we need to be conservative, as a high-charge spectrum may come into
 717   // range, even though only +1 spectra are currently in-range.  So, we
 718   // first screen using the maximum tolerance range, then check the actual
 719   // spectra below.
 720
 721   const double sp_mass_lb = p_mass - CP.parent_mass_tolerance_max;
 722   const double sp_mass_ub = p_mass + CP.parent_mass_tolerance_max;
 723
 724   const spmi_c_it candidate_spectra_info_begin
 725     = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
 726   if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
 727     // spectrum masses all too low to match peptide (peptide too long)
 728     return -1;
 729   }
 730
 731   const spmi_c_it candidate_spectra_info_end
 732     = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
 733   if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
 734     // spectrum masses all too high to match peptide (peptide too short)
 735     return 1;
 736   }
 737   if (candidate_spectra_info_begin == candidate_spectra_info_end) {
 738     // no spectrum with close-enough parent mass
 739     return 0;
 740   }
 741
 742   // Now generate the list of candidate spectra that are actually
 743   // in-range, considering the charge of each spectrum.
 744   std::vector<std::vector<spectrum>::size_type> candidate_spectra_0;
 745   for (spmi_c_it it=candidate_spectra_info_begin;
 746        it != candidate_spectra_info_end; it++) {
 747     const spectrum &csp = spectrum::searchable_spectra[it->second];
 748     if (std::abs(csp.mass - p_mass) <= (csp.charge
 749                                         * CP.parent_mass_tolerance_1))
 750       candidate_spectra_0.push_back(it->second);
 751   }
 752   if (candidate_spectra_0.empty())
 753     return 0;
 754
 755   m.predicted_parent_mass = p_mass;
 756   m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
 757
 758   double mass_list[peptide_size];
 759   for (unsigned int i=0; i<peptide_size; i++)
 760     mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
 761                     .fixed_residue_mass[m.peptide_sequence[i]]);
 762
 763   choose_residue_mod(context, m, NULL, mass_list, candidate_spectra_0,
 764                      stats, db_remaining, context.mod_count, 0);
 765   return 0;
 766 }
 767
 768
 769 // Search a sequence run for matches according to the context against the
 770 // spectra.  Updates score_stats and the number of candidate spectra found.
 771
 772 // FIX: examine carefully for signed/unsigned problems
 773
 774 static inline void
 775 search_run(const search_context &context, const sequence_run &sequence_run,
 776            score_stats &stats) {
 777   const unsigned int min_peptide_length
 778     = std::max<unsigned int>(CP.minimum_peptide_length, context.mod_count);
 779   const std::vector<double> &fixed_parent_mass
 780     = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
 781
 782   const std::string &run_sequence = sequence_run.sequence;
 783
 784   // This match will be passed inward and used to record information that we
 785   // need to remember about a match when we finally see one.  At that point, a
 786   // copy of this match will be saved.
 787   // FIX: can this be done better?
 788   match m;
 789   m.sequence_name.push_back(sequence_run.name);
 790   m.peptide_begin.push_back(0);
 791   int &peptide_begin = m.peptide_begin[0];
 792   m.mass_regime_index = context.mass_regime_index;
 793   m.conjunct_index = context.conjunct_index;
 794   m.pca_delta = context.pca_delta;
 795
 796   assert(context.delta_bag_delta.size() == context.delta_bag_count.size());
 797   // counts remaining as mod positions are chosen
 798   std::vector<int> db_remaining = context.delta_bag_count;
 799
 800   // the rightmost 'end' seen when all spectra masses were too high
 801   // (0 means none yet encountered.)
 802   unsigned int next_end = 0;
 803
 804   // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
 805   double p_mass = context.parent_fixed_mass;
 806   int p_begin=0, p_end=0;
 807
 808   const std::vector<int> *cleavage_points = &sequence_run.cleavage_points;
 809   std::vector<int>::size_type cleavage_points_size = cleavage_points->size();
 810   // "fake" cleavage point vector used for nonspecific cleavage (v[i]==i)
 811   static std::vector<int> nonspecific_cleavage_points;
 812   if (context.nonspecific_cleavage) {
 813     cleavage_points_size = run_sequence.size() + 1;
 814     for (unsigned int i=nonspecific_cleavage_points.size();
 815          i<cleavage_points_size; i++)
 816       nonspecific_cleavage_points.push_back(i);
 817     cleavage_points = &nonspecific_cleavage_points;
 818   }
 819   assert(cleavage_points_size >= 2);    // endpoints must be present
 820
 821   for (unsigned int begin=0; begin<cleavage_points_size-1; begin++) {
 822     const int begin_index = (*cleavage_points)[begin];
 823     peptide_begin = begin_index + sequence_run.sequence_offset;
 824
 825     // If pca_residues specified, require one of them to be the peptide
 826     // N-terminal residue
 827     switch (context.pca_residues.size()) {
 828     case 0:
 829       break;
 830     case 2:
 831       if (context.pca_residues[1] == run_sequence[begin_index])
 832         break;
 833     case 1:
 834       if (context.pca_residues[0] == run_sequence[begin_index])
 835         break;
 836       continue;
 837     default:
 838       assert(false);
 839     }
 840
 841     assert(begin_index >= p_begin and begin_index >= 0);
 842     for (int i=p_begin; i<begin_index; i++)
 843       p_mass -= fixed_parent_mass[run_sequence[i]];
 844     p_begin = begin_index;
 845
 846     unsigned int end = std::max<unsigned int>(begin + 1, next_end);
 847     for (; end<cleavage_points_size; end++) {
 848       if (not context.nonspecific_cleavage) {
 849         const int missed_cleavage_count = end - begin - 1;
 850         if (missed_cleavage_count > context.maximum_missed_cleavage_sites)
 851           break;
 852       }
 853
 854       const int end_index = (*cleavage_points)[end];
 855       assert(end_index > begin_index);
 856       const unsigned int peptide_size = end_index - begin_index;
 857       if (peptide_size < min_peptide_length)
 858         continue;
 859
 860       assert(end_index >= 0);
 861       if (end_index >= p_end)
 862         for (int i=p_end; i<end_index; i++)
 863           p_mass += fixed_parent_mass[run_sequence[i]];
 864       else
 865         for (int i=end_index; i<p_end; i++)
 866           p_mass -= fixed_parent_mass[run_sequence[i]];
 867       p_end = end_index;
 868
 869
 870       //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
 871
 872       int status = search_peptide(context, p_mass, m, run_sequence,
 873                                   begin_index, peptide_size, stats,
 874                                   db_remaining);
 875       if (status == -1)         // peptide was too long
 876         break;
 877       if (status == 1) {        // peptide was too short
 878         // so next start with a peptide at least this long
 879         if (end == cleavage_points_size - 1)
 880           return;
 881         next_end = end;
 882       }
 883     }
 884   }
 885 }
 886
 887
 888 // Search all sequence runs for matches according to the context against the
 889 // spectra.  Updates score_stats and the number of candidate spectra found.
 890 void
 891 spectrum::search_runs(const search_context &context, score_stats &stats) {
 892   const int no_runs = context.sequence_runs.size();
 893   for (int i=0; i<no_runs; i++) {
 894     search_run(context, context.sequence_runs[i], stats);
 895
 896     if (CP.show_progress)
 897       std::cerr << i+1 << " of " << no_runs << " sequences, "
 898                 << stats.candidate_spectrum_count << " candidates\r"
 899                 << std::flush;
 900   }
 901
 902   if (CP.show_progress)
 903     std::cerr << std::endl;
 904 }