cgreylag.cpp

   1 // C++ module for greylag
   2
   3 //     Copyright (C) 2006-2007, Stowers Institute for Medical Research
   4 //
   5 //     This program is free software; you can redistribute it and/or modify
   6 //     it under the terms of the GNU General Public License as published by
   7 //     the Free Software Foundation; either version 2 of the License, or
   8 //     (at your option) any later version.
   9 //
  10 //     This program is distributed in the hope that it will be useful,
  11 //     but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 //     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 //     GNU General Public License for more details.
  14 //
  15 //     You should have received a copy of the GNU General Public License along
  16 //     with this program; if not, write to the Free Software Foundation, Inc.,
  17 //     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18
  19
  20 #include "cgreylag.hpp"
  21
  22
  23 #include <cerrno>
  24 #include <cfloat>
  25 #include <climits>
  26 #include <cstdlib>
  27 #include <cmath>
  28 #include <iostream>
  29 #include <numeric>
  30 #include <set>
  31 #include <stdexcept>
  32
  33
  34
  35 #ifdef __GNU__
  36 #define NOTHROW __attribute__ ((nothrow))
  37 // Use faster, non-threadsafe versions, since we don't use threads.  This is
  38 // maybe 10% faster?
  39 #define fgetsX fgets_unlocked
  40 #define fputsX fputs_unlocked
  41 #define fprintfX __builtin_fprintf_unlocked
  42 #define ferrorX ferror_unlocked
  43 #define getcX getc_unlocked
  44 #else
  45 #define NOTHROW
  46 #define fgetsX std::fgets
  47 #define fputsX std::fputs
  48 #define fprintfX std::fprintf
  49 #define ferrorX std::ferror
  50 #define getcX std::getc
  51 #endif
  52
  53
  54 parameters parameters::the;
  55 const parameters &CP = parameters::the;
  56
  57
  58 char *
  59 peak::__repr__() const {
  60   static char temp[128];
  61   sprintf(temp, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
  62           mz, intensity, intensity_class);
  63   return &temp[0];
  64 }
  65
  66 const int FIRST_SPECTRUM_ID = 1;
  67 int spectrum::next_id = FIRST_SPECTRUM_ID;
  68 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
  69
  70 std::vector<spectrum> spectrum::searchable_spectra;
  71 // parent mass -> searchable_spectra index
  72 std::multimap<double,
  73               std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
  74
  75 typedef
  76   std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
  77   spmi_c_it;                    // ugh
  78
  79
  80 char *
  81 spectrum::__repr__() const {
  82   static char temp[1024];
  83   sprintf(temp,
  84           "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
  85           id, physical_id, name.c_str(), mass, charge, peaks.size());
  86   return &temp[0];
  87 }
  88
  89
  90 // FIX: does this actually help inlining?
  91 // Return ln of n_C_k.
  92 static inline double
  93 ln_combination_(unsigned int n, unsigned int k) NOTHROW {
  94   // Occasionally happens due to problems in scoring function. (FIX)
  95   if (n < k)
  96     return 0;
  97
  98   assert(0 <= k and k <= n and n < CP.ln_factorial.size());
  99   return CP.ln_factorial[n] - CP.ln_factorial[k] - CP.ln_factorial[n-k];
 100 }
 101 double
 102 ln_combination(unsigned int n, unsigned int k) {
 103   return ln_combination(n, k);
 104 }
 105
 106
 107
 108 // This is an error exit for read_spectra*.
 109 static inline void
 110 io_error(FILE *f, const char *message="") {
 111   if (ferrorX(f))
 112     message = "I/O error while reading (or writing) spectrum file";
 113   throw std::ios_base::failure(message);
 114 }
 115
 116
 117 // true iff s consists entirely of whitespace
 118 static bool
 119 at_eol(const char *s) {
 120   return s[strspn(s, " \t\r\n\f\v")] == 0;
 121 }
 122
 123
 124 // Check whether we've past the end offset, throwing exception on I/O error.
 125 static inline bool
 126 check_past_end(FILE *f, const long offset_end) {
 127   if (offset_end == -1)
 128     return false;
 129   long pos = std::ftell(f);
 130   if (pos == -1)
 131     io_error(f);
 132   return pos >= offset_end;
 133 }
 134
 135
 136 // Read spectra from file in ms2 format, tagging them with file_id.  Before
 137 // reading, seek to absolute position offset_begin.  If offset_end != -1, any
 138 // spectra that begin after position offset_end in the file are not read.
 139
 140 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
 141 // having the same physical id.  Note that depending on
 142 // offset_begin/offset_end, we may end up with the first charge and not the
 143 // second, or vice versa.
 144
 145 // The peak list is initially sorted by mz.  Throws an exception on invalid
 146 // input.  Error checking is the most stringent in this function.  Other
 147 // readers can be a little looser since all ms2 files eventually get read here
 148 // anyway.
 149
 150 // This is pretty hideous.  Is there a simpler way to read, reasonably
 151 // efficiently, catching any input error?
 152
 153 std::vector<spectrum>
 154 spectrum::read_spectra_from_ms2(FILE *f, const int file_id,
 155                                 const long offset_begin,
 156                                 const long offset_end) {
 157   std::vector<spectrum> spectra;
 158
 159   const int bufsiz = 1024;
 160   char buf[bufsiz];
 161   char *endp;
 162
 163   // first seek to offset_begin and synchronize at next "\n:"
 164   if (offset_begin > 0) {
 165     if (std::fseek(f, offset_begin-1, SEEK_SET) == -1)
 166       io_error(f);
 167     int c = getcX(f);
 168     while (c != '\n' and c != EOF)
 169       c = getcX(f);
 170     if (ferrorX(f))
 171       io_error(f);
 172     if (c == '\n') {
 173       do {
 174         c = getcX(f);
 175       } while (c != ':' and c != EOF);
 176       if (ferrorX(f))
 177         io_error(f);
 178       if (c == ':')
 179         std::ungetc(c, f);
 180     }
 181   }
 182   // at this point we expect to read the first ':' of a header, or EOF
 183
 184   bool past_end = check_past_end(f, offset_end);
 185   char *result = fgetsX(buf, bufsiz, f);
 186   while (true) {
 187     std::vector<std::string> names;
 188     std::vector<double> masses;
 189     std::vector<int> charges;
 190     // for annotated ms2
 191     std::vector<int> file_ids;
 192     std::vector<int> physical_ids;
 193     std::vector<int> ids;
 194
 195     // read headers
 196     while (true) {
 197       if (ferrorX(f))
 198         io_error(f);
 199       if (not result or buf[0] != ':')
 200         break;
 201       if (not past_end)
 202         names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
 203       if (not fgetsX(buf, bufsiz, f))
 204         io_error(f, "bad ms2 format: mass/charge line expected");
 205       errno = 0;
 206       double mass = std::strtod(buf, &endp); // need double accuracy here
 207       if (not past_end)
 208         masses.push_back(mass);
 209       if (errno or endp == buf or mass <= 0)
 210         io_error(f, "bad ms2 format: bad mass");
 211       const char *endp0 = endp;
 212       int charge = std::strtol(endp0, &endp, 10);
 213       if (not past_end)
 214         charges.push_back(charge);
 215       if (errno or endp == endp0 or charge <= 0)
 216         io_error(f, "bad ms2 format: bad charge");
 217       if (not at_eol(endp))
 218         io_error(f, "bad ms2 format: junk at end of mass/charge line");
 219       past_end = past_end or check_past_end(f, offset_end);
 220       result = fgetsX(buf, bufsiz, f);
 221     }
 222     if (names.empty() and past_end)
 223       break;
 224     if (not result) {
 225       if (not names.empty())
 226         io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
 227       break;
 228     }
 229     // read peaks
 230     std::vector<peak> peaks;
 231     while (true) {
 232       peak p;
 233       errno = 0;
 234       p.mz = strtod(buf, &endp);
 235       if (errno or endp == buf or p.mz <= 0)
 236         io_error(f, "bad ms2 format: bad peak mz");
 237       const char *endp0 = endp;
 238       p.intensity = strtod(endp0, &endp);
 239       if (errno or endp == endp0 or p.intensity <= 0)
 240         io_error(f, "bad ms2 format: bad peak intensity");
 241       if (not at_eol(endp))
 242         io_error(f, "bad ms2 format: junk at end of peak line");
 243       peaks.push_back(p);
 244
 245       past_end = past_end or check_past_end(f, offset_end);
 246       result = fgetsX(buf, bufsiz, f);
 247       if (ferrorX(f))
 248         io_error(f);
 249       if (not result or buf[0] == ':')
 250         break;
 251     }
 252
 253     // add spectra to vector
 254     if (names.empty() and not peaks.empty())
 255       io_error(f, "bad ms2 format: missing header lines?");
 256
 257     std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 258
 259     for (std::vector<double>::size_type i=0; i<names.size(); i++) {
 260       spectrum sp(masses[i], charges[i]);
 261       sp.peaks = peaks;
 262       sp.name = names[i];
 263       if (file_id != -1) {
 264         sp.file_id = file_id;
 265         sp.physical_id = next_physical_id;
 266       } else {
 267         sp.file_id = file_ids[i];
 268         sp.physical_id = physical_ids[i];
 269         sp.id = ids[i];
 270         spectrum::next_id = std::max(spectrum::next_id, sp.id+1);
 271         spectrum::next_physical_id = std::max(spectrum::next_physical_id,
 272                                               sp.physical_id+1);
 273       }
 274       spectra.push_back(sp);
 275     }
 276     if (file_id != -1)
 277       spectrum::next_physical_id += 1;
 278   }
 279   return spectra;
 280 }
 281
 282
 283 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
 284 // to remove those too large to be fragment products.  (Peaks are assumed to
 285 // be ordered by increasing mz.)
 286 void
 287 spectrum::filter_peaks(double TIC_cutoff_proportion,
 288                        double parent_mass_tolerance_max) {
 289   if (not (0 <= TIC_cutoff_proportion and TIC_cutoff_proportion <= 1)
 290       or parent_mass_tolerance_max < 0)
 291     throw std::invalid_argument("invalid argument value");
 292
 293   // remove peaks too large to be fragment products
 294   // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
 295   const double peak_mz_limit = mass + parent_mass_tolerance_max;
 296
 297   std::vector<peak>::iterator pi;
 298   for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= peak_mz_limit; pi++);
 299   peaks.erase(pi, peaks.end());
 300
 301   // FIX: note mzLowerBound..mzUpperBound here
 302
 303   // now filter by TIC
 304   std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
 305
 306   total_ion_current = 0;
 307   for (pi=peaks.begin(); pi != peaks.end(); pi++)
 308     total_ion_current += pi->intensity;
 309
 310   const double i_limit = total_ion_current * TIC_cutoff_proportion;
 311   double accumulated_intensity = 0;
 312   for (pi=peaks.begin();
 313        pi != peaks.end() and accumulated_intensity <= i_limit; pi++)
 314     accumulated_intensity += pi->intensity;
 315
 316   peaks.erase(pi, peaks.end());
 317
 318   // restore order
 319   std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 320 }
 321
 322
 323 // Classify peaks and update class_counts and peak bin counts.
 324 // FIX: hoist some of this up into Python?  use (1,2,4) as parameter
 325 void
 326 spectrum::classify(int intensity_class_count, double intensity_class_ratio,
 327                    double fragment_mass_tolerance) {
 328   if (intensity_class_count < 1 or intensity_class_ratio <= 1.0)
 329     throw std::invalid_argument("invalid argument value");
 330
 331   intensity_class_counts.clear();
 332   intensity_class_counts.resize(intensity_class_count);
 333
 334   // e.g., 7 = 1 + 2 + 4
 335   // FIX: does this fail for non-integer ratios?
 336   int min_count = int((pow(intensity_class_ratio, intensity_class_count) - 1)
 337                       / (intensity_class_ratio - 1));
 338
 339   std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
 340   std::vector<peak>::iterator pi=peaks.begin();
 341   for (int i_class=0; i_class<intensity_class_count; i_class++) {
 342     int peaks_this_class = int((pow(intensity_class_ratio, i_class) * peaks.size()
 343                                 / min_count) + 0.5);
 344     for (int cp=0; cp < peaks_this_class and pi != peaks.end(); cp++, pi++) {
 345       pi->intensity_class = i_class;
 346       intensity_class_counts[i_class]++;
 347     }
 348   }
 349   std::sort(peaks.begin(), peaks.end(), peak::less_mz);
 350
 351   if (not peaks.empty()) {
 352     min_peak_mz = peaks.front().mz - CP.fragment_mass_tolerance;
 353     max_peak_mz = peaks.back().mz + CP.fragment_mass_tolerance;
 354     total_peak_bins = (int((max_peak_mz - min_peak_mz)
 355                            / (2 * fragment_mass_tolerance) + 0.5));
 356     // This is the MyriMatch estimate.  Wouldn't it be better to simply count
 357     // how many are empty?
 358     empty_peak_bins = std::max<int>(total_peak_bins - peaks.size(), 0);
 359   }
 360 }
 361
 362
 363 // Store the list of spectra that search_peptide will search against, and also
 364 // build spectrum_mass_index.
 365 void
 366 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
 367   searchable_spectra = spectra;
 368   for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
 369     spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
 370 }
 371
 372
 373 // Calculate peaks for a synthesized mass (not mz) ladder.
 374 static inline void
 375 get_synthetic_Y_mass_ladder(std::vector<peak> &mass_ladder,
 376                             const std::vector<double> &mass_list,
 377                             const double fragment_C_fixed_mass) NOTHROW {
 378   double m = fragment_C_fixed_mass;
 379
 380   const int ladder_size = mass_ladder.size();
 381   for (int i=ladder_size-1; i>=0; i--) {
 382     m += mass_list[i+1];
 383     mass_ladder[ladder_size-1-i] = peak(m, 1.0);
 384   }
 385 }
 386
 387
 388 // Calculate peaks for a synthesized mass (not mz) ladder.
 389 static inline void
 390 get_synthetic_B_mass_ladder(std::vector<peak> &mass_ladder,
 391                             const std::vector<double> &mass_list,
 392                             const double fragment_N_fixed_mass) NOTHROW {
 393   double m = fragment_N_fixed_mass;
 394
 395   const int ladder_size = mass_ladder.size();
 396   for (int i=0; i<=ladder_size-1; i++) {
 397     m += mass_list[i];
 398     mass_ladder[i] = peak(m, 1.0);
 399   }
 400 }
 401
 402
 403 // Generate synthetic spectra for a set of charges.  Only the charge and peaks
 404 // of these spectra are initialized.
 405 static inline void
 406 synthetic_spectra(spectrum synth_sp[/* max_fragment_charge+1 */],
 407                   const std::vector<double> &mass_list,
 408                   const double fragment_N_fixed_mass,
 409                   const double fragment_C_fixed_mass,
 410                   const double max_fragment_charge) NOTHROW {
 411   std::vector<peak> mass_ladder(mass_list.size()-1);
 412
 413   for (int charge=1; charge<=max_fragment_charge; charge++) {
 414     spectrum &sp = synth_sp[charge];
 415     //sp.mass = fragment_peptide_mass;
 416     sp.charge = charge;
 417     sp.peaks.resize(mass_ladder.size() * (ION_MAX-ION_MIN));
 418     std::vector<peak>::size_type pi=0;
 419
 420     for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
 421       switch (ion_type) {
 422       case ION_Y:
 423         get_synthetic_Y_mass_ladder(mass_ladder, mass_list,
 424                                     fragment_C_fixed_mass);
 425         break;
 426       case ION_B:
 427         get_synthetic_B_mass_ladder(mass_ladder, mass_list,
 428                                     fragment_N_fixed_mass);
 429         break;
 430       default:
 431         assert(false);
 432       }
 433
 434       for (std::vector<peak>::size_type i=0; i<mass_ladder.size(); i++) {
 435         sp.peaks[pi++].mz = peak::get_mz(mass_ladder[i].mz, charge);
 436       }
 437     }
 438     std::sort(sp.peaks.begin(), sp.peaks.end(), peak::less_mz);
 439   }
 440 }
 441
 442
 443 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
 444 // theoretical spectrum (y) generated from a peptide candidate.
 445 //
 446 // FIX: This code seems complicated.  Is the MyriMatch idea of using maps (or
 447 // better yet, sets) for peak lists simpler?  As fast or faster?
 448 //
 449 // This is the innermost loop, so it's worthwhile to optimize this some.
 450 // FIX: Should some of these vectors be arrays?
 451 static inline double
 452 score_spectrum(const spectrum &x, const spectrum &y) NOTHROW {
 453   assert (not x.peaks.empty() and not y.peaks.empty()); // FIX
 454
 455   std::vector<double> peak_best_delta(y.peaks.size(),
 456                                       CP.fragment_mass_tolerance);
 457   std::vector<int> peak_best_class(y.peaks.size(), -1);
 458
 459   std::vector<peak>::const_iterator x_it = x.peaks.begin();
 460   std::vector<peak>::const_iterator y_it = y.peaks.begin();
 461
 462   // Iterate over all closest pairs of peaks, collecting info on the class of
 463   // the best (meaning closest mz) real peak matching each theoretical peak
 464   // (for real/theoretical peak pairs that are no farther apart than
 465   // fragment_mass_tolerance).
 466   int y_index = 0;
 467   while (x_it != x.peaks.end() and y_it != y.peaks.end()) {
 468     const double delta = y_it->mz - x_it->mz;
 469     if (std::abs(delta) <= peak_best_delta[y_index]) {
 470       peak_best_class[y_index] = x_it->intensity_class;
 471       peak_best_delta[y_index] = std::abs(delta);
 472     }
 473     if (delta > 0)
 474       x_it++;
 475     else {
 476       y_it++;
 477       y_index++;
 478     }
 479   }
 480
 481   assert(CP.intensity_class_count < INT_MAX);
 482   std::vector<unsigned> peak_hit_histogram(CP.intensity_class_count);
 483   std::vector<int>::const_iterator b_it;
 484   for (b_it = peak_best_class.begin(); b_it != peak_best_class.end(); b_it++)
 485     if (*b_it >= 0) {
 486       assert(*b_it < static_cast<int>(peak_hit_histogram.size()));
 487       peak_hit_histogram[*b_it] += 1;
 488     }
 489
 490   // How many theoretical peaks overlap the real peak range?
 491   int valid_theoretical_peaks = 0;
 492   for (y_it = y.peaks.begin(); y_it != y.peaks.end(); y_it++)
 493     if (x.min_peak_mz <= y_it->mz and y_it->mz <= x.max_peak_mz)
 494       valid_theoretical_peaks++;
 495
 496   // How many valid theoretical peaks were misses?
 497   const int peak_misses = (valid_theoretical_peaks
 498                            - accumulate(peak_hit_histogram.begin(),
 499                                         peak_hit_histogram.end(), 0));
 500   assert(peak_misses >= 0);
 501
 502   double score = 0.0;
 503   for (unsigned int i=0; i<peak_hit_histogram.size(); i++)
 504     score += ln_combination_(x.intensity_class_counts[i],
 505                              peak_hit_histogram[i]);
 506   score += ln_combination_(x.empty_peak_bins, peak_misses);
 507   score -= ln_combination_(x.total_peak_bins, valid_theoretical_peaks);
 508
 509   return score;
 510 }
 511
 512
 513 // Return the score of the specified spectrum against a group of synthetic
 514 // fragment spectra.
 515
 516 // For +1 and +2 precursors, this does what MyriMatch does.  For +3 (and
 517 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
 518 // may not make much sense.  (Wouldn't it be better to take the max of +1b/+2y
 519 // or +2b/+1y?)
 520 // FIX: implement MM smart +3 model, and/or come up with something better.
 521 static inline double
 522 score_spectrum_over_charges(spectrum synth_sp[/* max_fragment_charge+1 */],
 523                             const spectrum &sp,
 524                             const int max_fragment_charge) NOTHROW {
 525   double best_score = score_spectrum(sp, synth_sp[1]);
 526
 527   const int charge_limit = std::max<int>(1, sp.charge-1);
 528   for (int charge=2; charge<=charge_limit; charge++) {
 529     assert(charge <= max_fragment_charge);
 530     best_score += score_spectrum(sp, synth_sp[charge]);
 531   }
 532   return best_score;
 533 }
 534
 535
 536 struct mass_trace_list {
 537   mass_trace_item item;
 538   const mass_trace_list *next;
 539
 540   mass_trace_list(const mass_trace_list *p=0) : next(p) { }
 541 };
 542
 543
 544 // fuzzy comparison might not be necessary, but it's safer
 545 static inline bool
 546 score_equal(double s1, double s2) { return std::abs(s1-s2) < 1e-6; }
 547
 548
 549 // Search for matches of this particular peptide modification variation
 550 // against the spectra.  Updates score_stats and returns the number of
 551 // candidate spectra found.
 552 static inline void
 553 evaluate_peptide(const search_context &context, match &m,
 554                  const mass_trace_list *mtlp,
 555                  const std::vector<double> &mass_list,
 556                  const std::vector<std::vector<spectrum>::size_type>
 557                      &candidate_spectra,
 558                  score_stats &stats) NOTHROW {
 559   assert(m.peptide_sequence.size() == mass_list.size());
 560
 561   int max_candidate_charge = 0;
 562   typedef std::vector<std::vector<spectrum>::size_type>::const_iterator c_it;
 563   for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++)
 564     max_candidate_charge
 565       = std::max<int>(max_candidate_charge,
 566                       spectrum::searchable_spectra[*it].charge);
 567   assert(max_candidate_charge >= 1);
 568
 569   const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
 570   assert(max_fragment_charge <= spectrum::MAX_SUPPORTED_CHARGE);
 571   // e.g. +2 -> 'B' -> spectrum
 572   // FIX: should this be a std::vector??
 573   static spectrum synth_sp[spectrum::MAX_SUPPORTED_CHARGE+1];
 574   synthetic_spectra(synth_sp, mass_list, context.fragment_N_fixed_mass,
 575                     context.fragment_C_fixed_mass, max_fragment_charge);
 576
 577   for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++) {
 578     m.spectrum_index = *it;
 579     spectrum &sp = spectrum::searchable_spectra[m.spectrum_index];
 580
 581     sp.comparisons += 1;
 582     stats.candidate_spectrum_count += 1;
 583     if (CP.estimate_only)
 584       continue;
 585
 586     //std::cerr << "sp " << m.spectrum_index << std::endl;
 587     m.score = score_spectrum_over_charges(synth_sp, sp, max_fragment_charge);
 588     //std::cerr << "score " << m.score << std::endl;
 589
 590     std::vector<match> &sp_best_matches = stats.best_matches[m.spectrum_index];
 591     // Is this score good enough to be in the best score list?  (Better scores
 592     // are more negative.)
 593     if (m.score > sp_best_matches.back().score)
 594       continue;
 595
 596     // We're saving this match, so remember the mass trace info, too.
 597     m.mass_trace.clear();
 598     for (const mass_trace_list *p=mtlp; p; p=p->next)
 599       m.mass_trace.push_back(p->item);
 600
 601     // If this is duplicate, just append to the existing match and return
 602     // FIX: does comparison of mass trace work for all cases?
 603     for (std::vector<match>::reverse_iterator rit
 604            = sp_best_matches.rbegin(); rit != sp_best_matches.rend(); rit++)
 605       if (score_equal(rit->score, m.score)
 606           and rit->peptide_sequence == m.peptide_sequence
 607           and rit->mass_trace == m.mass_trace) {
 608         rit->peptide_begin.push_back(m.peptide_begin[0]);
 609         rit->sequence_name.push_back(m.sequence_name[0]);
 610         return;
 611       }
 612
 613     // Otherwise, insert this match in the correct position
 614     assert(sp_best_matches.size() >= 1);
 615     std::vector<match>::reverse_iterator rit = sp_best_matches.rbegin();
 616     for (;rit+1 != sp_best_matches.rend() and (rit+1)->score > m.score; rit++)
 617       *rit = *(rit+1);
 618     *rit = m;
 619   }
 620 }
 621
 622
 623 // IDEA FIX: Use a cost parameter to define the iterative search front.
 624
 625
 626 // Choose one possible residue modification position.  Once they're all
 627 // chosen, then evaluate.
 628 static inline void
 629 choose_residue_mod(const search_context &context, match &m,
 630                    const mass_trace_list *mtlp, std::vector<double> &mass_list,
 631                    const std::vector<std::vector<spectrum>::size_type>
 632                        &candidate_spectra,
 633                    score_stats &stats,
 634                    std::vector<int> &db_remaining,
 635                    const unsigned remaining_positions_to_choose,
 636                    const unsigned next_position_to_consider) NOTHROW {
 637   assert(remaining_positions_to_choose
 638          <= m.peptide_sequence.size() - next_position_to_consider);
 639
 640   if (remaining_positions_to_choose == 0) {
 641     stats.combinations_searched += 1;
 642     evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra, stats);
 643   } else {
 644     mass_trace_list mtl(mtlp);
 645
 646     // consider all of the positions where we could next add a mod
 647     for (unsigned int i=next_position_to_consider;
 648          i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
 649       mtl.item.position = i;
 650       const double save_pos_mass=mass_list[i];
 651       const char pos_res = m.peptide_sequence[i];
 652
 653       // consider the possibilities for this position
 654       for (std::vector<int>::const_iterator
 655              it=context.delta_bag_lookup[pos_res].begin();
 656            it != context.delta_bag_lookup[pos_res].end(); it++) {
 657         const int db_index = *it;
 658         if (db_remaining[db_index] < 1)
 659           continue;
 660         db_remaining[db_index] -= 1;
 661         mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
 662         mtl.item.delta = context.delta_bag_delta[db_index];
 663         choose_residue_mod(context, m, &mtl, mass_list, candidate_spectra,
 664                            stats, db_remaining,
 665                            remaining_positions_to_choose-1, i+1);
 666         db_remaining[db_index] += 1;
 667       }
 668       mass_list[i] = save_pos_mass;
 669     }
 670   }
 671 }
 672
 673
 674 // p_begin (p_end) to begin_index (end_index), updating p_mass in the process.
 675 // The sign determines whether mass increases or decreases as p_begin (p_end)
 676 // is moved forward--so it should be -1 for the begin case and +1 for the end
 677 // case.
 678 // FIX: is this worth its complexity?  kill or else add reset
 679 static inline void
 680 update_p_mass(double &p_mass, int &p_begin, int begin_index, int sign,
 681               const std::string &run_sequence,
 682               const std::vector<double> &fixed_residue_mass) {
 683   assert(sign == +1 or sign == -1);
 684   assert(begin_index >= 0);
 685   const bool moving_forward = begin_index >= p_begin;
 686   int p0=p_begin, p1=begin_index;
 687   if (not moving_forward) {
 688     std::swap(p0, p1);
 689     sign = -sign;
 690   }
 691   for (int i=p0; i<p1; i++)
 692     p_mass += sign * fixed_residue_mass[run_sequence[i]];
 693   p_begin = begin_index;
 694 }
 695
 696
 697
 698 // Search a sequence run for matches according to the context against the
 699 // spectra.  Updates score_stats and the number of candidate spectra found.
 700
 701 // FIX: examine carefully for signed/unsigned problems
 702
 703 void inline
 704 search_run(const search_context &context, const sequence_run &sequence_run,
 705            score_stats &stats) {
 706   const int min_peptide_length = std::max<int>(CP.minimum_peptide_length,
 707                                                context.mod_count);
 708   const std::vector<double> &fixed_parent_mass \
 709     = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
 710
 711   const std::string &run_sequence = sequence_run.sequence;
 712   const std::vector<int> &cleavage_points = sequence_run.cleavage_points;
 713   assert(cleavage_points.size() >= 2); // endpoints assumed always present
 714
 715   // This match will be passed inward and used to record information that we
 716   // need to remember about a match when we finally see one.  At that point, a
 717   // copy of this match will be saved.
 718   // FIX: can this be done better?
 719   match m;
 720   m.sequence_name.push_back(sequence_run.name);
 721   m.peptide_begin.push_back(0);
 722   int &peptide_begin = m.peptide_begin[0];
 723
 724   assert(context.delta_bag_delta.size()
 725          == context.delta_bag_count.size());
 726   // counts remaining as mod positions are chosen
 727   std::vector<int> db_remaining = context.delta_bag_count;
 728
 729   // the rightmost 'end' seen when all spectra masses were too high
 730   // (0 means none yet encountered.)
 731   unsigned int next_end = 0;
 732
 733   // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
 734   double p_mass = context.parent_fixed_mass;
 735   int p_begin=0, p_end=0;
 736
 737   // FIX: optimize the non-specific cleavage case?
 738   for (unsigned int begin=0; begin<cleavage_points.size()-1; begin++) {
 739     const int begin_index = cleavage_points[begin];
 740     peptide_begin = begin_index + sequence_run.sequence_offset;
 741     if (not context.pca_residues.empty())
 742       if (context.pca_residues.find(run_sequence[begin_index])
 743           == std::string::npos)
 744         continue;
 745     update_p_mass(p_mass, p_begin, begin_index, -1, run_sequence,
 746                   fixed_parent_mass);
 747     unsigned int end = std::max<unsigned int>(begin + 1, next_end);
 748     for (; end<cleavage_points.size(); end++) {
 749       const int missed_cleavage_count = end - begin - 1;
 750       if (missed_cleavage_count > context.maximum_missed_cleavage_sites)
 751         break;
 752
 753       const int end_index = cleavage_points[end];
 754       assert(end_index - begin_index > 0);
 755       const int peptide_size = end_index - begin_index;
 756       assert(peptide_size > 0);
 757       if (peptide_size < min_peptide_length)
 758         continue;
 759       update_p_mass(p_mass, p_end, end_index, +1, run_sequence,
 760                     fixed_parent_mass);
 761       //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
 762
 763       // We'd like to consider only the spectra whose masses are within the
 764       // tolerance range of theoretical mass of the peptide (p_mass).
 765       // Unfortunately, the tolerance range of each spectrum depends on its
 766       // charge, since the tolerance is specified in m/z units.  Furthermore,
 767       // when deciding how to adjust begin/end to generate the next peptide,
 768       // we need to be conservative, as a high-charge spectrum may come into
 769       // range, even though only +1 spectra are currently in-range.  So, we
 770       // first screen using the maximum tolerance range, then check the actual
 771       // spectra below.
 772
 773       const double sp_mass_lb = p_mass - CP.parent_mass_tolerance_max;
 774       const double sp_mass_ub = p_mass + CP.parent_mass_tolerance_max;
 775
 776       const spmi_c_it candidate_spectra_info_begin
 777         = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
 778       if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
 779         // spectrum masses all too low to match peptide (peptide too long)
 780         //std::cerr << "peptide: sp low" << std::endl;
 781         break;
 782       }
 783
 784       const spmi_c_it candidate_spectra_info_end
 785         = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
 786       if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
 787         // spectrum masses all too high to match peptide
 788         // (peptide is too short, so increase end, if possible, else return)
 789         //std::cerr << "peptide: sp high" << std::endl;
 790         if (end == cleavage_points.size() - 1)
 791           return;
 792         next_end = end;
 793         continue;
 794       }
 795       if (candidate_spectra_info_begin == candidate_spectra_info_end) {
 796         // no spectrum with close-enough parent mass
 797         //std::cerr << "peptide: sp none nearby" << std::endl;
 798         continue;
 799       }
 800
 801       // Now generate the list of candidate spectra that are actually
 802       // in-range, considering the charge of each spectrum.
 803       std::vector<std::vector<spectrum>::size_type> candidate_spectra_0;
 804       for (spmi_c_it it=candidate_spectra_info_begin;
 805            it != candidate_spectra_info_end; it++) {
 806         const spectrum &csp = spectrum::searchable_spectra[it->second];
 807         if (std::abs(csp.mass - p_mass) <= (csp.charge
 808                                             * CP.parent_mass_tolerance_1))
 809           candidate_spectra_0.push_back(it->second);
 810       }
 811       if (candidate_spectra_0.empty())
 812         continue;
 813
 814       m.predicted_parent_mass = p_mass;
 815       m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
 816
 817       std::vector<double> mass_list(peptide_size);
 818       for (std::vector<double>::size_type i=0; i<m.peptide_sequence.size(); i++)
 819         mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
 820                         .fixed_residue_mass[m.peptide_sequence[i]]);
 821
 822       choose_residue_mod(context, m, NULL, mass_list, candidate_spectra_0,
 823                          stats, db_remaining, context.mod_count, 0);
 824     }
 825   }
 826 }
 827
 828
 829 // Search all sequence runs for matches according to the context against the
 830 // spectra.  Updates score_stats and the number of candidate spectra found.
 831 void
 832 spectrum::search_runs(const search_context &context, score_stats &stats) {
 833   const int no_runs = context.sequence_runs.size();
 834   for (int i=0; i<no_runs; i++) {
 835     search_run(context, context.sequence_runs[i], stats);
 836
 837     if (CP.show_progress)
 838       std::cerr << i+1 << " of " << no_runs << " sequences, "
 839                 << stats.candidate_spectrum_count << " candidates\r"
 840                 << std::flush;
 841   }
 842
 843   if (CP.show_progress)
 844     std::cerr << std::endl;
 845 }