add COPYING and --copyright flag
[greylag.git] / cgreylag.cpp
blobb096cd9bc36304e498a62cad64cbece06e3925eb
1 // C++ module for greylag
3 // $Id$
5 // Copyright (C) 2006, Stowers Institute for Medical Research
6 //
7 // This program is free software; you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation; either version 2 of the License, or
10 // (at your option) any later version.
12 // This program is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
17 // You should have received a copy of the GNU General Public License along
18 // with this program; if not, write to the Free Software Foundation, Inc.,
19 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 #include "cgreylag.hpp"
25 #include <cerrno>
26 #include <cfloat>
27 #include <climits>
28 #include <cstdlib>
29 #include <cmath>
30 #include <numeric>
31 #include <set>
32 #include <stdexcept>
34 #include <iostream>
37 #ifdef __GNU__
38 #define NOTHROW __attribute__ ((nothrow))
39 // use faster, non-threadsafe versions, since we don't use threads
40 #define fgetsX fgets_unlocked
41 #define fputsX fputs_unlocked
42 #define fprintfX __builtin_fprintf_unlocked
43 #define ferrorX ferror_unlocked
44 #else
45 #define NOTHROW
46 #define fgetsX std::fgets
47 #define fputsX std::fputs
48 #define fprintfX std::fprintf
49 #define ferrorX std::ferror
50 #endif
53 parameters parameters::the;
56 char *
57 peak::__repr__() const {
58 static char temp[128];
59 sprintf(temp, "<peak mz=%.4f intensity=%.4f>", mz, intensity);
60 return &temp[0];
63 const int FIRST_SPECTRUM_ID = 1;
64 int spectrum::next_id = FIRST_SPECTRUM_ID;
65 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
67 std::vector<spectrum> spectrum::searchable_spectra;
68 // parent mass -> searchable_spectra index
69 std::multimap<double,
70 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
73 char *
74 spectrum::__repr__() const {
75 static char temp[1024];
76 sprintf(temp,
77 "<spectrum #%d (phys #%d) '%s' %.4f/%+d"
78 " [%d peaks, maxI=%f, sumI=%f]>",
79 id, physical_id, name.c_str(), mass, charge, peaks.size(),
80 max_peak_intensity, sum_peak_intensity);
81 return &temp[0];
86 // This is an error exit for read_spectra and filter_ms2_by_mass.
87 static void
88 io_error(FILE *f, const char *message="") {
89 if (ferrorX(f))
90 message = "I/O error while reading (or writing) spectrum file";
91 throw std::ios_base::failure(message);
95 // Return sorted list of parent masses present in the given ms2 files.
96 std::vector<double>
97 spectrum::read_ms2_spectrum_masses(std::vector<int> fds) {
98 std::vector<double> masses;
100 for (std::vector<int>::const_iterator fdit=fds.begin(); fdit != fds.end();
101 fdit++) {
102 FILE *f = fdopen(*fdit, "r");
104 const int bufsiz = 1024;
105 char buf[bufsiz];
106 char *endp;
107 char *result = fgetsX(buf, bufsiz, f);
108 while (true) {
109 // read headers
110 while (true) {
111 if (ferrorX(f))
112 io_error(f);
113 if (not result or buf[0] != ':')
114 break;
115 if (not fgetsX(buf, bufsiz, f))
116 io_error(f, "bad ms2 format: mass/charge line expected");
117 errno = 0;
118 masses.push_back(std::strtod(buf, &endp)); // need double accuracy here
119 if (errno or endp == buf)
120 io_error(f, "bad ms2 format: bad mass");
121 result = fgetsX(buf, bufsiz, f);
123 if (not result)
124 break;
125 // read peaks
126 std::vector<peak> peaks;
127 while (true) {
128 result = fgetsX(buf, bufsiz, f);
129 if (ferrorX(f))
130 io_error(f);
131 if (not result or buf[0] == ':')
132 break;
136 std::sort(masses.begin(), masses.end());
137 return masses;
141 // Read spectra from file in ms2 format, tagging them with file_id. Spectra
142 // with charge zero are omitted from the result. (All of them are read,
143 // though, to keep spectrum ids in sync.) If file_id == -1, the ms2 file is
144 // an annotated file produced by split_ms2_by_mass_band.
146 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
147 // having the same physical id. The peak list is initially sorted by mz.
148 // Throws an exception on invalid input.
150 std::vector<spectrum>
151 spectrum::read_spectra_from_ms2(FILE *f, const int file_id) {
152 std::vector<spectrum> spectra;
154 const int bufsiz = 1024;
155 char buf[bufsiz];
156 char *endp;
158 char *result = fgetsX(buf, bufsiz, f);
159 while (true) {
160 std::vector<std::string> names;
161 std::vector<double> masses;
162 std::vector<int> charges;
163 // for annotated ms2
164 std::vector<int> file_ids;
165 std::vector<int> physical_ids;
166 std::vector<int> ids;
168 // read headers
169 while (true) {
170 if (ferrorX(f))
171 io_error(f);
172 if (not result or buf[0] != ':')
173 break;
174 if (file_id != -1)
175 names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
176 else {
177 char *anno_hash = std::strrchr(buf, '#'); // const, but C++ doesn't
178 // like it declared thus--yuck
179 if (not anno_hash)
180 io_error(f, "bad ms2+ format: '#' not found");
181 names.push_back(std::string(buf+1, anno_hash));
182 errno = 0;
183 file_ids.push_back(std::strtol(anno_hash+1, &endp, 10));
184 physical_ids.push_back(std::strtol(endp+1, &endp, 10));
185 ids.push_back(std::strtol(endp+1, &endp, 10));
186 if (errno) // FIX: improve error check
187 io_error(f, "bad ms2+ format: bad values");
189 if (not fgetsX(buf, bufsiz, f))
190 io_error(f, "bad ms2 format: mass/charge line expected");
191 errno = 0;
192 masses.push_back(std::strtod(buf, &endp)); // need double accuracy here
193 if (errno or endp == buf)
194 io_error(f, "bad ms2 format: bad mass");
195 const char *endp0 = endp;
196 charges.push_back(std::strtol(endp0, &endp, 10));
197 if (errno or endp == endp0)
198 io_error(f, "bad ms2 format: bad charge");
199 result = fgetsX(buf, bufsiz, f);
201 if (not result)
202 break;
203 // read peaks
204 std::vector<peak> peaks;
205 while (true) {
206 peak p;
207 errno = 0;
208 p.mz = strtod(buf, &endp);
209 if (errno or endp == buf)
210 io_error(f, "bad ms2 format: bad peak mz");
211 const char *endp0 = endp;
212 p.intensity = strtod(endp0, &endp);
213 if (errno or endp == endp0)
214 io_error(f, "bad ms2 format: bad peak intensity");
215 peaks.push_back(p);
217 result = fgetsX(buf, bufsiz, f);
218 if (ferrorX(f))
219 io_error(f);
220 if (not result or buf[0] == ':')
221 break;
224 // add spectra to vector
225 if (names.empty() and not peaks.empty())
226 io_error(f, "bad ms2 format: missing header lines?");
228 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
230 for (std::vector<double>::size_type i=0; i<names.size(); i++) {
231 spectrum sp(masses[i], charges[i]);
232 sp.peaks = peaks;
233 sp.name = names[i];
234 if (file_id != -1) {
235 sp.file_id = file_id;
236 sp.physical_id = next_physical_id;
237 } else {
238 sp.file_id = file_ids[i];
239 sp.physical_id = physical_ids[i];
240 sp.id = ids[i];
241 spectrum::next_id = std::max(spectrum::next_id, sp.id+1);
242 spectrum::next_physical_id = std::max(spectrum::next_physical_id,
243 sp.physical_id+1);
245 sp.calculate_intensity_statistics();
246 spectra.push_back(sp);
248 if (file_id != -1)
249 spectrum::next_physical_id++;
251 return spectra;
255 // Copy spectra from inf (an open ms2 file) to outf, zeroing out any spectra
256 // with mass outside [lb, ub). Specifically, a zeroed spectrum will have an
257 // empty name and a mass and charge of zero. If all charges for a physical
258 // spectrum are zeroed, its peaklist will be replaced with a single peak
259 // having mass and intensity zero. Thus the zeroed spectra are validly
260 // formatted placeholders.
262 // The logic of this function is similar to read_spectra_from_ms2 above. Can
263 // they be unified?
265 void
266 spectrum::filter_ms2_by_mass(FILE *outf, FILE *inf, double lb, double ub) {
267 const int bufsiz = 1024;
268 char buf0[bufsiz], buf1[bufsiz];
269 char *endp;
271 char *result = fgetsX(buf0, bufsiz, inf);
272 while (true) {
273 bool zero_peaks = true;
274 double mass;
276 // copy headers
277 while (true) {
278 if (ferrorX(inf))
279 io_error(inf);
280 if (not result or buf0[0] != ':')
281 break;
282 if (not fgetsX(buf1, bufsiz, inf))
283 io_error(inf, "bad ms2 format: mass/charge line expected");
284 errno = 0;
285 mass = std::strtod(buf1, &endp); // need double accuracy here
286 if (errno or endp == buf1)
287 io_error(inf, "bad ms2 format: bad mass");
288 errno = 0;
289 if (lb <= mass and mass < ub) {
290 zero_peaks = false;
291 fputsX(buf0, outf);
292 fputsX(buf1, outf);
293 } else {
294 fputsX(":\n", outf);
295 fputsX("0 0\n", outf);
297 if (errno)
298 io_error(outf, "error writing ms2 file");
299 result = fgetsX(buf0, bufsiz, inf);
301 if (not result)
302 break;
303 // copy peaks
304 while (true) {
305 peak p;
306 errno = 0;
307 if (not zero_peaks) {
308 fputsX(buf0, outf);
309 if (errno)
310 io_error(outf, "error writing ms2 file");
312 result = fgetsX(buf0, bufsiz, inf);
313 if (ferrorX(inf))
314 io_error(inf);
315 if (not result or buf0[0] == ':')
316 break;
318 if (zero_peaks) {
319 errno = 0;
320 fputsX("0 0\n", outf);
321 if (errno)
322 io_error(outf, "error writing ms2 file");
328 // Copy spectra from an ms2 file to a set of output ms2 files, one for each
329 // band in the set of mass bands described by their upper bounds. Extra
330 // information is written in the first header line of each spectra so that
331 // id's may be recovered. So, for example, ':0002.0002.1' might become
332 // ':0002.0002.1 # 0 45 78', where 0, 45, 78 are the spectrum's file_id,
333 // physical_id, and id, respectively. Multiply charged spectra will be
334 // split into separate spectra (having the same physical_id).
336 // FIX: cleaner to just pass in a vector of filenames, rather than fds?
337 void
338 spectrum::split_ms2_by_mass_band(FILE *inf, const std::vector<int> &outfds,
339 const int file_id,
340 const std::vector<double> &mass_band_upper_bounds) {
341 std::map<double, FILE *> mass_file_index;
343 if (outfds.size() < 1
344 or outfds.size() != mass_band_upper_bounds.size())
345 throw std::invalid_argument("invalid argument value");
347 for (std::vector<int>::size_type i=0; i<outfds.size(); i++) {
348 FILE *f = fdopen(outfds[i], "w");
349 if (not f)
350 throw std::ios_base::failure("fdopen failed: fd limit exceeded?");
351 mass_file_index.insert(std::make_pair(mass_band_upper_bounds[i], f));
354 int o_next_id = FIRST_SPECTRUM_ID;
355 int o_next_physical_id = FIRST_SPECTRUM_ID;
357 const int bufsiz = 1024;
358 char buf0[bufsiz], buf1[bufsiz];
359 char *endp;
360 char *result = fgetsX(buf0, bufsiz, inf);
361 while (true) {
362 double mass;
363 std::set<FILE *> sp_files;
365 // copy headers
366 while (true) {
367 if (ferrorX(inf))
368 io_error(inf);
369 if (not result or buf0[0] != ':')
370 break;
371 if (not fgetsX(buf1, bufsiz, inf))
372 io_error(inf, "bad ms2 format: mass/charge line expected");
373 errno = 0;
374 mass = std::strtod(buf1, &endp); // need double accuracy here
375 if (errno or endp == buf1)
376 io_error(inf, "bad ms2 format: bad mass");
377 errno = 0;
378 std::map<double, FILE *>::const_iterator bit
379 = mass_file_index.lower_bound(mass);
380 if (bit == mass_file_index.end())
381 throw std::invalid_argument("internal error: mass out of range");
382 FILE *bandf = bit->second;
383 sp_files.insert(bandf);
384 assert(buf0[strlen(buf0)-1] == '\n');
385 buf0[strlen(buf0)-1] = 0; // chop newline
386 fprintfX(bandf, "%s # %d %d %d\n", buf0, file_id, o_next_physical_id,
387 o_next_id++);
388 fputsX(buf1, bandf);
389 if (errno)
390 io_error(bandf, "error writing ms2 file");
391 result = fgetsX(buf0, bufsiz, inf);
393 if (not result)
394 break;
395 if (sp_files.empty())
396 io_error(inf, "bad ms2 format: missing header lines?");
397 // copy peaks
398 while (true) {
399 peak p;
400 errno = 0;
401 for (std::set<FILE *>::const_iterator fit = sp_files.begin();
402 fit != sp_files.end(); fit++) {
403 fputsX(buf0, *fit);
404 if (errno)
405 io_error(*fit, "error writing ms2 file");
407 result = fgetsX(buf0, bufsiz, inf);
408 if (ferrorX(inf))
409 io_error(inf);
410 if (not result or buf0[0] == ':')
411 break;
413 o_next_physical_id++;
416 // flush all output files (is this necessary?)
417 for (std::map<double, FILE *>::const_iterator it = mass_file_index.begin();
418 it != mass_file_index.end(); it++)
419 std::fflush(it->second);
423 // For each *non-overlapping* group of peaks with mz within an interval of
424 // (less than) length 'limit', keep only the most intense peak. (The
425 // algorithm works from the right, which matter here.)
427 // FIX: Is this really the best approach? The obvious alternative would be to
428 // eliminate peaks so that no interval of length 'limit' has more than one
429 // peak (eliminating weak peaks to make this so).
431 static inline void
432 remove_close_peaks(std::vector<peak> &peaks, double limit) {
433 for (std::vector<peak>::size_type i=0; i+1<peaks.size(); i++) {
434 double ub_mz = peaks[i].mz + limit;
435 while (i+1 < peaks.size() and peaks[i+1].mz < ub_mz)
436 peaks.erase(peaks.begin()
437 + (peaks[i+1].intensity > peaks[i].intensity
438 ? i : i+1));
442 // static inline void
443 // remove_close_peaks(std::vector<peak> &peaks, double limit) {
444 // for (std::vector<peak>::size_type i=0; i+1<peaks.size();)
445 // if (peaks[i+1].mz - peaks[i].mz < limit)
446 // if (peaks[i+1].intensity > peaks[i].intensity)
447 // peaks.erase(peaks.begin() + i);
448 // else
449 // peaks.erase(peaks.begin() + i+1);
450 // else
451 // i++;
452 // }
455 // Examine this spectrum to see if it should be kept. If so, return true and
456 // sort the peaks by mz, normalize them and limit their number.
457 bool
458 spectrum::filter_and_normalize(double minimum_fragment_mz,
459 double dynamic_range, int minimum_peaks,
460 int total_peaks) {
461 // "spectrum, maximum parent charge", noise suppression check,
462 // "spectrum, minimum parent m+h" not implemented
464 if (minimum_fragment_mz < 0 or dynamic_range <= 0 or minimum_peaks < 0
465 or total_peaks < 0)
466 throw std::invalid_argument("invalid argument value");
468 // remove_isotopes
469 // >>> removes multiple entries within 0.95 Da of each other, retaining the
470 // highest value. this is necessary because of the behavior of some peak
471 // finding routines in commercial software
473 // peaks already sorted by mz upon read
474 remove_close_peaks(peaks, 0.95);
476 // remove parent
477 // >>> set up m/z regions to ignore: those immediately below the m/z of the
478 // parent ion which will contain uninformative neutral loss ions, and those
479 // immediately above the parent ion m/z, which will contain the parent ion
480 // and its isotope pattern
482 const double parent_mz = 1.00727 + (mass - 1.00727) / charge;
483 for (std::vector<peak>::size_type i=0; i<peaks.size();)
484 if (parent_mz >= peaks[i].mz
485 ? std::abs(parent_mz - peaks[i].mz) < (50.0 / charge)
486 : std::abs(parent_mz - peaks[i].mz) < (5.0 / charge))
487 peaks.erase(peaks.begin() + i);
488 else
489 i++;
492 // remove_low_masses
493 std::vector<peak>::iterator pi;
494 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= minimum_fragment_mz;
495 pi++);
496 peaks.erase(peaks.begin(), pi);
499 // normalize spectrum
500 // >>> use the dynamic range parameter to set the maximum intensity value
501 // for the spectrum. then remove all peaks with a normalized intensity < 1
502 if (peaks.empty())
503 return false;
504 std::vector<peak>::iterator most_intense_peak
505 = std::max_element(peaks.begin(), peaks.end(), peak::less_intensity);
506 normalization_factor
507 = std::max<double>(1.0, most_intense_peak->intensity) / dynamic_range;
508 for (std::vector<peak>::iterator p=peaks.begin(); p != peaks.end(); p++)
509 p->intensity /= normalization_factor;
510 for (std::vector<peak>::size_type i=0; i<peaks.size();)
511 if (peaks[i].intensity < 1.0)
512 peaks.erase(peaks.begin() + i);
513 else
514 i++;
516 if (peaks.size() < (unsigned int) minimum_peaks)
517 return false;
519 // # check is_noise (NYI)
520 // # * is_noise attempts to determine if the spectrum is simply
521 // # noise. if the spectrum
522 // # * does not have any peaks within a window near the parent ion mass,
523 // # it is considered
524 // # * noise.
525 // #limit = sp.mass / sp.charge
526 // #if sp.charge < 3:
527 // # limit = sp.mass - 600.0
528 // #if not [ True for mz, i in sp.peaks if mz > limit ]:
529 // # return "looks like noise"
531 // clean_isotopes removes peaks that are probably C13 isotopes
532 remove_close_peaks(peaks, 1.5);
534 // keep only the most intense peaks
535 if (peaks.size() > (unsigned int) total_peaks) {
536 std::sort(peaks.begin(), peaks.end(), peak::less_intensity);
537 peaks.erase(peaks.begin(), peaks.end() - total_peaks);
538 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
541 // update statistics, now that we've removed some peaks
542 calculate_intensity_statistics();
544 return true;
548 // Store the list of spectra that search_peptide will search against, and also
549 // build spectrum_mass_index.
550 void
551 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
552 searchable_spectra = spectra;
553 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
554 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
558 // Multiply peak intensities by residue-dependent factors to generate a more
559 // realistic spectrum. If is_XYZ, walk through the peptide sequence in
560 // reverse.
561 static inline void
562 synthesize_ladder_intensities(std::vector<peak> &mass_ladder,
563 const std::string &peptide_seq,
564 const bool is_XYZ) NOTHROW {
565 const parameters &CP = parameters::the;
567 assert(mass_ladder.size() == peptide_seq.size() - 1);
568 assert(mass_ladder.size() >= 2);
570 // An intensity boost is given for the second peptide bond/peak (*) if the
571 // N-terminal residue is 'P':
572 // 0 * 2 3 (peaks)
573 // A P C D E
574 const int proline_bonus_position = is_XYZ ? mass_ladder.size()-2 : 1;
575 mass_ladder[proline_bonus_position].intensity = 3.0;
576 if (peptide_seq[1] == 'P')
577 mass_ladder[proline_bonus_position].intensity = 10.0;
579 if (CP.spectrum_synthesis)
580 for (unsigned int i=0; i<mass_ladder.size(); i++) {
581 // FIX: there must be a better way
582 const char peptide_index = is_XYZ ? mass_ladder.size()-i-1 : i;
583 switch (peptide_seq[peptide_index]) {
584 case 'D':
585 mass_ladder[i].intensity *= 5.0;
586 break;
587 case 'V':
588 case 'E':
589 case 'I':
590 case 'L':
591 mass_ladder[i].intensity *= 3.0;
592 break;
593 case 'N':
594 case 'Q':
595 mass_ladder[i].intensity *= 2.0;
596 break;
598 switch (peptide_seq[peptide_index+1]) {
599 case 'P':
600 mass_ladder[i].intensity *= 5.0;
601 break;
607 // Calculate peaks for a synthesized mass (not mz) ladder.
608 static inline void
609 get_synthetic_Y_mass_ladder(std::vector<peak> &mass_ladder,
610 const std::string &peptide_seq,
611 const std::vector<double> &mass_list,
612 const double C_terminal_mass,
613 const unsigned mass_regime=0) NOTHROW {
614 const parameters &CP = parameters::the;
615 assert(peptide_seq.size() == mass_list.size());
616 double m = (CP.fragment_mass_regime[mass_regime].water_mass
617 + (CP.cleavage_C_terminal_mass_change
618 - CP.fragment_mass_regime[mass_regime].hydroxyl_mass)
619 + C_terminal_mass);
621 const int ladder_size = mass_ladder.size();
622 for (int i=ladder_size-1; i>=0; i--) {
623 m += mass_list[i+1];
624 mass_ladder[ladder_size-1-i] = peak(m, 1.0);
627 synthesize_ladder_intensities(mass_ladder, peptide_seq, true);
631 // Calculate peaks for a synthesized mass (not mz) ladder.
632 static inline void
633 get_synthetic_B_mass_ladder(std::vector<peak> &mass_ladder,
634 const std::string &peptide_seq,
635 const std::vector<double> &mass_list,
636 const double N_terminal_mass,
637 const unsigned mass_regime=0) NOTHROW {
638 const parameters &CP = parameters::the;
639 double m = (0.0 + (CP.cleavage_N_terminal_mass_change - CP.hydrogen_mass)
640 + N_terminal_mass);
642 const int ladder_size = mass_ladder.size();
643 for (int i=0; i<=ladder_size-1; i++) {
644 m += mass_list[i];
645 mass_ladder[i] = peak(m, 1.0);
648 synthesize_ladder_intensities(mass_ladder, peptide_seq, false);
652 static inline void
653 synthetic_spectra(spectrum synth_sp[/* max_fragment_charge+1 */][ION_MAX],
654 const std::string &peptide_seq, const double peptide_mass,
655 const std::vector<double> &mass_list,
656 const double N_terminal_mass, const double C_terminal_mass,
657 const double max_fragment_charge) NOTHROW {
658 std::vector<peak> mass_ladder(mass_list.size()-1);
660 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
661 switch (ion_type) {
662 case ION_Y:
663 get_synthetic_Y_mass_ladder(mass_ladder, peptide_seq, mass_list,
664 C_terminal_mass);
665 break;
666 case ION_B:
667 get_synthetic_B_mass_ladder(mass_ladder, peptide_seq, mass_list,
668 N_terminal_mass);
669 break;
670 default:
671 assert(false);
674 //for (unsigned int i=0; i<mass_ladder.size(); i++)
675 // std::cerr << "ladder step " << peptide_seq[i] << " " << mass_ladder[i].mz << std::endl;
677 for (int charge=1; charge<=max_fragment_charge; charge++) {
678 spectrum &sp = synth_sp[charge][ion_type];
679 sp.mass = peptide_mass;
680 sp.charge = charge;
681 sp.peaks.resize(mass_ladder.size());
682 for (std::vector<peak>::size_type i=0; i<mass_ladder.size(); i++) {
683 sp.peaks[i].intensity = mass_ladder[i].intensity;
684 sp.peaks[i].mz = peak::get_mz(mass_ladder[i].mz, charge);
691 // Return the similarity score between this spectrum and that, and also a
692 // count of common peaks in *peak_count.
694 // xtandem quantizes mz values into bins of width fragment_mass_error first,
695 // and adds a bin on either side. This makes the effective fragment mass
696 // error for each peak an arbitrary value between 1x and 2x the parameter,
697 // based on quantization of that peak. If quirks_mode is on, we'll try to
698 // emulate this behavior.
699 static inline double
700 score_similarity_(const spectrum &x, const spectrum &y,
701 int *peak_count) NOTHROW {
702 const double frag_err = parameters::the.fragment_mass_error;
703 const bool quirks_mode = parameters::the.quirks_mode;
704 const double error_limit = quirks_mode ? frag_err*1.5 : frag_err;
706 double score = 0;
707 *peak_count = 0;
708 std::vector<peak>::const_iterator x_it = x.peaks.begin();
709 std::vector<peak>::const_iterator y_it = y.peaks.begin();
711 while (x_it != x.peaks.end() and y_it != y.peaks.end()) {
712 double x_mz = x_it->mz, y_mz = y_it->mz;
713 if (quirks_mode) {
714 const float ffe = frag_err;
715 x_mz = int(static_cast<float>(x_mz) / ffe) * ffe;
716 y_mz = int(static_cast<float>(y_mz) / ffe) * ffe;
718 // in quirks mode, must be within one frag_err-wide bin
719 const double delta = y_mz - x_mz;
720 if (std::abs(delta) < error_limit) {
721 *peak_count += 1;
722 score += (x_it->intensity * y_it->intensity);
723 //std::cerr << "hit " << x_mz << " vs " << y_mz << ", i = " << x_it->intensity * y_it->intensity << std::endl;
724 #if 0
725 // assume peaks aren't "close" together
726 x_it++;
727 y_it++;
728 continue;
729 #endif
731 if (delta > 0) // y_mz > x_mz
732 x_it++;
733 else
734 y_it++;
737 return score;
741 double
742 spectrum::score_similarity(const spectrum &x, const spectrum &y, int
743 *peak_count) {
744 return score_similarity_(x, y, peak_count);
748 // IDEA: could we combine all these spectra and just do the correlation once?
750 // Score the specified spectrum against a group of synthetic fragment
751 // spectra.
752 static inline void
753 score_spectrum(double &hyper_score, double &convolution_score,
754 std::vector<double> &ion_scores, std::vector<int> &ion_peaks,
755 spectrum synth_sp[/* max_fragment_charge+1 */][ION_MAX],
756 const int spectrum_id, const int max_fragment_charge) NOTHROW {
757 const parameters &CP = parameters::the;
758 const spectrum sp = spectrum::searchable_spectra[spectrum_id];
759 const int spectrum_charge = sp.charge;
760 hyper_score = 1.0;
761 convolution_score = 0;
763 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
764 int i_peaks = 0;
765 double i_scores = 0.0;
766 const int charge_limit = std::max<int>(1, spectrum_charge-1);
767 for (int charge=1; charge<=charge_limit; charge++) {
768 assert(charge <= max_fragment_charge);
770 // convolution score is just sum over charges/ions
771 // hyperscore is product of p! over charges/ions (where p is corr peak
772 // count) times the convolution score (clipped to FLT_MAX)
773 // > blurred!
774 int common_peak_count; // FIX!
775 double conv_score = spectrum::score_similarity(synth_sp[charge][ion_type],
776 sp, &common_peak_count);
777 i_peaks += common_peak_count;
778 i_scores += conv_score;
779 hyper_score *= CP.factorial[common_peak_count];
780 convolution_score += conv_score;
783 ion_peaks[ion_type] = i_peaks;
784 ion_scores[ion_type] = i_scores;
786 hyper_score *= convolution_score;
790 // #if 0
792 // struct generator;
795 // struct generator {
796 // // fx = &generator::generate_static_aa_mod;
798 // typedef void generator::generator_fx(score_stats &stats, const match &m,
799 // std::vector<double> position_mass,
800 // const std::vector<double> &terminal_mass,
801 // const std::vector<generator>::iterator g_it);
803 // generator() : fx(0) { }
805 // generator_fx generate_static_aa_mod;
807 // generator_fx generator::*fx;
808 // std::vector<double> args;
809 // };
812 // void
813 // generator::generate_static_aa_mod(score_stats &stats, const match &m,
814 // std::vector<double> position_mass,
815 // const std::vector<double> &terminal_mass,
816 // const std::vector<generator>::iterator g_it) {
817 // for (unsigned int i=0; i<m.peptide_sequence.size(); i++)
818 // position_mass[i] += args[m.peptide_sequence[i]];
820 // ((*g_it).*(g_it->fx))(stats, m, position_mass, terminal_mass, g_it+1);
821 // }
823 // #endif
826 // FIX: does this actually help inlining?
827 // returns log-scaled hyperscore
828 static inline double
829 scale_hyperscore_(double hyper_score) NOTHROW {
830 assert(hyper_score >= 0); // otherwise check for EDOM, ERANGE
831 if (hyper_score == 0)
832 return -DBL_MAX; // FIX: this shouldn't be reached?
833 return 4 * std::log10(hyper_score);
835 double
836 scale_hyperscore(double hyper_score) { return scale_hyperscore_(hyper_score); }
839 static inline double
840 get_peptide_mass(const std::vector<double> &mass_list,
841 const double N_terminal_mass,
842 const double C_terminal_mass) NOTHROW {
843 const parameters &CP = parameters::the;
845 double m = (N_terminal_mass + C_terminal_mass
846 + CP.cleavage_N_terminal_mass_change
847 + CP.cleavage_C_terminal_mass_change
848 + CP.proton_mass);
849 return std::accumulate(mass_list.begin(), mass_list.end(), m);
853 struct mass_trace_list {
854 mass_trace_item item;
855 const mass_trace_list *next;
857 mass_trace_list(const mass_trace_list *p=0) : next(p) { }
861 // FIX: currently only one mass_list is implemented, meaning that parent and
862 // fragment masses must be the same (e.g., mono/mono). A workaround is to
863 // just increase the parent error plus a bit (the xtandem default seems to
864 // already do this).
866 // Search for matches of this particular peptide modification variation
867 // against the spectra. Updates score_stats and returns the number of
868 // candidate spectra found.
869 static inline void
870 evaluate_peptide_mod_variation(match &m, const mass_trace_list *mtlp,
871 const std::vector<double> &mass_list,
872 const double N_terminal_mass,
873 const double C_terminal_mass,
874 score_stats &stats) NOTHROW {
875 const parameters &CP = parameters::the;
876 assert(m.peptide_sequence.size() == mass_list.size());
878 m.peptide_mass = get_peptide_mass(mass_list, N_terminal_mass,
879 C_terminal_mass);
880 double sp_mass_lb = m.peptide_mass + CP.parent_monoisotopic_mass_error_minus;
881 double sp_mass_ub = m.peptide_mass + CP.parent_monoisotopic_mass_error_plus;
883 const std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
884 candidate_spectra_info_begin
885 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
886 if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
887 stats.all_spectra_masses_too_high = false;
888 return; // spectrum masses all too low to match peptide
890 stats.all_spectra_masses_too_low = false;
892 const std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
893 candidate_spectra_info_end
894 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
895 if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
896 return; // spectrum masses all too high to match peptide
898 stats.all_spectra_masses_too_high = false;
900 if (candidate_spectra_info_begin == candidate_spectra_info_end)
901 return; // no spectrum with close-enough parent mass
903 int max_candidate_charge = 0;
904 for (std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
905 it=candidate_spectra_info_begin;
906 it != candidate_spectra_info_end; it++)
907 max_candidate_charge
908 = std::max<int>(max_candidate_charge,
909 spectrum::searchable_spectra[it->second].charge);
910 assert(max_candidate_charge >= 1);
912 const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
913 assert(max_fragment_charge <= spectrum::max_supported_charge);
914 // e.g. +2 -> 'B' -> spectrum
915 // FIX: should this be a std::vector??
916 static spectrum synth_sp[spectrum::max_supported_charge+1][ION_MAX];
917 synthetic_spectra(synth_sp, m.peptide_sequence, m.peptide_mass, mass_list,
918 N_terminal_mass, C_terminal_mass, max_fragment_charge);
920 for (std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
921 candidate_it = candidate_spectra_info_begin;
922 candidate_it != candidate_spectra_info_end; candidate_it++) {
923 m.spectrum_index = candidate_it->second;
924 stats.candidate_spectrum_count++;
925 score_spectrum(m.hyper_score, m.convolution_score, m.ion_scores,
926 m.ion_peaks, synth_sp, m.spectrum_index,
927 max_fragment_charge);
928 if (m.convolution_score <= 2.0)
929 continue;
931 //std::cerr << "histod: " << m.spectrum_index << " " << m.peptide_sequence
932 //<< " " << m.peptide_mass << " " <<
933 //spectrum::searchable_spectra[m.spectrum_index].mass << std::endl;
935 // update spectrum histograms
936 std::vector<int> &hh = stats.hyperscore_histogram[m.spectrum_index];
937 int binned_scaled_hyper_score = int(0.5 + scale_hyperscore_(m.hyper_score));
938 assert(binned_scaled_hyper_score >= 0);
939 // FIX
940 if (static_cast<int>(hh.size())-1 < binned_scaled_hyper_score) {
941 assert(hh.size() < INT_MAX/2);
942 hh.resize(std::max<unsigned int>(binned_scaled_hyper_score+1,
943 2*hh.size()));
945 hh[binned_scaled_hyper_score] += 1;
946 // incr m_tPeptideScoredCount
947 // nyi: permute stuff
948 // FIX
949 const int sp_ion_count = m.ion_peaks[ION_B] + m.ion_peaks[ION_Y];
950 if (sp_ion_count < CP.minimum_ion_count)
951 continue;
952 const bool has_b_and_y = m.ion_peaks[ION_B] and m.ion_peaks[ION_Y];
953 if (not has_b_and_y)
954 continue;
956 // check that parent masses are within error range (isotope ni)
957 // already done above (why does xtandem do it here?)
958 // if check fails, only eligible for 2nd-best record
960 // Remember all of the highest-hyper-scoring matches against each
961 // spectrum. These might be in multiple domains (pos, length, mods) in
962 // multiple proteins.
964 // To avoid the problems that xtandem has with inexact float-point
965 // calculations, we consider hyperscores within the "epsilon ratio" to
966 // be "equal". The best_match vector will need to be re-screened
967 // against this ratio, because it may finally contain entries which
968 // don't meet our criterion. (We don't take the time to get it exactly
969 // right here, because we will often end up discarding it all anyway.)
971 const double current_ratio = (m.hyper_score
972 / stats.best_score[m.spectrum_index]);
974 if (CP.quirks_mode and (m.hyper_score < stats.best_score[m.spectrum_index])
975 or (not CP.quirks_mode
976 and (current_ratio < CP.hyper_score_epsilon_ratio))) {
977 // This isn't a best score, so see if it's a second-best score.
978 if (m.hyper_score > stats.second_best_score[m.spectrum_index])
979 stats.second_best_score[m.spectrum_index] = m.hyper_score;
980 continue;
982 if (CP.quirks_mode and (m.hyper_score > stats.best_score[m.spectrum_index])
983 or (not CP.quirks_mode
984 and (current_ratio > (1/CP.hyper_score_epsilon_ratio)))) {
985 // This score is significantly better, so forget previous matches.
986 stats.best_score[m.spectrum_index] = m.hyper_score;
987 stats.best_match[m.spectrum_index].clear();
989 // Now remember this match because it's at least as good as those
990 // previously seen.
991 m.mass_trace.clear();
992 for (const mass_trace_list *p=mtlp; p; p=p->next)
993 m.mass_trace.push_back(p->item);
994 stats.best_match[m.spectrum_index].push_back(m);
999 // The choose_* functions generate the different modification possibilities.
1001 // IDEA FIX: Use a cost parameter to define the iterative search front.
1004 // These values are used to keep track of the choices made so far (because
1005 // later choices may depend on earlier ones).
1006 enum chosen { CHOSEN_NONE, CHOSEN_PCA };
1009 // Choose a possible residue modification.
1010 static inline void
1011 choose_residue_mod(match &m,
1012 const std::vector< std::vector<double> > &pmm,
1013 const mass_trace_list *mtlp,
1014 std::vector<double> &mass_list,
1015 const double N_terminal_mass,
1016 const double C_terminal_mass, score_stats &stats,
1017 const unsigned remaining_residues_to_choose,
1018 const unsigned number_of_positions_to_consider,
1019 const int *const mod_positions_to_consider) NOTHROW {
1020 const parameters &CP = parameters::the;
1021 assert(remaining_residues_to_choose <= number_of_positions_to_consider);
1023 if (CP.quirks_mode and stats.combinations_searched >= (1 << 12))
1024 return;
1026 if (remaining_residues_to_choose == 0) {
1027 if (CP.quirks_mode)
1028 stats.combinations_searched++;
1030 evaluate_peptide_mod_variation(m, mtlp, mass_list, N_terminal_mass,
1031 C_terminal_mass, stats);
1032 } else {
1033 mass_trace_list mtl(mtlp);
1035 // consider all of the positions where we could next add a mod
1036 for (unsigned i=0;
1037 i<number_of_positions_to_consider-remaining_residues_to_choose+1;
1038 i++) {
1039 const int pos=mod_positions_to_consider[i];
1040 mtl.item.position = pos;
1041 assert(pos >= 0);
1042 const double save_mass=mass_list[pos];
1043 const std::vector<double> &deltas = pmm[m.peptide_sequence[pos]];
1044 // step through the deltas for this amino acid
1045 for (std::vector<double>::const_iterator it=deltas.begin();
1046 it != deltas.end(); it++) {
1047 mtl.item.delta = *it;
1048 mtl.item.description = "position mod"; // FIX
1049 mass_list[pos] = save_mass + *it;
1050 choose_residue_mod(m, pmm, &mtl, mass_list, N_terminal_mass,
1051 C_terminal_mass, stats,
1052 remaining_residues_to_choose-1,
1053 number_of_positions_to_consider-i-1,
1054 mod_positions_to_consider+i+1);
1056 mass_list[pos] = save_mass;
1062 // Choose the number of modified residue positions. Start with zero and count
1063 // up to the maximum number (which is not greater than the length of the
1064 // peptide).
1065 static inline void
1066 choose_residue_mod_count(match &m,
1067 const std::vector< std::vector<double> > &pmm,
1068 const mass_trace_list *mtlp,
1069 std::vector<double> &mass_list,
1070 const double N_terminal_mass,
1071 const double C_terminal_mass,
1072 score_stats &stats) NOTHROW {
1073 const parameters &CP = parameters::the;
1074 int mod_positions[m.peptide_sequence.size()+1];
1075 unsigned max_count=0;
1076 for (unsigned i=0; i<m.peptide_sequence.size(); i++)
1077 if (not pmm[m.peptide_sequence[i]].empty())
1078 mod_positions[max_count++] = i;
1079 mod_positions[max_count] = -1;
1081 if (CP.quirks_mode)
1082 stats.combinations_searched = 0;
1084 // start at one because the zero case is handled by
1085 // choose_potential_mod_alternative below (!)
1086 for (unsigned count=1; count <= max_count; count++)
1087 choose_residue_mod(m, pmm, mtlp, mass_list, N_terminal_mass,
1088 C_terminal_mass, stats, count, max_count,
1089 mod_positions);
1093 // FIX: Is there some clear and efficient way to treat terminal mods in the
1094 // same way as residue mods? Or, alternatively, is it possible to gracefully
1095 // merge the two choose_?_terminal_mod functions?
1097 // Choose a possible peptide C-terminal modification.
1098 static inline void
1099 choose_C_terminal_mod(match &m,
1100 const std::vector< std::vector<double> > &pmm,
1101 const mass_trace_list *mtlp,
1102 std::vector<double> &mass_list,
1103 const double N_terminal_mass,
1104 const double C_terminal_mass,
1105 score_stats &stats) NOTHROW {
1106 choose_residue_mod_count(m, pmm, mtlp, mass_list, N_terminal_mass,
1107 C_terminal_mass, stats);
1109 mass_trace_list mtl(mtlp);
1110 mtl.item.position = POSITION_CTERM;
1111 const std::vector<double> &C_deltas = pmm[']'];
1113 for (std::vector<double>::const_iterator it=C_deltas.begin();
1114 it != C_deltas.end(); it++) {
1115 mtl.item.delta = *it;
1116 mtl.item.description = "C-terminal mod"; // FIX
1117 choose_residue_mod_count(m, pmm, &mtl, mass_list, N_terminal_mass,
1118 C_terminal_mass+*it, stats);
1123 // Choose a possible peptide N-terminal modification, unless a PCA mod has
1124 // already been chosen.
1125 static inline void
1126 choose_N_terminal_mod(match &m, chosen ch,
1127 const std::vector< std::vector<double> > &pmm,
1128 const mass_trace_list *mtlp,
1129 std::vector<double> &mass_list,
1130 const double N_terminal_mass,
1131 const double C_terminal_mass,
1132 score_stats &stats) NOTHROW {
1133 choose_C_terminal_mod(m, pmm, mtlp, mass_list, N_terminal_mass,
1134 C_terminal_mass, stats);
1136 if (ch == CHOSEN_PCA)
1137 return;
1138 mass_trace_list mtl(mtlp);
1139 mtl.item.position = POSITION_NTERM;
1140 const std::vector<double> &N_deltas = pmm['['];
1142 for (std::vector<double>::const_iterator it=N_deltas.begin();
1143 it != N_deltas.end(); it++) {
1144 mtl.item.delta = *it;
1145 mtl.item.description = "N-terminal mod"; // FIX
1146 choose_C_terminal_mod(m, pmm, &mtl, mass_list, N_terminal_mass+*it,
1147 C_terminal_mass, stats);
1152 // Choose none or one of the (';'-separated) potential modification
1153 // alternative sets. In the 'none' case, no explicitly requested potential
1154 // mods will be chosen, but PCA mods (above) will still be searched. If an
1155 // alternative set is chosen, at least one explicit mod must be chosen, to
1156 // avoid overlap with the 'none' case.
1157 static inline void
1158 choose_potential_mod_alternative(match &m, chosen ch,
1159 const mass_trace_list *mtlp,
1160 std::vector<double> &mass_list,
1161 const double N_terminal_mass,
1162 const double C_terminal_mass,
1163 score_stats &stats) NOTHROW {
1164 const parameters &CP = parameters::the;
1166 // the 'none' case--jump over the rest of the choose_*, since in this case
1167 // they choose nothing
1168 evaluate_peptide_mod_variation(m, mtlp, mass_list, N_terminal_mass,
1169 C_terminal_mass, stats);
1171 const mass_regime_parameters &mrp = CP.fragment_mass_regime[m.mass_regime];
1172 const std::vector< std::vector< std::vector<double> > > &alts
1173 = mrp.potential_modification_mass;
1175 for (std::vector< std::vector< std::vector<double> > >::const_iterator
1176 it=alts.begin();
1177 it != alts.end(); it++)
1178 choose_N_terminal_mod(m, ch, *it, mtlp, mass_list, N_terminal_mass,
1179 C_terminal_mass, stats);
1183 // Choose a possible modification to account for PCA (pyrrolidone carboxyl
1184 // acid) circularization of the peptide N-terminal. PCA mods are excluded if
1185 // a static N-terminal mod has been specified. Likewise, choosing a PCA mod
1186 // will exclude choosing a potential N-terminal mod. (The PCA mod for 'C' is
1187 // dependent on a static mod of C+57 being in effect.)
1188 static inline void
1189 choose_PCA_mod(match &m, const mass_trace_list *mtlp,
1190 std::vector<double> &mass_list,
1191 const double N_terminal_mass,
1192 const double C_terminal_mass, score_stats &stats) NOTHROW {
1193 const parameters &CP = parameters::the;
1195 choose_potential_mod_alternative(m, CHOSEN_NONE, mtlp, mass_list,
1196 N_terminal_mass, C_terminal_mass, stats);
1198 mass_trace_list mtl(mtlp);
1199 mtl.item.position = POSITION_NTERM;
1200 const mass_regime_parameters &mrp = CP.fragment_mass_regime[m.mass_regime];
1202 // now try a possible PCA mod, if there is one
1203 // FIX: somehow xtandem generates these twice??
1204 if (mrp.modification_mass['['] == 0)
1205 switch (m.peptide_sequence[0]) {
1206 case 'E':
1207 mtl.item.delta = -mrp.water_mass;
1208 mtl.item.description = "PCA";
1209 choose_potential_mod_alternative(m, CHOSEN_PCA, &mtl, mass_list,
1210 N_terminal_mass - mrp.water_mass,
1211 C_terminal_mass, stats);
1212 break;
1213 case 'C': {
1214 // FIX: need better test for C+57? (symbolic?)
1215 const double C_mod = mrp.modification_mass['C'];
1216 if (CP.quirks_mode ? int(C_mod) != 57 : std::abs(C_mod - 57) > 0.5)
1217 break; // skip unless C+57
1218 // else fall through...
1220 case 'Q':
1221 mtl.item.delta = -mrp.ammonia_mass;
1222 mtl.item.description = "PCA";
1223 choose_potential_mod_alternative(m, CHOSEN_PCA, &mtl, mass_list,
1224 N_terminal_mass - mrp.ammonia_mass,
1225 C_terminal_mass, stats);
1226 break;
1231 // Choose among the requested mass regimes (e.g., isotopes), and also choose
1232 // the static mods (including N- and C-terminal mods), all of which are
1233 // completely determined by the mass regime choice.
1234 static inline void
1235 choose_mass_regime(match &m, std::vector<double> &mass_list,
1236 const double N_terminal_mass, const double C_terminal_mass,
1237 score_stats &stats) NOTHROW {
1238 const parameters &CP = parameters::the;
1240 m.mass_regime = 0; // FIX
1241 const mass_regime_parameters &mrp = CP.fragment_mass_regime[m.mass_regime];
1242 for (std::vector<double>::size_type i=0; i<m.peptide_sequence.size(); i++) {
1243 const char res = m.peptide_sequence[i];
1244 mass_list[i] = mrp.residue_mass[res] + mrp.modification_mass[res];
1246 choose_PCA_mod(m, NULL, mass_list,
1247 N_terminal_mass + mrp.modification_mass['['],
1248 C_terminal_mass + mrp.modification_mass[']'], stats);
1252 // Search for matches of all modification variations of this peptide against
1253 // the spectra. Updates score_stats and the number of candidate spectra
1254 // found.
1255 void
1256 spectrum::search_peptide_all_mods(int idno, int offset, int begin,
1257 const std::string &peptide_seq,
1258 int missed_cleavage_count,
1259 score_stats &stats) {
1260 std::vector<double> mass_list(peptide_seq.size());
1261 double N_terminal_mass = 0.0;
1262 double C_terminal_mass = 0.0;
1264 // This match will be passed inward and used to record information that we
1265 // need to remember about a match when we finally see one. At that point, a
1266 // copy of this match will be saved.
1267 match m;
1268 m.sequence_index = idno;
1269 m.sequence_offset = offset;
1270 m.peptide_begin = begin;
1271 m.peptide_sequence = peptide_seq;
1272 m.missed_cleavage_count = missed_cleavage_count;
1274 choose_mass_regime(m, mass_list, N_terminal_mass, C_terminal_mass, stats);
1278 // Search for matches of all modification variations of peptides in this
1279 // sequence run against the spectra. Updates score_stats and the number of
1280 // candidate spectra found.
1281 // FIX: optimize the non-specific case?
1282 void
1283 spectrum::search_run_all_mods(const int maximum_missed_cleavage_sites,
1284 const int min_peptide_length,
1285 const bool no_N_term_mods,
1286 const int idno, const int offset,
1287 const std::string &run_sequence,
1288 const std::vector<int> cleavage_points,
1289 score_stats &stats) {
1290 double N_terminal_mass = 0.0;
1291 double C_terminal_mass = 0.0;
1293 // This match will be passed inward and used to record information that we
1294 // need to remember about a match when we finally see one. At that point, a
1295 // copy of this match will be saved.
1296 match m;
1297 m.sequence_index = idno;
1298 m.sequence_offset = offset;
1300 // FIX: examine carefully for signed/unsigned problems
1302 // The rightmost 'end' seen where 'all_spectra_masses_too_high' was true.
1303 // (0 means none yet encountered.)
1304 unsigned int next_end = 0;
1306 for (unsigned int begin=0; begin<cleavage_points.size()-1; begin++) {
1307 const int begin_index = m.peptide_begin = cleavage_points[begin];
1308 unsigned int end = begin + 1;
1309 if (no_N_term_mods)
1310 if (next_end != 0)
1311 end = std::max<unsigned int>(end, next_end);
1312 for (; end<cleavage_points.size(); end++) {
1313 m.missed_cleavage_count = end - begin - 1;
1314 if (m.missed_cleavage_count > maximum_missed_cleavage_sites)
1315 break;
1317 const int end_index = cleavage_points[end];
1318 const int peptide_size = end_index - begin_index;
1319 if (peptide_size < min_peptide_length)
1320 continue;
1321 // these start vacuously true, then to be falsified if possible
1322 stats.all_spectra_masses_too_high = true;
1323 stats.all_spectra_masses_too_low = true;
1324 m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
1325 //std::cerr << "peptide: " << m.peptide_sequence << std::endl;
1326 std::vector<double> mass_list(peptide_size);
1327 choose_mass_regime(m, mass_list, N_terminal_mass, C_terminal_mass,
1328 stats);
1330 assert(not (stats.all_spectra_masses_too_high
1331 and stats.all_spectra_masses_too_low));
1332 if (stats.all_spectra_masses_too_high) {
1333 //std::cerr << "all_spectra_masses_too_high" << std::endl;
1334 if (end == cleavage_points.size() - 1)
1335 return;
1336 next_end = end;
1338 if (stats.all_spectra_masses_too_low) {
1339 //std::cerr << "all_spectra_masses_too_low" << std::endl;
1340 break;