checkpoint
[greylag.git] / cgreylag.cpp
blob2727bc3e1ab354c35283d2051c4b42e7d363e665
1 // C++ module for greylag
3 // $Id$
5 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
6 //
7 // This program is free software; you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation; either version 2 of the License, or
10 // (at your option) any later version.
12 // This program is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
17 // You should have received a copy of the GNU General Public License along
18 // with this program; if not, write to the Free Software Foundation, Inc.,
19 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 #include "cgreylag.hpp"
25 #include <cerrno>
26 #include <cfloat>
27 #include <climits>
28 #include <cstdlib>
29 #include <cmath>
30 #include <numeric>
31 #include <set>
32 #include <stdexcept>
34 #include <iostream>
37 #ifdef __GNU__
38 #define NOTHROW __attribute__ ((nothrow))
39 // use faster, non-threadsafe versions, since we don't use threads
40 #define fgetsX fgets_unlocked
41 #define fputsX fputs_unlocked
42 #define fprintfX __builtin_fprintf_unlocked
43 #define ferrorX ferror_unlocked
44 #else
45 #define NOTHROW
46 #define fgetsX std::fgets
47 #define fputsX std::fputs
48 #define fprintfX std::fprintf
49 #define ferrorX std::ferror
50 #endif
53 parameters parameters::the;
56 char *
57 peak::__repr__() const {
58 static char temp[128];
59 sprintf(temp, "<peak mz=%.4f intensity=%.4f>", mz, intensity);
60 return &temp[0];
63 const int FIRST_SPECTRUM_ID = 1;
64 int spectrum::next_id = FIRST_SPECTRUM_ID;
65 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
67 std::vector<spectrum> spectrum::searchable_spectra;
68 // parent mass -> searchable_spectra index
69 std::multimap<double,
70 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
73 char *
74 spectrum::__repr__() const {
75 static char temp[1024];
76 sprintf(temp,
77 "<spectrum #%d (phys #%d) '%s' %.4f/%+d"
78 " [%d peaks, maxI=%f, sumI=%f]>",
79 id, physical_id, name.c_str(), mass, charge, peaks.size(),
80 max_peak_intensity, sum_peak_intensity);
81 return &temp[0];
86 // This is an error exit for read_spectra and filter_ms2_by_mass.
87 static inline void
88 io_error(FILE *f, const char *message="") {
89 if (ferrorX(f))
90 message = "I/O error while reading (or writing) spectrum file";
91 throw std::ios_base::failure(message);
95 // Return sorted list of parent masses present in the given ms2 files.
96 std::vector<double>
97 spectrum::read_ms2_spectrum_masses(std::vector<int> fds) {
98 std::vector<double> masses;
100 for (std::vector<int>::const_iterator fdit=fds.begin(); fdit != fds.end();
101 fdit++) {
102 FILE *f = fdopen(*fdit, "r");
104 const int bufsiz = 1024;
105 char buf[bufsiz];
106 char *endp;
107 char *result = fgetsX(buf, bufsiz, f);
108 while (true) {
109 // read headers
110 while (true) {
111 if (ferrorX(f))
112 io_error(f);
113 if (not result or buf[0] != ':')
114 break;
115 if (not fgetsX(buf, bufsiz, f))
116 io_error(f, "bad ms2 format: mass/charge line expected");
117 errno = 0;
118 masses.push_back(std::strtod(buf, &endp)); // need double accuracy here
119 if (errno or endp == buf)
120 io_error(f, "bad ms2 format: bad mass");
121 result = fgetsX(buf, bufsiz, f);
123 if (not result)
124 break;
125 // read peaks
126 std::vector<peak> peaks;
127 while (true) {
128 result = fgetsX(buf, bufsiz, f);
129 if (ferrorX(f))
130 io_error(f);
131 if (not result or buf[0] == ':')
132 break;
136 std::sort(masses.begin(), masses.end());
137 return masses;
141 // Read spectra from file in ms2 format, tagging them with file_id. Spectra
142 // with charge zero are omitted from the result. (All of them are read,
143 // though, to keep spectrum ids in sync.) If file_id == -1, the ms2 file is
144 // an annotated file produced by split_ms2_by_mass_band.
146 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
147 // having the same physical id. The peak list is initially sorted by mz.
148 // Throws an exception on invalid input.
150 std::vector<spectrum>
151 spectrum::read_spectra_from_ms2(FILE *f, const int file_id) {
152 std::vector<spectrum> spectra;
154 const int bufsiz = 1024;
155 char buf[bufsiz];
156 char *endp;
158 char *result = fgetsX(buf, bufsiz, f);
159 while (true) {
160 std::vector<std::string> names;
161 std::vector<double> masses;
162 std::vector<int> charges;
163 // for annotated ms2
164 std::vector<int> file_ids;
165 std::vector<int> physical_ids;
166 std::vector<int> ids;
168 // read headers
169 while (true) {
170 if (ferrorX(f))
171 io_error(f);
172 if (not result or buf[0] != ':')
173 break;
174 if (file_id != -1)
175 names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
176 else {
177 char *anno_hash = std::strrchr(buf, '#'); // const, but C++ doesn't
178 // like it declared thus--yuck
179 if (not anno_hash)
180 io_error(f, "bad ms2+ format: '#' not found");
181 names.push_back(std::string(buf+1, anno_hash));
182 errno = 0;
183 file_ids.push_back(std::strtol(anno_hash+1, &endp, 10));
184 physical_ids.push_back(std::strtol(endp+1, &endp, 10));
185 ids.push_back(std::strtol(endp+1, &endp, 10));
186 if (errno) // FIX: improve error check
187 io_error(f, "bad ms2+ format: bad values");
189 if (not fgetsX(buf, bufsiz, f))
190 io_error(f, "bad ms2 format: mass/charge line expected");
191 errno = 0;
192 masses.push_back(std::strtod(buf, &endp)); // need double accuracy here
193 if (errno or endp == buf)
194 io_error(f, "bad ms2 format: bad mass");
195 const char *endp0 = endp;
196 charges.push_back(std::strtol(endp0, &endp, 10));
197 if (errno or endp == endp0)
198 io_error(f, "bad ms2 format: bad charge");
199 result = fgetsX(buf, bufsiz, f);
201 if (not result)
202 break;
203 // read peaks
204 std::vector<peak> peaks;
205 while (true) {
206 peak p;
207 errno = 0;
208 p.mz = strtod(buf, &endp);
209 if (errno or endp == buf)
210 io_error(f, "bad ms2 format: bad peak mz");
211 const char *endp0 = endp;
212 p.intensity = strtod(endp0, &endp);
213 if (errno or endp == endp0)
214 io_error(f, "bad ms2 format: bad peak intensity");
215 peaks.push_back(p);
217 result = fgetsX(buf, bufsiz, f);
218 if (ferrorX(f))
219 io_error(f);
220 if (not result or buf[0] == ':')
221 break;
224 // add spectra to vector
225 if (names.empty() and not peaks.empty())
226 io_error(f, "bad ms2 format: missing header lines?");
228 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
230 for (std::vector<double>::size_type i=0; i<names.size(); i++) {
231 spectrum sp(masses[i], charges[i]);
232 sp.peaks = peaks;
233 sp.name = names[i];
234 if (file_id != -1) {
235 sp.file_id = file_id;
236 sp.physical_id = next_physical_id;
237 } else {
238 sp.file_id = file_ids[i];
239 sp.physical_id = physical_ids[i];
240 sp.id = ids[i];
241 spectrum::next_id = std::max(spectrum::next_id, sp.id+1);
242 spectrum::next_physical_id = std::max(spectrum::next_physical_id,
243 sp.physical_id+1);
245 sp.calculate_intensity_statistics();
246 spectra.push_back(sp);
248 if (file_id != -1)
249 spectrum::next_physical_id++;
251 return spectra;
255 // Copy spectra from an ms2 file to a set of output ms2 files, one for each
256 // band in the set of mass bands described by their upper bounds. Extra
257 // information is written in the first header line of each spectra so that
258 // id's may be recovered. So, for example, ':0002.0002.1' might become
259 // ':0002.0002.1 # 0 45 78', where 0, 45, 78 are the spectrum's file_id,
260 // physical_id, and id, respectively. Multiply charged spectra will be
261 // split into separate spectra (having the same physical_id).
263 // FIX: cleaner to just pass in a vector of filenames, rather than fds?
264 void
265 spectrum::split_ms2_by_mass_band(FILE *inf, const std::vector<int> &outfds,
266 const int file_id,
267 const std::vector<double> &mass_band_upper_bounds) {
268 std::map<double, FILE *> mass_file_index;
270 if (outfds.size() < 1
271 or outfds.size() != mass_band_upper_bounds.size())
272 throw std::invalid_argument("invalid argument value");
274 for (std::vector<int>::size_type i=0; i<outfds.size(); i++) {
275 FILE *f = fdopen(outfds[i], "w");
276 if (not f)
277 throw std::ios_base::failure("fdopen failed: fd limit exceeded?");
278 mass_file_index.insert(std::make_pair(mass_band_upper_bounds[i], f));
281 int o_next_id = FIRST_SPECTRUM_ID;
282 int o_next_physical_id = FIRST_SPECTRUM_ID;
284 const int bufsiz = 1024;
285 char buf0[bufsiz], buf1[bufsiz];
286 char *endp;
287 char *result = fgetsX(buf0, bufsiz, inf);
288 while (true) {
289 double mass;
290 std::set<FILE *> sp_files;
292 // copy headers
293 while (true) {
294 if (ferrorX(inf))
295 io_error(inf);
296 if (not result or buf0[0] != ':')
297 break;
298 if (not fgetsX(buf1, bufsiz, inf))
299 io_error(inf, "bad ms2 format: mass/charge line expected");
300 errno = 0;
301 mass = std::strtod(buf1, &endp); // need double accuracy here
302 if (errno or endp == buf1)
303 io_error(inf, "bad ms2 format: bad mass");
304 errno = 0;
305 std::map<double, FILE *>::const_iterator bit
306 = mass_file_index.lower_bound(mass);
307 if (bit == mass_file_index.end())
308 throw std::invalid_argument("internal error: mass out of range");
309 FILE *bandf = bit->second;
310 sp_files.insert(bandf);
311 assert(buf0[strlen(buf0)-1] == '\n');
312 buf0[strlen(buf0)-1] = 0; // chop newline
313 fprintfX(bandf, "%s # %d %d %d\n", buf0, file_id, o_next_physical_id,
314 o_next_id++);
315 fputsX(buf1, bandf);
316 if (errno)
317 io_error(bandf, "error writing ms2 file");
318 result = fgetsX(buf0, bufsiz, inf);
320 if (not result)
321 break;
322 if (sp_files.empty())
323 io_error(inf, "bad ms2 format: missing header lines?");
324 // copy peaks
325 while (true) {
326 peak p;
327 errno = 0;
328 for (std::set<FILE *>::const_iterator fit = sp_files.begin();
329 fit != sp_files.end(); fit++) {
330 fputsX(buf0, *fit);
331 if (errno)
332 io_error(*fit, "error writing ms2 file");
334 result = fgetsX(buf0, bufsiz, inf);
335 if (ferrorX(inf))
336 io_error(inf);
337 if (not result or buf0[0] == ':')
338 break;
340 o_next_physical_id++;
343 // flush all output files (is this necessary?)
344 for (std::map<double, FILE *>::const_iterator it = mass_file_index.begin();
345 it != mass_file_index.end(); it++)
346 std::fflush(it->second);
350 // For each *non-overlapping* group of peaks with mz within an interval of
351 // (less than) length 'limit', keep only the most intense peak. (The
352 // algorithm works from the right, which matter here.)
354 // FIX: Is this really the best approach? The obvious alternative would be to
355 // eliminate peaks so that no interval of length 'limit' has more than one
356 // peak (eliminating weak peaks to make this so).
358 static inline void
359 remove_close_peaks(std::vector<peak> &peaks, double limit) {
360 for (std::vector<peak>::size_type i=0; i+1<peaks.size(); i++) {
361 double ub_mz = peaks[i].mz + limit;
362 while (i+1 < peaks.size() and peaks[i+1].mz < ub_mz)
363 peaks.erase(peaks.begin()
364 + (peaks[i+1].intensity > peaks[i].intensity
365 ? i : i+1));
370 // Examine this spectrum to see if it should be kept. If so, return true and
371 // sort the peaks by mz, normalize them and limit their number.
372 bool
373 spectrum::filter_and_normalize(double minimum_fragment_mz,
374 double dynamic_range, int minimum_peaks,
375 int total_peaks) {
376 // "spectrum, maximum parent charge", noise suppression check,
377 // "spectrum, minimum parent m+h" not implemented
379 if (minimum_fragment_mz < 0 or dynamic_range <= 0 or minimum_peaks < 0
380 or total_peaks < 0)
381 throw std::invalid_argument("invalid argument value");
383 // remove_isotopes
384 // >>> removes multiple entries within 0.95 Da of each other, retaining the
385 // highest value. this is necessary because of the behavior of some peak
386 // finding routines in commercial software
388 // peaks already sorted by mz upon read
389 remove_close_peaks(peaks, 0.95);
391 // remove parent
392 // >>> set up m/z regions to ignore: those immediately below the m/z of the
393 // parent ion which will contain uninformative neutral loss ions, and those
394 // immediately above the parent ion m/z, which will contain the parent ion
395 // and its isotope pattern
397 const double parent_mz = 1.00727 + (mass - 1.00727) / charge;
398 for (std::vector<peak>::size_type i=0; i<peaks.size();)
399 if (parent_mz >= peaks[i].mz
400 ? std::abs(parent_mz - peaks[i].mz) < (50.0 / charge)
401 : std::abs(parent_mz - peaks[i].mz) < (5.0 / charge))
402 peaks.erase(peaks.begin() + i);
403 else
404 i++;
407 // remove_low_masses
408 std::vector<peak>::iterator pi;
409 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= minimum_fragment_mz;
410 pi++);
411 peaks.erase(peaks.begin(), pi);
414 // normalize spectrum
415 // >>> use the dynamic range parameter to set the maximum intensity value
416 // for the spectrum. then remove all peaks with a normalized intensity < 1
417 if (peaks.empty())
418 return false;
419 std::vector<peak>::iterator most_intense_peak
420 = std::max_element(peaks.begin(), peaks.end(), peak::less_intensity);
421 normalization_factor
422 = std::max<double>(1.0, most_intense_peak->intensity) / dynamic_range;
423 for (std::vector<peak>::iterator p=peaks.begin(); p != peaks.end(); p++)
424 p->intensity /= normalization_factor;
425 for (std::vector<peak>::size_type i=0; i<peaks.size();)
426 if (peaks[i].intensity < 1.0)
427 peaks.erase(peaks.begin() + i);
428 else
429 i++;
431 if (peaks.size() < (unsigned int) minimum_peaks)
432 return false;
434 // # check is_noise (NYI)
435 // # * is_noise attempts to determine if the spectrum is simply
436 // # noise. if the spectrum
437 // # * does not have any peaks within a window near the parent ion mass,
438 // # it is considered
439 // # * noise.
440 // #limit = sp.mass / sp.charge
441 // #if sp.charge < 3:
442 // # limit = sp.mass - 600.0
443 // #if not [ True for mz, i in sp.peaks if mz > limit ]:
444 // # return "looks like noise"
446 // clean_isotopes removes peaks that are probably C13 isotopes
447 remove_close_peaks(peaks, 1.5);
449 // keep only the most intense peaks
450 if (peaks.size() > (unsigned int) total_peaks) {
451 std::sort(peaks.begin(), peaks.end(), peak::less_intensity);
452 peaks.erase(peaks.begin(), peaks.end() - total_peaks);
453 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
456 // update statistics, now that we've removed some peaks
457 calculate_intensity_statistics();
459 return true;
463 // Store the list of spectra that search_peptide will search against, and also
464 // build spectrum_mass_index.
465 void
466 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
467 searchable_spectra = spectra;
468 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
469 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
473 // Multiply peak intensities by residue-dependent factors to generate a more
474 // realistic spectrum. If is_XYZ, walk through the peptide sequence in
475 // reverse.
476 static inline void
477 synthesize_ladder_intensities(std::vector<peak> &mass_ladder,
478 const std::string &peptide_seq,
479 const bool is_XYZ) NOTHROW {
480 const parameters &CP = parameters::the;
482 assert(mass_ladder.size() == peptide_seq.size() - 1);
483 assert(mass_ladder.size() >= 2);
485 // An intensity boost is given for the second peptide bond/peak (*) if the
486 // N-terminal residue is 'P':
487 // 0 * 2 3 (peaks)
488 // A P C D E
489 const int proline_bonus_position = is_XYZ ? mass_ladder.size()-2 : 1;
490 mass_ladder[proline_bonus_position].intensity = 3.0;
491 if (peptide_seq[1] == 'P')
492 mass_ladder[proline_bonus_position].intensity = 10.0;
494 if (CP.spectrum_synthesis)
495 for (unsigned int i=0; i<mass_ladder.size(); i++) {
496 // FIX: there must be a better way
497 const char peptide_index = is_XYZ ? mass_ladder.size()-i-1 : i;
498 switch (peptide_seq[peptide_index]) {
499 case 'D':
500 mass_ladder[i].intensity *= 5.0;
501 break;
502 case 'V':
503 case 'E':
504 case 'I':
505 case 'L':
506 mass_ladder[i].intensity *= 3.0;
507 break;
508 case 'N':
509 case 'Q':
510 mass_ladder[i].intensity *= 2.0;
511 break;
513 switch (peptide_seq[peptide_index+1]) {
514 case 'P':
515 mass_ladder[i].intensity *= 5.0;
516 break;
522 // Calculate peaks for a synthesized mass (not mz) ladder.
523 static inline void
524 get_synthetic_Y_mass_ladder(std::vector<peak> &mass_ladder,
525 const std::string &peptide_seq,
526 const std::vector<double> &mass_list,
527 const double C_terminal_mass,
528 const unsigned mass_regime=0) NOTHROW {
529 const parameters &CP = parameters::the;
530 assert(peptide_seq.size() == mass_list.size());
531 double m = (CP.fragment_mass_regime[mass_regime].water_mass
532 + (CP.cleavage_C_terminal_mass_change
533 - CP.fragment_mass_regime[mass_regime].hydroxyl_mass)
534 + C_terminal_mass);
536 const int ladder_size = mass_ladder.size();
537 for (int i=ladder_size-1; i>=0; i--) {
538 m += mass_list[i+1];
539 mass_ladder[ladder_size-1-i] = peak(m, 1.0);
542 synthesize_ladder_intensities(mass_ladder, peptide_seq, true);
546 // Calculate peaks for a synthesized mass (not mz) ladder.
547 static inline void
548 get_synthetic_B_mass_ladder(std::vector<peak> &mass_ladder,
549 const std::string &peptide_seq,
550 const std::vector<double> &mass_list,
551 const double N_terminal_mass,
552 const unsigned mass_regime=0) NOTHROW {
553 const parameters &CP = parameters::the;
554 double m = (0.0 + (CP.cleavage_N_terminal_mass_change - CP.hydrogen_mass)
555 + N_terminal_mass);
557 const int ladder_size = mass_ladder.size();
558 for (int i=0; i<=ladder_size-1; i++) {
559 m += mass_list[i];
560 mass_ladder[i] = peak(m, 1.0);
563 synthesize_ladder_intensities(mass_ladder, peptide_seq, false);
567 static inline void
568 synthetic_spectra(spectrum synth_sp[/* max_fragment_charge+1 */][ION_MAX],
569 const std::string &peptide_seq, const double peptide_mass,
570 const std::vector<double> &mass_list,
571 const double N_terminal_mass, const double C_terminal_mass,
572 const double max_fragment_charge) NOTHROW {
573 std::vector<peak> mass_ladder(mass_list.size()-1);
575 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
576 switch (ion_type) {
577 case ION_Y:
578 get_synthetic_Y_mass_ladder(mass_ladder, peptide_seq, mass_list,
579 C_terminal_mass);
580 break;
581 case ION_B:
582 get_synthetic_B_mass_ladder(mass_ladder, peptide_seq, mass_list,
583 N_terminal_mass);
584 break;
585 default:
586 assert(false);
589 //for (unsigned int i=0; i<mass_ladder.size(); i++)
590 // std::cerr << "ladder step " << peptide_seq[i] << " " << mass_ladder[i].mz << std::endl;
592 for (int charge=1; charge<=max_fragment_charge; charge++) {
593 spectrum &sp = synth_sp[charge][ion_type];
594 sp.mass = peptide_mass;
595 sp.charge = charge;
596 sp.peaks.resize(mass_ladder.size());
597 for (std::vector<peak>::size_type i=0; i<mass_ladder.size(); i++) {
598 sp.peaks[i].intensity = mass_ladder[i].intensity;
599 sp.peaks[i].mz = peak::get_mz(mass_ladder[i].mz, charge);
606 // Return the similarity score between this spectrum and that, and also a
607 // count of common peaks in *peak_count.
609 // xtandem quantizes mz values into bins of width fragment_mass_error first,
610 // and adds a bin on either side. This makes the effective fragment mass
611 // error for each peak an arbitrary value between 1x and 2x the parameter,
612 // based on quantization of that peak. If quirks_mode is on, we'll try to
613 // emulate this behavior.
614 static inline double
615 score_similarity_(const spectrum &x, const spectrum &y,
616 int *peak_count) NOTHROW {
617 const double frag_err = parameters::the.fragment_mass_error;
618 const bool quirks_mode = parameters::the.quirks_mode;
619 const double error_limit = quirks_mode ? frag_err*1.5 : frag_err;
621 double score = 0;
622 *peak_count = 0;
623 std::vector<peak>::const_iterator x_it = x.peaks.begin();
624 std::vector<peak>::const_iterator y_it = y.peaks.begin();
626 while (x_it != x.peaks.end() and y_it != y.peaks.end()) {
627 double x_mz = x_it->mz, y_mz = y_it->mz;
628 if (quirks_mode) {
629 const float ffe = frag_err;
630 x_mz = int(static_cast<float>(x_mz) / ffe) * ffe;
631 y_mz = int(static_cast<float>(y_mz) / ffe) * ffe;
633 // in quirks mode, must be within one frag_err-wide bin
634 const double delta = y_mz - x_mz;
635 if (std::abs(delta) < error_limit) {
636 *peak_count += 1;
637 score += (x_it->intensity * y_it->intensity);
638 //std::cerr << "hit " << x_mz << " vs " << y_mz << ", i = " << x_it->intensity * y_it->intensity << std::endl;
639 #if 0
640 // assume peaks aren't "close" together
641 x_it++;
642 y_it++;
643 continue;
644 #endif
646 if (delta > 0) // y_mz > x_mz
647 x_it++;
648 else
649 y_it++;
652 return score;
656 double
657 spectrum::score_similarity(const spectrum &x, const spectrum &y, int
658 *peak_count) {
659 return score_similarity_(x, y, peak_count);
663 // IDEA: could we combine all these spectra and just do the correlation once?
665 // Score the specified spectrum against a group of synthetic fragment
666 // spectra.
667 static inline void
668 score_spectrum(double &hyper_score, double &convolution_score,
669 std::vector<double> &ion_scores, std::vector<int> &ion_peaks,
670 spectrum synth_sp[/* max_fragment_charge+1 */][ION_MAX],
671 const int spectrum_id, const int max_fragment_charge) NOTHROW {
672 const parameters &CP = parameters::the;
673 const spectrum sp = spectrum::searchable_spectra[spectrum_id];
674 const int spectrum_charge = sp.charge;
675 hyper_score = 1.0;
676 convolution_score = 0;
678 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
679 int i_peaks = 0;
680 double i_scores = 0.0;
681 const int charge_limit = std::max<int>(1, spectrum_charge-1);
682 for (int charge=1; charge<=charge_limit; charge++) {
683 assert(charge <= max_fragment_charge);
685 // convolution score is just sum over charges/ions
686 // hyperscore is product of p! over charges/ions (where p is corr peak
687 // count) times the convolution score (clipped to FLT_MAX)
688 // > blurred!
689 int common_peak_count; // FIX!
690 double conv_score = spectrum::score_similarity(synth_sp[charge][ion_type],
691 sp, &common_peak_count);
692 i_peaks += common_peak_count;
693 i_scores += conv_score;
694 hyper_score *= CP.factorial[common_peak_count];
695 convolution_score += conv_score;
698 ion_peaks[ion_type] = i_peaks;
699 ion_scores[ion_type] = i_scores;
701 hyper_score *= convolution_score;
705 // FIX: does this actually help inlining?
706 // returns log-scaled hyperscore
707 static inline double
708 scale_hyperscore_(double hyper_score) NOTHROW {
709 assert(hyper_score >= 0); // otherwise check for EDOM, ERANGE
710 if (hyper_score == 0)
711 return -DBL_MAX; // FIX: this shouldn't be reached?
712 return 4 * std::log10(hyper_score);
714 double
715 scale_hyperscore(double hyper_score) { return scale_hyperscore_(hyper_score); }
718 static inline double
719 get_peptide_mass(const std::vector<double> &mass_list,
720 const double N_terminal_mass,
721 const double C_terminal_mass) NOTHROW {
722 const parameters &CP = parameters::the;
724 double m = (N_terminal_mass + C_terminal_mass
725 + CP.cleavage_N_terminal_mass_change
726 + CP.cleavage_C_terminal_mass_change
727 + CP.proton_mass);
728 return std::accumulate(mass_list.begin(), mass_list.end(), m);
732 struct mass_trace_list {
733 mass_trace_item item;
734 const mass_trace_list *next;
736 mass_trace_list(const mass_trace_list *p=0) : next(p) { }
740 // FIX: currently only one mass_list is implemented, meaning that parent and
741 // fragment masses must be the same (e.g., mono/mono). A workaround is to
742 // just increase the parent error plus a bit (the xtandem default seems to
743 // already do this).
745 // Search for matches of this particular peptide modification variation
746 // against the spectra. Updates score_stats and returns the number of
747 // candidate spectra found.
748 static inline void
749 evaluate_peptide_mod_variation(match &m, const mass_trace_list *mtlp,
750 const std::vector<double> &mass_list,
751 const double N_terminal_mass,
752 const double C_terminal_mass,
753 score_stats &stats) NOTHROW {
754 const parameters &CP = parameters::the;
755 assert(m.peptide_sequence.size() == mass_list.size());
757 m.peptide_mass = get_peptide_mass(mass_list, N_terminal_mass,
758 C_terminal_mass);
759 double sp_mass_lb = m.peptide_mass + CP.parent_monoisotopic_mass_error_minus;
760 double sp_mass_ub = m.peptide_mass + CP.parent_monoisotopic_mass_error_plus;
762 const std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
763 candidate_spectra_info_begin
764 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
765 if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
766 stats.all_spectra_masses_too_high = false;
767 std::cerr << 'l';
768 return; // spectrum masses all too low to match peptide
770 stats.all_spectra_masses_too_low = false;
772 const std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
773 candidate_spectra_info_end
774 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
775 if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
776 std::cerr << 'h';
777 return; // spectrum masses all too high to match peptide
779 stats.all_spectra_masses_too_high = false;
781 if (candidate_spectra_info_begin == candidate_spectra_info_end) {
782 std::cerr << '-';
783 return; // no spectrum with close-enough parent mass
786 std::cerr << '+';
788 int max_candidate_charge = 0;
789 for (std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
790 it=candidate_spectra_info_begin;
791 it != candidate_spectra_info_end; it++)
792 max_candidate_charge
793 = std::max<int>(max_candidate_charge,
794 spectrum::searchable_spectra[it->second].charge);
795 assert(max_candidate_charge >= 1);
797 int max_fragment_charge = max_candidate_charge;
798 if (not CP.check_all_fragment_charges)
799 max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
800 assert(max_fragment_charge <= spectrum::max_supported_charge);
801 // e.g. +2 -> 'B' -> spectrum
802 // FIX: should this be a std::vector??
803 static spectrum synth_sp[spectrum::max_supported_charge+1][ION_MAX];
804 synthetic_spectra(synth_sp, m.peptide_sequence, m.peptide_mass, mass_list,
805 N_terminal_mass, C_terminal_mass, max_fragment_charge);
807 for (std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
808 candidate_it = candidate_spectra_info_begin;
809 candidate_it != candidate_spectra_info_end; candidate_it++) {
810 m.spectrum_index = candidate_it->second;
811 stats.candidate_spectrum_count++;
812 score_spectrum(m.hyper_score, m.convolution_score, m.ion_scores,
813 m.ion_peaks, synth_sp, m.spectrum_index,
814 max_fragment_charge);
815 if (m.convolution_score <= 2.0)
816 continue;
818 //std::cerr << "histod: " << m.spectrum_index << " " << m.peptide_sequence
819 //<< " " << m.peptide_mass << " " <<
820 //spectrum::searchable_spectra[m.spectrum_index].mass << std::endl;
822 // update spectrum histograms
823 std::vector<int> &hh = stats.hyperscore_histogram[m.spectrum_index];
824 int binned_scaled_hyper_score = int(0.5 + scale_hyperscore_(m.hyper_score));
825 assert(binned_scaled_hyper_score >= 0);
826 // FIX
827 if (static_cast<int>(hh.size())-1 < binned_scaled_hyper_score) {
828 assert(hh.size() < INT_MAX/2);
829 hh.resize(std::max<unsigned int>(binned_scaled_hyper_score+1,
830 2*hh.size()));
832 hh[binned_scaled_hyper_score] += 1;
833 // incr m_tPeptideScoredCount
834 // nyi: permute stuff
835 // FIX
836 const int sp_ion_count = m.ion_peaks[ION_B] + m.ion_peaks[ION_Y];
837 if (sp_ion_count < CP.minimum_ion_count)
838 continue;
839 const bool has_b_and_y = m.ion_peaks[ION_B] and m.ion_peaks[ION_Y];
840 if (not has_b_and_y)
841 continue;
843 // check that parent masses are within error range (isotope ni)
844 // already done above (why does xtandem do it here?)
845 // if check fails, only eligible for 2nd-best record
847 // Remember all of the highest-hyper-scoring matches against each
848 // spectrum. These might be in multiple domains (pos, length, mods) in
849 // multiple proteins.
851 // To avoid the problems that xtandem has with inexact float-point
852 // calculations, we consider hyperscores within the "epsilon ratio" to
853 // be "equal". The best_match vector will need to be re-screened
854 // against this ratio, because it may finally contain entries which
855 // don't meet our criterion. (We don't take the time to get it exactly
856 // right here, because we will often end up discarding it all anyway.)
858 const double current_ratio = (m.hyper_score
859 / stats.best_score[m.spectrum_index]);
861 if (CP.quirks_mode and (m.hyper_score < stats.best_score[m.spectrum_index])
862 or (not CP.quirks_mode
863 and (current_ratio < CP.hyper_score_epsilon_ratio))) {
864 // This isn't a best score, so see if it's a second-best score.
865 if (m.hyper_score > stats.second_best_score[m.spectrum_index])
866 stats.second_best_score[m.spectrum_index] = m.hyper_score;
867 continue;
869 if (CP.quirks_mode and (m.hyper_score > stats.best_score[m.spectrum_index])
870 or (not CP.quirks_mode
871 and (current_ratio > (1/CP.hyper_score_epsilon_ratio)))) {
872 // This score is significantly better, so forget previous matches.
873 stats.best_score[m.spectrum_index] = m.hyper_score;
874 stats.best_match[m.spectrum_index].clear();
876 // Now remember this match because it's at least as good as those
877 // previously seen.
878 m.mass_trace.clear();
879 for (const mass_trace_list *p=mtlp; p; p=p->next)
880 m.mass_trace.push_back(p->item);
881 stats.best_match[m.spectrum_index].push_back(m);
886 // The choose_* functions generate the different modification possibilities.
888 // IDEA FIX: Use a cost parameter to define the iterative search front.
891 // These values are used to keep track of the choices made so far (because
892 // later choices may depend on earlier ones).
893 enum chosen { CHOSEN_NONE, CHOSEN_PCA };
896 // Choose a possible residue modification.
897 static inline void
898 choose_residue_mod(match &m,
899 const std::vector< std::vector<double> > &pmm,
900 const mass_trace_list *mtlp,
901 std::vector<double> &mass_list,
902 const double N_terminal_mass,
903 const double C_terminal_mass, score_stats &stats,
904 const unsigned remaining_residues_to_choose,
905 const unsigned number_of_positions_to_consider,
906 const int *const mod_positions_to_consider) NOTHROW {
907 const parameters &CP = parameters::the;
908 assert(remaining_residues_to_choose <= number_of_positions_to_consider);
910 if (CP.maximum_modification_combinations_searched
911 and (stats.combinations_searched
912 >= CP.maximum_modification_combinations_searched))
913 return;
915 if (remaining_residues_to_choose == 0) {
916 stats.combinations_searched++;
917 evaluate_peptide_mod_variation(m, mtlp, mass_list, N_terminal_mass,
918 C_terminal_mass, stats);
919 } else {
920 mass_trace_list mtl(mtlp);
922 // consider all of the positions where we could next add a mod
923 for (unsigned i=0;
924 i<number_of_positions_to_consider-remaining_residues_to_choose+1;
925 i++) {
926 const int pos=mod_positions_to_consider[i];
927 mtl.item.position = pos;
928 assert(pos >= 0);
929 const double save_mass=mass_list[pos];
930 const std::vector<double> &deltas = pmm[m.peptide_sequence[pos]];
931 // step through the deltas for this amino acid
932 for (std::vector<double>::const_iterator it=deltas.begin();
933 it != deltas.end(); it++) {
934 mtl.item.delta = *it;
935 mtl.item.description = "position mod"; // FIX
936 mass_list[pos] = save_mass + *it;
937 choose_residue_mod(m, pmm, &mtl, mass_list, N_terminal_mass,
938 C_terminal_mass, stats,
939 remaining_residues_to_choose-1,
940 number_of_positions_to_consider-i-1,
941 mod_positions_to_consider+i+1);
943 mass_list[pos] = save_mass;
949 // Choose the number of modified residue positions. Start with zero and count
950 // up to the maximum number (which is not greater than the length of the
951 // peptide).
952 static inline void
953 choose_residue_mod_count(match &m,
954 const std::vector< std::vector<double> > &pmm,
955 const mass_trace_list *mtlp,
956 std::vector<double> &mass_list,
957 const double N_terminal_mass,
958 const double C_terminal_mass,
959 score_stats &stats) NOTHROW {
960 const parameters &CP = parameters::the;
961 int mod_positions[m.peptide_sequence.size()+1];
962 unsigned int max_count=0;
963 for (unsigned i=0; i<m.peptide_sequence.size(); i++)
964 if (not pmm[m.peptide_sequence[i]].empty())
965 mod_positions[max_count++] = i;
966 mod_positions[max_count] = -1;
967 stats.combinations_searched = 0;
969 unsigned count_limit = max_count;
970 if (CP.maximum_simultaneous_modifications_searched > 0)
971 count_limit = std::min<unsigned int>(count_limit,
972 CP.maximum_simultaneous_modifications_searched);
974 // start at one because the zero case is handled by
975 // choose_potential_mod_alternative below (!)
976 for (unsigned count=1; count <= count_limit; count++)
977 choose_residue_mod(m, pmm, mtlp, mass_list, N_terminal_mass,
978 C_terminal_mass, stats, count, max_count,
979 mod_positions);
983 // FIX: Is there some clear and efficient way to treat terminal mods in the
984 // same way as residue mods? Or, alternatively, is it possible to gracefully
985 // merge the two choose_?_terminal_mod functions?
987 // Choose a possible peptide C-terminal modification.
988 static inline void
989 choose_C_terminal_mod(match &m,
990 const std::vector< std::vector<double> > &pmm,
991 const mass_trace_list *mtlp,
992 std::vector<double> &mass_list,
993 const double N_terminal_mass,
994 const double C_terminal_mass,
995 score_stats &stats) NOTHROW {
996 choose_residue_mod_count(m, pmm, mtlp, mass_list, N_terminal_mass,
997 C_terminal_mass, stats);
999 mass_trace_list mtl(mtlp);
1000 mtl.item.position = POSITION_CTERM;
1001 const std::vector<double> &C_deltas = pmm[']'];
1003 for (std::vector<double>::const_iterator it=C_deltas.begin();
1004 it != C_deltas.end(); it++) {
1005 mtl.item.delta = *it;
1006 mtl.item.description = "C-terminal mod"; // FIX
1007 choose_residue_mod_count(m, pmm, &mtl, mass_list, N_terminal_mass,
1008 C_terminal_mass+*it, stats);
1013 // Choose a possible peptide N-terminal modification, unless a PCA mod has
1014 // already been chosen.
1015 static inline void
1016 choose_N_terminal_mod(match &m, chosen ch,
1017 const std::vector< std::vector<double> > &pmm,
1018 const mass_trace_list *mtlp,
1019 std::vector<double> &mass_list,
1020 const double N_terminal_mass,
1021 const double C_terminal_mass,
1022 score_stats &stats) NOTHROW {
1023 choose_C_terminal_mod(m, pmm, mtlp, mass_list, N_terminal_mass,
1024 C_terminal_mass, stats);
1026 if (ch == CHOSEN_PCA)
1027 return;
1028 mass_trace_list mtl(mtlp);
1029 mtl.item.position = POSITION_NTERM;
1030 const std::vector<double> &N_deltas = pmm['['];
1032 for (std::vector<double>::const_iterator it=N_deltas.begin();
1033 it != N_deltas.end(); it++) {
1034 mtl.item.delta = *it;
1035 mtl.item.description = "N-terminal mod"; // FIX
1036 choose_C_terminal_mod(m, pmm, &mtl, mass_list, N_terminal_mass+*it,
1037 C_terminal_mass, stats);
1041 #if NOT_WORKING
1043 // Choose none or one of the (';'-separated) potential modification
1044 // alternative sets. In the 'none' case, no explicitly requested potential
1045 // mods will be chosen, but PCA mods (above) will still be searched. If an
1046 // alternative set is chosen, at least one explicit mod must be chosen, to
1047 // avoid overlap with the 'none' case.
1048 static inline void
1049 choose_potential_mod_alternative(match &m, chosen ch,
1050 const mass_trace_list *mtlp,
1051 std::vector<double> &mass_list,
1052 const double N_terminal_mass,
1053 const double C_terminal_mass,
1054 score_stats &stats) NOTHROW {
1055 const parameters &CP = parameters::the;
1057 // the 'none' case--jump over the rest of the choose_*, since in this case
1058 // they choose nothing
1059 evaluate_peptide_mod_variation(m, mtlp, mass_list, N_terminal_mass,
1060 C_terminal_mass, stats);
1062 const mass_regime_parameters &mrp = CP.fragment_mass_regime[m.mass_regime];
1063 const std::vector< std::vector< std::vector<double> > > &alts
1064 = mrp.potential_modification_mass;
1066 for (std::vector< std::vector< std::vector<double> > >::const_iterator
1067 it=alts.begin();
1068 it != alts.end(); it++)
1069 choose_N_terminal_mod(m, ch, *it, mtlp, mass_list, N_terminal_mass,
1070 C_terminal_mass, stats);
1074 // Choose a possible modification to account for PCA (pyrrolidone carboxyl
1075 // acid) circularization of the peptide N-terminal. PCA mods are excluded if
1076 // a static N-terminal mod has been specified. Likewise, choosing a PCA mod
1077 // will exclude choosing a potential N-terminal mod. (The PCA mod for 'C' is
1078 // dependent on a static mod of C+57 being in effect.)
1079 static inline void
1080 choose_PCA_mod(match &m, const mass_trace_list *mtlp,
1081 std::vector<double> &mass_list,
1082 const double N_terminal_mass,
1083 const double C_terminal_mass, score_stats &stats) NOTHROW {
1084 const parameters &CP = parameters::the;
1086 choose_potential_mod_alternative(m, CHOSEN_NONE, mtlp, mass_list,
1087 N_terminal_mass, C_terminal_mass, stats);
1089 mass_trace_list mtl(mtlp);
1090 mtl.item.position = POSITION_NTERM;
1091 const mass_regime_parameters &mrp = CP.fragment_mass_regime[m.mass_regime];
1093 // now try a possible PCA mod, if there is one
1094 // FIX: somehow xtandem generates these twice??
1095 if (mrp.modification_mass['['] == 0)
1096 switch (m.peptide_sequence[0]) {
1097 case 'E':
1098 mtl.item.delta = -mrp.water_mass;
1099 mtl.item.description = "PCA";
1100 choose_potential_mod_alternative(m, CHOSEN_PCA, &mtl, mass_list,
1101 N_terminal_mass - mrp.water_mass,
1102 C_terminal_mass, stats);
1103 break;
1104 case 'C': {
1105 // FIX: need better test for C+57? (symbolic?)
1106 const double C_mod = mrp.modification_mass['C'];
1107 if (CP.quirks_mode ? int(C_mod) != 57 : std::abs(C_mod - 57) > 0.5)
1108 break; // skip unless C+57
1109 // else fall through...
1111 case 'Q':
1112 mtl.item.delta = -mrp.ammonia_mass;
1113 mtl.item.description = "PCA";
1114 choose_potential_mod_alternative(m, CHOSEN_PCA, &mtl, mass_list,
1115 N_terminal_mass - mrp.ammonia_mass,
1116 C_terminal_mass, stats);
1117 break;
1122 // Choose among the requested mass regimes (e.g., isotopes), and also choose
1123 // the static mods (including N- and C-terminal mods), all of which are
1124 // completely determined by the mass regime choice.
1125 static inline void
1126 choose_mass_regime(match &m, std::vector<double> &mass_list,
1127 const double N_terminal_mass, const double C_terminal_mass,
1128 score_stats &stats) NOTHROW {
1129 const parameters &CP = parameters::the;
1131 m.mass_regime = 0; // FIX
1132 const mass_regime_parameters &mrp = CP.fragment_mass_regime[m.mass_regime];
1133 for (std::vector<double>::size_type i=0; i<m.peptide_sequence.size(); i++) {
1134 const char res = m.peptide_sequence[i];
1135 mass_list[i] = mrp.residue_mass[res] + mrp.modification_mass[res];
1137 choose_PCA_mod(m, NULL, mass_list,
1138 N_terminal_mass + mrp.modification_mass['['],
1139 C_terminal_mass + mrp.modification_mass[']'], stats);
1143 // Search for matches of all modification variations of this peptide against
1144 // the spectra. Updates score_stats and the number of candidate spectra
1145 // found.
1146 void
1147 spectrum::search_peptide_all_mods(int idno, int offset, int begin,
1148 const std::string &peptide_seq,
1149 int missed_cleavage_count,
1150 score_stats &stats) {
1151 std::vector<double> mass_list(peptide_seq.size());
1152 double N_terminal_mass = 0.0;
1153 double C_terminal_mass = 0.0;
1155 // This match will be passed inward and used to record information that we
1156 // need to remember about a match when we finally see one. At that point, a
1157 // copy of this match will be saved.
1158 match m;
1159 m.sequence_index = idno;
1160 m.sequence_offset = offset;
1161 m.peptide_begin = begin;
1162 m.peptide_sequence = peptide_seq;
1163 m.missed_cleavage_count = missed_cleavage_count;
1165 choose_mass_regime(m, mass_list, N_terminal_mass, C_terminal_mass, stats);
1169 // Search for matches of all modification variations of peptides in this
1170 // sequence run against the spectra. Updates score_stats and the number of
1171 // candidate spectra found.
1172 // FIX: optimize the non-specific case?
1174 void
1175 spectrum::search_run_all_mods(const int maximum_missed_cleavage_sites,
1176 const int min_peptide_length,
1177 const bool no_N_term_mods,
1178 const int idno, const int offset,
1179 const std::string &run_sequence,
1180 const std::vector<int> cleavage_points,
1181 score_stats &stats) {
1182 double N_terminal_mass = 0.0;
1183 double C_terminal_mass = 0.0;
1185 // This match will be passed inward and used to record information that we
1186 // need to remember about a match when we finally see one. At that point, a
1187 // copy of this match will be saved.
1188 match m;
1189 m.sequence_index = idno;
1190 m.sequence_offset = offset;
1192 // FIX: examine carefully for signed/unsigned problems
1194 // The rightmost 'end' seen where 'all_spectra_masses_too_high' was true.
1195 // (0 means none yet encountered.)
1196 unsigned int next_end = 0;
1198 for (unsigned int begin=0; begin<cleavage_points.size()-1; begin++) {
1199 const int begin_index = m.peptide_begin = cleavage_points[begin];
1200 unsigned int end = begin + 1;
1201 if (no_N_term_mods)
1202 if (next_end != 0)
1203 end = std::max<unsigned int>(end, next_end);
1204 for (; end<cleavage_points.size(); end++) {
1205 m.missed_cleavage_count = end - begin - 1;
1206 if (m.missed_cleavage_count > maximum_missed_cleavage_sites)
1207 break;
1209 const int end_index = cleavage_points[end];
1210 const int peptide_size = end_index - begin_index;
1211 if (peptide_size < min_peptide_length)
1212 continue;
1213 // these start vacuously true, then to be falsified if possible
1214 stats.all_spectra_masses_too_high = true;
1215 stats.all_spectra_masses_too_low = true;
1216 m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
1217 //std::cerr << "peptide: " << m.peptide_sequence << std::endl;
1218 std::vector<double> mass_list(peptide_size);
1219 choose_mass_regime(m, mass_list, N_terminal_mass, C_terminal_mass,
1220 stats);
1222 // FIX: double-check that these work right even with negative potential
1223 // mod deltas!
1224 assert(not (stats.all_spectra_masses_too_high
1225 and stats.all_spectra_masses_too_low));
1226 if (stats.all_spectra_masses_too_high) {
1227 //std::cerr << "all_spectra_masses_too_high" << std::endl;
1228 if (end == cleavage_points.size() - 1)
1229 return;
1230 next_end = end;
1232 if (stats.all_spectra_masses_too_low) {
1233 //std::cerr << "all_spectra_masses_too_low" << std::endl;
1234 break;
1240 #endif