Initial import from RCS
[greylag.git] / cgreylag.cpp
blob36aff421052d1492feceb868a68fe399c5a9844c
3 // $Id: cgreylag.cpp,v 1.9 2006/10/02 15:07:00 mkc Exp mkc $
6 #include "cgreylag.hpp"
9 #include <cerrno>
10 #include <cfloat>
11 #include <climits>
12 #include <cstdlib>
13 #include <cmath>
14 #include <numeric>
15 #include <stdexcept>
17 #include <iostream>
20 //#define FAST_LOWER_READ_ACCURACY
22 #ifdef __GNU__
23 #define FAST_GNU_ONLY
24 #endif
26 #ifdef FAST_GNU_ONLY
27 // use non-threadsafe versions, since we don't use threads
28 #define fgetsX fgets_unlocked
29 #define ferrorX ferror_unlocked
30 #else
31 #define fgetsX std::fgets
32 #define ferrorX std::ferror
33 #endif
35 #ifdef FAST_LOWER_READ_ACCURACY
36 // probably faster, at some cost in accuracy
37 #define strtodX std::strtof
38 #else
39 #define strtodX std::strtod
40 #endif
43 parameters parameters::the;
46 char *
47 peak::__repr__() const {
48 static char temp[128];
49 sprintf(temp, "<peak mz=%.4f intensity=%.4f>", mz, intensity);
50 return &temp[0];
54 int spectrum::next_id = 1;
55 int spectrum::next_physical_id = 1;
57 std::vector<spectrum> spectrum::searchable_spectra;
58 // parent mass -> searchable_spectra index
59 std::multimap<double,
60 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
63 char *
64 spectrum::__repr__() const {
65 static char temp[1024];
66 sprintf(temp,
67 "<spectrum #%d (phys #%d) '%s' %.4f/%+d"
68 " [%d peaks, maxI=%f, sumI=%f]>",
69 id, physical_id, name.c_str(), mass, charge, peaks.size(),
70 max_peak_intensity, sum_peak_intensity);
71 return &temp[0];
76 // This is an error exit for read_spectra.
77 static void
78 read_error(FILE *f, const char *message="") {
79 if (ferrorX(f))
80 message = "I/O error while reading ms2 file";
81 throw std::ios_base::failure(message);
85 // Read spectra from a file in ms2 format. Multiply charge spectra (e.g.,
86 // +2/+3) are split into separate spectra having the same physical id. The
87 // peak list is initially sorted by mz. Throws an exception on invalid
88 // input.
90 std::vector<spectrum>
91 spectrum::read_spectra(FILE *f, const int file_id) {
92 std::vector<spectrum> spectra;
94 const int bufsiz = 1024;
95 char buf[bufsiz];
96 char *endp;
98 char *result = fgetsX(buf, bufsiz, f);
99 while (true) {
100 std::vector<std::string> names;
101 std::vector<double> masses;
102 std::vector<int> charges;
103 std::vector<peak> peaks;
105 // read headers
106 while (true) {
107 if (ferrorX(f))
108 read_error(f);
109 if (not result or buf[0] != ':')
110 break;
111 names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
112 if (not fgetsX(buf, bufsiz, f))
113 read_error(f, "bad ms2 format: mass/charge line expected");
114 errno = 0;
115 masses.push_back(std::strtod(buf, &endp)); // need double accuracy here
116 if (errno or endp == buf)
117 read_error(f, "bad ms2 format: bad mass");
118 charges.push_back(std::strtol(endp, &endp, 10));
119 if (errno or endp == buf)
120 read_error(f, "bad ms2 format: bad charge");
121 result = fgetsX(buf, bufsiz, f);
123 if (not result)
124 break;
125 // read peaks
126 while (true) {
127 peak p;
128 errno = 0;
129 p.mz = strtodX(buf, &endp);
130 if (errno or endp == buf)
131 read_error(f, "bad ms2 format: bad peak mz");
132 p.intensity = strtodX(endp, &endp);
133 if (errno or endp == buf)
134 read_error(f, "bad ms2 format: bad peak intensity");
135 peaks.push_back(p);
137 result = fgetsX(buf, bufsiz, f);
138 if (ferrorX(f))
139 read_error(f);
140 if (not result or buf[0] == ':')
141 break;
144 // add spectra to vector
145 if (names.empty() and not peaks.empty())
146 read_error(f, "bad ms2 format: missing header lines?");
148 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
150 for (std::vector<double>::size_type i=0; i<names.size(); i++) {
151 spectrum sp(masses[i], charges[i]);
152 sp.peaks = peaks;
153 sp.name = names[i];
154 sp.file_id = file_id;
155 sp.physical_id = next_physical_id;
156 sp.calculate_intensity_statistics();
157 spectra.push_back(sp);
159 spectrum::next_physical_id++;
161 return spectra;
165 // Sets max/sum_peak_intensity, according to peaks and normalization_factor.
166 void
167 spectrum::calculate_intensity_statistics() {
168 sum_peak_intensity = 0;
169 max_peak_intensity = -1;
171 for (std::vector<peak>::const_iterator it=peaks.begin(); it != peaks.end();
172 it++) {
173 sum_peak_intensity += it->intensity;
174 max_peak_intensity = std::max<double>(it->intensity, max_peak_intensity);
176 max_peak_intensity *= normalization_factor;
177 sum_peak_intensity *= normalization_factor;
181 // For each *non-overlapping* group of peaks with mz within an interval of
182 // (less than) length 'limit', keep only the most intense peak. (The
183 // algorithm works from the right, which matter here.)
185 // FIX: Is this really the best approach? The obvious alternative would be to
186 // eliminate peaks so that no interval of length 'limit' has more than one
187 // peak (eliminating weak peaks to make this so).
189 static void
190 remove_close_peaks(std::vector<peak> &peaks, double limit) {
191 for (std::vector<peak>::size_type i=0; i+1<peaks.size(); i++) {
192 double ub_mz = peaks[i].mz + limit;
193 while (i+1 < peaks.size() and peaks[i+1].mz < ub_mz)
194 peaks.erase(peaks.begin()
195 + (peaks[i+1].intensity > peaks[i].intensity
196 ? i : i+1));
200 // static void
201 // remove_close_peaks(std::vector<peak> &peaks, double limit) {
202 // for (std::vector<peak>::size_type i=0; i+1<peaks.size();)
203 // if (peaks[i+1].mz - peaks[i].mz < limit)
204 // if (peaks[i+1].intensity > peaks[i].intensity)
205 // peaks.erase(peaks.begin() + i);
206 // else
207 // peaks.erase(peaks.begin() + i+1);
208 // else
209 // i++;
210 // }
213 // Examine this spectrum to see if it should be kept. If so, return true and
214 // sort the peaks by mz, normalize them and limit their number.
215 bool
216 spectrum::filter_and_normalize(double minimum_fragment_mz,
217 double dynamic_range, int minimum_peaks,
218 int total_peaks) {
219 // "spectrum, maximum parent charge", noise suppression check,
220 // "spectrum, minimum parent m+h" not implemented
222 if (minimum_fragment_mz < 0 or dynamic_range <= 0 or minimum_peaks < 0
223 or total_peaks < 0)
224 throw std::invalid_argument("invalid argument value");
226 // remove_isotopes
227 // >>> removes multiple entries within 0.95 Da of each other, retaining the
228 // highest value. this is necessary because of the behavior of some peak
229 // finding routines in commercial software
231 // peaks already sorted by mz upon read
232 remove_close_peaks(peaks, 0.95);
234 // remove parent
235 // >>> set up m/z regions to ignore: those immediately below the m/z of the
236 // parent ion which will contain uninformative neutral loss ions, and those
237 // immediately above the parent ion m/z, which will contain the parent ion
238 // and its isotope pattern
240 const double parent_mz = 1.00727 + (mass - 1.00727) / charge;
241 for (std::vector<peak>::size_type i=0; i<peaks.size();)
242 if (parent_mz >= peaks[i].mz
243 ? std::abs(parent_mz - peaks[i].mz) < (50.0 / charge)
244 : std::abs(parent_mz - peaks[i].mz) < (5.0 / charge))
245 peaks.erase(peaks.begin() + i);
246 else
247 i++;
250 // remove_low_masses
251 std::vector<peak>::iterator pi;
252 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= minimum_fragment_mz;
253 pi++);
254 peaks.erase(peaks.begin(), pi);
257 // normalize spectrum
258 // >>> use the dynamic range parameter to set the maximum intensity value
259 // for the spectrum. then remove all peaks with a normalized intensity < 1
260 if (peaks.empty())
261 return false;
262 std::vector<peak>::iterator most_intense_peak
263 = std::max_element(peaks.begin(), peaks.end(), peak::less_intensity);
264 normalization_factor
265 = std::max<double>(1.0, most_intense_peak->intensity) / dynamic_range;
266 for (std::vector<peak>::iterator p=peaks.begin(); p != peaks.end(); p++)
267 p->intensity /= normalization_factor;
268 for (std::vector<peak>::size_type i=0; i<peaks.size();)
269 if (peaks[i].intensity < 1.0)
270 peaks.erase(peaks.begin() + i);
271 else
272 i++;
274 if (peaks.size() < (unsigned int) minimum_peaks)
275 return false;
277 // # check is_noise (NYI)
278 // # * is_noise attempts to determine if the spectrum is simply
279 // # noise. if the spectrum
280 // # * does not have any peaks within a window near the parent ion mass,
281 // # it is considered
282 // # * noise.
283 // #limit = sp.mass / sp.charge
284 // #if sp.charge < 3:
285 // # limit = sp.mass - 600.0
286 // #if not [ True for mz, i in sp.peaks if mz > limit ]:
287 // # return "looks like noise"
289 // clean_isotopes removes peaks that are probably C13 isotopes
290 remove_close_peaks(peaks, 1.5);
292 // keep only the most intense peaks
293 if (peaks.size() > (unsigned int) total_peaks) {
294 std::sort(peaks.begin(), peaks.end(), peak::less_intensity);
295 peaks.erase(peaks.begin(), peaks.end() - total_peaks);
296 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
299 // update statistics, now that we've removed some peaks
300 calculate_intensity_statistics();
302 return true;
306 // Store the list of spectra that search_peptide will search against, and also
307 // build spectrum_mass_index.
308 void
309 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
310 searchable_spectra = spectra;
311 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
312 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
316 // Multiply peak intensities by residue-dependent factors to generate a more
317 // realistic spectrum. If is_XYZ, walk through the peptide sequence in
318 // reverse.
319 static inline void
320 synthesize_ladder_intensities(std::vector<peak> &mass_ladder,
321 const std::string &peptide_seq,
322 const bool is_XYZ) {
323 const parameters &CP = parameters::the;
325 assert(mass_ladder.size() == peptide_seq.size() - 1);
326 assert(mass_ladder.size() >= 2);
328 // An intensity boost is given for the second peptide bond/peak (*) if the
329 // N-terminal residue is 'P':
330 // 0 * 2 3 (peaks)
331 // A P C D E
332 const int proline_bonus_position = is_XYZ ? mass_ladder.size()-2 : 1;
333 mass_ladder[proline_bonus_position].intensity = 3.0;
334 if (peptide_seq[1] == 'P')
335 mass_ladder[proline_bonus_position].intensity = 10.0;
337 if (CP.spectrum_synthesis)
338 for (unsigned int i=0; i<mass_ladder.size(); i++) {
339 // FIX: there must be a better way
340 const char peptide_index = is_XYZ ? mass_ladder.size()-i-1 : i;
341 switch (peptide_seq[peptide_index]) {
342 case 'D':
343 mass_ladder[i].intensity *= 5.0;
344 break;
345 case 'V':
346 case 'E':
347 case 'I':
348 case 'L':
349 mass_ladder[i].intensity *= 3.0;
350 break;
351 case 'N':
352 case 'Q':
353 mass_ladder[i].intensity *= 2.0;
354 break;
356 switch (peptide_seq[peptide_index+1]) {
357 case 'P':
358 mass_ladder[i].intensity *= 5.0;
359 break;
365 // Calculate peaks for a synthesized mass (not mz) ladder.
366 static inline void
367 get_synthetic_Y_mass_ladder(std::vector<peak> &mass_ladder,
368 const std::string &peptide_seq,
369 const std::vector<double> &mass_list,
370 const double C_terminal_mass,
371 const unsigned mass_regime=0) {
372 const parameters &CP = parameters::the;
373 assert(peptide_seq.size() == mass_list.size());
374 double m = (CP.fragment_mass_regime[mass_regime].water_mass
375 + (CP.cleavage_C_terminal_mass_change
376 - CP.fragment_mass_regime[mass_regime].hydroxyl_mass)
377 + C_terminal_mass);
379 const int ladder_size = mass_ladder.size();
380 for (int i=ladder_size-1; i>=0; i--) {
381 m += mass_list[i+1];
382 mass_ladder[ladder_size-1-i] = peak(m, 1.0);
385 synthesize_ladder_intensities(mass_ladder, peptide_seq, true);
388 // Calculate peaks for a synthesized mass (not mz) ladder.
389 static inline void
390 get_synthetic_B_mass_ladder(std::vector<peak> &mass_ladder,
391 const std::string &peptide_seq,
392 const std::vector<double> &mass_list,
393 const double N_terminal_mass,
394 const unsigned mass_regime=0) {
395 const parameters &CP = parameters::the;
396 double m = (0.0 + (CP.cleavage_N_terminal_mass_change - CP.hydrogen_mass)
397 + N_terminal_mass);
399 const int ladder_size = mass_ladder.size();
400 for (int i=0; i<=ladder_size-1; i++) {
401 m += mass_list[i];
402 mass_ladder[i] = peak(m, 1.0);
405 synthesize_ladder_intensities(mass_ladder, peptide_seq, false);
409 // Construct a synthetic spectrum from these parameters.
410 spectrum::spectrum(double peptide_mod_mass, int charge,
411 const std::vector<peak> &mass_ladder) {
412 init(peptide_mod_mass, charge);
414 peaks = mass_ladder;
415 for (std::vector<peak>::size_type i=0; i<mass_ladder.size(); i++)
416 peaks[i].mz = peak::get_mz(peaks[i].mz, charge);
418 //std::cerr << "synth charge " << charge << std::endl;
419 //for (unsigned int i=0; i<mass_ladder.size(); i++)
420 // std::cerr << "peak mz " << peaks[i].mz << " i " << peaks[i].intensity << std::endl;
424 static inline void
425 synthetic_spectra(spectrum synth_sp[/* max_fragment_charge+1 */][ION_MAX],
426 const std::string &peptide_seq, const double peptide_mass,
427 const std::vector<double> &mass_list,
428 const double N_terminal_mass, const double C_terminal_mass,
429 const double max_fragment_charge) {
430 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
431 std::vector<peak> mass_ladder(mass_list.size()-1);
433 switch (ion_type) {
434 case ION_Y:
435 get_synthetic_Y_mass_ladder(mass_ladder, peptide_seq, mass_list,
436 C_terminal_mass);
437 break;
438 case ION_B:
439 get_synthetic_B_mass_ladder(mass_ladder, peptide_seq, mass_list,
440 N_terminal_mass);
441 break;
442 default:
443 assert(false);
446 //for (unsigned int i=0; i<mass_ladder.size(); i++)
447 // std::cerr << "ladder step " << peptide_seq[i] << " " << mass_ladder[i].mz << std::endl;
449 for (int charge=1; charge<=max_fragment_charge; charge++)
450 synth_sp[charge][ion_type] = spectrum(peptide_mass, charge,
451 mass_ladder);
456 // IDEA: could we combine all these spectra and just do the correlation once?
458 // Score the specified spectrum against a group of synthetic fragment
459 // spectra.
460 static inline void
461 score_spectrum(double &hyper_score, double &convolution_score,
462 std::vector<double> &ion_scores, std::vector<int> &ion_peaks,
463 spectrum synth_sp[/* max_fragment_charge+1 */][ION_MAX],
464 const int spectrum_id, const int max_fragment_charge) {
465 const parameters &CP = parameters::the;
466 const spectrum sp = spectrum::searchable_spectra[spectrum_id];
467 const int spectrum_charge = sp.charge;
468 hyper_score = 1.0;
469 convolution_score = 0;
471 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
472 int i_peaks = 0;
473 double i_scores = 0.0;
474 const int charge_limit = std::max<int>(1, spectrum_charge-1);
475 for (int charge=1; charge<=charge_limit; charge++) {
476 assert(charge <= max_fragment_charge);
478 // convolution score is just sum over charges/ions
479 // hyperscore is product of p! over charges/ions (where p is corr peak
480 // count) times the convolution score (clipped to FLT_MAX)
481 // > blurred!
482 int common_peak_count; // FIX!
483 double conv_score = spectrum::score_similarity(synth_sp[charge][ion_type],
484 sp, &common_peak_count);
485 i_peaks += common_peak_count;
486 i_scores += conv_score;
487 hyper_score *= CP.factorial[common_peak_count];
488 convolution_score += conv_score;
491 ion_peaks[ion_type] = i_peaks;
492 ion_scores[ion_type] = i_scores;
494 hyper_score *= convolution_score;
498 #if 0
500 struct generator;
503 struct generator {
504 // fx = &generator::generate_static_aa_mod;
506 typedef void generator::generator_fx(score_stats &stats, const match &m,
507 std::vector<double> position_mass,
508 const std::vector<double> &terminal_mass,
509 const std::vector<generator>::iterator g_it);
511 generator() : fx(0) { }
513 generator_fx generate_static_aa_mod;
515 generator_fx generator::*fx;
516 std::vector<double> args;
520 void
521 generator::generate_static_aa_mod(score_stats &stats, const match &m,
522 std::vector<double> position_mass,
523 const std::vector<double> &terminal_mass,
524 const std::vector<generator>::iterator g_it) {
525 for (unsigned int i=0; i<m.peptide_sequence.size(); i++)
526 position_mass[i] += args[m.peptide_sequence[i]];
528 ((*g_it).*(g_it->fx))(stats, m, position_mass, terminal_mass, g_it+1);
531 #endif
534 // FIX: currently only one mass_list is implemented, meaning that parent and
535 // fragment masses must be the same (e.g., mono/mono). A workaround is to
536 // just increase the parent error plus a bit (the xtandem default seems to
537 // already do this).
539 // Search for matches of this particular peptide modification variation
540 // against the spectra. Updates score_stats and returns the number of
541 // candidate spectra found.
542 static inline void
543 evaluate_peptide_mod_variation(match &m,
544 const std::vector<double> &mass_list,
545 const double N_terminal_mass,
546 const double C_terminal_mass,
547 score_stats &stats) {
548 const parameters &CP = parameters::the;
549 assert(m.peptide_sequence.size() == mass_list.size());
551 m.peptide_mass = get_peptide_mass(mass_list, N_terminal_mass,
552 C_terminal_mass);
553 double sp_mass_lb = m.peptide_mass + CP.parent_monoisotopic_mass_error_minus;
554 double sp_mass_ub = m.peptide_mass + CP.parent_monoisotopic_mass_error_plus;
555 std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
556 candidate_spectra_info_begin
557 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
558 // FIX: is there a better way?
559 std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
560 candidate_spectra_info_end
561 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
562 if (candidate_spectra_info_begin == candidate_spectra_info_end)
563 return;
565 int max_candidate_charge = 0;
566 for (std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
567 it = candidate_spectra_info_begin;
568 it != candidate_spectra_info_end; it++)
569 max_candidate_charge
570 = std::max<int>(max_candidate_charge,
571 spectrum::searchable_spectra[it->second].charge);
573 const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
574 assert(max_fragment_charge <= spectrum::max_supported_charge);
575 // e.g. +2 -> 'B' -> spectrum
576 // FIX: should this be a std::vector??
577 static spectrum synth_sp[spectrum::max_supported_charge+1][ION_MAX];
578 synthetic_spectra(synth_sp, m.peptide_sequence, m.peptide_mass, mass_list,
579 N_terminal_mass, C_terminal_mass, max_fragment_charge);
581 for (std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
582 candidate_it = candidate_spectra_info_begin;
583 candidate_it != candidate_spectra_info_end; candidate_it++) {
584 m.spectrum_index = candidate_it->second;
585 stats.candidate_spectrum_count++;
586 score_spectrum(m.hyper_score, m.convolution_score, m.ion_scores,
587 m.ion_peaks, synth_sp, m.spectrum_index,
588 max_fragment_charge);
589 if (m.convolution_score <= 2.0)
590 continue;
592 //std::cerr << "histod: " << m.spectrum_index << " " << m.peptide_sequence
593 //<< " " << m.peptide_mass << " " <<
594 //spectrum::searchable_spectra[m.spectrum_index].mass << std::endl;
596 // update spectrum histograms
597 std::vector<int> &hh = stats.hyperscore_histogram[m.spectrum_index];
598 int binned_scaled_hyper_score = int(0.5 + scale_hyperscore(m.hyper_score));
599 assert(binned_scaled_hyper_score >= 0);
600 // FIX
601 if (static_cast<int>(hh.size())-1 < binned_scaled_hyper_score) {
602 assert(hh.size() < INT_MAX/2);
603 hh.resize(std::max<unsigned int>(binned_scaled_hyper_score+1,
604 2*hh.size()));
606 hh[binned_scaled_hyper_score] += 1;
607 // incr m_tPeptideScoredCount
608 // nyi: permute stuff
609 // FIX
610 const int sp_ion_count = m.ion_peaks[ION_B] + m.ion_peaks[ION_Y];
611 if (sp_ion_count < CP.minimum_ion_count)
612 continue;
613 const bool has_b_and_y = m.ion_peaks[ION_B] and m.ion_peaks[ION_Y];
614 if (not has_b_and_y)
615 continue;
617 // check that parent masses are within error range (isotope ni)
618 // already done above (why does xtandem do it here?)
619 // if check fails, only eligible for 2nd-best record
621 // Remember all of the highest-hyper-scoring matches against each
622 // spectrum. These might be in multiple domains (pos, length, mods) in
623 // multiple proteins.
625 // To avoid the problems that xtandem has with inexact float-point
626 // calculations, we consider hyperscores within the "epsilon ratio" to
627 // be "equal". The best_match vector will need to be re-screened
628 // against this ratio, because it may finally contain entries which
629 // don't meet our criterion. (We don't take the time to get it exactly
630 // right here, because we may end up discarding it all anyway.)
632 const double current_ratio = (m.hyper_score
633 / stats.best_score[m.spectrum_index]);
635 if (CP.quirks_mode and (m.hyper_score < stats.best_score[m.spectrum_index])
636 or (not CP.quirks_mode
637 and (current_ratio < CP.hyper_score_epsilon_ratio))) {
638 // This isn't a best score, so see if it's a second-best score.
639 if (m.hyper_score > stats.second_best_score[m.spectrum_index])
640 stats.second_best_score[m.spectrum_index] = m.hyper_score;
641 continue;
643 if (CP.quirks_mode and (m.hyper_score > stats.best_score[m.spectrum_index])
644 or (not CP.quirks_mode
645 and (current_ratio > (1/CP.hyper_score_epsilon_ratio)))) {
646 // This score is significantly better, so forget previous matches.
647 stats.best_score[m.spectrum_index] = m.hyper_score;
648 stats.best_match[m.spectrum_index].clear();
650 // Now remember this match because it's at least as good as those
651 // previously seen.
652 stats.best_match[m.spectrum_index].push_back(m);
657 // Choose a possible modification to account for PCA (pyrrolidone carboxyl
658 // acid) circularization of the peptide N-terminal. This is excluded if a
659 // static N-terminal mod was specified. (FIX: Does a PCA mod exclude other
660 // potential mods?)
661 static inline void
662 choose_PCA_mod(match &m, std::vector<double> &mass_list,
663 const double N_terminal_mass, const double C_terminal_mass,
664 score_stats &stats) {
665 const parameters &CP = parameters::the;
666 evaluate_peptide_mod_variation(m, mass_list, N_terminal_mass,
667 C_terminal_mass, stats);
669 const int mass_regime=0; // FIX
671 if (CP.fragment_mass_regime[mass_regime].modification_mass['['] == 0) {
672 for (int bug=0; bug<(CP.quirks_mode? 2 : 1); bug++) // FIX
673 switch (m.peptide_sequence[0]) {
674 case 'E':
675 evaluate_peptide_mod_variation(m, mass_list,
676 N_terminal_mass
677 -CP.fragment_mass_regime[mass_regime].water_mass,
678 C_terminal_mass, stats);
679 break;
680 case 'C': {
681 // FIX: need better test for C+57? (symbolic?)
682 const double C_mod
683 = CP.fragment_mass_regime[mass_regime].modification_mass['C'];
684 if (CP.quirks_mode ? int(C_mod) != 57 : std::abs(C_mod - 57) > 0.5)
685 break; // skip unless C+57
687 case 'Q':
688 evaluate_peptide_mod_variation(m, mass_list,
689 N_terminal_mass
690 -CP.fragment_mass_regime[mass_regime].ammonia_mass,
691 C_terminal_mass, stats);
692 break;
698 // Choose among the requested mass regimes (e.g., isotopes), and also choose
699 // the static mods (including N- and C-terminal mods), which are determined by
700 // the mass regime choice.
701 static inline void
702 choose_mass_regime(match &m, std::vector<double> &mass_list,
703 const double N_terminal_mass, const double C_terminal_mass,
704 score_stats &stats) {
705 const parameters &CP = parameters::the;
707 const int mass_regime=0; // FIX
708 for (std::vector<double>::size_type i=0; i<m.peptide_sequence.size(); i++) {
709 const char res = m.peptide_sequence[i];
710 mass_list[i] = (CP.fragment_mass_regime[mass_regime].residue_mass[res]
711 + CP.fragment_mass_regime[mass_regime].modification_mass[res]);
713 choose_PCA_mod(m, mass_list,
714 N_terminal_mass
715 + CP.fragment_mass_regime[mass_regime].modification_mass['['],
716 C_terminal_mass
717 + CP.fragment_mass_regime[mass_regime].modification_mass[']'],
718 stats);
722 // Search for matches of all modification variations of this peptide against
723 // the spectra. Updates score_stats and returns the number of candidate
724 // spectra found.
725 void
726 spectrum::search_peptide_all_mods(int idno, int offset, int begin,
727 const std::string &peptide_seq,
728 int missed_cleavage_count,
729 score_stats &stats) {
730 std::vector<double> mass_list(peptide_seq.size());
731 double N_terminal_mass = 0.0;
732 double C_terminal_mass = 0.0;
734 // This match will be passed inward and used to record information that we
735 // need to remember about a match when we finally see one. At that point, a
736 // copy of this match will be saved.
737 match m;
738 m.sequence_index = idno;
739 m.sequence_offset = offset;
740 m.peptide_begin = begin;
741 m.peptide_sequence = peptide_seq;
742 m.missed_cleavage_count = missed_cleavage_count;
744 choose_mass_regime(m, mass_list, N_terminal_mass, C_terminal_mass, stats);
748 // Return the similarity score between this spectrum and that, and also a
749 // count of common peaks in *peak_count.
751 // xtandem quantizes mz values into bins of width fragment_mass_error first,
752 // and adds a bin on either side. This makes the effective fragment mass
753 // error for each peak an arbitrary value between 1x and 2x the parameter,
754 // based on quantization of that peak. If quirks_mode is on, we'll try to
755 // emulate this behavior.
756 double
757 spectrum::score_similarity(const spectrum &x, const spectrum &y,
758 int *peak_count) {
759 const double frag_err = parameters::the.fragment_mass_error;
760 const bool quirks_mode = parameters::the.quirks_mode;
762 double score = 0;
763 *peak_count = 0;
764 std::vector<peak>::const_iterator x_it = x.peaks.begin();
765 std::vector<peak>::const_iterator y_it = y.peaks.begin();
767 while (x_it != x.peaks.end() and y_it != y.peaks.end()) {
768 double x_mz = x_it->mz, y_mz = y_it->mz;
769 if (quirks_mode) {
770 const float ffe = frag_err;
771 x_mz = std::floor(static_cast<float>(x_mz) / ffe) * ffe;
772 y_mz = std::floor(static_cast<float>(y_mz) / ffe) * ffe;
774 // in quirks mode, must be within one frag_err-wide bin
775 if (std::abs(x_mz - y_mz) < (quirks_mode ? frag_err*1.5 : frag_err)) {
776 *peak_count += 1;
777 score += (x_it->intensity * y_it->intensity);
778 //std::cerr << "hit " << x_mz << " vs " << y_mz << ", i = " << x_it->intensity * y_it->intensity << std::endl;
779 #if 0
780 // assume peaks aren't "close" together
781 x_it++;
782 y_it++;
783 continue;
784 #endif
786 if (x_it->mz < y_it->mz)
787 x_it++;
788 else
789 y_it++;
792 return score;
796 // FIX: obsolete?
797 double
798 get_parent_peptide_mass(const std::string &peptide_seq,
799 const unsigned mass_regime) {
800 const parameters &CP = parameters::the;
801 const bool is_N=false, is_C=false; // FIX
802 double NC_mods = 0;
803 if (is_N)
804 NC_mods += CP.parent_mass_regime[mass_regime].modification_mass['['];
805 if (is_C)
806 NC_mods += CP.parent_mass_regime[mass_regime].modification_mass[']'];
808 double residue_sum = 0;
809 for (std::string::const_iterator it=peptide_seq.begin();
810 it != peptide_seq.end(); it++)
811 residue_sum += (CP.parent_mass_regime[mass_regime].residue_mass[*it]
812 + CP.parent_mass_regime[mass_regime].modification_mass[*it]);
813 return (NC_mods + residue_sum
814 + CP.cleavage_N_terminal_mass_change
815 + CP.cleavage_C_terminal_mass_change
816 + CP.proton_mass);
820 double
821 get_peptide_mass(const std::vector<double> &mass_list,
822 const double N_terminal_mass, const double C_terminal_mass) {
823 const parameters &CP = parameters::the;
825 double m = (N_terminal_mass + C_terminal_mass
826 + CP.cleavage_N_terminal_mass_change
827 + CP.cleavage_C_terminal_mass_change
828 + CP.proton_mass);
829 return std::accumulate(mass_list.begin(), mass_list.end(), m);
833 // returns log-scaled hyperscore
834 double
835 scale_hyperscore(double hyper_score) {
836 assert(hyper_score >= 0); // otherwise check for EDOM, ERANGE
837 if (hyper_score == 0)
838 return -DBL_MAX; // FIX: this shouldn't be reached?
839 return 4 * std::log10(hyper_score);